In [1]:
import xlrd
from pathlib import Path
import pandas as pd
import numpy as np
from numba import decorators
import librosa

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import Dataset


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [15]:
# read in the data file
# Give the location of the file 

df = pd.read_excel(r'data.xlsx', sheet_name='reduced totals')
# print(df)

In [4]:
## LOADING IN DATASETS

In [None]:
dataset = Path.cwd().joinpath("SongEmotionDataset")

train_song = []
test_song = []
train_emotion = []
test_emotion = []

for i in range(1, 401):
    if i % 5 == 0:
        test_song.append(dataset.joinpath("{}.mp3".format(i)))
        emotion_arr = []
        for j in range(5):
            emotion_arr.append(sheet.cell_value(i, 2 + j))
        test_emotion.append(torch.tensor(emotion_arr, device=device).float())
    else:
        train_song.append(dataset.joinpath("{}.mp3".format(i)))
        emotion_arr = []
        for j in range(5):
            emotion_arr.append(sheet.cell_value(i, 2 + j))
        train_emotion.append(torch.tensor(emotion_arr))

print(len(train_song), len(test_song))
print(len(train_emotion), len(test_emotion))

In [12]:
for sound_file in data_path.iterdir():
    if ".mp3" in str(sound_file):
        print(sound_file)
        audio, sample_rate = librosa.load(str(sound_file), res_type='kaiser_fast')
        
    

SongEmotionDataset/249.mp3
SongEmotionDataset/37.mp3
SongEmotionDataset/120.mp3
SongEmotionDataset/221.mp3
SongEmotionDataset/347.mp3
SongEmotionDataset/247.mp3
SongEmotionDataset/183.mp3
SongEmotionDataset/214.mp3
SongEmotionDataset/45.mp3
SongEmotionDataset/390.mp3
SongEmotionDataset/275.mp3
SongEmotionDataset/121.mp3
SongEmotionDataset/262.mp3
SongEmotionDataset/182.mp3
SongEmotionDataset/386.mp3
SongEmotionDataset/235.mp3
SongEmotionDataset/126.mp3
SongEmotionDataset/372.mp3
SongEmotionDataset/35.mp3
SongEmotionDataset/283.mp3
SongEmotionDataset/14.mp3
SongEmotionDataset/333.mp3
SongEmotionDataset/265.mp3
SongEmotionDataset/62.mp3
SongEmotionDataset/211.mp3
SongEmotionDataset/79.mp3
SongEmotionDataset/171.mp3
SongEmotionDataset/60.mp3
SongEmotionDataset/61.mp3
SongEmotionDataset/293.mp3
SongEmotionDataset/357.mp3
SongEmotionDataset/209.mp3
SongEmotionDataset/396.mp3
SongEmotionDataset/328.mp3
SongEmotionDataset/308.mp3
SongEmotionDataset/201.mp3
SongEmotionDataset/173.mp3
SongEmoti

In [13]:
audio, sample_rate = librosa.load("SongEmotionDataset/1.mp3", res_type='kaiser_fast')
# [print(x) for x in audio]

#convert audio into 2d array
mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
# mfccsscaled = np.mean(mfccs.T,axis=0)
print(mfccs.shape, audio.shape)
mfccs



(40, 2586) (1323648,)


array([[-5.30341797e+02, -4.07741577e+02, -3.27536621e+02, ...,
        -2.39811523e+02, -1.96744080e+02, -1.44711777e+02],
       [ 5.81265569e-01,  1.03006027e+02,  1.29354553e+02, ...,
         1.48707626e+02,  1.45873001e+02,  1.28202530e+02],
       [ 4.58764762e-01,  7.53921986e+00, -1.18814125e+01, ...,
        -2.51551704e+01, -1.92207527e+01, -1.79366188e+01],
       ...,
       [ 3.11299562e-01, -1.29907084e+00,  1.18818974e+00, ...,
        -6.58579540e+00, -3.34302998e+00, -4.75482178e+00],
       [ 2.23848164e-01, -3.19489312e+00, -2.78556681e+00, ...,
        -1.36089420e+01, -6.40699673e+00, -5.27228928e+00],
       [ 8.67742151e-02,  1.31472754e+00, -1.41885233e+00, ...,
         3.34440261e-01,  1.14392626e+00, -3.62402201e-02]], dtype=float32)

In [5]:
audio_tensor = torch.tensor(audio)
audio_tensor
audio_tensor.shape

torch.Size([1323648])

In [10]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv1d(1, 128, 80, 4)
        self.bn1 = nn.BatchNorm1d(128)
        self.pool1 = nn.MaxPool1d(4)
        self.conv2 = nn.Conv1d(128, 128, 3)
        self.bn2 = nn.BatchNorm1d(128)
        self.pool2 = nn.MaxPool1d(4)
        self.conv3 = nn.Conv1d(128, 256, 3)
        self.bn3 = nn.BatchNorm1d(256)
        self.pool3 = nn.MaxPool1d(4)
        self.conv4 = nn.Conv1d(256, 512, 3)
        self.bn4 = nn.BatchNorm1d(512)
        self.pool4 = nn.MaxPool1d(4)
        self.avgPool = nn.AvgPool1d(30) #input should be 512x30 so this outputs a 512x1
        self.fc1 = nn.Linear(512, 5)
        
    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(self.bn1(x))
        x = self.pool1(x)
        x = self.conv2(x)
        x = F.relu(self.bn2(x))
        x = self.pool2(x)
        x = self.conv3(x)
        x = F.relu(self.bn3(x))
        x = self.pool3(x)
        x = self.conv4(x)
        x = F.relu(self.bn4(x))
        x = self.pool4(x)
        x = self.avgPool(x)
        x = x.permute(0, 2, 1) #change the 512x1 to 1x512
        x = self.fc1(x)
        return F.log_softmax(x, dim = 2)

model = Net()
model.to(device)
print(model)

Net(
  (conv1): Conv1d(1, 128, kernel_size=(80,), stride=(4,))
  (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool1): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv1d(128, 128, kernel_size=(3,), stride=(1,))
  (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool2): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv1d(128, 256, kernel_size=(3,), stride=(1,))
  (bn3): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool3): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (conv4): Conv1d(256, 512, kernel_size=(3,), stride=(1,))
  (bn4): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool4): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (avgPool): AvgPool1d(kernel_size=(30,), stride=(30,), 

In [None]:
train_loader = torch.utils.Dataloader()
test_loader = torch.utils.DataLoader()

optimizer = optim.Adam(model.parameters(), lr = 0.01, weight_decay = 0.0001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size = 20, gamma = 0.1)

In [None]:
def train():
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        data.unsqueeze_(1)
        data = data.requires_grad_() #set requires_grad to True for training
        output = model(data)
        output = output.view(-1, len(emotions))
#         print(output.shape, target.shape)
#         print(output, target)
        loss = F.kl_div(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % log_interval == 0: #print training stats
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss))

In [None]:
def test(model, epoch):
    model.eval()
    correct = 0
    for data, target in test_loader:
        data.unsqueeze_(1)
        output = model(data)
        output = output.permute(1, 0, 2)
        pred = output.max(2)[1] # get the index of the max log-probability
        correct += pred.eq(target.max(1)[1]).cpu().sum().item()
    print('\nTest set: Accuracy: {}/{} ({:.0f}%)\n'.format(
        correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))