In [1]:
import xlrd
from pathlib import Path
import pandas as pd
import numpy as np
from numba import decorators
import librosa

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import Dataset


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [33]:
import csv

song_to_label = {}

path = "otherdata/songs/"

with open('otherdata/mark.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    for row in csv_reader:
        
        raw_scores = [int(raw) for raw in row[3].split('_')]
        score = (raw_scores[0]*-1 + raw_scores[2]) / (raw_scores[0] + raw_scores[1] + raw_scores[2])
        
        song_to_label[path + row[0] + ".m4a"] = score
        
songs = list(song_to_label.keys())
labels = list(song_to_label.values())

In [34]:
train_percentage = 0.8

index = round(len(songs)*train_percentage)

train_songs = songs[:index]
test_songs = songs[index:]

train_labels = labels[:index]
test_labels = labels[index:]

In [35]:
class SongEmotionDataset(Dataset):
    """
    Song Emotion Dataset. Uses librosa to process mp3 files.
    Takes first 20 seconds, and samples every 10 to get processed audio tensor.
    """

    def __init__(self, songs, labels, transform=None):
        """
        Args:
            mp3: list of paths to mp3 files
            labels: list of labels
        """
        self.labels = labels
        self.songs = songs
        self.cache = {}
        
    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        if index not in self.cache.keys():
#             print("index of " + str(index) + " was cached!")
            data, rate = librosa.load(self.songs[index], sr=16000, duration=10)
            assert rate == 16000
            sample_tensor = torch.tensor(data, device=device).float()
            downsampled_tensor = sample_tensor[::10]

            self.cache[index] = (downsampled_tensor, F.softmax(self.labels[index]))
            
        return self.cache[index]

In [36]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv1d(1, 128, 80, 4)
        self.bn1 = nn.BatchNorm1d(128)
        self.pool1 = nn.MaxPool1d(4)
        self.conv2 = nn.Conv1d(128, 128, 3)
        self.bn2 = nn.BatchNorm1d(128)
        self.pool2 = nn.MaxPool1d(4)
        self.conv3 = nn.Conv1d(128, 256, 3)
        self.bn3 = nn.BatchNorm1d(256)
        self.pool3 = nn.MaxPool1d(4)
        self.conv4 = nn.Conv1d(256, 512, 3)
        self.bn4 = nn.BatchNorm1d(512)
        self.pool4 = nn.MaxPool1d(4)
        self.avgPool = nn.AvgPool1d(14) #input should be 512x30 so this outputs a 512x1
        self.fc1 = nn.Linear(512, 5)
        
    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(self.bn1(x))
        x = self.pool1(x)
        x = self.conv2(x)
        x = F.relu(self.bn2(x))
        x = self.pool2(x)
        x = self.conv3(x)
        x = F.relu(self.bn3(x))
        x = self.pool3(x)
        x = self.conv4(x)
        x = F.relu(self.bn4(x))
        x = self.pool4(x)
        x = self.avgPool(x)
        x = x.permute(0, 2, 1) #change the 512x1 to 1x512
        x = self.fc1(x)
        return F.log_softmax(x, dim = 2)

model = Net()
model.to(device)
print(model)

Net(
  (conv1): Conv1d(1, 128, kernel_size=(80,), stride=(4,))
  (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool1): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv1d(128, 128, kernel_size=(3,), stride=(1,))
  (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool2): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv1d(128, 256, kernel_size=(3,), stride=(1,))
  (bn3): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool3): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (conv4): Conv1d(256, 512, kernel_size=(3,), stride=(1,))
  (bn4): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool4): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (avgPool): AvgPool1d(kernel_size=(14,), stride=(14,), 

In [37]:
train_set = SongEmotionDataset(train_songs, train_labels)
test_set = SongEmotionDataset(test_songs, test_labels)
print("Train set size: " + str(len(train_set)))
print("Test set size: " + str(len(test_set)))

kwargs = {'num_workers': 1, 'pin_memory': True} if device == 'cuda' else {} #needed for using datasets on gpu
train_loader = torch.utils.data.DataLoader(train_set, batch_size = 8, shuffle = True, **kwargs)
test_loader = torch.utils.data.DataLoader(test_set, batch_size = 8, shuffle = True, **kwargs)

optimizer = optim.Adam(model.parameters(), lr = 0.01, weight_decay = 0.0001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size = 20, gamma = 0.1)

Train set size: 3327
Test set size: 832


In [38]:
def train(model, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        data.unsqueeze_(1)
        data = data.requires_grad_() #set requires_grad to True for training
        output = model(data)
        output = output.view(-1, len(emotions))
#         print(output.shape, target.shape)
#         print(output, target)
        loss = F.kl_div(output, target)
        loss.backward()
        optimizer.step()
#         scheduler.step()
        if batch_idx % log_interval == 0: #print training stats
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss))

In [39]:
def test(model, epoch):
    model.eval()
    correct = 0
    for data, target in test_loader:
        data.unsqueeze_(1)
        output = model(data)
        output = output.permute(1, 0, 2)
        pred = output.max(2)[1] # get the index of the max log-probability
        correct += pred.eq(target.max(1)[1]).cpu().sum().item()
    print('\nTest set: Accuracy: {}/{} ({:.0f}%)\n'.format(
        correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

In [40]:
#w caching & data processing
import warnings

log_interval = 5
warnings.filterwarnings("ignore")
for epoch in range(1, 41):
    print("training epoch " + str(epoch))
    if epoch == 31:
        print("First round of training complete. Setting learn rate to 0.001.")
#     scheduler.step()
    train(model, epoch)
    scheduler.step()
    test(model, epoch)

training epoch 1


FileNotFoundError: [Errno 2] No such file or directory: 'otherdata/songs/TRAUXNG128F933AA28.m4a'

In [67]:
#W/ CACHING
import warnings

log_interval = 5
warnings.filterwarnings("ignore")
for epoch in range(1, 41):
    print("training epoch " + str(epoch))
    if epoch == 31:
        print("First round of training complete. Setting learn rate to 0.001.")
#     scheduler.step()
    train(model, epoch)
    scheduler.step()
    test(model, epoch)

training epoch 1

Test set: Accuracy: 19/80 (24%)

training epoch 2

Test set: Accuracy: 18/80 (22%)

training epoch 3

Test set: Accuracy: 23/80 (29%)

training epoch 4

Test set: Accuracy: 23/80 (29%)

training epoch 5

Test set: Accuracy: 20/80 (25%)

training epoch 6

Test set: Accuracy: 22/80 (28%)

training epoch 7

Test set: Accuracy: 27/80 (34%)

training epoch 8

Test set: Accuracy: 20/80 (25%)

training epoch 9

Test set: Accuracy: 23/80 (29%)

training epoch 10

Test set: Accuracy: 24/80 (30%)

training epoch 11

Test set: Accuracy: 18/80 (22%)

training epoch 12

Test set: Accuracy: 24/80 (30%)

training epoch 13

Test set: Accuracy: 18/80 (22%)

training epoch 14

Test set: Accuracy: 27/80 (34%)

training epoch 15

Test set: Accuracy: 18/80 (22%)

training epoch 16

Test set: Accuracy: 24/80 (30%)

training epoch 17

Test set: Accuracy: 31/80 (39%)

training epoch 18

Test set: Accuracy: 21/80 (26%)

training epoch 19

Test set: Accuracy: 30/80 (38%)

training epoch 20



Test set: Accuracy: 30/80 (38%)

training epoch 21

Test set: Accuracy: 29/80 (36%)

training epoch 22

Test set: Accuracy: 29/80 (36%)

training epoch 23

Test set: Accuracy: 30/80 (38%)

training epoch 24

Test set: Accuracy: 29/80 (36%)

training epoch 25

Test set: Accuracy: 28/80 (35%)

training epoch 26

Test set: Accuracy: 27/80 (34%)

training epoch 27

Test set: Accuracy: 30/80 (38%)

training epoch 28

Test set: Accuracy: 28/80 (35%)

training epoch 29

Test set: Accuracy: 31/80 (39%)

training epoch 30

Test set: Accuracy: 27/80 (34%)

training epoch 31
First round of training complete. Setting learn rate to 0.001.

Test set: Accuracy: 26/80 (32%)

training epoch 32

Test set: Accuracy: 26/80 (32%)

training epoch 33

Test set: Accuracy: 27/80 (34%)

training epoch 34

Test set: Accuracy: 28/80 (35%)

training epoch 35

Test set: Accuracy: 23/80 (29%)

training epoch 36

Test set: Accuracy: 29/80 (36%)

training epoch 37

Test set: Accuracy: 25/80 (31%)

training epoch 38




Test set: Accuracy: 30/80 (38%)



In [None]:
#ORIGINAL
import warnings

log_interval = 5
warnings.filterwarnings("ignore")
for epoch in range(1, 41):
    print("training epoch " + str(epoch))
    if epoch == 31:
        print("First round of training complete. Setting learn rate to 0.001.")
#     scheduler.step()
    train(model, epoch)
    scheduler.step()
    test(model, epoch)

training epoch 1

Test set: Accuracy: 21/80 (26%)

training epoch 2

Test set: Accuracy: 27/80 (34%)

training epoch 3

Test set: Accuracy: 28/80 (35%)

training epoch 4

Test set: Accuracy: 28/80 (35%)

training epoch 5

Test set: Accuracy: 28/80 (35%)

training epoch 6

Test set: Accuracy: 25/80 (31%)

training epoch 7

Test set: Accuracy: 30/80 (38%)

training epoch 8

Test set: Accuracy: 28/80 (35%)

training epoch 9

Test set: Accuracy: 27/80 (34%)

training epoch 10

Test set: Accuracy: 26/80 (32%)

training epoch 11

Test set: Accuracy: 20/80 (25%)

training epoch 12

Test set: Accuracy: 28/80 (35%)

training epoch 13

Test set: Accuracy: 28/80 (35%)

training epoch 14

Test set: Accuracy: 21/80 (26%)

training epoch 15

Test set: Accuracy: 27/80 (34%)

training epoch 16

Test set: Accuracy: 11/80 (14%)

training epoch 17

Test set: Accuracy: 25/80 (31%)

training epoch 18

Test set: Accuracy: 17/80 (21%)

training epoch 19

Test set: Accuracy: 28/80 (35%)

training epoch 20



Test set: Accuracy: 24/80 (30%)

training epoch 21

Test set: Accuracy: 19/80 (24%)

training epoch 22

Test set: Accuracy: 18/80 (22%)

training epoch 23

Test set: Accuracy: 17/80 (21%)

training epoch 24

Test set: Accuracy: 18/80 (22%)

training epoch 25

Test set: Accuracy: 17/80 (21%)

training epoch 26

Test set: Accuracy: 18/80 (22%)

training epoch 27

Test set: Accuracy: 20/80 (25%)

training epoch 28

Test set: Accuracy: 20/80 (25%)

training epoch 29

Test set: Accuracy: 18/80 (22%)

training epoch 30

Test set: Accuracy: 19/80 (24%)

training epoch 31
First round of training complete. Setting learn rate to 0.001.

Test set: Accuracy: 18/80 (22%)

training epoch 32

Test set: Accuracy: 20/80 (25%)

training epoch 33


## NOTES

below is the mfccs notes / random code

In [None]:
# Print model's state_dict
print("Model's state_dict:")
for param_tensor in model.state_dict():
    print(param_tensor, "\t", model.state_dict()[param_tensor].size())
# Print optimizer's state_dict
print("Optimizer's state_dict:")
for var_name in optimizer.state_dict():
    print(var_name, "\t", optimizer.state_dict()[var_name])
torch.save(model.state_dict(), 'dataset_model_soundemotion.pt')

In [8]:
# audio, sample_rate = librosa.load("SongEmotionDataset/1.mp3", res_type='kaiser_fast')
# # [print(x) for x in audio]

# #convert audio into 2d array
# mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
# # mfccsscaled = np.mean(mfccs.T,axis=0)
# print(mfccs.shape, audio.shape)
# mfccs

In [10]:
# audio_tensor = torch.tensor(audio)
# audio_tensor
# audio_tensor.shape

In [16]:
# for sound_file in data_path.iterdir():
#     if ".mp3" in str(sound_file):
#         print(sound_file)
#         audio, sample_rate = librosa.load(str(sound_file), res_type='kaiser_fast')
        
    