In [2]:
import xlrd
from pathlib import Path
import pandas as pd
import numpy as np
from numba import decorators
import librosa

import os
from PIL import Image
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchaudio
from torchvision import datasets, transforms
from torch.utils.data import Dataset


In [3]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cpu


In [4]:
# Load the 'normalized' sheet of the label workbook into a DataFrame.
# The path is relative to the notebook's working directory.
# NOTE(review): `df` is not referenced by the cells visible below, which read
# the workbook through xlrd instead - confirm this load is still needed.

df = pd.read_excel(r'data/data.xlsx', sheet_name='normalized')
# print(df)

In [5]:
## LOADING IN DATASETS

# All directories are resolved against the notebook's current working directory.
dataset = Path.cwd() / "SongEmotionDataset"  # .mp3 files read by saveimages()
datasheet = Path.cwd() / "data"  # for csua
imagepath = Path.cwd() / "SongEmotionDatasetImages"  # .png files written by saveimages()
cmap = plt.get_cmap('inferno')

# emotion labels: open the workbook once and keep handles to the two sheets
# used below (index 3 feeds get_data, index 1 feeds get_val_data).
label_loc = datasheet / "data.xlsx"
wb = xlrd.open_workbook(label_loc)
sheet, val_sheet = wb.sheet_by_index(3), wb.sheet_by_index(1)

# emotion arr: class names; len(emotions) sets the width of the classifier head.
# emotions = ["amazement", "calmness", "power", "joyful activation", "sadness"]
emotions = [
    "solemnity",
    "tenderness",
    "nostalgia",
    "calmness",
    "power",
    "joyful activation",
    "tension",
    "sadness",
]

In [9]:
def saveimages():
    """Render a spectrogram PNG for every .mp3 file found in `dataset`.

    Each track is loaded with torchaudio, averaged over dim 0 (the channel
    axis returned by torchaudio.load) into a mono signal, drawn with
    `plt.specgram` using the module-level `cmap`, and saved into `imagepath`
    under the same base name with a .png extension.
    """
    # Create the output directory up front so plt.savefig cannot fail on a
    # fresh checkout where it does not exist yet.
    imagepath.mkdir(parents=True, exist_ok=True)

    for filename in os.listdir(dataset):
        if not filename.endswith(".mp3"):
            continue

        songname = dataset.joinpath(filename)
        # The returned sample rate is not used by the plot below.
        waveform, _sample_rate = torchaudio.load(songname)
        mono_waveform = torch.mean(waveform, dim=0)

        # NOTE(review): Fs is hard-coded to 2 rather than the file's sample
        # rate. Kept as-is so newly rendered images match previously rendered
        # ones - confirm this is intended.
        plt.specgram(mono_waveform.numpy(), NFFT=2048, Fs=2, Fc=0, noverlap=128,
                     cmap=cmap, sides='default', mode='default', scale='dB')
        plt.axis('off')
        # with_suffix swaps only the trailing extension, unlike str.replace,
        # which would also rewrite an ".mp3" occurring earlier in the name.
        plt.savefig(imagepath.joinpath(filename).with_suffix(".png"))
        plt.clf()

    # Close the (now empty) figure so the notebook does not display it.
    plt.close()

saveimages()

  Z = 10. * np.log10(spec)


<Figure size 432x288 with 0 Axes>

In [6]:
train_percentage = 0.8
# Fixed seed so the shuffled row order (and therefore the train/test split)
# is the same on every Restart & Run All.
SPLIT_SEED = 0

# Placeholders; reassigned further down in this cell once the split is made.
train_song = []
test_song = []
train_img = []
test_img = []
train_emotion = []
test_emotion = []

# Rows 1..225 of `sheet` (row 0 is skipped - presumably a header row).
row_indexes = np.arange(1,226)
# A private RandomState keeps the shuffle reproducible without touching
# numpy's global random state.
np.random.RandomState(SPLIT_SEED).shuffle(row_indexes)

def get_data(indexes):
    """Collect audio paths, image paths and labels for the given sheet rows.

    Args:
        indexes: iterable of row numbers into the module-level `sheet`.

    Returns:
        (song, img, emotion): three parallel lists holding, for each requested
        row in the order given, the .mp3 path under `dataset`, the .png path
        under `imagepath`, and the integer label (column 13 minus one).
    """
    song, img, emotion = [], [], []

    for row_idx in indexes:
        # Column 13 minus one is the label - presumably a 1-based class id
        # shifted to 0-based; column 0 supplies the number used in file names.
        label = int(sheet.cell_value(row_idx, 13) - 1)
        track = int(sheet.cell_value(row_idx, 0))

        song.append(dataset.joinpath("{}.mp3".format(track)))
        img.append(imagepath.joinpath("{}.png".format(track)))
        emotion.append(label)

    return song, img, emotion
    
song, img, emotion = get_data(row_indexes)

# Position of the first test sample in each list. For an integer index i,
# `i < n * p` holds exactly when `i < ceil(n * p)`, so slicing at that point
# yields the same partition as filtering index by index.
song_cut = int(np.ceil(len(song) * train_percentage))
img_cut = int(np.ceil(len(img) * train_percentage))
emotion_cut = int(np.ceil(len(emotion) * train_percentage))

train_song, test_song = song[:song_cut], song[song_cut:]
train_img, test_img = img[:img_cut], img[img_cut:]
train_emotion, test_emotion = emotion[:emotion_cut], emotion[emotion_cut:]

def get_val_data():
    """Collect audio paths, image paths and labels for rows 1..400 of `val_sheet`.

    Returns:
        (val_song, val_img, val_emotion): three parallel lists holding the
        .mp3 path under `dataset`, the .png path under `imagepath`, and the
        integer label (column 13 minus one) for each row.
    """
    # NOTE(review): these rows come from `val_sheet`, a different workbook
    # sheet from the one used for the train/test split; whether the two sheets
    # share tracks is not visible here - confirm there is no overlap.
    songs, images, labels = [], [], []

    for row_idx in range(1, 401):
        label = int(val_sheet.cell_value(row_idx, 13) - 1)
        track = int(val_sheet.cell_value(row_idx, 0))

        songs.append(dataset.joinpath("{}.mp3".format(track)))
        images.append(imagepath.joinpath("{}.png".format(track)))
        labels.append(label)

    return songs, images, labels

val_song, val_img, val_emotion = get_val_data()

In [7]:
# train_emotion

In [8]:
class SongEmotionDataset(Dataset):
    """
    Song Emotion Dataset backed by pre-rendered spectrogram images.

    Each item is an (image_tensor, label_tensor) pair: the file at `png[index]`
    is opened with PIL, converted to RGB and passed through `transform`.
    Items are kept in an in-memory cache after their first access.
    """

    def __init__(self, png, labels, transform=None):
        """
        Args:
            png: list of paths to the spectrogram image files
            labels: list of labels, parallel to `png`
            transform: optional callable applied to the RGB PIL image; when
                None, the default resize / center-crop / to-tensor / normalize
                pipeline is built on first use
        """
        self.labels = labels
        self.png = png
        # Previously this argument was accepted but silently ignored.
        self.transform = transform
        self.cache = {}

    @staticmethod
    def _default_transform():
        """Build the preprocessing pipeline used when no transform is supplied."""
        return transforms.Compose([
            transforms.Resize(224),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        if index not in self.cache:
            # Build the default pipeline once, lazily, instead of on every
            # cache miss.
            if self.transform is None:
                self.transform = self._default_transform()

            # The context manager releases the file handle once the pixels
            # have been converted.
            with Image.open(self.png[index]) as raw:
                songimage = self.transform(raw.convert('RGB'))

            # The label stays on the CPU, like the image; the train/test/val
            # loops move both to `device` themselves. Creating device tensors
            # here would tie the dataset to a global and get in the way of
            # DataLoader worker processes and pinned memory.
            label = torch.tensor(self.labels[index])
            self.cache[index] = (songimage, label)

        return self.cache[index]

In [13]:
# Feature-extraction setup: start from a pretrained ResNet-50 and freeze every
# existing weight, so only the replacement head created below is trainable.
# (Comment out the loop to fine-tune the whole network instead.)
model_conv = torchvision.models.resnet50(pretrained=True)
for param in model_conv.parameters():
    param.requires_grad_(False)

# Swap the final fully connected layer for a fresh one with one output per
# entry in `emotions`; new layers require gradients by default.
num_ftrs = model_conv.fc.in_features
model_conv.fc = nn.Linear(num_ftrs, len(emotions))
# model_conv = nn.Sequential(model_conv.fc, nn.Linear(1024, len(emotions)))

# Module.to() returns the module itself, so `model` and `model_conv` are the
# same object, now living on `device`.
model = model_conv = model_conv.to(device)
print(model)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [14]:
train_set = SongEmotionDataset(train_img, train_emotion)
test_set = SongEmotionDataset(test_img, test_emotion)
val_set = SongEmotionDataset(val_img, val_emotion)
print("Train set size: " + str(len(train_set)))
print("Test set size: " + str(len(test_set)))
print("Val set size: " + str(len(val_set)))

# NOTE(review): `device` is a torch.device, and comparing it with the string
# 'cuda' does not appear to ever be True, so `kwargs` is presumably always {}.
# `device.type == 'cuda'` would be the usual check, but only switch to it after
# confirming the dataset hands back CPU tensors: worker processes and pinned
# memory do not work with CUDA tensors created inside __getitem__.
kwargs = {'num_workers': 2, 'pin_memory': True} if device == 'cuda' else {} #needed for using datasets on gpu

# Only the training loader is shuffled; accuracy does not depend on the order
# in which evaluation batches are visited.
train_loader = torch.utils.data.DataLoader(train_set, batch_size = 16, shuffle = True, **kwargs)
test_loader = torch.utils.data.DataLoader(test_set, batch_size = 16, shuffle = False, **kwargs)
val_loader = torch.utils.data.DataLoader(val_set, batch_size = 16, shuffle = False, **kwargs)


# Hand the optimizer only the parameters that still require gradients (the
# frozen backbone weights never receive one).
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = optim.Adam(trainable_params, lr = 0.001)
# scheduler = optim.lr_scheduler.StepLR(optimizer, step_size = 20, gamma = 0.1)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer)

Train set size: 180
Test set size: 45
Val set size: 400


In [15]:
def train(model, epoch):
    """Run one training epoch over the module-level `train_loader`.

    Uses the module-level `optimizer`, `scheduler`, `device` and
    `log_interval`. Every `log_interval` batches the current batch loss is
    printed.

    Args:
        model: the network to optimise; switched to training mode.
        epoch: epoch number, used only in the progress message.

    Returns:
        The mean of the per-batch cross-entropy losses for this epoch, or
        None when the loader produced no batches.
    """
    model.train()
    running_loss = 0.0
    num_batches = 0

    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        data = data.to(device)
        target = target.to(device)

        output = model(data)
        loss = F.cross_entropy(output, target)

        loss.backward()
        optimizer.step()

        # .item() detaches the scalar so no autograd graph is kept alive by
        # the running total or the log message.
        batch_loss = loss.item()
        running_loss += batch_loss
        num_batches += 1

        if batch_idx % log_interval == 0: #print training stats
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), batch_loss))

    # Nothing was trained, so there is no metric to give the scheduler
    # (previously this path raised NameError on `loss`).
    if num_batches == 0:
        return None

    # Step the plateau scheduler on the epoch average rather than on whatever
    # the last mini-batch happened to produce.
    mean_loss = running_loss / num_batches
    scheduler.step(mean_loss)
    return mean_loss

In [16]:
def test(model, epoch):
    """Evaluate `model` on the module-level `test_loader` and print its accuracy.

    Args:
        model: the network to evaluate; switched to evaluation mode.
        epoch: not used; kept so existing `test(model, epoch)` calls still work.
    """
    model.eval()
    correct = 0
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        for data, target in test_loader:
            data = data.to(device)
            target = target.to(device)
            output = model(data)

            pred = output.argmax(dim=1) # index of the highest-scoring class
            # .item() keeps the running count a plain Python int.
            correct += pred.eq(target).sum().item()

    total = len(test_loader.dataset)
    # Guard the percentage against an empty dataset.
    percent = 100. * correct / total if total else 0.0
    print('\nTest set: Accuracy: {}/{} ({:.0f}%)\n'.format(
        correct, total, percent))

In [17]:
import warnings

# Silence all warnings for the duration of the run.
warnings.filterwarnings("ignore")

# train() prints the batch loss every `log_interval` batches.
log_interval = 5

# Epochs are numbered 1 through 39; each is followed by a test-set evaluation.
for epoch in range(1, 40):
    print("training epoch {}".format(epoch))
    train(model, epoch)
    test(model, epoch)

training epoch 1

Test set: Accuracy: 6/45 (13%)

training epoch 2

Test set: Accuracy: 7/45 (16%)

training epoch 3

Test set: Accuracy: 12/45 (27%)

training epoch 4

Test set: Accuracy: 7/45 (16%)

training epoch 5

Test set: Accuracy: 12/45 (27%)

training epoch 6

Test set: Accuracy: 14/45 (31%)

training epoch 7

Test set: Accuracy: 13/45 (29%)

training epoch 8

Test set: Accuracy: 11/45 (24%)

training epoch 9

Test set: Accuracy: 16/45 (36%)

training epoch 10

Test set: Accuracy: 11/45 (24%)

training epoch 11

Test set: Accuracy: 13/45 (29%)

training epoch 12

Test set: Accuracy: 9/45 (20%)

training epoch 13

Test set: Accuracy: 17/45 (38%)

training epoch 14

Test set: Accuracy: 14/45 (31%)

training epoch 15

Test set: Accuracy: 9/45 (20%)

training epoch 16

Test set: Accuracy: 15/45 (33%)

training epoch 17

Test set: Accuracy: 13/45 (29%)

training epoch 18

Test set: Accuracy: 14/45 (31%)

training epoch 19

Test set: Accuracy: 16/45 (36%)

training epoch 20

Test se

In [18]:
def val(model):
    """Evaluate `model` on the module-level `val_loader` and print its accuracy.

    Args:
        model: the network to evaluate; switched to evaluation mode.
    """
    model.eval()
    correct = 0
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        for data, target in val_loader:
            data = data.to(device)
            target = target.to(device)
            output = model(data)

            pred = output.argmax(dim=1) # index of the highest-scoring class
            # .item() keeps the running count a plain Python int.
            correct += pred.eq(target).sum().item()

    total = len(val_loader.dataset)
    # Guard the percentage against an empty dataset.
    percent = 100. * correct / total if total else 0.0
    print('\nVal set: Accuracy: {}/{} ({:.0f}%)\n'.format(
        correct, total, percent))

In [19]:
# Report accuracy on the loader built from get_val_data()'s rows.
val(model)


Val set: Accuracy: 197/400 (49%)



In [20]:
# Persist only the classifier head (model.fc); the module object itself is
# saved, not just its state_dict.
# NOTE(review): the file name says "mfcc", but the visible training cells feed
# spectrogram images to the model - confirm the name is intentional.
torch.save(model.fc, './song_mfcc_model_fc.pth')

Inference

In [51]:
# Same preprocessing steps the SongEmotionDataset applies to its images.
transform = transforms.Compose([
        transforms.Resize(224),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]
)

def image_loader(image_name):
    """Load one image from ./SongEmotionDatasetImages/ as a model-ready batch.

    Args:
        image_name: file name inside the SongEmotionDatasetImages directory.

    Returns:
        The preprocessed image on `device` with a leading batch dimension of 1.
    """
    # The context manager releases the file handle once the pixels are read.
    with Image.open("./SongEmotionDatasetImages/" + image_name) as raw:
        image = raw.convert("RGB")
    # transform() already yields a tensor; re-wrapping it with torch.tensor()
    # only made an extra copy and triggered a warning.
    return transform(image).to(device).unsqueeze(0)
    
# Use a dedicated name: rebinding `img` would overwrite the list of image
# paths produced by the split cell.
sample_input = image_loader("228.png")
model.eval()
# Inference only, so skip autograd bookkeeping; prints one score per class.
with torch.no_grad():
    print(model(sample_input))

tensor([[ 0.7444, -0.4518, -1.5000,  2.4692,  5.8717, -1.9363, -2.6768, -4.4650]],
       device='cuda:0', grad_fn=<AddmmBackward>)


## NOTES

Below are the MFCC notes and miscellaneous scratch code.

In [95]:
# Run the test-set evaluation six times in a row; the loop index is passed as
# the `epoch` argument of test().
for i in range(6):
    test(model, i)

tensor([3, 5, 5, 5, 7, 0, 3, 0])
tensor([7, 7, 7, 7, 7, 7, 7, 7])
tensor(1)
tensor([3, 6, 6, 0, 7, 0, 5, 3])
tensor([7, 7, 7, 7, 7, 7, 7, 7])
tensor(2)
tensor([2, 2, 5, 4, 6, 1, 2, 1])
tensor([7, 7, 7, 7, 7, 7, 7, 7])
tensor(2)
tensor([4, 2, 4, 0, 0, 7, 4, 6])
tensor([7, 7, 7, 7, 7, 7, 7, 7])
tensor(3)
tensor([4, 2, 0, 5, 1, 3, 5, 4])
tensor([7, 7, 7, 7, 7, 7, 7, 7])
tensor(3)
tensor([6, 0, 2, 2, 4])
tensor([7, 7, 7, 7, 7])
tensor(3)

Test set: Accuracy: 3/45 (7%)

tensor([3, 5, 1, 0, 1, 6, 3, 4])
tensor([7, 7, 7, 7, 7, 7, 7, 7])
tensor(0)
tensor([5, 3, 4, 2, 6, 4, 0, 6])
tensor([7, 7, 7, 7, 7, 7, 7, 7])
tensor(0)
tensor([0, 2, 0, 3, 5, 7, 3, 5])
tensor([7, 7, 7, 7, 7, 7, 7, 7])
tensor(1)
tensor([2, 4, 0, 1, 7, 6, 5, 4])
tensor([7, 7, 7, 7, 7, 7, 7, 7])
tensor(2)
tensor([4, 0, 2, 5, 2, 2, 0, 5])
tensor([7, 7, 7, 7, 7, 7, 7, 7])
tensor(2)
tensor([2, 0, 7, 6, 4])
tensor([7, 7, 7, 7, 7])
tensor(3)

Test set: Accuracy: 3/45 (7%)

tensor([0, 3, 2, 3, 0, 6, 1, 4])
tensor([7, 7, 7, 7, 7, 7, 7

In [None]:
# Print model's state_dict: one line per entry with its name and tensor size.
print("Model's state_dict:")
for param_tensor, weights in model.state_dict().items():
    print(param_tensor, "\t", weights.size())

# Print optimizer's state_dict: one line per top-level entry.
print("Optimizer's state_dict:")
for var_name, value in optimizer.state_dict().items():
    print(var_name, "\t", value)

# Save the full model's weights.
torch.save(model.state_dict(), 'dataset_model_soundemotion.pt')

In [13]:
# Scratch experiment: load a single track with librosa, using the
# 'kaiser_fast' resampling type.
audio, sample_rate = librosa.load("SongEmotionDataset/1.mp3", res_type='kaiser_fast')
# [print(x) for x in audio]

# Convert the 1-D audio signal into a 2-D array of 40 MFCCs; the printed
# shapes below show (40, n_frames) for the features and (n_samples,) for audio.
mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
# mfccsscaled = np.mean(mfccs.T,axis=0)
print(mfccs.shape, audio.shape)
mfccs



(40, 2586) (1323648,)


array([[-5.30341797e+02, -4.07741577e+02, -3.27536621e+02, ...,
        -2.39811523e+02, -1.96744080e+02, -1.44711777e+02],
       [ 5.81265569e-01,  1.03006027e+02,  1.29354553e+02, ...,
         1.48707626e+02,  1.45873001e+02,  1.28202530e+02],
       [ 4.58764762e-01,  7.53921986e+00, -1.18814125e+01, ...,
        -2.51551704e+01, -1.92207527e+01, -1.79366188e+01],
       ...,
       [ 3.11299562e-01, -1.29907084e+00,  1.18818974e+00, ...,
        -6.58579540e+00, -3.34302998e+00, -4.75482178e+00],
       [ 2.23848164e-01, -3.19489312e+00, -2.78556681e+00, ...,
        -1.36089420e+01, -6.40699673e+00, -5.27228928e+00],
       [ 8.67742151e-02,  1.31472754e+00, -1.41885233e+00, ...,
         3.34440261e-01,  1.14392626e+00, -3.62402201e-02]], dtype=float32)

In [10]:
# audio_tensor = torch.tensor(audio)
# audio_tensor
# audio_tensor.shape

In [16]:
# for sound_file in data_path.iterdir():
#     if ".mp3" in str(sound_file):
#         print(sound_file)
#         audio, sample_rate = librosa.load(str(sound_file), res_type='kaiser_fast')
        
    

In [10]:
# train_percentage = 0.8

# min_train = min_total*train_percentage
# min_test = min_total - min_train

# train_totals = torch.zeros(len(emotions))

# while 


# for i in range(1, 401):
#     count_total = sheet.cell_value(i, 7)
    
#     emotions_counter = [0 for e in emotions]
#     if i % 5 == 0:
#         test_song.append(dataset.joinpath("{}.mp3".format(i)))
#         emotion_arr = []
#         for j in range(5):
#             emotion_arr.append(sheet.cell_value(i, 2 + j))
#         test_emotion.append(torch.tensor(emotion_arr, device=device).float())
        
#     emotions_counter = [0 for e in emotions]
#     else:
#         train_song.append(dataset.joinpath("{}.mp3".format(i)))
#         emotion_arr = []
#         for j in range(5):
#             emotion_arr.append(sheet.cell_value(i, 2 + j))
#         train_emotion.append(torch.tensor(emotion_arr, device=device))

# print(len(train_song), len(test_song))
# print(len(train_emotion), len(test_emotion))

In [None]:
# def train(model, epoch):
#     model.train()
#     for batch_idx, (data, target) in enumerate(train_loader):
#         print(data)
#         optimizer.zero_grad()
#         data.unsqueeze_(1)
#         data = data.requires_grad_() #set requires_grad to True for training
#         output = model(data)
# #         output = output.view(-1, len(emotions))
# #         print(output.shape, target.shape)
# #         print(output, target)
# #         loss = F.kl_div(output, target)
#         kl = nn.KLDivLoss()
#         loss = kl(output, target)
# #         loss = F.cross_entropy(output, target)
# #         loss = nn.CrossEntropyLoss(output, target)
# #         loss = F.nll_loss(output, target)
#         loss.backward()
#         print(loss)
#         optimizer.step()
#         print(output)
#         print(target)
#         print("\n")
# #         scheduler.step()
#         if batch_idx % log_interval == 0: #print training stats
#             print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
#                 epoch, batch_idx * len(data), len(train_loader.dataset),
#                 100. * batch_idx / len(train_loader), loss))

In [15]:
# train_percentage = 0.8
# allowed_exceedance = 0

# train_song = []
# test_song = []
# train_emotion = []
# test_emotion = []

# row_indexes = np.arange(1,401)
# np.random.shuffle(row_indexes)

# # train_indexes = [row_indexes[i] for i in range(len(row_indexes)) if i < len(row_indexes)*train_percentage]
# # test_indexes = [row_indexes[i] for i in range(len(row_indexes)) if i >= len(row_indexes)*train_percentage]

# def get_data(indexes):
#     song = []
#     emotion = []
    
#     totals = torch.zeros(len(emotions), device=device).float()
#     for x in indexes:    
#         row = torch.tensor([sheet.cell_value(x, 2 + j) for j in range(5)], device=device).float()
#         totals += F.softmax(row)

#     min_total = torch.min(totals)
#     print(totals)
    
#     totals = torch.zeros(len(emotions), device=device).float()
#     for x in indexes:
#         row = torch.tensor([sheet.cell_value(x, 2 + j) for j in range(5)], device=device).float()
        
#         if torch.max(totals + row) < min_total*(1 + allowed_exceedance):
#             song.append(dataset.joinpath("{}.mp3".format(x)))
#             emotion.append(row)
#             totals += F.softmax(row)
            
#     print(totals)
#     return song, emotion
    
# song, emotion = get_data(row_indexes)
# # test_song, test_emotion = get_data(test_indexes)

# train_song = [song[i] for i in range(len(song)) if i < len(song)*train_percentage]
# test_song = [song[i] for i in range(len(song)) if i >= len(song)*train_percentage]

# train_emotion = [emotion[i] for i in range(len(emotion)) if i < len(emotion)*train_percentage]
# test_emotion = [emotion[i] for i in range(len(emotion)) if i >= len(emotion)*train_percentage]

# num_maxes = [0 for _ in emotions]

# for row in emotion:
#     i = torch.argmax(row)
#     num_maxes[i] += 1
    
# print(num_maxes)

tensor([ 11.4343, 157.9053,  55.5535, 120.1634,  54.9434], device='cuda:0')
tensor([3.1017, 9.4848, 9.0917, 8.7741, 8.5477], device='cuda:0')
[1, 9, 11, 10, 8]


