In [1]:
from typing import Tuple
import pandas as pd
from tqdm import tqdm
import os
import time
import random
import pickle
import numpy as np
from os.path import isfile, join
from pydub import AudioSegment
from scipy.io import wavfile
from sklearn.metrics import confusion_matrix, classification_report

import torch
from torch import nn
from torchaudio import transforms

from models.audio_LSTMCNN import AudioLSTMCNN

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


# Constants

In [2]:
RATE = 44100
MEL_SPECTROGRAM_BUCKETS = 128
MEL_SPECTROGRAM_WINDOW_LENGTH = 224
SPECTROS_PER_SECOND = RATE // (MEL_SPECTROGRAM_WINDOW_LENGTH / 2) - 1
CHUNKS_PER_SECOND = 2
CHUNK_SIZE_IN_SPECTROS = int(SPECTROS_PER_SECOND // CHUNKS_PER_SECOND)

TRAINSET_RATIO = 0.8

In [3]:
def get_quadrant(val: float, aro: float) -> int:
    if val >= 0.5 and aro >= 0.5:
        return 1
    if val >= 0.5 and aro < 0.5:
        return 2
    if val < 0.5 and aro < 0.5:
        return 3
    if val < 0.5 and aro >= 0.5:
        return 4

<hr>

# Dataset

## Create dataset

#### Load data

In [4]:
AUDIO_FOLDER = "C:\\Users\\amity\\PycharmProjects\\aimpathy\\data\\PMEmo\\PMEmo2019\\chorus"
THAYER_ANOTATIONS_CSV = "C:\\Users\\amity\\PycharmProjects\\aimpathy\\data\\PMEmo\\PMEmo2019\\annotations\\dynamic_annotations.csv"
thayer_annotations_df = pd.read_csv(THAYER_ANOTATIONS_CSV)

In [5]:
audio_files = [f for f in os.listdir(AUDIO_FOLDER) if isfile(join(AUDIO_FOLDER, f))]
audio_data = dict()
torch_spectorgrams = dict()
spectorgrammer = transforms.MelSpectrogram(sample_rate=RATE, n_fft=(MEL_SPECTROGRAM_BUCKETS * 2 - 2), win_length=MEL_SPECTROGRAM_WINDOW_LENGTH, power=2, normalized=True)



#### Convert to Spectrograms

In [6]:
spectrograms = dict()
for audio_file in tqdm(audio_files, total=len(audio_files)):
    sound = AudioSegment.from_mp3(os.sep.join([AUDIO_FOLDER, audio_file])).set_channels(1)
    audio_file_wave = sound.export(format="wav", bitrate=RATE)
    sample_rate, samples = wavfile.read(audio_file_wave)
    spectogram = spectorgrammer(torch.from_numpy(samples/(2**15)).float().reshape((1, -1)))
    spectrograms[audio_file] = spectogram
    audio_file_wave.close()
spectrograms = {key: value for key, value in spectrograms.items() if ".wav" not in key}

100%|██████████████████████████████████████████████████████████████████████████████████████████████| 794/794 [03:17<00:00,  4.02it/s]


#### Create final dataset

In [207]:
dataset = dict()  # music_id: spectrogram, (valence, arousal)
for file_name, spectrogram in tqdm(spectrograms.items(), total=len(spectrograms)):
    music_id = int(file_name.replace(".mp3", ""))
    dataset[music_id] = list()
    max_frame_time = thayer_annotations_df[thayer_annotations_df["musicId"] == music_id]["frameTime"].max()
    if np.isnan(max_frame_time):    
        dataset.pop(music_id)
        continue
    for i in range(int(CHUNKS_PER_SECOND * max_frame_time)):
        data_df = thayer_annotations_df[(thayer_annotations_df["musicId"] == music_id) & (thayer_annotations_df["frameTime"] == i/2)]
        if data_df.empty:
            #  print(f"Skipping {musicI_id} - {i/2}")
            continue
        valence = data_df.iloc[0]["Valence(mean)"]
        arousal = data_df.iloc[0]["Arousal(mean)"]
        dataset[music_id].append((spectrogram[0, :, int((i/2-1)*CHUNK_SIZE_IN_SPECTROS): int((i/2)*CHUNK_SIZE_IN_SPECTROS)], (valence, arousal)))
    
    if len(dataset[music_id]) <= 0:
        dataset.pop(music_id)
        
music_id_to_quadrant = {key: get_quadrant(np.mean([val[1][0] for val in value]), np.mean([val[1][1] for val in value])) for key, value in dataset.items() if value}
quadrant_counters = [list(music_id_to_quadrant.values()).count(i+1) for i in range(4)]
quadrant_ratios = max(quadrant_counters)//np.array(quadrant_counters)

100%|██████████████████████████████████████████████████████████████████████████████████████████████| 793/793 [00:31<00:00, 24.93it/s]


### Divide to train and test

In [266]:
train_music_ids = random.sample(dataset.keys(), int(TRAINSET_RATIO*len(dataset)))
trainset = {key: value for key, value in dataset.items() if key in train_music_ids}
testset = {key: value for key, value in dataset.items() if key not in train_music_ids}

In [267]:
with open(f'data/datasets/trainset_{int(time.time())}.pk', 'wb') as f:
    pickle.dump(list(trainset.keys()), f)
with open(f'data/datasets/testset_{int(time.time())}.pk', 'wb') as f:
    pickle.dump(list(testset.keys()), f)

### Load train and test

In [8]:
dataset_label = '1674741184'
with open(f'data/datasets/trainset_{dataset_label}.pk', 'rb') as f:
    trainset_ids = pickle.load(f)
    trainset = {key: value for key, value in dataset.items() if key in trainset_ids}
with open(f'data/datasets/testset_{dataset_label}.pk', 'rb') as f:
    testset_ids = pickle.load(f)
    testset = {key: value for key, value in dataset.items() if key in testset_ids}

In [None]:
trainset_quadrant_to_keys = dict()

for music_id in trainset.keys():
    quad = music_id_to_quadrant[music_id]
    if quad not in trainset_quadrant_to_keys:
        trainset_quadrant_to_keys[quad] = list()
    trainset_quadrant_to_keys[quad].append(music_id)

## Plot spectrogram

In [None]:
import matplotlib.pyplot as plt
import librosa

In [None]:
def plot_spectrogram(spec, title=None, ylabel="freq_bin", aspect="auto", xmax=None, ymax=None):
    fig, axs = plt.subplots(1, 1)
    axs.set_title(title or "Spectrogram (db)")
    axs.set_ylabel(ylabel)
    axs.set_xlabel("frame")
    im = axs.imshow(librosa.power_to_db(spec), origin="lower", aspect=aspect)
    if xmax:
        axs.set_xlim((0, xmax))
    if ymax:
        axs.set_ylim((0, ymax))
    fig.colorbar(im, ax=axs)
    plt.show(block=False)

In [None]:
plot_spectrogram(dataset[1000][0], title="MelSpectrogram - torchaudio", ylabel="mel freq")

# Model

In [311]:
class AudioLSTMCNN(nn.Module):
    def __init__(self, input_shape: Tuple[int, int], out_size: int = 2, cnn_channels: int = 64):
        """
        :param input_shape: (X, Y). For a spectogram with 128 buckets and chunk size of 196, will be (128, 196)
        """
        # call the parent constructor
        super(AudioLSTMCNN, self).__init__()

        self.conv11 = nn.Conv2d(in_channels=1, out_channels=cnn_channels, kernel_size=(3, 3), stride=(1, 1), padding=1)
        self.relu11 = nn.ReLU()
        self.conv12 = nn.Conv2d(in_channels=cnn_channels, out_channels=cnn_channels, kernel_size=(3, 3), stride=(1, 1),
                                padding=1)
        self.relu12 = nn.ReLU()
        self.maxpool1 = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))

        self.conv21 = nn.Conv2d(in_channels=cnn_channels, out_channels=cnn_channels, kernel_size=(3, 3), stride=(1, 1),
                                padding=1)
        self.relu21 = nn.ReLU()
        self.conv22 = nn.Conv2d(in_channels=cnn_channels, out_channels=cnn_channels, kernel_size=(3, 3), stride=(1, 1),
                                padding=1)
        self.relu22 = nn.ReLU()
        self.maxpool2 = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))

        self.conv3 = nn.Conv2d(in_channels=cnn_channels, out_channels=cnn_channels * 2, kernel_size=(3, 3),
                               stride=(1, 1), padding=1)
        self.relu3 = nn.ReLU()
        self.maxpool3 = nn.MaxPool2d(kernel_size=(3, 3), stride=(3, 3))
        self.dropout3 = nn.Dropout(p=0.25)

        self.conv4 = nn.Conv2d(in_channels=cnn_channels * 2, out_channels=cnn_channels * 4, kernel_size=(3, 3),
                               stride=(1, 1), padding=1)
        self.relu4 = nn.ReLU()
        self.maxpool4 = nn.MaxPool2d(kernel_size=(3, 3), stride=(3, 3))
        self.dropout4 = nn.Dropout(p=0.25)

        self.conv5 = nn.Conv2d(in_channels=cnn_channels * 4, out_channels=cnn_channels * 4, kernel_size=(3, 3),
                               stride=(1, 1), padding=1)
        self.relu5 = nn.ReLU()
        self.maxpool5 = nn.MaxPool2d(kernel_size=(3, 3), stride=(3, 3))
        self.dropout5 = nn.Dropout(p=0.25)

        self.lstm6 = nn.LSTM(cnn_channels * 4, cnn_channels * 4, batch_first=True)
        self.hidden = (torch.zeros(1, 1, cnn_channels * 4),
                       torch.zeros(1, 1, cnn_channels * 4))
        self.fc6 = nn.Linear(in_features=cnn_channels * 4, out_features=cnn_channels * 4)
        self.dropout6 = nn.Dropout(p=0.5)

        self.fc7 = nn.Linear(in_features=cnn_channels * 4, out_features=cnn_channels * 4)
        self.dropout7 = nn.Dropout(p=0.5)

        self.fc8 = nn.Linear(in_features=cnn_channels * 4, out_features=2)
        self.final = nn.ReLU()

    def forward(self, x):
        x = x.reshape((1, 1, x.shape[0], -1))

        x = self.conv11(x)
        x = self.relu11(x)
        x = self.conv12(x)
        x = self.relu12(x)
        x = self.maxpool1(x)

        x = self.conv21(x)
        x = self.relu21(x)
        x = self.conv22(x)
        x = self.relu22(x)
        x = self.maxpool2(x)

        x = self.conv3(x)
        x = self.relu3(x)
        x = self.maxpool3(x)
        x = self.dropout3(x)

        x = self.conv4(x)
        x = self.relu4(x)
        x = self.maxpool4(x)
        x = self.dropout4(x)

        x = self.conv5(x)
        x = self.relu5(x)
        x = self.maxpool5(x)
        x = self.dropout5(x)

        x = x.view(x.size(0), x.size(1), -1)
        x = x.permute(0, 2, 1)

        x, self.hidden = self.lstm6(x, self.hidden)
        x = self.fc6(x)
        x = self.dropout6(x)

        x = self.fc7(x)
        x = self.dropout7(x)

        x = self.fc8(x)

        final_x = self.final(x.reshape((-1)))
        # final_x = final_x*2 - 1

        return final_x


In [10]:
### Load model
model_name = f'saved_models/AudioLSTMCNN_1674751233_014772344088200705.pt'
model_c = AudioLSTMCNN(input_shape=(128, 196), out_size=2)
model_c.load_state_dict(torch.load(model_name));
model_c.eval().cuda()
print("Loaded model")

Loaded model


### Train model

In [369]:
model_c = AudioLSTMCNN(input_shape=(128, 196), out_size=2).cuda()

In [370]:
criterion = nn.MSELoss().cuda()
optimizer = torch.optim.Adam(model_c.parameters(), lr=0.005)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience = 5, factor=0.5)

In [371]:
print(dataset[10][0][0].shape)
print(dataset[10][0][0].reshape((1, 1, MEL_SPECTROGRAM_BUCKETS, -1)).shape)

torch.Size([128, 196])
torch.Size([1, 1, 128, 196])


In [372]:
start_time = time.time()
EPOCS = 50
PRINT_MARK = 500
BATCH_SIZE = 50
STOP_LOSS = 0.0025
MIN_LEARNING_RATE = 0.0005

normalize = lambda x: (x + 1)/2
denormalize = lambda x: x*2 - 1


for epoc in range(EPOCS):
    losses = list()
    quadrants = list()
    real_quadrants = list()
    # model_c.hidden = (model_c.hidden[0].cuda(), model_c.hidden[1].cuda())
    train_key_sample = [key for keys in [random.sample(trainset_quadrant_to_keys[i+1], BATCH_SIZE//4) for i in range(4)] for key in keys]
    random.shuffle(train_key_sample)
    train_sample = [datum for sample_key in train_key_sample for datum in trainset[sample_key]]
    # train_sample = [datum for key, value in trainset.items() if key in train_key_sample for datum in value]
    for batch_i, (X_train, (valence, arousal)) in enumerate(train_sample):
        optimizer.zero_grad()
        model_c.hidden = (torch.zeros(model_c.hidden[0].shape).cuda(),
                          torch.zeros(model_c.hidden[0].shape).cuda())
        
        y_train = torch.Tensor((valence, arousal)).cuda()
        # Apply the model
        y_pred = model_c(X_train.cuda())  # we don't flatten X-train here
        loss = criterion(y_pred, y_train)

        # Update parameters
        loss.backward(retain_graph=True)
        optimizer.step()

        losses.append(loss.cpu().item())
        real_quadrants.append(get_quadrant(y_train[0].item(), y_train[1].item()))
        quadrants.append(get_quadrant(y_pred[0].item(), y_pred[1].item()))
        
        # Print interim results
        if (batch_i > 0 or epoc == 0) and batch_i%PRINT_MARK == 0:
            print(f'{epoc:2}-{batch_i:4} | loss: {np.mean(losses):.5f}  |  [{quadrants.count(1):4}({real_quadrants.count(1):4}), {quadrants.count(2):4}({real_quadrants.count(2):4}), {quadrants.count(3):4}({real_quadrants.count(3):4}), {quadrants.count(4):4}({real_quadrants.count(4):4})]     lr: {optimizer.param_groups[0]["lr"]}')
                  # {y_train[0].item():.4f}, {y_train[0].item():.4f} | {y_pred[0].item():.4f},{y_pred[0].item():.4f}')
                  # [{abs(y_train[0] - y_pred[0]):.5f}, {abs(y_train[1] - y_pred[1]):.5f}]
    
    scheduler.step(np.mean(losses))

    if np.mean(losses) < STOP_LOSS or optimizer.param_groups[0]["lr"] < MIN_LEARNING_RATE:
        break
        
print(f'\nDuration: {time.time() - start_time:.0f} seconds') # print the time elapsed            

 0-   0 | loss: 0.10694  |  [   0(   0),    0(   0),    1(   1),    0(   0)]     lr: 0.005
 0- 500 | loss: 0.15194  |  [   1(  89),    0( 137),  302( 141),  198( 134)]     lr: 0.005
 0-1000 | loss: 0.12511  |  [   1( 142),    0( 195),  560( 328),  440( 336)]     lr: 0.005
 0-1500 | loss: 0.09793  |  [ 206( 368),  130( 369),  646( 373),  519( 391)]     lr: 0.005
 0-2000 | loss: 0.07536  |  [ 325( 503),  327( 564),  771( 478),  578( 456)]     lr: 0.005
 1- 500 | loss: 0.00542  |  [  48(  32),  312( 330),  105(  94),   36(  45)]     lr: 0.005
 1-1000 | loss: 0.00690  |  [ 300( 289),  395( 397),  146( 132),  160( 183)]     lr: 0.005
 1-1500 | loss: 0.00698  |  [ 466( 449),  441( 443),  295( 291),  299( 318)]     lr: 0.005
 1-2000 | loss: 0.00856  |  [ 536( 513),  521( 506),  464( 506),  480( 476)]     lr: 0.005
 2- 500 | loss: 0.00443  |  [ 101( 102),  156( 169),   84(  59),  160( 171)]     lr: 0.005
 2-1000 | loss: 0.00958  |  [ 310( 309),  234( 278),  221( 176),  236( 238)]     lr: 0.005

KeyboardInterrupt: 

In [None]:
latest_model_name = f'saved_models/AudioLSTMCNN_{int(time.time())}_{str(np.mean(losses))[2:]}.pt'
torch.save(model_c.state_dict(), latest_model_name)
print(f"Saved model {latest_model_name}")

### Test model

In [373]:
losses = list()
quadrants = list()

with torch.no_grad():
    model_c.hidden = (model_c.hidden[0].cuda(), model_c.hidden[1].cuda())
    test_sample = [datum for key, value in testset.items() for datum in value]
    for batch_i, (X_test, (valence, arousal)) in tqdm(enumerate(test_sample), total=len(test_sample)):
        y_test = torch.Tensor((valence, arousal)).cuda()
        # Apply the model
#         model_c.hidden = (torch.zeros(model_c.hidden[0].shape).cuda(),
#                           torch.zeros(model_c.hidden[0].shape).cuda())
        
        y_val = model_c(X_test.cuda())
        loss = criterion(y_val, y_test)
        losses.append(loss.cpu())
        quadrants.append((get_quadrant(y_val[0].item(), y_val[1].item()), get_quadrant(y_test[0].item(), y_test[1].item())))

100%|███████████████████████████████████████████████████████████████████████████████████████████| 6928/6928 [00:30<00:00, 225.35it/s]


In [374]:
print(f"Mean: {np.mean(losses):.5f}, median: {np.median(losses):.5f}")

Mean: 0.05365, median: 0.03952


In [375]:
print(confusion_matrix([a[0] for a in quadrants], [a[1] for a in quadrants]))
print("\n")
print(classification_report([a[0] for a in quadrants], [a[1] for a in quadrants]))

[[   0    0    0    0]
 [   0    0    1    0]
 [4584  307  772  771]
 [ 353   20   52   68]]


              precision    recall  f1-score   support

           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         1
           3       0.94      0.12      0.21      6434
           4       0.08      0.14      0.10       493

    accuracy                           0.12      6928
   macro avg       0.25      0.06      0.08      6928
weighted avg       0.87      0.12      0.20      6928



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### Inspect quadrants

In [None]:
testset.keys()

In [None]:
test_index = 519
test_location = 0

with torch.no_grad():
    pred = model_c(testset[test_index][test_location][0].cuda()).cpu()
print(pred)
print([denormalize(a.item()) for a in pred])
print(testset[test_index][test_location][1])

In [None]:
[(key, value[0][1]) for key, value in testset.items() if value and value[0][1] < (0.5, 0.5)]

## Random results classifier

#### Random vs. testset

In [248]:
losses = list()

test_sample = [datum for key, value in testset.items() for datum in value]
for batch_i, (X_test, (valence, arousal)) in tqdm(enumerate(test_sample), total=len(test_sample)):
    y_test = torch.Tensor((valence, arousal)).cuda()
    y_val = torch.Tensor((random.random(), random.random())).cuda()
    loss = criterion(y_val, y_test)
    losses.append(loss.cpu())

100%|██████████████████████████████████████████████████████████████████████████████████████████| 7064/7064 [00:01<00:00, 4382.98it/s]


In [249]:
print(f"Mean: {np.mean(losses):.5f}, median: {np.median(losses):.5f}")

Mean: 0.13151, median: 0.10569


#### Random vs. Random

In [250]:
losses = list()

test_sample = [datum for key, value in testset.items() for datum in value]
for batch_i, (X_test, (valence, arousal)) in tqdm(enumerate(test_sample), total=len(test_sample)):
    y_test = torch.Tensor((random.random(), random.random())).cuda()
    y_val = torch.Tensor((random.random(), random.random())).cuda()
    loss = criterion(y_val, y_test)
    losses.append(loss.cpu())

100%|██████████████████████████████████████████████████████████████████████████████████████████| 7064/7064 [00:01<00:00, 4124.96it/s]


In [251]:
print(f"Mean: {np.mean(losses):.5f}, median: {np.median(losses):.5f}")

Mean: 0.16612, median: 0.12879


In [None]:
y_test = torch.Tensor((1, 1)).cuda()
y_val = torch.Tensor((-1, -1)).cuda()
loss = criterion(y_val, y_test)

In [None]:
loss