In [1]:
import os
import torch
from torch import nn
from torch.utils.data import Dataset
import pandas as pd
import torchaudio
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
import numpy as np
from torchsummary import summary
import optuna

In [2]:
class EmotionDataset(Dataset):
    def __init__(self, data_path, data_dir, final_sample_rate, num_samples_limit, transformation, device):
        
        self.data_path = data_path
        
        self.data_dir = data_dir
        if self.data_dir[-1] != '/':
            self.data_dir += '/'
        
        self.final_sample_rate = final_sample_rate
        self.num_samples_limit = num_samples_limit
        self.device = device
        self.transformation = transformation.to(self.device)
        self.df = pd.read_csv(data_path)
        
    def __len__(self):
        return self.df.shape[0]
    
    def __getitem__(self, index):
        audio_path = self.get_audio_path(index)
        label = self.get_label(index)
        signal, sr = torchaudio.load(audio_path)
        
        signal = signal.to(self.device)
        signal = self.to_mono_if_necessary(signal)
        signal = self.resample_if_necessary(signal, sr)
        signal = self.cut_down_if_necessary(signal)
        signal = self.right_pad_if_necessary(signal)
        signal = self.transformation(signal)
        
        signal[signal == 0] = 0.000000001
        signal = torch.log(signal)
        
        return signal, label
    
    def resample_if_necessary(self, signal, original_sr):
        if original_sr != self.final_sample_rate:
            resampler = torchaudio.transforms.Resample(original_sr, self.final_sample_rate)
            signal = resampler(signal)
        
        return signal
    
    def to_mono_if_necessary(self, signal):
        if signal.shape[0] > 1:
            signal = torch.mean(signal, dim=0, keepdim=True)
        return signal
    
    def cut_down_if_necessary(self, signal):
        return signal[:, :self.num_samples_limit]
    
    def right_pad_if_necessary(self, signal):
        if signal.shape[1] < self.num_samples_limit:
            num_missing_samples = self.num_samples_limit - signal.shape[1]
            last_dim_padding = (0, num_missing_samples)
            signal = torch.nn.functional.pad(signal, last_dim_padding)
        return signal
        
    def get_audio_path(self, index):
        return self.data_dir + self.df.iloc[index].filename
    
    def get_label(self, index):
        return self.df.iloc[index].category_num
    
    def plot_spectrogram(self, index, log=True):
        spec = self.__getitem__(index)[0]
        spec = spec.cpu().numpy().reshape(spec.shape[1], -1)
        frame_rate = self.final_sample_rate / self.transformation.hop_length
        seconds = np.arange(spec.shape[1]) / frame_rate
        
        if not log:
            spec = np.exp(spec)
        
        plt.figure(figsize=(10, 6))
        plt.imshow(spec, cmap='inferno', origin='lower', aspect='auto')
        plt.xlabel('Time (s)')
        plt.ylabel('Mel')
        plt.colorbar(label='Intensity (dB)')
        plt.xticks(np.linspace(0, spec.shape[1], 12)[:-1], np.linspace(0, seconds[-1], 12).round(2)[:-1])
        plt.show()

In [3]:
class CNNNetwork(nn.Module):
    def __init__(self, trial, loss_fn, device, data_loader):
        super().__init__()
        
        self.num_layers = trial.suggest_int('num_layers', 3, 10)
        self.layers = nn.ModuleList()
        self.learning_rate = trial.suggest_float("learning_rate", 1e-8, 0.01, log=True)
        self.loss_fn = loss_fn
        self.device = device
        self.data_loader = data_loader
        
#         conv_kernel_size = trial.suggest_int('conv_kernel_size', 2, 6)
#         stride = trial.suggest_int('stride', 1, 5)
#         padding = trial.suggest_int('padding', 1, 3)
#         pool_kernel_size = trial.suggest_int('pool_kernel_size', 1, 5)
        
        in_kernels = 1
        out_kernels = 16
        for i in range(self.num_layers):
            conv = nn.Sequential(
                nn.Conv2d(in_channels=in_kernels,
                          out_channels=out_kernels,
                          kernel_size=3,
                          stride=1,
                          padding=2),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=2)
            )
            self.layers.append(conv)
            
            in_kernels = out_kernels
            out_kernels *= 2
        
        self.flatten = nn.Flatten()
        self.linear = nn.LazyLinear(6)
        self.softmax = nn.Softmax(dim=1)
        
        self.optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
    
    def forward(self, input_data):
        out = self.layers[0](input_data)
        
        for layer in self.layers[1:]:
            out = layer(out)
        
        out = self.flatten(out)
        out = self.linear(out)
        predictions = self.softmax(out)
        
        return predictions
    
    def train(self, num_epochs):
        print('Started training\n')
        final_loss = 0
        
        for i in range(1, num_epochs+1):
            print(f'Epoch {i} / {num_epochs} started')
            final_loss = self.__train_single_epoch()
            print(f'Epoch {i} / {num_epochs} finished')
            print()
            
        return final_loss
            
    def predict(self, inp, target, class_mapping):
        self.eval()
    
        with torch.no_grad():
            prediction_probs = self(inp)
            predicted_index = prediction_probs[0].argmax()
            prediction = class_mapping[predicted_index]
            expected = class_mapping[target]
    
        return prediction, expected
    
    def __train_single_epoch(self):
        for inp, target in self.data_loader:
            inp, target = inp.to(self.device), target.to(self.device)

            pred = self(inp)
            loss = self.loss_fn(pred, target)

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

        print(f'Loss = {loss.item()}')
        return float(loss.item())

In [4]:
def create_data_loader(train_data, batch_size):
    data_loader = DataLoader(train_data, batch_size = batch_size)
    return data_loader

def predict(model, inp, target, class_mapping):
    model.eval()
    
    with torch.no_grad():
        prediction_probs = model(inp)
        predicted_index = prediction_probs[0].argmax()
        prediction = class_mapping[predicted_index]
        expected = class_mapping[target]
    
    return prediction, expected

In [5]:
RECORDING_DF_PATH = os.path.join('..', 'data', 'recording_paths.csv')
DATA_DIR = os.path.join('..', 'data', 'Crema')
FINAL_SR = 16000
NUM_SAMPLES_LIMIT = 60000
BATCH_SIZE = 128
EPOCHS = 25
LEARNING_RATE = 0.0001

mel_spectrogram = torchaudio.transforms.MelSpectrogram(
        sample_rate=FINAL_SR,
        n_fft=1024,
        hop_length=512,
        n_mels=64
    )

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [7]:
def objective(trial):
    emd = EmotionDataset(RECORDING_DF_PATH, DATA_DIR, FINAL_SR, NUM_SAMPLES_LIMIT, mel_spectrogram, device)
    train_dataloader = create_data_loader(emd, BATCH_SIZE)
    loss_fn = nn.CrossEntropyLoss()
    
    model = CNNNetwork(trial,
                 loss_fn=loss_fn,
                 device=device,
                 data_loader=train_dataloader
                ).to(device)
    
    
    num_epochs = 20
    
    if trial.should_prune():
        raise optuna.exceptions.TrialPruned()
        
    return model.train(num_epochs=num_epochs)

In [8]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10);

[I 2024-05-08 18:20:06,999] A new study created in memory with name: no-name-e098817f-c932-4576-80fc-2c9c48497d3a


Started training

Epoch 1 / 20 started
Loss = 1.7960638999938965
Epoch 1 / 20 finished

Epoch 2 / 20 started
Loss = 1.743338704109192
Epoch 2 / 20 finished

Epoch 3 / 20 started
Loss = 1.6649821996688843
Epoch 3 / 20 finished

Epoch 4 / 20 started
Loss = 1.626555323600769
Epoch 4 / 20 finished

Epoch 5 / 20 started
Loss = 1.6288363933563232
Epoch 5 / 20 finished

Epoch 6 / 20 started
Loss = 1.6142855882644653
Epoch 6 / 20 finished

Epoch 7 / 20 started
Loss = 1.60173499584198
Epoch 7 / 20 finished

Epoch 8 / 20 started
Loss = 1.5931397676467896
Epoch 8 / 20 finished

Epoch 9 / 20 started
Loss = 1.5892170667648315
Epoch 9 / 20 finished

Epoch 10 / 20 started
Loss = 1.5873230695724487
Epoch 10 / 20 finished

Epoch 11 / 20 started
Loss = 1.5865898132324219
Epoch 11 / 20 finished

Epoch 12 / 20 started
Loss = 1.5865817070007324
Epoch 12 / 20 finished

Epoch 13 / 20 started
Loss = 1.5868937969207764
Epoch 13 / 20 finished

Epoch 14 / 20 started
Loss = 1.587342381477356
Epoch 14 / 20 finishe

[I 2024-05-08 18:36:15,958] Trial 0 finished with value: 1.5878558158874512 and parameters: {'num_layers': 8, 'learning_rate': 5.678180728630376e-06}. Best is trial 0 with value: 1.5878558158874512.


Loss = 1.5878558158874512
Epoch 20 / 20 finished

Started training

Epoch 1 / 20 started




Loss = 1.6471244096755981
Epoch 1 / 20 finished

Epoch 2 / 20 started
Loss = 1.5688138008117676
Epoch 2 / 20 finished

Epoch 3 / 20 started
Loss = 1.5418305397033691
Epoch 3 / 20 finished

Epoch 4 / 20 started
Loss = 1.5241174697875977
Epoch 4 / 20 finished

Epoch 5 / 20 started
Loss = 1.5953236818313599
Epoch 5 / 20 finished

Epoch 6 / 20 started
Loss = 1.6053565740585327
Epoch 6 / 20 finished

Epoch 7 / 20 started
Loss = 1.5351883172988892
Epoch 7 / 20 finished

Epoch 8 / 20 started
Loss = 1.486717700958252
Epoch 8 / 20 finished

Epoch 9 / 20 started
Loss = 1.4062433242797852
Epoch 9 / 20 finished

Epoch 10 / 20 started
Loss = 1.3829857110977173
Epoch 10 / 20 finished

Epoch 11 / 20 started
Loss = 1.3302454948425293
Epoch 11 / 20 finished

Epoch 12 / 20 started
Loss = 1.3247326612472534
Epoch 12 / 20 finished

Epoch 13 / 20 started
Loss = 1.3184337615966797
Epoch 13 / 20 finished

Epoch 14 / 20 started
Loss = 1.3162319660186768
Epoch 14 / 20 finished

Epoch 15 / 20 started
Loss = 1.3

[I 2024-05-08 18:51:36,097] Trial 1 finished with value: 1.3190892934799194 and parameters: {'num_layers': 5, 'learning_rate': 0.001145884923492428}. Best is trial 1 with value: 1.3190892934799194.


Loss = 1.3190892934799194
Epoch 20 / 20 finished





Started training

Epoch 1 / 20 started
Loss = 1.609813928604126
Epoch 1 / 20 finished

Epoch 2 / 20 started
Loss = 1.553971529006958
Epoch 2 / 20 finished

Epoch 3 / 20 started
Loss = 1.6928331851959229
Epoch 3 / 20 finished

Epoch 4 / 20 started
Loss = 1.5770747661590576
Epoch 4 / 20 finished

Epoch 5 / 20 started
Loss = 1.7440732717514038
Epoch 5 / 20 finished

Epoch 6 / 20 started
Loss = 1.562385082244873
Epoch 6 / 20 finished

Epoch 7 / 20 started
Loss = 1.589001178741455
Epoch 7 / 20 finished

Epoch 8 / 20 started
Loss = 1.671248197555542
Epoch 8 / 20 finished

Epoch 9 / 20 started
Loss = 1.5181986093521118
Epoch 9 / 20 finished

Epoch 10 / 20 started
Loss = 1.7304046154022217
Epoch 10 / 20 finished

Epoch 11 / 20 started
Loss = 1.6084851026535034
Epoch 11 / 20 finished

Epoch 12 / 20 started
Loss = 1.6192806959152222
Epoch 12 / 20 finished

Epoch 13 / 20 started
Loss = 1.5852421522140503
Epoch 13 / 20 finished

Epoch 14 / 20 started
Loss = 1.6876468658447266
Epoch 14 / 20 finishe

[I 2024-05-08 19:08:56,890] Trial 2 finished with value: 1.5214648246765137 and parameters: {'num_layers': 9, 'learning_rate': 0.0008887141613710927}. Best is trial 1 with value: 1.3190892934799194.


Loss = 1.5214648246765137
Epoch 20 / 20 finished





Started training

Epoch 1 / 20 started
Loss = 1.7928417921066284
Epoch 1 / 20 finished

Epoch 2 / 20 started
Loss = 1.5750502347946167
Epoch 2 / 20 finished

Epoch 3 / 20 started
Loss = 1.5796159505844116
Epoch 3 / 20 finished

Epoch 4 / 20 started
Loss = 1.596099853515625
Epoch 4 / 20 finished

Epoch 5 / 20 started
Loss = 1.587014079093933
Epoch 5 / 20 finished

Epoch 6 / 20 started
Loss = 1.6309260129928589
Epoch 6 / 20 finished

Epoch 7 / 20 started
Loss = 1.5800001621246338
Epoch 7 / 20 finished

Epoch 8 / 20 started
Loss = 1.5626720190048218
Epoch 8 / 20 finished

Epoch 9 / 20 started
Loss = 1.53416109085083
Epoch 9 / 20 finished

Epoch 10 / 20 started
Loss = 1.47178316116333
Epoch 10 / 20 finished

Epoch 11 / 20 started
Loss = 1.4497761726379395
Epoch 11 / 20 finished

Epoch 12 / 20 started
Loss = 1.4819363355636597
Epoch 12 / 20 finished

Epoch 13 / 20 started
Loss = 1.5217175483703613
Epoch 13 / 20 finished

Epoch 14 / 20 started
Loss = 1.4550832509994507
Epoch 14 / 20 finished

[I 2024-05-08 19:26:14,229] Trial 3 finished with value: 1.4448994398117065 and parameters: {'num_layers': 9, 'learning_rate': 7.596845868195368e-05}. Best is trial 1 with value: 1.3190892934799194.


Loss = 1.4448994398117065
Epoch 20 / 20 finished

Started training

Epoch 1 / 20 started




Loss = 1.8769255876541138
Epoch 1 / 20 finished

Epoch 2 / 20 started
Loss = 1.8769255876541138
Epoch 2 / 20 finished

Epoch 3 / 20 started
Loss = 1.8769255876541138
Epoch 3 / 20 finished

Epoch 4 / 20 started
Loss = 1.8769255876541138
Epoch 4 / 20 finished

Epoch 5 / 20 started
Loss = 1.8769255876541138
Epoch 5 / 20 finished

Epoch 6 / 20 started
Loss = 1.8769255876541138
Epoch 6 / 20 finished

Epoch 7 / 20 started
Loss = 1.8769255876541138
Epoch 7 / 20 finished

Epoch 8 / 20 started
Loss = 1.8769255876541138
Epoch 8 / 20 finished

Epoch 9 / 20 started
Loss = 1.8769255876541138
Epoch 9 / 20 finished

Epoch 10 / 20 started
Loss = 1.8769255876541138
Epoch 10 / 20 finished

Epoch 11 / 20 started
Loss = 1.8769255876541138
Epoch 11 / 20 finished

Epoch 12 / 20 started
Loss = 1.8769255876541138
Epoch 12 / 20 finished

Epoch 13 / 20 started
Loss = 1.8769255876541138
Epoch 13 / 20 finished

Epoch 14 / 20 started
Loss = 1.8769255876541138
Epoch 14 / 20 finished

Epoch 15 / 20 started
Loss = 1.

[I 2024-05-08 19:41:17,434] Trial 4 finished with value: 1.8769255876541138 and parameters: {'num_layers': 3, 'learning_rate': 0.0019279970438095273}. Best is trial 1 with value: 1.3190892934799194.


Loss = 1.8769255876541138
Epoch 20 / 20 finished

Started training

Epoch 1 / 20 started




Loss = 1.652205467224121
Epoch 1 / 20 finished

Epoch 2 / 20 started
Loss = 1.6733232736587524
Epoch 2 / 20 finished

Epoch 3 / 20 started
Loss = 1.5896307229995728
Epoch 3 / 20 finished

Epoch 4 / 20 started
Loss = 1.5782006978988647
Epoch 4 / 20 finished

Epoch 5 / 20 started
Loss = 1.488666296005249
Epoch 7 / 20 finished

Epoch 8 / 20 started
Loss = 1.4503543376922607
Epoch 8 / 20 finished

Epoch 9 / 20 started
Loss = 1.450251579284668
Epoch 9 / 20 finished

Epoch 10 / 20 started
Loss = 1.4310128688812256
Epoch 10 / 20 finished

Epoch 11 / 20 started
Loss = 1.400285005569458
Epoch 11 / 20 finished

Epoch 12 / 20 started
Loss = 1.4002971649169922
Epoch 12 / 20 finished

Epoch 13 / 20 started
Loss = 1.3953211307525635
Epoch 13 / 20 finished

Epoch 14 / 20 started
Loss = 1.4182630777359009
Epoch 14 / 20 finished

Epoch 15 / 20 started
Loss = 1.3941717147827148
Epoch 15 / 20 finished

Epoch 16 / 20 started
Loss = 1.3826837539672852
Epoch 16 / 20 finished

Epoch 17 / 20 started
Loss = 1.

[I 2024-05-08 19:56:17,344] Trial 5 finished with value: 1.3234432935714722 and parameters: {'num_layers': 4, 'learning_rate': 0.0003098786249550029}. Best is trial 1 with value: 1.3190892934799194.


Loss = 1.3234432935714722
Epoch 20 / 20 finished





Started training

Epoch 1 / 20 started
Loss = 1.8769255876541138
Epoch 1 / 20 finished

Epoch 2 / 20 started
Loss = 1.8769255876541138
Epoch 2 / 20 finished

Epoch 3 / 20 started
Loss = 1.8769255876541138
Epoch 3 / 20 finished

Epoch 4 / 20 started
Loss = 1.8769255876541138
Epoch 4 / 20 finished

Epoch 5 / 20 started
Loss = 1.8769255876541138
Epoch 5 / 20 finished

Epoch 6 / 20 started
Loss = 1.8769255876541138
Epoch 6 / 20 finished

Epoch 7 / 20 started
Loss = 1.8769255876541138
Epoch 7 / 20 finished

Epoch 8 / 20 started
Loss = 1.8769255876541138
Epoch 8 / 20 finished

Epoch 9 / 20 started
Loss = 1.8769255876541138
Epoch 9 / 20 finished

Epoch 10 / 20 started
Loss = 1.8769255876541138
Epoch 11 / 20 finished

Epoch 12 / 20 started
Loss = 1.8769255876541138
Epoch 12 / 20 finished

Epoch 13 / 20 started
Loss = 1.8769255876541138
Epoch 13 / 20 finished

Epoch 14 / 20 started
Loss = 1.8769255876541138
Epoch 14 / 20 finished

Epoch 15 / 20 started
Loss = 1.8769255876541138
Epoch 15 / 20 fi

[I 2024-05-08 20:24:10,222] Trial 6 finished with value: 1.8769255876541138 and parameters: {'num_layers': 10, 'learning_rate': 0.0011916124914362403}. Best is trial 1 with value: 1.3190892934799194.


Loss = 1.8769255876541138
Epoch 20 / 20 finished

Started training

Epoch 1 / 20 started




Loss = 1.7544490098953247
Epoch 1 / 20 finished

Epoch 2 / 20 started
Loss = 1.7575236558914185
Epoch 2 / 20 finished

Epoch 3 / 20 started
Loss = 1.760411262512207
Epoch 3 / 20 finished

Epoch 4 / 20 started
Loss = 1.7631345987319946
Epoch 4 / 20 finished

Epoch 5 / 20 started
Loss = 1.765712022781372
Epoch 5 / 20 finished

Epoch 6 / 20 started
Loss = 1.768155813217163
Epoch 6 / 20 finished

Epoch 7 / 20 started
Loss = 1.770475149154663
Epoch 7 / 20 finished

Epoch 8 / 20 started
Loss = 1.7726935148239136
Epoch 8 / 20 finished

Epoch 9 / 20 started
Loss = 1.7748215198516846
Epoch 9 / 20 finished

Epoch 10 / 20 started
Loss = 1.7768723964691162
Epoch 10 / 20 finished

Epoch 11 / 20 started
Loss = 1.7788565158843994
Epoch 11 / 20 finished

Epoch 12 / 20 started
Loss = 1.78078031539917
Epoch 12 / 20 finished

Epoch 13 / 20 started
Loss = 1.7826298475265503
Epoch 13 / 20 finished

Epoch 14 / 20 started
Loss = 1.7843682765960693
Epoch 14 / 20 finished

Epoch 15 / 20 started
Loss = 1.786013

[I 2024-05-08 20:39:13,895] Trial 7 finished with value: 1.7923510074615479 and parameters: {'num_layers': 3, 'learning_rate': 3.6918108293853335e-07}. Best is trial 1 with value: 1.3190892934799194.


Loss = 1.7923510074615479
Epoch 20 / 20 finished





Started training

Epoch 1 / 20 started
Loss = 1.7922842502593994
Epoch 1 / 20 finished

Epoch 2 / 20 started
Loss = 1.7929682731628418
Epoch 2 / 20 finished

Epoch 3 / 20 started
Loss = 1.7949714660644531
Epoch 3 / 20 finished

Epoch 4 / 20 started
Loss = 1.7991106510162354
Epoch 4 / 20 finished

Epoch 5 / 20 started
Loss = 1.8044368028640747
Epoch 5 / 20 finished

Epoch 6 / 20 started
Loss = 1.807717204093933
Epoch 7 / 20 finished

Epoch 8 / 20 started
Loss = 1.7997599840164185
Epoch 8 / 20 finished

Epoch 9 / 20 started
Loss = 1.7593880891799927
Epoch 9 / 20 finished

Epoch 10 / 20 started
Loss = 1.681769609451294
Epoch 10 / 20 finished

Epoch 11 / 20 started
Loss = 1.678375244140625
Epoch 11 / 20 finished

Epoch 12 / 20 started
Loss = 1.6600818634033203
Epoch 12 / 20 finished

Epoch 13 / 20 started
Loss = 1.6440801620483398
Epoch 13 / 20 finished

Epoch 14 / 20 started
Loss = 1.634843349456787
Epoch 14 / 20 finished

Epoch 15 / 20 started
Loss = 1.6289747953414917
Epoch 15 / 20 fini

[I 2024-05-08 21:06:03,971] Trial 8 finished with value: 1.6300561428070068 and parameters: {'num_layers': 10, 'learning_rate': 7.896283305155683e-07}. Best is trial 1 with value: 1.3190892934799194.


Loss = 1.6300561428070068
Epoch 20 / 20 finished





Started training

Epoch 1 / 20 started
Loss = 1.810263991355896
Epoch 1 / 20 finished

Epoch 2 / 20 started
Loss = 1.6632863283157349
Epoch 2 / 20 finished

Epoch 3 / 20 started
Loss = 1.6436833143234253
Epoch 3 / 20 finished

Epoch 4 / 20 started
Loss = 1.628485083580017
Epoch 4 / 20 finished

Epoch 5 / 20 started
Loss = 1.6153844594955444
Epoch 5 / 20 finished

Epoch 6 / 20 started
Loss = 1.6001778841018677
Epoch 6 / 20 finished

Epoch 7 / 20 started
Loss = 1.6004043817520142
Epoch 7 / 20 finished

Epoch 8 / 20 started
Loss = 1.5976182222366333
Epoch 8 / 20 finished

Epoch 9 / 20 started
Loss = 1.5954968929290771
Epoch 9 / 20 finished

Epoch 10 / 20 started
Loss = 1.593956708908081
Epoch 10 / 20 finished

Epoch 11 / 20 started
Loss = 1.5903400182724
Epoch 11 / 20 finished

Epoch 12 / 20 started
Loss = 1.5801342725753784
Epoch 12 / 20 finished

Epoch 13 / 20 started
Loss = 1.567749261856079
Epoch 13 / 20 finished

Epoch 14 / 20 started
Loss = 1.5351977348327637
Epoch 16 / 20 finished


[I 2024-05-08 21:32:46,809] Trial 9 finished with value: 1.5270636081695557 and parameters: {'num_layers': 10, 'learning_rate': 5.356322540692413e-06}. Best is trial 1 with value: 1.3190892934799194.


Loss = 1.5270636081695557
Epoch 20 / 20 finished

