In [1]:
%load_ext autoreload
%autoreload 2

# Imports

In [2]:
import time
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import torch
import torch.nn.functional as F
import torchaudio

from model import *
from sampler import *
from helpers import *

from IPython.display import Audio, display
from python_speech_features import logfbank, mfcc
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset, random_split

# Load the dataset

In [3]:
base_dir = '../feature_engineering_Adi/full_dataset'
df = pd.read_csv(os.path.join(base_dir,'train_filtered.csv'))
df.set_index("fname", inplace=True, drop=True)
df = df.drop(['manually_verified'],axis='columns')

In [4]:
non_instrumental_classes = ['Telephone','Cowbell', 'Gong','Gunshot_or_gunfire']
instrumental_classes_df = df[~df.label.isin(non_instrumental_classes)]

instrumental_classes = list(set(instrumental_classes_df.label))
instrumental_classes_labels_mapping = {c: i for i, c in enumerate(instrumental_classes)}
instrumental_classes_df["label_id"] = instrumental_classes_df.label.map(instrumental_classes_labels_mapping)
len(instrumental_classes_labels_mapping)

17

In [5]:
test_df = pd.read_csv(os.path.join(base_dir,'test_filtered_public.csv'))
test_df.set_index("fname", inplace=True, drop=True)
test_instrumental_classes_df = test_df[~test_df.label.isin(non_instrumental_classes)]
test_instrumental_classes_df["label_id"] = test_instrumental_classes_df.label.map(instrumental_classes_labels_mapping)

In [6]:
# todo: maybe let them define some layers in the model?

In [7]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = ConvNet(num_classes = len(instrumental_classes_labels_mapping))
model = model.to(device)
model = model.eval()
model

ConvNet(
  (conv_1): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv_2): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv_3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (max_pool): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
  (linear_1): Linear(in_features=7936, out_features=256, bias=True)
  (linear_out): Linear(in_features=256, out_features=17, bias=True)
)

In [8]:
dataset = MFCCDataset(instrumental_classes_df)

In [9]:
instrumental_classes_df

Unnamed: 0_level_0,label,label_id,duration
fname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
15f7e867.wav,Bass_drum,3,14112
820bcee9.wav,Bass_drum,3,14112
307b76ab.wav,Tambourine,13,14112
f093caf9.wav,Hi-hat,2,14112
2ed7a267.wav,Bass_drum,3,14112
...,...,...,...
1648effc.wav,Electric_piano,10,1206576
910dc153.wav,Electric_piano,10,1223334
7773d933.wav,Chime,6,1240092
3ac0c9ea.wav,Electric_piano,10,1281546


In [10]:
batch_size = 4
train_loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=False, collate_fn=pad_batch)

In [11]:
@torch.no_grad()
def compute_accuracy(model, data_loader):
    correct_pred, num_examples = 0, 0
    for features, targets in data_loader:
        features = features.to(device).float()
        targets = targets.to(device)
        logits, probas = model(features)
        _, predicted_labels = torch.max(probas, 1)
        num_examples += targets.size(0)
        correct_pred += (predicted_labels == targets).sum()
    return correct_pred.float()/num_examples * 100

In [12]:
start_time = time.time()
num_epochs = 10
log_every = 100
curr_loss=0
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005, betas = (0.9,0.999), eps = 1e-8, weight_decay = 0)
for epoch in range(num_epochs):
    model.train()
    for batch_idx, (features, targets) in enumerate(train_loader):
        
        features = features.to(device)
        targets = targets.to(device)

        ### FORWARD AND BACK PROP
        logits, probas = model(features)
        
        loss = loss_fn(logits, targets)   
        curr_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        
        ### UPDATE MODEL PARAMETERS
        optimizer.step()
        
        ### LOGGING
        if batch_idx % log_every == log_every-1 or batch_idx == len(train_loader)-1:
            curr_loss /= log_every
            print(f'Epoch: {epoch+1}/{num_epochs} | Batch {batch_idx}/{len(train_loader)} | Loss: {curr_loss}')
            curr_loss = 0

    model.eval()

    print('Time elapsed: %.2f min' % ((time.time() - start_time)/60))

print('training accuracy: %.2f%%' % (compute_accuracy(model, train_loader)))

print('Total Training Time: %.2f min' % ((time.time() - start_time)/60))

Epoch: 1/10 | Batch 99/472 | Loss: 2.06790884912014
Epoch: 1/10 | Batch 199/472 | Loss: 2.158555458635092
Epoch: 1/10 | Batch 299/472 | Loss: 1.9153146077692509
Epoch: 1/10 | Batch 399/472 | Loss: 1.469803657233715
Epoch: 1/10 | Batch 471/472 | Loss: 1.066224380284548
Time elapsed: 2.28 min
Epoch: 2/10 | Batch 99/472 | Loss: 1.5047208038531243
Epoch: 2/10 | Batch 199/472 | Loss: 1.0704256888901
Epoch: 2/10 | Batch 299/472 | Loss: 0.883797531761229
Epoch: 2/10 | Batch 399/472 | Loss: 0.6592212415579707
Epoch: 2/10 | Batch 471/472 | Loss: 0.7122644456708804
Time elapsed: 4.60 min
Epoch: 3/10 | Batch 99/472 | Loss: 0.979869327424094
Epoch: 3/10 | Batch 199/472 | Loss: 0.7181190539712042
Epoch: 3/10 | Batch 299/472 | Loss: 0.6336059815308545
Epoch: 3/10 | Batch 399/472 | Loss: 0.4925223557092249
Epoch: 3/10 | Batch 471/472 | Loss: 0.43216737166017993
Time elapsed: 7.14 min
Epoch: 4/10 | Batch 99/472 | Loss: 0.5852364737913013
Epoch: 4/10 | Batch 199/472 | Loss: 0.46324928989681213
Epoch: 4

In [13]:
test_instrumental_classes_df

Unnamed: 0_level_0,label,usage,label_id
fname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
00eac343.wav,Electric_piano,Public,10
047bf19c.wav,Flute,Public,14
05723b3a.wav,Cello,Public,12
0716b51d.wav,Double_bass,Public,9
097cdef5.wav,Violin_or_fiddle,Public,11
...,...,...,...
f5dd877a.wav,Acoustic_guitar,Public,5
fa45b631.wav,Oboe,Public,7
fb84a793.wav,Clarinet,Public,0
fbc83b12.wav,Saxophone,Public,1


In [14]:
test_dataset = MFCCDataset(test_instrumental_classes_df, train=False)
full_test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False, collate_fn=pad_batch)

In [15]:
accuracy = compute_accuracy(model, full_test_loader)
print(f"accuracy: {accuracy}%")

accuracy: 69.67742156982422%
