In [45]:
import librosa
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sklearn
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, Subset
from sklearn.model_selection import KFold
import torch.nn.functional as F
from torchvision.transforms import Compose
import random
from datetime import datetime
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [46]:
# waveform function for me to not bang my keyboard
def disp_waveform(signal, sr=None, color='blue'):
    plt.figure(figsize=(7,2))
    return librosa.display.waveshow(signal, sr=sr, color=color)

In [47]:
def isolator(signal, sample_rate, n_fft, hop_length, before, after, threshold, show=False):
    strokes = []
    # -- signal'
    if show:
        disp_waveform(signal, sr=sample_rate)
    fft = librosa.stft(signal, n_fft=n_fft, hop_length=hop_length)
    energy = np.abs(np.sum(fft, axis=0)).astype(float)
    # norm = np.linalg.norm(energy)
    # energy = energy/norm
    # -- energy'
    if show:
        disp_waveform(energy)
    threshed = energy > threshold
    # -- peaks'
    if show:
        disp_waveform(threshed.astype(float))
    peaks = np.where(threshed == True)[0]
    peak_count = len(peaks)
    prev_end = sample_rate*0.1*(-1)
    # '-- isolating keystrokes'
    for i in range(peak_count):
        this_peak = peaks[i]
        timestamp = (this_peak*hop_length) + n_fft//2
        if timestamp > prev_end + (0.1*sample_rate):
            keystroke = signal[timestamp-before:timestamp+after]
            # strokes.append(torch.tensor(keystroke)[None, :])
            # keystroke = transform(keystroke)
            strokes.append(keystroke)
            if show:
                disp_waveform(keystroke, sr=sample_rate)
            prev_end = timestamp+after
    return strokes

In [48]:
# Constants we actually need for the task
MBP_AUDIO_DIR = '../Dataset-for-Binary/base-audio/'
MBP_AUDIO_CUSTOM_DIR = '../Dataset-custom-audio/base-audio/' #for custom audio testing
keys_s = '1234567890QWERTYUIOPASDFGHJKLZXCVBNM'
# keys_s = '12'
labels = list(keys_s)
keys = ['audio_' + k + '.wav' for k in labels]
data_dict = {'Key':[], 'File':[]}
data_dict_t= {'Key':[], 'File':[]} #for custom audio testing
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"

In [132]:
def create_dataset(n_fft, hop_length, before, after, keys, custom_audio=False):
    for i, File in enumerate(keys):
        if custom_audio:
            File.replace('audio_', '')
            print(File)
            loc = MBP_AUDIO_CUSTOM_DIR + File
        else:
            loc = MBP_AUDIO_DIR + File
        samples, sr = librosa.load(loc)
        prom = 0.06
        step = 0.005
        strokes = isolator(samples, sr, n_fft, hop_length, before, after, prom, False )
        print(f'File {File} length: {len(strokes)}')
        label = [labels[i]]*len(strokes)
        #works fine here
        if custom_audio:
            data_dict_t['Key'] += label
            data_dict_t['File'] += strokes
        else:
            data_dict['Key'] += label
            data_dict['File'] += strokes


    if custom_audio:
        df = pd.DataFrame(data_dict_t)
    else:
        df = pd.DataFrame(data_dict)
    mapper = {}
    counter = 0
    for l in df['Key']:
        if not l in mapper:
            mapper[l] = counter
            counter += 1
        print(mapper)
    df.replace({'Key': mapper}, inplace = True)

    return df

In [125]:
for key in keys_s:
    sample, sr = librosa.load(f'../Dataset-for-Binary/base-audio/audio_{key}.wav')
    print(sr)
    print(len(isolator(sample, sr, 1024, 225, 2400, 12000, 0.06)), end=' ')
    

22050
25 22050
25 22050
25 22050
25 22050
25 22050
26 22050
25 22050
25 22050
27 22050
26 22050
27 22050
28 22050
28 22050
25 22050
26 22050
25 22050
25 22050
25 22050
25 22050
25 22050
25 22050
25 22050
25 22050
27 22050
26 22050
25 22050
27 22050
25 22050
25 22050
25 22050
26 22050
26 22050
25 22050
25 22050
25 22050
26 

In [133]:
n_fft = 1024
hop_length = 225
before = 2400
after = 12000
mbp_dataset = create_dataset(n_fft, hop_length, before, after, keys)
mbp_dataset

File audio_1.wav length: 25
File audio_2.wav length: 25
File audio_3.wav length: 25
File audio_4.wav length: 25
File audio_5.wav length: 25
File audio_6.wav length: 26
File audio_7.wav length: 25
File audio_8.wav length: 25
File audio_9.wav length: 27
File audio_0.wav length: 26
File audio_Q.wav length: 27
File audio_W.wav length: 28
File audio_E.wav length: 28
File audio_R.wav length: 25
File audio_T.wav length: 26
File audio_Y.wav length: 25
File audio_U.wav length: 25
File audio_I.wav length: 25
File audio_O.wav length: 25
File audio_P.wav length: 25
File audio_A.wav length: 25
File audio_S.wav length: 25
File audio_D.wav length: 25
File audio_F.wav length: 27
File audio_G.wav length: 26
File audio_H.wav length: 25
File audio_J.wav length: 27
File audio_K.wav length: 25
File audio_L.wav length: 25
File audio_Z.wav length: 25
File audio_X.wav length: 26
File audio_C.wav length: 26
File audio_V.wav length: 25
File audio_B.wav length: 25
File audio_N.wav length: 25
File audio_M.wav len

Unnamed: 0,Key,File
0,0,"[-0.00017975706, -0.00012727422, -9.371064e-05..."
1,0,"[0.000497586, 0.00049031794, 0.0005512878, 0.0..."
2,0,"[0.0003178973, 0.00034715672, 0.0003719765, 0...."
3,0,"[0.00268178, 0.0026667328, 0.0026979204, 0.002..."
4,0,"[0.0064755157, 0.0063309446, 0.0053669587, 0.0..."
...,...,...
4600,37,"[-0.250816, -0.25290224, -0.25483984, -0.25665..."
4601,37,"[0.13746458, 0.13331985, 0.12892573, 0.1242144..."
4602,37,"[0.0017171801, 0.0016756053, 0.0016776036, 0.0..."
4603,37,"[-0.00014814909, -0.00018149172, -0.0002237720..."


In [52]:
audio_samples = mbp_dataset['File'].values.tolist()
labels = mbp_dataset['Key'].values.tolist()

audioDataset = np.array(audio_samples, dtype = object)
print(audio_samples[0].shape)
mfcc = librosa.feature.mfcc(y=audio_samples[0], sr=44100) # shape: (n_mfcc, t)
print(mfcc.shape)
# labels = np.array(labels)

(14400,)
(20, 29)


In [53]:
class TimeShifting():
    def __call__(self, samples):
#       samples_shape = samples.shape
        samples = samples.flatten()
        
        shift = int(len(samples) * 0.4) #Max shift (0.4)
        random_shift = random.randint(0, shift) #Random number between 0 and 0.4*len(samples)
        data_roll = np.roll(samples, random_shift)
        return data_roll

In [54]:
def time_shift(samples):
    samples = samples.flatten()
    shift = int(len(samples) * 0.4) #Max shift (0.4)
    random_shift = random.randint(0, shift) #Random number between 0 and 0.4*len(samples)
    data_roll = np.roll(samples, random_shift)
    return data_roll

In [55]:
from skimage.transform import resize


class ToMelSpectrogram:
    def __init__(self, audio_length=14400):
        self.audio_length = audio_length

    def __call__(self, samples):
        if len(samples) > self.audio_length:
            samples = samples[:self.audio_length]
        elif len(samples) < self.audio_length:
            samples = np.pad(samples, (0, self.audio_length - len(samples)), mode='constant')

        mel_spec = librosa.feature.melspectrogram(y=samples, sr=44100, n_mels=64, n_fft=1024, hop_length=225)
        mel_spec_resized = resize(mel_spec, (64, 64), anti_aliasing=True)
        mel_spec_resized = np.expand_dims(mel_spec_resized, axis=0)
        return torch.tensor(mel_spec_resized)


class ToMelSpectrogramMfcc:
    def __init__(self, audio_length=14400):
        self.audio_length = audio_length

    def __call__(self, samples):
        if len(samples) > self.audio_length:
            samples = samples[:self.audio_length]
        elif len(samples) < self.audio_length:
            samples = np.pad(samples, (0, self.audio_length - len(samples)), mode='constant')

        mel_spec = librosa.feature.melspectrogram(y=samples, sr=44100, n_mels=64, n_fft=n_fft, hop_length=hop_length)
        mel_spec = librosa.feature.mfcc(S=librosa.power_to_db(mel_spec))
        mel_spec_resized = resize(mel_spec, (64, 64), anti_aliasing=True)
        mel_spec_resized = np.expand_dims(mel_spec_resized, axis=0)

        return torch.tensor(mel_spec_resized)


class ToMfcc:
    def __init__(self, audio_length=14400):
        self.audio_length = audio_length

    def __call__(self, samples):
        if len(samples) > self.audio_length:
            samples = samples[:self.audio_length]
        elif len(samples) < self.audio_length:
            samples = np.pad(samples, (0, self.audio_length - len(samples)), mode='constant')
        
        mfcc_spec = librosa.feature.mfcc(y=samples, sr=44100)
        mfcc_spec = np.transpose(mfcc_spec)
        return torch.tensor(mfcc_spec)


In [56]:
transform = Compose([ToMelSpectrogram()])
transform_mfcc = Compose([ToMfcc()])

In [57]:
audio_samples_new = audio_samples.copy() # audio samples CNN

for i, sample in enumerate(audio_samples):
    audio_samples_new.append(time_shift(sample))
    labels.append(labels[i])
    
# convert labels to a numpy array
labels = np.array(labels)
print(len(audio_samples_new))
print(len(labels))

1842
1842


In [58]:
audioDatasetFin, audioDatasetMfcc = [], []

for i in range(len(audio_samples_new)):
    transformed_sample = transform(audio_samples_new[i])
    transformed_mfcc = transform_mfcc(audio_samples_new[i])
    audioDatasetFin.append((transformed_sample, labels[i]))
    audioDatasetMfcc.append((transformed_sample, transformed_mfcc, labels[i]))

In [59]:
len(audioDatasetFin)

1842

In [60]:
audioDatasetMfcc[0][0].shape

torch.Size([1, 64, 64])

In [61]:
import time

class MfccLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, dropout=0.2, num_classes=36):
        super(MfccLSTM, self).__init__()
        
        self.conv = nn.Sequential(
            nn.Conv2d(1, 32, 3, 1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(32, 64, 3, 1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Flatten(),
            nn.LazyLinear(512),
            nn.ReLU(),
            nn.Linear(512, num_classes)
        )
        
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.lstm2 = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.fc1 = nn.LazyLinear(64)
        self.fc2 = nn.Linear(64, 16)
    
        self.fc3 = nn.LazyLinear(128)
        self.final_lstm = nn.LSTM(1, 64, batch_first=True)
        
        self.fc = nn.LazyLinear(num_classes)
    
    def forward(self, image_input, sequence_input):
        # must return shape (batch_size, num_classes) 
        # batch_size: right now is 16
        # num_classes: right now is 36
        x1 = self.conv(image_input)
        out1, _ = self.lstm(sequence_input)
        out1_dp = self.dropout(out1)
        # print(f'output of first lstm: {out1_dp.shape[1:]}')
        out2, _ = self.lstm2(out1_dp[:, -1, :])
        out2_dp = self.dropout(out2)
        # print(f'output of second lstm: {out2_dp.shape[1:]}')
        x2 = self.fc2(self.fc1(out2_dp))
        x3 = torch.cat((x1, x2), 1)
        # print(f'output of concatenation: {x3.shape[1:]}')
        # x = self.fc(final_out[:, -1, :])
        x = self.fc(x3)
        return x
    

In [62]:
# Model architecture
class CNN(nn.Module):
    def __init__(self, num_classes=36):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.fc1 = nn.LazyLinear(512)
        self.fc2 = nn.Linear(512, num_classes)
    
    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 64 * 14 * 14)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [63]:
import time

def train_with_cross_validation(dataset, num_epochs, model_name, patience=15, random_state=42, n_splits=10):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    fold_results = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(dataset)):
        print(f'Fold {fold+1}/{n_splits}')
        
        # Split the dataset into training and validation sets
        train_set = Subset(dataset, train_idx)
        val_set = Subset(dataset, val_idx)
        train_loader = DataLoader(train_set, batch_size=16, shuffle=True)
        val_loader = DataLoader(val_set, batch_size=16, shuffle=True)
        
        # Initialize model, optimizer, and loss function
        model = MfccLSTM(input_size=20, hidden_size=32, num_classes=36, output_size=64)
        model = model.to(device)
        optimizer = optim.Adam(model.parameters(), lr=5e-4)
        criterion = nn.CrossEntropyLoss()
        
        best_val_acc, epochs_no_imp = 0, 0
        train_accuracies, val_accuracies = [], []

        for epoch in range(num_epochs):
            model.train()
            epoch_train_loss = 0.0
            correct_train = 0
            total_train = 0
            tic = time.perf_counter()
            
            for images, sequences, labels in train_loader:
                images = images.to(device)
                sequences = sequences.to(device)
                labels = labels.to(device)
                
                optimizer.zero_grad()

                #converting labels to Long to avoid error "not implemented for Int"
                labels = labels.long()
                
                # Forward pass
                outputs = model(images, sequences)
                loss = criterion(outputs, labels)
                epoch_train_loss += loss.item() * images.size(0)

                _, predicted_train = torch.max(outputs.data, 1)
                total_train += labels.size(0)
                correct_train += (predicted_train == labels).sum().item()
                
                # Backward pass
                loss.backward()
                optimizer.step()
            
            toc = time.perf_counter()
            time_taken = toc - tic
            
            epoch_train_loss /= len(train_loader.dataset)
            train_accuracy = correct_train / total_train
            train_accuracies.append(train_accuracy)
            
            # Evaluation of the model
            model.eval()
            total, correct = 0, 0
            for images, sequences, labels in val_loader:
                images = images.to(device)
                sequences = sequences.to(device)
                labels = labels.to(device)

                outputs = model(images, sequences)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
            
            val_accuracy = correct / total
            val_accuracies.append(val_accuracy)
            if (epoch + 1) % 5 == 0:
                print(f"Epoch [{epoch + 1}/{num_epochs}], Train Loss: {epoch_train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, Val Accuracy: {val_accuracy:.4f}, Iter Time: {time_taken:.2f}s")
                
            if val_accuracy > best_val_acc:
                best_val_acc = val_accuracy
                epochs_no_imp = 0
                best_model_state = model.state_dict()  # Save the best model
            else:
                epochs_no_imp += 1
            if epochs_no_imp >= patience:
                print(f'Early stopping after {epoch+1} epochs')
                model.load_state_dict(best_model_state)  # Load the best model
                break
        
        fold_results.append((epoch+1, best_val_acc))
        print(f'Fold {fold+1} Best Validation Accuracy: {best_val_acc:.4f}')
    torch.save(model.state_dict(), model_name)

    return fold_results

In [64]:
def predict_mfcc(dataset, model_path, device_external):
    images_test_set = [t[0] for t in dataset]
    sequences_test_set = [t[1] for t in dataset]
    
    images = torch.stack(images_test_set)
    sequences = torch.stack(sequences_test_set)
    device = torch.device(device_external) #default to mps
    images = images.to(device)
    sequences = sequences.to(device)
    model = MfccLSTM(input_size=20, hidden_size=32, num_classes=36, output_size=64)
    model = model.to(device)
    model.load_state_dict(torch.load(model_path,map_location=device))
    model.eval()
    
    with torch.no_grad():
        outputs = model(images, sequences)
        _, predicted = torch.max(outputs.data, 1)

    pred = []
    keyss = '1234567890QWERTYUIOPASDFGHJKLZXCVBNM'
    phrase = predicted.tolist()
    for i in range(len(phrase)):
        pred.append(keyss[phrase[i]])

    pred_df = pd.DataFrame(pred)
    return pred_df

In [65]:
def save_csv(model_name, num_epochs, description, accuracy, precision, recall, f1_score):
    csv_file_path = 'model_comparison.csv'
    
    # Read the existing CSV file into a DataFrame
    try:
        df = pd.read_csv(csv_file_path)
    except FileNotFoundError:
        # If the file does not exist, create an empty DataFrame with the correct columns
        df = pd.DataFrame(columns=['Datetime', 'Name', 'Epochs', 'Description', 'Accuracy', 'Precision', 'Recall', 'F1'])
        
    # Data to append
    current_datetime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    # Remove newline characters from the description
    description = description.replace('\n', ' ').replace('\r', ' ')
    
    # Create a new column with the relevant information
    new_data = {
        'Datetime': [current_datetime],
        'Name': [model_name],
        'Epochs': [num_epochs],
        'Description': [description],
        'Accuracy': [accuracy],
        'Precision': [precision],
        'Recall': [recall],
        'F1': [f1_score],
    }
    
    new_df = pd.DataFrame(new_data)
    
    df = pd.concat([df, new_df], ignore_index=True)
    
    # Save the updated DataFrame back to the CSV file
    df.to_csv(csv_file_path, index=False)

In [66]:
# current random state to split the dataset
random_state = 42

# values for current run
train_final_set, test_set = train_test_split(audioDatasetMfcc, test_size=0.2, random_state=random_state)
num_epochs = 100
main_architecture = "CNN_LSTM"
currday = datetime.today().strftime('%Y-%m-%d')
model_name = f"model_multiclass_{num_epochs}_{main_architecture}_{currday}.pth"
description = "2 layer CNN (32 and 64 output channels) with final 2 Dense Layers (512 and num_classes) result concatenated with \n 2 LSTMs (hidden_size=32),  from mfcc with 2 Dense Layers (64 and 16) with a final Lazy Linear layer output of num_classes"

# Training part
fold_stats = train_with_cross_validation(train_final_set, num_epochs, model_name, random_state=random_state)
max_val = 0
real_num_epochs = 0
for fold_stat in fold_stats: #using folds instead of LOO
    if fold_stat[1] > max_val:
        max_val = fold_stat[1]
        real_num_epochs = fold_stat[0]


Fold 1/10




Epoch [5/100], Train Loss: 1.8444, Train Accuracy: 0.4302, Val Accuracy: 0.3784, Iter Time: 0.81s
Epoch [10/100], Train Loss: 0.9792, Train Accuracy: 0.6868, Val Accuracy: 0.5405, Iter Time: 0.81s
Epoch [15/100], Train Loss: 0.5355, Train Accuracy: 0.8408, Val Accuracy: 0.6216, Iter Time: 0.78s
Epoch [20/100], Train Loss: 0.3090, Train Accuracy: 0.9011, Val Accuracy: 0.6622, Iter Time: 0.78s
Epoch [25/100], Train Loss: 0.2878, Train Accuracy: 0.9223, Val Accuracy: 0.6419, Iter Time: 0.80s
Epoch [30/100], Train Loss: 0.1852, Train Accuracy: 0.9457, Val Accuracy: 0.6892, Iter Time: 0.80s
Epoch [35/100], Train Loss: 0.1086, Train Accuracy: 0.9691, Val Accuracy: 0.7297, Iter Time: 0.80s
Epoch [40/100], Train Loss: 0.1002, Train Accuracy: 0.9736, Val Accuracy: 0.7095, Iter Time: 0.79s
Epoch [45/100], Train Loss: 0.2023, Train Accuracy: 0.9615, Val Accuracy: 0.6351, Iter Time: 0.81s
Epoch [50/100], Train Loss: 0.0429, Train Accuracy: 0.9902, Val Accuracy: 0.6959, Iter Time: 0.79s
Early stopp

KeyboardInterrupt: 

In [None]:
# Prediction part
prediction = predict_mfcc(test_set, model_name, device)
labels_set = [t[2] for t in test_set]
final_labels_set = [keys_s[ind] for ind in labels_set]

# Metrics calculation
accuracy = accuracy_score(final_labels_set, prediction[0])
precision = precision_score(final_labels_set, prediction[0], average='macro')
recall = recall_score(final_labels_set, prediction[0], average='macro')
f1 = sklearn.metrics.f1_score(final_labels_set, prediction[0], average='macro')

# Save in csv file
save_csv(model_name, real_num_epochs, description, accuracy, precision, recall, f1)

# Print results
print("Final Results!")
print(f"Model: {model_name}")
print(description)
print(f"Epochs: {real_num_epochs}")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Final Results!
Model: model_multiclass_100_CNN_LSTM_2024-08-22.pth
2 layer CNN (32 and 64 output channels) with final 2 Dense Layers (512 and num_classes) result concatenated with 
 2 LSTMs (hidden_size=32),  from mfcc with 2 Dense Layers (64 and 16) with a final Lazy Linear layer output of num_classes
Epochs: 75
Accuracy: 0.6666666666666666
Precision: 0.7017860794176582
Recall: 0.6891187130770464
F1 Score: 0.6711896824492809


In [67]:
import csv

def empty_file(csv_file_path):
    # Read the header (first row) of the CSV file
    with open(csv_file_path, 'r') as file:
        reader = csv.reader(file)
        header = next(reader)  # Read the first row (header)
    
    # Write only the header back to the CSV file
    with open(csv_file_path, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(header)  # Wr`ite the header back to the file


In [None]:
# empty_file('model_comparison.csv')

# Using custom audio

The following code adapts the previous working segment to utilize custom audio recorded by the team. Work in progress.

In [140]:
#Using audio from custom-audio to create the test_set
keys_t_s='0123'
labels = list(keys_t_s)
keys_t = [k + '.wav' for k in labels]

for key in keys_t:
    sample_t, sr_t = librosa.load(f'../Dataset-custom-audio/base-audio/{key}')
    print(sr_t)
    print(len(isolator(sample_t, sr_t, 1024, 225, 2400, 12000, 0.06)), end=' ')


22050
5 22050
10 22050
12 22050
4 

In [150]:
n_fft = 50 #1024
hop_length = 225 #225
before = 2400 #2400
after = 10000 #12000

data_dict_t= {'Key':[], 'File':[]} #for custom audio testing
mbp_dataset_t = create_dataset(n_fft, hop_length, before, after, keys_t, custom_audio=True)
mbp_dataset_t

0.wav
File 0.wav length: 20
1.wav
File 1.wav length: 19
2.wav
File 2.wav length: 22
3.wav
File 3.wav length: 22
{'0': 0}
{'0': 0}
{'0': 0}
{'0': 0}
{'0': 0}
{'0': 0}
{'0': 0}
{'0': 0}
{'0': 0}
{'0': 0}
{'0': 0}
{'0': 0}
{'0': 0}
{'0': 0}
{'0': 0}
{'0': 0}
{'0': 0}
{'0': 0}
{'0': 0}
{'0': 0}
{'0': 0, '1': 1}
{'0': 0, '1': 1}
{'0': 0, '1': 1}
{'0': 0, '1': 1}
{'0': 0, '1': 1}
{'0': 0, '1': 1}
{'0': 0, '1': 1}
{'0': 0, '1': 1}
{'0': 0, '1': 1}
{'0': 0, '1': 1}
{'0': 0, '1': 1}
{'0': 0, '1': 1}
{'0': 0, '1': 1}
{'0': 0, '1': 1}
{'0': 0, '1': 1}
{'0': 0, '1': 1}
{'0': 0, '1': 1}
{'0': 0, '1': 1}
{'0': 0, '1': 1}
{'0': 0, '1': 1, '2': 2}
{'0': 0, '1': 1, '2': 2}
{'0': 0, '1': 1, '2': 2}
{'0': 0, '1': 1, '2': 2}
{'0': 0, '1': 1, '2': 2}
{'0': 0, '1': 1, '2': 2}
{'0': 0, '1': 1, '2': 2}
{'0': 0, '1': 1, '2': 2}
{'0': 0, '1': 1, '2': 2}
{'0': 0, '1': 1, '2': 2}
{'0': 0, '1': 1, '2': 2}
{'0': 0, '1': 1, '2': 2}
{'0': 0, '1': 1, '2': 2}
{'0': 0, '1': 1, '2': 2}
{'0': 0, '1': 1, '2': 2}
{'0': 0, '

Unnamed: 0,Key,File
0,0,"[0.0039855447, 0.0044311658, 0.0058992216, 0.0..."
1,0,"[-0.0005392696, 8.165794e-05, 8.288269e-05, 0...."
2,0,"[-0.011836923, -0.011112283, -0.011667683, -0...."
3,0,"[-0.0024387487, -0.0024487914, -0.002542938, -..."
4,0,"[0.0033159202, 0.0031180421, 0.0032269792, 0.0..."
...,...,...
78,3,"[0.003309218, 0.003743548, 0.0042556874, 0.004..."
79,3,"[0.0016023548, 0.0014424893, 0.002118134, 0.00..."
80,3,"[0.0046850806, 0.005076956, 0.0045152153, 0.00..."
81,3,"[0.0003028083, 0.0008702321, 0.0006461112, 0.0..."


In [151]:
audio_samples_t = mbp_dataset_t['File'].values.tolist()
labels_t = mbp_dataset_t['Key'].values.tolist()

audioDataset_t = np.array(audio_samples_t, dtype = object)
print(audio_samples_t[0].shape)
mfcc_t = librosa.feature.mfcc(y=audio_samples_t[0], sr=44100) # shape: (n_mfcc, t)
print(mfcc_t.shape)

(12400,)
(20, 25)


In [152]:
audio_samples_new_t = audio_samples_t.copy() # audio samples CNN

for i, sample in enumerate(audio_samples_t):
    audio_samples_new_t.append(time_shift(sample))
    labels_t.append(labels_t[i])
    
# convert labels to a numpy array
labels_t = np.array(labels_t)
print(len(audio_samples_new_t))
print(len(labels_t))

166
166


In [153]:
audioDatasetFin_t, audioDatasetMfcc_t = [], []

for i in range(len(audio_samples_new_t)):
    transformed_sample_t = transform(audio_samples_new_t[i])
    transformed_mfcc_t = transform_mfcc(audio_samples_new_t[i])
    audioDatasetFin_t.append((transformed_sample_t, labels_t[i]))
    audioDatasetMfcc_t.append((transformed_sample_t, transformed_mfcc_t, labels_t[i]))

In [154]:
#Using custom audio:
# current random state to split the dataset
random_state = 42

# values for current run
train_final_set, test_set = train_test_split(audioDatasetMfcc_t, test_size=0.2, random_state=random_state)
num_epochs = 100
main_architecture = "CNN_LSTM"
currday = datetime.today().strftime('%Y-%m-%d')
model_name = f"model_multiclass_custom_audio_{num_epochs}_{main_architecture}_{currday}.pth"
description = "2 layer CNN (32 and 64 output channels) with final 2 Dense Layers (512 and num_classes) result concatenated with \n 2 LSTMs (hidden_size=32),  from mfcc with 2 Dense Layers (64 and 16) with a final Lazy Linear layer output of num_classes. \n Using custom audio recorded for testing purposes. n_fft = 50"

In [155]:
# Training part
fold_stats = train_with_cross_validation(train_final_set, num_epochs, model_name, random_state=random_state)
max_val = 0
real_num_epochs = 0
for fold_stat in fold_stats: #using folds instead of LOO
    if fold_stat[1] > max_val:
        max_val = fold_stat[1]
        real_num_epochs = fold_stat[0]


Fold 1/10




Epoch [5/100], Train Loss: 1.4856, Train Accuracy: 0.1610, Val Accuracy: 0.3571, Iter Time: 0.07s
Epoch [10/100], Train Loss: 1.4224, Train Accuracy: 0.2627, Val Accuracy: 0.0714, Iter Time: 0.07s
Epoch [15/100], Train Loss: 1.4542, Train Accuracy: 0.2203, Val Accuracy: 0.3571, Iter Time: 0.08s
Early stopping after 16 epochs
Fold 1 Best Validation Accuracy: 0.3571
Fold 2/10
Epoch [5/100], Train Loss: 1.4707, Train Accuracy: 0.2627, Val Accuracy: 0.1429, Iter Time: 0.07s
Epoch [10/100], Train Loss: 1.4214, Train Accuracy: 0.2373, Val Accuracy: 0.2857, Iter Time: 0.07s
Epoch [15/100], Train Loss: 1.4016, Train Accuracy: 0.2542, Val Accuracy: 0.2143, Iter Time: 0.07s
Early stopping after 17 epochs
Fold 2 Best Validation Accuracy: 0.3571
Fold 3/10
Epoch [5/100], Train Loss: 1.5453, Train Accuracy: 0.2269, Val Accuracy: 0.3077, Iter Time: 0.07s
Epoch [10/100], Train Loss: 1.4581, Train Accuracy: 0.1849, Val Accuracy: 0.1538, Iter Time: 0.07s
Epoch [15/100], Train Loss: 1.3930, Train Accurac

In [156]:
# Prediction part
prediction = predict_mfcc(test_set, model_name, device) #using the custom test_set
labels_set = [t[2] for t in test_set]
print(labels_set)
print(prediction[0])
final_labels_set = [keys_t_s[ind] for ind in labels_set]

# Metrics calculation
accuracy = accuracy_score(final_labels_set, prediction[0])
precision = precision_score(final_labels_set, prediction[0], average='macro')
recall = recall_score(final_labels_set, prediction[0], average='macro')
f1 = sklearn.metrics.f1_score(final_labels_set, prediction[0], average='macro')

# Save in csv file
save_csv(model_name, real_num_epochs, description, accuracy, precision, recall, f1)

# Print results
print("Final Results!")
print(f"Model: {model_name}")
print(description)
print(f"Epochs: {real_num_epochs}")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

[0, 2, 3, 2, 0, 1, 1, 3, 3, 1, 0, 2, 0, 3, 1, 1, 2, 2, 0, 0, 3, 0, 0, 0, 2, 3, 2, 2, 2, 3, 1, 2, 1, 0]
0     1
1     3
2     4
3     3
4     2
5     2
6     2
7     3
8     3
9     2
10    2
11    3
12    1
13    4
14    2
15    2
16    3
17    4
18    1
19    1
20    4
21    1
22    1
23    1
24    3
25    3
26    3
27    4
28    4
29    3
30    2
31    4
32    2
33    1
Name: 0, dtype: object
Final Results!
Model: model_multiclass_custom_audio_100_CNN_LSTM_2024-08-23.pth
2 layer CNN (32 and 64 output channels) with final 2 Dense Layers (512 and num_classes) result concatenated with 
 2 LSTMs (hidden_size=32),  from mfcc with 2 Dense Layers (64 and 16) with a final Lazy Linear layer output of num_classes. 
 Using custom audio recorded for testing purposes. n_fft = 50
Epochs: 78
Accuracy: 0.11764705882352941
Precision: 0.08
Recall: 0.11428571428571428
F1 Score: 0.09411764705882353


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
