In [9]:
import librosa
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sklearn
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, Subset
from sklearn.model_selection import KFold
import torch.nn.functional as F
from torchvision.transforms import Compose
import random
from datetime import datetime
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

#audio separating test
from NoiseFilteringTest.NoiseFilter import peakIdentification as nf
import NoiseFilteringTest.NoiseFilter as nfutils

#For tools
from pydub import AudioSegment
import csv

# Setup

Important note: Space and Enter keys are denoted as the following by the model:
- Space -> -
- Enter -> +

#### Instructions

On the next cell, select which dataset you wish to use for the training.
- 0: Dataset-for-Binary, a premade dataset with audio from prerecorded keyboards.
- 1: Custom dataset built for this notebook. The audio has been normalized.
- 2: Custom dataset built for this notebook. The audio has been normalized, then denoised.
- 3: Dataset-for-Binary and the normalized-denoised dataset combined. (TESTING)
- 4: MKA-dataset

In [10]:
# Select 0 to use files from Dataset-for-Binary, 1 for custom dataset with normalized audio, 2 for custom dataset with normalized and denoised audio, 3 for Combined Dataset
dataset_choice=4

On the next cell, select what model you wish to use for the training and evaluation.
- 0: CNN
- 1: CoAtNetImp

In [11]:
model_choice=1

On the next cell, select if you want to use the NoiseFiltering class built for this or the Isolator method.
- 0: Isolator
- 1: NoiseFiltering

In [26]:
split_method=0

## Tools Section

## Setting up variables according to audio source

In [13]:
if dataset_choice == 0: 
    #Dataset-for-Binary
    MBP_AUDIO_DIR = 'Dataset-for-Binary/base-audio/'
    keys_s = '1234567890QWERTYUIOPASDFGHJKLZXCVBNM'
    labels = list(keys_s)
    keys = ['audio_' + k + '.wav' for k in labels]

    # Working parameters for the dataset
    # Dataset-for-Binary: n_fft = 1024, hop_length = 225, before = 2400, after = 12000
    n_fft = 1024 
    hop_length = 225 
    before = 2400 
    after = 12000 

    audio_descr="Using audio from Dataset-for-Binary."
    #audio_length=14400

elif dataset_choice == 1: 
    #Custom dataset with normalized audio
    MBP_AUDIO_DIR = 'Dataset-custom-audio/base-audio-normalized-only/', 'Dataset-custom-audio/base-audio-normalized-2/' #for custom audio
    keys_s = '1234567890ABCDEFGHIJKLMNÑOPQRSTUVWXYZ+-' #for custom audio
    keys_t = 'QWERTYUIOP0123456789+-' #UPDATING
    labels = list(keys_s)
    labelst = list(keys_t)
    keys = [ k + '.wav' for k in labels] #for custom audio
    keyst = [ k + '.wav' for k in labelst] #UPDATING

    # Normalized only audio: n_fft = 7, hop_length = 4450, before = 2400, after = 12000
    n_fft = n_fft_t = 7 
    hop_length = hop_length_t= 4450 
    before = before_t = 2400 
    after = after_t= 12000 

    audio_descr="Using custom audio with normalized audio."
    #audio_length=40000 #Sample audios have a length of around 40s.

elif dataset_choice == 2: 
    #Custom dataset with normalized and denoised audio
    MBP_AUDIO_DIR = 'Dataset-custom-audio/base-audio-denoised-normalized/', 'Dataset-custom-audio/base-audio-normalized-denoised-2/'  #for custom audio
    keys_s = '1234567890ABCDEFGHIJKLMNÑOPQRSTUVWXYZ+-' #for custom audio
    keys_t = 'QWERTYUIOP0123456789+-' #UPDATING
    labels = list(keys_s)
    labelst = list(keys_t)
    keys = [ k + '.wav' for k in labels] #for custom audio
    keyst = [ k + '.wav' for k in labelst] #UPDATING

    # Normalized and denoised audio: n_fft = 9, hop_length = 500, before = 2400, after = 12000
    n_fft = n_fft_t = 10
    hop_length = hop_length_t = 10
    before = before_t = 2400 
    after = after_t = 12000 

    audio_descr="Using custom audio with normalized and denoised audio."
    #audio_length=40000

elif dataset_choice == 3:
    #Custom dataset with normalized and denoised audio, and using dataset-for-binary 
    MBP_AUDIO_DIR = 'Dataset-custom-audio/base-audio-denoised-normalized/', 'Dataset-for-Binary/base-audio/'

    # normalized/denoised
    keys_s = '1234567890ABCDEFGHIJKLMNÑOPQRSTUVWXYZ+-' #for custom audio
    labels = list(keys_s)
    keys = [ k + '.wav' for k in labels] #for custom audio
    # Normalized and denoised audio: n_fft = 9, hop_length = 500, before = 2400, after = 12000
    n_fft = 10
    hop_length = 10
    before = 2400 
    after = 12000 

    #binary
    keys_t = '1234567890QWERTYUIOPASDFGHJKLZXCVBNM'
    labelst = list(keys_t)
    keyst = ['audio_' + k + '.wav' for k in labelst]
    # Dataset-for-Binary: n_fft = 1024, hop_length = 225, before = 2400, after = 12000
    n_fft_t = 1024 
    hop_length_t = 225 
    before_t = 2400 
    after_t = 12000 

    audio_descr="Using custom audio with normalized and denoised audio, and dataset-for-binary, together."

elif dataset_choice == 4:
    #Using dataset MKA-dataset
    MBP_AUDIO_DIR = 'MKA-dataset/'

    keys_s = '1234567890ABCDEFGHIJKLMNOPQRSTUVWXYZ+-'
    labels = list(keys_s)
    keys = [ k + 'hp.wav' for k in labels] + [ k + 'lenovo.wav' for k in labels] + [ k + 'mac.wav' for k in labels] + [ k + 'messnger.wav' for k in labels]+[ k + 'msi.wav' for k in labels] + [ k + 'zoom.wav' for k in labels] #that's a big boy right there
    # testing values: n_fft = 9, hop_length = 500, before = 2400, after = 12000
    n_fft = 30
    hop_length = 10
    before = 2400 
    after = 12000 

    audio_descr="Using audio from MKA-dataset."

else: 
    #defaults to 0
    dataset_choice = 0
    print("Invalid dataset choice, defaulting to 0")
    MBP_AUDIO_DIR = '../Dataset-for-Binary/base-audio/'
    keys_s = '1234567890QWERTYUIOPASDFGHJKLZXCVBNM'
    labels = list(keys_s)
    keys = ['audio_' + k + '.wav' for k in labels]

    # Working parameters for the dataset
    # Dataset-for-Binary: n_fft = 1024, hop_length = 225, before = 2400, after = 12000
    n_fft = 1024 
    hop_length = 225 
    before = 2400 
    after = 12000 

    audio_descr="Using audio from Dataset-for-Binary."
    #audio_length=14400

device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
num_classes=len(labels)
if dataset_choice == 0 or dataset_choice==4: #dataset for binary and base custom audio, or MKA-dataset
    audio_length=nfutils.get_audio_length_average(MBP_AUDIO_DIR, keys)

else:
    audio_length=nfutils.get_audio_length_average(MBP_AUDIO_DIR[0], keys)
    audio_length_t=nfutils.get_audio_length_average(MBP_AUDIO_DIR[1], keyst)

File MKA-dataset/1hp.wav length: 23.301224489795917
File MKA-dataset/2hp.wav length: 22.929705215419503
File MKA-dataset/3hp.wav length: 20.468390022675738
File MKA-dataset/4hp.wav length: 22.244716553287983
File MKA-dataset/5hp.wav length: 22.024126984126983
File MKA-dataset/6hp.wav length: 20.36390022675737
File MKA-dataset/7hp.wav length: 18.587573696145125
File MKA-dataset/8hp.wav length: 18.26249433106576
File MKA-dataset/9hp.wav length: 15.847619047619048
File MKA-dataset/0hp.wav length: 17.77487528344671
File MKA-dataset/Ahp.wav length: 19.249342403628116
File MKA-dataset/Bhp.wav length: 20.956009070294783
File MKA-dataset/Chp.wav length: 20.375510204081632
File MKA-dataset/Dhp.wav length: 19.71374149659864
File MKA-dataset/Ehp.wav length: 17.937414965986395
File MKA-dataset/Fhp.wav length: 18.75011337868481
File MKA-dataset/Ghp.wav length: 18.622403628117915
File MKA-dataset/Hhp.wav length: 17.600725623582765
File MKA-dataset/Ihp.wav length: 16.938956916099773
File MKA-dataset/

## Defining basics

In [14]:
# waveform function for me to not bang my keyboard
def disp_waveform(signal, sr=None, color='blue'):
    plt.figure(figsize=(7,2))
    return librosa.display.waveshow(signal, sr=sr, color=color)

In [15]:
def isolator(signal, sample_rate, n_fft, hop_length, before, after, threshold, show=False):
    strokes = []
    # -- signal'
    if show:
        disp_waveform(signal, sr=sample_rate)
    fft = librosa.stft(signal, n_fft=n_fft, hop_length=hop_length)
    energy = np.abs(np.sum(fft, axis=0)).astype(float)
    # norm = np.linalg.norm(energy)
    # energy = energy/norm
    # -- energy'
    if show:
        disp_waveform(energy)
    threshed = energy > threshold
    # -- peaks'
    if show:
        disp_waveform(threshed.astype(float))
    peaks = np.where(threshed == True)[0]
    peak_count = len(peaks)
    prev_end = sample_rate*0.1*(-1)
    # '-- isolating keystrokes'
    for i in range(peak_count):
        this_peak = peaks[i]
        timestamp = (this_peak*hop_length) + n_fft//2
        if timestamp > prev_end + (0.1*sample_rate):
            keystroke = signal[timestamp-before:timestamp+after]
            # strokes.append(torch.tensor(keystroke)[None, :])
            # keystroke = transform(keystroke)
            strokes.append(keystroke)
            if show:
                disp_waveform(keystroke, sr=sample_rate)
            prev_end = timestamp+after
    return strokes

In [40]:
def create_dataset(n_fft, hop_length, before, after, keys, labels, audio_dir, data_dict):
    for i, File in enumerate(keys):
        loc = audio_dir + File
        samples, sr = librosa.load(loc)
        prom = 0.2391 #0.06
        step = 0.005
        strokes = isolator(samples, sr, n_fft, hop_length, before, after, prom, False )
        print(f'File {File} length: {len(strokes)}')
        #add to dict
        label = [labels[i]]*len(strokes)
        #works fine here
        data_dict['Key'] += label
        print(data_dict['Key'])
        data_dict['File'] += strokes

    df = pd.DataFrame(data_dict)
    mapper = {}
    counter = 0
    for l in df['Key']:
        if not l in mapper:
            print ("Enters mapper if")
            mapper[l] = counter
            counter += 1
        print(mapper)
    df.replace({'Key': mapper}, inplace = True)

    return df

In [24]:
len(keys)


228

In [23]:
len(labels)

38

In [32]:
def create_dataset_isolation_test(keys, audio_dir, data_dict,labels):
    #load noise sample
    noise_profile = AudioSegment.from_file("Dataset-custom-audio/audio-standby-files/noise-profile/Noise.wav")
    for i, File in enumerate(keys):
        loc = audio_dir + File
        #loading the file with pydub
        audio = AudioSegment.from_file(loc)

        #separator
        divider=nf(audio, noise_profile)
        chunks, chunks_n=divider.divide_into_chunks()
        print(f'File {File} chunks: {chunks_n}')

        #add to dict
        temp=i
        while temp>len(labels):
            temp=i-len(labels)
        label = [labels[temp]]*chunks_n
        print(label)
        data_dict['Key'] += label
        print(data_dict['Key'])
        data_dict['File'] += chunks

    df = pd.DataFrame(data_dict)
    mapper = {}
    counter = 0
    for l in df['Key']:
        if not l in mapper:
            print ("Enters mapper if")
            mapper[l] = counter
            counter += 1
        print(mapper)
    df.replace({'Key': mapper}, inplace = True)

    return df


## Creating dataset from chosen audio files

In [48]:
if split_method == 1: #noise filtering method
    data_dict = {'Key':[], 'File':[]} #Resets data_dict for testing purposes
    if dataset_choice == 0:
        mbp_dataset = create_dataset_isolation_test(keys, MBP_AUDIO_DIR, data_dict, labels) 
    if dataset_choice == 4:
        keys_hp = [key for key in keys if "hp.wav" in key]
        keys_lenovo = [key for key in keys if "lenovo.wav" in key]
        keys_mac = [key for key in keys if "mac.wav" in key]
        keys_messenger = [key for key in keys if "messenger.wav" in key]
        keys_msi=[key for key in keys if "msi.wav" in key]
        keys_zoom=[key for key in keys if "zoom.wav" in key]
        #do one for each
        mbp_dataset = create_dataset_isolation_test(keys_hp, MBP_AUDIO_DIR, data_dict, labels) 
        mbp_dataset_lenovo = create_dataset_isolation_test(keys_lenovo, MBP_AUDIO_DIR, data_dict, labels)
        mbp_dataset_mac = create_dataset_isolation_test(keys_mac, MBP_AUDIO_DIR, data_dict, labels)
        mbp_dataset_messenger = create_dataset_isolation_test(keys_messenger, MBP_AUDIO_DIR, data_dict, labels)
        mbp_dataset_msi = create_dataset_isolation_test(keys_msi, MBP_AUDIO_DIR, data_dict, labels)
        mbp_dataset_zoom = create_dataset_isolation_test(keys_zoom, MBP_AUDIO_DIR, data_dict, labels)
        mbp_dataset = pd.concat([mbp_dataset, mbp_dataset_lenovo, mbp_dataset_mac, mbp_dataset_messenger, mbp_dataset_msi, mbp_dataset_zoom])
    else:
        data_dict_t = {'Key':[], 'File':[]} #Resets data_dict for testing purposes
        mbp_dataset = create_dataset_isolation_test(keys, MBP_AUDIO_DIR[0], data_dict, labels)
        mbp_dataset_t = create_dataset_isolation_test(keyst, MBP_AUDIO_DIR[1], data_dict_t, labelst)
        mbp_dataset = pd.concat([mbp_dataset, mbp_dataset_t])
    audio_descr=audio_descr+" Method used for key separation: NoiseFiltering through Peak ID."
    mbp_dataset

In [87]:
if split_method == 0: #isolator
    data_dict = {'Key':[], 'File':[]} #Resets data_dict for testing purposes
    if dataset_choice == 0:
        mbp_dataset = create_dataset(n_fft, hop_length, before, after, keys, labels, MBP_AUDIO_DIR, data_dict)
    if dataset_choice == 4:
        #split keys into sets
        keys_hp = [key for key in keys if "hp.wav" in key]
        keys_lenovo = [key for key in keys if "lenovo.wav" in key]
        keys_mac = [key for key in keys if "mac.wav" in key]
        keys_messenger = [key for key in keys if "messenger.wav" in key]
        keys_msi=[key for key in keys if "msi.wav" in key]
        keys_zoom=[key for key in keys if "zoom.wav" in key]
        #do one for each
        mbp_dataset = create_dataset(n_fft, hop_length, before, after, keys_hp,  labels,MBP_AUDIO_DIR, data_dict) 
        mbp_dataset_lenovo = create_dataset(n_fft, hop_length, before, after,keys_lenovo, labels, MBP_AUDIO_DIR, data_dict)
        mbp_dataset_mac = create_dataset(n_fft, hop_length, before, after,keys_mac, labels, MBP_AUDIO_DIR, data_dict)
        mbp_dataset_messenger = create_dataset(n_fft, hop_length, before, after,keys_messenger,  labels,MBP_AUDIO_DIR, data_dict)
        mbp_dataset_msi = create_dataset(n_fft, hop_length, before, after,keys_msi,  labels,MBP_AUDIO_DIR, data_dict)
        mbp_dataset_zoom = create_dataset(n_fft, hop_length, before, after,keys_zoom, labels, MBP_AUDIO_DIR, data_dict)
        mbp_dataset = pd.concat([mbp_dataset, mbp_dataset_lenovo, mbp_dataset_mac, mbp_dataset_messenger, mbp_dataset_msi, mbp_dataset_zoom])
    else:
        data_dict_t = {'Key':[], 'File':[]} #Resets data_dict for testing purposes
        mbp_dataset = create_dataset(n_fft, hop_length, before, after, keys, labels, MBP_AUDIO_DIR[0], data_dict)
        mbp_dataset_t = create_dataset(n_fft_t, hop_length_t, before_t, after_t, keyst, labelst, MBP_AUDIO_DIR[1], data_dict_t)
        mbp_dataset = pd.concat([mbp_dataset, mbp_dataset_t])
    audio_descr=audio_descr+" Method used for key separation: Isolator."
    
    mbp_dataset

File 1hp.wav length: 31
temp: 0
21284
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
File 2hp.wav length: 30
temp: 1
21284
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
File 3hp.wav length: 29
temp: 2
21284
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
File 4hp.wav length: 30
temp: 3
21284
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [51]:
audio_samples = mbp_dataset['File'].values.tolist()
labels = mbp_dataset['Key'].values.tolist()

audioDataset = np.array(audio_samples[0], dtype = np.float32)
print(audio_samples[0].shape)
mfcc = librosa.feature.mfcc(y=audioDataset, sr=44100) # shape: (n_mfcc, t)
print(mfcc.shape)
# labels = np.array(labels)

(14400,)
(20, 29)


In [52]:
audio_samples[0]


array([-0.01196817, -0.00884912, -0.01165226, ...,  0.01509483,
        0.01370334,  0.00998038], dtype=float32)

## Defining model and specifying dataset


In [54]:
class TimeShifting():
    def __call__(self, samples):
#       samples_shape = samples.shape
        samples = samples.flatten()
        
        shift = int(len(samples) * 0.4) #Max shift (0.4)
        random_shift = random.randint(0, shift) #Random number between 0 and 0.4*len(samples)
        data_roll = np.roll(samples, random_shift)
        return data_roll

In [55]:
def time_shift(samples):
    samples = samples.flatten()
    shift = int(len(samples) * 0.4) #Max shift (0.4)
    random_shift = random.randint(0, shift) #Random number between 0 and 0.4*len(samples)
    data_roll = np.roll(samples, random_shift)
    return data_roll

In [56]:
class SpecAugment(): #added from new version
    def __call__(self, samples):
        num_mask = 2
        freq_masking_max_percentage=0.10
        time_masking_max_percentage=0.10
        spec = samples.copy()
        mean_value = spec.mean()
        for i in range(num_mask):
            all_frames_num, all_freqs_num = spec.shape[1], spec.shape[1] 
            freq_percentage = random.uniform(0.0, freq_masking_max_percentage)

            num_freqs_to_mask = int(freq_percentage * all_freqs_num)
            f0 = np.random.uniform(low=0.0, high=all_freqs_num - num_freqs_to_mask)
            f0 = int(f0)
            spec[:, f0:f0 + num_freqs_to_mask] = mean_value

            time_percentage = random.uniform(0.0, time_masking_max_percentage)

            num_frames_to_mask = int(time_percentage * all_frames_num)
            t0 = np.random.uniform(low=0.0, high=all_frames_num - num_frames_to_mask)
            t0 = int(t0)
            spec[t0:t0 + num_frames_to_mask, :] = mean_value
        return spec

In [57]:
def masking(samples): #added from new version
    num_mask = 2
    freq_masking_max_percentage=0.10
    time_masking_max_percentage=0.10
    spec = samples
    mean_value = spec.mean()
    for i in range(num_mask):
        all_frames_num, all_freqs_num = spec.shape[1], spec.shape[1] 
        freq_percentage = random.uniform(0.0, freq_masking_max_percentage)

        num_freqs_to_mask = int(freq_percentage * all_freqs_num)
        f0 = np.random.uniform(low=0.0, high=all_freqs_num - num_freqs_to_mask)
        f0 = int(f0)
        spec[:, f0:f0 + num_freqs_to_mask] = mean_value

        time_percentage = random.uniform(0.0, time_masking_max_percentage)

        num_frames_to_mask = int(time_percentage * all_frames_num)
        t0 = np.random.uniform(low=0.0, high=all_frames_num - num_frames_to_mask)
        t0 = int(t0)
        spec[t0:t0 + num_frames_to_mask, :] = mean_value
    return spec

In [58]:
from skimage.transform import resize


class ToMelSpectrogram:
    def __init__(self, audio_length=14400):
        self.audio_length = audio_length

    def __call__(self, samples):
        if len(samples) > self.audio_length:
            samples = samples[:self.audio_length]
        elif len(samples) < self.audio_length:
            samples = np.pad(samples, (0, self.audio_length - len(samples)), mode='constant')

        mel_spec = librosa.feature.melspectrogram(y=samples, sr=44100, n_mels=64, n_fft=1024, hop_length=225)
        mel_spec_resized = resize(mel_spec, (64, 64), anti_aliasing=True)
        mel_spec_resized = np.expand_dims(mel_spec_resized, axis=0)
        return torch.tensor(mel_spec_resized)


class ToMelSpectrogramMfcc:
    def __init__(self, audio_length=14400):
        self.audio_length = audio_length

    def __call__(self, samples):
        if len(samples) > self.audio_length:
            samples = samples[:self.audio_length]
        elif len(samples) < self.audio_length:
            samples = np.pad(samples, (0, self.audio_length - len(samples)), mode='constant')

        mel_spec = librosa.feature.melspectrogram(y=samples, sr=44100, n_mels=64, n_fft=n_fft, hop_length=hop_length)
        mel_spec = librosa.feature.mfcc(S=librosa.power_to_db(mel_spec))
        mel_spec_resized = resize(mel_spec, (64, 64), anti_aliasing=True)
        mel_spec_resized = np.expand_dims(mel_spec_resized, axis=0)

        return torch.tensor(mel_spec_resized)


class ToMfcc:
    def __init__(self, audio_length=14400):
        self.audio_length = audio_length

    def __call__(self, samples):
        if len(samples) > self.audio_length:
            samples = samples[:self.audio_length]
        elif len(samples) < self.audio_length:
            samples = np.pad(samples, (0, self.audio_length - len(samples)), mode='constant')
        
        mfcc_spec = librosa.feature.mfcc(y=samples, sr=44100)
        mfcc_spec = np.transpose(mfcc_spec)
        return torch.tensor(mfcc_spec)


In [59]:
transform = Compose([ToMelSpectrogram(audio_length=audio_length)])
transform_mfcc = Compose([ToMfcc(audio_length=audio_length)])

In [60]:
audio_samples_new = audio_samples.copy() # audio samples CNN

for i, sample in enumerate(audio_samples):
    audio_samples_new.append(time_shift(sample))
    labels.append(labels[i])
    
# convert labels to a numpy array
labels = np.array(labels)
print(len(audio_samples_new))
print(len(labels))

21284
21284


In [61]:
audioDatasetFin, audioDatasetMfcc = [], []
audioDatasetMfccMasking = [] #from new version

for i in range(len(audio_samples_new)):
    #converting to floating point
    audio_sample_float = np.array(audio_samples_new[i], dtype=np.float32)
    
    transformed_sample = transform(audio_sample_float)
    transformed_mfcc = transform_mfcc(audio_sample_float)
    audioDatasetMfcc.append((transformed_sample, transformed_mfcc, labels[i]))
    
    # CoAtNet part
    audioDatasetFin.append((transformed_sample, labels[i]))
    audioDatasetFin.append((masking(transformed_sample), labels[i]))
    
    # masking part
    audioDatasetMfccMasking.append((masking(transformed_sample), transformed_mfcc, labels[i]))
    audioDatasetMfccMasking.append((transformed_sample, transformed_mfcc, labels[i]))

# Old version - keep just in case
#for i in range(len(audio_samples_new)):
#    transformed_sample = transform(audio_samples_new[i])
#    transformed_mfcc = transform_mfcc(audio_samples_new[i])
#    audioDatasetFin.append((transformed_sample, labels[i]))
#    audioDatasetMfcc.append((transformed_sample, transformed_mfcc, labels[i]))

In [62]:
len(audioDatasetFin)

42568

In [63]:
audioDatasetMfcc[0][0].shape

torch.Size([1, 64, 64])

In [64]:
#for CNN

class MfccLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_classes,dropout=0.2, ):
        super(MfccLSTM, self).__init__()
        
        self.conv = nn.Sequential(
            nn.Conv2d(1, 32, 3, 1),
            nn.BatchNorm2d(32), #from new version
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(32, 64, 3, 1),
            nn.BatchNorm2d(64), #from new version
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Flatten(),
            nn.LazyLinear(512),
            nn.ReLU(),
            nn.Linear(512, num_classes)
        )
        
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.lstm2 = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.fc1 = nn.LazyLinear(64)
        self.fc2 = nn.Linear(64, 16)
    
        self.fc3 = nn.LazyLinear(128)
        self.final_lstm = nn.LSTM(1, 64, batch_first=True)
        
        self.fc = nn.LazyLinear(num_classes)
    
    def forward(self, image_input, sequence_input):
        # must return shape (batch_size, num_classes) 
        # batch_size: right now is 16
        # num_classes: right now is 36
        x1 = self.conv(image_input)
        out1, _ = self.lstm(sequence_input)
        out1_dp = self.dropout(out1)
        # print(f'output of first lstm: {out1_dp.shape[1:]}')
        out2, _ = self.lstm2(out1_dp[:, -1, :])
        out2_dp = self.dropout(out2)
        # print(f'output of second lstm: {out2_dp.shape[1:]}')
        x2 = self.fc2(self.fc1(out2_dp))
        x3 = torch.cat((x1, x2), 1)
        # print(f'output of concatenation: {x3.shape[1:]}')
        # x = self.fc(final_out[:, -1, :])
        x = self.fc(x3)
        return x
    

In [65]:
# Model architecture for CNN
class CNN(nn.Module):
    def __init__(self, num_classes):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.fc1 = nn.LazyLinear(512)
        self.fc2 = nn.Linear(512, num_classes)
    
    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 64 * 14 * 14)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [66]:
# from new version: tread lightly, adjusted to fit current settings

import time
from ClassificationCNN.coatnet import CoAtNet as CoAtNetImp

num_blocks = [2, 2, 3, 5, 2]            # L
channels = [64, 96, 192, 384, 768]      # D

def train_coatnet_with_cross_val(dataset, num_epochs, model_name, device_external, num_classes=num_classes, patience=10): #not using folds?
    # Split dataset into training and validation sets
    train_set, val_set = train_test_split(dataset, test_size=0.2) # using train_test_split instead of subset this time around
    train_loader, val_loader = DataLoader(train_set, batch_size=16), DataLoader(val_set, batch_size=16)
    
    # Initialize model, optimizer, and loss function
    model = CoAtNetImp((64, 64), 1, num_blocks, channels, num_classes=num_classes) #using coatnet instead of mfcclstm
    device = torch.device(device_external) #default to mps
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=5e-4)
    criterion = nn.CrossEntropyLoss()
    
    
    best_val_acc, epochs_no_imp = 0, 0
    train_accuracies, val_accuracies = [], []
    
    for epoch in range(num_epochs):
        model.train()
        epoch_train_loss = 0.0
        correct_train = 0
        total_train = 0
        tic = time.perf_counter()
        
        for images, labels in train_loader: #not using sequences
            images = images.to(device)
            labels = labels.to(device)
            
            optimizer.zero_grad()
    
            labels = labels.long() # converting labels to Long to avoid error "not implemented for Int"

            # Check that labels are within the valid range
            assert labels.min() >= 0 and labels.max() < num_classes, "Labels are out of bounds"
            
            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)
            epoch_train_loss += loss.item() * images.size(0)
    
            _, predicted_train = torch.max(outputs.data, 1)
            total_train += labels.size(0)
            correct_train += (predicted_train == labels).sum().item()
            
            # Backward pass
            loss.backward()
            optimizer.step()
        
        toc = time.perf_counter()
        time_taken = toc - tic
        
        epoch_train_loss /= len(train_loader.dataset)
        train_accuracy = correct_train / total_train
        train_accuracies.append(train_accuracy)
        
        # Evaluation of the model
        model.eval()
        total, correct = 0, 0
        for images, labels in val_loader:
            images = images.to(device)
            labels = labels.to(device)
    
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
        
        val_accuracy = correct / total
        val_accuracies.append(val_accuracy)
        print(f"Epoch [{epoch + 1}/{num_epochs}], Train Loss: {epoch_train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, Val Accuracy: {val_accuracy:.4f}, Iter Time: {time_taken:.2f}s")
            
        if val_accuracy > best_val_acc:
            best_val_acc = val_accuracy
            epochs_no_imp = 0
        else:
            epochs_no_imp += 1
        if epochs_no_imp >= patience:
            print(f'Early stopping after {epoch+1} epochs')
            break
    torch.save(model.state_dict(), model_name)
    return epoch+1, best_val_acc

In [67]:
import time

def train_with_cross_validation(dataset, num_epochs, model_name, num_classes, patience=15, random_state=42, n_splits=10):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    fold_results = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(dataset)):
        print(f'Fold {fold+1}/{n_splits}')
        
        # Split the dataset into training and validation sets
        train_set = Subset(dataset, train_idx)
        val_set = Subset(dataset, val_idx)
        train_loader = DataLoader(train_set, batch_size=16, shuffle=True)
        val_loader = DataLoader(val_set, batch_size=16, shuffle=True)
        
        # Initialize model, optimizer, and loss function
        model = MfccLSTM(input_size=20, hidden_size=32, num_classes=num_classes, output_size=64)
        model = model.to(device)
        optimizer = optim.Adam(model.parameters(), lr=5e-4)
        criterion = nn.CrossEntropyLoss()
        
        best_val_acc, epochs_no_imp = 0, 0
        train_accuracies, val_accuracies = [], []

        for epoch in range(num_epochs):
            model.train()
            epoch_train_loss = 0.0
            correct_train = 0
            total_train = 0
            tic = time.perf_counter()
            
            for images, sequences, labels in train_loader:
                images = images.to(device)
                sequences = sequences.to(device)
                labels = labels.to(device)
                
                optimizer.zero_grad()

                #converting labels to Long to avoid error "not implemented for Int"
                labels = labels.long()
                
                # Check that labels are within the valid range
                assert labels.min() >= 0 and labels.max() < num_classes, "Labels are out of bounds"
                
                # Forward pass
                outputs = model(images, sequences)
                loss = criterion(outputs, labels)
                epoch_train_loss += loss.item() * images.size(0)

                _, predicted_train = torch.max(outputs.data, 1)
                total_train += labels.size(0)
                correct_train += (predicted_train == labels).sum().item()
                
                # Backward pass
                loss.backward()
                optimizer.step()
            
            toc = time.perf_counter()
            time_taken = toc - tic
            
            epoch_train_loss /= len(train_loader.dataset)
            train_accuracy = correct_train / total_train
            train_accuracies.append(train_accuracy)
            
            # Evaluation of the model
            model.eval()
            total, correct = 0, 0
            for images, sequences, labels in val_loader:
                images = images.to(device)
                sequences = sequences.to(device)
                labels = labels.to(device)

                outputs = model(images, sequences)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
            
            val_accuracy = correct / total
            val_accuracies.append(val_accuracy)
            if (epoch + 1) % 5 == 0:
                print(f"Epoch [{epoch + 1}/{num_epochs}], Train Loss: {epoch_train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, Val Accuracy: {val_accuracy:.4f}, Iter Time: {time_taken:.2f}s")
                
            if val_accuracy > best_val_acc:
                best_val_acc = val_accuracy
                epochs_no_imp = 0
                best_model_state = model.state_dict()  # Save the best model
            else:
                epochs_no_imp += 1
            if epochs_no_imp >= patience:
                print(f'Early stopping after {epoch+1} epochs')
                model.load_state_dict(best_model_state)  # Load the best model
                break
        
        fold_results.append((epoch+1, best_val_acc))
        print(f'Fold {fold+1} Best Validation Accuracy: {best_val_acc:.4f}')
    torch.save(model.state_dict(), model_name)

    return fold_results

In [80]:
# from new version: tread lightly, adjusted to fit current settings

def predict(dataset, model_obj, argnames, model_path, device_external, keys, batch_size=32):
    def get_batches(dataset, batch_size):
        for i in range(0, len(dataset), batch_size):
            yield dataset[i:i + batch_size]

    all_preds = []

    # specify device: default to mps
    device = torch.device(device_external)

    # model specifying
    model = model_obj.to(device)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    for batch in get_batches(dataset, batch_size):
        fin_dict = {}
        
        # create the list with each of the ith range tuples
        for i in range(len(batch[0])-1):
            fin_dict[argnames[i]] = [t[i] for t in batch]
        
        # torch.stack each one of the lists
        for key in fin_dict.keys():
            fin_dict[key] = torch.stack(fin_dict[key]).to(device)
        
        with torch.no_grad():
            outputs = model(**fin_dict)
            _, predicted = torch.max(outputs.data, 1)
        
        phrase = predicted.tolist()
        for i in range(len(phrase)):
            all_preds.append(keys[phrase[i]])

    pred_df = pd.DataFrame(all_preds)
    return pred_df

In [69]:
def predict_mfcc(dataset, model_path, device_external, keys, num_classes, batch_size=32):
    def get_batches(dataset, batch_size):
        for i in range(0, len(dataset), batch_size):
            yield dataset[i:i + batch_size]

    all_preds = []

    device = torch.device(device_external) # default to mps

    model = MfccLSTM(input_size=20, hidden_size=32, num_classes=num_classes, output_size=64)
    model = model.to(device)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    for batch in get_batches(dataset, batch_size):
        images_test_set = [t[0] for t in batch]
        sequences_test_set = [t[1] for t in batch]

        images = torch.stack(images_test_set).to(device)
        sequences = torch.stack(sequences_test_set).to(device)

        with torch.no_grad():
            outputs = model(images, sequences)
            _, predicted = torch.max(outputs.data, 1)

        phrase = predicted.tolist()
        for i in range(len(phrase)):
            all_preds.append(keys[phrase[i]])

    pred_df = pd.DataFrame(all_preds)
    return pred_df

In [70]:
def save_csv(model_name, num_epochs, description, accuracy, precision, recall, f1_score):
    csv_file_path = 'model_comparison.csv'
    
    # Read the existing CSV file into a DataFrame
    try:
        df = pd.read_csv(csv_file_path)
    except FileNotFoundError:
        # If the file does not exist, create an empty DataFrame with the correct columns
        df = pd.DataFrame(columns=['Datetime', 'Name', 'Epochs', 'Description', 'Accuracy', 'Precision', 'Recall', 'F1'])
        
    # Data to append
    current_datetime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    # Remove newline characters from the description
    description = description.replace('\n', ' ').replace('\r', ' ')
    
    # Create a new column with the relevant information
    new_data = {
        'Datetime': [current_datetime],
        'Name': [model_name],
        'Epochs': [num_epochs],
        'Description': [description],
        'Accuracy': [accuracy],
        'Precision': [precision],
        'Recall': [recall],
        'F1': [f1_score],
    }
    
    new_df = pd.DataFrame(new_data)
    
    df = pd.concat([df, new_df], ignore_index=True)
    
    # Save the updated DataFrame back to the CSV file
    df.to_csv(csv_file_path, index=False)

# Running CNN

In [71]:
if model_choice == 0:
    # current random state to split the dataset
    random_state = 42

    # values for current run
    train_final_set, test_set = train_test_split(audioDatasetMfcc, test_size=0.2, random_state=random_state)
    num_epochs = 100
    main_architecture = "CNN_LSTM"
    currday = datetime.today().strftime('%Y-%m-%d')
    model_name = f"model_multiclass_{num_epochs}_{main_architecture}_{currday}.pth"
    description = "2 layer CNN (32 and 64 output channels) with final 2 Dense Layers (512 and "+str(num_classes)+") result concatenated with \n 2 LSTMs (hidden_size=32),  from mfcc with 2 Dense Layers (64 and 16) with a final Lazy Linear layer output of "+str(num_classes)+". "+audio_descr+" n_fft="+str(n_fft)+", hop_length="+str(hop_length)+". Using np.average(fold_results). Using updated model from new version."


    # Training part
    fold_stats = train_with_cross_validation(train_final_set, num_epochs, model_name, num_classes,random_state=random_state)
    max_val = 0
    real_num_epochs = 0
    for fold_stat in fold_stats: #using folds instead of LOO
        if fold_stat[1] > max_val:
            max_val = fold_stat[1]
            real_num_epochs = fold_stat[0]


In [72]:
if model_choice == 0:
    # Prediction part
    prediction = predict_mfcc(test_set, model_name, device, keys_s,num_classes)
    labels_set = [t[2] for t in test_set]
    final_labels_set = [keys_s[ind] for ind in labels_set]

    # Metrics calculation
    accuracy = accuracy_score(final_labels_set, prediction[0])
    precision = precision_score(final_labels_set, prediction[0], average='macro')
    recall = recall_score(final_labels_set, prediction[0], average='macro')
    f1 = sklearn.metrics.f1_score(final_labels_set, prediction[0], average='macro')

    # Save in csv file
    save_csv(model_name, real_num_epochs, description, accuracy, precision, recall, f1)

    # Print results
    print("Final Results!")
    print(f"Model: {model_name}")
    print(description)
    print(f"Epochs: {real_num_epochs}")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")

# Running imported CoAtNet model

This section uses the new settings.

In [81]:
if model_choice == 1:
    # current random state to split the dataset
    random_state = 42
    # patience value
    patience=15

    # values for current run
    train_final_set, test_set = train_test_split(audioDatasetFin, test_size=0.2, random_state=random_state)

    # specific to current run
    num_epochs = 500
    main_architecture = "CoAtNetImp"
    currday = datetime.today().strftime('%Y-%m-%d')
    model_name = f"model_multiclass_{num_epochs}_{main_architecture}_{currday}.pth"

    # Training part
    # changed so with the current k-fold cross validation we can track the average number of epochs
    # real_num_epochs, best_val_acc = num_epochs, train_with_cross_validation(train_final_set, num_epochs, model_name, random_state=random_state)
    real_num_epochs, best_val_acc = train_coatnet_with_cross_val(train_final_set, num_epochs, model_name, device, num_classes=num_classes, patience=patience)

KeyboardInterrupt: 

In [82]:
if model_choice == 1:
    # Prediction part
    # All metrics are calculated from the model with the best validation accuracy
    # model = MfccLSTM(input_size=20, hidden_size=32, num_classes=21, output_size=64)
    model = CoAtNetImp((64, 64), 1, num_blocks, channels, num_classes=num_classes)
    #clear cache before predicting
    torch.cuda.empty_cache()
    # prediction = predict(test_set, model, ["images", "sequences"],model_name, device)
    prediction = predict(test_set, model, ["x"],model_name, device, keys=keys_s)
    # prediction = predict(test_set, model, main_architecture, model_name, random_state)
    # prediction = predict_mfcc(test_set, model_name, device)
    labels_set = [t[-1] for t in test_set]
    final_labels_set = [keys_s[ind] for ind in labels_set]
    print(list(prediction[0])[15:25])
    print(final_labels_set[15:25])

    # Metrics calculation
    accuracy = accuracy_score(final_labels_set, prediction[0])
    precision = precision_score(final_labels_set, prediction[0], average='macro')
    recall = recall_score(final_labels_set, prediction[0], average='macro')
    f1 = sklearn.metrics.f1_score(final_labels_set, prediction[0], average='macro')

    # Save in csv file
    # CNN
    #description = "2 layer CNN (32 and 64 output channels) with final 2 Dense Layers (512 and "+str(num_classes)+") result concatenated with \n 2 LSTMs (hidden_size=32),  from mfcc with 2 Dense Layers (64 and 16) with a final Lazy Linear layer output of "+str(num_classes)+". "+audio_descr+" n_fft="+str(n_fft)+", hop_length="+str(hop_length)+". Using np.average(fold_results)"

    # CoAtNet
    description = "Imported CoAtNet model, with 2 Conv layers and then 2 Attention layers followed by a fully connected layer. "+audio_descr+" Test from IdeaPad. Did "+str(real_num_epochs)+" epochs. "+str(num_classes)+" keys recorded. Patience: "+str(patience)+" n_fft="+str(n_fft)+", hop_length="+str(hop_length)+". Using np.average(fold_results)"
    save_csv(model_name, int(real_num_epochs), description, accuracy, precision, recall, f1)

    # Print results
    print("Final Results!")
    print(f"Model: {model_name}")
    print(description)
    print(f"Epochs: {num_epochs}")
    print(f"Accuracy: {accuracy:.3f}")
    print(f"Precision: {precision:.3f}")
    print(f"Recall: {recall:.3f}")
    print(f"F1 Score: {f1:.3f}")
    print(f"Best val accuracy: {best_val_acc:.3f}")

['J', 'F', 'Y', 'Q', 'I', 'T', '7', '4', 'J', 'L']
['J', 'F', 'Y', '-', 'K', 'T', '7', '4', 'J', 'L']
Final Results!
Model: model_multiclass_500_CoAtNetImp_2024-10-01.pth
Imported CoAtNet model, with 2 Conv layers and then 2 Attention layers followed by a fully connected layer. Using audio from MKA-dataset. Method used for key separation: Isolator. Test from IdeaPad. Did 29 epochs. 38 keys recorded. Patience: 15 n_fft=10, hop_length=10. Using np.average(fold_results)
Epochs: 500
Accuracy: 0.650
Precision: 0.747
Recall: 0.649
F1 Score: 0.666
Best val accuracy: 0.657
