In [1]:
import IPython.display as ipd
# % pylab inline
import os
import pandas as pd
import librosa
import glob 
#import librosa.display
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import cm
import random

In [2]:
pd.set_option('display.max_colwidth', None)
pd.options.display.max_rows=500

In [3]:
from pathlib import Path
filelist = Path('LibriSpeech/train-clean-100').rglob('*.flac')

df_train = pd.DataFrame(filelist)
df_train = df_train.rename(columns={0:'file'})
df_train.head()

Unnamed: 0,file
0,LibriSpeech/train-clean-100/5393/19218/5393-19218-0016.flac
1,LibriSpeech/train-clean-100/5393/19218/5393-19218-0019.flac
2,LibriSpeech/train-clean-100/5393/19218/5393-19218-0059.flac
3,LibriSpeech/train-clean-100/5393/19218/5393-19218-0011.flac
4,LibriSpeech/train-clean-100/5393/19218/5393-19218-0005.flac


In [4]:
speakers_file = "LibriSpeech/SPEAKERS.TXT"
speakers = pd.read_table(
    speakers_file,
    engine='python',
    sep='\s+\|\s+',
    names=['id','gender','subset','duration','name'],
    dtype={'id': 'i','gender': 'U1', 'subset': 'U', 'duration': 'f', 'name': 'U'}, comment=';')
speakers = speakers.drop(speakers[
    (speakers['subset']=='train-clean-360')
    |(speakers['subset']=='train-other-500')
    |(speakers['subset']=='dev-other')
    |(speakers['subset']=='test-other')
].index)
speakers

Unnamed: 0,id,gender,subset,duration,name
3,19,F,train-clean-100,25.190001,Kara Shallenberg
8,26,M,train-clean-100,25.08,Denny Sayers
9,27,M,train-clean-100,20.139999,Sean McKinley
14,32,F,train-clean-100,24.01,Betsie Bush
18,39,F,train-clean-100,25.049999,Sherry Crowther
19,40,F,train-clean-100,25.040001,Vicki Barbour
32,60,M,train-clean-100,20.18,|CBW|Simon
33,61,M,test-clean,8.08,Paul-Gabriel Wiener
41,78,M,train-clean-100,25.049999,Hugh McGuire
45,83,F,train-clean-100,25.040001,Catharine Eastman


In [5]:
speakers['subset'].value_counts()

subset
train-clean-100    251
test-clean          40
dev-clean           40
Name: count, dtype: int64

In [6]:
from pathlib import Path
filelist = Path('LibriSpeech').rglob('*.flac')
files = [{'speaker_id': int(str(file).split('/')[2]), 'file': file} for file in filelist]
df_files=pd.DataFrame(files)
df_files

Unnamed: 0,speaker_id,file
0,5393,LibriSpeech/train-clean-100/5393/19218/5393-19218-0016.flac
1,5393,LibriSpeech/train-clean-100/5393/19218/5393-19218-0019.flac
2,5393,LibriSpeech/train-clean-100/5393/19218/5393-19218-0059.flac
3,5393,LibriSpeech/train-clean-100/5393/19218/5393-19218-0011.flac
4,5393,LibriSpeech/train-clean-100/5393/19218/5393-19218-0005.flac
...,...,...
33857,2428,LibriSpeech/dev-clean/2428/83705/2428-83705-0019.flac
33858,2428,LibriSpeech/dev-clean/2428/83705/2428-83705-0028.flac
33859,2428,LibriSpeech/dev-clean/2428/83705/2428-83705-0025.flac
33860,2428,LibriSpeech/dev-clean/2428/83705/2428-83705-0021.flac


In [7]:
from fast_ml.model_development import train_valid_test_split

df_files["Sets"] = "Training"
for lbl in df_files['speaker_id'].unique():
    temp_data = df_files[df_files['speaker_id'] == lbl]
    X_train, y_train, X_val, y_val, X_test, y_test = train_valid_test_split(
        temp_data,
        target="speaker_id",
        train_size=0.6,
        valid_size=0.1,
        test_size=0.3
    )
    df_files.Sets.iloc[X_test.index] = "Testing"
    df_files.Sets.iloc[X_val.index] = "Validation"

In [8]:
# Although this function was modified and many parameteres were explored with, most of it
# came from Source 8 (sources in the READ.ME)

def extract_features(files):
    
    # Sets the name to be the path to where the file is in my computer
    file_name = str(files.file)

    # Loads the audio file as a floating point time series and assigns the default sample rate
    # Sample rate is set to 22050 by default
    X, sample_rate = librosa.load(file_name, res_type='kaiser_fast') 

    # Generate Mel-frequency cepstral coefficients (MFCCs) from a time series 
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T,axis=0)

    # Generates a Short-time Fourier transform (STFT) to use in the chroma_stft
    stft = np.abs(librosa.stft(X))

    # Computes a chromagram from a waveform or power spectrogram.
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)

    # Computes a mel-scaled spectrogram.
    mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T,axis=0)

    # Computes spectral contrast
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T,axis=0)

    # Computes the tonal centroid features (tonnetz)
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X),
    sr=sample_rate).T,axis=0)
        
    
    # We add also the classes of each file as a label at the end
    label = files.speaker_id

    return mfccs, chroma, mel, contrast, tonnetz, label

In [64]:
df_files.to_pickle('Librispeech_PyTorch_df_files.pkl')

In [8]:
df_files = pd.read_pickle('Librispeech_PyTorch_df_files.pkl')

In [10]:
import numpy as np
import os
from datetime import datetime

picklefile = 'Librispeach_PyTorch_features_label.pkl'
if os.path.isfile(picklefile):
    print(f"Reading features from {picklefile}")
    features_label = pd.read_pickle(picklefile)
else:
    startTime = datetime.now()
    features_label = df_files.apply(extract_features, axis=1)
    print(datetime.now() - startTime)
    features_label.to_pickle(picklefile)

Reading features from Librispeach_PyTorch_features_label.pkl


In [11]:
features_label.head()

0      ([-355.78824, 115.00196, -33.891296, 38.743874, -9.843966, 1.6396469, -3.4062624, -5.2952833, 9.099011, -2.4850764, 0.7190697, -3.04304, 5.025559, 3.5158784, -1.4026278, -0.12272247, 2.450649, -0.13805504, -6.4034905, 0.22591878, -1.1307915, -4.3972487, -1.0534327, 0.05638659, -2.9429982, -1.1572398, -0.08161958, -1.9492955, 0.79942846, -0.13769592, 1.6648215, 3.1556494, 3.4880686, 4.2836747, 2.9823754, 3.6624057, 2.6771302, 2.9374325, 2.6298814, 1.1149267], [0.6906988, 0.6511583, 0.58476067, 0.5434637, 0.555994, 0.57990086, 0.61002755, 0.61547655, 0.62042934, 0.6521874, 0.669347, 0.6868791], [0.041331366, 0.19435953, 0.088610314, 0.010506179, 0.031644356, 0.47447613, 2.575366, 5.955543, 3.446617, 0.3173315, 0.06681397, 0.08960947, 0.48753977, 0.86967057, 1.6997426, 2.862485, 7.4286966, 7.401466, 0.4023322, 1.0836115, 0.9524352, 1.0556357, 1.2197704, 1.6775879, 1.8304838, 0.66789055, 0.38433993, 0.69279563, 0.95153636, 3.043987, 1.3170439, 1.9210573, 3.3581932, 1.4897953, 0.7631

In [12]:
features_label.shape

(33862,)

In [13]:
import numpy as np

features = []
for i in range(0, len(features_label)):
    features.append(np.concatenate((features_label[i][0], features_label[i][1], 
                features_label[i][2], features_label[i][3],
                features_label[i][4]), axis=0))

In [14]:
df_files['X'] = features

In [15]:
import torch

speakers_tensor = torch.tensor(df_files['speaker_id'])
speakers_tensor.shape

torch.Size([33862])

In [16]:
speakers_onehot = torch.zeros(speakers_tensor.shape[0], 331)

In [17]:
label2index = {label:idx for idx,label in enumerate(set(df_files['speaker_id']))}

In [18]:
len(label2index)

331

In [19]:
numeric_labels = [label2index[label] for label in df_files['speaker_id']]

In [20]:
numeric_labels

[228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 228,
 78,
 78,
 78,
 78,
 78,
 78,
 78,
 78,
 78,
 78,
 78,
 78,
 78,
 78,
 78,
 78,
 78,
 78,
 78,
 78,
 78,
 78,
 78,
 78,
 78,
 78,
 78,
 78,
 78,
 78,
 78,
 78,
 78,
 78,
 78,
 78,
 78,
 78,
 78,
 78,
 78,
 78,
 78,
 78,
 78,
 78,
 78,
 78,
 78,
 78,


In [21]:
num_classes = len(label2index)
one_hot_encoded = torch.nn.functional.one_hot(torch.tensor(numeric_labels), num_classes)

In [22]:
one_hot_encoded.shape

torch.Size([33862, 331])

In [52]:
df_files = df_files.drop(['y'], axis=1)

In [23]:
df_files['y'] = one_hot_encoded.tolist()

In [24]:
df_files['y'] = numeric_labels

In [25]:
df_files.head()

Unnamed: 0,speaker_id,file,Sets,X,y
0,5393,LibriSpeech/train-clean-100/5393/19218/5393-19...,Training,"[-355.7882385253906, 115.00196075439453, -33.8...",228
1,5393,LibriSpeech/train-clean-100/5393/19218/5393-19...,Training,"[-339.40924072265625, 126.5940170288086, -35.3...",228
2,5393,LibriSpeech/train-clean-100/5393/19218/5393-19...,Validation,"[-353.0205383300781, 126.88507080078125, -33.1...",228
3,5393,LibriSpeech/train-clean-100/5393/19218/5393-19...,Validation,"[-299.06805419921875, 119.9009780883789, -51.2...",228
4,5393,LibriSpeech/train-clean-100/5393/19218/5393-19...,Testing,"[-336.1969909667969, 125.14303588867188, -29.4...",228


In [26]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()

X_train = ss.fit_transform(np.array(df_files[df_files['Sets']=='Training']['X'].tolist()))
y_train = np.array(df_files[df_files['Sets']=='Training']['y'].tolist())
X_val = ss.fit_transform(np.array(df_files[df_files['Sets']=='Validation']['X'].tolist()))
y_val = np.array(df_files[df_files['Sets']=='Validation']['y'].tolist())
X_test = ss.fit_transform(np.array(df_files[df_files['Sets']=='Testing']['X'].tolist()))
y_test = np.array(df_files[df_files['Sets']=='Testing']['y'].tolist())

In [27]:
X_train.shape

(20183, 193)

In [28]:
y_train.shape

(20183,)

In [29]:
y_train[0]

228

In [30]:
import torch
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    def __init__(self, parameters, labels, device):
        self.parameters = parameters
        self.labels = labels
        self.device = device

    def __len__(self):
        return len(self.parameters)

    def __getitem__(self, idx):
        param = torch.tensor(self.parameters[idx], dtype=torch.float32).to(device)
        label = torch.tensor(self.labels[idx], dtype=torch.long).to(device)
        return param, label

In [31]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device {device}")
train_loader = torch.utils.data.DataLoader(CustomDataset(X_train, y_train, device), batch_size=10)
for data, targets in train_loader:
    print(f"Data: {data}")
    print(f"Targets: {targets}")
    break

Using device cpu
Data: tensor([[-0.3111,  0.0318, -0.4333,  ..., -1.6627,  1.8160, -0.9501],
        [ 0.1062,  0.6596, -0.5080,  ..., -0.9844,  1.6718, -1.5117],
        [ 0.2290,  0.3779, -0.5216,  ..., -1.5478,  0.1473, -1.4794],
        ...,
        [ 0.3495,  0.3239, -0.6233,  ...,  1.0778, -1.9075, -1.1685],
        [-0.0137,  0.4033, -0.6486,  ..., -1.5458,  0.1171, -1.7513],
        [ 0.5422, -0.1370, -0.1985,  ..., -0.2513,  0.2013, -1.8000]])
Targets: tensor([228, 228, 228, 228, 228, 228, 228, 228, 228, 228])


In [32]:
import torch
import torch.nn as nn
import torch.optim as optim

class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.fc1 = nn.Linear(193, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, 331)
        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.25)
        self.dropout3 = nn.Dropout(0.5)
        self.softmax = nn.Softmax()

    def forward(self, x):
        x = torch.nn.functional.relu(self.fc1(x))
        x = self.dropout1(x)
        x = torch.nn.functional.relu(self.fc2(x))
        x = self.dropout2(x)
        x = self.fc3(x)
        x = self.dropout3(x)
        x = self.softmax(x)
        #print(x)
        return x

In [38]:
torch.cuda.is_available()

False

In [34]:
model = NeuralNetwork()
model.to(device)

NeuralNetwork(
  (fc1): Linear(in_features=193, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=331, bias=True)
  (dropout1): Dropout(p=0.1, inplace=False)
  (dropout2): Dropout(p=0.25, inplace=False)
  (dropout3): Dropout(p=0.5, inplace=False)
  (softmax): Softmax(dim=None)
)

In [37]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device {device}")

train_loader = torch.utils.data.DataLoader(CustomDataset(X_train, y_train, device), batch_size=256, shuffle=True)
val_loader = torch.utils.data.DataLoader(CustomDataset(X_val, y_val, device), batch_size=256)

criterion = nn.CrossEntropyLoss()
# criterion = nn.functional.cross_entropy
optimizer = optim.Adam(model.parameters(), lr=0.001)

best_val_loss = float('inf')
patience = 100
counter = 0
num_epochs = 200

for epoch in range(num_epochs):
    print(f"Epoch: {epoch+1}/{num_epochs}")
    # Training loop
    model.train()
    train_loss_accumulator = 0.0
    train_correct_accumulator = 0
    train_total_samples = 0
    for batch_idx, (data, targets) in enumerate(train_loader):
        # print(f"Data: {data}")
        # print(f"Targets: {targets}")
        # print(f"Batch: {batch_idx}")
        optimizer.zero_grad()
        outputs = model(data)
        #print(f"Outputs: {outputs}")
        #print(f"Targets: {targets}")
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        train_loss_accumulator += loss.item() * data.size(0)
        _, predicted = torch.max(outputs, 1)
        train_correct_accumulator += (predicted == targets).sum().item()
        train_total_samples += data.size(0)
    train_loss = train_loss_accumulator / train_total_samples
    train_accuracy = train_correct_accumulator / train_total_samples

    #Validation loop
    model.eval()
    val_loss_accumulator = 0.0
    val_correct_accumulator = 0
    val_total_samples = 0
    with torch.no_grad():
        val_loss = 0
        for data, targets in val_loader:
            outputs = model(data)
            loss = criterion(outputs, targets)
            val_loss_accumulator += loss.item() * data.size(0)
            _, predicted = torch.max(outputs, 1)
            val_correct_accumulator += (predicted==targets).sum().item()
            val_total_samples += data.size(0)

        val_loss = val_loss_accumulator / val_total_samples
        val_accuracy = val_correct_accumulator / val_total_samples

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            counter = 0
        else:
            counter += 1
            if counter >= patience:
                print("Early stopping triggered")
                break
    print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")
    print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

    for param_group in optimizer.param_groups:
        print(param_group['lr'])

Using device cpu
Epoch: 1/200
Train Loss: 5.3746, Train Accuracy: 0.4324
Validation Loss: 4.9407, Validation Accuracy: 0.8679
0.001
Epoch: 2/200
Train Loss: 5.3703, Train Accuracy: 0.4369
Validation Loss: 4.9399, Validation Accuracy: 0.8694
0.001
Epoch: 3/200
Train Loss: 5.3726, Train Accuracy: 0.4340
Validation Loss: 4.9380, Validation Accuracy: 0.8709
0.001
Epoch: 4/200
Train Loss: 5.3694, Train Accuracy: 0.4374
Validation Loss: 4.9377, Validation Accuracy: 0.8724
0.001
Epoch: 5/200
Train Loss: 5.3601, Train Accuracy: 0.4471
Validation Loss: 4.9325, Validation Accuracy: 0.8761
0.001
Epoch: 6/200
Train Loss: 5.3700, Train Accuracy: 0.4367
Validation Loss: 4.9318, Validation Accuracy: 0.8776
0.001
Epoch: 7/200
Train Loss: 5.3677, Train Accuracy: 0.4400
Validation Loss: 4.9280, Validation Accuracy: 0.8821
0.001
Epoch: 8/200
Train Loss: 5.3624, Train Accuracy: 0.4446
Validation Loss: 4.9272, Validation Accuracy: 0.8818
0.001
Epoch: 9/200
Train Loss: 5.3585, Train Accuracy: 0.4482
Validat

In [36]:
# does my validation set contain the same speakers as my training set?
np.unique(y_train, return_counts=True)

(array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
         13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
         26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
         39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
         52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
         65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
         78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
         91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
        104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
        117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
        130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
        143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
        156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
        169, 170, 171, 172, 173, 174, 175, 176, 177

In [50]:
np.unique(y_val, return_counts=True)

(array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
         13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
         26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
         39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
         52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
         65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
         78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
         91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
        104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
        117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
        130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
        143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
        156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
        169, 170, 171, 172, 173, 174, 175, 176, 177

In [104]:
y_train[5]

228

In [122]:
for i in range(500):
    print(f"{i}: {y_train[i]} - {model(torch.tensor([X_train[i]], dtype=torch.float32)).argmax()}")


0: 228 - 228
1: 228 - 228
2: 228 - 228
3: 228 - 4
4: 228 - 1
5: 228 - 1
6: 228 - 321
7: 228 - 3
8: 228 - 228
9: 228 - 228
10: 228 - 0
11: 228 - 228
12: 228 - 0
13: 228 - 250
14: 228 - 319
15: 228 - 228
16: 228 - 319
17: 228 - 228
18: 228 - 228
19: 228 - 228
20: 228 - 228
21: 228 - 228
22: 228 - 228
23: 228 - 39
24: 228 - 228
25: 228 - 228
26: 228 - 2
27: 228 - 228
28: 228 - 0
29: 228 - 228
30: 228 - 228
31: 228 - 228
32: 228 - 321
33: 228 - 228
34: 228 - 228
35: 228 - 2
36: 228 - 23
37: 228 - 0
38: 228 - 2
39: 228 - 228
40: 228 - 0
41: 228 - 321
42: 228 - 228
43: 228 - 37
44: 228 - 228
45: 228 - 250
46: 228 - 228
47: 228 - 321
48: 228 - 228
49: 228 - 250
50: 228 - 275
51: 228 - 228
52: 228 - 228
53: 228 - 250
54: 228 - 0
55: 228 - 321
56: 228 - 228
57: 228 - 228
58: 228 - 250
59: 228 - 39
60: 228 - 228
61: 228 - 228
62: 228 - 228
63: 228 - 228
64: 228 - 60
65: 228 - 0
66: 228 - 4
67: 228 - 228
68: 228 - 1
69: 228 - 228
70: 228 - 60
71: 228 - 228
72: 228 - 228
73: 228 - 321
74: 228 - 22