This is my implementation of Jurgen Arias' original TensorFlow model. As far as I can tell, the biggest difference is that he reorganized his LibriSpeech files into male and female directories. He also downloaded his LibriSpeech files manually, whereas I use the torchaudio dataset.

Install environment:
```
conda create -n Capstone python=3.9 tensorflow-gpu cudatoolkit keras pytorch pytorch::torchaudio pytorch::torchinfo onnx onnx2pytorch conda-forge::librosa tqdm pandas anaconda::seaborn numpy==1.19 jupyter
```

In [1]:
import os
import torch
import torchaudio

# Fetch the LibriSpeech data through torchaudio (downloading it if it is not
# already present) beneath the user's home directory.
dataroot = os.path.expanduser("~")
librispeech_data = torchaudio.datasets.LIBRISPEECH(root=dataroot, download=True)

In [2]:
import os
import pandas as pd
from pathlib import Path

# Locate every FLAC recording of the train-clean-100 subset and record which
# speaker each file belongs to.
dataroot = os.path.expanduser("~")
train_dir = Path(dataroot) / 'LibriSpeech' / 'train-clean-100'
filelist = train_dir.rglob('*.flac')

# The first directory below train_dir is named after the speaker id.  Taking it
# from the path components (rather than slicing the string and splitting on
# '/') keeps the parsing independent of the platform's path separator.
files = [
    {'speaker_id': int(file.relative_to(train_dir).parts[0]), 'file': file}
    for file in filelist
]
df_files = pd.DataFrame(files)

In [3]:
# Parse the pipe-delimited speaker table; anything after a ';' is treated as a
# comment and skipped by the parser.
speakers_file = os.path.join(dataroot, "LibriSpeech", "SPEAKERS.TXT")
speaker_columns = ['id', 'gender', 'subset', 'duration', 'name']
speaker_dtypes = {'id': 'i', 'gender': 'U1', 'subset': 'U', 'duration': 'f', 'name': 'U'}
speakers = pd.read_table(
    speakers_file,
    sep=r'\s+\|\s+',
    engine='python',
    comment=';',
    names=speaker_columns,
    dtype=speaker_dtypes,
)

# Remove the speakers that belong to subsets this notebook filters out.
unused_subsets = ['train-clean-360', 'train-other-500', 'dev-other', 'test-other']
in_unused_subset = speakers['subset'].isin(unused_subsets)
speakers = speakers.drop(index=speakers[in_unused_subset].index)
speakers.head()

Unnamed: 0,id,gender,subset,duration,name
3,19,F,train-clean-100,25.190001,Kara Shallenberg
8,26,M,train-clean-100,25.08,Denny Sayers
9,27,M,train-clean-100,20.139999,Sean McKinley
14,32,F,train-clean-100,24.01,Betsie Bush
18,39,F,train-clean-100,25.049999,Sherry Crowther


In [4]:
# Although this function was modified and many parameters were explored, most of it
# came from Source 8 (sources are listed in the README)
import librosa
import numpy as np


def extract_features(files):
    """Summarise one audio recording as five time-averaged feature vectors.

    Parameters
    ----------
    files : row-like object
        Must expose a ``file`` attribute (path of the audio file) and a
        ``speaker_id`` attribute (used as the label).

    Returns
    -------
    tuple
        ``(mfccs, chroma, mel, contrast, tonnetz, label)`` where the first
        five entries are 1-D arrays obtained by averaging the corresponding
        librosa feature over its frame axis, and ``label`` is
        ``files.speaker_id``.
    """

    def _mean_over_frames(feature_matrix):
        # Transpose the librosa feature matrix and average along axis 0,
        # leaving one value per feature coefficient.
        return np.mean(feature_matrix.T, axis=0)

    # Path of the recording on the local machine.
    audio_path = str(files.file)

    # Decode to a floating point time series at librosa's default sample rate,
    # using the faster 'kaiser_fast' resampler.
    signal, sr = librosa.load(audio_path, res_type='kaiser_fast')

    # Mel-frequency cepstral coefficients (40 of them).
    mfccs = _mean_over_frames(librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=40))

    # Magnitude of the short-time Fourier transform, shared by the chroma and
    # spectral-contrast features below.
    magnitude = np.abs(librosa.stft(signal))

    # Chromagram computed from the STFT magnitude.
    chroma = _mean_over_frames(librosa.feature.chroma_stft(S=magnitude, sr=sr))

    # Mel-scaled spectrogram computed from the raw signal.
    mel = _mean_over_frames(librosa.feature.melspectrogram(y=signal, sr=sr))

    # Spectral contrast computed from the STFT magnitude.
    contrast = _mean_over_frames(librosa.feature.spectral_contrast(S=magnitude, sr=sr))

    # Tonal centroid features (tonnetz) of the harmonic component only.
    harmonic_part = librosa.effects.harmonic(signal)
    tonnetz = _mean_over_frames(librosa.feature.tonnetz(y=harmonic_part, sr=sr))

    # The speaker id travels with the features as the class label.
    return mfccs, chroma, mel, contrast, tonnetz, files.speaker_id

In [5]:
# The following operation can take hours depending on your cpu.
# Luckily, we only have to run it once, then we can save and load the processed data from a pickle file
import os
import pandas as pd
from datetime import datetime
from tqdm import tqdm

# Load the extracted features from the on-disk cache when it exists; otherwise
# compute them (slow) and write the cache so later runs can skip this step.
picklefile = 'Librispeech_features_label.pkl'
if os.path.isfile(picklefile):
    print(f"Reading features_label from {picklefile}")
    # NOTE: only unpickle files you created yourself; loading an untrusted
    # pickle can execute arbitrary code.
    features_label = pd.read_pickle(picklefile)
else:
    tqdm.pandas()
    starttime = datetime.now()
    features_label = df_files.progress_apply(extract_features, axis=1)
    # Use the same variable name that was assigned above (the previous
    # `startTime` spelling raised NameError after the extraction finished).
    print(f"Extracting features took {datetime.now() - starttime}")
    # Persist the result; without this the cache branch above is never taken.
    features_label.to_pickle(picklefile)
features_label.shape

Reading features_label from Librispeech_features_label.pkl


(28539,)

In [6]:
import numpy as np

# Each entry of features_label is the tuple returned by extract_features:
# five feature vectors followed by the speaker label.  Join the five vectors
# into one flat feature vector per recording.
features = [
    np.concatenate(features_label[i][:5], axis=0)
    for i in range(len(features_label))
]

# The features may come from a cache written by an earlier run, while df_files
# is rebuilt from a fresh directory listing.  Verify that both describe the same
# recordings in the same order before attaching features to labels; otherwise
# features would silently be paired with the wrong speaker.
cached_labels = [features_label[i][5] for i in range(len(features_label))]
if cached_labels != df_files['speaker_id'].tolist():
    raise ValueError(
        "features_label does not line up with df_files (different length or "
        "speaker order); delete the cached pickle and re-extract the features."
    )

df_files['X'] = features

In [7]:
import tensorflow as tf
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Map each speaker id to a consecutive integer class, then expand the classes
# into one-hot rows stored alongside the features.
lb = LabelEncoder()
speaker_classes = lb.fit_transform(df_files['speaker_id'])
labels = to_categorical(speaker_classes)
df_files['y'] = labels.tolist()

In [9]:
!pip install fast_ml

Collecting fast_ml
  Using cached fast_ml-3.68-py3-none-any.whl.metadata (12 kB)
Using cached fast_ml-3.68-py3-none-any.whl (42 kB)
Installing collected packages: fast_ml
Successfully installed fast_ml-3.68


In [11]:
# NOTE(review): the recorded output of this cell is
# "/bin/bash: line 1: conda: command not found", i.e. the install did not run
# in that session. seaborn is also listed in the environment-creation command
# at the top of the notebook — confirm whether this cell is still needed.
!conda install anaconda::seaborn

/bin/bash: line 1: conda: command not found


In [12]:
# Split samples into train/val/test
from fast_ml.model_development import train_valid_test_split

# Tag every recording as Training / Validation / Testing.  The split is done
# one speaker at a time so each speaker's recordings are divided 60/10/30.
df_files["Sets"] = "Training"
for lbl in df_files['speaker_id'].unique():
    temp_data = df_files[df_files['speaker_id'] == lbl]
    # Only the row labels of the validation and test parts are used below;
    # rows not re-tagged keep the default "Training".
    X_train, y_train, X_val, y_val, X_test, y_test = train_valid_test_split(
        temp_data,
        target="speaker_id",
        train_size=0.6,
        valid_size=0.1,
        test_size=0.3
    )
    # Write through a single .loc call addressed by index label.  The previous
    # `df_files.Sets.iloc[...] = ...` was chained assignment (it may modify a
    # temporary copy) and passed index labels to the positional indexer.
    df_files.loc[X_test.index, "Sets"] = "Testing"
    df_files.loc[X_val.index, "Sets"] = "Validation"

In [13]:
from sklearn.preprocessing import StandardScaler

# Scaler used below to standardise the feature vectors before training.
ss = StandardScaler()

In [14]:
def _arrays_for(set_name):
    """Return ``(X, y)`` NumPy arrays for the rows of df_files tagged ``set_name``."""
    subset = df_files[df_files['Sets'] == set_name]
    return np.array(subset['X'].tolist()), np.array(subset['y'].tolist())


X_train_raw, y_train = _arrays_for('Training')
X_val_raw, y_val = _arrays_for('Validation')
X_test_raw, y_test = _arrays_for('Testing')

# Fit the scaler on the training rows only.  Fitting on the whole table lets
# statistics of the validation and test recordings leak into preprocessing,
# which makes the held-out scores optimistic.
ss.fit(X_train_raw)

X_train = ss.transform(X_train_raw)
X_val = ss.transform(X_val_raw)
X_test = ss.transform(X_test_raw)

In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout 
from keras.callbacks import EarlyStopping


# Start from an empty Sequential model; its layers are added in the next cell.
model = Sequential()


2024-04-17 21:15:46.554792: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [16]:

# Three fully connected ReLU layers with increasing dropout, taking the
# 193-value feature vector as input.
model.add(Dense(193, input_shape=(193,), activation="relu"))
model.add(Dropout(0.1))

model.add(Dense(128, activation="relu"))
model.add(Dropout(0.25))

model.add(Dense(128, activation="relu"))
model.add(Dropout(0.5))

# One softmax output per speaker class (width of the one-hot label rows).
model.add(Dense(y_train.shape[1], activation="softmax"))

model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')

# Stop once the validation loss has not improved for 10 consecutive epochs and
# keep the best weights seen.  The previous patience of 100 was as large as
# the epoch budget used for training, so early stopping could never trigger.
early_stop = EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=10,
    verbose=1,
    mode='auto',
    restore_best_weights=True,
)

In [17]:
# Train the network, validating after every epoch, and report the wall-clock
# training time.
start = datetime.now()
history = model.fit(
    X_train, y_train,
    batch_size=256,
    epochs=100, 
    validation_data=(X_val, y_val),
    callbacks=[early_stop]
)
# A timedelta prints as H:MM:SS.ffffff, so convert it explicitly to make the
# "seconds" unit in the message true.
elapsed = datetime.now() - start
print(f"{elapsed.total_seconds():.1f} seconds")

2024-04-17 21:15:57.461759: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2024-04-17 21:15:57.463015: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 1996800000 Hz


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [14]:
import tensorflow as tf

# Signature of the exported model's single input: one row of 193 float32
# feature values, named 'X'.
input_signature = [
    tf.TensorSpec(shape=[1, 193], dtype=tf.float32, name='X'),
]

In [15]:
import tf2onnx
import onnx

# Convert the trained Keras model to ONNX; only the first element of the
# returned pair (the ONNX model) is kept.
onnx_model, _ = tf2onnx.convert.from_keras(
    model,
    input_signature=input_signature,
)

2024-04-14 21:49:39.846366: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-04-14 21:49:39.846793: I tensorflow/core/grappler/devices.cc:69] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0
2024-04-14 21:49:39.846976: I tensorflow/core/grappler/clusters/single_machine.cc:356] Starting new session
2024-04-14 21:49:39.847638: I tensorflow/compiler/jit/xla_gpu_device.cc:99] Not creating XLA devices, tf_xla_enable_xla_devices not set
2024-04-14 21:49:39.847964: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-04-14 21:49:39.848474: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:03:00.0 name: NVIDIA GeForce GTX 750 Ti c

In [16]:
# Write the converted model next to the notebook.
onnx_model_path = 'Librispeech_Tensorflow_model.onnx'
onnx.save(onnx_model, onnx_model_path)

In [17]:
import pickle

# Persist the scaled feature arrays and one-hot labels of every split, one
# pickle file per array, in the same order as before.
split_pickles = [
    ('Librispeech_Tensorflow_X_train.pkl', X_train),
    ('Librispeech_Tensorflow_y_train.pkl', y_train),
    ('Librispeech_Tensorflow_X_val.pkl', X_val),
    ('Librispeech_Tensorflow_y_val.pkl', y_val),
    ('Librispeech_Tensorflow_X_test.pkl', X_test),
    ('Librispeech_Tensorflow_y_test.pkl', y_test),
]
for pickle_name, split_array in split_pickles:
    with open(pickle_name, 'wb') as f:
        pickle.dump(split_array, f)

In [18]:
# Print the layer-by-layer architecture and parameter counts of the model.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 193)               37442     
_________________________________________________________________
dropout (Dropout)            (None, 193)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               24832     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               16512     
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 251)               3