# Spoken Digit Classification

## Imports and dataset download

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import os
import librosa
from tqdm.notebook import tqdm
from IPython.display import Audio
import scipy as sp

# Classifier
from sklearn import svm
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler

In [0]:
! git clone -q https://github.com/Jakobovski/free-spoken-digit-dataset
#! pip install -r free-spoken-digit-dataset/pip_requirements.txt # probably not needed

## Dataset import 

The informations about the audio track are retrieved from the name using the following regexp
```python
'([0-9]+)_([a-z]+)_([0-9]+).wav'
```
With this we can retrieve from the 3 groups
```python
# Digit Label
regexp.search(filename).group(1)
# Spearker Name (maybe we'll need this later while analyzing the data?)
regexp.search(filename).group(2)
# File Index
regexp.search(filename).group(3)
```


In [3]:
# recordings directory
directory = 'free-spoken-digit-dataset/recordings/'
# regexp matching {digitLabel}_{speakerName}_{index}.wav
regexp = re.compile('([0-9]+)_([a-z]+)_([0-9]+).wav') 

labels  = []
audios  = []
indexes = []
sizes   = []

# setup progress bar
pbar = tqdm(desc='Loading audio samples')
pbar.reset(total=len(os.listdir(directory))) 

for f in os.listdir(directory):
  if f.endswith(".wav"): 
    labels.append(regexp.search(f).group(1))       # Take the first group (label)
    indexes.append(int(regexp.search(f).group(3))) # Take the last group (index)
    x, sr = librosa.load(directory+f, sr=None)     # Load audio file
    sizes.append(x.size)
    audios.append(x)                               # Put it in the list
  pbar.update()                                  # updates progress bar
pbar.refresh();

# python lists to np.ndarray
Y = np.asarray(labels)
X = np.asarray(audios)
I = np.asarray(indexes)

HBox(children=(IntProgress(value=1, bar_style='info', description='Loading audio samples', max=1, style=Progre…

Here we're doing a train-test split using the conditions stated in the github repository

In [0]:
# condition stated in the git repository for the test/train subdivision
is_in_test  = I <= 4
is_in_train = I >  4

Y_test, Y_train = Y[is_in_test], Y[is_in_train]
X_test, X_train = X[is_in_test], X[is_in_train]

## Preprocessing


In [37]:
# python lists to np.ndarray
Y = np.asarray(labels)
X = np.asarray(audios)
I = np.asarray(indexes)

'''
Create an array (L) of empty audio tracks of 8000 samples
For each audio track in X:
  if the track is too long, trim it to be 1 second long (8000 samples)
  Then fill the first
'''
# L is the trimmed and zero-padded dataset
L = np.zeros((X.size, 8000))
pbar = tqdm(desc='Zero Padding')
pbar.reset(total=len(X))
for i in range(len(X)):
  if(X[i].size>8000): X[i] = X[i][0:8000] 
  L[i, 0:X[i].size] = X[i]                 
  pbar.update();                             
pbar.refresh();

HBox(children=(IntProgress(value=1, bar_style='info', description='Zero Padding', max=1, style=ProgressStyle(d…

In [0]:
is_in_test  = I <= 4
is_in_train = I >  4

Y_test, Y_train = Y[is_in_test], Y[is_in_train]
L_test, L_train = L[is_in_test], L[is_in_train]  # Trimmed and zero-padded dataset
X_test, X_train = X[is_in_test], X[is_in_train]  # Original dataset

## Features extraction

### Functions

MFCC

In [0]:
def compute_mfcc(audio, fs, n_mfcc):
    # Compute the spectrogram of the audio signal
    X = np.abs(librosa.stft(
        audio,
        window='hamming',
        n_fft=1024,
        hop_length=512,)
        )
    
    # Find the weights of the mel filters
    mel = librosa.filters.mel(
        sr=fs,
        n_fft=1024,
        n_mels=40,
        fmin=133.33,
        fmax=4000,
    )

    # Apply the filters to spectrogram
    melspectrogram = np.dot(mel, X)

    # Take the logarithm
    log_melspectrogram = np.log10(melspectrogram + 1e-16)
    
    # Apply the DCT to log melspectrogram to obtain the coefficients
    mfcc = sp.fftpack.dct(log_melspectrogram, axis=0, norm='ortho')[1:n_mfcc+1]

    return mfcc

In [0]:
def feature_extraction(data, type):
  pbar = tqdm(desc='Feature extraction')
  size = len(data)
  pbar.reset(total=size)

  Fs = 8000
  features = None
  if(type == "mfcc20"):
    n_mfcc = 20
    features = np.zeros((size, n_mfcc))
    for i in range(size):
      mfcc = compute_mfcc(data[i], Fs, n_mfcc)
      features[i, :] = np.mean(mfcc, axis=1)
      pbar.update()
    pbar.refresh()
  if(type == "mfcc13"):
    n_mfcc = 13
    features = np.zeros((size, n_mfcc))
    for i in range(size):
      mfcc = compute_mfcc(data[i], Fs, n_mfcc)
      features[i, :] = np.mean(mfcc, axis=1)
      pbar.update()
    pbar.refresh()
  
  return features

## Classifier

In [42]:
feature_method = "mfcc13"

train_features = feature_extraction(X_train, feature_method)

scaler = MinMaxScaler()
scaler.fit(train_features)
scaler.transform(train_features)

clf = svm.SVC()
clf.fit(train_features, Y_train)


test_features = feature_extraction(X_test, feature_method)

scaler.transform(test_features)
y_pred = clf.predict(test_features)
confusion_matrix(Y_test, y_pred)

HBox(children=(IntProgress(value=1, bar_style='info', description='Feature extraction', max=1, style=ProgressS…

HBox(children=(IntProgress(value=1, bar_style='info', description='Feature extraction', max=1, style=ProgressS…

array([[20,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0, 18,  0,  0,  1,  1,  0,  0,  0,  0],
       [ 0,  0, 20,  0,  0,  0,  0,  0,  0,  0],
       [ 1,  0,  6, 13,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0, 20,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0, 20,  0,  0,  0,  0],
       [ 1,  0,  0,  3,  0,  0, 14,  1,  1,  0],
       [ 0,  0,  0,  0,  0,  0,  0, 20,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  2,  0, 18,  0],
       [ 0,  1,  0,  0,  0,  2,  0,  0,  0, 17]])

## Style

In [0]:
import matplotlib as mpl
COLOR = 'darkgrey'
mpl.rcParams['text.color'] = COLOR
mpl.rcParams['axes.labelcolor'] = COLOR
mpl.rcParams['xtick.color'] = COLOR
mpl.rcParams['ytick.color'] = COLOR