In [1]:
import os
import sys
import numpy as np
from scipy.io import wavfile 
from hmmlearn import hmm
from librosa.feature import mfcc
from librosa import load
from sklearn.metrics import confusion_matrix

In [2]:
def load_wav_files(
    path='free-spoken-digit-dataset-master/recordings',
    digit='0',
    test_pct=0.1,
    test_speaker=None
    ):
    """Load MFCC features for one digit and split them into train/test sets.

    Every ``.wav`` file in ``path`` whose name starts with ``digit`` is loaded
    and turned into an MFCC matrix, transposed so that each row is one feature
    vector (presumably one audio frame per row, given librosa's
    ``(n_mfcc, frames)`` layout -- confirm against the librosa docs).

    Args:
        path: Directory containing the recordings.
        digit: Filename prefix selecting which recordings to load.
        test_pct: Probability that a file is assigned to the test set. Only
            used when ``test_speaker`` is not given.
        test_speaker: If truthy, every file whose name contains this string
            goes to the test set and all other files go to the training set.

    Returns:
        ``((X_train, len_train), (X_test, len_test))`` where each ``X_*`` is
        the row-wise concatenation of the feature matrices of all files in
        that split and each ``len_*`` lists the number of rows contributed by
        each file, in order. A split with no files yields an empty 1-D array
        and an empty list.
    """
    # os.listdir returns entries in arbitrary order; sorting keeps the file
    # order (and therefore a seeded random split) stable across machines.
    filenames = sorted(
        name for name in os.listdir(path)
        if name.endswith('.wav') and name.startswith(digit)
    )

    train_chunks, test_chunks = [], []
    for filename in filenames:
        sig, rate = load(os.path.join(path, filename))

        # Keyword arguments: librosa >= 0.10 no longer accepts the signal and
        # sample rate positionally; keywords work on older releases as well.
        mfcc_features = mfcc(y=sig, sr=rate).T

        if test_speaker:
            is_test = test_speaker in filename
        else:
            # One draw from NumPy's global RNG per file.
            is_test = np.random.uniform() < test_pct
        (test_chunks if is_test else train_chunks).append(mfcc_features)

    def _pack(chunks):
        # Stack per-file matrices and remember each file's row count.
        lengths = [chunk.shape[0] for chunk in chunks]
        features = np.concatenate(chunks) if chunks else np.array([])
        return features, lengths

    return _pack(train_chunks), _pack(test_chunks)

In [3]:
# Class labels, one character per digit. Each character is used as the filename
# prefix passed to load_wav_files and as the key into the models/testdata dicts.
digits = '0123456789'

def train_models(test_speaker=None, n_components=5, n_iter=100, random_state=None):
    """Fit one Gaussian HMM per digit and collect the held-out test data.

    Args:
        test_speaker: Forwarded to ``load_wav_files``; when given, recordings
            whose filename contains it are held out, otherwise the split is
            random.
        n_components: Number of hidden states of each HMM.
        n_iter: Maximum number of training iterations of each HMM.
        random_state: Forwarded to ``hmm.GaussianHMM`` so the fit can be made
            repeatable; ``None`` keeps the library default.

    Returns:
        ``(models, testdata)``: two dicts keyed by digit character. ``models``
        holds the fitted HMMs and ``testdata`` the ``(features, lengths)``
        test pair returned by ``load_wav_files``.

    Raises:
        ValueError: If a digit ends up with no training recordings.
    """
    models = {}
    testdata = {}
    print('Training Models...')
    for d in digits:
        sys.stdout.write(d + ' ')
        # Flush so the progress marker appears before the slow fit, not after.
        sys.stdout.flush()
        traind, testd = load_wav_files(digit=d, test_speaker=test_speaker)
        xd, ld = traind
        if len(ld) == 0:
            # Fail with an explicit message instead of an opaque shape error
            # raised from inside the HMM fit.
            raise ValueError(
                'no training recordings found for digit {!r}'.format(d)
            )
        hmmd = hmm.GaussianHMM(
            n_components=n_components,
            n_iter=n_iter,
            random_state=random_state,
        )
        # ld tells fit() where each recording starts and ends inside xd.
        hmmd.fit(xd, ld)
        models[d] = hmmd
        testdata[d] = testd
    print()
    return models, testdata
        
def evaluate_models(models, testdata):
    """Classify every test sequence by the best-scoring model.

    Args:
        models: Dict mapping a string label to an object with a
            ``score(sequence)`` method; the label whose model returns the
            highest score is the prediction.
        testdata: Dict mapping each label in ``models`` to a
            ``(features, lengths)`` pair, where ``features`` holds the rows of
            all test sequences back to back and ``lengths`` gives the number
            of rows of each sequence.

    Returns:
        ``(y_true, y_pred)``: two equally long lists of labels, one entry per
        test sequence.
    """
    print('Evaluating Performance...')
    # Take the label set from the models themselves, so the predicted label is
    # always a real key rather than the stringified position in a global.
    labels = list(models)
    y_true, y_pred = [], []
    for d in labels:
        sys.stdout.write(d + ' ')
        xd, ld = testdata[d]
        start = 0
        # Walk the concatenated rows one sequence at a time; an empty test
        # set simply contributes nothing.
        for n_rows in ld:
            td = xd[start:start + n_rows]
            start += n_rows
            scores = [models[di].score(td) for di in labels]
            y_true.append(d)
            # First maximum wins on ties.
            y_pred.append(labels[scores.index(max(scores))])
    print()
    return y_true, y_pred

# Experiment 1: HMM With Random Train/Test Split

In [4]:
# Seed NumPy's global RNG so the random train/test split (np.random.uniform in
# load_wav_files) is the same on every run. Presumably this also fixes the HMM
# initialisation, if hmmlearn falls back to the global RNG -- confirm.
# NOTE: the matrix recorded below came from an unseeded run.
np.random.seed(0)
models, testdata = train_models()
y_true, y_pred = evaluate_models(models, testdata)

Training Models...
0 1 2 3 4 5 6 7 8 9 
Evaluating Performance...
0 1 2 3 4 5 6 7 8 9 


In [5]:
# Pin the label order so rows/columns are always the digits 0-9, even if the
# random split leaves some digit absent from both y_true and y_pred.
confusion_matrix(y_true, y_pred, labels=list(digits))

array([[24,  1,  0,  2,  0,  0,  0,  0,  0,  0],
       [ 0, 21,  0,  0,  0,  0,  0,  0,  0,  1],
       [ 0,  1, 28,  4,  0,  0,  0,  0,  0,  0],
       [ 6,  0,  2, 19,  0,  0,  1,  0,  0,  0],
       [ 0,  0,  0,  0, 30,  0,  0,  0,  0,  0],
       [ 0,  3,  0,  0,  0, 25,  0,  1,  0,  0],
       [ 0,  0,  0,  2,  0,  0, 25,  0,  3,  0],
       [ 1,  0,  0,  0,  0,  0,  0, 29,  0,  0],
       [ 0,  0,  0,  1,  0,  2,  0,  0, 23,  0],
       [ 0,  4,  0,  1,  0,  2,  0,  0,  0, 27]])

# Experiment 2: HMM Testing on New Speakers

In [6]:
# Speaker names used as the held-out test_speaker values in the cells below.
# NOTE(review): the list itself is not referenced by the code visible here.
speakers = ['george', 'jackson', 'lucas', 'nicolas', 'theo', 'yweweler']

## Test Speaker `george`

In [7]:
# Train on every file whose name does not contain 'george'; test on the rest.
models, testdata = train_models(test_speaker='george')
y_true, y_pred = evaluate_models(models, testdata)
print('Results:')
# Pin the label order so rows/columns are always the digits 0-9, even if some
# digit never appears in y_true or y_pred.
print(confusion_matrix(y_true, y_pred, labels=list(digits)))

Training Models...
0 1 2 3 4 5 6 7 8 9 
Evaluating Performance...
0 1 2 3 4 5 6 7 8 9 
Results:
[[ 0  1  7 42  0  0  0  0  0  0]
 [ 0 41  0  9  0  0  0  0  0  0]
 [ 0  5 11 34  0  0  0  0  0  0]
 [ 0  0  0 50  0  0  0  0  0  0]
 [ 0  4  0 46  0  0  0  0  0  0]
 [ 0 12  0 38  0  0  0  0  0  0]
 [ 0  0  3 47  0  0  0  0  0  0]
 [ 0  0  0 50  0  0  0  0  0  0]
 [ 0  0  5 45  0  0  0  0  0  0]
 [ 0  2  0 46  0  0  0  0  0  2]]


## Test Speaker `jackson`

In [8]:
# Train on every file whose name does not contain 'jackson'; test on the rest.
models, testdata = train_models(test_speaker='jackson')
y_true, y_pred = evaluate_models(models, testdata)
print('Results:')
# Pin the label order so rows/columns are always the digits 0-9, even if some
# digit never appears in y_true or y_pred.
print(confusion_matrix(y_true, y_pred, labels=list(digits)))

Training Models...
0 1 2 3 4 5 6 7 8 9 
Evaluating Performance...
0 1 2 3 4 5 6 7 8 9 
Results:
[[50  0  0  0  0  0  0  0  0  0]
 [ 0 48  0  0  2  0  0  0  0  0]
 [ 1  0 48  1  0  0  0  0  0  0]
 [13  0  2 30  0  0  2  0  0  3]
 [ 0  0  0  0 50  0  0  0  0  0]
 [ 0 17  0  0  0 22  0  5  0  6]
 [ 0  2  0  0  1  0  2 37  2  6]
 [ 0  0  0  0  0  0  0 50  0  0]
 [ 0  0  0  6  0  0 16  0 28  0]
 [ 2 13  0  0  0  0  0  4  0 31]]


## Test Speaker `lucas`

In [9]:
# Train on every file whose name does not contain 'lucas'; test on the rest.
models, testdata = train_models(test_speaker='lucas')
y_true, y_pred = evaluate_models(models, testdata)
print('Results:')
# Pin the label order so rows/columns are always the digits 0-9, even if some
# digit never appears in y_true or y_pred.
print(confusion_matrix(y_true, y_pred, labels=list(digits)))

Training Models...
0 1 2 3 4 5 6 7 8 9 
Evaluating Performance...
0 1 2 3 4 5 6 7 8 9 
Results:
[[16  0  0 31  3  0  0  0  0  0]
 [ 0 37  0 12  0  0  0  0  0  1]
 [ 0  0  1 49  0  0  0  0  0  0]
 [ 0  0  0 50  0  0  0  0  0  0]
 [ 4  6  0  7 33  0  0  0  0  0]
 [ 0  0  0  5  0 45  0  0  0  0]
 [ 0  0  0 19  0 30  0  1  0  0]
 [ 0  0  0 30  0  1  0 19  0  0]
 [ 0  0  0 43  0  0  0  0  7  0]
 [ 0  4  0  3  0  0  0  0  0 43]]


## Test Speaker `nicolas`

In [10]:
# Train on every file whose name does not contain 'nicolas'; test on the rest.
models, testdata = train_models(test_speaker='nicolas')
y_true, y_pred = evaluate_models(models, testdata)
print('Results:')
# Pin the label order so rows/columns are always the digits 0-9, even if some
# digit never appears in y_true or y_pred.
print(confusion_matrix(y_true, y_pred, labels=list(digits)))

Training Models...
0 1 2 3 4 5 6 7 8 9 
Evaluating Performance...
0 1 2 3 4 5 6 7 8 9 
Results:
[[43  0  5  0  0  0  2  0  0  0]
 [ 5  3 35  0  0  0  1  2  0  4]
 [37  0  9  0  0  0  4  0  0  0]
 [19  0  1  0  0  0 30  0  0  0]
 [ 2 12 17  0 17  0  0  2  0  0]
 [ 0 12 24  0  0  7  2  0  0  5]
 [ 5  0  2  0  0  0 39  0  4  0]
 [ 7  0  0  0  0  0 39  4  0  0]
 [ 1  0 11  2  0  0 33  0  3  0]
 [28  0  2  1  0  2  0  8  0  9]]


## Test Speaker `theo`

In [11]:
# Train on every file whose name does not contain 'theo'; test on the rest.
models, testdata = train_models(test_speaker='theo')
y_true, y_pred = evaluate_models(models, testdata)
print('Results:')
# Pin the label order so rows/columns are always the digits 0-9, even if some
# digit never appears in y_true or y_pred.
print(confusion_matrix(y_true, y_pred, labels=list(digits)))

Training Models...
0 1 2 3 4 5 6 7 8 9 
Evaluating Performance...
0 1 2 3 4 5 6 7 8 9 
Results:
[[ 5  0 12  0  0  4 13 10  1  5]
 [ 0 45  0  0  0  1  0  0  0  4]
 [ 1  0 37  0  0  0 12  0  0  0]
 [ 0  0 17  8  0  0 22  0  3  0]
 [ 7 27  0  0 13  1  0  2  0  0]
 [ 0  0  0  0  0 21  0  0  0 29]
 [ 0  0 23  0  0  0  5 19  3  0]
 [ 0  0  0  0  0 20  1 28  0  1]
 [ 0  0  3  0  0  0  2  0 45  0]
 [ 0  4  0  0  0  6  0  2  0 38]]


## Test Speaker `yweweler`

In [12]:
# Train on every file whose name does not contain 'yweweler'; test on the rest.
models, testdata = train_models(test_speaker='yweweler')
y_true, y_pred = evaluate_models(models, testdata)
print('Results:')
# Pin the label order so rows/columns are always the digits 0-9, even if some
# digit never appears in y_true or y_pred.
print(confusion_matrix(y_true, y_pred, labels=list(digits)))

Training Models...
0 1 2 3 4 5 6 7 8 9 
Evaluating Performance...
0 1 2 3 4 5 6 7 8 9 
Results:
[[44  2  0  4  0  0  0  0  0  0]
 [ 0 50  0  0  0  0  0  0  0  0]
 [14  1 34  0  0  0  0  1  0  0]
 [ 2  0  7 33  0  0  1  0  7  0]
 [ 0 12  0  0 38  0  0  0  0  0]
 [ 0  0  0  0  0 18  1  0  0 31]
 [ 4  0 19  1  0  0  2  0 23  1]
 [ 0  0  0  0  0  0  0 22  0 28]
 [ 0  0  5  0  0  0  1  0 44  0]
 [ 0  9  0  0  0  0  0  0  0 41]]
