# Speech recognition

Simple speech recognition system can be implemented using DTW.

This notebook is inspired by [Rouanet DTW library example](http://nbviewer.jupyter.org/github/pierre-rouanet/dtw/blob/master/speech-recognition.ipynb)

We will use a simple [database](https://www.dropbox.com/s/c12fmsctfwwov5d/sounds.zip) composed of 12 french words pronounced about 25 times by different speakers.

In [1]:
import librosa
from dtw import dtw
import matplotlib as plt
import numpy as np
import glob
import operator

%matplotlib inline

### Loading Data

In [2]:
%%time

y = []
with open('sounds/wavToTag.txt') as f:
    y = list([l.replace('\n', '') for l in f.readlines()])

X = []
for i in range(len(y)):
    x, sample_rate = librosa.load("sounds/{}.wav".format(i))
    X.append(x)

CPU times: user 10.6 s, sys: 32 ms, total: 10.6 s
Wall time: 10.7 s


### Processing

In [3]:
n_window_samples = int(sample_rate * 2 * 10**(-3))

def reshape_sound(x):
    # reshape into windows of width of 20 ms
    
    new_len = np.floor_divide(x.shape[0], n_window_samples) * n_window_samples
    x = x[0:new_len]
    x = x.reshape((n_window_samples, -1), order='F')
    return x

In [4]:
X = [reshape_sound(x) for x in X]

### Define groundtruth data

In [None]:
gt = dict()

unique_labels = set(y)
for l in unique_labels:
    idx = y.index(l)
    y.pop(idx)
    x = X.pop(idx)
    gt[l] = x

### Classificating!

In [None]:
%%time

classifications = dict(zip(unique_labels, [[]]*len(unique_labels)))

for idx in range(len(y)):
    x = X[idx]
    
    # for each reference sound calculate dtw distance to x
    # save it in a dictionary where the key is the reference label
    x_distance = {}
    for label, ground in gt.items():
        cost, path = librosa.dtw(x, ground)
        path = np.array(path)
        columns = path[:,0]
        rows = path[:,1]
        min_cost = np.sum(cost[columns, rows])
        x_distance[label] = min_cost

    # ascending order by distance
    ordered_distance = sorted(x_distance.items(), key=operator.itemgetter(1))
    predictions_rank = [l[0] for l in ordered_distance]
    real_label_position = predictions_rank.index(y[idx])
    classifications[y[idx]].append(real_label_position)
    
    print("{0: <15s}{1:2d}º".format(y[idx], (real_label_position+1)))

chaussure       5º
manette         2º
chaussure       4º
sofoot          1º
manette         1º
stade           3º
gants           1º
jeuvideo        8º
stade           5º
gants           1º
gants           2º
biere           6º
jeuvideo        8º
zidane         10º
sofoot          1º
biere           4º
zidane          9º
jeuvideo        8º
cocacola       11º
manette         2º
sofoot          1º
cocacola       12º
biere           5º
sofoot          1º
beckham         6º
stade           4º
zidane          9º
chaussure       7º
manette         1º
biere           3º
sofoot          2º
