# Constructing the Speech Network Model

#### Import Modules for Building the Neural Network

In [1]:
import numpy as np
import theano
import theano.tensor as T
import lasagne
import ctc_cost
import h5py
import random
import math


Using gpu device 1: GeForce GTX 980 Ti (CNMeM is enabled with initial size: 40.0% of memory, CuDNN 4007)


## Hyperparameters 

In [2]:
#initial parameters
LEARNING_RATE = 0.001
NUM_EPOCHS=1
VALIDATION_SIZE=0.2

batch_size=125
feature_size=55
frames=336
num_phonomes=62

## Build the Bidirectional LSTM Network

In [3]:
#Rebuilding the DBRNN
#Source: http://www.cs.toronto.edu/~graves/asru_2013.pdf

#input layer
input_layer=lasagne.layers.InputLayer(shape=(batch_size, frames, feature_size))
batchsize, fm, feature = input_layer.input_var.shape


#layer 1
fwd_layer_1=lasagne.layers.LSTMLayer(input_layer,num_units=256, backwards=False, learn_init=True)
bwd_layer_1=lasagne.layers.LSTMLayer(input_layer, num_units=256, backwards=True, learn_init=True)
recurrent_layer_1= lasagne.layers.ElemwiseSumLayer([fwd_layer_1,bwd_layer_1])

#layer 2
fwd_layer_2=lasagne.layers.LSTMLayer(recurrent_layer_1,num_units=256, backwards=False, learn_init=True)
bwd_layer_2=lasagne.layers.LSTMLayer(recurrent_layer_1, num_units=256, backwards=True, learn_init=True)
recurrent_layer_2= lasagne.layers.ElemwiseSumLayer([fwd_layer_2,bwd_layer_2])

#layer 3
fwd_layer_3=lasagne.layers.LSTMLayer(recurrent_layer_2,num_units=256, backwards=False, learn_init=True)
bwd_layer_3=lasagne.layers.LSTMLayer(recurrent_layer_2, num_units=256, backwards=True, learn_init=True)
recurrent_layer_3= lasagne.layers.ElemwiseSumLayer([fwd_layer_3,bwd_layer_3])

#layer 4
fwd_layer_4=lasagne.layers.LSTMLayer(recurrent_layer_3,num_units=256, backwards=False, learn_init=True)
bwd_layer_4=lasagne.layers.LSTMLayer(recurrent_layer_3, num_units=256, backwards=True, learn_init=True)
recurrent_layer_4= lasagne.layers.ElemwiseSumLayer([fwd_layer_4,bwd_layer_4])

#layer 5
fwd_layer_5=lasagne.layers.LSTMLayer(recurrent_layer_4,num_units=256, backwards=False, learn_init=True)
bwd_layer_5=lasagne.layers.LSTMLayer(recurrent_layer_4, num_units=256, backwards=True, learn_init=True)
recurrent_layer_5= lasagne.layers.ElemwiseSumLayer([fwd_layer_5,bwd_layer_5])

#connected layers
reshape_layer=lasagne.layers.ReshapeLayer(recurrent_layer_5,(-1,256))
densed_output_layer=lasagne.layers.DenseLayer(reshape_layer, num_units=num_phonomes, nonlinearity=lasagne.nonlinearities.identity)
output_reshape=lasagne.layers.ReshapeLayer(densed_output_layer, (batchsize, fm, num_phonomes))

#softmax of the connected layer
output_softmax=lasagne.layers.NonlinearityLayer(densed_output_layer, nonlinearity=lasagne.nonlinearities.softmax)
output_softmax_shp=lasagne.layers.ReshapeLayer(output_softmax, (batchsize, fm, num_phonomes))

output_lin_ctc=lasagne.layers.get_output(output_reshape)
network_output=lasagne.layers.get_output(output_softmax_shp)
all_params=lasagne.layers.get_all_params(recurrent_layer_5, trainable=True)

## Costs and Training Functions 

In [4]:
# Cost functions
target_values = T.imatrix('target_output')
input_values  = T.imatrix()

### Gradients ###
# pseudo costs - ctc cross entropy b/n targets and linear output - used in training
pseudo_cost = ctc_cost.pseudo_cost(target_values, output_lin_ctc)
pseudo_cost_grad = T.grad(pseudo_cost.sum() / batchsize, all_params)
pseudo_cost = pseudo_cost.mean()

# true costs
cost = ctc_cost.cost(target_values, network_output)
cost = cost.mean()

# Compute SGD updates for training
print("Computing updates ...")
updates = lasagne.updates.rmsprop(pseudo_cost_grad, all_params, LEARNING_RATE)

# Theano functions for training and computing cost
print("Compiling functions ...")
train = theano.function([input_layer.input_var, target_values], [cost, pseudo_cost, network_output], updates=updates)
validate = theano.function([input_layer.input_var, target_values], [cost, network_output]) 
predict  = theano.function([input_layer.input_var], network_output)

Computing updates ...
Compiling functions ...


## Loading Data in Batches

### Extracting Training Audio File Names

In [5]:
max_frame_size=0
audio_names=[]


with h5py.File('timit_files/train_audio.h5', 'r') as h5:
    with open('audio_key.txt','r') as f:
        for line in f:
            line=line.rstrip()
            audio_names.append(line)
            cur=h5[line].shape[1]
            #print cur
            if cur>max_frame_size:
                max_frame_size=cur
number_of_audio_files=len(audio_names)

In [6]:
#transposing the data and creating a 3d tensor
all_audio=np.zeros((number_of_audio_files,336,55))
file_ind=0
with open('audio_key.txt','r') as f:
    for line in f:
        line=line.rstrip()
        with h5py.File('timit_files/train_audio_zero_padded.h5', 'r') as h5:
            zero_padded_audio=np.transpose(h5[line][:])
            for i in range(zero_padded_audio.shape[0]):
                for j in range(zero_padded_audio.shape[1]):
                    all_audio[file_ind][i][j]=zero_padded_audio[i][j]
            file_ind+=1

In [7]:
all_phn=np.zeros((number_of_audio_files,151))
file_ind=0
with open('audio_key.txt','r') as f:
    for line in f:
        line=line.rstrip()
        with h5py.File('timit_files/phoneme_list_encode_space_padded.h5', 'r') as h5:
            blank_padded_phn=h5[line][:]
            for i in range(blank_padded_phn.shape[0]):
                all_phn[file_ind][i]=blank_padded_phn[i]
            file_ind+=1

In [8]:
#combining all the data as one
all_audio=all_audio.astype(np.float32)
all_phn=all_phn.astype(np.int32)
training_data=zip(all_audio,all_phn)

## Training Network 

In [9]:
print("Training network ...")
num_batches_train = int(np.ceil(len(all_audio) / batch_size))
split_ratio = 0.8*batch_size


for epoch in range(NUM_EPOCHS):
    print("EPOCH #"+str(epoch))
    tlosses = []
    validation_losses = []
    vlosses = []
    plosses = []
    probabilities = []
    
    training_data=zip(all_audio,all_phn)
    np.random.shuffle(training_data)
    audio, phonemes = zip(*training_data)
    
    for batch in range(num_batches_train):
        
        batch_slice = slice(batch_size * batch, batch_size * (batch + 1))
        
        xi = audio[batch_slice]
        yi = phonemes[batch_slice]
        
        if batch < split_ratio:
            loss, ploss, probs = train(xi,yi)
            tlosses.append(loss)
            plosses.append(ploss)
        else:
            loss, probs = validate(xi,yi)
            y_pred = np.argmax(probs, axis=-1)
            vlosses.append(loss)
            probabilities.append(probs)    
            
        print("Batch {0}/{1}, loss:{2:.6}, ploss:{3:.6}".format(batch,num_batches_train,loss,ploss))

Training network ...
EPOCH #0
Batch 0/36, loss:1153.8, ploss:5.3493
Batch 1/36, loss:216.18, ploss:-548.2
Batch 2/36, loss:212.86, ploss:429.48
Batch 3/36, loss:222.15, ploss:-529.0
Batch 4/36, loss:150.04, ploss:355.24
Batch 5/36, loss:42.538, ploss:44.621
Batch 6/36, loss:83.971, ploss:-135.1
Batch 7/36, loss:84.363, ploss:243.67
Batch 8/36, loss:42.048, ploss:-100.1
Batch 9/36, loss:28.336, ploss:95.203
Batch 10/36, loss:33.233, ploss:-113.0
Batch 11/36, loss:30.251, ploss:122.78
Batch 12/36, loss:23.965, ploss:-74.49
Batch 13/36, loss:16.484, ploss:52.102
Batch 14/36, loss:16.608, ploss:-61.75
Batch 15/36, loss:19.096, ploss:60.873
Batch 16/36, loss:15.813, ploss:-89.53
Batch 17/36, loss:19.165, ploss:87.715
Batch 18/36, loss:14.848, ploss:-91.60
Batch 19/36, loss:18.361, ploss:87.599
Batch 20/36, loss:8.5992, ploss:-77.33
Batch 21/36, loss:14.297, ploss:66.326
Batch 22/36, loss:7.6399, ploss:-66.27
Batch 23/36, loss:7.7050, ploss:54.274
Batch 24/36, loss:7.9469, ploss:-60.07
Batch

### Sample Alignment Test using Validation Data 

In [58]:

np.random.shuffle(training_data)
audio, phonemes = zip(*training_data)
batch_slice = slice(batch_size * batch, batch_size * (batch + 1))
xi= audio[batch_slice]
yi=phonemes[batch_slice]


h5=h5py.File('timit_files/phoneme_list.h5','r')
phn_list=h5['list_phn'][:]
h5.close()

probs=predict(xi)
y_pred=np.argmax(probs,axis=-1)
y_pred=y_pred[9]

phn_str=""
label_str=''

for i in range(len(y_pred)):
    phn_str+=str(phn_list[y_pred[i]])

print 'network output: \n'
print phn_str

labeled=yi[9]
for i in range(len(labeled)):
    label_str+=str(phn_list[labeled[i]])
print 'actual output: \n'
print label_str

network output: 

_h#_q_ix__tcl_t_ss_ix_____l___iy___gcl_g_ih_l__tcl_t__ax__pcl__p_ow____s__tcl_t_ey__dx__ix_tcltcl__ch___eh_kcl_k_h#__________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________
actual output: 

_h#_q_ix_tcl_t_s_ix_l_iy_gcl_g_ih_l_tcl_t_ax_pcl_p_ow_s_tcl_d_ey_dx_ix_tcl_ch_eh_kcl_k_h#_________________________________________________________________________________________


### Processing Testing Data

In [60]:
max_frame_size=0
audio_names=[]


with h5py.File('timit_files/test_audio.h5', 'r') as h5:
    with open('timit_files/test_audio_key.txt','r') as f:
        for line in f:
            line=line.rstrip()
            audio_names.append(line)
            cur=h5[line].shape[1]
            #print cur
            if cur>max_frame_size:
                max_frame_size=cur
number_of_audio_files=len(audio_names)

In [62]:
number_of_audio_files

1680

In [63]:
#transposing the data and creating a 3d tensor
test_all_audio=np.zeros((number_of_audio_files,327,55))
file_ind=0
with open('timit_files/test_audio_key.txt','r') as f:
    for line in f:
        line=line.rstrip()
        with h5py.File('timit_files/test_audio_zero_padded.h5', 'r') as h5:
            zero_padded_audio=np.transpose(h5[line][:])
            for i in range(zero_padded_audio.shape[0]):
                for j in range(zero_padded_audio.shape[1]):
                    test_all_audio[file_ind][i][j]=zero_padded_audio[i][j]
            file_ind+=1

In [66]:
test_all_phn=np.zeros((number_of_audio_files,147))
file_ind=0
with open('timit_files/test_audio_key.txt','r') as f:
    for line in f:
        line=line.rstrip()
        with h5py.File('timit_files/test_phoneme_list_encode_space_padded.h5', 'r') as h5:
            blank_padded_phn=h5[line][:]
            for i in range(blank_padded_phn.shape[0]):
                test_all_phn[file_ind][i]=blank_padded_phn[i]
            file_ind+=1

In [73]:
#combining audio signals and labels as one
test_all_audio=all_audio.astype(np.float32)
test_all_phn=all_phn.astype(np.int32)
testing_data=zip(all_audio,all_phn)

### Sample Output using Testing Data 

In [82]:
np.random.shuffle(testing_data)
audio, phonemes = zip(*testing_data)
batch_slice = slice(batch_size * batch, batch_size * (batch + 1))
xi= audio[batch_slice]
yi=phonemes[batch_slice]


h5=h5py.File('timit_files/phoneme_list.h5','r')
phn_list=h5['list_phn'][:]
h5.close()

probs=predict(xi)
y_pred=np.argmax(probs,axis=-1)
y_pred=y_pred[11]

phn_str=""
label_str=''

for i in range(len(y_pred)):
    phn_str+=str(phn_list[y_pred[i]])

print 'network output: \n'
print phn_str

labeled=yi[11]
for i in range(len(labeled)):
    label_str+=str(phn_list[labeled[i]])
print 'actual output: \n'
print label_str

network output: 

_h#__y_ux__kcl_k_uh__dcl__b__er___n_dcl_d__aw__n__dh__ax_s___hh__ow_l___m_n__aw___tcl_________s_____ay___dcl_w_ax_dh__ax___f___ay_____axr__dh__ae___tcl_t_s____ay___s__h#________________________________________________________________________________________________________________________________________________________________________________________________________
actual output: 

_h#_y_ux_kcl_k_uh_dcl_b_er_n_dcl_d_aw_n_dh_ax_s_hh_ow_l_m_aw_n_tcl_en_s_ay_dcl_w_ax_dh_ax_f_ay_axr_dh_ae_tcl_t_s_ay_z_h#_________________________________________________________________
