In [1]:
from __future__ import print_function

from keras.engine import Model
from keras_vggface.vggface import VGGFace
from keras.preprocessing import image
from keras.models import Sequential
from keras.layers.recurrent import LSTM
from keras.layers.core import Dense

import numpy as np

from pre_processing import load_data

Using TensorFlow backend.


In [5]:
# Load the data.
data, vocab = load_data(k=5, speakers=['s1'], shuffle=True, use_delta_frames=True)

Skipped 5 empty data files.


In [6]:
vocab.occurrences()

{'a': 40,
 'again': 250,
 'at': 252,
 'b': 40,
 'bin': 240,
 'blue': 250,
 'by': 248,
 'c': 40,
 'd': 40,
 'e': 40,
 'eight': 100,
 'f': 40,
 'five': 100,
 'four': 100,
 'g': 40,
 'green': 250,
 'h': 40,
 'i': 40,
 'in': 252,
 'j': 40,
 'k': 40,
 'l': 40,
 'lay': 248,
 'm': 40,
 'n': 40,
 'nine': 100,
 'now': 250,
 'o': 40,
 'one': 100,
 'p': 40,
 'place': 256,
 'please': 250,
 'q': 40,
 'r': 40,
 'red': 250,
 's': 40,
 'set': 256,
 'seven': 100,
 'six': 100,
 'soon': 250,
 't': 40,
 'three': 100,
 'two': 100,
 'u': 40,
 'v': 40,
 'white': 250,
 'with': 248,
 'x': 40,
 'y': 40,
 'z': 40,
 'zero': 100}

In [7]:
# Select which fold of the k-folds to use.
FOLD = 0

train, test = data[FOLD]
x_train, y_train = train
x_test, y_test = test

print('Number of words for training: ', x_train.shape[0])
print('Number of words for testing: ', x_test.shape[0])
print('Frames per word: ', x_train.shape[1])
print('Features per frame: ', x_train.shape[2])

Number of words for training:  4800
Number of words for testing:  1200
Frames per word:  6
Features per frame:  512


In [17]:
# Build the LSTM model.
model = Sequential()
model.add(LSTM(128, input_shape=x_train[0].shape, return_sequences=True))
model.add(LSTM(128))
model.add(Dense(len(vocab), activation='tanh'))
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train the model.
model.fit(x_train, y_train, batch_size=32, epochs=15, verbose=1)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15

In [16]:
# Test the model.
score, acc = model.evaluate(x_test, y_test, batch_size=32)
print('Test score: ', score)
print('Test accuracy: ', acc)

Test accuracy:  0.005


In [8]:
# Try matching some results.
frames_per_word = x_test.shape[1]
features_per_frame = x_test.shape[2]

RESULTS_TO_CHECK = 50
MATCHES_PER_WORD = 4

for i in xrange(RESULTS_TO_CHECK):
    pred = model.predict(x_test[i].reshape(1, frames_per_word, features_per_frame))
    predicted_indexes = np.argsort(pred.reshape(len(vocab)))[::-1][:MATCHES_PER_WORD]
    correct_index = np.argsort(y_test[i])[::-1][0]

    correct_word = vocab[correct_index]
    predicted_words = [vocab[i] for i in predicted_indexes]

    print('{} : {}'.format(correct_word, predicted_words))

k : ['k', 'five', 'f', 'lay']
now : ['now', 'l', 'eight', 'q']
set : ['set', 'lay', 'z', 'h']
at : ['at', 'with', 'nine', 'eight']
green : ['green', 'in', 'q', 'c']
place : ['place', 'x', 'z', 'u']
please : ['please', 'eight', 'c', 'nine']
in : ['at', 'in', 'with', 'r']
blue : ['blue', 'p', 'bin', 'q']
six : ['six', 'zero', 'c', 'five']
at : ['in', 'at', 'with', 's']
c : ['with', 'c', 'six', 'z']
place : ['place', 'u', 'c', 'nine']
k : ['k', 'eight', 'a', 'at']
zero : ['zero', 'soon', 'q', 'again']
q : ['q', 'zero', 'again', 'set']
place : ['place', 'l', 'x', 'q']
soon : ['soon', 'four', 'eight', 'again']
soon : ['soon', 'h', 'with', 'z']
soon : ['soon', 'h', 'with', 'l']
green : ['green', 'c', 'nine', 'place']
set : ['set', 'o', 'i', 'bin']
set : ['set', 'bin', 'nine', 'five']
again : ['again', 'five', 'd', 'l']
in : ['at', 'in', 'with', 'r']
zero : ['zero', 'q', 'again', 'lay']
soon : ['soon', 'with', 'again', 'u']
five : ['five', 'eight', 'y', 'at']
one : ['one', 'y', 'by', 'nine']
