In [1]:
from data_generator import DataGenerator

# Description files for the training and validation partitions; both are
# handed to the DataGenerator below.
train_desc_file = 'train_corpus.json'
val_desc_file = 'validation_corpus.json'

# Prepare the data generator
datagen = DataGenerator()
# Load the JSON files that describe the training and validation sets
datagen.load_train_data(train_desc_file)
datagen.load_validation_data(val_desc_file)
# Use a few samples from the dataset, to calculate the means and variance
# of the features, so that we can center our inputs to the network
# NOTE(review): 100 is presumably the number of samples used for the
# estimate - confirm against DataGenerator.fit_train.
datagen.fit_train(100)

Using TensorFlow backend.


In [2]:
import json

desc_file = 'train_corpus_test.json'
partition = 'train'
# Records whose 'duration' field exceeds this value are skipped
# (units are presumably seconds - confirm against the corpus tooling).
max_duration = 10.0

print('Reading description file: {} for partition: {}'
                    .format(desc_file, partition))
# Three parallel lists: entry i of each one describes the same utterance.
audio_paths, durations, texts = [], [], []
with open(desc_file) as json_line_file:
    # The file holds one JSON object per line.
    for line_num, json_line in enumerate(json_line_file):
        # Echo the raw record so the filtering below can be checked by eye.
        print(json_line)
        try:
            spec = json.loads(json_line)
            # Extract every field before touching the result lists, so a
            # record with a missing or malformed field can never leave the
            # three lists with different lengths.
            duration = float(spec['duration'])
            audio_path = spec['key']
            text = spec['text']
        except (ValueError, KeyError, TypeError) as e:
            # ValueError: invalid JSON or non-numeric duration;
            # KeyError: missing field; TypeError: record is not a JSON object.
            print('Error reading line #{}: {} ({}: {})'
                                .format(line_num, json_line,
                                        type(e).__name__, e))
            continue
        if duration > max_duration:
            continue
        audio_paths.append(audio_path)
        durations.append(duration)
        texts.append(text)

Reading description file: train_corpus_test.json for partition: train
{"duration": 5.855, "key": "LibriSpeech/train-clean-100/1272/128104/1272-128104-0000.wav", "text": "mister quilter is the apostle of the middle classes and we are glad to welcome his gospel"}

{"duration": 4.815, "key": "LibriSpeech/train-clean-100/1272/128104/1272-128104-0001.wav", "text": "nor is mister quilter's manner less interesting than his matter"}

{"duration": 12.485, "key": "LibriSpeech/train-clean-100/1272/128104/1272-128104-0002.wav", "text": "he tells us that at this festive season of the year with christmas and roast beef looming before us similes drawn from eating and its results occur most readily to the mind"}

{"duration": 9.9, "key": "LibriSpeech/train-clean-100/1272/128104/1272-128104-0003.wav", "text": "he has grave doubts whether sir frederick leighton's work is really greek after all and can discover in it but little of rocky ithaca"}


In [3]:
# Sanity check: dump the three parallel lists built from the description
# file. They should all have the same length, and records whose duration
# exceeds max_duration should be absent.
print(audio_paths)
print(durations)
print(texts)

[u'LibriSpeech/train-clean-100/1272/128104/1272-128104-0000.wav', u'LibriSpeech/train-clean-100/1272/128104/1272-128104-0001.wav', u'LibriSpeech/train-clean-100/1272/128104/1272-128104-0003.wav']
[5.855, 4.815, 9.9]
[u'mister quilter is the apostle of the middle classes and we are glad to welcome his gospel', u"nor is mister quilter's manner less interesting than his matter", u"he has grave doubts whether sir frederick leighton's work is really greek after all and can discover in it but little of rocky ithaca"]


In [5]:
import numpy as np
from utils import calc_feat_dim, spectrogram_from_file, text_to_int_sequence

# Number of utterances placed in each minibatch.
minibatch_size = 3
# Number of minibatches needed to cover every utterance; the last one may be
# partial. The divisor is converted to float so the quotient is not floored
# before np.ceil sees it when `/` is integer division (Python 2 without
# `from __future__ import division`).
k_iters = int(np.ceil(len(audio_paths) / float(minibatch_size)))

# featurize
def featurize(audio_clip):
    """Return the spectrogram features of a single audio file.

    Thin wrapper around ``spectrogram_from_file`` with fixed settings
    (``step=10``, ``window=20``, ``max_freq=8000``).

    Params:
        audio_clip: value forwarded unchanged as the first argument of
            ``spectrogram_from_file`` (presumably a path to an audio file -
            confirm against utils).
    Returns:
        Whatever ``spectrogram_from_file`` returns for this clip.
    """
    spectrogram = spectrogram_from_file(
        audio_clip, step=10, window=20, max_freq=8000)
    return spectrogram

# prepare_minibatch
def prepare_minibatch(audio_paths, texts):
    """Featurize a batch of clips and allocate its zero-padded input array.

    Work in progress: only the featurization and the allocation of the
    padded array are active. The function prints the array's shape and
    returns None; the remaining steps (filling the array, encoding the
    labels, building the result dict) live in the disabled string block at
    the end of the body.

    Params:
        audio_paths: sequence of clips, each one passed to ``featurize``.
        texts: sequence of transcripts; must have the same length as
            ``audio_paths``.
    Raises:
        AssertionError: if the two sequences differ in length.
    """
    assert len(audio_paths) == len(texts), (
        "Inputs and outputs to the network must be of the same number")
    # One feature matrix per clip, in input order.
    features = list(map(featurize, audio_paths))
    # First axis of each feature matrix.
    input_lengths = [feat.shape[0] for feat in features]
    mb_size, max_length, feature_dim = (
        len(features), max(input_lengths), features[0].shape[1])
    # Zero-filled so that shorter clips end up padded to the longest one.
    x = np.zeros(shape=(mb_size, max_length, feature_dim))
    print(x.shape)
    # Disabled draft of the rest of the function, kept as a bare string so it
    # never executes.
    '''
        y = []
        label_lengths = []
        for i in range(mb_size):
            feat = features[i]
            feat = self.normalize(feat)  # Center using means and std
            x[i, :feat.shape[0], :] = feat
            label = text_to_int_sequence(texts[i])
            y.append(label)
            label_lengths.append(len(label))
        # Flatten labels to comply with warp-CTC signature
        y = reduce(lambda i, j: i + j, y)
        return {
            'x': x,  # (0-padded features of shape(mb_size,timesteps,feat_dim)
            'y': y,  # list(int) Flattened labels (integer sequences)
            'texts': texts,  # list(str) Original texts
            'input_lengths': input_lengths,  # list(int) Length of each input
            'label_lengths': label_lengths  # list(int) Length of each label
        }
    '''

In [6]:
# Smoke test on the utterances parsed from the description file; the only
# visible effect is the printed shape of the padded input array.
prepare_minibatch(audio_paths, texts)

(3, 989, 161)


In [None]:
from keras.layers import (BatchNormalization, Convolution1D, Dense, Input, GRU, TimeDistributed)
from keras.models import Model

# Hyper-parameters of the acoustic model.
# NOTE(review): the keyword names used below (border_mode, subsample_length,
# init, mode, Model(input=..., output=...)) look like the Keras 1.x API -
# confirm the installed Keras version before running this cell.
input_dim=161  # size of the last axis of the acoustic input
output_dim=29  # units of the final Dense layer (presumably the label alphabet size - confirm)
recur_layers=3  # number of stacked GRU layers
nodes=1024  # width shared by the convolution and every GRU layer
conv_context=11  # second positional argument of Convolution1D (filter length)
conv_border_mode='valid'  # forwarded as border_mode
conv_stride=2  # forwarded as subsample_length
initialization='glorot_uniform'  # weight initializer used by every layer
batch_norm=True  # insert BatchNormalization after the conv and each GRU

# Variable-length sequences of input_dim-sized feature vectors.
acoustic_input = Input(shape=(None, input_dim), name='acoustic_input')

# Setup the network: 1-D convolution over the time axis first.
conv_1d = Convolution1D(nodes, conv_context, name='conv1d',
                            border_mode=conv_border_mode,
                            subsample_length=conv_stride, init=initialization,
                            activation='relu')(acoustic_input)
if batch_norm:
    output = BatchNormalization(name='bn_conv_1d', mode=2)(conv_1d)
else:
    output = conv_1d

# Stack of GRU layers named rnn_1 .. rnn_<recur_layers>; each returns the
# full sequence so the next layer still sees one vector per timestep.
for r in range(recur_layers):
    output = GRU(nodes, activation='relu',
                     name='rnn_{}'.format(r + 1), init=initialization,
                     return_sequences=True)(output)
    if batch_norm:
        bn_layer = BatchNormalization(name='bn_rnn_{}'.format(r + 1),
                                          mode=2)
        output = bn_layer(output)

# We don't softmax here because CTC does that
# The same linear Dense layer is applied at every timestep.
network_output = TimeDistributed(Dense(
    output_dim, name='dense', activation='linear', init=initialization))(output)
model = Model(input=acoustic_input, output=network_output)

# Print the layer-by-layer summary of the assembled model.
model.summary()

In [None]:
import ctc
import keras.backend as K
import lasagne

def compile_train_fn(model, learning_rate=2e-4):
    """Build the backend function that runs one CTC training step.

    Params:
        model: Keras model; its first input and first output are used.
        learning_rate: step size passed to the Nesterov-momentum update rule.
    Returns:
        A ``K.function`` taking
        ``[acoustic_input, output_lens, label, label_lens, learning_phase]``
        and returning ``[network_output, ctc_cost]``, where ``network_output``
        is the model output with its first two axes swapped. Calling it also
        applies the weight updates.
    """
    acoustic_input, network_output = model.inputs[0], model.outputs[0]

    # Three 1-D int32 placeholders, created in the order: output lengths,
    # labels, label lengths.
    output_lens, label, label_lens = [
        K.placeholder(ndim=1, dtype='int32') for _ in range(3)]

    # The CTC op is fed the output with axes 0 and 1 swapped.
    # NOTE(review): dimshuffle is a Theano tensor method - this assumes the
    # Theano backend; confirm.
    ctc_ready = network_output.dimshuffle((1, 0, 2))
    # Mean of the CTC cost returned by the op.
    ctc_cost = ctc.cpu_ctc_th(
        ctc_ready, output_lens, label, label_lens).mean()

    # Gradients w.r.t. every trainable weight, passed through Lasagne's
    # total-norm constraint (threshold 100), then turned into
    # Nesterov-momentum updates (momentum 0.99).
    weights = model.trainable_weights
    constrained_grads = lasagne.updates.total_norm_constraint(
        K.gradients(ctc_cost, weights), 100)
    updates = lasagne.updates.nesterov_momentum(
        constrained_grads, weights, learning_rate, 0.99)

    fn_inputs = [acoustic_input, output_lens, label, label_lens,
                 K.learning_phase()]
    fn_outputs = [ctc_ready, ctc_cost]
    return K.function(fn_inputs, fn_outputs, updates=updates)

def compile_test_fn(model):
    """Build the backend function that evaluates the CTC cost without training.

    Params:
        model: Keras model; its first input and first output are used.
    Returns:
        A ``K.function`` taking
        ``[acoustic_input, output_lens, label, label_lens, learning_phase]``
        and returning ``[network_output, ctc_cost]``, where ``network_output``
        is the model output with its first two axes swapped. No weight
        updates are attached.
    """
    def int_vector():
        # 1-D int32 placeholder, used for lengths and flattened labels.
        return K.placeholder(ndim=1, dtype='int32')

    net_in = model.inputs[0]
    net_out = model.outputs[0]

    output_lens = int_vector()
    label = int_vector()
    label_lens = int_vector()

    # The CTC op is fed the output with axes 0 and 1 swapped.
    # NOTE(review): dimshuffle is a Theano tensor method - this assumes the
    # Theano backend; confirm.
    swapped_out = net_out.dimshuffle((1, 0, 2))
    per_sample_cost = ctc.cpu_ctc_th(swapped_out, output_lens,
                                     label, label_lens)
    ctc_cost = per_sample_cost.mean()

    val_fn = K.function(
        [net_in, output_lens, label, label_lens, K.learning_phase()],
        [swapped_out, ctc_cost])
    return val_fn

In [None]:
# Compile the CTC training function (forward pass, CTC cost and weight
# updates) for the model built above, using the default learning rate.
train_fn = compile_train_fn(model)
# Compile the validation function (forward pass and CTC cost only)
val_fn = compile_test_fn(model)

In [None]:
epochs = 10
mb_size = 16

# Per-iteration training costs and per-checkpoint validation costs.
train_costs, val_costs = [], []
# Total number of minibatches processed across all epochs.
iters = 0

# NOTE(review): `logger`, `validation`, `save_model`, `save_dir` and
# `model.conv_output_length` are not defined in any cell visible here; they
# must be imported or defined before this cell can run - confirm where they
# are supposed to come from.

for e in range(epochs):
    # sortagrad on first epoch, shuffle on all future epochs
    shuffle = e != 0
    sortagrad = e == 0

    for i, batch in \
        enumerate(datagen.iterate_train(mb_size, shuffle=shuffle,
                                        sort_by_duration=sortagrad)):
        inputs = batch['x']
        labels = batch['y']
        input_lengths = batch['input_lengths']
        label_lengths = batch['label_lengths']
        # Due to convolution, the number of timesteps of the output
        # is different from the input length. Calculate the resulting
        # timesteps
        output_lengths = [model.conv_output_length(l)
                              for l in input_lengths]
        # The trailing True is the learning-phase flag (training mode).
        _, ctc_cost = train_fn([inputs, output_lengths, labels,
                                    label_lengths, True])
        train_costs.append(ctc_cost)
        # Log every 10th minibatch of the epoch.
        if i % 10 == 0:
            logger.info("Epoch: {}, Iteration: {}, Loss: {}"
                        .format(e, i, ctc_cost))
        # Count every minibatch, not only the logged ones.
        iters += 1
        # Validate and checkpoint every 500 minibatches.
        if iters % 500 == 0:
            val_cost = validation(model, val_fn, datagen, mb_size)
            val_costs.append(val_cost)
            save_model(save_dir, model, train_costs, val_costs, iters)

    # Skip the end-of-epoch checkpoint when one was just written by the
    # periodic check above.
    if iters % 500 != 0:
        # End of an epoch. Check validation cost and save costs
        val_cost = validation(model, val_fn, datagen, mb_size)
        val_costs.append(val_cost)
        save_model(save_dir, model, train_costs, val_costs, iters)