# The New Kaggle Ecosystem


## (Or how I learned to stop worrying and love the bomb)

# Things We Need

In [None]:
import tensorflow as tf

# Things We Need

### 1. A Model

## A Model

In [None]:
class ImageToDeepSpeech(tf.keras.layers.Layer):
    """Turn a single-channel spectrogram image into stacked context windows.

    The layer drops the trailing channel axis (axis 3) and, for every
    `frame_step`-th time step, concatenates `num_frames` consecutive time
    slices along the feature axis.

    Input:  B x T x F x 1
    Output: B x ceil((T - num_frames + 1) / frame_step) x (num_frames * F)

    Args:
        num_frames: number of consecutive time steps stacked into one window.
        frame_step: stride, in time steps, between the starts of two windows.
        **kwargs: forwarded to `tf.keras.layers.Layer`.
    """

    def __init__(self, num_frames, frame_step, **kwargs):
        self.num_frames = num_frames
        self.frame_step = frame_step
        super(ImageToDeepSpeech, self).__init__(**kwargs)

    def call(self, inputs):
        """Return the time-shifted copies of `inputs` concatenated on axis 2."""
        inputs = tf.squeeze(inputs, axis=3)
        last_offset = self.num_frames - 1
        time_shifted_inputs = []
        for offset in range(self.num_frames):
            # Every shifted copy drops `last_offset` steps in total so that all
            # copies have the same length. For the last copy the stop index
            # would be 0 (an empty slice), so fall back to None == "to the end".
            stop = (offset - last_offset) or None
            time_shifted_inputs.append(inputs[:, offset:stop:self.frame_step])
        return tf.concat(time_shifted_inputs, axis=2)

    def compute_output_shape(self, input_shape):
        """Return the static output shape matching what `call` produces.

        Uses plain integer arithmetic so the result holds Python ints (or
        None for unknown dimensions) rather than tensors.
        """
        shape = tf.TensorShape(input_shape).as_list()
        time_steps, num_features = shape[1], shape[2]

        if time_steps is None:
            time_dim = None
        else:
            # each shifted copy keeps T - (num_frames - 1) steps before striding
            usable_steps = max(time_steps - (self.num_frames - 1), 0)
            # ceiling division without floats: ceil(a / b) == -(-a // b)
            time_dim = -(-usable_steps // self.frame_step)

        if num_features is None:
            feature_dim = None
        else:
            feature_dim = self.num_frames * num_features
        return tf.TensorShape([shape[0], time_dim, feature_dim])

    def get_config(self):
        """Return the layer config including `num_frames` and `frame_step`."""
        base_config = super(ImageToDeepSpeech, self).get_config()
        base_config['num_frames'] = self.num_frames
        base_config['frame_step'] = self.frame_step
        return base_config

In [None]:
class DeepSpeechCell(tf.keras.layers.Layer):
    """Simple ReLU recurrent cell with separate forward/backward recurrent kernels.

    One step computes ``relu(inputs . kernel + prev_output . recurrent_kernel + bias)``.
    The input kernel and bias are created once; two recurrent kernels
    (forward and backward) are created, and `recurrent_kernel` points at
    whichever one is currently active (see `build`).

    Args:
        state_size: width of the cell output / recurrent state.
        **kwargs: forwarded to `tf.keras.layers.Layer`.
    """

    def __init__(self, state_size, **kwargs):
        self.state_size = state_size
        super(DeepSpeechCell, self).__init__(**kwargs)

    def call(self, inputs, states):
        """Run one recurrent step; returns ``(output, [output])``."""
        prev_output = states[0]
        h = tf.matmul(inputs, self.kernel)
        u = tf.matmul(prev_output, self.recurrent_kernel)
        output = tf.nn.relu(h + u + self.bias)
        # the output doubles as the next state
        return output, [output]

    def build(self, input_shape):
        """Create the weights on first call; switch to the backward kernel on later calls.

        NOTE(review): the switch only happens if the wrapping layer calls
        `build` again on an already-built cell - confirm this holds for the
        Keras version in use.
        NOTE(review): `call` reads `self.recurrent_kernel` when it runs, so
        which kernel each RNN ends up using depends on the order of
        build/call invocations - verify.
        """
        self.input_dim = input_shape[1]
        if self.built:
            # normally you just return. This is a hack to allow the
            # second calling of this cell to use a different recurrent
            # kernel. Not elegant but that's showbiz baby
            self.recurrent_kernel = self.backward_recurrent_kernel
            return

        # input-to-hidden weights and bias
        self.kernel = self.add_weight(
            shape=(self.input_dim, self.state_size),
            name='kernel',
            initializer='glorot_normal')
        self.bias = self.add_weight(
            shape=(self.state_size,),
            name='bias',
            initializer='zeros')

        # one hidden-to-hidden matrix per direction
        self.forward_recurrent_kernel = self.add_weight(
            shape=(self.state_size, self.state_size),
            name='forward_recurrent_kernel',
            initializer='glorot_normal')
        self.backward_recurrent_kernel = self.add_weight(
            shape=(self.state_size, self.state_size),
            name='backward_recurrent_kernel',
            initializer='glorot_normal')

        # the first build selects the forward kernel
        self.recurrent_kernel = self.forward_recurrent_kernel
        super(DeepSpeechCell, self).build(input_shape)

    def get_config(self):
        """Return the layer config including `state_size`."""
        base_config = super(DeepSpeechCell, self).get_config()
        base_config['state_size'] = self.state_size
        return base_config

## A Model

In [None]:
def deepspeech_model(
        num_frames,
        frame_step,
        hidden_dims,
        num_classes,
        dropout=0.05,
        input_shape=(99, 161, 1)):
    """Build a DeepSpeech-style classifier as a `tf.keras.Model`.

    Args:
        num_frames: context window size passed to `ImageToDeepSpeech`.
        frame_step: window stride passed to `ImageToDeepSpeech`.
        hidden_dims: layer widths. Entries 0-2 size the time-distributed
            dense layers, entry 3 sizes the recurrent cell, and any entries
            from index 4 on size the dense layers after the recurrence.
        num_classes: width of the final softmax layer.
        dropout: dropout rate applied before every dense layer.
        input_shape: per-example input shape (time, frequency, channel);
            defaults to the 99 x 161 x 1 spectrograms used originally.

    Returns:
        An uncompiled `tf.keras.Model` with input named 'spec' and output
        layer named 'labels'.
    """
    # input and convert from image to time series representation
    spec_input = tf.keras.Input(shape=input_shape, name='spec')
    x = ImageToDeepSpeech(num_frames, frame_step)(spec_input)

    # transform with 3 time distributed dense layers
    for n, hdim in enumerate(hidden_dims[:3]):
        x = tf.keras.layers.Dropout(dropout, name='dropout_{}'.format(n))(x)
        dense = tf.keras.layers.Dense(hdim, activation='relu')
        x = tf.keras.layers.TimeDistributed(dense, name='dense_{}'.format(n))(x)

    # perform forwards and backwards recurrent layers then combine
    # note that we're not returning sequences, so we're going from shape
    # B x T x F --> B x F
    # the same cell instance is handed to both RNN wrappers on purpose
    cell = DeepSpeechCell(hidden_dims[3])
    forward = tf.keras.layers.RNN(cell, return_sequences=False, name='forward_rnn')(x)
    backward = tf.keras.layers.RNN(cell, return_sequences=False, go_backwards=True, name='backward_rnn')(x)
    x = tf.keras.layers.Add(name='rnn_combiner')([forward, backward])

    # transform with more dense layers (now not time distributed)
    # numbering continues after the three time-distributed layers above
    for n, hdim in enumerate(hidden_dims[4:]):
        x = tf.keras.layers.Dropout(dropout, name='dropout_{}'.format(n+3))(x)
        x = tf.keras.layers.Dense(hdim, activation='relu', name='dense_{}'.format(n+3))(x)

    # produce output
    x = tf.keras.layers.Dropout(dropout, name='dropout_labels')(x)
    x = tf.keras.layers.Dense(num_classes, activation='softmax', name='labels')(x)
    return tf.keras.Model(inputs=spec_input, outputs=x)

# Things We Need
## 1. A Model

## 2. A Data Pipeline

## A Data Pipeline

In [None]:
import numpy as np
import multiprocessing as mp

# CPU count reported by the OS; used as the parallelism of the input pipeline's map step
num_cpus = mp.cpu_count()

## A Data Pipeline

In [None]:
def get_input_fn(
        dataset_path,
        labels,
        batch_size,
        num_epochs,
        mean='/data/mean.npy',
        std='/data/std.npy',
        eps=0.0001):
    """Create an Estimator `input_fn` yielding (spectrogram, one-hot label) batches.

    Args:
        dataset_path: path of the TFRecord file to read.
        labels: list of class names. Words that are not in the list are
            mapped to the LAST entry, which therefore acts as the catch-all
            class (e.g. 'unknown').
        batch_size: number of examples per batch.
        num_epochs: number of passes over the data.
        mean: path of a .npy file with the spectrogram mean.
        std: path of a .npy file with the spectrogram variance (the square
            root is taken here).
        eps: added to the standard deviation to avoid division by zero.

    Returns:
        A zero-argument function that builds and returns the `tf.data.Dataset`.
    """
    def input_fn():
        dataset = tf.data.TFRecordDataset([dataset_path])
        mean_spec = np.load(mean)
        std_spec = np.load(std)**0.5 # saved as variance

        # Out-of-vocabulary words fall back to the last class. An OOV bucket
        # would hand them id == len(labels), which tf.one_hot below turns
        # into an all-zero target vector.
        table = tf.contrib.lookup.index_table_from_tensor(
            mapping=tf.constant(labels),
            num_oov_buckets=0,
            default_value=len(labels) - 1)

        def parse_spectrogram(record):
            """Decode one serialized example into (T x F x 1 spec, one-hot label)."""
            features = {
                'spec': tf.FixedLenSequenceFeature((), tf.float32, allow_missing=True),
                'label': tf.FixedLenFeature((), tf.string, default_value="")
            }
            parsed = tf.parse_single_example(record, features)

            spec = tf.reshape(parsed['spec'], [99, 161]) # Time steps x Frequency bins
            spec = (spec - mean_spec) / (std_spec + eps) # normalize
            spec = tf.expand_dims(spec, axis=2) # add channel dimension, T x F x 1

            # the stored label is a "/"-separated path; the class name is its
            # second-to-last component
            label = tf.string_split([parsed['label']], delimiter="/").values[-2:-1]
            label = table.lookup(label)[0]
            label = tf.one_hot(label, len(labels))
            return (spec, label)

        # naive approach
        # dataset = dataset.shuffle(buffer_size=10000)
        # dataset = dataset.repeat(num_epochs)
        # dataset = dataset.map(parse_spectrogram)
        # dataset = dataset.batch(batch_size)

        # BREAK IN CASE OF EMERGENCY
        dataset = dataset.apply(
            tf.contrib.data.shuffle_and_repeat(10000, num_epochs))
        dataset = dataset.apply(
            tf.contrib.data.map_and_batch(
                map_func=parse_spectrogram,
                batch_size=batch_size,
                num_parallel_calls=num_cpus))
        # Dataset transformations return a new dataset, so the result has to
        # be kept; the bare call left prefetching disabled.
        # NOTE(review): buffer_size=None is kept from the original call -
        # confirm the TF version in use accepts it.
        dataset = dataset.prefetch(buffer_size=None)
        return dataset
    return input_fn

# Things We Need
## 1. A Model
## 2. A Data Pipeline

## 3. A Training Script

## A Training Script
Define some hyperparameters for training

In [None]:
# Where the data lives and how much of it there is
NUM_TRAIN_SAMPLES = 51088
TRAIN_DATA = '/data/train.tfrecords'
VALID_DATA = '/data/valid.tfrecords'
LABELS = '/data/labels.txt'

# Network architecture
HIDDEN_SIZES = [1024, 2048, 2048, 1024, 2048]
NUM_FRAMES = 7
FRAME_STEP = 2
DROPOUT = 0.05

# Optimization and hardware settings
LEARNING_RATE = 8e-5
BATCH_SIZE = 512
NUM_GPUS = 4
NUM_EPOCHS = 100
EVAL_THROTTLE_SECS = 30

# Derived quantities: one step consumes BATCH_SIZE samples on each GPU
STEPS_PER_EPOCH = NUM_TRAIN_SAMPLES // BATCH_SIZE // NUM_GPUS
MAX_STEPS = STEPS_PER_EPOCH * NUM_EPOCHS

## A Training Script
Build and compile a tf.keras model

In [None]:
# Read the comma-separated class names, keep the first 20 and append a
# catch-all 'unknown' class.
with open(LABELS, 'r') as f:
    labels = f.read().split(",")
labels = labels[:20] + ['unknown']

model = deepspeech_model(NUM_FRAMES, FRAME_STEP, HIDDEN_SIZES, len(labels), DROPOUT)
optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
# `compile` takes `metrics` (a list); the misspelled `metric=` keyword was
# swallowed by **kwargs, so no metric was ever registered.
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

## A Training Script

In [None]:
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" to the output.
model.summary()

## A Training Script
Convert our tf.keras model to a distributed TensorFlow estimator

In [None]:
# BREAK IN CASE OF EMERGENCY
# Mirror training across NUM_GPUS devices and checkpoint once per epoch.
strategy = tf.contrib.distribute.DistributeConfig(
    train_distribute=tf.contrib.distribute.MirroredStrategy(
        num_gpus=NUM_GPUS,
        prefetch_on_device=True))
config = tf.estimator.RunConfig(
    experimental_distribute=strategy,
    save_checkpoints_steps=STEPS_PER_EPOCH)

# config w/o distribution
# config = tf.estimator.RunConfig(save_checkpoints_steps=STEPS_PER_EPOCH)

# The custom layer classes are passed by name to the estimator conversion.
custom_objects = dict(
    DeepSpeechCell=DeepSpeechCell,
    ImageToDeepSpeech=ImageToDeepSpeech)
estimator = tf.keras.estimator.model_to_estimator(
    keras_model=model,
    config=config,
    custom_objects=custom_objects)

## A Training Script
Get our data generation functions and build training and evaluation specs

In [None]:
# Training stream: NUM_EPOCHS passes over the training records.
train_input_fn = get_input_fn(
    dataset_path=TRAIN_DATA,
    labels=labels,
    batch_size=BATCH_SIZE,
    num_epochs=NUM_EPOCHS)

# Validation stream: a single pass with 8x larger batches.
eval_input_fn = get_input_fn(
    dataset_path=VALID_DATA,
    labels=labels,
    batch_size=8 * BATCH_SIZE,
    num_epochs=1)

# Training stops after MAX_STEPS; evaluations are throttled to at most one
# every EVAL_THROTTLE_SECS seconds.
train_spec = tf.estimator.TrainSpec(
    input_fn=train_input_fn,
    max_steps=MAX_STEPS)
eval_spec = tf.estimator.EvalSpec(
    input_fn=eval_input_fn,
    throttle_secs=EVAL_THROTTLE_SECS)

## A Training Script
Add a metric to keep track of validation accuracy

In [None]:
def accuracy(labels, predictions):
    """Return an 'accuracy' metric comparing argmax of labels and predictions.

    `labels` is the one-hot target batch; `predictions` is the estimator's
    prediction dict, read under the 'labels' key.
    """
    true_classes = tf.argmax(labels, axis=1)
    predicted_classes = tf.argmax(predictions['labels'], axis=1)
    metric = tf.metrics.accuracy(true_classes, predicted_classes)
    return {'accuracy': metric}


# Wrap the estimator so the metric above is reported during evaluation.
estimator = tf.contrib.estimator.add_metrics(estimator, accuracy)

## A Training Script
The moment we've all been waiting for

In [None]:
# Run training and evaluation with the specs built above.
tf.estimator.train_and_evaluate(
    estimator=estimator,
    train_spec=train_spec,
    eval_spec=eval_spec)