In [1]:
import numpy as np
import keras as k
import keras.backend as K
from keras.engine.topology import Layer
import sklearn as sk
import tensorflow as tf
import h5py
import os
from itertools import izip

h5_dir = 'h5/'

Using TensorFlow backend.


## Reopen saved datasets

Pull all the data out of HDF5 file and back into memory. This is much faster than doing random access directly on the files, particularly if you're using a cloud server with network-attached storage.

In [2]:
datafile = h5py.File(os.path.join(h5_dir, 'training_data.h5'), 'r')
orders_dataset = datafile['orders'][:]
reorders_dataset = datafile['reorders'][:]
users_dataset = datafile['users'][:]
labels_dataset = datafile['labels'][:]
num_rows = datafile['num_rows'][0]
biggest_order_size = datafile['biggest_order_size'][0]
max_product_id = datafile['max_product_id'][0]
max_user_id = datafile['max_user_id'][0]
datafile.close()
del datafile

## Network helpers

From: https://github.com/fchollet/keras/issues/2728

In [3]:
class ZeroMaskedEntries(k.engine.topology.Layer):
    """
    This layer is called after an Embedding layer.
    It zeros out all of the masked-out embeddings.
    It also swallows the mask without passing it on.
    You can change this to default pass-on behavior as follows:

    def compute_mask(self, x, mask=None):
        if not self.mask_zero:
            return None
        else:
            return K.not_equal(x, 0)
    """

    def __init__(self, **kwargs):
        self.support_mask = True
        super(ZeroMaskedEntries, self).__init__(**kwargs)

    def build(self, input_shape):
        self.output_dim = input_shape[1]
        self.repeat_dim = input_shape[2]

    def call(self, x, mask=None):
        mask = K.cast(mask, 'float32')
        mask = K.repeat(mask, self.repeat_dim)
        mask = K.permute_dimensions(mask, (0, 2, 1))
        return x * mask

    def compute_mask(self, input_shape, input_mask=None):
        return None


def mask_aware_mean(x):
    # recreate the masks - all zero rows have been masked
    mask = K.not_equal(K.sum(K.abs(x), axis=2, keepdims=True), 0)

    # number of that rows are not all zeros
    n = K.sum(K.cast(mask, 'float32'), axis=1, keepdims=False)
    
    # compute mask-aware mean of x, or all zeroes if no rows present
    x_mean = K.sum(x, axis=1, keepdims=False) / n
    x_mean = tf.where(tf.is_nan(x_mean), tf.zeros_like(x_mean), x_mean)
    x_mean = tf.verify_tensor_all_finite(x_mean, 'fuck')

    return x_mean


def mask_aware_mean_output_shape(input_shape):
    shape = list(input_shape)
    assert len(shape) == 3 
    return (shape[0], shape[2])

## Network structure

In [5]:
# Layer size constants
user_embedding_size = 50
product_embedding_size = 50

# Activation function for the hidden layers
activation = 'relu'

# Dropout rate for the hidden layers
dropout = 0.1

# Initial learning rate etc.
learning_rate = 0.1
decay_rate = 1e-5

def bn_active_dropout(input, scale=False):
  return k.layers.Dropout(dropout)(
    k.layers.Activation(activation)(
      k.layers.normalization.BatchNormalization(scale=scale)(input)))

# Input layers for the datasets, and the continuous features

user_input = k.layers.Input(shape=(1,), name='user_input')

order_input = k.layers.Input(shape=(biggest_order_size,), name='order_input')

reorder_input = k.layers.Input(shape=(biggest_order_size,), name='reorder_input')

order_sizes_input = k.layers.Input(shape=(1,), name='order_sizes_input')

reorder_sizes_input = k.layers.Input(shape=(1,), name='reorder_sizes_input')

# Set up the embeddings tables -- these map the order, reorder, and user IDs
# into embeddings
# Think of them as lookup tables mapping IDs to vectors of floats

product_embedding_in = k.layers.Embedding(
  input_dim=max_product_id + 1, output_dim=product_embedding_size,
  name='product_embedding_in')

# Set up a separate user embedding table

user_embedding = k.layers.Embedding(
  input_dim=max_user_id + 1, output_dim=user_embedding_size, name='user_embedding')

# Deep averaging networks for the order and reorder sets
# We add order size and reorder size features to averaged vectors, before passing them thru the DANs

order_embedding = product_embedding_in(order_input)

order_vector = k.layers.Lambda(
  mask_aware_mean, mask_aware_mean_output_shape)(order_embedding)

order_vector_ext = k.layers.concatenate(
  [order_vector, order_sizes_input], name='order_vector_ext')

order_hidden_1 = bn_active_dropout(k.layers.Dense(
  product_embedding_size, name='order_hidden_1', use_bias=False)(order_vector_ext))

order_hidden_2 = bn_active_dropout(k.layers.Dense(
  product_embedding_size, name='order_hidden_2', use_bias=False)(order_hidden_1))

order_hidden_3 = bn_active_dropout(k.layers.Dense(
  product_embedding_size, name='order_hidden_3', use_bias=False)(order_hidden_2))

reorder_embedding = product_embedding_in(reorder_input)

reorder_vector = k.layers.Lambda(
  mask_aware_mean, mask_aware_mean_output_shape)(reorder_embedding)

reorder_vector_ext = k.layers.concatenate(
  [reorder_vector, reorder_sizes_input], name='reorder_vector_ext')

reorder_hidden_1 = bn_active_dropout(k.layers.Dense(
  product_embedding_size, name='reorder_hidden_1', use_bias=False)(reorder_vector_ext))

reorder_hidden_2 = bn_active_dropout(k.layers.Dense(
  product_embedding_size, name='reorder_hidden_2', use_bias=False)(reorder_hidden_1))

reorder_hidden_3 = bn_active_dropout(k.layers.Dense(
  product_embedding_size, name='reorder_hidden_3', use_bias=False)(reorder_hidden_2))

# Flatten the single user embedding into a vector

user_vector = k.layers.Flatten(name='user_vector')(
  user_embedding(user_input))

# Concatenate embeddings and n-hot item IDs, and output

full_input_vector = k.layers.concatenate(
  [order_hidden_3, reorder_hidden_3, user_vector],
  name="full_input_vector")

output = k.layers.Dense(
  max_product_id + 1, activation='sigmoid', name='output')(full_input_vector)

# Compile the model

model = k.models.Model(
  inputs=[order_input, reorder_input, user_input, order_sizes_input, reorder_sizes_input],
  outputs=output)

model.compile(
  optimizer=k.optimizers.SGD(lr=learning_rate, decay=decay_rate), loss='binary_crossentropy')

model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
order_input (InputLayer)         (None, 145)           0                                            
____________________________________________________________________________________________________
reorder_input (InputLayer)       (None, 145)           0                                            
____________________________________________________________________________________________________
product_embedding_in (Embedding) (None, 145, 50)       2484450     order_input[0][0]                
                                                                   reorder_input[0][0]              
____________________________________________________________________________________________________
lambda_3 (Lambda)                (None, 50)            0           product_embedding_in[0][

## Training

In [11]:
batch_size = 1000
num_valid = 5000
assert num_valid % batch_size == 0

index = np.arange(num_rows, dtype=np.uint32)
np.random.shuffle(index)
index_valid = index[:num_valid]
index_train = index[num_valid:]

labels_train =  np.hstack(labels_dataset[index_train])
class_weights = sk.utils.class_weight.compute_class_weight(
  'balanced', np.unique(labels_train), labels_train)
del labels_train

Now we define a couple of helper functions.

`make_training_data` is called once per batch, on a list of indices into the training data. It retrieves the records corresponding to those indices from the underlying datasets, performs some minor adjustments and returns them as Numpy arrays.

`validate` is called once per epoch, i.e. after one whole run through the training data. It calculates some metrics on the validation data -- batch by batch as if it was training data -- and returns them.


`validate` is a bit inefficient as it regenerates the input arrays (via `make_training_data`) every time it's called, even though these don't change and could be cached. This would be easy to implement at the cost of higher memory usage.

In [20]:
def n_hot(item_ids):
  output = np.zeros(max_product_id + 1, dtype=np.uint8)
  output[item_ids] = 1
  return output


def make_training_data(indices):
  
  # Get labels for this batch's instances, and n-hot encode them
  labels_separate = labels_dataset[indices]
  labels_n_hot = [n_hot(order) for order in labels_separate]
  
  # Get the arrays of ordered item IDs and reordered item IDs for each order
  # (these are already zero-padded to the same length)
  orders_separate = orders_dataset[indices]
  reorders_separate = reorders_dataset[indices]
  
  # Get user IDs
  user_ids = users_dataset[indices]
  
  # Stack all the inputs and labels
  orders_data = np.vstack(orders_separate)
  reorders_data = np.vstack(orders_separate)
  labels_data = np.vstack(labels_n_hot)
  
  # Calculate continuous features: currently just order size and reorder size
  # (both normalized by biggest_order_size)
  order_sizes = np.hstack(np.count_nonzero(order) for order in orders_separate)
  reorder_sizes = np.hstack(np.count_nonzero(order) for order in reorders_separate)

  # Now we return a bunch of things:
  #   orders_data is a zero-padded matrix of IDs, batch size x biggest_order_size
  #   reorders_data is a zero-padded matrix of IDs, same size as orders_data
  #   user_ids is a vector of IDs, length = batch size
  #   order_sizes is a vector of ints, length = batch size
  #   reorder_sizes is a vector of ints, length = batch size
  #   labels is a binary matrix, batch size x biggest_order_size
  return (orders_data, reorders_data, user_ids, order_sizes, reorder_sizes, labels_data)


def validate(model):
  
  orders_valid, reorders_valid, users_valid, order_sizes_valid, reorder_sizes_valid, labels_valid = \
    make_training_data(index_valid)
    
  # Loss isn't really comparable to training loss as it doesn't use class weights
  loss = model.test_on_batch({'order_input': orders_valid,
                              'reorder_input': reorders_valid,
                              'user_input': users_valid,
                              'order_sizes_input': order_sizes_valid,
                              'reorder_sizes_input': reorder_sizes_valid},
                             labels_valid)
  
  # Predictions, for calculating P/R/F
  output = model.predict_on_batch({'order_input': orders_valid,
                                   'reorder_input': reorders_valid,
                                   'user_input': users_valid,
                                   'order_sizes_input': order_sizes_valid,
                                   'reorder_sizes_input': reorder_sizes_valid})

  # Arrays of predictions and matches, batch size x num categories
  preds = np.where(output > 0.5, 1, 0)
  matches = preds * labels_valid
  
  epsilon = 1e-12
  precision = matches.sum(axis=1) / (preds.sum(axis=1) + epsilon)
  recall = matches.sum(axis=1) / (labels_valid.sum(axis=1) + epsilon)
  f1 = (2 * precision * recall) / (precision + recall + epsilon)
  
  return (loss, precision.mean(), recall.mean(), f1.mean())

Finally it's time to train the model.

We do this by defining a generator that Keras runs in a separate thread -- this is invoked repeatedly by Keras, and each time, it grabs the next batch of indices and yields the result of calling `make_training_data` on these.

The `epochs` constant configures how many whole passes through the data we do, scoring the model's predictions on the validation set after each one. After the last epoch you can re-run the cell manually -- it will continue training from where it left off, although the epoch number reported will reset to "1" when you restart it. If you want to totally reset the model, rerun the model definition cell above.

### Note about warnings

You may see a warning about a `StopIteration` exception at the end of each epoch. It's safe to ignore this, it's an artefact of how Keras invokes the generator.

You might also see a warning about invalid values in a divide, which occurs if one or more orders have both a precision _and_ recall of 0. It's also fine to ignore this, as the resulting NaNs are converted to zeros before averaging across the validation data.

In [21]:
steps_per_epoch = num_rows // batch_size
epochs = 5

print("Batch size: %d" % batch_size)
print("Training examples: %d" % num_rows)
print("Batches per epoch: %d" % steps_per_epoch)

print("Initial validation: loss = %0.5f, P = %0.5f, R = %0.5f, F = %0.5f"
      % validate(model))

for epoch in range(epochs):
  
  def train_data_generator():
    for chunk in np.array_split(index_train, steps_per_epoch):
      orders_data, reorders_data, user_ids, order_sizes, reorder_sizes, labels = \
        make_training_data(chunk)
      yield ({'order_input': orders_data,
              'reorder_input': reorders_data,
              'user_input': user_ids,
              'order_sizes_input': order_sizes,
              'reorder_sizes_input': reorder_sizes},
             {'output': labels})
    return

  # Train the model on the entire training set
  print("Starting epoch %d" % epoch)
  print("Learning rate: %0.5f" %
        K.eval(model.optimizer.lr * (1. / (1. + model.optimizer.decay * model.optimizer.iterations))))
  
  model.fit_generator(
    train_data_generator(), steps_per_epoch=steps_per_epoch, epochs=1, max_q_size=1,
    class_weight=class_weights)
  
  # Score it against the validation set
  print("Validation: loss = %0.5f, P = %0.5f, R = %0.5f, F = %0.5f"
        % validate(model))
  
  # Shuffle the training data (actually just the indices) for next epoch
  print("Shuffling training data")
  np.random.shuffle(index_train)

Batch size: 1000
Training examples: 2933665
Batches per epoch: 2933
Initial validation: loss = 0.69333, P = 0.00020, R = 0.48789, F = 0.00040
Starting epoch 0
Learning rate: 0.10000
Epoch 1/1

KeyboardInterrupt: 

## TODO

* Add dropout over the input items, as in the DAN paper

"Deep Unordered Composition Rivals Syntactic Methods for Text Classification"

## Remaining work

There are a few outstanding TODOs that would make this training loop more useful on real work:

* Make the generator 'greedier' by using multiple workers and a longer queue, this should speed it up
* See if it's possible to push some of the work of `make_training_data` into TensorFlow on the GPU instead
* Save the model to disk after every epoch
* Terminate training early if validation F-measure stagnates
* Reduce the learning rate manually if the training loss stagnates
* Test on a held-out set from the end of the time period (see notes at top of file)

Also, the process of experimenting with the model structure or constants (size of layers, dropout rate etc.), or the optimizer parameters, is pretty tedious to do by hand.

In real life, you'd want to write a hyperparameter tuning script that automatically generates a bunch of different model variants, trains them, tests them on the same validation data, and reports the results. You can do this in parallel over multiple servers to speed up the process.