# Reorder size prediction

From a given order, how well can we predict the number of items in it which will be reordered next time?

In [3]:
import numpy as np
import pandas as pd
import h5py
import shutil
import os
import time
from itertools import *
from collections import defaultdict
import keras as k
import keras.backend as K
import tensorflow as tf

csv_dir = '../csv/'
h5_dir = 'h5/'

path = os.path.join

## CSV parsing

First read the CSV files into Pandas. See the competition website for descriptions of these.

In [4]:
order_products_prior = pd.read_csv(path(csv_dir, 'order_products__prior.csv'), engine='c',
                                   dtype={'order_id':np.int32, 
                                          'product_id':np.int32, 
                                          'add_to_cart_order':np.int8, 
                                          'reordered':np.int8})

order_products_train = pd.read_csv(path(csv_dir, 'order_products__train.csv'), engine='c',
                                   dtype={'order_id':np.int32, 
                                          'product_id':np.int32, 
                                          'add_to_cart_order':np.int8, 
                                          'reordered':np.int8})

orders = pd.read_csv(path(csv_dir, 'orders.csv'), engine='c',
                     dtype={'order_id':np.int32,
                            'user_id':np.int32,
                            'order_number':np.int8,
                            'order_dow':np.int8,
                            'order_hour_of_day':np.int8
                           })

products = pd.read_csv(path(csv_dir, 'products.csv'), engine='c',
                       dtype={'product_id':np.int32,
                              'aisle_id':np.int8,
                              'department_id':np.int8
                             })

## Data preprocessing

Now we need to process the data to:

* Count the number of users and products, and the size of the biggest order
 * The model needs to know how big to make its inputs and embedding tables
* Filter out each user's first order, for now
 * None of the items in those are reorders (by definition) so they make things harder for the model
 * NB this means we can't make predictions based on just one order, which is a tradeoff worth revisiting later
* Retrieve pairs of consecutive order IDs for the same user
 * We're trying to predict the **number of** reorders in the second from the contents of the first

In [5]:
max_product_id = max(products.product_id)
max_product_id

49688

In [6]:
max_user_id = max(orders.user_id)
max_user_id

206209

`order_products` tells us what products were present in each order, and which of those were reorders. It also contains an `add_to_cart_order` column that we're ignoring for this project.

In [7]:
order_products = pd.concat([order_products_train, order_products_prior], axis=0
                          )[['order_id', 'product_id', 'add_to_cart_order']]
order_products.head()

Unnamed: 0,order_id,product_id,add_to_cart_order
0,1,49302,1
1,1,11109,2
2,1,10246,3
3,1,49683,4
4,1,43633,5


Now we need to generate the order pairs for each user. This will also take a few minutes.

Some of the later orders will be in the Kaggle test set, which we're not using for this project (as our objective is slightly different from the Kaggle contest). So we filter down to only ones that we have data for.

In [8]:
# From the itertools docs -- get consecutive pairs from a sequence
def pairwise(iterable):
  "s -> (s0, s1), (s1, s2), (s2, s3), ..."
  a, b = tee(iterable)
  next(b, None)
  return izip(a, b)

def make_x_y(order_ids):
  pairs = list(pairwise(order_ids))
  return pd.DataFrame.from_records(pairs, columns=['prev', 'next'])

order_pairs = orders.sort_values(['user_id', 'order_number']).groupby(['user_id']).order_id.apply(make_x_y)
order_pairs.reset_index(inplace=True)
del order_pairs['level_1']
order_pairs.set_index('prev', inplace=True)
order_pairs.head()

Unnamed: 0_level_0,user_id,next
prev,Unnamed: 1_level_1,Unnamed: 2_level_1
2539329,1,2398795
2398795,1,473747
473747,1,2254736
2254736,1,431534
431534,1,3367565


Training data is ordered, zero-padded product IDs. And also user IDs and size for each order (as a fraction of biggest order size).

Labels is the fraction of IDs from *that* order that are present in the next one.

In [9]:
products_per_order = order_products.sort_values(
  by=['order_id', 'add_to_cart_order']).groupby(['order_id'], sort=False, group_keys=False).product_id.apply(list)

products_per_order.head()

order_id
1    [49302, 11109, 10246, 49683, 43633, 13176, 472...
2    [33120, 28985, 9327, 45918, 30035, 17794, 4014...
3    [33754, 24838, 17704, 21903, 17668, 46667, 174...
4    [46842, 26434, 39758, 27761, 10054, 21351, 225...
5    [13176, 15005, 47329, 27966, 23909, 48370, 132...
Name: product_id, dtype: object

In [10]:
num_rows = len(order_pairs)
biggest_order_size = max(products_per_order.apply(len))
(num_rows, biggest_order_size)

(3214874, 145)

In [11]:
order_matrix = np.zeros((num_rows, biggest_order_size), dtype=np.uint16)
#order_sizes = np.zeros(num_rows, dtype=np.float32)
#user_ids = np.zeros(num_rows, dtype=np.uint32)
labels = np.zeros(num_rows, dtype=np.float32)

row_idx = 0
for order_id in order_pairs.index:
  product_ids = products_per_order.loc[order_id]
  order_matrix[row_idx, :len(product_ids)] = product_ids
  #user_ids[row_idx] = order_pairs.loc[order_id].user_id
  next_order_id = order_pairs.loc[order_id].next
  if next_order_id in products_per_order.index:
    next_product_ids = products_per_order.loc[next_order_id] 
  else:
    next_product_ids = []
  count = len(product_ids)
  #order_sizes[row_idx] = float(count) / biggest_order_size
  next_count = len(set(product_ids).intersection(next_product_ids))
  labels[row_idx] = float(next_count) / count
  row_idx += 1

## Cleanup

In [12]:
del products_per_order
del order_pairs
del order_products
del products
del orders
del order_products_train
del order_products_prior

## Network helpers

In [13]:
class ZeroMaskedEntries(k.engine.topology.Layer):
    """
    This layer is called after an Embedding layer.
    It zeros out all of the masked-out embeddings.
    It also swallows the mask without passing it on.
    You can change this to default pass-on behavior as follows:

    def compute_mask(self, x, mask=None):
        if not self.mask_zero:
            return None
        else:
            return K.not_equal(x, 0)
    """

    def __init__(self, **kwargs):
        self.support_mask = True
        super(ZeroMaskedEntries, self).__init__(**kwargs)

    def build(self, input_shape):
        self.output_dim = input_shape[1]
        self.repeat_dim = input_shape[2]

    def call(self, x, mask=None):
        mask = K.cast(mask, 'float32')
        mask = K.repeat(mask, self.repeat_dim)
        mask = K.permute_dimensions(mask, (0, 2, 1))
        return x * mask

    def compute_mask(self, input_shape, input_mask=None):
        return None


def mask_aware_mean(x):
    # recreate the masks - all zero rows have been masked
    mask = K.not_equal(K.sum(K.abs(x), axis=2, keepdims=True), 0)

    # number of that rows are not all zeros
    n = K.sum(K.cast(mask, 'float32'), axis=1, keepdims=False)
    
    # compute mask-aware mean of x, or all zeroes if no rows present
    x_mean = K.sum(x, axis=1, keepdims=False) / n
    x_mean = tf.where(tf.is_nan(x_mean), tf.zeros_like(x_mean), x_mean)
    x_mean = tf.check_numerics(x_mean, 'fucksticks')

    return x_mean


def mask_aware_mean_output_shape(input_shape):
    shape = list(input_shape)
    assert len(shape) == 3 
    return (shape[0], shape[2])

## Constants for both models

In [19]:
batch_size = 100

# Layer size constants
#user_embedding_size = 50
product_embedding_size = 50

# TODO Dropout rate
dropout = 0.1

## DAN model

In [17]:
# Inputs

#user_input = k.layers.Input(shape=(1,), name='user_input')

dan_order_input = k.layers.Input(shape=(biggest_order_size,), name='order_input')

#order_size_input = k.layers.Input(shape=(1,), name='order_size_input')

# Embeddings

dan_product_embedding = k.layers.Embedding(
  input_dim=max_product_id + 1,
  output_dim=product_embedding_size,
  mask_zero=True,
  name='product_embedding')

#user_embedding = k.layers.Embedding(
#  input_dim=max_user_id + 1, output_dim=user_embedding_size, name='user_embedding')

dan_order_embeddings = dan_product_embedding(dan_order_input)

# The DAN itself

dan_order_vector = k.layers.Lambda(
  mask_aware_mean, mask_aware_mean_output_shape, name='mean')(dan_order_embeddings)

dan_order_hidden_1 = k.layers.Dense(
  product_embedding_size, activation='relu', name='order_hidden_1')(dan_order_vector)

dan_order_hidden_2 = k.layers.Dense(
  product_embedding_size, activation='relu', name='order_hidden_2')(dan_order_hidden_1)

# Output -- no activation

dan_output = k.layers.Dense(1, name='output')(dan_order_hidden_2)

# Compile the model

dan_model = k.models.Model(
  inputs=[dan_order_input],
  outputs=dan_output)

dan_model.compile(optimizer='adam', loss='mse')

dan_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
order_input (InputLayer)     (None, 145)               0         
_________________________________________________________________
product_embedding (Embedding (None, 145, 50)           2484450   
_________________________________________________________________
mean (Lambda)                (None, 50)                0         
_________________________________________________________________
order_hidden_1 (Dense)       (None, 50)                2550      
_________________________________________________________________
order_hidden_2 (Dense)       (None, 50)                2550      
_________________________________________________________________
output (Dense)               (None, 1)                 51        
Total params: 2,489,601
Trainable params: 2,489,601
Non-trainable params: 0
_________________________________________________________________


In [20]:
start = time.time()

dan_model.fit(order_matrix, labels,
              batch_size=batch_size,
              validation_split=0.01)

duration = time.time() - start
print('Finished training in %d secs' % duration)

Train on 3182725 samples, validate on 32149 samples
Epoch 1/1
Finished training in 277 secs


## GRU model

In [21]:
# Inputs

gru_order_input = k.layers.Input(shape=(biggest_order_size,), name='order_input')

# Embeddings

gru_product_embedding = k.layers.Embedding(
  input_dim=max_product_id + 1,
  output_dim=product_embedding_size,
  mask_zero=True,
  name='product_embedding')

gru_order_embeddings = gru_product_embedding(gru_order_input)

# The GRU itself

order_gru = k.layers.GRU(
  product_embedding_size, name='order_gru')(gru_order_embeddings)

# Output -- no activation

gru_output = k.layers.Dense(1, name='output')(order_gru)

# Compile the model

gru_model = k.models.Model(
  inputs=[gru_order_input],
  outputs=gru_output)

gru_model.compile(optimizer='adam', loss='mse')

gru_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
order_input (InputLayer)     (None, 145)               0         
_________________________________________________________________
product_embedding (Embedding (None, 145, 50)           2484450   
_________________________________________________________________
order_gru (GRU)              (None, 50)                15150     
_________________________________________________________________
output (Dense)               (None, 1)                 51        
Total params: 2,499,651
Trainable params: 2,499,651
Non-trainable params: 0
_________________________________________________________________


In [22]:
start = time.time()

gru_model.fit(order_matrix, labels,
              batch_size=batch_size,
              validation_split=0.01)

duration = time.time() - start
print('Finished training in %d secs' % duration)

Train on 3182725 samples, validate on 32149 samples
Epoch 1/1
Finished training in 19375 secs


## Junk

In [None]:
def shuffle(orders_in, sizes_in, users_in, labels_in):
  n = len(orders_in)
  assert len(sizes_in) == n
  assert len(users_in) == n
  assert len(labels_in) == n
  p = np.random.permutation(n)
  return orders_in[p], sizes_in[p], users_in[p], labels_in[p]

In [None]:
batch_size = 100
num_valid = 5000

index = np.arange(num_rows, dtype=np.uint32)
np.random.shuffle(index)
index_valid = index[:num_valid]
index_train = index[num_valid:]

order_matrix_valid = order_matrix[index_valid, :]
order_sizes_valid = order_sizes[index_valid]
user_ids_valid = user_ids[index_valid]
labels_valid = labels[index_valid]