## Prepare Product Data

In [21]:
from collections import Counter
import os
import numpy as np
import pandas as pd

In [22]:
def pad_1d(array, max_len):
    """Truncate or zero-pad a sequence to exactly ``max_len`` items.

    Args:
        array: Any iterable of values.
        max_len: Target length of the returned list.

    Returns:
        A ``(padded, length)`` tuple: ``padded`` is a list of ``max_len``
        items (the leading items of ``array`` followed by zeros) and
        ``length`` is the number of real items kept, before padding.
    """
    kept = list(array)[:max_len]
    length = len(kept)
    filler = [0] * (max_len - length)
    return kept + filler, length


def make_word_idx(product_names):
    """Build a word -> integer id vocabulary from product names.

    Names are split on whitespace. Words seen fewer than 10 times share the
    id 0; every other word gets its own id, numbered from 1 in order of
    first appearance.

    Args:
        product_names: Iterable of product-name strings.

    Returns:
        dict mapping each distinct word to its integer id.
    """
    word_counts = Counter(
        word for name in product_names for word in name.split()
    )

    word_idx = {}
    next_id = 1
    for word, count in word_counts.items():
        if count >= 10:
            word_idx[word] = next_id
            next_id += 1
        else:
            # Rare words collapse onto the shared id 0.
            word_idx[word] = 0

    return word_idx


def encode_text(text, word_idx):
    """Encode a text as a space-separated string of word ids.

    Args:
        text: The text to encode. Any falsy value (empty string, 0, None)
            is encoded as ``'0'``.
        word_idx: Mapping from word to integer id.

    Returns:
        The ids of the whitespace-separated words of ``text``, joined by
        single spaces. Words missing from ``word_idx`` are encoded as 0
        instead of raising ``KeyError``.
    """
    if not text:
        return '0'
    # .get(..., 0): an out-of-vocabulary word must not abort the whole encoding.
    return ' '.join(str(word_idx.get(word, 0)) for word in text.split())

In [30]:
# Embedding debugging: load the processed product table.
product_data = pd.read_csv('../../data/processed/product_data.csv')


def _is_text(value):
    """Return True when ``value`` is a ``str``."""
    return isinstance(value, str)


# Remove floats: keep only rows whose product name and order history are
# both strings (the two filters are applied one after the other).
has_name = product_data['product_name'].apply(_is_text)
product_data = product_data.loc[has_name, :]
has_history = product_data['is_ordered_history'].apply(_is_text)
product_data = product_data.loc[has_history, :]

In [31]:
# Check Data Shape
# Printed explicitly: a bare expression that is not the last line of a cell
# is evaluated but never displayed.
print(product_data.shape)
# Check Label type: number of rows for each distinct label value.
label_col = product_data['label']
label_set = set(label_col)
# One vectorised pass instead of a Python-level sum per label.
label_count = label_col.value_counts().to_dict()
print(label_count)

{0: 116178, 1: 12416, -1: 9209}


In [32]:
# Name Embedding
def _lowercase_or_zero(value):
    """Lower-case string names; any non-string value becomes the integer 0."""
    return value.lower() if type(value) == str else 0


product_data['product_name'] = product_data['product_name'].map(_lowercase_or_zero)

# The vocabulary is built from the raw product catalogue, lower-cased the
# same way, and then applied to the names in product_data.
product_df = pd.read_csv('../../data/raw/products.csv')
product_df['product_name'] = product_df['product_name'].map(lambda name: name.lower())

word_idx = make_word_idx(product_df['product_name'].tolist())
product_data['product_name_encoded'] = product_data['product_name'].map(
    lambda name: encode_text(name, word_idx)
)

In [33]:
# Check the property of list product_name: how many values of each Python
# type the column holds.
mixlist = product_data['product_name']
mixlist_type = [type(s) for s in mixlist]
# Tally every type in a single pass, then report them in set order.
type_tally = Counter(mixlist_type)
mixlist_type_dic = {t: type_tally[t] for t in set(mixlist_type)}
print(mixlist_type_dic)

{<class 'str'>: 137803}


In [34]:
num_rows = len(product_data)

# Shapes shared by the arrays below.
per_row = (num_rows,)         # one scalar per table row
per_step = (num_rows, 100)    # histories are padded/truncated to 100 steps
per_token = (num_rows, 30)    # product names are padded/truncated to 30 tokens

# Scalar columns.
user_id = np.zeros(per_row, dtype=np.int32)
product_id = np.zeros(per_row, dtype=np.int32)
aisle_id = np.zeros(per_row, dtype=np.int16)
department_id = np.zeros(per_row, dtype=np.int8)
eval_set = np.zeros(per_row, dtype='S5')
label = np.zeros(per_row, dtype=np.int8)

# Per-step history columns.
# NOTE(review): int8 only holds -128..127 - confirm that every history
# feature stays inside that range.
is_ordered_history = np.zeros(per_step, dtype=np.int8)
index_in_order_history = np.zeros(per_step, dtype=np.int8)
order_dow_history = np.zeros(per_step, dtype=np.int8)
order_hour_history = np.zeros(per_step, dtype=np.int8)
days_since_prior_order_history = np.zeros(per_step, dtype=np.int8)
order_size_history = np.zeros(per_step, dtype=np.int8)
reorder_size_history = np.zeros(per_step, dtype=np.int8)
order_number_history = np.zeros(per_step, dtype=np.int8)

# Encoded product-name tokens and the unpadded lengths.
product_name = np.zeros(per_token, dtype=np.int32)
product_name_length = np.zeros(per_row, dtype=np.int8)
history_length = np.zeros(per_row, dtype=np.int8)

In [35]:
# Check the length of lists: the first entry of each printed shape should
# match, i.e. one pre-allocated array slot per table row.
print(product_data.shape, user_id.shape)

(137803, 16) (137803,)


In [36]:
def _parse_padded(text, max_len):
    """Parse a space-separated string of integers and pad/truncate it with pad_1d."""
    return pad_1d([int(token) for token in text.split()], max_len)


# Fill the pre-allocated arrays, one table row per array slot.
# iterrows() yields the DataFrame *index label*, which has gaps once rows have
# been filtered out, so it must not be used as an array position. enumerate()
# supplies the 0-based position instead; every row is written to its own slot
# and no "over indexing" guard is needed.
for i, (_, row) in enumerate(product_data.iterrows()):
    # Progress report every 10000 rows.
    if i % 10000 == 0:
        print(i, num_rows)

    user_id[i] = row['user_id']
    product_id[i] = row['product_id']
    aisle_id[i] = row['aisle_id']
    department_id[i] = row['department_id']
    eval_set[i] = row['eval_set']
    label[i] = row['label']

    # Histories are stored as space-separated integer strings; only the
    # unpadded length of is_ordered_history is kept as history_length.
    is_ordered_history[i, :], history_length[i] = _parse_padded(row['is_ordered_history'], 100)
    index_in_order_history[i, :], _ = _parse_padded(row['index_in_order_history'], 100)
    order_dow_history[i, :], _ = _parse_padded(row['order_dow_history'], 100)
    order_hour_history[i, :], _ = _parse_padded(row['order_hour_history'], 100)
    days_since_prior_order_history[i, :], _ = _parse_padded(row['days_since_prior_order_history'], 100)
    order_size_history[i, :], _ = _parse_padded(row['order_size_history'], 100)
    reorder_size_history[i, :], _ = _parse_padded(row['reorder_size_history'], 100)
    order_number_history[i, :], _ = _parse_padded(row['order_number_history'], 100)
    product_name[i, :], product_name_length[i] = _parse_padded(row['product_name_encoded'], 30)


10000 137803
20000 137803
30000 137803
40000 137803
50000 137803
60000 137803
70000 137803
80000 137803
90000 137803
100000 137803
110000 137803
120000 137803
130000 137803


In [37]:
# Length check: print the shapes of one scalar array and two history arrays
# after the fill loop.
print(user_id.shape, is_ordered_history.shape, order_dow_history.shape)

(137803,) (137803, 100) (137803, 100)


In [38]:
# Create the output directory; exist_ok avoids the check-then-create race and
# makes the cell safe to re-run.
os.makedirs('data', exist_ok=True)

# Every array is written to data/<name>.npy.
arrays_to_save = {
    'user_id': user_id,
    'product_id': product_id,
    'aisle_id': aisle_id,
    'department_id': department_id,
    'eval_set': eval_set,
    'label': label,
    'is_ordered_history': is_ordered_history,
    'index_in_order_history': index_in_order_history,
    'order_dow_history': order_dow_history,
    'order_hour_history': order_hour_history,
    'days_since_prior_order_history': days_since_prior_order_history,
    'order_size_history': order_size_history,
    'reorder_size_history': reorder_size_history,
    'order_number_history': order_number_history,
    'product_name': product_name,
    'product_name_length': product_name_length,
    'history_length': history_length,
}
for array_name, array_values in arrays_to_save.items():
    np.save('data/{}.npy'.format(array_name), array_values)

# Modeling

In [39]:
import os
import numpy as np
import sys

In [40]:
import tensorflow as tf

print(tf.__version__)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


1.3.0


In [41]:
# Project-local helper modules: the parent of the current working directory is added to sys.path so they can be imported
sys.path.append(os.path.join(os.getcwd(), '..'))
from data_frame import DataFrame
from tf_utils import lstm_layer, time_distributed_dense_layer, dense_layer, sequence_log_loss, wavenet
from tf_base_model import TFBaseModel

In [42]:
# `reload` was a builtin in Python 2; on Python 3 it has to be imported from importlib
from importlib import reload

In [43]:
class DataReader(object):
    """Loads the saved ``.npy`` columns and serves train/validation/test batches.

    The full dataset is kept as ``test_df``; ``train_df`` and ``val_df`` are a
    90/10 split of that same data.
    """

    def __init__(self, data_dir):
        """Load every column from ``data_dir`` and split it into train/validation.

        Args:
            data_dir: Directory holding one ``<column>.npy`` file per column.
        """
        data_cols = [
            'user_id', 'product_id', 'aisle_id', 'department_id',
            'is_ordered_history', 'index_in_order_history',
            'order_dow_history', 'order_hour_history',
            'days_since_prior_order_history', 'order_size_history',
            'reorder_size_history', 'order_number_history',
            'history_length', 'product_name', 'product_name_length',
            'eval_set', 'label',
        ]

        # Memory-map each column (mmap_mode='r') instead of reading it eagerly.
        data = []
        for col in data_cols:
            npy_path = os.path.join(data_dir, '{}.npy'.format(col))
            data.append(np.load(npy_path, mmap_mode='r'))
        self.test_df = DataFrame(columns=data_cols, data=data)

        print(self.test_df.shapes())
        print("loaded data")

        # Split the data into training and validation sets
        split = self.test_df.train_test_split(train_size=0.9)
        self.train_df, self.val_df = split

        # Output set information
        print('train size', len(self.train_df))
        print('validation size', len(self.val_df))
        print('test size', len(self.test_df))

    def train_batch_generator(self, batch_size):
        """Return a shuffled, many-epoch batch generator over the training split."""
        return self.batch_generator(
            batch_size=batch_size, df=self.train_df,
            shuffle=True, num_epochs=10000, is_test=False,
        )

    def val_batch_generator(self, batch_size):
        """Return a shuffled, many-epoch batch generator over the validation split."""
        return self.batch_generator(
            batch_size=batch_size, df=self.val_df,
            shuffle=True, num_epochs=10000, is_test=False,
        )

    def test_batch_generator(self, batch_size):
        """Return a single-pass, unshuffled batch generator over the full dataset."""
        return self.batch_generator(
            batch_size=batch_size, df=self.test_df,
            shuffle=False, num_epochs=1, is_test=True,
        )

    def batch_generator(self, batch_size, df, shuffle=True, num_epochs=10000, is_test=False):
        """Yield batches from ``df`` with the derived model inputs added.

        Args:
            batch_size: Number of rows per batch.
            df: Frame-like object providing ``batch_generator``.
            shuffle: Forwarded to ``df.batch_generator``.
            num_epochs: Forwarded to ``df.batch_generator``.
            is_test: When True a smaller final batch is allowed and
                ``history_length`` is left untouched; otherwise it is
                reduced by one.

        Yields:
            Each batch with four order-context columns shifted one step
            left, plus the added ``next_is_ordered`` and ``is_none`` entries.
        """
        # Columns rolled by -1 along axis 1, so position t holds the value
        # that was at t + 1 (the first value wraps around to the end).
        shifted_cols = (
            'order_dow_history',
            'order_hour_history',
            'days_since_prior_order_history',
            'order_number_history',
        )
        raw_batches = df.batch_generator(
            batch_size,
            shuffle=shuffle,
            num_epochs=num_epochs,
            allow_smaller_final_batch=is_test,
        )
        for batch in raw_batches:
            for col in shifted_cols:
                batch[col] = np.roll(batch[col], -1, axis=1)
            # The target sequence is the ordered flag shifted the same way.
            batch['next_is_ordered'] = np.roll(batch['is_ordered_history'], -1, axis=1)
            batch['is_none'] = batch['product_id'] == 0
            if not is_test:
                batch['history_length'] = batch['history_length'] - 1
            yield batch

In [44]:
class rnn(TFBaseModel):
    """Sequence model predicting, at every history step, whether the product is ordered next.

    The per-step input features are passed through an LSTM layer and through
    the ``wavenet`` helper; both outputs are concatenated with the raw input
    and fed to two time-distributed dense layers ending in a sigmoid.
    """

    def __init__(self, lstm_size, dilations, filter_widths, skip_channels, residual_channels, **kwargs):
        """Store the architecture hyperparameters and initialise the base model.

        Args:
            lstm_size: Passed to ``lstm_layer``; also used as the width of the
                product and user embeddings.
            dilations: Passed to ``wavenet``.
            filter_widths: Passed to ``wavenet``.
            skip_channels: Passed to ``wavenet``.
            residual_channels: Passed to ``wavenet``.
            **kwargs: Forwarded unchanged to ``TFBaseModel.__init__``.
        """
        # The attributes are assigned before the base constructor runs.
        # NOTE(review): presumably TFBaseModel.__init__ builds the graph and
        # needs them already set - confirm in tf_base_model.
        self.lstm_size = lstm_size
        self.dilations = dilations
        self.filter_widths = filter_widths
        self.skip_channels = skip_channels
        self.residual_channels = residual_channels
        super(rnn, self).__init__(**kwargs)

    def calculate_loss(self):
        """Build the inputs and outputs and return the sequence log loss tensor."""
        x = self.get_input_sequences()
        preds = self.calculate_outputs(x)
        # 100 matches the padded history length used by the placeholders.
        # NOTE(review): presumably the max sequence length argument - confirm in tf_utils.
        loss = sequence_log_loss(self.next_is_ordered, preds, self.history_length, 100)
        return loss

    def get_input_sequences(self):
        """Create the input placeholders and assemble the per-step feature tensor.

        Returns:
            The concatenation, along axis 2, of the history features, the
            tiled product features and the tiled user embedding.
        """
        # One scalar per example.
        self.user_id = tf.placeholder(tf.int32, [None])
        self.product_id = tf.placeholder(tf.int32, [None])
        self.aisle_id = tf.placeholder(tf.int32, [None])
        self.department_id = tf.placeholder(tf.int32, [None])
        self.is_none = tf.placeholder(tf.int32, [None])
        self.history_length = tf.placeholder(tf.int32, [None])

        # Sequences of 100 history steps (and 30 product-name tokens) per example.
        self.is_ordered_history = tf.placeholder(tf.int32, [None, 100])
        self.index_in_order_history = tf.placeholder(tf.int32, [None, 100])
        self.order_dow_history = tf.placeholder(tf.int32, [None, 100])
        self.order_hour_history = tf.placeholder(tf.int32, [None, 100])
        self.days_since_prior_order_history = tf.placeholder(tf.int32, [None, 100])
        self.order_size_history = tf.placeholder(tf.int32, [None, 100])
        self.reorder_size_history = tf.placeholder(tf.int32, [None, 100])
        self.order_number_history = tf.placeholder(tf.int32, [None, 100])
        self.product_name = tf.placeholder(tf.int32, [None, 30])
        self.product_name_length = tf.placeholder(tf.int32, [None])
        self.next_is_ordered = tf.placeholder(tf.int32, [None, 100])

        # Not used in this class.
        # NOTE(review): presumably consumed by TFBaseModel / tf_utils - confirm.
        self.keep_prob = tf.placeholder(tf.float32)
        self.is_training = tf.placeholder(tf.bool)

        # product data
        product_embeddings = tf.get_variable(
            name='product_embeddings',
            shape=[50000, self.lstm_size],
            dtype=tf.float32
        )
        aisle_embeddings = tf.get_variable(
            name='aisle_embeddings',
            shape=[250, 50],
            dtype=tf.float32
        )
        department_embeddings = tf.get_variable(
            name='department_embeddings',
            shape=[50, 10],
            dtype=tf.float32
        )
        # One-hot encode each name token (depth 2532), take the max over the
        # 30 token positions to get one multi-hot vector per example, then
        # project it to 100 units.
        # NOTE(review): 2532 presumably has to cover every word id produced
        # during data preparation - confirm.
        product_names = tf.one_hot(self.product_name, 2532)
        product_names = tf.reduce_max(product_names, 1)
        product_names = dense_layer(product_names, 100, activation=tf.nn.relu)

        is_none = tf.cast(tf.expand_dims(self.is_none, 1), tf.float32)

        x_product = tf.concat([
            tf.nn.embedding_lookup(product_embeddings, self.product_id),
            tf.nn.embedding_lookup(aisle_embeddings, self.aisle_id),
            tf.nn.embedding_lookup(department_embeddings, self.department_id),
            is_none,
            product_names
        ], axis=1)
        # Repeat the static product features at each of the 100 steps.
        x_product = tf.tile(tf.expand_dims(x_product, 1), (1, 100, 1))

        # user data
        user_embeddings = tf.get_variable(
            name='user_embeddings',
            shape=[207000, self.lstm_size],
            dtype=tf.float32
        )
        x_user = tf.nn.embedding_lookup(user_embeddings, self.user_id)
        # Repeat the user embedding at each of the 100 steps.
        x_user = tf.tile(tf.expand_dims(x_user, 1), (1, 100, 1))

        # sequence data
        # One-hot encoding of every history feature.
        is_ordered_history = tf.one_hot(self.is_ordered_history, 2)
        index_in_order_history = tf.one_hot(self.index_in_order_history, 20)
        order_dow_history = tf.one_hot(self.order_dow_history, 8)
        order_hour_history = tf.one_hot(self.order_hour_history, 25)
        days_since_prior_order_history = tf.one_hot(self.days_since_prior_order_history, 31)
        order_size_history = tf.one_hot(self.order_size_history, 60)
        reorder_size_history = tf.one_hot(self.reorder_size_history, 50)
        order_number_history = tf.one_hot(self.order_number_history, 101)

        # The same features as scaled floats, each with a trailing axis of size 1.
        index_in_order_history_scalar = tf.expand_dims(tf.cast(self.index_in_order_history, tf.float32) / 20.0, 2)
        order_dow_history_scalar = tf.expand_dims(tf.cast(self.order_dow_history, tf.float32) / 8.0, 2)
        order_hour_history_scalar = tf.expand_dims(tf.cast(self.order_hour_history, tf.float32) / 25.0, 2)
        days_since_prior_order_history_scalar = tf.expand_dims(tf.cast(self.days_since_prior_order_history, tf.float32) / 31.0, 2)
        order_size_history_scalar = tf.expand_dims(tf.cast(self.order_size_history, tf.float32) / 60.0, 2)
        reorder_size_history_scalar = tf.expand_dims(tf.cast(self.reorder_size_history, tf.float32) / 50.0, 2)
        order_number_history_scalar = tf.expand_dims(tf.cast(self.order_number_history, tf.float32) / 100.0, 2)

        x_history = tf.concat([
            is_ordered_history,
            index_in_order_history,
            order_dow_history,
            order_hour_history,
            days_since_prior_order_history,
            order_size_history,
            reorder_size_history,
            order_number_history,
            index_in_order_history_scalar,
            order_dow_history_scalar,
            order_hour_history_scalar,
            days_since_prior_order_history_scalar,
            order_size_history_scalar,
            reorder_size_history_scalar,
            order_number_history_scalar,
        ], axis=2)

        x = tf.concat([x_history, x_product, x_user], axis=2)

        return x

    def calculate_outputs(self, x):
        """Build the prediction head on top of the input features.

        Args:
            x: The feature tensor returned by ``get_input_sequences``.

        Returns:
            ``y_hat``: the sigmoid output for every example and step, with
            the trailing size-1 axis squeezed away.

        Also sets ``h_final``, ``final_states``, ``final_predictions`` and
        ``prediction_tensors`` on the instance.
        """
        h = lstm_layer(x, self.history_length, self.lstm_size)
        c = wavenet(x, self.dilations, self.filter_widths, self.skip_channels, self.residual_channels)
        # Both branch outputs are combined with the raw input features.
        h = tf.concat([h, c, x], axis=2)

        self.h_final = time_distributed_dense_layer(h, 50, activation=tf.nn.relu, scope='dense-1')
        y_hat = time_distributed_dense_layer(self.h_final, 1, activation=tf.nn.sigmoid, scope='dense-2')
        y_hat = tf.squeeze(y_hat, 2)

        # (row, step) index pairs selecting step history_length - 1 of each
        # example, clamped at 0 so a zero-length history still yields a valid index.
        final_temporal_idx = tf.stack([tf.range(tf.shape(self.history_length)[0]), tf.maximum(self.history_length - 1, 0)], axis=1)
        self.final_states = tf.gather_nd(self.h_final, final_temporal_idx)
        self.final_predictions = tf.gather_nd(y_hat, final_temporal_idx)

        # NOTE(review): presumably the tensors TFBaseModel.predict() evaluates and saves - confirm.
        self.prediction_tensors = {
            'user_ids': self.user_id,
            'product_ids': self.product_id,
            'final_states': self.final_states,
            'predictions': self.final_predictions
        }

        return y_hat


In [None]:
# All inputs and outputs of this run live under the notebook's working directory.
base_dir = './'

# Reads the .npy arrays written by the data-preparation section above.
dr = DataReader(data_dir=os.path.join(base_dir, 'data'))

# Architecture arguments (lstm_size, dilations, filter_widths, skip_channels,
# residual_channels) are consumed by rnn itself; every other keyword is
# forwarded to TFBaseModel.
nn = rnn(
    reader=dr,
    log_dir=os.path.join(base_dir, 'logs'),
    checkpoint_dir=os.path.join(base_dir, 'checkpoints'),
    prediction_dir=os.path.join(base_dir, 'predictions'),
    optimizer='adam',
    learning_rate=.001,
    lstm_size=100,
    # Evaluates to [1, 2, 4, 8, 16, 32].
    dilations=[2**i for i in range(6)],
    # One filter width per dilation.
    filter_widths=[2]*6,
    skip_channels=64,
    residual_channels=128,
    batch_size=128,
    num_training_steps=2000,
    early_stopping_steps=100,
    warm_start_init_step=0,
    regularization_constant=0.0,
    keep_prob=0.5,
    enable_parameter_averaging=False,
    num_restarts=2,
    min_steps_to_checkpoint=200,
    log_interval=20,
    num_validation_batches=4,
)
nn.fit() # Train the model
# Training finished, start prediction.
# NOTE(review): restore() presumably reloads the saved checkpoint before predict() - confirm in tf_base_model.
nn.restore()
nn.predict()

user_id                               (137803,)
product_id                            (137803,)
aisle_id                              (137803,)
department_id                         (137803,)
is_ordered_history                (137803, 100)
index_in_order_history            (137803, 100)
order_dow_history                 (137803, 100)
order_hour_history                (137803, 100)
days_since_prior_order_history    (137803, 100)
order_size_history                (137803, 100)
reorder_size_history              (137803, 100)
order_number_history              (137803, 100)
history_length                        (137803,)
product_name                       (137803, 30)
product_name_length                   (137803,)
eval_set                              (137803,)
label                                 (137803,)
dtype: object
loaded data



new run with parameters:
{'batch_size': 128,
 'checkpoint_dir': './checkpoints',
 'dilations': [1, 2, 4, 8, 16, 32],
 'early_stopping_steps': 100,
 'enable_parameter_averaging': False,
 'filter_widths': [2, 2, 2, 2, 2, 2],
 'grad_clip': 5,
 'keep_prob_scalar': 0.5,
 'learning_rate': 0.001,
 'log_dir': './logs',
 'log_interval': 20,
 'loss_averaging_window': 100,
 'lstm_size': 100,
 'min_steps_to_checkpoint': 200,
 'num_restarts': 2,
 'num_training_steps': 2000,
 'num_validation_batches': 4,
 'optimizer': 'adam',
 'prediction_dir': './predictions',
 'reader': <__main__.DataReader object at 0x2add840784a8>,
 'regularization_constant': 0.0,
 'residual_channels': 128,
 'skip_channels': 64,
 'warm_start_init_step': 0}


train size 124022
validation size 13781
test size 137803


all parameters:
[('product_embeddings:0', [50000, 100]),
 ('aisle_embeddings:0', [250, 50]),
 ('department_embeddings:0', [50, 10]),
 ('dense-layer/weights:0', [2532, 100]),
 ('dense-layer/biases:0', [100]),
 ('user_embeddings:0', [207000, 100]),
 ('lstm-layer/rnn/lstm_cell/kernel:0', [765, 400]),
 ('lstm-layer/rnn/lstm_cell/bias:0', [400]),
 ('wavenet/x-proj/weights:0', [665, 128]),
 ('wavenet/x-proj/biases:0', [128]),
 ('wavenet/cnn-0/weights:0', [2, 128, 256]),
 ('wavenet/cnn-0/biases:0', [256]),
 ('wavenet/cnn-0-proj/weights:0', [128, 192]),
 ('wavenet/cnn-0-proj/biases:0', [192]),
 ('wavenet/cnn-1/weights:0', [2, 128, 256]),
 ('wavenet/cnn-1/biases:0', [256]),
 ('wavenet/cnn-1-proj/weights:0', [128, 192]),
 ('wavenet/cnn-1-proj/biases:0', [192]),
 ('wavenet/cnn-2/weights:0', [2, 128, 256]),
 ('wavenet/cnn-2/biases:0', [256]),
 ('wavenet/cnn-2-proj/weights:0', [128, 192]),
 ('wavenet/cnn-2-proj/biases:0', [192]),
 ('wavenet/cnn-3/weights:0', [2, 128, 256]),
 ('wavenet/cnn-3/biases:

built graph


[[step        0]]     [[train]]     loss: 0.50389457       [[val]]     loss: 0.50544643       
[[step       20]]     [[train]]     loss: 0.0440548        [[val]]     loss: 0.05221456       
[[step       40]]     [[train]]     loss: 0.03051687       [[val]]     loss: 0.03462561       
[[step       60]]     [[train]]     loss: 0.02406596       [[val]]     loss: 0.02746421       
[[step       80]]     [[train]]     loss: 0.02041885       [[val]]     loss: 0.02338304       
[[step      100]]     [[train]]     loss: 0.01258761       [[val]]     loss: 0.01535527       
[[step      120]]     [[train]]     loss: 0.00890037       [[val]]     loss: 0.01006605       
[[step      140]]     [[train]]     loss: 0.00578102       [[val]]     loss: 0.00698902       
[[step      160]]     [[train]]     loss: 0.00362739       [[val]]     loss: 0.00444908       
