## Prepare Product Data

In [1]:
from collections import Counter
import os
import numpy as np
import pandas as pd

In [2]:
def pad_1d(array, max_len):
    array = list(array)[:max_len]
    length = len(array)
    padded = array + [0]*(max_len - len(array))
    return padded, length


def make_word_idx(product_names):
    words = [word for name in product_names for word in name.split()]
    word_counts = Counter(words)

    max_id = 1
    word_idx = {}
    for word, count in word_counts.items():
        if count < 10:
            word_idx[word] = 0
        else:
            word_idx[word] = max_id
            max_id += 1

    return word_idx


def encode_text(text, word_idx):
    return ' '.join([str(word_idx[i]) for i in text.split()]) if text else '0'

In [3]:
# Embedding debugging
product_data = pd.read_csv('../../data/processed/product_data.csv')
# Remove floats
product_data = product_data.loc[product_data['product_name'].apply(lambda x: isinstance(x, str)),:]
product_data = product_data.loc[product_data['is_ordered_history'].apply(lambda x: isinstance(x, str)),:]

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Check Data Shape
product_data.shape
# Check Label type
label_col = product_data['label']
label_set = set(label_col)
label_count = {i: sum(label_col == i) for i in label_set}
print(label_count)

{0: 119128, 1: 13286, -1: 272304}


In [5]:
# Name Embedding
product_data['product_name'] = product_data['product_name'].map(lambda x: x.lower() if type(x)==str else 0)

product_df = pd.read_csv('../../data/raw/products.csv')
product_df['product_name'] = product_df['product_name'].map(lambda x: x.lower())

word_idx = make_word_idx(product_df['product_name'].tolist())
product_data['product_name_encoded'] = product_data['product_name'].map(lambda x: encode_text(x, word_idx))

In [6]:
# Check the property of list product_name
mixlist = product_data['product_name']
mixlist_type = [type(s) for s in mixlist]
mixlist_type_dic = {t: mixlist_type.count(t) for t in set(mixlist_type)}
print(mixlist_type_dic)

{<class 'str'>: 404718}


In [7]:
num_rows = len(product_data)

user_id = np.zeros(shape=[num_rows], dtype=np.int32)
product_id = np.zeros(shape=[num_rows], dtype=np.int32)
aisle_id = np.zeros(shape=[num_rows], dtype=np.int16)
department_id = np.zeros(shape=[num_rows], dtype=np.int8)
eval_set = np.zeros(shape=[num_rows], dtype='S5')
label = np.zeros(shape=[num_rows], dtype=np.int8)

is_ordered_history = np.zeros(shape=[num_rows, 100], dtype=np.int8)
index_in_order_history = np.zeros(shape=[num_rows, 100], dtype=np.int8)
order_dow_history = np.zeros(shape=[num_rows, 100], dtype=np.int8)
order_hour_history = np.zeros(shape=[num_rows, 100], dtype=np.int8)
days_since_prior_order_history = np.zeros(shape=[num_rows, 100], dtype=np.int8)
order_size_history = np.zeros(shape=[num_rows, 100], dtype=np.int8)
reorder_size_history = np.zeros(shape=[num_rows, 100], dtype=np.int8)
order_number_history = np.zeros(shape=[num_rows, 100], dtype=np.int8)
product_name = np.zeros(shape=[num_rows, 30], dtype=np.int32)
product_name_length = np.zeros(shape=[num_rows], dtype=np.int8)
history_length = np.zeros(shape=[num_rows], dtype=np.int8)

In [8]:
# Check the length of lists
print(product_data.shape, user_id.shape)

(404718, 16) (404718,)


In [9]:
for i, row in product_data.iterrows():
    # Index Error Check: False
    # i = i - 1
    if i % 10000 == 0:
        print(i, num_rows)
    
    # Avoid over indexing
    if i == num_rows:
        break

    user_id[i] = row['user_id']
    product_id[i] = row['product_id']
    aisle_id[i] = row['aisle_id']
    department_id[i] = row['department_id']
    eval_set[i] = row['eval_set']
    label[i] = row['label']

    is_ordered_history[i, :], history_length[i] = pad_1d(list(map(int, row['is_ordered_history'].split())), 100)
    index_in_order_history[i, :], _ = pad_1d(list(map(int, row['index_in_order_history'].split())), 100)
    order_dow_history[i, :], _ = pad_1d(list(map(int, row['order_dow_history'].split())), 100)
    order_hour_history[i, :], _ = pad_1d(list(map(int, row['order_hour_history'].split())), 100)
    days_since_prior_order_history[i, :], _ = pad_1d(list(map(int, row['days_since_prior_order_history'].split())), 100)
    order_size_history[i, :], _ = pad_1d(list(map(int, row['order_size_history'].split())), 100)
    reorder_size_history[i, :], _ = pad_1d(list(map(int, row['reorder_size_history'].split())), 100)
    order_number_history[i, :], _ = pad_1d(list(map(int, row['order_number_history'].split())), 100)
    product_name[i, :], product_name_length[i] = pad_1d(list(map(int, row['product_name_encoded'].split())), 30)


0 404718
10000 404718
20000 404718
30000 404718
40000 404718
50000 404718
60000 404718
70000 404718
80000 404718
90000 404718
100000 404718
110000 404718
120000 404718
130000 404718
140000 404718
150000 404718
160000 404718
170000 404718
180000 404718
190000 404718
200000 404718
220000 404718
230000 404718
240000 404718
250000 404718
260000 404718
270000 404718
280000 404718
290000 404718
300000 404718
310000 404718
320000 404718
350000 404718
360000 404718
370000 404718
390000 404718
400000 404718


In [10]:
# Length check
print(user_id.shape, is_ordered_history.shape, order_dow_history.shape)

(404718,) (404718, 100) (404718, 100)


In [11]:
if not os.path.isdir('data'):
    os.makedirs('data')

np.save('data/user_id.npy', user_id)
np.save('data/product_id.npy', product_id)
np.save('data/aisle_id.npy', aisle_id)
np.save('data/department_id.npy', department_id)
np.save('data/eval_set.npy', eval_set)
np.save('data/label.npy', label)

np.save('data/is_ordered_history.npy', is_ordered_history)
np.save('data/index_in_order_history.npy', index_in_order_history)
np.save('data/order_dow_history.npy', order_dow_history)
np.save('data/order_hour_history.npy', order_hour_history)
np.save('data/days_since_prior_order_history.npy', days_since_prior_order_history)
np.save('data/order_size_history.npy', order_size_history)
np.save('data/reorder_size_history.npy', reorder_size_history)
np.save('data/order_number_history.npy', order_number_history)
np.save('data/product_name.npy', product_name)
np.save('data/product_name_length.npy', product_name_length)
np.save('data/history_length.npy', history_length)

# Modeling

In [1]:
import os
import numpy as np
import sys

In [2]:
import tensorflow as tf

print(tf.__version__)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


1.3.0


In [10]:
# Personalized Function
sys.path.append(os.path.join(os.getcwd(), '..'))
from data_frame import DataFrame
from tf_utils import lstm_layer, time_distributed_dense_layer, dense_layer, sequence_log_loss, wavenet, log_loss
from tf_base_model import TFBaseModel

In [4]:
# Additional packages for python 2 functions
from importlib import reload

In [6]:
class DataReader(object):

    def __init__(self, data_dir):
        data_cols = [
            'user_id',
            'product_id',
            'aisle_id',
            'department_id',
            'is_ordered_history',
            'index_in_order_history',
            'order_dow_history',
            'order_hour_history',
            'days_since_prior_order_history',
            'order_size_history',
            'reorder_size_history',
            'order_number_history',
            'history_length',
            'product_name',
            'product_name_length',
            'eval_set',
            'label'
        ]
        data = [np.load(os.path.join(data_dir, '{}.npy'.format(i)), mmap_mode='r') for i in data_cols]
        self.test_df = DataFrame(columns=data_cols, data=data)

        print(self.test_df.shapes())
        print("loaded data")

        # Split the data into training and validation sets
        self.train_df, self.val_df = self.test_df.train_test_split(train_size=0.9)
        # Output set information
        print('train size', len(self.train_df))
        print('validation size', len(self.val_df))
        print('test size', len(self.test_df))
        
    def train_batch_generator(self, batch_size):
        return self.batch_generator(
            batch_size=batch_size,
            df=self.train_df,
            shuffle=True,
            num_epochs=10000,
            is_test=False
        )

    def val_batch_generator(self, batch_size):
        return self.batch_generator(
            batch_size=batch_size,
            df=self.val_df,
            shuffle=True,
            num_epochs=10000,
            is_test=False
        )

    def test_batch_generator(self, batch_size):
        return self.batch_generator(
            batch_size=batch_size,
            df=self.test_df,
            shuffle=False,
            num_epochs=1,
            is_test=True
        )

    def batch_generator(self, batch_size, df, shuffle=True, num_epochs=10000, is_test=False):
        batch_gen = df.batch_generator(batch_size, shuffle=shuffle, num_epochs=num_epochs, allow_smaller_final_batch=is_test)
        for batch in batch_gen:
            batch['order_dow_history'] = np.roll(batch['order_dow_history'], -1, axis=1)
            batch['order_hour_history'] = np.roll(batch['order_hour_history'], -1, axis=1)
            batch['days_since_prior_order_history'] = np.roll(batch['days_since_prior_order_history'], -1, axis=1)
            batch['order_number_history'] = np.roll(batch['order_number_history'], -1, axis=1)
            batch['next_is_ordered'] = np.roll(batch['is_ordered_history'], -1, axis=1)
            batch['is_none'] = batch['product_id'] == 0
            if not is_test:
                batch['history_length'] = batch['history_length'] - 1
            yield batch

In [7]:
class rnn(TFBaseModel):

    def __init__(self, lstm_size=300, **kwargs):
        self.lstm_size = lstm_size
        super(rnn, self).__init__(**kwargs)

    def calculate_loss(self):
        x = self.get_input_sequences()
        return self.calculate_outputs(x)

    def get_input_sequences(self):
        self.user_id = tf.placeholder(tf.int32, [None])
        self.product_id = tf.placeholder(tf.int32, [None])
        self.aisle_id = tf.placeholder(tf.int32, [None])
        self.department_id = tf.placeholder(tf.int32, [None])
        self.is_none = tf.placeholder(tf.int32, [None])
        self.history_length = tf.placeholder(tf.int32, [None])

        self.is_ordered_history = tf.placeholder(tf.int32, [None, 100])
        self.index_in_order_history = tf.placeholder(tf.int32, [None, 100])
        self.order_dow_history = tf.placeholder(tf.int32, [None, 100])
        self.order_hour_history = tf.placeholder(tf.int32, [None, 100])
        self.days_since_prior_order_history = tf.placeholder(tf.int32, [None, 100])
        self.order_size_history = tf.placeholder(tf.int32, [None, 100])
        self.reorder_size_history = tf.placeholder(tf.int32, [None, 100])
        self.order_number_history = tf.placeholder(tf.int32, [None, 100])
        self.product_name = tf.placeholder(tf.int32, [None, 30])
        self.product_name_length = tf.placeholder(tf.int32, [None])
        self.next_is_ordered = tf.placeholder(tf.int32, [None, 100])

        self.keep_prob = tf.placeholder(tf.float32)
        self.is_training = tf.placeholder(tf.bool)

        # product data
        product_embeddings = tf.get_variable(
            name='product_embeddings',
            shape=[50000, self.lstm_size],
            dtype=tf.float32
        )
        aisle_embeddings = tf.get_variable(
            name='aisle_embeddings',
            shape=[250, 50],
            dtype=tf.float32
        )
        department_embeddings = tf.get_variable(
            name='department_embeddings',
            shape=[50, 10],
            dtype=tf.float32
        )
        product_names = tf.one_hot(self.product_name, 2532)
        product_names = tf.reduce_max(product_names, 1)
        product_names = dense_layer(product_names, 100, activation=tf.nn.relu)

        is_none = tf.cast(tf.expand_dims(self.is_none, 1), tf.float32)

        x_product = tf.concat([
            tf.nn.embedding_lookup(product_embeddings, self.product_id),
            tf.nn.embedding_lookup(aisle_embeddings, self.aisle_id),
            tf.nn.embedding_lookup(department_embeddings, self.department_id),
            is_none,
            product_names
        ], axis=1)
        x_product = tf.tile(tf.expand_dims(x_product, 1), (1, 100, 1))

        # user data
        user_embeddings = tf.get_variable(
            name='user_embeddings',
            shape=[207000, self.lstm_size],
            dtype=tf.float32
        )
        x_user = tf.nn.embedding_lookup(user_embeddings, self.user_id)
        x_user = tf.tile(tf.expand_dims(x_user, 1), (1, 100, 1))

        # sequence data
        is_ordered_history = tf.one_hot(self.is_ordered_history, 2)
        index_in_order_history = tf.one_hot(self.index_in_order_history, 20)
        order_dow_history = tf.one_hot(self.order_dow_history, 8)
        order_hour_history = tf.one_hot(self.order_hour_history, 25)
        days_since_prior_order_history = tf.one_hot(self.days_since_prior_order_history, 31)
        order_size_history = tf.one_hot(self.order_size_history, 60)
        reorder_size_history = tf.one_hot(self.reorder_size_history, 50)
        order_number_history = tf.one_hot(self.order_number_history, 101)

        index_in_order_history_scalar = tf.expand_dims(tf.cast(self.index_in_order_history, tf.float32) / 20.0, 2)
        order_dow_history_scalar = tf.expand_dims(tf.cast(self.order_dow_history, tf.float32) / 8.0, 2)
        order_hour_history_scalar = tf.expand_dims(tf.cast(self.order_hour_history, tf.float32) / 25.0, 2)
        days_since_prior_order_history_scalar = tf.expand_dims(tf.cast(self.days_since_prior_order_history, tf.float32) / 31.0, 2)
        order_size_history_scalar = tf.expand_dims(tf.cast(self.order_size_history, tf.float32) / 60.0, 2)
        reorder_size_history_scalar = tf.expand_dims(tf.cast(self.reorder_size_history, tf.float32) / 50.0, 2)
        order_number_history_scalar = tf.expand_dims(tf.cast(self.order_number_history, tf.float32) / 100.0, 2)

        x_history = tf.concat([
            is_ordered_history,
            index_in_order_history,
            order_dow_history,
            order_hour_history,
            days_since_prior_order_history,
            order_size_history,
            reorder_size_history,
            order_number_history,
            index_in_order_history_scalar,
            order_dow_history_scalar,
            order_hour_history_scalar,
            days_since_prior_order_history_scalar,
            order_size_history_scalar,
            reorder_size_history_scalar,
            order_number_history_scalar,
        ], axis=2)

        x = tf.concat([x_history, x_product, x_user], axis=2)

        return x

    def calculate_outputs(self, x):
        # An LSTM layer is applied to the input 'x'. The output 'h' is a sequence of hidden states.
        h = lstm_layer(x, self.history_length, self.lstm_size, scope='lstm-1')
        
        # The output of the LSTM layer 'h' is concatenated with the original input 'x'. 
        # This is a common technique in sequence-to-sequence models, allowing the model to have access to the original input in later layers.
        h = tf.concat([h, x], axis=2)
        
        # The concatenated output is passed through a dense layer with ReLU activation function. 
        # This serves to transform the data into a more useful form for the final prediction.
        h_final = time_distributed_dense_layer(h, 50, activation=tf.nn.relu, scope='dense-1')

        # The number of components for the mixture model is set to 1.
        n_components = 1
        
        # Another dense layer is applied to 'h_final' without an activation function. 
        # The output 2-d 'params' is then split into two parts: 'ps' and 'mixing_coefs'.
        params = time_distributed_dense_layer(h_final, n_components*2, scope='dense-2', activation=None)
        ps, mixing_coefs = tf.split(params, 2, axis=2)

        # The 'mixing_coefs' are passed through a softmax function, which normalizes them to sum to 1. 
        # This is a common operation when the coefficients represent a mixture of components.
        # However, the author notes that this is implemented incorrectly, likely because there is only one score per sequence
        mixing_coefs = tf.nn.softmax(mixing_coefs - tf.reduce_min(mixing_coefs, 2, keep_dims=True))
        
        # The 'ps' are passed through a sigmoid function, which squashes their values to the range [0, 1]. 
        # This is a common operation when the output represents a probability.
        ps = tf.nn.sigmoid(ps)

        # The labels are replicated to match the shape of 'ps' and 'mixing_coefs'.
        labels = tf.tile(tf.expand_dims(self.next_is_ordered, 2), (1, 1, n_components))
        
        # The losses are calculated as the sum of the product of 'mixing_coefs' and the log loss between 'labels' and 'ps', averaged over the sequence length.
        losses = tf.reduce_sum(mixing_coefs*log_loss(labels, ps), axis=2)
        sequence_mask = tf.cast(tf.sequence_mask(self.history_length, maxlen=100), tf.float32)
        avg_loss = tf.reduce_sum(losses*sequence_mask) / tf.cast(tf.reduce_sum(self.history_length), tf.float32)

        # The final states are selected from 'h_final' and stored in 'self.final_states'.
        final_temporal_idx = tf.stack([tf.range(tf.shape(self.history_length)[0]), self.history_length - 1], axis=1)
        self.final_states = tf.gather_nd(h_final, final_temporal_idx)
        self.final_predictions = tf.gather_nd(ps, final_temporal_idx) # Add prediction part

        # The prediction tensors are stored in 'self.prediction_tensors'.
        self.prediction_tensors = {
            'user_ids': self.user_id,
            'product_ids': self.product_id,
            'final_states': self.final_states,
            'predictions': self.final_predictions
        }

        # The function returns the average loss.
        return avg_loss

In [11]:
base_dir = './'

dr = DataReader(data_dir=os.path.join(base_dir, 'data'))

nn = rnn(
    reader=dr,
    log_dir=os.path.join(base_dir, 'logs_bmm'),
    checkpoint_dir=os.path.join(base_dir, 'checkpoints_bmm'),
    prediction_dir=os.path.join(base_dir, 'predictions_bmm'),
    optimizer='adam',
    learning_rate=.001,
    lstm_size=300,
    batch_size=128,
    num_training_steps=800,
    early_stopping_steps=100,
    warm_start_init_step=0,
    regularization_constant=0.0,
    keep_prob=0.5,
    enable_parameter_averaging=False,
    num_restarts=2,
    min_steps_to_checkpoint=200,
    log_interval=20,
    num_validation_batches=4,
)
nn.fit() # Training finished, start prediction
nn.restore()
nn.predict()

user_id                               (404718,)
product_id                            (404718,)
aisle_id                              (404718,)
department_id                         (404718,)
is_ordered_history                (404718, 100)
index_in_order_history            (404718, 100)
order_dow_history                 (404718, 100)
order_hour_history                (404718, 100)
days_since_prior_order_history    (404718, 100)
order_size_history                (404718, 100)
reorder_size_history              (404718, 100)
order_number_history              (404718, 100)
history_length                        (404718,)
product_name                       (404718, 30)
product_name_length                   (404718,)
eval_set                              (404718,)
label                                 (404718,)
dtype: object
loaded data



new run with parameters:
{'batch_size': 128,
 'checkpoint_dir': './checkpoints_bmm',
 'early_stopping_steps': 100,
 'enable_parameter_averaging': False,
 'grad_clip': 5,
 'keep_prob_scalar': 0.5,
 'learning_rate': 0.001,
 'log_dir': './logs_bmm',
 'log_interval': 20,
 'loss_averaging_window': 100,
 'lstm_size': 300,
 'min_steps_to_checkpoint': 200,
 'num_restarts': 2,
 'num_training_steps': 800,
 'num_validation_batches': 4,
 'optimizer': 'adam',
 'prediction_dir': './predictions_bmm',
 'reader': <__main__.DataReader object at 0x2b4ee9af1e48>,
 'regularization_constant': 0.0,
 'warm_start_init_step': 0}


train size 364246
validation size 40472
test size 404718


all parameters:
[('product_embeddings:0', [50000, 300]),
 ('aisle_embeddings:0', [250, 50]),
 ('department_embeddings:0', [50, 10]),
 ('dense-layer/weights:0', [2532, 100]),
 ('dense-layer/biases:0', [100]),
 ('user_embeddings:0', [207000, 300]),
 ('lstm-1/rnn/lstm_cell/kernel:0', [1365, 1200]),
 ('lstm-1/rnn/lstm_cell/bias:0', [1200]),
 ('dense-1/weights:0', [1365, 50]),
 ('dense-1/biases:0', [50]),
 ('dense-2/weights:0', [50, 2]),
 ('dense-2/biases:0', [2]),
 ('Variable:0', []),
 ('Variable_1:0', []),
 ('beta1_power:0', []),
 ('beta2_power:0', []),
 ('product_embeddings/Adam:0', [50000, 300]),
 ('product_embeddings/Adam_1:0', [50000, 300]),
 ('aisle_embeddings/Adam:0', [250, 50]),
 ('aisle_embeddings/Adam_1:0', [250, 50]),
 ('department_embeddings/Adam:0', [50, 10]),
 ('department_embeddings/Adam_1:0', [50, 10]),
 ('dense-layer/weights/Adam:0', [2532, 100]),
 ('dense-layer/weights/Adam_1:0', [2532, 100]),
 ('dense-layer/biases/Adam:0', [100]),
 ('dense-layer/biases/Adam_1:0', [100]),

built graph


[[step        0]]     [[train]]     loss: 0.72492611       [[val]]     loss: 0.72382855       


KeyboardInterrupt: 