In [1]:
import numpy as np
import pandas as pd
import os
import utils
import pickle
from gensim.models import word2vec
from sklearn.preprocessing import MultiLabelBinarizer
from scipy import sparse
import tensorflow as tf

Using TensorFlow backend.


In [2]:
np.random.seed(7)

In [3]:
utils.data_directory()

On AWS instance
/home/ec2-user/data


In [4]:
def average_pooling(sentence, model):
    matrix_pooling = list(map(lambda x: model[x], filter(lambda x: x in model.wv.vocab.keys(),sentence)))
    return np.mean(matrix_pooling, axis=0)

def max_pooling(sentence, model):
    matrix_pooling = list(map(lambda x: model[x], filter(lambda x: x in model.wv.vocab.keys(),sentence)))
    return np.max(matrix_pooling, axis=0)

def items_to_latent(sentences, model, pooling_function=average_pooling):
    latent_list = list(map(lambda x : pooling_function(x, model) ,sentences))
    return np.array(latent_list)

In [5]:
# load corpus
with open('w2v_corpus_order','rb') as f:
    sentences = pickle.load(f)

In [6]:
model = word2vec.Word2Vec.load("test1.model")

## Process Data

In [7]:
order_products__prior = pd.read_csv("order_products__prior.csv")
order_products__train = pd.read_csv("order_products__train.csv")
order = pd.read_csv("orders.csv")

In [8]:
order_products__prior = pd.merge(order_products__prior[["order_id", "product_id", "reordered"]], 
                               order[["order_id", "user_id","order_number"]], on="order_id")
order_products__prior["product_id"] = order_products__prior["product_id"].astype("unicode")

order_products__train = pd.merge(order_products__train, order[["order_id", "user_id","order_number"]],
                                 how="left", on="order_id")
order_products__train["product_id"] = order_products__train["product_id"].astype("unicode")

In [9]:
liste_user = order.ix[order["eval_set"]=="train", "user_id"].unique()
order_products__prior = order_products__prior[order_products__prior["user_id"].isin(liste_user)]
order_products__train = order_products__train[order_products__train["user_id"].isin(liste_user)]

In [10]:
print(len(order_products__prior["user_id"].unique()))
print(len(order_products__train["user_id"].unique()))

131209
131209


In [11]:
order_products__prior.head()

Unnamed: 0,order_id,product_id,reordered,user_id,order_number
0,2,33120,1,202279,3
1,2,28985,1,202279,3
2,2,9327,0,202279,3
3,2,45918,1,202279,3
4,2,30035,0,202279,3


In [12]:
order_products__train.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,order_number
0,1,49302,1,1,112108,4
1,1,11109,2,1,112108,4
2,1,10246,3,0,112108,4
3,1,49683,4,0,112108,4
4,1,43633,5,1,112108,4


In [13]:
order_products__prior = order_products__prior[["user_id", "order_number","product_id","reordered"]]
order_products__train = order_products__train[["user_id", "order_number","product_id","reordered"]]
df = pd.concat([order_products__prior, order_products__train])
df = df.sort_values(["user_id","order_number"])
df.head()

Unnamed: 0,user_id,order_number,product_id,reordered
24076664,1,1,196,0
24076665,1,1,14084,0
24076666,1,1,12427,0
24076667,1,1,26088,0
24076668,1,1,26405,0


In [14]:
del order_products__prior
del order_products__train
print(len(df["user_id"].unique()))

131209


In [15]:
df_x = df[df["user_id"]==1].groupby(["user_id","order_number"])["product_id"].apply(
    lambda order : average_pooling(order.tolist(), model))
df_x = df_x.rename("train")
df_x = df_x.reset_index()
df_x.head()

Unnamed: 0,user_id,order_number,train
0,1,1,"[1.63034, -1.6842, 0.362723, -2.25177, -2.9475..."
1,1,2,"[3.24611, -1.18167, -0.153879, -2.62414, -3.15..."
2,1,3,"[2.82928, -1.96417, -0.976352, -2.04495, -1.63..."
3,1,4,"[3.77367, -2.18435, -0.602984, -2.14054, -2.85..."
4,1,5,"[2.6546, -1.4735, -0.565062, -1.81519, -2.0413..."


In [16]:
df_y = df[(df["user_id"]==1)&(df["reordered"]==1)].groupby(["user_id","order_number"])["product_id"].apply(list)
df_y = df_y.rename("label")
df_y = df_y.reset_index()
df_y["order_number"] = df_y["order_number"]-1
df_y.head()

Unnamed: 0,user_id,order_number,label
0,1,1,"[196, 12427, 26088]"
1,1,2,"[196, 12427, 10258]"
2,1,3,"[196, 12427, 10258, 25133, 26405]"
3,1,4,"[196, 12427, 10258, 25133, 13176]"
4,1,5,"[196, 12427, 10258, 25133]"


In [17]:
df_one_user = pd.merge(df_x, df_y,  on=["user_id", "order_number"])

In [18]:
df_one_user

Unnamed: 0,user_id,order_number,train,label
0,1,1,"[1.63034, -1.6842, 0.362723, -2.25177, -2.9475...","[196, 12427, 26088]"
1,1,2,"[3.24611, -1.18167, -0.153879, -2.62414, -3.15...","[196, 12427, 10258]"
2,1,3,"[2.82928, -1.96417, -0.976352, -2.04495, -1.63...","[196, 12427, 10258, 25133, 26405]"
3,1,4,"[3.77367, -2.18435, -0.602984, -2.14054, -2.85...","[196, 12427, 10258, 25133, 13176]"
4,1,5,"[2.6546, -1.4735, -0.565062, -1.81519, -2.0413...","[196, 12427, 10258, 25133]"
5,1,6,"[4.37876, -1.828, -0.917015, -2.35073, -2.1946...","[196, 10258, 12427, 25133, 13032]"
6,1,7,"[4.68682, -1.58539, -1.1854, -2.09141, -2.5066...","[12427, 196, 10258, 25133]"
7,1,8,"[4.16767, -2.07204, -1.09154, -2.25562, -3.201...","[49235, 46149, 25133, 196, 10258, 12427]"
8,1,9,"[4.16767, -2.07204, -1.09154, -2.25562, -3.201...","[196, 46149, 25133, 10258, 13032, 12427]"
9,1,10,"[3.84236, -2.05088, -1.02184, -2.50447, -3.328...","[196, 25133, 38928, 26405, 39657, 10258, 13032..."


In [19]:
mlb =  MultiLabelBinarizer()
mlb.fit(df_one_user["label"])
nb_classes = len(mlb.classes_)
print(nb_classes)

12


In [20]:
y = mlb.transform(df_one_user["label"])

In [21]:
X = np.zeros((df_one_user.shape[0],20))
for i in range(df_one_user.shape[0]):
    X[i] = df_one_user.ix[i, "train"]
print(X.shape)

(10, 20)


In [22]:
X.ravel().shape

(200,)

In [23]:
y.ravel().shape

(120,)

In [24]:
y

array([[0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0],
       [1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0],
       [1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0],
       [1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0],
       [1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0],
       [1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1],
       [1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0],
       [1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1]])

## One user

In [48]:
num_epochs = 50
total_series_length = 200
truncated_backprop_length = 20
state_size = 4
num_classes = nb_classes
batch_size = 1
num_batches = total_series_length//batch_size//truncated_backprop_length

In [26]:
batchX_placeholder = tf.placeholder(tf.float32, [batch_size, truncated_backprop_length])
batchY_placeholder = tf.placeholder(tf.int32, [batch_size, num_classes])

init_state = tf.placeholder(tf.float32, [batch_size, state_size])

In [27]:
W2 = tf.Variable(np.random.rand(state_size, num_classes),dtype=tf.float32)
b2 = tf.Variable(np.zeros((1,num_classes)), dtype=tf.float32)

In [28]:
inputs_series = tf.split( batchX_placeholder, truncated_backprop_length, 1)
labels_series = tf.unstack(batchY_placeholder, axis=1)

In [29]:
cell = tf.contrib.rnn.BasicRNNCell(state_size)
states_series, current_state = tf.contrib.rnn.static_rnn(cell, inputs_series, init_state)

In [30]:
logits_series = [tf.matmul(state, W2) + b2 for state in states_series] #Broadcasted addition
predictions_series = [tf.nn.softmax(logits) for logits in logits_series]

losses = [tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits)  for logits, labels in zip(logits_series,labels_series)]
total_loss = tf.reduce_mean(losses)

train_step = tf.train.AdagradOptimizer(0.3).minimize(total_loss)

In [31]:
x = X.ravel()
y = y.ravel()

In [54]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    loss_list = []

    for epoch_idx in range(num_epochs):
        _current_state = np.zeros((batch_size, state_size))

        print("New data, epoch", epoch_idx)

        for batch_idx in range(num_batches):
            start_idx = batch_idx * truncated_backprop_length
            end_idx = start_idx + truncated_backprop_length
            start_idxx = batch_idx * num_classes
            end_idxx = start_idxx + num_classes
            print(start_idx, end_idx)
            batchX = x[start_idx:end_idx].reshape(batch_size , truncated_backprop_length)
            batchY = y[start_idxx:end_idxx].reshape(batch_size , num_classes )

            _total_loss, _train_step, _current_state,_states_series, _predictions_series = sess.run(
                [total_loss, train_step, current_state, states_series,predictions_series],
                feed_dict={
                    batchX_placeholder:batchX,
                    batchY_placeholder:batchY,
                    init_state:_current_state
                })

loss_list.append(_total_loss)

New data, epoch 0
0 20
20 40
40 60
60 80
80 100
100 120
120 140
140 160
160 180
180 200
New data, epoch 1
0 20
20 40
40 60
60 80
80 100
100 120
120 140
140 160
160 180
180 200
New data, epoch 2
0 20
20 40
40 60
60 80
80 100
100 120
120 140
140 160
160 180
180 200
New data, epoch 3
0 20
20 40
40 60
60 80
80 100
100 120
120 140
140 160
160 180
180 200
New data, epoch 4
0 20
20 40
40 60
60 80
80 100
100 120
120 140
140 160
160 180
180 200
New data, epoch 5
0 20
20 40
40 60
60 80
80 100
100 120
120 140
140 160
160 180
180 200
New data, epoch 6
0 20
20 40
40 60
60 80
80 100
100 120
120 140
140 160
160 180
180 200
New data, epoch 7
0 20
20 40
40 60
60 80
80 100
100 120
120 140
140 160
160 180
180 200
New data, epoch 8
0 20
20 40
40 60
60 80
80 100
100 120
120 140
140 160
160 180
180 200
New data, epoch 9
0 20
20 40
40 60
60 80
80 100
100 120
120 140
140 160
160 180
180 200
New data, epoch 10
0 20
20 40
40 60
60 80
80 100
100 120
120 140
140 160
160 180
180 200
New data, epoch 11
0 20
20 40
4

In [55]:
_states_series

[array([[-0.99939173, -0.89158797,  0.99954498, -0.81886262]], dtype=float32),
 array([[-0.09210306, -0.99773389,  0.01820691,  0.99999511]], dtype=float32),
 array([[-0.94816178, -0.89913541,  0.42694882,  0.99932736]], dtype=float32),
 array([[-0.8922053 , -0.99681824,  0.16975932,  0.9999994 ]], dtype=float32),
 array([[-0.78617752, -0.99579448, -0.48377296,  0.99999976]], dtype=float32),
 array([[-0.97294313, -0.88793933,  0.85003573,  0.99876207]], dtype=float32),
 array([[-0.96542019, -0.99832428,  0.81005549,  0.99999464]], dtype=float32),
 array([[-0.99741608, -0.99388397,  0.99645793,  0.99042022]], dtype=float32),
 array([[-0.99320966, -0.99799865,  0.98323792,  0.99972987]], dtype=float32),
 array([[-0.99769628, -0.99633044,  0.99705815,  0.99317819]], dtype=float32),
 array([[-0.83232194, -0.99966186, -0.30682284,  1.        ]], dtype=float32),
 array([[-0.98833346, -0.90586579,  0.96054006,  0.99290395]], dtype=float32),
 array([[-0.99477941, -0.99684733,  0.99061185,  0.9