In [None]:
import numpy as np
import pandas as pd
import os
import utils
import pickle
from gensim.models import word2vec
from sklearn.preprocessing import MultiLabelBinarizer
from scipy import sparse
import tensorflow as tf

Using TensorFlow backend.


In [None]:
# Seed NumPy's global RNG so np.random draws later in the notebook
# (e.g. the np.random.rand initialisation of W2) are repeatable.
np.random.seed(7)

In [None]:
# Project-local helper from utils; presumably points the working directory at
# the data folder so the relative file names used below resolve — TODO confirm.
utils.data_directory()

In [None]:
def average_pooling(sentence, model):
    """Average the word vectors of the in-vocabulary tokens of ``sentence``.

    Parameters
    ----------
    sentence : iterable
        Tokens to look up. Tokens missing from ``model.wv.vocab`` are skipped.
    model : gensim Word2Vec-like object
        Must expose ``wv.vocab`` (membership test), ``wv[token]`` (vector
        lookup) and ``vector_size``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the retained vectors. When no token is in the
        vocabulary, a zero vector of length ``model.vector_size`` is returned
        instead of the scalar NaN (plus RuntimeWarning) that ``np.mean``
        yields on an empty list.
    """
    keyed_vectors = model.wv
    vocab = keyed_vectors.vocab
    # Go through model.wv[...]: indexing the Word2Vec object itself
    # (model[token]) is deprecated in gensim.
    vectors = [keyed_vectors[token] for token in sentence if token in vocab]
    if not vectors:
        return np.zeros(model.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)

def max_pooling(sentence, model):
    """Element-wise maximum of the word vectors of the in-vocabulary tokens.

    Parameters
    ----------
    sentence : iterable
        Tokens to look up. Tokens missing from ``model.wv.vocab`` are skipped.
    model : gensim Word2Vec-like object
        Must expose ``wv.vocab`` (membership test), ``wv[token]`` (vector
        lookup) and ``vector_size``.

    Returns
    -------
    numpy.ndarray
        Per-dimension maximum over the retained vectors. When no token is in
        the vocabulary, a zero vector of length ``model.vector_size`` is
        returned instead of the ValueError that ``np.max`` raises on an
        empty list.
    """
    keyed_vectors = model.wv
    vocab = keyed_vectors.vocab
    # Go through model.wv[...]: indexing the Word2Vec object itself
    # (model[token]) is deprecated in gensim.
    vectors = [keyed_vectors[token] for token in sentence if token in vocab]
    if not vectors:
        return np.zeros(model.vector_size, dtype=np.float32)
    return np.max(vectors, axis=0)

def items_to_latent(sentences, model, pooling_function=average_pooling):
    """Pool every sentence into a single vector and collect the results.

    ``pooling_function`` is called as ``pooling_function(sentence, model)``
    once per element of ``sentences``; the outputs are wrapped in one
    ``numpy.ndarray`` in the same order.
    """
    pooled = [pooling_function(tokens, model) for tokens in sentences]
    return np.array(pooled)

In [None]:
# load corpus
# Unpickle the object stored in 'w2v_corpus_order' into `sentences`
# (presumably the tokenised orders the Word2Vec model was trained on — TODO confirm).
# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# a corpus file that was produced by a trusted source.
with open('w2v_corpus_order','rb') as f:
    sentences = pickle.load(f)

In [None]:
# Load a previously saved gensim Word2Vec model from "test1.model";
# the pooling helpers above read their word vectors from it.
model = word2vec.Word2Vec.load("test1.model")

## Process Data

In [None]:
# Read the three raw tables from CSV files in the current directory.
# The cells below use order_id / product_id / reordered from the two
# order_products tables and order_id / user_id / order_number / eval_set
# from `order`.
order_products__prior = pd.read_csv("order_products__prior.csv")
order_products__train = pd.read_csv("order_products__train.csv")
order = pd.read_csv("orders.csv")

In [None]:
# Attach user_id and order_number from `order` to every prior order line
# (pd.merge defaults to an inner join on order_id).
order_products__prior = pd.merge(order_products__prior[["order_id", "product_id", "reordered"]], 
                               order[["order_id", "user_id","order_number"]], on="order_id")
# Cast product ids to strings; the pooling helpers test membership in
# model.wv.vocab, presumably keyed by string product ids — TODO confirm.
order_products__prior["product_id"] = order_products__prior["product_id"].astype("unicode")

# Same enrichment for the train order lines, but as a left join and keeping
# every column of order_products__train.
order_products__train = pd.merge(order_products__train, order[["order_id", "user_id","order_number"]],
                                 how="left", on="order_id")
order_products__train["product_id"] = order_products__train["product_id"].astype("unicode")

In [None]:
# Keep only the users that have an order flagged eval_set == "train".
# .loc replaces the deprecated (pandas 0.20) and removed (pandas 1.0) .ix
# indexer; with a boolean row mask and a column label it selects the same data.
liste_user = order.loc[order["eval_set"]=="train", "user_id"].unique()
order_products__prior = order_products__prior[order_products__prior["user_id"].isin(liste_user)]
order_products__train = order_products__train[order_products__train["user_id"].isin(liste_user)]

In [None]:
# Sanity check: number of distinct user_id values left in each frame
# after the filtering step.
print(len(order_products__prior["user_id"].unique()))
print(len(order_products__train["user_id"].unique()))

In [None]:
# Preview the first rows of the filtered prior order lines.
order_products__prior.head()

In [None]:
# Preview the first rows of the filtered train order lines.
order_products__train.head()

In [None]:
# Reduce both frames to the same four columns so they can be stacked.
order_products__prior = order_products__prior[["user_id", "order_number","product_id","reordered"]]
order_products__train = order_products__train[["user_id", "order_number","product_id","reordered"]]
# One long table of order lines (prior + train), ordered per user by order_number.
df = pd.concat([order_products__prior, order_products__train])
df = df.sort_values(["user_id","order_number"])
df.head()

In [None]:
# Drop the two intermediate frames now that their rows live in `df`;
# re-running the cells above requires reloading the CSV files first.
del order_products__prior
del order_products__train
# Number of distinct users in the combined table.
print(len(df["user_id"].unique()))

In [None]:
# Prototype on a single user (user_id == 1): for every order, pool the
# Word2Vec vectors of its products into one embedding via average_pooling.
df_x = df[df["user_id"]==1].groupby(["user_id","order_number"])["product_id"].apply(
    lambda order : average_pooling(order.tolist(), model))
# Name the pooled-vector column "train" and turn the group keys back into columns.
df_x = df_x.rename("train")
df_x = df_x.reset_index()
df_x.head()

In [None]:
# Labels for user 1: per order, the list of product ids with reordered == 1.
df_y = df[(df["user_id"]==1)&(df["reordered"]==1)].groupby(["user_id","order_number"])["product_id"].apply(list)
df_y = df_y.rename("label")
df_y = df_y.reset_index()
# Shift labels back by one order so that, when joined on
# (user_id, order_number), the features of order n are paired with the
# reordered products of order n + 1.
df_y["order_number"] = df_y["order_number"]-1
df_y.head()

In [None]:
# Pair each order embedding with its shifted label list. pd.merge defaults to
# an inner join, so orders without a matching label row are dropped.
df_one_user = pd.merge(df_x, df_y,  on=["user_id", "order_number"])

In [None]:
# Display the merged single-user frame (columns: user_id, order_number, train, label).
df_one_user

In [None]:
# Learn the set of distinct product ids appearing in the label lists;
# each one becomes a column of the multi-hot target matrix.
mlb =  MultiLabelBinarizer()
mlb.fit(df_one_user["label"])
nb_classes = len(mlb.classes_)
print(nb_classes)

In [None]:
# Multi-hot encode the label lists: one row per order, one column per class in mlb.classes_.
y = mlb.transform(df_one_user["label"])

In [None]:
# Stack the per-order pooled embeddings into a dense matrix with one row per
# order. The width comes from the loaded model (model.vector_size) instead of
# a hard-coded 20, so the cell keeps working if the embedding size changes.
X = np.zeros((df_one_user.shape[0], model.vector_size))
for i in range(df_one_user.shape[0]):
    # .loc replaces the removed .ix indexer; df_one_user carries the default
    # 0..n-1 index produced by pd.merge, so label i is row i.
    X[i] = df_one_user.loc[i, "train"]
print(X.shape)

In [None]:
# Total number of scalar entries in X once flattened.
X.ravel().shape

In [None]:
# Total number of scalar entries in y once flattened.
y.ravel().shape

## One user

In [None]:
# Hyper-parameters for the RNN graph built in the cells below.
num_epochs = 3  # not referenced in the cells visible here; presumably used by a later training loop
total_series_length = 200
truncated_backprop_length = 10  # number of time-step columns in the placeholders / unrolled RNN steps
state_size = 4  # width of the RNN state (BasicRNNCell size and number of rows of W2)
num_classes = nb_classes  # number of label classes found by the MultiLabelBinarizer
batch_size = 3
# Two successive integer divisions: 200 // 3 // 10 == 6 with the values above.
num_batches = total_series_length//batch_size//truncated_backprop_length

In [None]:
# Graph inputs: one row per batch element, one column per time step.
# NOTE(review): X built above is 2-D (orders x embedding width) whereas these
# placeholders hold a single scalar per time step — confirm how X and y are
# meant to be fed into this graph.
batchX_placeholder = tf.placeholder(tf.float32, [batch_size, truncated_backprop_length])
batchY_placeholder = tf.placeholder(tf.int32, [batch_size, truncated_backprop_length])

# Initial RNN state, supplied by the caller for every batch.
init_state = tf.placeholder(tf.float32, [batch_size, state_size])

In [None]:
# Output projection from the RNN state to class logits:
# W2 is (state_size, num_classes) drawn uniformly from [0, 1) with NumPy,
# b2 is a (1, num_classes) zero bias that broadcasts over the batch.
W2 = tf.Variable(np.random.rand(state_size, num_classes),dtype=tf.float32)
b2 = tf.Variable(np.zeros((1,num_classes)), dtype=tf.float32)

In [None]:
# Turn the [batch_size, truncated_backprop_length] placeholders into Python
# lists holding one tensor per time step.
# tf.split keeps the split axis, so every input slice is [batch_size, 1].
inputs_series = tf.split(batchX_placeholder, truncated_backprop_length, 1)
# tf.unstack is the TF >= 1.0 name of tf.unpack (which no longer exists there,
# while the tf.split call above already uses the TF >= 1.0 argument order).
# It drops the unstacked axis, so every label slice is [batch_size].
labels_series = tf.unstack(batchY_placeholder, axis=1)

In [None]:
# Vanilla RNN cell with a state of width state_size.
cell = tf.nn.rnn_cell.BasicRNNCell(state_size)
# Unroll the cell over the list of per-time-step inputs, starting from the
# fed init_state. tf.nn.rnn was removed in TensorFlow 1.0; tf.nn.static_rnn is
# its replacement and returns the same (outputs per step, final state) pair.
states_series, current_state = tf.nn.static_rnn(cell, inputs_series, initial_state=init_state)

In [None]:
# Project every per-step RNN state to class logits; b2 is (1, num_classes)
# and broadcasts over the batch dimension.
logits_series = [tf.matmul(state, W2) + b2 for state in states_series]
predictions_series = [tf.nn.softmax(logits) for logits in logits_series]

# Per-step cross-entropy between the logits and the integer class labels.
# TensorFlow >= 1.0 rejects positional arguments for this op, so labels and
# logits are passed by keyword.
# NOTE(review): this loss expects one class id per example, whereas `y` above
# is a multi-hot matrix from MultiLabelBinarizer — confirm the intended target.
losses = [
    tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits)
    for logits, labels in zip(logits_series, labels_series)
]
total_loss = tf.reduce_mean(losses)

# One Adagrad update (learning rate 0.3) minimising the mean loss.
train_step = tf.train.AdagradOptimizer(0.3).minimize(total_loss)