In [94]:
import numpy as np
import pandas as pd
import gc
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import tensorflow as tf
import keras
from keras import backend as K
from keras import initializers
from keras.models import Sequential, Model, load_model, save_model
from keras.layers.core import Dense, Lambda, Activation
from keras.layers import Embedding, Input, Dense, merge, Reshape, Merge, Flatten
from keras.optimizers import Adagrad, Adam, SGD, RMSprop
from keras import regularizers
from time import time
import multiprocessing as mp
from sklearn import metrics
import sys
import math
import argparse

In [95]:
#Load InstaCart Data
myfolder = '/Users/BharathiSrinivasan/Documents/HU-MEMS-Sem3/Info_Systems/repo/Recommender_DNN/Data/'
big_users = pd.read_csv(myfolder + 'data_subset.csv')
big_users.head(n=5)

Unnamed: 0,user_id,product_id,up_orders,up_first_order,up_last_order,up_average_cart_position,prod_orders,prod_reorder_probability,prod_reorder_ratio,user_orders,...,user_average_basket,order_id,days_since_prior_order,up_order_rate,up_orders_since_last_order,reordered,aisle_id,department_id,order_dow,order_hour_of_day
0,129870,19511,25,2,34,10,5130,0.471,0.708187,34,...,17,2308721,11,0.7354,0,1,53,16,0,13
1,170239,31821,1,38,38,11,481,0.294,0.455301,63,...,10,2191246,1,0.01587,25,0,83,4,3,13
2,24245,47209,8,1,8,4,16976,0.668,0.796553,8,...,10,864819,4,1.0,0,1,24,4,4,23
3,126649,16185,7,1,25,6,30170,0.4612,0.614551,25,...,11,552250,14,0.28,0,1,21,16,0,19
4,19487,45,2,3,4,2,18413,0.4883,0.655461,5,...,6,3007640,30,0.4,1,0,83,4,2,14


In [97]:
big_users.shape
df = big_users.sample(frac=1, random_state= 123)
df.head()

Unnamed: 0,user_id,product_id,up_orders,up_first_order,up_last_order,up_average_cart_position,prod_orders,prod_reorder_probability,prod_reorder_ratio,user_orders,...,user_average_basket,order_id,days_since_prior_order,up_order_rate,up_orders_since_last_order,reordered,aisle_id,department_id,order_dow,order_hour_of_day
2029800,168818,13176,19,15,36,1,51770,0.712,0.832555,36,...,8,1387017,25,0.528,0,1,24,4,0,10
1531690,110479,5663,3,9,62,12,239,0.593,0.661088,99,...,17,2101508,2,0.0303,37,0,47,11,1,1
794662,45069,3191,13,2,35,3,208,0.4658,0.649038,38,...,8,2237919,18,0.342,3,0,26,7,2,19
2423672,159373,5785,5,2,18,3,47839,0.6157,0.805702,18,...,5,2219196,10,0.2778,0,0,84,16,2,21
2833404,20657,15344,5,6,13,6,138,0.4055,0.463768,17,...,14,931566,14,0.2942,4,0,79,1,6,15


In [98]:
up_matrix = df[['user_id','product_id','reordered']].copy()

In [99]:
up_matrix.reset_index(drop=True)

Unnamed: 0,user_id,product_id,reordered
0,168818,13176,1
1,110479,5663,0
2,45069,3191,0
3,159373,5785,0
4,20657,15344,0
5,164484,18838,0
6,34122,27344,0
7,121231,25442,0
8,4326,45123,1
9,75805,37734,0


In [100]:
# Create user-product interaction matrix

#rank or the reorder frequency - used as implicit feedback metric
new = pd.DataFrame({'reorder_freq' : up_matrix.groupby( [ "user_id", "product_id"] )['reordered'].sum()}).reset_index()

In [101]:
#Splitting datasets into train and test
y= new['reorder_freq'].values
new.pop('reorder_freq')

0          1
1          2
2          0
3          1
4          1
5          1
6          1
7          2
8          0
9          0
10         0
11         0
12         0
13         0
14         0
15         0
16         0
17         0
18         0
19         0
20         0
21         0
22         0
23         0
24         0
25         0
26         0
27         1
28         0
29         0
          ..
2839914    1
2839915    0
2839916    0
2839917    1
2839918    0
2839919    0
2839920    0
2839921    0
2839922    0
2839923    0
2839924    0
2839925    0
2839926    1
2839927    0
2839928    0
2839929    0
2839930    0
2839931    0
2839932    0
2839933    1
2839934    2
2839935    0
2839936    0
2839937    0
2839938    0
2839939    2
2839940    0
2839941    0
2839942    0
2839943    0
Name: reorder_freq, Length: 2839944, dtype: int64

In [102]:
X = new.astype('int32')
y = y.astype('int32')
#X_df = pd.DataFrame(X)

In [103]:
# assign IDs from 0 to N for users and products
# Use enumerate() to create a list of the new and original IDs
users = enumerate(X.user_id.unique())
products = enumerate(X.product_id.unique())
# Create a dictionary with old IDs to new IDs
userid2idx = {o:i for i,o in users}
prodid2idx = {o:i for i,o in products}
# Create id variable with the new IDs
final_X = []
X['userIdx'] = X.user_id.apply(lambda x: userid2idx[x])
X['prodIdx'] = X.product_id.apply(lambda x: prodid2idx[x])

In [104]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [105]:
#Network Parameters
n_users = X['user_id'].nunique()
n_prods = X['product_id'].nunique()
latent_dim = 8

In [106]:
n_prods

42518

In [114]:
### Matrix Factorization ###
# Input variables
user_input = Input(shape=(1,), dtype='int32', name = 'user_input')
item_input = Input(shape=(1,), dtype='int32', name = 'item_input')

User_Embedding = Embedding(input_dim = n_users, output_dim = latent_dim, name = 'user_embedding', input_length=1,embeddings_regularizer = regularizers.l2(1e-5))
Item_Embedding = Embedding(input_dim = n_prods, output_dim = latent_dim, name = 'item_embedding', input_length=1,embeddings_regularizer = regularizers.l2(1e-5))   
    
# Crucial to flatten an embedding vector!
user_latent = Flatten()(User_Embedding(user_input))
item_latent = Flatten()(Item_Embedding(item_input))
    
# Element-wise product of user and item embeddings 
predict_vector = merge([user_latent, item_latent], mode = 'mul')
    
# Final prediction layer
prediction = Dense(1, activation='sigmoid', init='lecun_uniform', name = 'prediction')(predict_vector)
    
MF_model = Model(input=[user_input, item_input],output=prediction)

  
  name=name)


In [115]:
#Build model
MF_model.compile(optimizer=Adam(0.01), loss='mean_squared_error',metrics=['accuracy'])
#print(MF_model.summary())

In [116]:
# Training
user_id = X_train['userIdx']
prod_id = X_train['prodIdx']
MF_model.fit([user_id, prod_id],y_train, nb_epoch=1,batch_size =100)

  after removing the cwd from sys.path.


Epoch 1/1


<keras.callbacks.History at 0x1a34873dd8>

In [117]:
#Test
MF_model.evaluate([X_test['userIdx'], X_test['prodIdx']],y_test)
#prediction = model.predict([X_test['userIdx'], X_test['prodIdx'])



[0.9717071010057282, 0.852537937367779]

In [107]:
### Matrix Factorization + Bias ###
# Input variables
mfb_user_inp = Input(shape=(1,), dtype='int32', name = 'user_inp')
mfb_prod_inp= Input(shape=(1,), dtype='int32', name = 'prod_inp')

mfb_User_Embedding = Embedding(input_dim = n_users, output_dim = latent_dim, name = 'user_embedding', input_length=1,embeddings_regularizer = regularizers.l2(1e-5))(mfb_user_inp)
mfb_Prod_Embedding = Embedding(input_dim = n_prods, output_dim = latent_dim, name = 'item_embedding', input_length=1,embeddings_regularizer = regularizers.l2(1e-5))(mfb_prod_inp) 
    
# Crucial to flatten an embedding vector!
mfb_user_bias = Flatten()(Embedding(n_users, 1, input_length = 1)(mfb_user_inp))
mfb_prod_bias = Flatten()(Embedding(n_prods, 1, input_length = 1)(mfb_prod_inp))
    
# Element-wise product of user and item embeddings 
mfb_predict_vector = merge([mfb_User_Embedding, mfb_Prod_Embedding], mode='mul')
mfb_predict_vector = Flatten()(mfb_predict_vector)
mfb_predict_vector = keras.layers.Add()([mfb_predict_vector, mfb_user_bias])
#mfb_predict_vector = merge([mfb_predict_vector, mfb_user_bias], mode = 'sum')
mfb_predict_vector = keras.layers.Add()([mfb_predict_vector, mfb_prod_bias])
#mfb_predict_vector = merge([mfb_predict_vector, mfb_prod_bias], mode = 'sum')

# Final prediction layer
mfb_prediction = Dense(1, activation='sigmoid', init='lecun_uniform', name = 'prediction')(mfb_predict_vector)
    
MF_model_bias = Model(input=[mfb_user_inp, mfb_prod_inp],output=mfb_prediction)

  
  name=name)


In [108]:
#Build model
MF_model_bias.compile(optimizer=Adam(0.01), loss='mean_squared_error',metrics=['accuracy'])

In [109]:
# Training
user_id = X_train['userIdx']
prod_id = X_train['prodIdx']
MF_model_bias.fit([user_id, prod_id],y_train, nb_epoch=1,batch_size =100)

  after removing the cwd from sys.path.


Epoch 1/1


<keras.callbacks.History at 0x1a2c3645c0>

In [110]:
#Test
MF_model_bias.evaluate([X_test['userIdx'], X_test['prodIdx']],y_test)



[0.9159302060503951, 0.7703011045288217]

In [111]:
#### MLP

# Input variables
mlp_user_input = Input(shape=(1,), dtype='int32', name = 'user_input')
mlp_item_input = Input(shape=(1,), dtype='int32', name = 'item_input')

MLP_Embedding_User = Embedding(input_dim = n_users, output_dim = latent_dim, name = 'user_embedding', input_length=1,embeddings_regularizer = regularizers.l2(1e-5))
MLP_Embedding_Item = Embedding(input_dim = n_prods, output_dim = latent_dim, name = 'item_embedding', input_length=1,embeddings_regularizer = regularizers.l2(1e-5))   
    
# Crucial to flatten an embedding vector!
mlp_user_latent = Flatten()(MLP_Embedding_User(mlp_user_input))
mlp_item_latent = Flatten()(MLP_Embedding_Item(mlp_item_input))
    
# The 0-th layer is the concatenation of embedding layers
vector = merge([mlp_user_latent, mlp_item_latent], mode = 'mul')

num_layer = 3
layers = [200,100,50]
# MLP layers
for idx in range(1, num_layer):
    layer = Dense(layers[idx], kernel_regularizer=regularizers.l2(0.01), activation='relu',activity_regularizer=regularizers.l1(0.01), name = 'layer%d' %idx)
    vector = layer(vector)
  
# Final prediction layer
mlp_prediction = Dense(1, activation='sigmoid', init='lecun_uniform', name = 'prediction')(vector)
MLP = Model(input=[mlp_user_input, mlp_item_input], output=mlp_prediction)

  from ipykernel import kernelapp as app
  name=name)


In [112]:
#Build model
MLP.compile(optimizer=Adam(0.01), loss='mean_squared_error',metrics=['accuracy'])

In [113]:
# Training
user_id = X_train['userIdx']
prod_id = X_train['prodIdx']
MLP.fit([user_id, prod_id],y_train, nb_epoch=1,batch_size =100,)

  after removing the cwd from sys.path.


Epoch 1/1


<keras.callbacks.History at 0x1a327f7828>

In [93]:
#Test
MLP.evaluate([X_test['userIdx'], X_test['prodIdx']],y_test)



[0.19465630078791102, 0.7985746887136165]

In [159]:
def get_metrics(y_test, prd):
    fpr, tpr, thresholds = metrics.roc_curve(y_test, prd)
    auc = metrics.auc(fpr, tpr)
    print("auc: ",auc)
    precision, recall, thresholds = metrics.precision_recall_curve(y_test, prd)    
    auprc  = metrics.auc(recall, precision)
    max_f1 = 0
    for r, p, t in zip(recall, precision, thresholds):
        if p + r == 0: continue
        if (2*p*r)/(p + r) > max_f1:
            max_f1 = (2*p*r)/(p + r) 
            max_f1_threshold = t
    print("max_f1: ",max_f1)
    print("max_f1_threshold: ",max_f1_threshold)
    log_loss = metrics.log_loss(y_test, prd)
    print("log_loss: ",log_loss)
    return auc, max_f1, max_f1_threshold, log_loss

In [153]:
prd_mf = MF_model.predict([X_test['userIdx'], X_test['prodIdx']])
prd_mfb = MF_model_bias.predict([X_test['userIdx'], X_test['prodIdx']])
prd_mlp = MLP.predict([X_test['userIdx'], X_test['prodIdx']])

In [160]:
auc_mf, max_f1_mf, max_f1_threshold_mf, log_loss_mf = get_metrics(y_test, prd_mf)
auc_mfb, max_f1_d, max_f1_threshold_mfb, log_loss_mfb = get_metrics(y_test, prd_mfb)
auc_mlp, max_f1_mlp, max_f1_threshold_mlp, log_loss_mlp = get_metrics(y_test, prd_mlp)

ValueError: multiclass format is not supported