In [1]:
import numpy as np
import pandas as pd
import os
import argparse
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
import time

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras.layers import Input,Activation, concatenate, Embedding, Reshape
from keras.layers import Merge, Flatten, merge, Lambda, Dropout
from keras.layers.normalization import BatchNormalization
from keras.models import Model
from keras.models import model_from_json
from keras.models import load_model
from keras.regularizers import l2, l1_l2
import tensorflow as tf
import gc

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
#Importing Instacart data
df_full = pd.read_csv('/Users/BharathiSrinivasan/Documents/GitHub/Thesis/merged_data.csv', index_col = False)

In [3]:
#Sampling a fraction for data for initial training
df_small = df_full.sample(frac = 0.01)
print(df_small.shape)

(338191, 14)


In [4]:
#Investigating NAs
print(df_small.isnull().sum().sum())

0


In [5]:
CATEGORICAL_COLUMNS = ["order_dow", "order_hour_of_day"]
CONTINUOUS_COLUMNS = ["days_since_prior_order","order_number","add_to_cart_order"]
EMBEDDING_COLUMNS = ["user_id", "product_id","aisle_id","department_id"]

In [6]:
#One-hot encoding categorical columns
df_small = pd.get_dummies(df_small, columns=[x for x in CATEGORICAL_COLUMNS])

In [7]:
#Normalising the feature columns
df_small[CONTINUOUS_COLUMNS] = MinMaxScaler().fit_transform(df_small[CONTINUOUS_COLUMNS].values)


In [8]:
#Drop product names and department names for the time being! Incorporate as embedded vectors later!

df_small.drop(['product_name','department'],axis=1, inplace = True)

In [9]:
df_small.head()

Unnamed: 0.1,Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,order_number,days_since_prior_order,aisle_id,department_id,...,order_hour_of_day_14,order_hour_of_day_15,order_hour_of_day_16,order_hour_of_day_17,order_hour_of_day_18,order_hour_of_day_19,order_hour_of_day_20,order_hour_of_day_21,order_hour_of_day_22,order_hour_of_day_23
2505459,2505459,264199,21137,0.010753,1,142723,0.10101,1.0,24,4,...,0,0,0,0,0,0,0,0,0,0
9974076,9974076,1053143,13964,0.096774,0,42699,0.050505,0.166667,37,1,...,0,0,0,0,0,0,0,0,0,0
23108336,23108336,2437202,6193,0.064516,1,185910,0.383838,0.1,21,16,...,0,0,0,0,0,0,0,0,0,0
20201978,20201978,2130882,1559,0.032258,1,98891,0.020202,0.2,120,16,...,0,0,0,0,0,0,0,0,0,0
6808827,6808827,718647,14306,0.150538,0,180594,0.070707,0.9,110,13,...,0,0,1,0,0,0,0,0,0,0


In [10]:
df_small.drop(['Unnamed: 0'], axis = 1, inplace = True)

In [11]:
#Helper to index columns before embeddings
def val2idx(df, cols):
    val_types = dict()
    for c in cols:
        val_types[c] = df[c].unique()

    val_to_idx = dict()
    for k, v in val_types.items():
        val_to_idx[k] = {o: i for i, o in enumerate(val_types[k])}

    for k, v in val_to_idx.items():
        df[k] = df[k].apply(lambda x: v[x])

    unique_vals = dict()
    for c in cols:
        unique_vals[c] = df[c].nunique()

    return df, unique_vals

In [12]:
#Using Keras layer to create Embeddings
def embedding_input(name, n_in, n_out, reg):
    inp = Input(shape=(1,), dtype='int64', name=name)
    return inp, Embedding(n_in, n_out, input_length=1, embeddings_regularizer=l2(reg))(inp)

In [13]:
#Input layers for continuous vectors to the deep network
def continous_input(name):
    inp = Input(shape=(1,), dtype='float32', name=name)
    return inp, Reshape((1, 1))(inp)

In [14]:
df_small.isnull().sum().sum()

0

In [15]:
df_deep, unique_vals = val2idx(df_small, EMBEDDING_COLUMNS)

In [16]:
df_deep.head

<bound method NDFrame.head of           order_id  product_id  add_to_cart_order  reordered  user_id  \
2505459     264199           0           0.010753          1        0   
9974076    1053143           1           0.096774          0        1   
23108336   2437202           2           0.064516          1        2   
20201978   2130882           3           0.032258          1        3   
6808827     718647           4           0.150538          0        4   
33168623   1805524           5           0.129032          0        5   
24187239   2550975           6           0.010753          1        6   
14888893   1570962           7           0.000000          1        7   
1872704     197499           0           0.043011          1        8   
1312464     138440           8           0.129032          0        9   
5505383     581077           9           0.161290          0       10   
21411641   2258508          10           0.000000          0       11   
890090       94136   

In [17]:
#Target variable
y = df_deep['reordered'].values
df_deep.drop(['reordered'], axis=1, inplace = True)

In [18]:
X_train, X_test, y_train, y_test = train_test_split(df_deep, y, test_size=0.25, random_state=42, stratify=y)

In [19]:
X_train.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,user_id,order_number,days_since_prior_order,aisle_id,department_id,order_dow_0,order_dow_1,...,order_hour_of_day_14,order_hour_of_day_15,order_hour_of_day_16,order_hour_of_day_17,order_hour_of_day_18,order_hour_of_day_19,order_hour_of_day_20,order_hour_of_day_21,order_hour_of_day_22,order_hour_of_day_23
33197896,1879325,468,0.032258,20294,0.151515,0.066667,16,0,0,0,...,0,0,1,0,0,0,0,0,0,0
6956808,734549,1522,0.021505,5769,0.262626,0.2,15,0,1,0,...,0,0,0,0,0,0,0,0,0,0
30737153,3242185,6493,0.0,5158,0.070707,1.0,54,3,0,0,...,0,0,0,0,0,0,0,0,0,0
29062642,3065077,8949,0.849462,25460,0.272727,0.1,63,12,0,0,...,0,0,0,0,0,0,0,0,1,0
10668936,1126287,344,0.043011,46988,0.090909,0.366667,26,3,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
#Defining input column for the deep network
#DEEP_COLNS = EMBEDDING_COLUMNS + CONTINUOUS_COLUMNS

In [21]:
#Creating input dataframe for the merged model
X_train_deep = [X_train[c] for c in X_train.columns]
y_train_deep = np.array(y_train).reshape(-1, 1)
X_test_deep = [X_test[c] for c in X_test.columns]
y_test_deep = np.array(y_test).reshape(-1, 1)



In [87]:
#Building input tensors for deep network
embeddings_tensors = []
n_factors = 8
reg = 1e-3

for ec in EMBEDDING_COLUMNS:
    layer_name = ec + '_inp'
    t_inp, t_build = embedding_input(layer_name, unique_vals[ec], n_factors, reg)
    embeddings_tensors.append((t_inp, t_build))
    del(t_inp, t_build)
    
continuous_tensors = []
for cc not in EMBEDDING_COLUMNS:
    layer_name = cc + '_in'
    t_inp, t_build = continous_input(layer_name)
    continuous_tensors.append((t_inp, t_build))
    del(t_inp, t_build)
    
deep_inp_layer =  [et[0] for et in embeddings_tensors]
deep_inp_layer += [ct[0] for ct in continuous_tensors]
deep_inp_embed =  [et[1] for et in embeddings_tensors]
deep_inp_embed += [ct[1] for ct in continuous_tensors]

SyntaxError: invalid syntax (<ipython-input-87-336758d46629>, line 13)

In [22]:
#Building input tensors for deep network
input_tensors = []

n_factors = 8
reg = 1e-3

for col in X_train.columns:
    if col in EMBEDDING_COLUMNS:
        layer_name = col + '_inp'
        t_inp, t_build = embedding_input(layer_name, unique_vals[col], n_factors, reg)
        input_tensors.append((t_inp, t_build))
        del(t_inp, t_build)
    else:
        layer_name = col + '_in'
        t_inp, t_build = continous_input(layer_name)
        input_tensors.append((t_inp, t_build))
        del(t_inp, t_build)

In [23]:
deep_input_tensors =  [t[0] for t in input_tensors]
deep_merge_tensors = [t[1] for t in input_tensors]

In [39]:
#Modeling vanilla neural network with dropout
dropout_prob = 0.2
output_classes = 1

#Layers
x = merge(deep_merge_tensors, mode = 'concat')
x = Flatten()(x)
# 2_. layer to normalise continous columns with the embeddings
x = BatchNormalization()(x)
x = Dropout(dropout_prob)(x,training=True)
x = Dense(100, activation='relu', kernel_regularizer=l1_l2(l1=0.01, l2=0.01))(x)
x = BatchNormalization()(x)
x = Dropout(dropout_prob)(x, training=True)
x = Dense(50, activation='relu')(x)

x = BatchNormalization()(x)
x = Dropout(dropout_prob)(x, training=True)

logits = Dense(output_classes)(x)
variance_pre = Dense(1)(x)
variance = Activation('softplus', name='variance')(variance_pre)
logits_variance = concatenate([logits, variance], name='logits_variance')
softmax_output = Activation('softmax', name='softmax_output')(logits)

model = Model(inputs=deep_input_tensors, outputs=[softmax_output])


  
  name=name)


In [40]:
print(model.summary())

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
order_id_in (InputLayer)        (None, 1)            0                                            
__________________________________________________________________________________________________
product_id_inp (InputLayer)     (None, 1)            0                                            
__________________________________________________________________________________________________
add_to_cart_order_in (InputLaye (None, 1)            0                                            
__________________________________________________________________________________________________
user_id_inp (InputLayer)        (None, 1)            0                                            
__________________________________________________________________________________________________
order_numb

In [41]:
from keras.utils.vis_utils import plot_model

In [42]:
model.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy'])

In [36]:
len(X_train_deep)

39

In [37]:
len(y_train_deep)

253643

In [43]:
model.fit(X_train_deep, y_train_deep, batch_size=64, epochs=1)

Epoch 1/1


<keras.callbacks.History at 0x1121c8cc0>

In [44]:
def CrossEntropy(yHat, y):
    if y == 1:
          return -log(yHat)
    else:
          return -log(1 - yHat)

In [49]:
def predictor(model, 
              X_test = X_test_deep, y_test = y_test_deep, T = 10):
    probs_mc_dropout = []
    for _ in range(T):
        probs_mc_dropout += [model.predict(X_test,verbose=0)]
    #predictive_mean = np.mean(probs_mc_dropout, axis=0)
    #predictive_variance = np.var(probs_mc_dropout, axis=0)
    #logloss_mc_dropout = CrossEntropy(predictive_mean, y_test)
    #print(logloss_mc_dropout)
  
    return probs_mc_dropout

In [50]:
probs_mc_dropout = predictor(model)

In [51]:
predictive_mean = np.mean(probs_mc_dropout, axis=0)
predictive_variance = np.var(probs_mc_dropout, axis=0)

In [57]:
from sklearn import metrics
print(metrics.precision_score(y_test_deep, predictive_mean))

0.5878554194067276


In [58]:
print(metrics.recall_score(y_test_deep, predictive_mean))

1.0


In [60]:
print(metrics.roc_auc_score(y_test_deep, predictive_mean))

0.5
