In [1]:
import numpy as np
import pandas as pd
import os
import argparse
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
import time

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras.layers import Input, concatenate, Embedding, Reshape
from keras.layers import Merge, Flatten, merge, Lambda, Dropout
from keras.layers.normalization import BatchNormalization
from keras.models import Model
from keras.models import model_from_json
from keras.models import load_model
from keras.regularizers import l2, l1_l2
import tensorflow as tf
import gc

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [13]:
#Importing Instacart data
# The CSV location can be overridden with the INSTACART_DATA_PATH environment
# variable so the notebook is runnable on machines other than the author's.
# The original absolute path is kept as the default for backward compatibility.
DATA_PATH = os.environ.get(
    'INSTACART_DATA_PATH',
    '/Users/BharathiSrinivasan/Documents/GitHub/Thesis/merged_data.csv',
)
df = pd.read_csv(DATA_PATH)

In [14]:
# Preview the first five rows of the merged data.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle_id,department_id,department
0,0,2,33120,1,1,202279,3,5,9,8.0,Organic Egg Whites,86,16,dairy eggs
1,1,2,28985,2,1,202279,3,5,9,8.0,Michigan Organic Kale,83,4,produce
2,2,2,9327,3,0,202279,3,5,9,8.0,Garlic Powder,104,13,pantry
3,3,2,45918,4,1,202279,3,5,9,8.0,Coconut Butter,19,13,pantry
4,4,2,30035,5,0,202279,3,5,9,8.0,Natural Sweetener,17,13,pantry


In [15]:
# Columns that are one-hot encoded.
CATEGORICAL_COLUMNS = [
    'order_dow',
    'order_hour_of_day',
]
# Numeric columns that are min-max scaled and fed to the network as scalars.
CONTINUOUS_COLUMNS = [
    'days_since_prior_order',
    'order_number',
    'add_to_cart_order',
]
# ID columns that are integer-indexed and passed through Embedding layers.
EMBEDDING_COLUMNS = [
    'user_id',
    'product_id',
    'aisle_id',
    'department_id',
]

In [16]:
# One-hot encode the categorical columns; each listed column is replaced by
# one indicator column per distinct value.
df = pd.get_dummies(df, columns=list(CATEGORICAL_COLUMNS))

In [17]:
#Normalising the feature columns
# Scale each continuous column to [0, 1].
# The scaled frame is given df's own index: assignment of a DataFrame aligns
# on index labels, so a fresh RangeIndex would silently produce NaNs/misplaced
# rows whenever df's index is not 0..n-1 (e.g. after filtering or sampling).
# NOTE(review): the scaler is fit on the full dataset, before the train/test
# split performed later in this notebook, so test-set ranges influence the
# scaling. Consider fitting on the training rows only.
scaled_values = MinMaxScaler().fit_transform(df[CONTINUOUS_COLUMNS])
df[CONTINUOUS_COLUMNS] = pd.DataFrame(
    scaled_values, columns=CONTINUOUS_COLUMNS, index=df.index
)

In [18]:
#Helper to index columns before embeddings
def val2idx(df, cols):
    """Replace the values of the given columns with contiguous integer codes.

    Codes are assigned per column in order of first appearance (the first
    distinct value becomes 0, the next new value 1, and so on), which makes
    the columns usable as indices into an embedding table.

    Args:
        df: DataFrame holding the columns. It is modified in place.
        cols: iterable of column names to index.

    Returns:
        A tuple ``(df, unique_vals)`` where ``df`` is the same (mutated)
        object that was passed in and ``unique_vals`` maps each column name
        to the number of distinct codes in that column.
    """
    unique_vals = dict()
    for c in cols:
        # unique() preserves first-appearance order, so enumerate() yields
        # stable codes 0..n-1.
        codes = {value: idx for idx, value in enumerate(df[c].unique())}
        # Vectorised dictionary lookup; avoids a Python-level call per row.
        df[c] = df[c].map(codes)
        unique_vals[c] = df[c].nunique()

    return df, unique_vals

In [19]:
# Embedding inputs built from Keras layers
def embedding_input(name, n_in, n_out, reg):
    """Build a single-integer input and its L2-regularised embedding.

    Args:
        name: name given to the Input layer.
        n_in: first argument to ``Embedding`` (size of the index vocabulary).
        n_out: second argument to ``Embedding`` (length of each vector).
        reg: factor passed to the ``l2`` regulariser on the embedding weights.

    Returns:
        Tuple of (input tensor, embedding output tensor).
    """
    id_input = Input(shape=(1,), dtype='int64', name=name)
    embedding_layer = Embedding(
        n_in,
        n_out,
        input_length=1,
        embeddings_regularizer=l2(reg),
    )
    return id_input, embedding_layer(id_input)

In [20]:
# Scalar (continuous) inputs for the deep network
def continous_input(name):
    """Build a single-float input and a (1, 1)-reshaped view of it.

    Args:
        name: name given to the Input layer.

    Returns:
        Tuple of (input tensor, tensor reshaped to ``(1, 1)`` per sample).
    """
    scalar_input = Input(shape=(1,), dtype='float32', name=name)
    reshaped = Reshape((1, 1))(scalar_input)
    return scalar_input, reshaped

In [21]:
# Integer-index the embedding columns. val2idx mutates and returns the frame
# it is given, so df_deep refers to the same object as df.
df_deep, unique_vals = val2idx(df=df, cols=EMBEDDING_COLUMNS)

In [23]:
# Target variable: pull the label out as an array and remove its column
# from the frame in one step.
y = df.pop('reordered').values

In [24]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_deep_tr, X_deep_te, y_deep_tr, y_deep_te = train_test_split(
    df_deep,
    y,
    test_size=0.25,
    random_state=42,
)

In [25]:
# Input columns for the deep network: embedding columns first, then the
# continuous ones.
DEEP_COLNS = list(EMBEDDING_COLUMNS) + list(CONTINUOUS_COLUMNS)

In [26]:
# Model inputs: one column per entry of DEEP_COLNS, in that order, for the
# train and test partitions.
X_train_deep = [X_deep_tr[col] for col in DEEP_COLNS]
X_test_deep = [X_deep_te[col] for col in DEEP_COLNS]

# Targets as column vectors of shape (n_samples, 1).
y_train_deep = np.array(y_deep_tr).reshape((-1, 1))
y_test_deep = np.array(y_deep_te).reshape((-1, 1))

In [27]:
# Input tensors for the deep network
n_factors = 8   # length of every embedding vector
reg = 1e-3      # L2 factor applied to the embedding weights

# One (input, embedding) pair per embedding column.
embeddings_tensors = [
    embedding_input(col + '_inp', unique_vals[col], n_factors, reg)
    for col in EMBEDDING_COLUMNS
]

# One (input, reshaped) pair per continuous column.
continuous_tensors = [
    continous_input(col + '_in')
    for col in CONTINUOUS_COLUMNS
]

# Model inputs and the tensors to be merged, embeddings first and continuous
# columns second.
deep_inp_layer = [pair[0] for pair in embeddings_tensors + continuous_tensors]
deep_inp_embed = [pair[1] for pair in embeddings_tensors + continuous_tensors]

In [None]:
#Modeling deep network

# Join the embedding outputs and the reshaped continuous inputs along the
# last axis. `concatenate` (already imported) replaces the legacy
# `merge(..., mode='concat')` helper, which newer Keras releases no longer
# provide; both concatenate on axis -1 by default.
d = concatenate(deep_inp_embed)
d = Flatten()(d)
# Batch-normalise so the continuous columns and the embeddings are on a
# comparable scale before the dense layers.
d = BatchNormalization()(d)
d = Dense(100, activation='relu', kernel_regularizer=l1_l2(l1=0.01, l2=0.01))(d)
d = Dense(50, activation='relu')(d)

# One sigmoid unit per target column (y_train_deep has a single column).
deep_out = Dense(y_train_deep.shape[1], activation='sigmoid')(d)
deep = Model(deep_inp_layer, deep_out)
# NOTE(review): a learning rate of 0.1 is far above Adam's documented default
# (0.001) and may prevent convergence - confirm this value is intentional.
deep.compile(Adam(0.1), loss='binary_crossentropy', metrics=['accuracy'])
deep.fit(X_train_deep, y_train_deep, batch_size=64, epochs=1)
results = deep.evaluate(X_test_deep, y_test_deep)

print("\n", results)