In [1]:
import numpy as np
import pandas as pd
import os
import argparse
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
import time

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras.layers import Input, concatenate, Embedding, Reshape
from keras.layers import Merge, Flatten, merge, Lambda, Dropout
from keras.layers.normalization import BatchNormalization
from keras.models import Model
from keras.regularizers import l2, l1_l2
import tensorflow as tf
import gc

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
myfolder = 'F:/rs/Recommender_DNN/input/'


print('loading files ...')

prior = pd.read_csv(myfolder + 'order_products__prior.csv', dtype={'order_id': np.uint32,
           'product_id': np.uint16, 'reordered': np.uint8, 'add_to_cart_order': np.uint8})

train_orders = pd.read_csv(myfolder + 'order_products__train.csv', dtype={'order_id': np.uint32,
           'product_id': np.uint16, 'reordered': np.int8, 'add_to_cart_order': np.uint8 })

orders = pd.read_csv(myfolder + 'orders.csv', dtype={'order_hour_of_day': np.uint8,
           'order_number': np.uint8, 'order_id': np.uint32, 'user_id': np.uint32,
           'order_dow': np.uint8, 'days_since_prior_order': np.float16})

orders.eval_set = orders.eval_set.replace({'prior': 0, 'train': 1, 'test':2}).astype(np.uint8)
orders.days_since_prior_order = orders.days_since_prior_order.fillna(30).astype(np.uint8)

products = pd.read_csv(myfolder + 'products.csv', dtype={'product_id': np.uint16,
            'aisle_id': np.uint8, 'department_id': np.uint8},
             usecols=['product_id', 'aisle_id', 'department_id'])

print('done loading')


print('merge prior and orders and keep train separate ...')

orders_products = orders.merge(prior, how = 'inner', on = 'order_id')
train_orders = train_orders.merge(orders[['user_id','order_id']], left_on = 'order_id', right_on = 'order_id', how = 'inner')


del prior
gc.collect()

loading files ...
done loading
merge prior and orders and keep train separate ...


35

In [3]:
print('Creating features I ...')

# sort orders and products to get the rank or the reorder frequency
prdss = orders_products.sort_values(['user_id', 'order_number', 'product_id'], ascending=True)
prdss['product_time'] = prdss.groupby(['user_id', 'product_id']).cumcount()+1

# getting products ordered first and second times to calculate probability later
sub1 = prdss[prdss['product_time'] == 1].groupby('product_id').size().to_frame('prod_first_orders')
sub2 = prdss[prdss['product_time'] == 2].groupby('product_id').size().to_frame('prod_second_orders')
sub1['prod_orders'] = prdss.groupby('product_id')['product_id'].size()
sub1['prod_reorders'] = prdss.groupby('product_id')['reordered'].sum()
sub2 = sub2.reset_index().merge(sub1.reset_index())
sub2['prod_reorder_probability'] = sub2['prod_second_orders']/sub2['prod_first_orders']
sub2['prod_reorder_ratio'] = sub2['prod_reorders']/sub2['prod_orders']
prd = sub2[['product_id', 'prod_orders','prod_reorder_probability', 'prod_reorder_ratio']]

del sub1, sub2, prdss
gc.collect()

Creating features I ...


196

In [4]:
print('Creating features II ...')

# extracting prior information (features) by user
users = orders[orders['eval_set'] == 0].groupby(['user_id'])['order_number'].max().to_frame('user_orders')
users['user_period'] = orders[orders['eval_set'] == 0].groupby(['user_id'])['days_since_prior_order'].sum()
users['user_mean_days_since_prior'] = orders[orders['eval_set'] == 0].groupby(['user_id'])['days_since_prior_order'].mean()

# merging features about users and orders into one dataset
us = orders_products.groupby('user_id').size().to_frame('user_total_products')
us['eq_1'] = orders_products[orders_products['reordered'] == 1].groupby('user_id')['product_id'].size()
us['gt_1'] = orders_products[orders_products['order_number'] > 1].groupby('user_id')['product_id'].size()
us.drop(['eq_1', 'gt_1'], axis = 1, inplace = True)
us['user_distinct_products'] = orders_products.groupby(['user_id'])['product_id'].nunique()

# the average basket size of the user
users = users.reset_index().merge(us.reset_index())
users['user_average_basket'] = users['user_total_products'] / users['user_orders']

us = orders[orders['eval_set'] != 0]
us = us[['user_id', 'order_id', 'eval_set', 'days_since_prior_order']]
users = users.merge(us)

del us
gc.collect()

Creating features II ...


95

In [5]:
print('Finalizing features and the main data file  ...')
# merging orders and products and grouping by user and product and calculating features for the user/product combination
data = orders_products.groupby(['user_id', 'product_id']).size().to_frame('up_orders')
data['up_first_order'] = orders_products.groupby(['user_id', 'product_id'])['order_number'].min()
data['up_last_order'] = orders_products.groupby(['user_id', 'product_id'])['order_number'].max()
data['up_average_cart_position'] = orders_products.groupby(['user_id', 'product_id'])['add_to_cart_order'].mean()
data = data.reset_index()

#merging previous data with users
data = data.merge(prd, on = 'product_id')
data = data.merge(users, on = 'user_id')

#user/product combination features about the particular order
data['up_order_rate'] = data['up_orders'] / data['user_orders']
data['up_orders_since_last_order'] = data['user_orders'] - data['up_last_order']
data = data.merge(train_orders[['user_id', 'product_id', 'reordered']],
                  how = 'left', on = ['user_id', 'product_id'])
data = data.merge(products, on = 'product_id')
del users,prd,products
     #, orders, train_orders
gc.collect()


Finalizing features and the main data file  ...


77

In [6]:
data = data.astype(dtype= {'user_id' : np.uint32, 'product_id'  : np.uint16,
            'up_orders'  : np.uint8, 'up_first_order' : np.uint8, 'up_last_order' : np.uint8,
            'up_average_cart_position' : np.uint8, 'prod_orders' : np.uint16,
            'prod_reorder_probability' : np.float16,
            'user_orders' : np.uint8,
            'user_period' : np.uint8, 'user_mean_days_since_prior' : np.uint8,
            'user_total_products' : np.uint8, 
            'user_distinct_products' : np.uint8, 'user_average_basket' : np.uint8,
            'order_id'  : np.uint32, 'eval_set' : np.uint8,
            'days_since_prior_order' : np.uint8, 'up_order_rate' : np.float16,
            'up_orders_since_last_order':np.uint8,
            'aisle_id': np.uint8, 'department_id': np.uint8})

data['reordered'].fillna(0, inplace=True)  # replace NaN with zeros (not reordered)
data['reordered']=data['reordered'].astype(np.uint8)

gc.collect()


7

In [7]:
df = data.merge(orders_products[['user_id', 'product_id',"order_dow", "order_hour_of_day"]],
                  how = 'left', on = ['user_id', 'product_id'])
del orders_products
del data


In [8]:
df = df.sample(frac=0.3, random_state= 123)
gc.collect()

91

In [9]:
df.drop(['eval_set'],axis=1)
y = df['reordered'].values


In [10]:
CATEGORICAL_COLUMNS = ["order_dow", "order_hour_of_day"]
CONTINUOUS_COLUMNS = [ "user_orders", "days_since_prior_order","up_orders","up_first_order","up_last_order","up_average_cart_position","prod_orders","prod_reorder_probability","prod_reorder_ratio","user_period","user_distinct_products","user_mean_days_since_prior","user_total_products", "user_average_basket","up_order_rate","up_orders_since_last_order"]
EMBEDDING_COLUMNS = ["user_id", "product_id","order_id","aisle_id","department_id"]

In [11]:
#Normalising the feature columns
df = pd.DataFrame(MinMaxScaler().fit_transform(df), columns=df.columns)
gc.collect()

0

In [12]:
#One-hot encoding categorical columns
df = pd.get_dummies(df, columns=[x for x in CATEGORICAL_COLUMNS])
gc.collect()

21

In [13]:
#Helper to index columns before embeddings
def val2idx(df, cols):
    val_types = dict()
    for c in cols:
        val_types[c] = df[c].unique()

    val_to_idx = dict()
    for k, v in val_types.items():
        val_to_idx[k] = {o: i for i, o in enumerate(val_types[k])}

    for k, v in val_to_idx.items():
        df[k] = df[k].apply(lambda x: v[x])

    unique_vals = dict()
    for c in cols:
        unique_vals[c] = df[c].nunique()

    return df, unique_vals

In [14]:
#Using Keras layer to create Embeddings
def embedding_input(name, n_in, n_out, reg):
    inp = Input(shape=(1,), dtype='int64', name=name)
    return inp, Embedding(n_in, n_out, input_length=1, embeddings_regularizer=l2(reg))(inp)

In [15]:
#Input layers for continuous vectors to the deep network
def continous_input(name):
    inp = Input(shape=(1,), dtype='float32', name=name)
    return inp, Reshape((1, 1))(inp)

In [16]:
#Splitting datasets into train and test
df.reset_index()
gc.collect()
X = df.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [18]:

# simply connecting the features to an output layer
wide_inp = Input(shape=(X_train.shape[1],), dtype='float32', name='wide_inp')
x = Dropout(0.2)(wide_inp)
wide_out = Dense(1, activation='sigmoid')(x)
wide = Model(wide_inp, wide_out)
wide.compile(Adam(0.1), loss='binary_crossentropy', metrics=['accuracy'])
wide.fit(X_train, y_train, epochs=1,batch_size =64)
results = wide.evaluate(X_test, y_test)

print("\n", results)

Epoch 1/1

 [0.018693832841810188, 0.9999658484279167]


In [19]:

# serialize linear model to JSON
wide_json = wide.to_json()
with open("wide.json", "w") as json_file:
    json_file.write(wide_json)
# serialize weights to HDF5
wide.save_weights("wide.h5")
print("Saved model to disk")

Saved model to disk


In [50]:
df_deep, unique_vals = val2idx(df, EMBEDDING_COLUMNS)
X_deep_tr, X_deep_te, y_deep_tr, y_deep_te = train_test_split(df_deep, y, test_size=0.25, random_state=42)

In [58]:
#Defining input column for the deep network
DEEP_COLNS = EMBEDDING_COLUMNS + CONTINUOUS_COLUMNS

In [59]:
#Creating input dataframe for the merged model
X_train_deep = [X_deep_tr[c] for c in DEEP_COLNS]
y_train_deep = np.array(y_deep_tr).reshape(-1, 1)
X_test_deep = [X_deep_te[c] for c in DEEP_COLNS]
y_test_deep = np.array(y_deep_te).reshape(-1, 1)

In [60]:
#Building input tensors for deep network
embeddings_tensors = []
n_factors = 8
reg = 1e-3

for ec in EMBEDDING_COLUMNS:
    layer_name = ec + '_inp'
    t_inp, t_build = embedding_input(
    layer_name, unique_vals[ec], n_factors, reg)
    embeddings_tensors.append((t_inp, t_build))
    del(t_inp, t_build)
    
continuous_tensors = []
for cc in CONTINUOUS_COLUMNS:
    layer_name = cc + '_in'
    t_inp, t_build = continous_input(layer_name)
    continuous_tensors.append((t_inp, t_build))
    del(t_inp, t_build)

In [61]:
#Building inputs for deep network
deep_inp_layer =  [et[0] for et in embeddings_tensors]
deep_inp_layer += [ct[0] for ct in continuous_tensors]
deep_inp_embed =  [et[1] for et in embeddings_tensors]
deep_inp_embed += [ct[1] for ct in continuous_tensors]

In [62]:
#Modeling deep network

d = merge(deep_inp_embed, mode='concat')
d = Flatten()(d)
# 2_. layer to normalise continous columns with the embeddings
d = BatchNormalization()(d)
d = Dense(100, activation='relu', kernel_regularizer=l1_l2(l1=0.01, l2=0.01))(d)
d = Dense(50, activation='relu')(d)

deep_out = Dense(y_train_deep.shape[1], activation='sigmoid')(d)
deep = Model(deep_inp_layer, deep_out)
deep.compile(Adam(0.1), loss='binary_crossentropy', metrics=['accuracy'])
deep.fit(X_train_deep, y_train_deep, batch_size=64, epochs=1)
results = deep.evaluate(X_test_deep, y_test_deep)

print ("\n", results)

  This is separate from the ipykernel package so we can avoid doing imports until
  name=name)
  del sys.path[0]


Epoch 1/1

 [0.42280053404732637, 0.8694220813367007]


In [64]:
# serialize deep model to JSON
deep_json = deep.to_json()
with open("deep.json", "w") as json_file:
    json_file.write(deep_json)
# serialize weights to HDF5
deep.save_weights("deep.h5")
print("Saved model to disk")

Saved model to disk


In [65]:
#Inputs
X_tr_wd = [X_train] + X_train_deep
Y_tr_wd = y_train_deep  # wide or deep is the same here
X_te_wd = [X_test] + X_test_deep
Y_te_wd = y_test_deep  # wide or deep is the same here

In [68]:
# WIDE
w = Input(shape=(X_train.shape[1],), dtype='float32', name='wide')

# WIDE + DEEP
wd_inp = concatenate([w, d])
x = Dropout(0.2)(wd_inp)
wd_out = Dense(Y_tr_wd.shape[1], activation='sigmoid', name='wide_deep')(x)

wide_deep = Model(inputs=[w] + deep_inp_layer, outputs=wd_out)

wide_deep.compile(optimizer=Adam(lr=0.1), loss='binary_crossentropy', metrics=['accuracy'])
wide_deep.fit(X_tr_wd, Y_tr_wd,nb_epoch=1, batch_size=128)

results = wide_deep.evaluate(X_te_wd, Y_te_wd)

print( "\n", results)

  # This is added back by InteractiveShellApp.init_path()


Epoch 1/1

 [0.0725073088209857, 1.0]


In [69]:
# serialize model to JSON
wide_deep_json = wide_deep.to_json()
with open("wide_deep.json", "w") as json_file:
    json_file.write(wide_deep_json)
# serialize weights to HDF5
wide_deep.save_weights("wide_deep.h5")
print("Saved model to disk")

Saved model to disk


In [None]:
#Loading pre-trained models

# load json and create model
json_file = open('model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("model.h5")
print("Loaded model from disk")