In [1]:
import numpy as np
import pandas as pd
import os
import argparse
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
import time

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras.layers import Input, concatenate, Embedding, Reshape
from keras.layers import Merge, Flatten, merge, Lambda, Dropout
from keras.layers.normalization import BatchNormalization
from keras.models import Model
from keras.models import model_from_json
from keras.models import load_model
from keras.regularizers import l2, l1_l2
from keras import backend as K
from sklearn import metrics
import tensorflow as tf
import gc

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Input directory. Defaults to the original location; set the RECSYS_INPUT_DIR
# environment variable to run the notebook against a copy stored elsewhere.
myfolder = os.environ.get('RECSYS_INPUT_DIR', 'F:/rs/Recommender_DNN/input/')
df = pd.read_csv(os.path.join(myfolder, 'data_subset.csv'))
# Separate the target column from the feature columns.
y = df['reordered'].values
del df['reordered']
df.columns

Index(['user_id', 'product_id', 'up_orders', 'up_first_order', 'up_last_order',
       'up_average_cart_position', 'prod_orders', 'prod_reorder_probability',
       'prod_reorder_ratio', 'user_orders', 'user_period',
       'user_mean_days_since_prior', 'user_total_products',
       'user_distinct_products', 'user_average_basket', 'order_id',
       'days_since_prior_order', 'up_order_rate', 'up_orders_since_last_order',
       'aisle_id', 'department_id', 'order_dow', 'order_hour_of_day'],
      dtype='object')

In [3]:
# Remove the two product-level reorder statistics; neither one appears in the
# feature lists used by the models.
df.drop(['prod_reorder_probability', 'prod_reorder_ratio'], axis=1, inplace=True)

In [4]:
# Columns that get one-hot encoded.
CATEGORICAL_COLUMNS = [
    "order_dow",
    "order_hour_of_day",
]

# Numeric columns that get rescaled.
# "prod_reorder_ratio" and "prod_reorder_probability" are deliberately left out.
CONTINUOUS_COLUMNS = [
    "user_orders",
    "days_since_prior_order",
    "up_orders",
    "up_first_order",
    "up_last_order",
    "up_average_cart_position",
    "prod_orders",
    "user_period",
    "user_distinct_products",
    "user_mean_days_since_prior",
    "user_total_products",
    "user_average_basket",
    "up_order_rate",
    "up_orders_since_last_order",
]

# Id-like columns that are fed through embedding layers.
EMBEDDING_COLUMNS = [
    "user_id",
    "product_id",
    "aisle_id",
    "department_id",
]

In [5]:
# Expand every categorical column into one indicator column per observed value;
# the original categorical columns are replaced by their indicators.
df = pd.get_dummies(df, columns=list(CATEGORICAL_COLUMNS))
gc.collect()

35

In [6]:
# Rescale the continuous feature columns with MinMaxScaler (default settings).
# The scaled frame is given df's own index: column assignment aligns on index
# labels, so a frame with a fresh 0..n-1 index would produce NaNs as soon as df
# carries any other index.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test split.
_scaled = MinMaxScaler().fit_transform(df[CONTINUOUS_COLUMNS])
df[CONTINUOUS_COLUMNS] = pd.DataFrame(_scaled, columns=CONTINUOUS_COLUMNS, index=df.index)
gc.collect()

14

In [7]:
#Helper to index columns before embeddings
def val2idx(df, cols):
    """Replace the values of ``cols`` with dense integer indices, in place.

    Within each column, every distinct value receives an index starting at 0,
    numbered in the order ``Series.unique`` reports the values.

    Args:
        df: DataFrame holding the columns. It is modified in place.
        cols: iterable of column names to re-code.

    Returns:
        ``(df, unique_vals)``: the same (mutated) DataFrame object, and a dict
        mapping each column name to its number of distinct values after
        re-coding.
    """
    unique_vals = dict()
    for c in cols:
        # value -> index, numbered in order of first appearance
        val_to_idx = {o: i for i, o in enumerate(df[c].unique())}
        # A dict lookup through Series.map avoids calling a Python lambda per row.
        df[c] = df[c].map(val_to_idx)
        unique_vals[c] = df[c].nunique()

    return df, unique_vals
#Using Keras layer to create Embeddings
def embedding_input(name, n_in, n_out, reg):
    """Build a one-element integer input and an embedding applied to it.

    Args:
        name: name given to the Keras ``Input`` layer.
        n_in: first argument of ``Embedding`` (size of the index vocabulary).
        n_out: second argument of ``Embedding`` (embedding width).
        reg: factor passed to ``l2`` for the embedding weights regulariser.

    Returns:
        ``(input_tensor, embedded_tensor)``.
    """
    id_input = Input(shape=(1,), dtype='int64', name=name)
    embedding_layer = Embedding(
        n_in,
        n_out,
        input_length=1,
        embeddings_regularizer=l2(reg),
    )
    return id_input, embedding_layer(id_input)
#Input layers for continuous vectors to the deep network
def continous_input(name):
    """Build a one-element float input and a ``(1, 1)``-reshaped view of it.

    Args:
        name: name given to the Keras ``Input`` layer.

    Returns:
        ``(input_tensor, reshaped_tensor)``.
    """
    scalar_input = Input(shape=(1,), dtype='float32', name=name)
    reshaped = Reshape((1, 1))(scalar_input)
    return scalar_input, reshaped

In [8]:
#Splitting datasets into train and test
# reset_index() returns a new frame instead of modifying df, so the result has
# to be kept; drop=True prevents the old index from being added as a column.
df = df.reset_index(drop=True)
gc.collect()
# Wide-model features: every column except the identifier columns.
# NOTE: Index.difference returns the remaining labels sorted when it can, so
# the column order of X follows that result, not the original frame order.
X = df[df.columns.difference(['user_id','product_id','order_id'])].values
# test_size / random_state must stay in sync with the split used for the deep
# inputs so that both splits select the same rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [9]:
# Columns consumed by the deep network: embedded ids first, then the numerics.
DEEP_COLNS = EMBEDDING_COLUMNS + CONTINUOUS_COLUMNS

# Re-code each embedding column as integer indices (val2idx also rewrites df itself).
df_deep, unique_vals = val2idx(df, EMBEDDING_COLUMNS)

# Same test_size and random_state as the split of the wide feature matrix.
(X_deep_tr, X_deep_te,
 y_deep_tr, y_deep_te) = train_test_split(df_deep, y, test_size=0.25, random_state=42)

# One Series per deep input column, in DEEP_COLNS order; targets as column vectors.
X_train_deep = [X_deep_tr[col] for col in DEEP_COLNS]
X_test_deep = [X_deep_te[col] for col in DEEP_COLNS]
y_train_deep = np.array(y_deep_tr).reshape(-1, 1)
y_test_deep = np.array(y_deep_te).reshape(-1, 1)
gc.collect()

# Wide & deep inputs: the wide matrix followed by the deep input list.
# The targets are shared with the deep model.
X_tr_wd = [X_train, *X_train_deep]
X_te_wd = [X_test, *X_test_deep]
Y_tr_wd = y_train_deep
Y_te_wd = y_test_deep

In [10]:
#Loading pre-trained models

def _load_pretrained(stem):
    """Rebuild a model from ``<stem>.json`` and load its weights from ``<stem>.h5``.

    Args:
        stem: file name without extension, relative to the working directory.

    Returns:
        ``(architecture_json, model)``: the JSON text that was read and the
        model built from it with its weights loaded.
    """
    # The context manager closes the file even if reading raises.
    with open(stem + '.json', 'r') as json_file:
        architecture_json = json_file.read()
    model = model_from_json(architecture_json)
    model.load_weights(stem + '.h5')
    return architecture_json, model


wide_json, wide = _load_pretrained('wide')
deep_json, deep = _load_pretrained('deep')
loaded_model_json, wide_deep = _load_pretrained('wide_deep')
print("Loaded model from disk")

  return cls(**config)


Loaded model from disk


In [11]:
# A model rebuilt from JSON has to be compiled before evaluate() can be called.
# Every model gets its own Adam instance with the same settings.
for _model in (wide, deep, wide_deep):
    _model.compile(
        optimizer=Adam(lr=0.1),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

In [12]:
# evaluate() returns [loss, accuracy] for the compile settings used here.
results_w = wide.evaluate(x=X_test, y=y_test)
print("Wide:", results_w)

results_d = deep.evaluate(x=X_test_deep, y=y_test_deep)
print("Deep:", results_d)

results_wd = wide_deep.evaluate(x=X_te_wd, y=Y_te_wd)
print("Wide and Deep:", results_wd)

Wide: [0.42105097553864146, 0.8075040197682589]
Deep: [1.114770540285856, 0.7945893629856638]
Wide and Deep: [1.0514775766263171, 0.8024233043757173]


In [16]:
def get_metrics(y_test, prd):
    """Print and return ROC AUC, the best F1 with its threshold, and log loss.

    Args:
        y_test: true labels, as accepted by ``sklearn.metrics.roc_curve``.
        prd: predicted scores for the same rows.

    Returns:
        Tuple ``(auc, max_f1, max_f1_threshold, log_loss)``. When no threshold
        reaches an F1 above 0, ``max_f1`` is 0 and ``max_f1_threshold`` is
        ``None``.
    """
    fpr, tpr, _ = metrics.roc_curve(y_test, prd)
    auc = metrics.auc(fpr, tpr)
    print("auc: ",auc)

    precision, recall, thresholds = metrics.precision_recall_curve(y_test, prd)
    # Only the points that have a matching threshold are candidates.
    n_points = len(thresholds)
    p = np.asarray(precision[:n_points], dtype=float)
    r = np.asarray(recall[:n_points], dtype=float)
    denom = p + r
    # F1 per threshold; points with precision + recall == 0 keep an F1 of 0.
    f1 = np.zeros_like(denom)
    np.divide(2 * p * r, denom, out=f1, where=denom != 0)

    max_f1 = 0
    # Stays None when nothing beats an F1 of 0, so the name is always bound.
    max_f1_threshold = None
    if n_points:
        best = int(np.argmax(f1))  # first index that reaches the maximum
        if f1[best] > max_f1:
            max_f1 = f1[best]
            max_f1_threshold = thresholds[best]
    print("max_f1: ",max_f1)
    print("max_f1_threshold: ",max_f1_threshold)

    log_loss = metrics.log_loss(y_test, prd)
    print("log_loss: ",log_loss)
    return auc, max_f1, max_f1_threshold, log_loss

In [13]:
# Test-set predictions of each model, each on its own input format.
prd_w = wide.predict(x=X_test)
prd_d = deep.predict(x=X_test_deep)
prd_wd = wide_deep.predict(x=X_te_wd)

In [17]:
# All three models are scored against y_test.
(auc_w, max_f1_w,
 max_f1_threshold_w, log_loss_w) = get_metrics(y_test=y_test, prd=prd_w)
(auc_d, max_f1_d,
 max_f1_threshold_d, log_loss_d) = get_metrics(y_test=y_test, prd=prd_d)
(auc_wd, max_f1_wd,
 max_f1_threshold_wd, log_loss_wd) = get_metrics(y_test=y_test, prd=prd_wd)

auc:  0.7986973935526261
max_f1:  0.5222984720904563
max_f1_threshold:  0.17564811
log_loss:  0.4210509770792291
auc:  0.5
max_f1:  0.3408143759594003
max_f1_threshold:  0.21945308
log_loss:  0.5083970414376625
auc:  0.8155017773438071
max_f1:  0.5444768983101497
max_f1_threshold:  0.42255753
log_loss:  0.43932537743489186
