In [1]:
import numpy as np
import pandas as pd
import os
import argparse
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
import time

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras.layers import Input, concatenate, Embedding, Reshape
from keras.layers import Merge, Flatten, merge, Lambda, Dropout
from keras.layers.normalization import BatchNormalization
from keras.models import Model
from keras.models import model_from_json
from keras.models import load_model
from keras.regularizers import l2, l1_l2
import tensorflow as tf
import gc

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
myfolder = 'F:/rs/Recommender_DNN/input/'
df = pd.read_csv(myfolder + 'data_subset.csv')
y = df['reordered'].values
del df['reordered']
df.columns

Index(['user_id', 'product_id', 'up_orders', 'up_first_order', 'up_last_order',
       'up_average_cart_position', 'prod_orders', 'prod_reorder_probability',
       'prod_reorder_ratio', 'user_orders', 'user_period',
       'user_mean_days_since_prior', 'user_total_products',
       'user_distinct_products', 'user_average_basket', 'order_id',
       'days_since_prior_order', 'up_order_rate', 'up_orders_since_last_order',
       'aisle_id', 'department_id', 'order_dow', 'order_hour_of_day'],
      dtype='object')

In [3]:
del df['prod_reorder_probability']
del df['prod_reorder_ratio']

In [4]:
CATEGORICAL_COLUMNS = ["order_dow", "order_hour_of_day"]
CONTINUOUS_COLUMNS = [ "user_orders", "days_since_prior_order","up_orders","up_first_order","up_last_order","up_average_cart_position","prod_orders","user_period","user_distinct_products","user_mean_days_since_prior","user_total_products", "user_average_basket","up_order_rate","up_orders_since_last_order"]
EMBEDDING_COLUMNS = ["user_id", "product_id","aisle_id","department_id"]
#"prod_reorder_ratio","prod_reorder_probability",

In [5]:
#One-hot encoding categorical columns
df = pd.get_dummies(df, columns=[x for x in CATEGORICAL_COLUMNS])
gc.collect()

35

In [6]:
#Normalising the feature columns
df[CONTINUOUS_COLUMNS] = pd.DataFrame(MinMaxScaler().fit_transform(df[CONTINUOUS_COLUMNS]), columns=CONTINUOUS_COLUMNS)
gc.collect()

14

In [7]:
#Helper to index columns before embeddings
def val2idx(df, cols):
    val_types = dict()
    for c in cols:
        val_types[c] = df[c].unique()

    val_to_idx = dict()
    for k, v in val_types.items():
        val_to_idx[k] = {o: i for i, o in enumerate(val_types[k])}

    for k, v in val_to_idx.items():
        df[k] = df[k].apply(lambda x: v[x])

    unique_vals = dict()
    for c in cols:
        unique_vals[c] = df[c].nunique()

    return df, unique_vals
#Using Keras layer to create Embeddings
def embedding_input(name, n_in, n_out, reg):
    inp = Input(shape=(1,), dtype='int64', name=name)
    return inp, Embedding(n_in, n_out, input_length=1, embeddings_regularizer=l2(reg))(inp)
#Input layers for continuous vectors to the deep network
def continous_input(name):
    inp = Input(shape=(1,), dtype='float32', name=name)
    return inp, Reshape((1, 1))(inp)

In [8]:
#Splitting datasets into train and test
df.reset_index()
gc.collect()
X = df[df.columns.difference(['user_id','product_id','order_id'])].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [9]:
#Defining input column for the deep network
DEEP_COLNS = EMBEDDING_COLUMNS + CONTINUOUS_COLUMNS
df_deep, unique_vals = val2idx(df, EMBEDDING_COLUMNS)
X_deep_tr, X_deep_te, y_deep_tr, y_deep_te = train_test_split(df_deep, y, test_size=0.25, random_state=42)
#Creating input dataframe for the merged model
X_train_deep = [X_deep_tr[c] for c in DEEP_COLNS]
y_train_deep = np.array(y_deep_tr).reshape(-1, 1)
X_test_deep = [X_deep_te[c] for c in DEEP_COLNS]
y_test_deep = np.array(y_deep_te).reshape(-1, 1)
gc.collect()
#Inputs
X_tr_wd = [X_train] + X_train_deep
Y_tr_wd = y_train_deep  # wide or deep is the same here
X_te_wd = [X_test] + X_test_deep
Y_te_wd = y_test_deep  # wide or deep is the same here

In [10]:
#Loading pre-trained models

# load json and create model
json_file = open('wide.json', 'r')
wide_json = json_file.read()
json_file.close()
wide = model_from_json(wide_json)
# load weights into new model
wide.load_weights("wide.h5")
json_file = open('deep.json', 'r')
deep_json = json_file.read()
json_file.close()
deep = model_from_json(deep_json)
# load weights into new model
deep.load_weights("deep.h5")
json_file = open('wide_deep.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
wide_deep = model_from_json(loaded_model_json)
# load weights into new model
wide_deep.load_weights("wide_deep.h5")
print("Loaded model from disk")

  return cls(**config)


Loaded model from disk


In [11]:
wide.compile(Adam(0.1), loss='binary_crossentropy', metrics=['accuracy'])
deep.compile(Adam(0.1), loss='binary_crossentropy', metrics=['accuracy'])
wide_deep.compile(optimizer=Adam(lr=0.1), loss='binary_crossentropy', metrics=['accuracy'])

In [12]:
results = wide.evaluate(X_test, y_test)
print("Wide:",results)
results = deep.evaluate(X_test_deep, y_test_deep)
print("Deep:",results)
results = wide_deep.evaluate(X_te_wd, Y_te_wd)
print("Wide and Deep:",results)

Wide: [0.42105097553864146, 0.8075040197682589]
Deep: [1.114770614773136, 0.7945893629856638]
Wide and Deep: [1.0514776181248389, 0.8024233043757173]


In [13]:
prd = wide_deep.predict(X_te_wd)

In [14]:
print(prd)

[[0.1818818 ]
 [0.1878082 ]
 [0.14095223]
 ...
 [0.06057081]
 [0.07826935]
 [0.05727785]]


In [62]:
def combi(z,df):
    
    prd_bag = dict()
    for row in df.itertuples():
        if row.reordered > z:   
            try:
                prd_bag[row.order_id] += ' ' + str(row.product_id)
            except:
                prd_bag[row.order_id] = str(row.product_id)

    for order in df.order_id:
        if order not in prd_bag:
            prd_bag[order] = ' '

    return prd_bag

# F1 function uses the actual products as a list in the train set and the list of predicted products

def f1_score_single(x):                 #from LiLi but modified to get 1 for both empty

    y_true = x.actual
    y_pred = x.pred
    if y_true == '' and y_pred =='' : return 1.
    y_true = set(y_true)
    y_pred = set(y_pred)
    cross_size = len(y_true & y_pred)
    if cross_size == 0: return 0.
    p = 1. * cross_size / len(y_pred)
    r = 1. * cross_size / len(y_true)
    return 2 * p * r / (p + r)

def get_true_set(df):
    
    true_set = dict()
    for row in df.itertuples():
        if row.true == 1:   
            try:
                true_set[row.order_id] += ' ' + str(row.product_id)
            except:
                true_set[row.order_id] = str(row.product_id)

    for order in df.order_id:
        if order not in true_set:
            true_set[order] = ' '

    return true_set

def get_n_product(df):
    products = df.nlargest(5, 'reordered')['product_id'].values
    products = np.array2string(products)
    return products
    

In [16]:
df.head()

Unnamed: 0,user_id,product_id,up_orders,up_first_order,up_last_order,up_average_cart_position,prod_orders,user_orders,user_period,user_mean_days_since_prior,...,order_hour_of_day_14,order_hour_of_day_15,order_hour_of_day_16,order_hour_of_day_17,order_hour_of_day_18,order_hour_of_day_19,order_hour_of_day_20,order_hour_of_day_21,order_hour_of_day_22,order_hour_of_day_23
0,0,0,0.244898,0.010204,0.336735,0.068182,0.079767,0.322917,0.447059,0.333333,...,0,0,0,0,0,0,0,0,0,0
1,1,1,0.0,0.377551,0.377551,0.075758,0.007451,0.625,0.533333,0.2,...,0,0,0,0,0,0,0,0,0,0
2,2,2,0.071429,0.0,0.071429,0.022727,0.264035,0.052083,0.376471,0.4,...,0,0,0,0,0,0,0,0,0,1
3,3,3,0.061224,0.0,0.244898,0.037879,0.469271,0.229167,0.254902,0.4,...,0,0,0,0,0,1,0,0,0,0
4,4,4,0.010204,0.020408,0.030612,0.007576,0.286388,0.020833,0.188235,0.3,...,1,0,0,0,0,0,0,0,0,0


In [20]:
X_drop, X_eval, y_drop, y_eval = train_test_split(df, y, test_size=0.25, random_state=42)
X_eval.head()

Unnamed: 0,user_id,product_id,up_orders,up_first_order,up_last_order,up_average_cart_position,prod_orders,user_orders,user_period,user_mean_days_since_prior,...,order_hour_of_day_14,order_hour_of_day_15,order_hour_of_day_16,order_hour_of_day_17,order_hour_of_day_18,order_hour_of_day_19,order_hour_of_day_20,order_hour_of_day_21,order_hour_of_day_22,order_hour_of_day_23
3754795,2669,16069,0.0,0.520408,0.520408,0.106061,0.006844,0.5625,0.509804,0.2,...,1,0,0,0,0,0,0,0,0,0
3087927,51229,1360,0.030612,0.061224,0.214286,0.022727,0.193429,0.208333,0.403922,0.5,...,0,0,0,0,0,0,0,1,0,0
122280,2706,293,0.010204,0.0,0.061224,0.045455,0.202794,0.125,0.486275,0.266667,...,0,0,0,0,0,0,0,0,0,0
2261385,10781,61,0.081633,0.071429,0.632653,0.083333,0.079254,0.625,0.52549,0.2,...,0,0,0,0,1,0,0,0,0,0
905639,10495,3897,0.0,0.010204,0.010204,0.098485,0.091247,0.260417,0.368627,0.4,...,0,0,0,0,0,0,0,0,0,0


In [31]:
y_eval

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [72]:
X_eval['order_id'].unique()

(117762,)

In [24]:
test_ranking = X_eval
test_ranking['reordered'] = prd
test_ranking['true'] = y_eval

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [33]:
prd_bag = combi(0.21,test_ranking)

In [36]:
true_set = get_true_set(test_ranking)

In [41]:
p_b = pd.Series(prd_bag)
t_s = pd.Series(true_set)


36                                    226 1507 4670 3719
38                                         5641 2955 800
96                                             5364 8594
98     3473 126 3473 2710 2710 14479 3120 1529 14479 ...
112                          1255 555 4937 8 1154 2838 8
dtype: object

In [49]:
f1_set = pd.concat([p_b, t_s], axis=1)
f1_set = f1_set.rename(index=str, columns = {0:"pred",1:"actual"})

In [57]:
#calculate f1 score for each order
f1_set['f1']=f1_set.apply(f1_score_single,axis=1).astype(np.float16)

In [63]:
f1_set['f1'].mean()

0.519

In [89]:
ranking = test_ranking[['product_id','order_id','reordered']]
grouped = ranking.groupby('order_id')
grouped.agg(get_n_product)

Unnamed: 0_level_0,product_id,reordered
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1
36,[1507 4670 3719 226 2255],[1507 4670 3719 226 2255]
38,[ 800 5641 2955 307],[ 800 5641 2955 307]
96,[8594 5364 11],[8594 5364 11]
98,[ 16 16 7667 3745 2905],[ 16 16 7667 3745 2905]
112,[1255 8 8 1154 555],[1255 8 8 1154 555]
170,[9741],[9741]
218,[ 783 8344 13511],[ 783 8344 13511]
349,[ 19 5900 55 3553 12720],[ 19 5900 55 3553 12720]
393,[ 65 1388 4681 4675 120],[ 65 1388 4681 4675 120]
456,[80],[80]
