In [1]:
import numpy as np
import pandas as pd
import os
import argparse
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
import time

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras.layers import Input, concatenate, Embedding, Reshape
from keras.layers import Merge, Flatten, merge, Lambda, Dropout
from keras.layers.normalization import BatchNormalization
from keras.models import Model
from keras.models import model_from_json
from keras.models import load_model
from keras.regularizers import l2, l1_l2
import tensorflow as tf
import gc

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
myfolder = 'F:/rs/Recommender_DNN/input/'

df = pd.read_csv(myfolder + 'data_subset.csv')
y = df['reordered'].values
del df['reordered']

In [7]:
ids = df[['product_id','order_id']]
del df['order_id']

In [8]:
CATEGORICAL_COLUMNS = ["order_dow", "order_hour_of_day"]
CONTINUOUS_COLUMNS = [ "user_orders", "days_since_prior_order","up_orders","up_first_order","up_last_order","up_average_cart_position","prod_orders","user_period","prod_reorder_ratio","prod_reorder_probability","user_distinct_products","user_mean_days_since_prior","user_total_products", "user_average_basket","up_order_rate","up_orders_since_last_order"]
EMBEDDING_COLUMNS = ["user_id", "product_id","aisle_id","department_id"]

In [9]:
#One-hot encoding categorical columns
df = pd.get_dummies(df, columns=[x for x in CATEGORICAL_COLUMNS])
gc.collect()

393

In [10]:
#Normalising the feature columns
df = pd.DataFrame(MinMaxScaler().fit_transform(df), columns=df.columns)
gc.collect()

0

In [11]:
#Helper to index columns before embeddings
def val2idx(df, cols):
    val_types = dict()
    for c in cols:
        val_types[c] = df[c].unique()

    val_to_idx = dict()
    for k, v in val_types.items():
        val_to_idx[k] = {o: i for i, o in enumerate(val_types[k])}

    for k, v in val_to_idx.items():
        df[k] = df[k].apply(lambda x: v[x])

    unique_vals = dict()
    for c in cols:
        unique_vals[c] = df[c].nunique()

    return df, unique_vals
#Using Keras layer to create Embeddings
def embedding_input(name, n_in, n_out, reg):
    inp = Input(shape=(1,), dtype='int64', name=name)
    return inp, Embedding(n_in, n_out, input_length=1, embeddings_regularizer=l2(reg))(inp)
#Input layers for continuous vectors to the deep network
def continous_input(name):
    inp = Input(shape=(1,), dtype='float32', name=name)
    return inp, Reshape((1, 1))(inp)

In [14]:
#Splitting datasets into train and test
df.reset_index()
gc.collect()
X = df.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [15]:
#Defining input column for the deep network
DEEP_COLNS = EMBEDDING_COLUMNS + CONTINUOUS_COLUMNS
df_deep, unique_vals = val2idx(df, EMBEDDING_COLUMNS)
X_deep_tr, X_deep_te, y_deep_tr, y_deep_te = train_test_split(df_deep, y, test_size=0.25, random_state=42)
#Creating input dataframe for the merged model
X_train_deep = [X_deep_tr[c] for c in DEEP_COLNS]
y_train_deep = np.array(y_deep_tr).reshape(-1, 1)
X_test_deep = [X_deep_te[c] for c in DEEP_COLNS]
y_test_deep = np.array(y_deep_te).reshape(-1, 1)
gc.collect()
#Inputs
X_tr_wd = [X_train] + X_train_deep
Y_tr_wd = y_train_deep  # wide or deep is the same here
X_te_wd = [X_test] + X_test_deep
Y_te_wd = y_test_deep  # wide or deep is the same here

In [16]:
#Loading pre-trained models

# load json and create model
json_file = open('wide.json', 'r')
wide_json = json_file.read()
json_file.close()
wide = model_from_json(wide_json)
# load weights into new model
wide.load_weights("wide.h5")
json_file = open('deep.json', 'r')
deep_json = json_file.read()
json_file.close()
deep = model_from_json(deep_json)
# load weights into new model
deep.load_weights("deep.h5")
json_file = open('wide_deep.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
wide_deep = model_from_json(loaded_model_json)
# load weights into new model
wide_deep.load_weights("wide_deep.h5")
print("Loaded model from disk")

  return cls(**config)


Loaded model from disk


In [17]:
wide.compile(Adam(0.1), loss='binary_crossentropy', metrics=['accuracy'])
deep.compile(Adam(0.1), loss='binary_crossentropy', metrics=['accuracy'])
wide_deep.compile(optimizer=Adam(lr=0.1), loss='binary_crossentropy', metrics=['accuracy'])

In [12]:
results = wide.evaluate(X_test, y_test)
print("Wide:",results)
results = deep.evaluate(X_test_deep, y_test_deep)
print("Deep:",results)
results = wide_deep.evaluate(X_te_wd, Y_te_wd)
print("Wide and Deep:",results)

Wide: [0.41565120182276577, 0.8078725389225269]

 Deep: [1.136664420352819, 0.7945893629856638]

 Wide and Deep: [1.0013501690341742, 0.8186381471633909]


In [18]:
prd = wide_deep.predict(X_te_wd)

In [15]:
print(prd)

[[0.06621979]
 [0.11559835]
 [0.04629791]
 ...
 [0.07036499]
 [0.07070364]
 [0.03768555]]


In [12]:
def combi(z,df):
    
    prd_bag = dict()
    z_bag = dict()
    for row in df.itertuples():
        if row.reordered > z:   
            try:
                prd_bag[row.order_id] += ' ' + str(row.product_id)
                z_bag[row.order_id]+= ' ' + str(int(100*row.reordered))
            except:
                prd_bag[row.order_id] = str(row.product_id)
                z_bag[row.order_id]= str(int(100*row.reordered))

    for order in df.order_id:
        if order not in prd_bag:
            prd_bag[order] = ' '
            z_bag[order] = ' '

    return prd_bag,z_bag 

# F1 function uses the actual products as a list in the train set and the list of predicted products

def f1_score_single(x):                 #from LiLi but modified to get 1 for both empty

    y_true = x.actual
    y_pred = x.list_prod
    if y_true == '' and y_pred ==[] : return 1.
    y_true = set(y_true)
    y_pred = set(y_pred)
    cross_size = len(y_true & y_pred)
    if cross_size == 0: return 0.
    p = 1. * cross_size / len(y_pred)
    r = 1. * cross_size / len(y_true)
    return 2 * p * r / (p + r)

In [13]:
df.reset_index()
df.head()

Unnamed: 0,user_id,product_id,up_orders,up_first_order,up_last_order,up_average_cart_position,prod_orders,prod_reorder_probability,prod_reorder_ratio,user_orders,...,order_hour_of_day_14,order_hour_of_day_15,order_hour_of_day_16,order_hour_of_day_17,order_hour_of_day_18,order_hour_of_day_19,order_hour_of_day_20,order_hour_of_day_21,order_hour_of_day_22,order_hour_of_day_23
0,0.629796,0.392658,0.244898,0.010204,0.336735,0.068182,0.079767,0.468368,0.751146,0.322917,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.825564,0.640409,0.0,0.377551,0.377551,0.075758,0.007451,0.290488,0.481042,0.625,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.117571,0.950108,0.071429,0.0,0.071429,0.022727,0.264035,0.666348,0.845529,0.052083,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.614176,0.325719,0.061224,0.0,0.244898,0.037879,0.469271,0.45852,0.651134,0.229167,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.094497,0.000886,0.010204,0.020408,0.030612,0.007576,0.286388,0.485754,0.69483,0.020833,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
ids.head()

Unnamed: 0,product_id,order_id
0,19511,2308721
1,31821,2191246
2,47209,864819
3,16185,552250
4,45,3007640


In [12]:
df.head()

Unnamed: 0,user_id,product_id,up_orders,up_first_order,up_last_order,up_average_cart_position,prod_orders,prod_reorder_probability,prod_reorder_ratio,user_orders,...,order_hour_of_day_14,order_hour_of_day_15,order_hour_of_day_16,order_hour_of_day_17,order_hour_of_day_18,order_hour_of_day_19,order_hour_of_day_20,order_hour_of_day_21,order_hour_of_day_22,order_hour_of_day_23
0,0.629796,0.392658,0.244898,0.010204,0.336735,0.068182,0.079767,0.468368,0.751146,0.322917,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.825564,0.640409,0.0,0.377551,0.377551,0.075758,0.007451,0.290488,0.481042,0.625,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.117571,0.950108,0.071429,0.0,0.071429,0.022727,0.264035,0.666348,0.845529,0.052083,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.614176,0.325719,0.061224,0.0,0.244898,0.037879,0.469271,0.45852,0.651134,0.229167,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.094497,0.000886,0.010204,0.020408,0.030612,0.007576,0.286388,0.485754,0.69483,0.020833,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
print(df.shape,df_deep.shape)

(4124613, 51) (4124613, 51)


In [20]:
a,b,c,d = train_test_split(ids, y, test_size=0.25, random_state=42)

In [21]:
b.head()

Unnamed: 0,product_id,order_id
3754795,5955,328106
3087927,19691,1270775
122280,7751,1171383
2261385,5114,717426
905639,35168,639184


In [22]:
test_ranking = b
test_ranking['reordered'] = prd

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [23]:
test_ranking.head()

Unnamed: 0,product_id,order_id,reordered
3754795,5955,328106,0.06622
3087927,19691,1270775,0.115598
122280,7751,1171383,0.046298
2261385,5114,717426,0.197152
905639,35168,639184,0.005602


In [24]:
gc.collect()
traintest1 = pd.read_csv(myfolder + 'traintest1.csv')

In [25]:
traintest1.head()

Unnamed: 0,order_id,eval_set,actual
0,1187899,1,"[196, 25133, 38928, 26405, 39657, 10258, 13032..."
1,1492625,1,"[22963, 7963, 16589, 32792, 41787, 22825, 2485..."
2,2196797,1,"[15349, 21413, 40706, 21616]"
3,525192,1,"[47272, 37999, 13198, 43967, 40852, 17638, 298..."
4,880375,1,"[15937, 23165, 21903, 41540]"
