In [1]:
import os
import random

import numpy as np
import pandas as pd
from keras import backend as K
from keras import preprocessing
from keras.layers import Input, Embedding, merge, concatenate, multiply
from keras.layers.advanced_activations import LeakyReLU
from keras.layers.core import Dense, Reshape, Lambda
from keras.models import Model
from keras.regularizers import l2
from sklearn.model_selection import train_test_split

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Location of the merged orders/products CSV. The default is the original
# author's local path; set the MERGED_DATA_CSV environment variable to run the
# notebook on another machine without editing this cell.
DATA_PATH = os.environ.get(
    'MERGED_DATA_CSV',
    '/Users/BharathiSrinivasan/Documents/GitHub/Thesis/merged_data.csv',
)
df = pd.read_csv(DATA_PATH)

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle_id,department_id,department
0,0,2,33120,1,1,202279,3,5,9,8.0,Organic Egg Whites,86,16,dairy eggs
1,1,2,28985,2,1,202279,3,5,9,8.0,Michigan Organic Kale,83,4,produce
2,2,2,9327,3,0,202279,3,5,9,8.0,Garlic Powder,104,13,pantry
3,3,2,45918,4,1,202279,3,5,9,8.0,Coconut Butter,19,13,pantry
4,4,2,30035,5,0,202279,3,5,9,8.0,Natural Sweetener,17,13,pantry


In [3]:
#Sample orders of n customer
def data_nusers(df, n):
    """Return every row belonging to the first ``n`` distinct users of ``df``.

    Users are taken in order of first appearance in ``df.user_id``. The rows
    come back grouped by user in that same order and keep their original
    index labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``user_id`` column.
    n : int
        Number of distinct users to keep. ``n <= 0`` selects no users (the
        previous loop never hit its ``break`` for ``n == 0`` and returned
        every user instead).

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified. When nothing is selected
        the result is empty but keeps the columns of ``df``.
    """
    if n <= 0:
        return df.iloc[0:0].copy()

    selected_users = df.user_id.unique()[:n]

    # Collect the per-user slices and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the accumulated frame on every call.
    per_user = [df[df.user_id == user] for user in selected_users]
    if not per_user:
        return df.iloc[0:0].copy()
    return pd.concat(per_user)

In [4]:
# Work on a small subset: all rows of the first 10 distinct users found in df.
df_use = data_nusers(df,10)

In [5]:
len(df_use.product_id.unique())

796

In [6]:
# Vocabulary sizes: distinct products and shoppers in the df_use subset.
# NOTE(review): N_dept is counted on the full df rather than on df_use —
# confirm this is intended (it is only referenced by commented-out code in
# the model cell).
N_products = df_use['product_id'].nunique()
N_dept = df['department'].nunique()
N_shoppers = df_use['user_id'].nunique()

In [7]:
# Columns that val2idx replaces with 1-based integer indices before they are
# used as embedding inputs.
EMBEDDING_COLUMNS = ["user_id", "product_id"]

In [8]:
#Helper to index columns before embeddings
def val2idx(df, cols):
    """Replace the values of ``cols`` with 1-based integer indices, in place.

    Each distinct value of a column is numbered by order of first appearance
    (first value seen -> 1, second -> 2, ...). The columns of ``df`` are
    overwritten with those numbers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``cols``; it is modified.
    cols : iterable of str
        Names of the columns to re-index.

    Returns
    -------
    tuple
        ``(df, unique_vals)`` where ``df`` is the same object that was passed
        in and ``unique_vals`` maps each column name to its number of
        distinct values after re-indexing.
    """
    # Build every lookup table before touching the frame.
    lookups = {}
    for column in cols:
        distinct = df[column].unique()
        lookups[column] = {
            value: position for position, value in enumerate(distinct, start=1)
        }

    # Overwrite each column with the index of its value.
    for column, lookup in lookups.items():
        df[column] = df[column].apply(lambda raw: lookup[raw])

    unique_vals = {column: df[column].nunique() for column in cols}
    return df, unique_vals

In [9]:
print(N_products, N_dept, N_shoppers )

796 21 10


In [10]:
# val2idx writes the 1-based indices back into df_use and returns that same
# object, so df_deep and df_use are one (now re-indexed) frame.
df_deep, unique_vals = val2idx(df_use, EMBEDDING_COLUMNS)

In [11]:
df_deep.shape

(3339, 14)

In [12]:
def first_prod(order):
    """Return the ``product_id`` of the item that was added to the cart first.

    Parameters
    ----------
    order : pandas.DataFrame
        Rows of one order, with ``add_to_cart_order`` and ``product_id``
        columns.

    Returns
    -------
    scalar or None
        ``product_id`` of the first row whose ``add_to_cart_order`` equals 1,
        or ``None`` when there is no such row.
    """
    # Boolean-mask lookup instead of iterrows(): iterrows() builds one Series
    # per row, which upcasts an all-numeric frame so an integer id comes back
    # as a float. Selecting from the column keeps the column's own dtype.
    matches = order.loc[order['add_to_cart_order'] == 1, 'product_id']
    if matches.empty:
        return None
    return matches.iloc[0]

In [13]:
def next_prod(order):
    """Return the ``product_id`` of the item that was added to the cart second.

    Parameters
    ----------
    order : pandas.DataFrame
        Rows of one order, with ``add_to_cart_order`` and ``product_id``
        columns.

    Returns
    -------
    scalar or None
        ``product_id`` of the first row whose ``add_to_cart_order`` equals 2,
        or ``None`` when the order has no second item.
    """
    # Boolean-mask lookup instead of iterrows(): iterrows() builds one Series
    # per row, which upcasts an all-numeric frame so an integer id comes back
    # as a float. Selecting from the column keeps the column's own dtype.
    matches = order.loc[order['add_to_cart_order'] == 2, 'product_id']
    if matches.empty:
        return None
    return matches.iloc[0]

In [14]:
def create_basket(order):
    """Return the product ids of every item except the first one, as strings.

    Parameters
    ----------
    order : pandas.DataFrame
        Rows of one order, with ``add_to_cart_order`` and ``product_id``
        columns.

    Returns
    -------
    list of str
        ``product_id`` (converted with ``astype(str)``) of each row whose
        ``add_to_cart_order`` is not 1, in row order. Empty when the order
        has only a first item.
    """
    # Convert only the selected values; the previous version overwrote
    # order['product_id'] with strings, silently changing the caller's frame.
    later_items = order.loc[order['add_to_cart_order'] != 1, 'product_id']
    # If shuffling is ever wanted here, note that random.shuffle works in
    # place and returns None, so its result must not be assigned.
    return later_items.astype(str).tolist()

In [17]:
def transform_data_for_embedding(df):
    """Collapse order lines into one row per order and count the vocabularies.

    For every ``order_id`` the result holds the product returned by
    ``first_prod``, the product returned by ``next_prod`` and the list
    returned by ``create_basket``.

    Parameters
    ----------
    df : pandas.DataFrame
        Order lines with ``order_id``, ``product_id``, ``user_id`` and the
        columns needed by the three helper functions.

    Returns
    -------
    tuple
        ``(transform_df, N_products, N_shoppers)``: the per-order frame with
        columns ``order_id``, ``first_prod``, ``next_product``, ``basket``,
        then the number of distinct ``product_id`` and ``user_id`` values
        in ``df``.
    """
    def per_order(summarise):
        # Fresh groupby for every summary, one result per order_id.
        return df.groupby(['order_id']).apply(summarise)

    first_items = per_order(first_prod)
    second_items = per_order(lambda lines: next_prod(lines))
    baskets = per_order(lambda lines: create_basket(lines))

    # The three results share the order_id grouping, so the second product
    # and the basket are attached positionally.
    transform_df = pd.DataFrame(first_items, columns=['first_prod'])
    transform_df['next_product'] = second_items.values
    transform_df['basket'] = baskets.tolist()
    transform_df = transform_df.reset_index()

    print('first transformed data')
    print(transform_df.head())

    product_count = df['product_id'].nunique()
    shopper_count = df['user_id'].nunique()
    return transform_df, product_count, shopper_count

In [18]:
# Build one row per order (first product, second product, remaining basket).
# This also rebinds N_products / N_shoppers with counts taken from df_deep.
transformed_dat, N_products, N_shoppers = transform_data_for_embedding(df_deep)

first transformed data
   order_id  first_prod  next_product  \
0         2           1           2.0   
1         3          51          52.0   
2         4         184         185.0   
3         5         168         286.0   
4         6         431         432.0   

                                              basket  
0                           [2, 3, 4, 5, 6, 7, 8, 9]  
1                       [52, 53, 54, 55, 56, 57, 58]  
2  [185, 186, 187, 188, 189, 190, 191, 192, 193, ...  
3  [286, 287, 288, 199, 289, 290, 291, 292, 293, ...  
4                                         [432, 433]  


In [19]:
transformed_dat.head()

Unnamed: 0,order_id,first_prod,next_product,basket
0,2,1,2.0,"[2, 3, 4, 5, 6, 7, 8, 9]"
1,3,51,52.0,"[52, 53, 54, 55, 56, 57, 58]"
2,4,184,185.0,"[185, 186, 187, 188, 189, 190, 191, 192, 193, ..."
3,5,168,286.0,"[286, 287, 288, 199, 289, 290, 291, 292, 293, ..."
4,6,431,432.0,"[432, 433]"


In [23]:
def create_input_for_embed_network(df, transform_df, N_products):
    """Build the network inputs and the one-hot target from the per-order frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Order lines with at least ``order_id``, ``user_id`` and
        ``product_id``.
    transform_df : pandas.DataFrame
        One row per order with ``order_id``, ``first_prod``, ``next_product``
        and ``basket`` (an iterable of product ids).
    N_products : int
        The basket matrix starts with columns ``col_0`` ..
        ``col_<N_products - 1>``.

    Returns
    -------
    tuple
        ``(first_prod, user_id, basket_df, y_df)``:

        * ``first_prod`` / ``user_id`` - Series for the orders that survive
          the ``dropna`` (orders with any missing value are discarded).
        * ``basket_df`` - 0/1 matrix with one row per surviving order that
          has at least one basket item and one ``col_<id>`` column per
          product.
        * ``y_df`` - one-hot encoding of ``next_product`` with one
          ``next_product_<id>`` column per distinct ``df.product_id``.
    """
    print('next function', transform_df.head())

    # One (order_id, user_id) pair per order, attached to the per-order rows.
    order_users = df.drop_duplicates(subset=['order_id', 'user_id'])[['order_id', 'user_id']]
    train_df = pd.merge(transform_df, order_users, how='left', on='order_id').dropna()

    # Basket as a dense 0/1 matrix. Column positions are collected first and
    # the matrix is filled once, instead of enlarging a DataFrame one cell at
    # a time with .loc (which copies the frame over and over).
    col_pos = {'col_' + str(col): col for col in range(N_products)}
    row_labels = []
    row_hits = []
    for label, items in zip(train_df.index, train_df['basket']):
        hits = []
        for val in items:
            if val != 0:
                # Ids without a pre-built column get a new column appended at
                # the end, matching what .loc enlargement used to do.
                # NOTE(review): with 1-based ids the largest id lands in such
                # an extra column, making the matrix wider than N_products -
                # confirm the alignment the model expects.
                hits.append(col_pos.setdefault('col_' + str(val), len(col_pos)))
        if hits:
            row_labels.append(label)
            row_hits.append(hits)

    basket_matrix = np.zeros((len(row_labels), len(col_pos)))
    for position, hits in enumerate(row_hits):
        basket_matrix[position, hits] = 1
    basket_df = pd.DataFrame(basket_matrix, index=row_labels, columns=list(col_pos))

    # astype('category', categories=...) was removed from pandas; build the
    # categorical explicitly so unused products still get a dummy column.
    next_cat = pd.Categorical(train_df['next_product'],
                              categories=df['product_id'].unique())
    # dtype is pinned so the dummies stay 0/1 integers on every pandas version.
    y_df = pd.get_dummies(train_df.assign(next_product=next_cat),
                          columns=['next_product'], dtype=np.uint8)
    y_df = y_df.drop(['user_id', 'order_id', 'first_prod', 'basket'], axis=1)

    return train_df['first_prod'], train_df['user_id'], basket_df, y_df

In [24]:
# Materialise the network inputs and target as pandas objects.
# NOTE(review): prior_in, shopper_in, candidates_in and predicted are assigned
# again (to Keras tensors) in the model-definition cell further down, so the
# values produced here are no longer reachable by the time the model is fit.
prior_in, shopper_in, candidates_in, predicted = create_input_for_embed_network(df_deep, transformed_dat, N_products)

next function    order_id  first_prod  next_product  \
0         2           1           2.0   
1         3          51          52.0   
2         4         184         185.0   
3         5         168         286.0   
4         6         431         432.0   

                                              basket  
0                           [2, 3, 4, 5, 6, 7, 8, 9]  
1                       [52, 53, 54, 55, 56, 57, 58]  
2  [185, 186, 187, 188, 189, 190, 191, 192, 193, ...  
3  [286, 287, 288, 199, 289, 290, 291, 292, 293, ...  
4                                         [432, 433]  


In [48]:
# Exploratory, step-by-step version of transform_data_for_embedding:
# first product of every order.
first = df_deep.groupby(['order_id']).apply(first_prod)

In [49]:
# Second product of every order.
# NOTE(review): this rebinds the name `next_prod` from the function to its
# per-order result. Re-running this cell, or calling
# transform_data_for_embedding afterwards, fails because `next_prod` is no
# longer callable.
next_prod = df_deep.groupby(['order_id']).apply(lambda x:next_prod(x))

In [50]:
# Remaining items (everything but the first product) of every order.
basket =df_deep.groupby(['order_id']).apply(lambda x: create_basket(x))

In [51]:
# Start the per-order frame from the first-product results (one row per
# order_id group).
final_df = pd.DataFrame(first, columns = ['first_prod'])

In [52]:
# Attach the second product of each order (here `next_prod` is the per-order
# result assigned above, not the function).
final_df['next_prod']= pd.DataFrame(next_prod)

In [53]:
# Attach the basket list of each order.
final_df['basket']= pd.DataFrame(basket)

In [54]:
# Shuffle every basket list in place. random.shuffle mutates its argument and
# returns None, so its result must not be assigned; the previous loop stored
# that None on the row yielded by iterrows(), which is generally a copy and
# never reached final_df. No seed is set in the visible cells, so the order
# differs between runs.
for basket_items in final_df['basket']:
    random.shuffle(basket_items)
    

In [55]:
# Column labels col_0 .. col_<N_products - 1> for the basket matrix.
names = ['col_' + str(col) for col in range(N_products)]

In [56]:
# Empty frame that only carries the col_* labels (scratch object; not used by
# the other visible cells).
test = pd.DataFrame(columns= names)


In [162]:
final_df.first_prod.nunique()

398

In [59]:
# Move the index into a regular column so order_id can serve as the merge key
# below. In-place: running this cell a second time moves the new integer
# index into a column as well.
final_df.reset_index(inplace=True)

In [28]:
# NOTE(review): in the visible notebook basket_df is only created in a later
# cell (In [165]), so on a fresh top-to-bottom run this line raises NameError.
# The same fillna is repeated after basket_df is built.
basket_df.fillna(0, inplace=True)

In [91]:
final_df.shape

(896, 4)

In [95]:
# One row per distinct (order_id, user_id) pair; used below to look up the
# user of each order.
x = df_use.drop_duplicates(subset=['order_id','user_id'])

In [97]:
x.shape

(896, 14)

In [164]:
# Attach user_id to every per-order row; the left join keeps all rows of
# final_df.
train_df = pd.merge(final_df, x[['order_id','user_id']], how='left', on='order_id' )

In [143]:
train_df.shape

(896, 5)

In [118]:
#train_df.drop_duplicates(subset= ['order_id','first_prod','next_prod','user_id'],keep=False, inplace=True)

In [144]:
# Discard rows with any missing value, e.g. orders for which no second
# product was found.
train_df.dropna(inplace=True)

In [145]:
train_df.shape

(857, 5)

In [165]:
# Basket as a product-indicator matrix: one row per train_df row, one col_<id>
# column per product, 1 where the product is in the basket.
basket_df = pd.DataFrame(columns= names)
for i,row in train_df.iterrows():
    for val in row.basket:
        # Basket entries are the strings produced by create_basket, so this
        # comparison with the integer 0 is always true for them.
        if val!=0:
            # Setting with .loc creates row i on first use, and also creates
            # the column when 'col_<val>' is not one of `names`.
            basket_df.loc[i,'col_'+val] = 1

In [169]:
basket_df.head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,...,col_2242,col_2243,col_2244,col_2245,col_2246,col_2247,col_2248,col_2249,col_2250,col_2251
0,,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [147]:
#basket_df.drop(['col_0'], axis=1, inplace=True)

In [148]:
# Products absent from a basket were left as NaN by the loop; turn them into 0
# so the matrix is a dense 0/1 mask.
basket_df.fillna(0, inplace=True)

In [128]:
#padded_basket = preprocessing.sequence.pad_sequences(train_df['basket'], maxlen=N_products, dtype='int32', padding='pre', truncating='pre', value=0.0)

In [149]:
# One-hot encode the second product over the full product vocabulary of
# df_use, so products that never occur as a target still get a column.
# astype('category', categories=...) was removed from pandas; pd.Categorical
# takes the category list directly.
train_df['next_prod'] = pd.Categorical(train_df['next_prod'],
                                       categories=df_use.product_id.unique())
# dtype is pinned so the dummies stay 0/1 integers on every pandas version.
y_df = pd.get_dummies(train_df, columns = ['next_prod'], dtype=np.uint8)

In [150]:
#users = train_df['user_id']
#input_prod = train_df['first_prod']
#basket = train_df['basket']

In [151]:
# Keep only the next_prod_* dummy columns in y_df, and only first_prod and
# user_id in train_df. Both drops are in place, so running this cell a second
# time raises KeyError because the columns are already gone.
y_df.drop(['user_id','order_id','first_prod','basket'], axis=1, inplace=True)
train_df.drop(['order_id','next_prod','basket'], axis=1, inplace=True)

In [152]:
#padded_basket

In [153]:
train_df.describe()

Unnamed: 0,first_prod,user_id
count,857.0,857.0
mean,924.106184,15.465578
std,689.697951,8.167724
min,1.0,1.0
25%,220.0,9.0
50%,866.0,15.0
75%,1504.0,23.0
max,2243.0,30.0


In [134]:
#X_train,X_test,y_train, y_test = train_test_split(train_df,y, test_size=0.3, random_state=42)

In [157]:
# Network: embed the first product and the shopper, pass them through dense
# layers, and score every product with a softmax restricted to the candidate
# products of the order.
#
# NOTE(review): prior_in, shopper_in, candidates_in and predicted are also the
# names that receive the output of create_input_for_embed_network earlier in
# the notebook; from this cell on they refer to Keras tensors.

# Integer IDs representing 1-hot encodings
prior_in = Input(shape=(1,))
#dept_in = Input(shape=(1,))
shopper_in = Input(shape=(1,))

# Dense N-hot encoding for candidate products
candidates_in = Input(shape=(N_products,))

# Embeddings (+1 because the ids produced by val2idx start at 1)
prior = Embedding(N_products+1, 10)(prior_in)
#dept = Embedding(N_dept, 10)(dept_in)
shopper = Embedding(N_shoppers+1, 10)(shopper_in)

# Reshape (None, 1, 10) -> (None, 10) and concatenate the embeddings.
# `concatenate` replaces the legacy merge(..., mode='concat') call.
reshape = Reshape(target_shape=(10,))
combined = concatenate([reshape(prior), reshape(shopper)])

# Hidden layers
#hidden_1 = Dense(1024, activation='relu',W_regularizer=l2(0.02))(combined)
#hidden_2 = Dense(512, activation='relu',W_regularizer=l2(0.02))(hidden_1)
hidden_3 = Dense(100, activation='relu')(combined)
#LR1 = LeakyReLU(alpha=0.1)(hidden_3)
hidden_4 = Dense(1, activation='relu')(hidden_3)

# Final 'fan-out' into the space of future products
final = Dense(N_products, activation='relu')(hidden_4)
#LR_final = LeakyReLU(alpha=0.1)(final)

# Ensure we do not overflow when we exponentiate. The maximum is taken per
# sample (last axis); K.max(x) without an axis reduced over the whole batch.
final = Lambda(lambda x: x - K.max(x, axis=-1, keepdims=True))(final)

# Masked soft-max: exponentiate, zero out non-candidates, normalise.
# `multiply` replaces the legacy merge(..., mode='mul') call.
exponentiate = Lambda(lambda x: K.exp(x))(final)
masked = multiply([exponentiate, candidates_in])
# Normalise each row separately so every sample's probabilities sum to 1;
# K.sum(x) without an axis divided by the total of the whole batch.
predicted = Lambda(lambda x: x / K.sum(x, axis=-1, keepdims=True))(masked)

# Compile with categorical crossentropy and adam
mdl = Model(inputs=[prior_in, shopper_in, candidates_in],
            outputs=predicted)
mdl.compile(loss='categorical_crossentropy',
            optimizer='adam',
            metrics=['accuracy'])

  name=name)


In [158]:
mdl.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_19 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
input_20 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_13 (Embedding)        (None, 1, 10)        22520       input_19[0][0]                   
__________________________________________________________________________________________________
embedding_14 (Embedding)        (None, 1, 10)        310         input_20[0][0]                   
__________________________________________________________________________________________________
reshape_7 

In [159]:
# Train on the frames built in the exploratory cells (train_df, basket_df,
# y_df), not on the values returned by create_input_for_embed_network.
# NOTE(review): basket_df can end up with more columns than the N_products
# expected by candidates_in (see the col_* header shown above) — confirm the
# widths match before trusting the fit.
mdl.fit([train_df['first_prod'], train_df['user_id'], basket_df], y_df,  batch_size=128, epochs=3, verbose=1)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1a227e6fd0>