In [1]:
from keras.models import Model
from keras.layers.core import Dense, Reshape, Lambda
from keras.layers import Input, Embedding, concatenate, Multiply
from keras import backend as K
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from keras import preprocessing
from keras.regularizers import l2
import random
from keras.layers.advanced_activations import LeakyReLU
import keras

Using TensorFlow backend.


In [2]:
# Directory that contains the input CSV. The path is specific to one machine,
# so it has to be edited before the notebook is run anywhere else.
# (An earlier variant loaded the file directly with
#  pd.read_csv('/Users/BharathiSrinivasan/Documents/GitHub/Thesis/merged_data.csv').)
folder = 'C:\\Users\\Pascal\\Documents\\GitHub\\instacart-market-basket-analysis\\'

In [3]:
# Read merged_data.csv from the configured folder into df_big.
df_big = pd.read_csv(folder + 'merged_data.csv')

In [4]:
# Sample the orders of the first n customers
def data_nusers(df, n):
    """Return the rows of ``df`` that belong to its first ``n`` distinct users.

    Users are taken in order of first appearance in ``df.user_id``. The result
    lists every row of the first user, then every row of the second user, and
    so on; each row keeps its original index label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``user_id`` column.
    n : int
        Number of distinct users to keep. Values larger than the number of
        users keep everyone; zero or negative values give an empty frame.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``df``.
    """
    selected_users = df.user_id.unique()[:max(n, 0)]
    if len(selected_users) == 0:
        # Nothing to sample: keep the column layout but no rows.
        return df.iloc[0:0].copy()

    # Collect the per-user slices and concatenate once. Growing a frame with
    # DataFrame.append inside the loop copies all previous rows on every
    # iteration, and DataFrame.append no longer exists in pandas 2.x.
    per_user = [df[df.user_id == user] for user in selected_users]
    return pd.concat(per_user)

In [5]:
# Work on a small sample: all order lines of the first 10 users in df_big.
df_use = data_nusers(df_big,10)

In [6]:
# Sanity check: how many distinct product ids occur in the sample.
len(df_use.product_id.unique())

796

In [7]:
# Number of distinct product ids and distinct user ids in the sampled orders
N_products = df_use['product_id'].nunique()
N_shoppers = df_use['user_id'].nunique()

In [8]:
# Columns whose raw ids are re-indexed (via val2idx) before being used as
# embedding inputs.
EMBEDDING_COLUMNS = ["user_id", "product_id"]

In [9]:
# Helper to index columns before embeddings
def val2idx(df, cols):
    """Re-encode the given columns as contiguous 1-based integer indices.

    For every column in ``cols`` the distinct values are numbered 1, 2, 3, ...
    in order of first appearance and the column is replaced by those numbers.
    Index 0 is never produced.

    The input frame is left untouched; the re-encoded data is returned as a
    new frame, so re-running the calling cell always starts from the same
    input.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame containing every column named in ``cols``.
    cols : iterable of str
        Names of the columns to re-encode.

    Returns
    -------
    (pandas.DataFrame, dict)
        The re-encoded copy of ``df`` and a dict mapping each column name to
        its number of distinct values after encoding.
    """
    indexed = df.copy()
    unique_vals = dict()

    for col in cols:
        # enumerate() is 0-based; shift by one so indices start at 1.
        codes = {value: position + 1
                 for position, value in enumerate(indexed[col].unique())}
        indexed[col] = indexed[col].map(codes)
        unique_vals[col] = indexed[col].nunique()

    return indexed, unique_vals

In [10]:
# Show the product and shopper counts of the sample.
print(N_products,N_shoppers )

796 10


In [15]:
# Replace raw user/product ids with 1-based contiguous indices; unique_vals
# holds the number of distinct values per re-indexed column.
df_deep, unique_vals = val2idx(df_use, EMBEDDING_COLUMNS)

In [16]:
# (rows, columns) of the re-indexed frame
df_deep.shape

(10624, 13)

In [17]:
def first_prod(order):
    """Return the ``product_id`` of the first row with ``add_to_cart_order == 1``.

    Rows of ``order`` are scanned top to bottom; ``None`` is returned when no
    row matches.
    """
    first_items = (
        line['product_id']
        for _, line in order.iterrows()
        if line['add_to_cart_order'] == 1
    )
    return next(first_items, None)

In [18]:
def next_prod(order):
    """Return the ``product_id`` of the first row with ``add_to_cart_order == 2``.

    Rows of ``order`` are scanned top to bottom; the result is ``None`` when
    the order has no such row.
    """
    second_item = None
    for _, line in order.iterrows():
        if line['add_to_cart_order'] != 2:
            continue
        second_item = line['product_id']
        break
    return second_item

In [19]:
def create_basket(order):
    """Return the product ids of an order, excluding the first item added.

    Every row whose ``add_to_cart_order`` is not 1 contributes its
    ``product_id``, converted to ``str``, in row order.

    The ``order`` frame is only read. The function is used inside
    ``groupby(...).apply``, where pandas does not support functions that
    modify the group they are given, so the string conversion is done on the
    selected values rather than by overwriting the ``product_id`` column.

    Parameters
    ----------
    order : pandas.DataFrame
        Lines of one order with ``add_to_cart_order`` and ``product_id``.

    Returns
    -------
    list of str
        Possibly empty list of product ids.
    """
    later_items = order.loc[order['add_to_cart_order'] != 1, 'product_id']
    return later_items.astype(str).tolist()

In [23]:
def transform_data_for_embedding(df):
    """Collapse order lines into one row per order.

    For each ``order_id`` the result holds the first product added
    (``first_prod``), the second product added (``next_product``) and the list
    of all products except the first (``basket``).

    Returns
    -------
    (pandas.DataFrame, int, int)
        The per-order frame (with ``order_id`` as a regular column), the
        number of distinct ``product_id`` values in ``df`` and the number of
        distinct ``user_id`` values in ``df``.
    """
    by_order = df.groupby(['order_id'])
    first_by_order = by_order.apply(first_prod)
    second_by_order = by_order.apply(next_prod)
    basket_by_order = by_order.apply(create_basket)

    transform_df = pd.DataFrame(first_by_order, columns=['first_prod'])
    transform_df['next_product'] = second_by_order.values
    transform_df['basket'] = basket_by_order.tolist()
    transform_df = transform_df.reset_index()

    # Number of product IDs and shoppers available
    product_count = df['product_id'].nunique()
    shopper_count = df['user_id'].nunique()

    return transform_df, product_count, shopper_count

In [24]:
# One row per order (first product, second product, remaining basket), plus
# the product and shopper counts of the re-indexed data.
df1, N_products, N_shoppers = transform_data_for_embedding(df_deep)

In [54]:
#for _,row in final_df.iterrows():
#    row.basket = random.shuffle(row.basket)
    

In [33]:
def create_input_for_embed_network(df, transform_df, N_products):
    """Build the network inputs and the one-hot target from per-order data.

    Parameters
    ----------
    df : pandas.DataFrame
        Order lines with at least ``order_id``, ``user_id`` and ``product_id``.
    transform_df : pandas.DataFrame
        One row per order with ``order_id``, ``first_prod``, ``next_product``
        and ``basket`` (an iterable of product ids, as strings or ints).
    N_products : int
        Number of distinct products. Product ids are expected to be the
        integers ``1..N_products``.

    Returns
    -------
    tuple
        ``(first_prod, user_id, basket_df, y_df)`` where

        * ``first_prod`` and ``user_id`` are Series, one value per kept order;
        * ``basket_df`` is a 0/1 frame with columns ``col_1..col_<N_products>``
          marking which products are in each order's basket;
        * ``y_df`` is the one-hot encoding of ``next_product`` with one column
          per distinct ``df.product_id``.

        Orders with any missing field (for example no second product) are
        dropped; all four results share the same row index.
    """
    print('next function', transform_df.head())

    # Attach each order's user, then drop incomplete orders.
    order_users = df.drop_duplicates(subset=['order_id', 'user_id'])[['order_id', 'user_id']]
    train_df = pd.merge(transform_df, order_users, how='left', on='order_id').dropna()

    # Candidate mask: one column per product id 1..N_products. The matrix is
    # filled in memory and wrapped once, instead of enlarging a frame cell by
    # cell. Ids outside 1..N_products (including a padding id of 0) have no
    # column and are skipped.
    product_labels = [str(pid) for pid in range(1, N_products + 1)]
    column_of = {label: position for position, label in enumerate(product_labels)}
    mask = np.zeros((len(train_df), N_products), dtype=int)
    for row_pos, basket in enumerate(train_df['basket']):
        for val in basket:
            col_pos = column_of.get(str(val))
            if col_pos is not None:
                mask[row_pos, col_pos] = 1
    basket_df = pd.DataFrame(mask,
                             index=train_df.index,
                             columns=['col_' + label for label in product_labels])

    # One-hot target over the full product vocabulary. pd.Categorical replaces
    # the removed ``astype('category', categories=...)`` call form.
    next_product = pd.Series(
        pd.Categorical(train_df['next_product'], categories=df['product_id'].unique()),
        index=train_df.index,
    )
    y_df = pd.get_dummies(next_product, prefix='next_product').astype('uint8')

    return train_df['first_prod'], train_df['user_id'], basket_df, y_df

In [55]:
# Build the model inputs (first product, user, basket mask) and the one-hot
# target (next product) from the per-order frame.
product_in , user_in, basket_in, predicted_product = create_input_for_embed_network(df_deep, df1, N_products)

next function    order_id  first_prod  next_product  \
0         2           1           2.0   
1         3          51          52.0   
2         4         184         185.0   
3         5         168         286.0   
4         6         431         432.0   

                                              basket  
0                           [2, 3, 4, 5, 6, 7, 8, 9]  
1                       [52, 53, 54, 55, 56, 57, 58]  
2  [185, 186, 187, 188, 189, 190, 191, 192, 193, ...  
3  [286, 287, 288, 199, 289, 290, 291, 292, 293, ...  
4                                         [432, 433]  


  """Entry point for launching an IPython kernel.


In [48]:
# Size of each embedding vector (shared by the product and shopper tables and
# by the reshape that flattens them).
EMBEDDING_DIM = 10

# Integer IDs representing 1-hot encodings
prior_in = Input(shape=(1,))
shopper_in = Input(shape=(1,))

# Dense N-hot encoding for candidate products
candidates_in = Input(shape=(N_products,))

# Embeddings. Ids produced by val2idx start at 1, hence the N+1 table sizes.
prior = Embedding(N_products+1, EMBEDDING_DIM)(prior_in)
shopper = Embedding(N_shoppers+1, EMBEDDING_DIM)(shopper_in)

# Flatten each (1, EMBEDDING_DIM) embedding and merge them together
reshape = Reshape(target_shape=(EMBEDDING_DIM,))
combined = keras.layers.concatenate([reshape(prior), reshape(shopper)])

# Hidden layers
hidden_3 = Dense(100, activation='relu')(combined)
# NOTE(review): a single-unit layer is a very narrow bottleneck in front of
# the N_products-wide output; kept as in the original, confirm it is intended.
hidden_4 = Dense(1, activation='relu')(hidden_3)

# Final 'fan-out' into the space of future products
final = Dense(N_products, activation='relu')(hidden_4)

# Ensure we do not overflow when we exponentiate. The maximum is taken per
# sample (last axis), so one row's scores never depend on the rest of the batch.
final = Lambda(lambda x: x - K.max(x, axis=-1, keepdims=True))(final)

# Masked soft-max using Lambda and merge-multiplication. The normalising sum
# is also per sample, so each row of the output sums to 1 over its candidate
# products. K.epsilon() keeps the division finite if the masked sum is zero.
exponentiate = Lambda(lambda x: K.exp(x))(final)
masked = keras.layers.multiply([exponentiate, candidates_in])
predicted = Lambda(
    lambda x: x / (K.sum(x, axis=-1, keepdims=True) + K.epsilon())
)(masked)

# Compile with categorical crossentropy and adam
mdl = Model(inputs=[prior_in , shopper_in, candidates_in],
            outputs=predicted)
mdl.compile(loss='categorical_crossentropy', 
            optimizer='adam',
            metrics=['accuracy'])



In [49]:
# Print the layers of the model with their output shapes and parameter counts.
mdl.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_46 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
input_47 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_31 (Embedding)        (None, 1, 10)        22520       input_46[0][0]                   
__________________________________________________________________________________________________
embedding_32 (Embedding)        (None, 1, 10)        310         input_47[0][0]                   
__________________________________________________________________________________________________
reshape_16

In [66]:
# Train for 50 epochs with batches of 128 orders; verbose=0 suppresses the
# per-epoch log, so only the returned History object is displayed.
mdl.fit([product_in , user_in, basket_in], predicted_product,  batch_size=128, epochs=50, verbose=0)

<keras.callbacks.History at 0x1f042a7ef98>

In [67]:
# Serialize the model architecture to JSON.
model_json = mdl.to_json()
with open("NN_embed_model.json", "w") as json_file:
    json_file.write(model_json)
# Serialize the weights to HDF5 (stored separately from the architecture).
mdl.save_weights("NN_embed_model.h5")
print("Saved model to disk")

Saved model to disk
