In [44]:
import pandas as pd
import numpy as np
import seaborn as sns #visualisation
import matplotlib.pyplot as plt #visualisation
%matplotlib inline 
sns.set(color_codes=True)

## Import Data

In [59]:
# Passing `names` makes pandas read the file as header-less; `header=None` spells that out.
df_ratings = pd.read_csv(
    "All_Beauty.csv",
    header=None,
    names=['productID', 'reviewerID', 'rating', 'unixReviewTime'],
)
df_ratings.head(n=5)

Unnamed: 0,productID,reviewerID,rating,unixReviewTime
0,143026860,A1V6B6TNIC10QE,1.0,1424304000
1,143026860,A2F5GHSXFQ0W6J,4.0,1418860800
2,143026860,A1572GUYS7DGSR,4.0,1407628800
3,143026860,A1PSGLFK1NSVO,5.0,1362960000
4,143026860,A6IKXKZMTKGSC,5.0,1324771200


In [60]:
# Read the AMAZON_FASHION ratings with the same four columns and append them row-wise.
df_2 = pd.read_csv(
    "AMAZON_FASHION.csv",
    header=None,
    names=['productID', 'reviewerID', 'rating', 'unixReviewTime'],
)
df_ratings = pd.concat([df_ratings, df_2], axis=0)

In [61]:
# Read the Luxury_Beauty ratings with the same four columns and append them row-wise.
df_3 = pd.read_csv(
    "Luxury_Beauty.csv",
    header=None,
    names=['productID', 'reviewerID', 'rating', 'unixReviewTime'],
)
df_ratings = pd.concat([df_ratings, df_3], axis=0)

In [62]:
# Preview the first five rows of the combined ratings frame.
df_ratings.head(n=5)

Unnamed: 0,productID,reviewerID,rating,unixReviewTime
0,143026860,A1V6B6TNIC10QE,1.0,1424304000
1,143026860,A2F5GHSXFQ0W6J,4.0,1418860800
2,143026860,A1572GUYS7DGSR,4.0,1407628800
3,143026860,A1PSGLFK1NSVO,5.0,1362960000
4,143026860,A6IKXKZMTKGSC,5.0,1324771200


## Exploratory Data Analysis

In [63]:
# drop irrelevant columns
# We only need ratings, so we drop the review timestamp.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second run no longer raises KeyError on the missing column.
df_ratings = df_ratings.drop(columns=['unixReviewTime'], errors='ignore')

In [64]:
# Check the data type of each remaining column.
df_ratings.dtypes

productID      object
reviewerID     object
rating        float64
dtype: object

In [65]:
# Total number of rows and columns, as a (rows, columns) tuple.
df_ratings.shape

(1829609, 3)

In [66]:
# Shape of the subset of rows that exactly repeat an earlier row.
df_ratings.loc[df_ratings.duplicated()].shape

(56187, 3)

In [67]:
# Remove exact duplicate rows, keeping the first occurrence of each.
df_ratings = df_ratings.drop_duplicates(keep='first')

In [68]:
# Non-null value count per column after de-duplication.
df_ratings.count(axis=0)

productID     1773422
reviewerID    1773422
rating        1773422
dtype: int64

In [69]:
# Count missing values per column.
print(df_ratings.isna().sum())

productID     0
reviewerID    0
rating        0
dtype: int64


In [70]:
# Sanity check: list the distinct rating values to spot invalid entries.
print("Unique value : ", pd.unique(df_ratings['rating']))

Unique value :  [1. 4. 5. 2. 3.]


In [71]:
# Number of distinct reviewers.
df_ratings['reviewerID'].nunique(dropna=True)

1399004

In [72]:
# Number of distinct products.
df_ratings['productID'].nunique(dropna=True)

230895

In [80]:
# Number of ratings given by each reviewer, most active reviewers first.
df_ratings['reviewerID'].value_counts(sort=True, ascending=False)

A2GJX2KCUSR0EI    147
AENH50GW3OKDA     115
A2V5R832QCSOMX    106
AHN86VFJIJ2JP      99
AQY5XBYSENNZQ      94
                 ... 
A3CVVSNLPRNM3N      1
A2W5IG9AS5Q6L0      1
A17O5L9ADHROAT      1
A1ZDLWCOPV6RUT      1
A2KIGUQ54W9AQ6      1
Name: reviewerID, Length: 1399004, dtype: int64

In [116]:
# Keep only reviewers who have given MORE than 10 ratings (rows of reviewers
# with 10 or fewer ratings are dropped).
# A vectorized per-row group size replaces groupby().filter(lambda ...), which
# calls a Python function once per reviewer and is slow with over a million groups.
# The mask is applied positionally (to_numpy) so duplicate index labels left
# over from pd.concat cannot interfere; row order is unchanged.
reviews_per_user = df_ratings.groupby('reviewerID')['reviewerID'].transform('size')
df_filter = df_ratings[reviews_per_user.to_numpy() > 10].copy()

In [117]:
# Distinct reviewers that remain after filtering.
df_filter['reviewerID'].nunique(dropna=True)

1323

In [118]:
# Distinct products that remain after filtering.
df_filter['productID'].nunique(dropna=True)

8155

## Data Preparation

In [119]:
# Rename the columns to the names used in the rest of the notebook.
df_filter = df_filter.rename(
    columns=dict(reviewerID="user_id", productID="product_id", rating="weight")
)

In [120]:
# Turn the two id columns into pandas categoricals so they can be integer-encoded.
df_filter = df_filter.astype({"user_id": "category", "product_id": "category"})
df_filter.dtypes

product_id    category
user_id       category
weight         float64
dtype: object

In [121]:
# Add the integer category codes of both id columns as new columns.
df_filter = df_filter.assign(
    user_id_cat=df_filter["user_id"].cat.codes,
    product_id_cat=df_filter["product_id"].cat.codes,
)
df_filter.head(n=5)

Unnamed: 0,product_id,user_id,weight,user_id_cat,product_id_cat
720,1620213982,AIBZYH7KI1AF6,5.0,1164,1
1562,1620213982,A15WSZ11YEOKMU,3.0,43,1
1766,1620213982,A3K6852AILJ825,5.0,895,1
4455,1620213982,A3LX2VT91TE1M1,5.0,914,1
4518,1620213982,A2NJVP1JZ5G8VD,5.0,577,1


In [92]:
# CUDA_VISIBLE_DEVICES is set to "-1" before TensorFlow is imported; this is
# intended to hide all CUDA GPUs so TensorFlow runs on the CPU. The assignment
# has to stay above the import to take effect.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

import tensorflow as tf
# Display the installed TensorFlow version.
tf.version.VERSION

'2.2.0'

In [93]:
from tensorflow import keras
from tensorflow.keras import layers

In [94]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import time

In [122]:
# split data into train and validation sets
def split_data(df, test_size=0.1, random_state=None) :
    """Split the encoded ratings into training and validation parts.

    Parameters
    ----------
    df : DataFrame with `user_id_cat`, `product_id_cat` and `weight` columns.
    test_size : share of rows held out for validation (default 0.1).
    random_state : optional seed forwarded to `train_test_split`; pass an
        integer to make the split reproducible. `None` keeps the previous
        unseeded behavior.

    Returns
    -------
    (X_train, y_train, X_val, y_val) -- note the order: features and target
    of the training part first, then those of the validation part.
    """
    X_train, X_val, y_train, y_val = train_test_split(
        df[['user_id_cat', 'product_id_cat']],
        df['weight'],
        test_size=test_size,
        random_state=random_state,
    )
    return X_train, y_train, X_val, y_val

# define num_users and num_items
def define_num_users_items(df) :
    """Return (num_users, num_items) for the frame passed in.

    Each value is the number of distinct codes in `user_id_cat` /
    `product_id_cat` plus one. The counts are taken from the `df` argument;
    the previous body read an undefined global `df_df` and raised NameError
    on a fresh kernel.
    """
    num_users = df['user_id_cat'].nunique() + 1
    num_items = df['product_id_cat'].nunique() + 1
    return num_users, num_items

# define inputs
def define_inputs(X, y):
    """Convert features and target into column-shaped NumPy arrays.

    Returns `(user_input, item_input, target)`, each reshaped to `(n, 1)`,
    built from `X['user_id_cat']`, `X['product_id_cat']` and `y` respectively.
    """
    def _as_column(values):
        # Copy into an ndarray and give it an explicit single-column shape.
        arr = np.array(values)
        return arr.reshape(arr.shape[0], 1)

    return (
        _as_column(X['user_id_cat']),
        _as_column(X['product_id_cat']),
        _as_column(y),
    )

In [123]:
# normalize weight: the 1-5 star ratings become 0.1-0.5, inside the (0, 1)
# range of the model's sigmoid output.
# NOTE: this rescales `weight` in place, so re-running this cell shrinks the
# values again; re-run the data-preparation cells first.
df_filter['weight'] = df_filter['weight']*0.1

# split data into train and validation sets
X_train, y_train, X_val, y_val = split_data(df_filter)

# define num_users num_items
# The embedding tables must cover every encoded id, including ids that only
# occur in the validation split, so they are sized from the full frame rather
# than from X_train (a random 90% sample can miss some product codes).
num_users, num_items = define_num_users_items(df_filter)
val_num_users, val_num_items = define_num_users_items(X_val)

# define inputs for training and validation
user_input, item_input, target = define_inputs(X_train, y_train)
val_user_input, val_item_input, val_target = define_inputs(X_val, y_val)

In [124]:
# Show the vocabulary sizes that will be used for the two embedding tables.
print('num_user : ', num_users, 'num_item : ', num_items)

num_user :  1324 num_item :  8156


In [125]:
# Show how many (user, item) pairs ended up in the training and validation inputs.
print('user-item input : ', user_input.shape[0], '\nvalidation user-item input : ', val_user_input.shape[0])

user-item input :  21031 
validation user-item input :  2337


## Create Model

In [104]:
from statistics import mean
import math
import random
from tqdm import tqdm

In [126]:
# build the (user, item) candidate pairs for one user
def create_nodes(user_id, product) :
    """Pair one user with every candidate product.

    Returns `(user, item)`: `user` is an `(n, 1)` array filled with `user_id`
    and `item` is `product` reshaped to `(n, 1)`, where `n = len(product)`.
    """
    n_candidates = len(product)
    item = product.reshape(product.shape[0], 1)
    user = np.full((n_candidates, 1), user_id)
    return user, item

# generate predicted recommendation
def top_n_recommendation(user_id, df, model, N) :
    """Return the codes of the N products with the highest predicted score.

    Every distinct `product_id_cat` in `df` is scored for `user_id` with
    `model.predict`. The result is an integer array ordered by ASCENDING
    score, so the best product is the last element. If fewer than N candidates
    exist, all of them are returned. For `N <= 0` an empty array is returned
    (previously `merge[-0:]` returned every candidate).
    """
    if N <= 0:
        return np.empty(0, dtype=int)

    product = df['product_id_cat'].unique()
    user_test, item_candidate = create_nodes(user_id, product)
    link_pred = model.predict({"user": user_test, "item": item_candidate})

    # Sort the item codes by score directly instead of stacking ids and scores
    # into one float array and casting the ids back to int afterwards.
    item_codes = item_candidate.flatten()
    scores = np.asarray(link_pred).flatten()
    order = np.argsort(scores, kind='stable')
    return item_codes[order][-N:].astype(int)

In [127]:
# create model
def create_model(num_hidden, num_users=num_users, num_items=num_items, lr=0.001,
                 embedding_dim=64, hidden_units=64) :
    """Build and compile the user/item embedding regression network.

    Parameters
    ----------
    num_hidden : number of ReLU Dense layers stacked after the concatenation.
    num_users, num_items : input sizes of the user and item embedding tables.
        NOTE: the defaults are captured from the globals when this cell is
        executed; re-run this cell after changing those globals, or pass the
        values explicitly.
    lr : learning rate of the Adam optimizer.
    embedding_dim : width of each embedding vector (default 64, as before).
    hidden_units : width of each hidden Dense layer (default 64, as before).

    Returns
    -------
    A compiled `keras.Model` with inputs named "user" and "item" and a sigmoid
    output named "prediction", trained with MSE loss and reporting RMSE.
    """
    input_user = keras.Input(shape=(1,), name="user")
    input_item = keras.Input(shape=(1,), name="item")

    user_embedding = layers.Embedding(num_users, embedding_dim)(input_user)
    item_embedding = layers.Embedding(num_items, embedding_dim)(input_item)

    # Embedding output is (batch, 1, embedding_dim); flatten away the length-1 axis.
    user_latent = layers.Flatten()(user_embedding)
    item_latent = layers.Flatten()(item_embedding)

    x = layers.Concatenate()([user_latent, item_latent])

    for _ in range(num_hidden) :
        x = layers.Dense(hidden_units, activation='relu')(x)

    prediction = layers.Dense(1, activation='sigmoid', name="prediction")(x)

    model = keras.Model(inputs=[input_user, input_item], outputs=prediction)

    optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

    model.compile(optimizer=optimizer, metrics=[tf.keras.metrics.RootMeanSquaredError()], loss={"prediction": 'mse'})

    return model

In [128]:
def model_fit(model, user_input=user_input, item_input=item_input, target=target,
              epochs=1, batch_size=10,
              val_user_input=val_user_input, val_item_input=val_item_input,
              val_target=val_target) :
    """Fit `model`, print the wall-clock training time and return the history.

    The training and validation arrays default to the globals that existed
    when this cell was executed. Inputs are passed to `model.fit` under the
    names "user" / "item" and the target under "prediction".
    """
    train_features = {"user": user_input, "item": item_input}
    train_labels = {"prediction": target}
    validation = (
        {"user": val_user_input, "item": val_item_input},
        {"prediction": val_target},
    )

    ## Calculate training time
    started_at = time.time()
    history = model.fit(
        train_features,
        train_labels,
        validation_data=validation,
        batch_size=batch_size,
        epochs=epochs,
        verbose=1,
    )
    elapsed = time.time() - started_at

    print("Time elapsed training: {} second".format(elapsed))
    return history

In [129]:
# Build the network with four hidden layers and a reduced learning rate, then show its layout.
model = create_model(num_hidden=4, lr=0.0005)
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user (InputLayer)               [(None, 1)]          0                                            
__________________________________________________________________________________________________
item (InputLayer)               [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 1, 64)        84736       user[0][0]                       
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 1, 64)        521984      item[0][0]                       
____________________________________________________________________________________________

In [130]:
# Train with the default arrays, epochs and batch size of model_fit.
history = model_fit(model=model)

Time elapsed training: 29.711272716522217 second


## Storing Recommendations

In [133]:
# Preview the encoded ratings (weights are already rescaled at this point).
df_filter.head(n=5)

Unnamed: 0,product_id,user_id,weight,user_id_cat,product_id_cat
720,1620213982,AIBZYH7KI1AF6,0.5,1164,1
1562,1620213982,A15WSZ11YEOKMU,0.3,43,1
1766,1620213982,A3K6852AILJ825,0.5,895,1
4455,1620213982,A3LX2VT91TE1M1,0.5,914,1
4518,1620213982,A2NJVP1JZ5G8VD,0.5,577,1


In [140]:
# Build lookup tables (DataFrames) between the original ids and their integer codes.
user_dict = df_filter.loc[:, ['user_id', 'user_id_cat']].drop_duplicates(keep='first')
product_dict = df_filter.loc[:, ['product_id', 'product_id_cat']].drop_duplicates(keep='first')

In [170]:
# Generate the top-100 recommendation string for every user.
# Entries are appended in the row order of `user_dict`, so the list lines up
# with `user_dict['user_id'].values`.
code_to_product = dict(zip(product_dict['product_id_cat'], product_dict['product_id']))
recommendation_list = []

for user_code in tqdm(user_dict['user_id_cat'].values) :
    # Pass a plain integer code rather than a one-element Series.
    items = top_n_recommendation(int(user_code), df_filter, model, 100)
    # top_n_recommendation returns codes by ascending score; reverse them so the
    # best product comes first. Mapping code by code keeps that ranking, whereas
    # the previous `isin` lookup returned products in `product_dict` row order.
    ranked_products = [str(code_to_product[int(code)]) for code in items[::-1]]
    recommendation_list.append(','.join(ranked_products))

100%|██████████████████████████████████████████████████████████████████████████████| 1323/1323 [17:57<00:00,  1.23it/s]


In [None]:
# for i in range(len(recommendation_list)) :
#     recommendation_list[i] = ','.join(recommendation_list[i].tolist())

In [223]:
# Assemble one row per reviewer: the original id and its comma-joined recommendations.
d = dict(
    reviewerID=user_dict['user_id'].values,
    recommendation_product=recommendation_list,
)
df_rec = pd.DataFrame(d)

In [224]:
# export to sql
# NOTE: the 'sqlite://' URL has no file path; per the SQLAlchemy documentation
# this is an in-memory SQLite database, so tables written through `engine`
# exist only for the lifetime of this session.
from sqlalchemy import create_engine
engine = create_engine('sqlite://', echo=False)

In [225]:
# Write the recommendations to the `recommendation` table, replacing any existing one.
df_rec.to_sql(name='recommendation', con=engine, if_exists='replace')

In [228]:
# Also save the recommendations as a CSV file, without the index column.
df_rec.to_csv(path_or_buf=r'recommendation.csv', index=False)