In [2]:
import pandas as pd
import numpy as np

from keras.layers.merge import dot
from keras.models import Model
from keras.layers import Input, Embedding, Flatten
from keras import backend as K

In [3]:
df = pd.read_csv('automotive.csv')
df

Unnamed: 0.1,Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,unixReviewTime
0,1,1,True,"04 19, 2018",ABCA1A8E4DGV1,0209688726,{'Color:': ' Blue'},1524096000
1,2,1,True,"04 16, 2018",A1NX8HM89FRQ32,0209688726,{'Color:': ' Black'},1523836800
2,3,3,True,"04 13, 2018",A1X77G023NY0KY,0209688726,{'Color:': ' CA'},1523577600
3,4,5,True,"04 8, 2018",A3GK37JO2MGW6Q,0209688726,{'Color:': ' Black'},1523145600
4,5,5,True,"03 24, 2018",AIY18YON1TWJJ,0209688726,{'Color:': ' Black'},1521849600
...,...,...,...,...,...,...,...,...
936191,1711514,5,True,"06 19, 2018",A3H86E5N0F1Q5R,B01HJFDJ8S,,1529366400
936192,1711515,4,True,"08 23, 2017",AXH645B4SSAJY,B01HJFDJ8S,,1503446400
936193,1711516,5,True,"08 8, 2017",AMGJLCCNVFB8,B01HJFDJ8S,,1502150400
936194,1711517,5,True,"08 24, 2018",A1MJUNTX7CTR5U,B01HJI17Y8,{'Size:': ' H8-NEW'},1535068800


In [4]:
df = df.drop(columns=['Unnamed: 0', 'verified', 'reviewTime', 'style', 'unixReviewTime'])
df

Unnamed: 0,overall,reviewerID,asin
0,1,ABCA1A8E4DGV1,0209688726
1,1,A1NX8HM89FRQ32,0209688726
2,3,A1X77G023NY0KY,0209688726
3,5,A3GK37JO2MGW6Q,0209688726
4,5,AIY18YON1TWJJ,0209688726
...,...,...,...
936191,5,A3H86E5N0F1Q5R,B01HJFDJ8S
936192,4,AXH645B4SSAJY,B01HJFDJ8S
936193,5,AMGJLCCNVFB8,B01HJFDJ8S
936194,5,A1MJUNTX7CTR5U,B01HJI17Y8


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 936196 entries, 0 to 936195
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   overall     936196 non-null  int64 
 1   reviewerID  936196 non-null  object
 2   asin        936196 non-null  object
dtypes: int64(1), object(2)
memory usage: 21.4+ MB


In [6]:
df.reviewerID = df.reviewerID.astype('category').cat.codes.values
df.asin = df.asin.astype('category').cat.codes.values

In [7]:
df['reviewerID'].value_counts(ascending=True)

38125       1
133015      1
25832       1
48371       1
34905       1
         ... 
70608     106
96063     114
106026    114
117724    123
37592     127
Name: reviewerID, Length: 161088, dtype: int64

In [8]:
df['overall'] = df['overall'].astype('float32')

In [9]:
df.isnull().sum()

overall       0
reviewerID    0
asin          0
dtype: int64

In [10]:
users = df.reviewerID.unique()
items = df.asin.unique()

userid2idx = {o:i for i,o in enumerate(users)}
itemid2idx = {o:i for i,o in enumerate(items)}

In [11]:
userid2idx

{131852: 0,
 28278: 1,
 39302: 2,
 104561: 3,
 140778: 4,
 69607: 5,
 34027: 6,
 30918: 7,
 111330: 8,
 111768: 9,
 9148: 10,
 56704: 11,
 104958: 12,
 13553: 13,
 52219: 14,
 56359: 15,
 62882: 16,
 14654: 17,
 28582: 18,
 39289: 19,
 133615: 20,
 85331: 21,
 91533: 22,
 2772: 23,
 36777: 24,
 93640: 25,
 14772: 26,
 48011: 27,
 8454: 28,
 13571: 29,
 131214: 30,
 11889: 31,
 103594: 32,
 32886: 33,
 46751: 34,
 131905: 35,
 68395: 36,
 158314: 37,
 104245: 38,
 60854: 39,
 103311: 40,
 3371: 41,
 27015: 42,
 42485: 43,
 144737: 44,
 25770: 45,
 137978: 46,
 103420: 47,
 29740: 48,
 113234: 49,
 10659: 50,
 24757: 51,
 86878: 52,
 100068: 53,
 29764: 54,
 149525: 55,
 80206: 56,
 100869: 57,
 5360: 58,
 86858: 59,
 56870: 60,
 33391: 61,
 127933: 62,
 9417: 63,
 84533: 64,
 54055: 65,
 66405: 66,
 92334: 67,
 85725: 68,
 91005: 69,
 93058: 70,
 30438: 71,
 153266: 72,
 151470: 73,
 137859: 74,
 62138: 75,
 19030: 76,
 86459: 77,
 104272: 78,
 72442: 79,
 105075: 80,
 154785: 81,
 1196

In [10]:
df['reviewerID'] = df['reviewerID'].apply(lambda x: userid2idx[x])
df['asin'] = df['asin'].apply(lambda x: itemid2idx[x])
split = np.random.rand(len(df)) < 0.8
train = df[split]
valid = df[~split]
print(train.shape , valid.shape)

(748827, 3) (187369, 3)


In [11]:
n_items = len(df['asin'].unique())
n_users = len(df['reviewerID'].unique())
n_latent_factors = 64  

In [12]:
user_input = Input(shape=(1,), name='user_input', dtype='int64')
user_embedding = Embedding(n_users, n_latent_factors, name='user_embedding')(user_input)
user_vec = Flatten(name='FlattenUsers')(user_embedding)

In [13]:
item_input = Input(shape=(1,), name='item_input', dtype='int64')
item_embedding = Embedding(n_items, n_latent_factors, name='item_embedding')(item_input)
item_vec = Flatten(name='FlattenMovies')(item_embedding)

In [14]:
sim = dot([user_vec, item_vec], name='Simalarity-Dot-Product',axes=1)
model = Model([user_input, item_input],sim)

In [15]:
def root_mean_squared_error(y_true, y_pred):
        return K.sqrt(K.mean(K.square(y_pred - y_true))) 

model.compile(optimizer = "rmsprop", loss = root_mean_squared_error, metrics =["accuracy"])

In [16]:
train.shape
batch_size = 128
epochs = 50

In [17]:
History = model.fit([train.reviewerID,train.asin],train.overall, batch_size=batch_size,
            epochs=epochs, validation_data = ([valid.reviewerID,valid.asin],valid.overall), verbose = 1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
