In [1]:
# Mount Google Drive inside the Colab VM at /content/gdrive so that files
# stored on Drive can be reached through the local filesystem.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
%cd gdrive/MyDrive/colab_projects/recommendation_system/nbs/

/content/gdrive/MyDrive/colab_projects/recommendation_system/nbs


In [3]:
import sys
sys.path.append("../src/")
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from config import *

In [4]:
# Let pandas render up to 999 columns / rows before truncating the display.
pd.set_option("display.max_columns", 999)
pd.set_option("display.max_rows", 999)

In [5]:
# Read the train and validation rating tables from their parquet files.
train_path = "../data/processed/df_rating_movie_train.parquet"
val_path = "../data/processed/df_rating_movie_val.parquet"
df_train, df_val = (pd.read_parquet(path) for path in (train_path, val_path))

In [6]:
# Peek at the first five training rows (ids are still the raw ones here).
df_train.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
5355991,34760,4963,4.0,1008626350,Ocean's Eleven (2001),Crime|Thriller
16979199,110106,1183,1.5,1301889569,"English Patient, The (1996)",Drama|Romance|War
14781790,95711,4571,2.0,1009726295,Bill & Ted's Excellent Adventure (1989),Adventure|Comedy|Sci-Fi
5159343,33523,1411,4.0,1039513864,Hamlet (1996),Crime|Drama|Romance
18503349,119883,33660,4.0,1353051590,Cinderella Man (2005),Drama|Romance


In [7]:
# Every movie id that appears in either split, as a sorted list.
# The set union works directly on the unique-value arrays, so no intermediate
# Python lists are built and concatenated, and sorting makes the order
# deterministic instead of depending on set iteration order.
movies_ids = sorted(set(df_train.movieId.unique()).union(df_val.movieId.unique()))

In [8]:
# Every user id that appears in either split, as a sorted list.
# The set union works directly on the unique-value arrays, so no intermediate
# Python lists are built and concatenated, and sorting makes the order
# deterministic instead of depending on set iteration order.
users_ids = sorted(set(df_train.userId.unique()).union(df_val.userId.unique()))

In [9]:
# Number of distinct movie ids across the train and validation splits.
len(movies_ids)

59047

In [10]:
# Largest raw movie id. If it exceeds the number of distinct ids, the raw ids
# are sparse and need re-indexing before they can address an embedding table.
max(movies_ids)

209171

In [11]:
# Lookup from raw movie id to a dense 0-based index, assigned in ascending id
# order. Built with enumerate so no counter or loop variable is left behind in
# the notebook namespace.
dict_movies = {
    movie_id: position
    for position, movie_id in enumerate(sorted(movies_ids))
}

In [12]:
# Lookup from raw user id to a dense 0-based index, assigned in ascending id
# order. Built with enumerate so no counter or loop variable is left behind in
# the notebook namespace.
dict_users = {
    user_id: position
    for position, user_id in enumerate(sorted(users_ids))
}

In [13]:
def _encode_ids(raw_ids, lookup, label):
    """Translate raw ids into dense indices, rejecting ids the lookup lacks.

    Args:
        raw_ids: pandas Series holding the ids to translate.
        lookup: dict mapping raw id -> dense index.
        label: column description used in the error message.

    Returns:
        A Series with every id replaced by its dense index.

    Raises:
        ValueError: if at least one id has no entry in ``lookup``.
            ``Series.map`` would otherwise turn such ids into NaN silently,
            which is also what happens when this cell is executed a second
            time on columns that were already encoded.
    """
    encoded = raw_ids.map(lookup)
    n_missing = int(encoded.isna().sum())
    if n_missing:
        raise ValueError(
            f"{label}: {n_missing} id(s) not found in the lookup table; "
            "reload the data before re-running this cell"
        )
    return encoded


# Validate all four columns first and assign only afterwards, so a failure
# cannot leave the frames half-encoded.
train_movie_idx = _encode_ids(df_train["movieId"], dict_movies, "df_train.movieId")
val_movie_idx = _encode_ids(df_val["movieId"], dict_movies, "df_val.movieId")
train_user_idx = _encode_ids(df_train["userId"], dict_users, "df_train.userId")
val_user_idx = _encode_ids(df_val["userId"], dict_users, "df_val.userId")

df_train["movieId"] = train_movie_idx
df_val["movieId"] = val_movie_idx

df_train["userId"] = train_user_idx
df_val["userId"] = val_user_idx

In [14]:
# Summary statistics of the numeric columns after the ids were re-indexed;
# the min/max of userId and movieId show the resulting index range.
df_train.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,20000080.0,20000080.0,20000080.0,20000080.0
mean,81188.88,6107.92,3.533854,1215613000.0
std,46792.03,8262.76,1.060744,226874400.0
min,0.0,0.0,0.5,789652000.0
25%,40510.0,1167.0,3.0,1011760000.0
50%,80913.0,2855.0,3.5,1198869000.0
75%,121556.0,7920.0,4.0,1447207000.0
max,162540.0,59046.0,5.0,1574328000.0


In [15]:
# First five training rows again, now showing the re-indexed user/movie ids.
df_train.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
5355991,34759,4857,4.0,1008626350,Ocean's Eleven (2001),Crime|Thriller
16979199,110105,1154,1.5,1301889569,"English Patient, The (1996)",Drama|Romance|War
14781790,95710,4466,2.0,1009726295,Bill & Ted's Excellent Adventure (1989),Adventure|Comedy|Sci-Fi
5159343,33522,1373,4.0,1039513864,Hamlet (1996),Crime|Drama|Romance
18503349,119882,9979,4.0,1353051590,Cinderella Man (2005),Drama|Romance


In [16]:
# Column dtypes of the training frame before the float32 cast that follows.
df_train.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
title         object
genres        object
dtype: object

In [17]:
# Store the two id columns and the rating target as float32 in both splits.
for frame in (df_train, df_val):
    for col in ("userId", "movieId", "rating"):
        frame[col] = frame[col].astype("float32")

In [18]:
# Count the distinct user / movie indices present across both splits.
num_unique_users = len(
    set(df_train.userId.unique()) | set(df_val.userId.unique())
)
num_unique_movies = len(
    set(df_train.movieId.unique()) | set(df_val.movieId.unique())
)

In [19]:
# Lowest and highest rating found in either split; used later to rescale the
# model's sigmoid output.
min_rating = min(frame.rating.min() for frame in (df_train, df_val))
max_rating = max(frame.rating.max() for frame in (df_train, df_val))

In [20]:
# Display the rating bounds computed from the two splits.
min_rating, max_rating

(0.5, 5.0)

In [21]:
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras import optimizers as opt
from tensorflow.keras.layers import Embedding, multiply, concatenate, Flatten, Input, Dense
from tensorflow.keras.regularizers import l2

In [22]:
# User branch: a single index per sample, looked up in an embedding table with
# num_unique_users + 1 rows of EMBEDDING_SIZE values each.
users_input = Input(shape=(1,), name="users_input")
users_embedding = Embedding(
    input_dim=num_unique_users + 1,
    output_dim=EMBEDDING_SIZE,
    name="users_embeddings",
)(users_input)

# Movie branch: same construction for movie indices.
movies_input = Input(shape=(1,), name="movies_input")
movies_embedding = Embedding(
    input_dim=num_unique_movies + 1,
    output_dim=EMBEDDING_SIZE,
    name="movies_embeddings",
)(movies_input)

# Join the two embeddings and flatten them into one feature vector per sample.
input_terms = Flatten(name="fl_inputs")(
    concatenate([users_embedding, movies_embedding])
)

# Fully connected tower: ReLU layers of decreasing width, each with a small
# L2 penalty on its kernel.
for dense_name, dense_units in (
    ("dense_0", 1024),
    ("dense_1", 512),
    ("dense_2", 256),
    ("dense_3", 128),
    ("dense_4", 64),
):
    input_terms = Dense(
        units=dense_units,
        activation="relu",
        name=dense_name,
        kernel_regularizer=l2(1e-5),
    )(input_terms)

# The sigmoid yields a value in (0, 1); stretch and shift it onto the observed
# rating interval [min_rating, max_rating].
rating_span = max_rating - min_rating
output = Dense(1, activation="sigmoid", name="output")(input_terms)
output = output * rating_span + min_rating

In [23]:
# Wrap the graph into a Keras Model that takes [user index, movie index]
# and returns the rescaled rating prediction.
model = Model(inputs=[users_input, movies_input], outputs=output)

In [24]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
users_input (InputLayer)        [(None, 1)]          0                                            
__________________________________________________________________________________________________
movies_input (InputLayer)       [(None, 1)]          0                                            
__________________________________________________________________________________________________
users_embeddings (Embedding)    (None, 1, 50)        8127100     users_input[0][0]                
__________________________________________________________________________________________________
movies_embeddings (Embedding)   (None, 1, 50)        2952400     movies_input[0][0]               
______________________________________________________________________________________________

In [25]:
# Adam optimizer with a step size of 0.005. `learning_rate` is the supported
# keyword; `lr` is only a deprecated alias and is rejected by recent Keras
# releases.
opt_adam = opt.Adam(learning_rate=0.005)

# Optimise mean squared error on the rating; also report mean absolute error.
model.compile(
    optimizer=opt_adam,
    loss="mse",
    metrics=["mean_absolute_error"],
)

In [26]:
# Train for three epochs on the training split and evaluate on the validation
# split after every epoch.
train_features = [df_train.userId, df_train.movieId]
val_features = [df_val.userId, df_val.movieId]
model.fit(
    x=train_features,
    y=df_train.rating,
    validation_data=(val_features, df_val.rating),
    batch_size=2048,
    epochs=3,
    verbose=1,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f5a2da26250>

In [27]:
# Predicted ratings for every (user, movie) pair of the validation split.
model.predict(x=[df_val.userId, df_val.movieId])

array([[3.0393615],
       [4.1650534],
       [4.7213693],
       ...,
       [3.2276237],
       [4.1517754],
       [3.6566398]], dtype=float32)

In [28]:
# Distribution of the validation predictions (count, mean, quartiles, range).
val_predictions = model.predict([df_val.userId, df_val.movieId])
pd.DataFrame(val_predictions).describe()

Unnamed: 0,0
count,5000019.0
mean,3.544742
std,0.7129393
min,0.5037349
25%,3.164183
50%,3.64103
75%,4.050724
max,4.998502


In [29]:
# Continue training the same model object for three more epochs with the same
# settings; the weights carry over from the earlier fit call.
train_features = [df_train.userId, df_train.movieId]
val_features = [df_val.userId, df_val.movieId]
model.fit(
    x=train_features,
    y=df_train.rating,
    validation_data=(val_features, df_val.rating),
    batch_size=2048,
    epochs=3,
    verbose=1,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f5a2f79a490>