In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
%cd gdrive/MyDrive/colab_projects/recommendation_system/nbs/

/content/gdrive/MyDrive/colab_projects/recommendation_system/nbs


In [3]:
import sys
sys.path.append("../src/")
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from config import *

In [4]:
pd.options.display.max_columns=999
pd.options.display.max_rows=999

In [5]:
df_train = pd.read_parquet("../data/processed/df_rating_movie_train.parquet")
df_val = pd.read_parquet("../data/processed/df_rating_movie_val.parquet")

In [6]:
movies_ids = list(set(list(df_train.movieId.unique()) + list(df_val.movieId.unique())))

In [7]:
users_ids = list(set(list(df_train.userId.unique()) + list(df_val.userId.unique())))

In [8]:
len(movies_ids)

59047

In [9]:
max(movies_ids)

209171

In [10]:
dict_movies = {}
index = 0
for ids in sorted(movies_ids):
    dict_movies[ids] = index
    index += 1

In [11]:
dict_users = {}
index = 0
for ids in sorted(users_ids):
    dict_users[ids] = index
    index += 1

In [12]:
df_train["movieId"] = df_train["movieId"].map(dict_movies)
df_val["movieId"] = df_val["movieId"].map(dict_movies)

df_train["userId"] = df_train["userId"].map(dict_users)
df_val["userId"] = df_val["userId"].map(dict_users)

In [13]:
df_train.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,20000080.0,20000080.0,20000080.0,20000080.0
mean,81188.88,6107.92,3.533854,1215613000.0
std,46792.03,8262.76,1.060744,226874400.0
min,0.0,0.0,0.5,789652000.0
25%,40510.0,1167.0,3.0,1011760000.0
50%,80913.0,2855.0,3.5,1198869000.0
75%,121556.0,7920.0,4.0,1447207000.0
max,162540.0,59046.0,5.0,1574328000.0


In [14]:
df_train.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
5355991,34759,4857,4.0,1008626350,Ocean's Eleven (2001),Crime|Thriller
16979199,110105,1154,1.5,1301889569,"English Patient, The (1996)",Drama|Romance|War
14781790,95710,4466,2.0,1009726295,Bill & Ted's Excellent Adventure (1989),Adventure|Comedy|Sci-Fi
5159343,33522,1373,4.0,1039513864,Hamlet (1996),Crime|Drama|Romance
18503349,119882,9979,4.0,1353051590,Cinderella Man (2005),Drama|Romance


In [15]:
df_train.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
title         object
genres        object
dtype: object

In [16]:
for col in ["userId", "movieId", "rating"]:
    df_train[col] = df_train[col].astype(np.float32)
    df_val[col] = df_val[col].astype(np.float32)

In [17]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [18]:
class NeuralCollaborativeFiltering(keras.Model):
    def __init__(self, num_unique_users, num_unique_movies, embedding_size, min_rating, max_rating):
        super(NeuralCollaborativeFiltering, self).__init__()
        self.num_unique_users = num_unique_users
        self.num_unique_movies = num_unique_movies
        self.embedding_size = embedding_size
        self.min_rating = min_rating
        self.max_rating = max_rating

        self.users_embedding = layers.Embedding(num_unique_users, embedding_size, embeddings_initializer="he_normal",
                                               embeddings_regularizer=keras.regularizers.l2(1e-6))
        self.users_bias = layers.Embedding(num_unique_users, 1)

        self.movies_embedding = layers.Embedding(num_unique_movies, embedding_size, embeddings_initializer="he_normal",
                                                embeddings_regularizer=keras.regularizers.l2(1e-6))
        
        self.movies_bias = layers.Embedding(num_unique_movies, 1)

        self.min_max = layers.Lambda(lambda x: x * (self.max_rating - self.min_rating) + self.min_rating)

    def call(self, inputs):
        users_emb = self.users_embedding(inputs[0])
        users_bias = self.users_bias(inputs[0])
        movies_emb = self.movies_embedding(inputs[1])
        movies_bias = self.movies_bias(inputs[1])
        dot_product_users_movies = tf.tensordot(users_emb, tf.transpose(movies_emb), 2)
        x = dot_product_users_movies + users_bias + movies_bias
        x = tf.nn.sigmoid(x)
        x = self.min_max(x)
        return x

In [19]:
model = NeuralCollaborativeFiltering(num_unique_users=len(set(list(df_train.userId.unique()) + list(df_val.userId.unique()))),
                                    num_unique_movies=len(set(list(df_train.movieId.unique()) + list(df_val.movieId.unique()))),
                                    embedding_size=EMBEDDING_SIZE, min_rating=min(df_train.rating.min(), df_val.rating.min()),
                                    max_rating=max(df_train.rating.max(), df_val.rating.max()))

In [20]:
model.num_unique_users

162541

In [21]:
model([df_train.userId.values, df_train.movieId.values])

<tf.Tensor: shape=(20000076, 1), dtype=float32, numpy=
array([[1.8986906],
       [1.8920974],
       [1.966909 ],
       ...,
       [1.9040284],
       [1.9361792],
       [1.8717647]], dtype=float32)>

In [22]:
model.compile(
    loss=tf.keras.losses.MeanSquaredError(), optimizer=keras.optimizers.Adam(lr=0.005)
)

In [23]:
model.load_weights("../model/neural_collab_filtering.h5")

In [27]:
pd.DataFrame(model([df_train.userId.values, df_train.movieId.values]).numpy()).describe()

Unnamed: 0,0
count,20000080.0
mean,0.4194288
std,0.08095416
min,0.5
25%,0.5
50%,0.5
75%,0.5
max,0.5


In [29]:
model.evaluate(x=[df_val.userId.values, df_val.movieId.values], y=df_val.rating.values)



KeyboardInterrupt: ignored