In [3]:
from typing import Dict, Text

import numpy as np
import tensorflow as tf

import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs
import pandas as pd

In [4]:
# Import ratings and movies data
ratings = pd.read_csv("data/ratings_200k.csv")
movies = pd.read_csv('data/movies_2000.csv')

# Get movie title to ratings DF
ratings = pd.merge(ratings, movies[['movieId', 'title']], on='movieId')[['userId', 'title', 'rating']]

# Convert value to byte
ratings = ratings.apply(lambda x: x.apply(lambda y: str(y).encode()))

# Convert DF to TF dataset
ratings = tf.data.Dataset.from_tensor_slices(ratings)

# Map so each row can be called by its name
ratings = ratings.map(lambda x: {
    'userId': x[0],
    'title': x[1],
    'rating': float(b'2.5')})

In [5]:
# Shuffle the data
shuffled = ratings.shuffle(200000, reshuffle_each_iteration=False)

# Split train test
train = shuffled.take(160000)
test = shuffled.skip(160000).take(40000)

# Get unique movie title and user id lists to be used as vocabulary
movie_titles = ratings.batch(1000000).map(lambda x: x['title'])
user_ids = ratings.batch(1000000).map(lambda x: x['title'])

uniq_movie_title = np.unique(np.concatenate(list(movie_titles)))
uniq_user_id = np.unique(np.concatenate(list(user_ids)))

In [6]:
# Define the ranking model
class RankingModel(tf.keras.Model):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        embedding_dims = 32

        # User embedding model        
        self.movie_embedding = tf.keras.models.Sequential([
            tf.keras.layers.StringLookup(vocabulary=uniq_movie_title),
            tf.keras.layers.Embedding(len(uniq_movie_title) + 1, embedding_dims)
        ])

        # Movie embedding model
        self.user_embedding = tf.keras.models.Sequential([
            tf.keras.layers.StringLookup(vocabulary=uniq_user_id),
            tf.keras.layers.Embedding(len(uniq_user_id) + 1, embedding_dims)
        ])

        # Rating prediction model
        self.rating = tf.keras.models.Sequential([
            tf.keras.layers.Dense(256, 'relu'),
            tf.keras.layers.Dense(64, 'relu'),
            tf.keras.layers.Dense(1) # This will output the predicted rating
        ])

    def call(self, inputs):
        user_id, movie_title = inputs

        user_embedding = self.user_embedding(user_id)
        movie_embedding = self.movie_embedding(movie_title)

        return self.rating(tf.concat([user_embedding, movie_embedding], axis=1)) 


In [7]:
# Define movie recommender model
class MovieRecModel(tfrs.Model):
    def __init__(self):
        super().__init__()
        self.ranking_model: tf.keras.Model = RankingModel()
        self.task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
            loss=tf.keras.losses.MeanSquaredError(),
            metrics=[tf.keras.metrics.RootMeanSquaredError()]
        )

    def call(self, inputs: Dict[Text, tf.Tensor]):
        return self.ranking_model((inputs['userId'], inputs['title']))
        

    def compute_loss(self, inputs: Dict[Text, tf.Tensor], training: bool = False) -> tf.Tensor:
        rating_prediction = self.ranking_model((inputs['userId'], inputs['title']))

        return self.task(labels=inputs['rating'], predictions=rating_prediction)

In [8]:
# Compile the model
model = MovieRecModel()
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

# Cache the dataset
cached_train = train.batch(10000).cache()
cached_test = test.batch(5000).cache()

# Fit the model to train dataset
model.fit(cached_train, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x1de01fa8a00>

In [9]:
# Evaluate model
model.evaluate(cached_test, return_dict=True)



{'root_mean_squared_error': 0.008466300554573536,
 'loss': 7.085403194651008e-05,
 'regularization_loss': 0,
 'total_loss': 7.085403194651008e-05}

In [10]:
# Inference testing those movies for user 20
rating_result = {}
movies_to_test = ['Toy Story (1995)', 'Jumanji (1995)', 'Star Wars: The Last Jedi (2017)']
for title in movies_to_test:
    rating_result[title] =\
        model({'userId': np.array(['50']),
               'title': np.array([title])})

for x in rating_result.values():
    print(x.numpy()[0])

[2.4946513]
[2.5149736]
[2.4869032]
