# Movie Recommendation System: Ranking

Real-world recommender systems are often composed of two stages:

1. The retrieval stage is responsible for selecting an initial set of hundreds of candidates from all possible candidates. The main objective of this model is to efficiently weed out all candidates that the user is not interested in. Because the retrieval model may be dealing with millions of candidates, it has to be computationally efficient.
2. The ranking stage takes the outputs of the retrieval model and fine-tunes them to select the best possible handful of recommendations. Its task is to narrow down the set of items the user may be interested in to a shortlist of likely candidates.

In this notebook, we're going to focus on the second stage, ranking.

## Install libraries

In [1]:
!pip install -q tensorflow-recommenders

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/96.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.2/96.2 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h

## Import libraries

In [15]:
import os
import pprint
import tempfile

from typing import Dict, Text

import pandas as pd
import numpy as np


import tensorflow as tf
import tensorflow_recommenders as tfrs

## Constants

In [46]:
data_Path = 'Recommended_Systems_ Movie/ml-100k'


## Reading Data

In [17]:
user_info_path = os.path.join(data_Path, 'u.info')

user_info = pd.read_csv(user_info_path, header=None)
user_info.head()

Unnamed: 0,0
0,943 users
1,1682 items
2,100000 ratings


### Users Data:

In [18]:
user_data_path = os.path.join(data_Path, 'u.data')

#Reading users file:
column_names = ['user_id', 'item_id', 'rating', 'timestamp']
users_df = pd.read_csv(user_data_path, sep='\t', names=column_names)
# Checking shape of users files and head
print(users_df.shape)
users_df.head()

(100000, 4)


Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


### Movies Data:

In [19]:
columns = "item_id | title | release date | video release date | "\
             "IMDb URL | unknown | Action | Adventure | Animation | Children's | "\
              "Comedy | Crime | Documentary | Drama | Fantasy | Film-Noir | "\
               "Horror | Musical | Mystery | Romance | Sci-Fi | Thriller | War | Western | "

column_names_2 = columns.split(' | ')
movie_data_path = os.path.join(data_Path, 'u.item')

# Reading the movie data
movie_df = pd.read_csv(movie_data_path, sep='|', header=None, names=column_names_2, encoding='latin-1')
movie_df.drop(movie_df.columns[-1], axis=1, inplace=True)
# Checking shape of movie data and look first 5 rows
print(movie_df.shape)
movie_df.head()

(1682, 24)


Unnamed: 0,item_id,title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [20]:
movie_df = movie_df[['item_id', 'title']]
movie_df.head()

Unnamed: 0,item_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


### Merger Movie and User data

In [21]:
# Combining the data on same column
df= pd.merge(users_df, movie_df, on= 'item_id')
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


In [22]:
df.shape

(100000, 5)

In [23]:
refined_df = df.groupby(by=['user_id','title'], as_index=False).agg({"rating":"mean"})

refined_df.head()

Unnamed: 0,user_id,title,rating
0,1,101 Dalmatians (1996),2.0
1,1,12 Angry Men (1957),5.0
2,1,"20,000 Leagues Under the Sea (1954)",3.0
3,1,2001: A Space Odyssey (1968),4.0
4,1,"Abyss, The (1989)",3.0


In [24]:
refined_df['user_id'] = refined_df['user_id'].values.astype(str)
refined_df = tf.data.Dataset.from_tensor_slices(dict(refined_df))

In [26]:
ratings = refined_df.map(lambda x: {
    "movie_title": x["title"],
    "user_id": x["user_id"],
    "user_rating": x["rating"]
})

In [27]:
tf.random.set_seed(42)
shuffled = ratings.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

Let's also figure out unique user ids and movie titles present in the data.

In [28]:
movie_titles = ratings.batch(1_000_000).map(lambda x: x["movie_title"])
user_ids = ratings.batch(1_000_000).map(lambda x: x["user_id"])

unique_movie_titles = np.unique(np.concatenate(list(movie_titles)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

unique_movie_titles[:10]

array([b"'Til There Was You (1997)", b'1-900 (1994)',
       b'101 Dalmatians (1996)', b'12 Angry Men (1957)', b'187 (1997)',
       b'2 Days in the Valley (1996)',
       b'20,000 Leagues Under the Sea (1954)',
       b'2001: A Space Odyssey (1968)',
       b'3 Ninjas: High Noon At Mega Mountain (1998)',
       b'39 Steps, The (1935)'], dtype=object)

## Implementing a model

### Architecture

In [29]:
class RankingModel(tf.keras.Model):

  def __init__(self):
    super().__init__()
    embedding_dimension = 32

    # Compute embeddings for users.
    self.user_embeddings = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_user_ids, mask_token=None),
      tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
    ])

    # Compute embeddings for movies.
    self.movie_embeddings = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_movie_titles, mask_token=None),
      tf.keras.layers.Embedding(len(unique_movie_titles) + 1, embedding_dimension)
    ])

    # Compute predictions.
    self.ratings = tf.keras.Sequential([
      # Learn multiple dense layers.
      tf.keras.layers.Dense(256, activation="relu"),
      tf.keras.layers.Dense(64, activation="relu"),
      # Make rating predictions in the final layer.
      tf.keras.layers.Dense(1)
  ])

  def call(self, inputs):

    user_id, movie_title = inputs

    user_embedding = self.user_embeddings(user_id)
    movie_embedding = self.movie_embeddings(movie_title)

    return self.ratings(tf.concat([user_embedding, movie_embedding], axis=1))

This model takes user ids and movie titles, and outputs a predicted rating:

In [31]:
RankingModel()((["90"], ["One Flew Over the Cuckoo's Nest (1975)"]))

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[-0.01062707]], dtype=float32)>

### Loss and metrics

In [32]:
task = tfrs.tasks.Ranking(
  loss = tf.keras.losses.MeanSquaredError(),
  metrics=[tf.keras.metrics.RootMeanSquaredError()]
)

### The full model

In [33]:
class MovielensModel(tfrs.models.Model):

  def __init__(self):
    super().__init__()
    self.ranking_model: tf.keras.Model = RankingModel()
    self.task: tf.keras.layers.Layer = task

  def call(self, features: Dict[str, tf.Tensor]) -> tf.Tensor:
    return self.ranking_model(
        (features["user_id"], features["movie_title"]))

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    labels = features.pop("user_rating")

    rating_predictions = self(features)

    # The task computes the loss and the metrics.
    return self.task(labels=labels, predictions=rating_predictions)

### Fitting and evaluating

In [34]:
model = MovielensModel()
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

In [35]:
AUTOTUNE = tf.data.AUTOTUNE

cached_train = train.shuffle(100_000).batch(8192).cache().prefetch(buffer_size=AUTOTUNE)
cached_test = test.batch(4096).cache().prefetch(buffer_size=AUTOTUNE)

In [36]:
model.fit(cached_train, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7edc00f9c6d0>

In [37]:
model.evaluate(cached_test, return_dict=True)



{'root_mean_squared_error': 1.1100149154663086,
 'loss': 1.2559467554092407,
 'regularization_loss': 0,
 'total_loss': 1.2559467554092407}

## Testing the ranking model

In [43]:
test_ratings = {}
test_movie_titles = ["2 Days in the Valley (1996)", "2001: A Space Odyssey (1968)",
                     "12 Angry Men (1957)"]
for movie_title in test_movie_titles:
  test_ratings[movie_title] = model({
      "user_id": np.array(["90"]),
      "movie_title": np.array([movie_title])
  })

print("Ratings:")
for title, score in sorted(test_ratings.items(), key=lambda x: x[1], reverse=True):
  print(f"{title}: {score[0][0]:0.2f}")

Ratings:
2001: A Space Odyssey (1968): 3.74
12 Angry Men (1957): 3.73
2 Days in the Valley (1996): 3.65


## Exporting for serving

In [44]:
tf.saved_model.save(model, "export")

In [45]:
loaded = tf.saved_model.load("export")

loaded({"user_id": np.array(["90"]), "movie_title": ["12 Angry Men (1957)"]}).numpy()

array([[3.7342825]], dtype=float32)