<a href="https://colab.research.google.com/github/anandmali/Movies_Recommenders/blob/main/retrieval_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### Imports

In [None]:
# Install the notebook's third-party dependencies quietly (-q).
!pip install -q tensorflow-recommenders
# --upgrade replaces whatever tensorflow-datasets version the runtime ships with.
!pip install -q --upgrade tensorflow-datasets
# NOTE(review): scann is installed here but not imported in the cells shown.
!pip install -q scann

In [2]:
import os
import pprint
import tempfile

from typing import Dict, Text

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

In [3]:
import tensorflow_recommenders as tfrs

#### Preparing data

#### movielens/100k-ratings
This dataset contains 100,000 ratings from 943 users on 1,682 movies. This dataset is the oldest version of the MovieLens dataset.

In [None]:
# Load the "train" split of the MovieLens 100k ratings as a tf.data.Dataset.
ratings = tfds.load(name="movielens/100k-ratings", split="train")

#### movielens/100k-movies
This dataset describes the 1,682 movies rated in the 100k dataset. Each example is a dict with the movie's id, title, and genres.

In [28]:
# Load the "train" split of the MovieLens 100k movie catalogue.
movies = tfds.load(name="movielens/100k-movies", split="train")

In [29]:
# Keep only the two rating features used by the towers: title and user id.
ratings = ratings.map(
    lambda rating: {
        "movie_title": rating["movie_title"],
        "user_id": rating["user_id"],
    }
)

# Reduce each movie record to its title.
movies = movies.map(lambda movie: movie["movie_title"])

# Preview the first ten ratings, then the first ten movie titles.
for rating_example in ratings.take(10).as_numpy_iterator():
  pprint.pprint(rating_example)

for title_example in movies.take(10).as_numpy_iterator():
  pprint.pprint(title_example)


{'movie_title': b"One Flew Over the Cuckoo's Nest (1975)", 'user_id': b'138'}
{'movie_title': b'Strictly Ballroom (1992)', 'user_id': b'92'}
{'movie_title': b'Very Brady Sequel, A (1996)', 'user_id': b'301'}
{'movie_title': b'Pulp Fiction (1994)', 'user_id': b'60'}
{'movie_title': b'Scream 2 (1997)', 'user_id': b'197'}
{'movie_title': b'Crash (1996)', 'user_id': b'601'}
{'movie_title': b'Aladdin (1992)', 'user_id': b'710'}
{'movie_title': b'True Romance (1993)', 'user_id': b'833'}
{'movie_title': b'Bob Roberts (1992)', 'user_id': b'916'}
{'movie_title': b'Starship Troopers (1997)', 'user_id': b'940'}
b'You So Crazy (1994)'
b'Love Is All There Is (1996)'
b'Fly Away Home (1996)'
b'In the Line of Duty 2 (1987)'
b'Niagara, Niagara (1997)'
b"Young Poisoner's Handbook, The (1995)"
b'Age of Innocence, The (1993)'
b'Flirt (1995)'
b'Frisk (1995)'
b'unknown'


#### Split data for training and evaluation

To fit and evaluate the model, we need to split the data into a training set and an evaluation set. In an industrial recommender system, this would most likely be done by time: the data up to time $T$ would be used to predict interactions after $T$.

In this simple example, however, let's use a random split, putting 80% of the ratings in the train set, and 20% in the test set.

In [30]:
# Fix the global seed and the shuffle seed so the split is reproducible.
tf.random.set_seed(42)
shuffled = ratings.shuffle(
    buffer_size=100_000,
    seed=42,
    # Keep one fixed order so take/skip below select disjoint examples.
    reshuffle_each_iteration=False,
)

# First 80,000 shuffled ratings train the model; the next 20,000 evaluate it.
train = shuffled.take(count=80_000)
test = shuffled.skip(count=80_000).take(count=20_000)

Let's also figure out unique user ids and movie titles present in the data.

This is important because we need to be able to map the raw values of our categorical features to embedding vectors in our models. To do that, we need a vocabulary that maps a raw feature value to an integer in a contiguous range: this allows us to look up the corresponding embeddings in our embedding tables.

These vocabularies are what let the towers look up the embedding associated with each index, for example a user id in the query tower.

In [37]:
# Batch both datasets so the raw values can be pulled out in large chunks.
movie_titles = movies.batch(1_000)
user_ids = ratings.batch(1_000_000).map(lambda rating: rating["user_id"])

# Flatten the batches and reduce them to sorted arrays of distinct values.
unique_movie_titles = np.unique(
    np.concatenate(list(movie_titles.as_numpy_iterator())))
unique_user_ids = np.unique(
    np.concatenate(list(user_ids.as_numpy_iterator())))

# Show the first ten entries of each vocabulary.
t = unique_movie_titles[:10]
i = unique_user_ids[:10]
pprint.pprint(t)
pprint.pprint(i)


array([b"'Til There Was You (1997)", b'1-900 (1994)',
       b'101 Dalmatians (1996)', b'12 Angry Men (1957)', b'187 (1997)',
       b'2 Days in the Valley (1996)',
       b'20,000 Leagues Under the Sea (1954)',
       b'2001: A Space Odyssey (1968)',
       b'3 Ninjas: High Noon At Mega Mountain (1998)',
       b'39 Steps, The (1935)'], dtype=object)
array([b'1', b'10', b'100', b'101', b'102', b'103', b'104', b'105',
       b'106', b'107'], dtype=object)


#### Two tower model

Define the embedding dimension used by both towers.

In [38]:
# Width of the embedding vectors; the user and movie Embedding layers both use
# this value as their output size.
embedding_dimension = 32

#### User embedding tower (Query tower)

In [39]:
# Query tower: raw user id -> vocabulary index -> embedding vector.
user_model = tf.keras.Sequential()
user_model.add(
    tf.keras.layers.StringLookup(vocabulary=unique_user_ids, mask_token=None))
# One extra embedding row is reserved for ids that are not in the vocabulary.
user_model.add(
    tf.keras.layers.Embedding(
        input_dim=len(unique_user_ids) + 1, output_dim=embedding_dimension))

#### Movies features tower (Candidate tower)

In [40]:
# Candidate tower: raw movie title -> vocabulary index -> embedding vector.
movie_model = tf.keras.Sequential(
    layers=[
        tf.keras.layers.StringLookup(
            mask_token=None,
            vocabulary=unique_movie_titles,
        ),
        # One extra embedding row is reserved for titles outside the vocabulary.
        tf.keras.layers.Embedding(
            input_dim=len(unique_movie_titles) + 1,
            output_dim=embedding_dimension,
        ),
    ]
)

#### Metrics
In our training data we have positive (user, movie) pairs. To figure out how good our model is, we need to compare the affinity score that the model calculates for this pair to the scores of all the other possible candidates: if the score for the positive pair is higher than for all other candidates, our model is highly accurate.

In [41]:
# Embed every movie title with the candidate tower, 128 titles per batch; the
# top-K metric ranks each positive pair against these candidates.
candidate_embeddings = movies.batch(128).map(movie_model)
metrics = tfrs.metrics.FactorizedTopK(candidates=candidate_embeddings)

#### Loss
The next component is the loss used to train our model. TFRS has several loss layers and tasks to make this easy.

In [42]:
# Retrieval task: bundles the training loss with the top-K metrics defined above.
task = tfrs.tasks.Retrieval(metrics=metrics)

#### Full model

In [44]:
class MovielensModel(tfrs.Model):
  """Two-tower retrieval model: a user (query) tower and a movie (candidate) tower.

  The towers turn `features["user_id"]` and `features["movie_title"]` into
  embeddings, and the task layer turns those embeddings into a loss.
  """

  def __init__(self, user_model, movie_model, retrieval_task=None):
    """Stores the two towers and the task used to compute the loss.

    Args:
      user_model: Keras model applied to `features["user_id"]`.
      movie_model: Keras model applied to `features["movie_title"]`.
      retrieval_task: Optional task layer (e.g. `tfrs.tasks.Retrieval`) that
        accepts `compute_metrics`. When omitted, the notebook-level `task`
        variable is used, which keeps the original two-argument call working.
    """
    super().__init__()
    self.movie_model: tf.keras.Model = movie_model
    self.user_model: tf.keras.Model = user_model
    # Prefer an explicitly injected task; only fall back to the global `task`
    # so the class no longer has to depend on hidden notebook state.
    self.task: tf.keras.layers.Layer = (
        task if retrieval_task is None else retrieval_task)

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Computes the retrieval loss for one batch of (user, movie) pairs.

    Args:
      features: Dict holding at least the "user_id" and "movie_title" tensors.
      training: True when called from the training loop. The top-K metrics are
        skipped in that case and only computed during evaluation.

    Returns:
      The loss tensor produced by the task.
    """
    # We pick out the user features and pass them into the user model.
    user_embeddings = self.user_model(features["user_id"])
    # And pick out the movie features and pass them into the movie model,
    # getting embeddings back.
    positive_movie_embeddings = self.movie_model(features["movie_title"])

    # The task computes the loss. Scoring every candidate for the top-K
    # metrics is expensive, so it is only done when not training.
    return self.task(
        user_embeddings,
        positive_movie_embeddings,
        compute_metrics=not training,
    )

#### Fitting and evaluating

In [45]:
# Build the two-tower model and attach an Adagrad optimizer.
model = MovielensModel(user_model, movie_model)
adagrad = tf.keras.optimizers.Adagrad(learning_rate=0.1)
model.compile(optimizer=adagrad)

# Shuffle and batch the training data, batch the test data, and cache both.
cached_train = train.shuffle(buffer_size=100_000).batch(batch_size=8192).cache()
cached_test = test.batch(batch_size=4096).cache()

# Train for three epochs.
model.fit(x=cached_train, epochs=3)

# Evaluate on the held-out ratings; the returned dict is the cell's output.
model.evaluate(x=cached_test, return_dict=True)

Epoch 1/3
Epoch 2/3
Epoch 3/3


{'factorized_top_k/top_1_categorical_accuracy': 0.000750000006519258,
 'factorized_top_k/top_5_categorical_accuracy': 0.009200000204145908,
 'factorized_top_k/top_10_categorical_accuracy': 0.02160000056028366,
 'factorized_top_k/top_50_categorical_accuracy': 0.12399999797344208,
 'factorized_top_k/top_100_categorical_accuracy': 0.23420000076293945,
 'loss': 28237.890625,
 'regularization_loss': 0,
 'total_loss': 28237.890625}

#### Recommending movie title

In [64]:
# Brute-force top-K index that embeds incoming queries with the user tower.
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)

# Pair each batch of 100 titles with its embeddings so the index can return
# titles as identifiers.
title_batches = movies.batch(100)
index.index_from_dataset(
  tf.data.Dataset.zip((title_batches, title_batches.map(model.movie_model)))
)

# Query the index for one user id and print the first ten returned titles.
_scores, titles = index(np.array(["54"]))
top_titles = titles[0, :10]
print(f"Top 10 recommended movies for user 54: \n{top_titles}")


Top 10 recommended movies for user 54: 
[b'Fled (1996)' b'Daylight (1996)' b'Private Parts (1997)'
 b'Turbulence (1997)' b'Arrival, The (1996)' b'Fifth Element, The (1997)'
 b'Fear (1996)' b'Con Air (1997)' b'Breakdown (1997)'
 b"Don't Be a Menace to South Central While Drinking Your Juice in the Hood (1996)"]
