In [1]:
from typing import Dict, Text

import numpy as np
import tensorflow as tf

import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the raw ratings and the movie catalogue from CSV.
# The DataFrames keep their own names so that `ratings` / `movies` only ever
# refer to tf.data datasets.
ratings_df = pd.read_csv("data/ratings_200k.csv")
movies_df = pd.read_csv("data/movies_2000.csv")

# Attach the movie title to every rating through movieId (pd.merge defaults to
# an inner join) and keep only the two columns the model consumes.
interactions_df = pd.merge(
    ratings_df, movies_df[['title', 'movieId']], on='movieId'
)[['userId', 'title']]

# Build the ratings dataset from named columns instead of converting the whole
# frame and picking columns by position. Each element is a dict
# {'userId': <string scalar>, 'title': <string scalar>}.
# User ids are stringified because they are later fed to a StringLookup layer.
ratings = tf.data.Dataset.from_tensor_slices({
    'userId': interactions_df['userId'].astype(str).to_numpy(),
    'title': interactions_df['title'].to_numpy(),
})

# Candidate dataset: one movie title per element.
movies = tf.data.Dataset.from_tensor_slices(movies_df['title'])

In [6]:
# StringLookup layer that turns raw user-id strings into integer indices;
# those indices are what the embedding layer of the user tower consumes.
# mask_token=None is passed so no mask token is configured on the layer.
user_id_stream = ratings.map(lambda rating: rating["userId"])
user_ids_vocabulary = tf.keras.layers.StringLookup(mask_token=None)
# Learn the vocabulary from every user id that occurs in the ratings dataset.
user_ids_vocabulary.adapt(user_id_stream)

In [7]:
# StringLookup layer that turns movie-title strings into integer indices;
# those indices are what the embedding layer of the movie tower consumes.
# The vocabulary is learned from the `movies` dataset of titles.
movie_titles_vocabulary = tf.keras.layers.StringLookup(
    mask_token=None,
)
movie_titles_vocabulary.adapt(movies)

In [11]:
class MovieLensModel(tfrs.Model):
  """Two-tower retrieval model over (userId, title) pairs.

  Subclasses `tfrs.Model`, a convenience base class that reduces training
  boilerplate; instances are still ordinary Keras models.
  """

  def __init__(
      self,
      user_model: tf.keras.Model,
      movie_model: tf.keras.Model,
      task: tfrs.tasks.Retrieval):
    """Keep references to both towers and to the retrieval task.

    Args:
      user_model: Query tower; called on the "userId" feature.
      movie_model: Candidate tower; called on the "title" feature.
      task: Retrieval task that maps the two embeddings to a loss.
    """
    super().__init__()

    # Query / candidate representations.
    self.user_model = user_model
    self.movie_model = movie_model

    # Objective combining the two representations.
    self.task = task

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Return the task loss for one batch of features.

    Args:
      features: Mapping that must contain the "userId" and "title" entries.
      training: Accepted for interface compatibility; not used here.

    Returns:
      Whatever `self.task` returns for the two embedding batches.
    """
    query_vectors = self.user_model(features["userId"])
    candidate_vectors = self.movie_model(features["title"])

    loss = self.task(query_vectors, candidate_vectors)
    return loss

In [12]:
def _make_tower(lookup):
  """Stack a lookup layer and a 64-wide embedding sized to its vocabulary."""
  return tf.keras.Sequential([
      lookup,
      tf.keras.layers.Embedding(lookup.vocabulary_size(), 64),
  ])


# Query tower (users) and candidate tower (movies), built the same way.
user_model = _make_tower(user_ids_vocabulary)
movie_model = _make_tower(movie_titles_vocabulary)

# Objective: a retrieval task whose FactorizedTopK metric is given the movie
# titles, batched by 128 and mapped through the movie tower, as candidates.
candidate_embeddings = movies.batch(128).map(movie_model)
task = tfrs.tasks.Retrieval(
    metrics=tfrs.metrics.FactorizedTopK(candidate_embeddings)
)

In [15]:
# Training / indexing knobs, named so the comments below cannot drift from the
# values actually used.
NUM_EPOCHS = 10
TRAIN_BATCH_SIZE = 4096
INDEX_BATCH_SIZE = 100

# Create a retrieval model.
model = MovieLensModel(user_model, movie_model, task)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.5))

# Train for NUM_EPOCHS (10) epochs on batches of TRAIN_BATCH_SIZE ratings.
model.fit(ratings.batch(TRAIN_BATCH_SIZE), epochs=NUM_EPOCHS)

# Use brute-force search to set up retrieval using the trained representations:
# queries go through the user tower, and each candidate is indexed as a
# (title, movie-tower embedding) pair.
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
index.index_from_dataset(
    movies.batch(INDEX_BATCH_SIZE).map(lambda title: (title, model.movie_model(title))))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x1e601e88ca0>

In [20]:
# Retrieval stage: ask the index for the 100 best candidates for one user.
# The user id is a string because the user tower starts with a StringLookup.
user_id = "20"

# The first return value is not needed here. It is bound to a named throwaway
# rather than `_`, which IPython uses for the previous cell output.
_unused_scores, retrieved_titles = index(np.array([user_id]), k=100)
print(f"Top 3 recommendations for user {user_id}: {retrieved_titles[0, :3]}")

Top 3 recommendations for user 20: [b'Star Wars: Episode VI - Return of the Jedi (1983)'
 b"Schindler's List (1993)" b'Taxi Driver (1976)']


In [21]:
# Display the full tensor of retrieved titles returned by the index query.
retrieved_titles

<tf.Tensor: shape=(1, 100), dtype=string, numpy=
array([[b'Star Wars: Episode VI - Return of the Jedi (1983)',
        b"Schindler's List (1993)", b'Taxi Driver (1976)',
        b'Manhattan Murder Mystery (1993)',
        b'American in Paris, An (1951)', b'Sin City (2005)',
        b'Rogue One: A Star Wars Story (2016)',
        b'Kill Bill: Vol. 1 (2003)', b'Vertigo (1958)',
        b'Lord of the Rings: The Two Towers, The (2002)',
        b'Name of the Rose, The (Name der Rose, Der) (1986)',
        b'WALL\xc2\xb7E (2008)', b'Road, The (2009)', b'Fallen (1998)',
        b'Jewel of the Nile, The (1985)', b'African Queen, The (1951)',
        b'Avengers: Age of Ultron (2015)', b'Michael Clayton (2007)',
        b'Bound (1996)', b'This Is the End (2013)',
        b'Glengarry Glen Ross (1992)',
        b'Grand Budapest Hotel, The (2014)', b'Rear Window (1954)',
        b'Double Indemnity (1944)', b'Gone Girl (2014)', b'Hugo (2011)',
        b'Easy Rider (1969)', b'Teenage Mutant Ninja Tu