# Imports

In [2]:
from typing import Dict, Text

import numpy as np
import tensorflow as tf

import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


# Datasets

In [3]:
# Read the dataset from tfds
ratings = tfds.load('movielens/latest-small-ratings', split='train')
movies = tfds.load('movielens/latest-small-movies', split='train')

# Select the needed feature
# x['features'] can be done because the dataset are in form of FeaturesDict, check the docs
ratings = ratings.map(lambda x: {
    'user_id': x['user_id'],        
    'movie_title': x['movie_title']
    })
movies = movies.map(lambda x: x['movie_title'])

# Randomize the dataset
tf.random.set_seed(20)
shuffled = ratings.shuffle(100000, seed=20, reshuffle_each_iteration=False)

train = shuffled.take(80000)
test = shuffled.skip(80000).take(20000)

If your dataset are from csv, you can convert that to TF dataset. Below is an example of its implementation.

``` python
# Read the dataset csv file
ratings = pd.read_csv("data/ratings_200k.csv")
movies = pd.read_csv("data/movies_2000.csv")

# Add movie title to ratings dataset
ratings = pd.merge(ratings, movies[['title', 'movieId']], on='movieId')[['userId', 'title']]
ratings['userId'] = ratings['userId'].apply(lambda x: str(x).encode())

# Convert dataframe to tf dataset
ratings = tf.data.Dataset.from_tensor_slices(ratings)
movies = tf.data.Dataset.from_tensor_slices(movies['title'])

# Convert ratings to dict type
ratings = ratings.map(lambda x: {'userId': x[0], 'title': x[1]})
```

In [4]:
# Get the unique user id and movie title
# Alternatively, you can skip this step by passing user ids and movie titles to StringLookup().adapt() but it will be slower
movie_titles = movies.batch(1000)
user_ids = ratings.batch(1000000).map(lambda x: x["user_id"])

uniq_movie_titles = np.unique(np.concatenate(list(movie_titles)))
uniq_user_ids = np.unique(np.concatenate(list(user_ids)))

In [5]:
# Convert the user id and movie title to int representation using StringLookup
# These representation will be used in embedding
movie_titles_vocabulary = tf.keras.layers.StringLookup(vocabulary=uniq_movie_titles, mask_token=None)
user_ids_vocabulary = tf.keras.layers.StringLookup(vocabulary=uniq_user_ids, mask_token=None)

# Models

In [6]:
# Create a class based on tfrs.Model
class MovieRecModel(tfrs.Model):
  def __init__(
      self,
      user_model: tf.keras.Model, # Query embedding tower as input
      movie_model: tf.keras.Model, # Candidate embedding tower as input
      task: tfrs.tasks.Retrieval):
    super().__init__()

    # Assign the query/user and candidate/movie embedding tower to self
    self.user_model = user_model
    self.movie_model = movie_model

    # Assign the retrieval task.
    self.task = task

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    # Define how the loss is computed.
    user_embeddings = self.user_model(features["user_id"])
    movie_embeddings = self.movie_model(features["movie_title"])

    return self.task(user_embeddings, movie_embeddings)

In [7]:
# Define user and movie models.
user_model = tf.keras.Sequential([
    user_ids_vocabulary,
    tf.keras.layers.Embedding(user_ids_vocabulary.vocabulary_size(), 64)
])
movie_model = tf.keras.Sequential([
    movie_titles_vocabulary,
    tf.keras.layers.Embedding(movie_titles_vocabulary.vocabulary_size(), 64)
])

# Define your objectives.
task = tfrs.tasks.Retrieval(metrics=tfrs.metrics.FactorizedTopK(
    movies.batch(128).map(movie_model)
  )
)

In [8]:
# Create a retrieval model.
model = MovieRecModel(user_model, movie_model, task)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.5))

# Cache the dataset
cached_train = train.shuffle(100_000).batch(8192).cache()
cached_test = test.batch(4096).cache()

# Train for 3 epochs.
model.fit(ratings.batch(4096), epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x234c45f9490>

In [9]:
model.evaluate(cached_test, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.0037499999161809683,
 'factorized_top_k/top_5_categorical_accuracy': 0.026900000870227814,
 'factorized_top_k/top_10_categorical_accuracy': 0.0580499991774559,
 'factorized_top_k/top_50_categorical_accuracy': 0.24124999344348907,
 'factorized_top_k/top_100_categorical_accuracy': 0.36934998631477356,
 'loss': 23552.390625,
 'regularization_loss': 0,
 'total_loss': 23552.390625}

In [12]:
# Use brute-force search to set up retrieval using the trained representations
# This index is used for brute-force retrieval based on user representations
# The model.user_model refers to the trained model that provides the user representations.
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)

# .index_from_dataset takes candidates as input which in a form of (movie title, movie title embedding) pairs
# The embedding can be obtained using model.movie_model
index.index_from_dataset(movies.batch(100).map(lambda movie_title: (movie_title, model.movie_model(movie_title))))

<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x234c44b6850>

In [29]:
# Get 10 recommendations as the result of retrieval stage
scores, retrieved_titles = index(np.array(["20"]), k=10)

# Print the recommendations
print(f"Top 10 recommendations for user 20:")
for movie in retrieved_titles[0]:
    print(f"- {movie.numpy().decode('utf-8')}")

Top 10 recommendations for user 20:
- Shiloh (1997)
- Soft Fruit (1999)
- Adanggaman (2000)
- Black Stallion, The (1979)
- Endurance: Shackleton's Legendary Antarctic Expedition, The (2000)
- Kiss Me Kate (1953)
- Jimmy Neutron: Boy Genius (2001)
- Trail of the Pink Panther (1982)
- Unfaithful (2002)
- Dennis the Menace (1993)


In [32]:
# Get 1000 recommendations as the result of retrieval stage
scores, retrieved_titles_100 = index(np.array(["20"]), k=100)

# Save the recommendations
pd.DataFrame(retrieved_titles_100[0].numpy()).applymap(lambda x: x.decode('utf-8')).to_csv("retrieved_titles.csv", index=False)

In [14]:
import os

path = os.path.join(os.curdir, "retrieval_model")

# Save the model query model
tf.saved_model.save(index, path)









INFO:tensorflow:Assets written to: .\retrieval_model\assets


INFO:tensorflow:Assets written to: .\retrieval_model\assets
