In [1]:
import os
from pprint import pprint
import tempfile

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

import tensorflow_recommenders as tfrs

# Training

In [2]:
full_ratings = tfds.load("movie_lens/100k-ratings", split="train")
full_movies = tfds.load("movie_lens/100k-movies", split="train")

In [3]:
print(len(full_ratings))
pprint(next(iter(full_ratings)))

100000
{'bucketized_user_age': <tf.Tensor: shape=(), dtype=float32, numpy=45.0>,
 'movie_genres': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([7])>,
 'movie_id': <tf.Tensor: shape=(), dtype=string, numpy=b'357'>,
 'movie_title': <tf.Tensor: shape=(), dtype=string, numpy=b"One Flew Over the Cuckoo's Nest (1975)">,
 'raw_user_age': <tf.Tensor: shape=(), dtype=float32, numpy=46.0>,
 'timestamp': <tf.Tensor: shape=(), dtype=int64, numpy=879024327>,
 'user_gender': <tf.Tensor: shape=(), dtype=bool, numpy=True>,
 'user_id': <tf.Tensor: shape=(), dtype=string, numpy=b'138'>,
 'user_occupation_label': <tf.Tensor: shape=(), dtype=int64, numpy=4>,
 'user_occupation_text': <tf.Tensor: shape=(), dtype=string, numpy=b'doctor'>,
 'user_rating': <tf.Tensor: shape=(), dtype=float32, numpy=4.0>,
 'user_zip_code': <tf.Tensor: shape=(), dtype=string, numpy=b'53211'>}


In [4]:
print(len(full_movies))
pprint(next(iter(full_movies)))

1682
{'movie_genres': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([4])>,
 'movie_id': <tf.Tensor: shape=(), dtype=string, numpy=b'1681'>,
 'movie_title': <tf.Tensor: shape=(), dtype=string, numpy=b'You So Crazy (1994)'>}


In [5]:
ratings = full_ratings.map(lambda x:{
    "movie_title": x["movie_title"],
    "user_id": x["user_id"]
})
print(type(ratings), len(ratings))
pprint(next(iter(ratings)))

<class 'tensorflow.python.data.ops.dataset_ops.MapDataset'> 100000
{'movie_title': <tf.Tensor: shape=(), dtype=string, numpy=b"One Flew Over the Cuckoo's Nest (1975)">,
 'user_id': <tf.Tensor: shape=(), dtype=string, numpy=b'138'>}


In [6]:
movie_titles = full_movies.map(lambda x: x["movie_title"])
print(type(movie_titles), len(movie_titles))
pprint(next(iter(movie_titles)))

<class 'tensorflow.python.data.ops.dataset_ops.MapDataset'> 1682
<tf.Tensor: shape=(), dtype=string, numpy=b'You So Crazy (1994)'>


In [7]:
tf.random.set_seed(1)
shuffled_ratings = ratings.shuffle(100_000, seed=1, reshuffle_each_iteration=False)
trainset = shuffled_ratings.take(80_000)
testset = shuffled_ratings.skip(80_000).take(20_000)

#### Sidebar: How does `batch` work?

In [8]:
# movies.batch(n) will return an iterator that will yield n movies at a time
# until all moives have been iterated.
pprint(next(iter(movie_titles.batch(3))))

<tf.Tensor: shape=(3,), dtype=string, numpy=
array([b'You So Crazy (1994)', b'Love Is All There Is (1996)',
       b'Fly Away Home (1996)'], dtype=object)>


In [9]:
for i, batch in enumerate(movie_titles.batch(1000)):
    print(f"batch {i}: {batch.shape}")

batch 0: (1000,)
batch 1: (682,)


In [10]:
# just like with any iterator we can get all the batches with a call to `list`
tp = list(movie_titles.batch(1000))
print(len(tp))
print(tp[0].shape)
print(tp[1].shape)

2
(1000,)
(682,)


In [11]:
# Similarly ratings.batch will also return an iterator. However, because each element in ratings is
# a dict, the batch method will yield a columnar dict with values as tensors. 
# In case of movie_titles, each element was a tensor, which is why the iterator yielded a row tensor.
pprint(next(iter(ratings.batch(3))))

{'movie_title': <tf.Tensor: shape=(3,), dtype=string, numpy=
array([b"One Flew Over the Cuckoo's Nest (1975)",
       b'Strictly Ballroom (1992)', b'Very Brady Sequel, A (1996)'],
      dtype=object)>,
 'user_id': <tf.Tensor: shape=(3,), dtype=string, numpy=array([b'138', b'92', b'301'], dtype=object)>}


#### Back to main tutorial
There is no simple way to get all the movie titles in a single list. In PyTorch I'd have simply list'ed the entire `Dataset` and be done with it. But here I have to first create the batch iterator  (similar to PyTorch `Dataloader`), and then extract the data batch-by-batch, and finally concatenate all the batch tensors into one big tensor

In [12]:
EMBEDDING_DIM = 32

In [13]:
# Lets create the movie_title embedding table

# Lets first get the list of all 1600 movie titles
it = movie_titles.batch(1000)
all_movie_titles = np.concatenate(list(it))
print(type(all_movie_titles), all_movie_titles.shape)

# and then get the unique movie titles from this ndarray
uniq_movie_titles = np.unique(all_movie_titles)
print(uniq_movie_titles.shape)

movie_title_emb = tf.keras.layers.Embedding(len(uniq_movie_titles) + 1, EMBEDDING_DIM)

<class 'numpy.ndarray'> (1682,)
(1664,)


In [14]:
# Now lets create the user_id embedding table

# get the list of all 100_000 user_ids in batches of 1000
it = ratings.batch(1000).map(lambda x: x["user_id"])
all_user_ids = np.concatenate(list(it))
print(type(all_user_ids), all_user_ids.shape)

# and then get the unique user_ids from this ndarray
uniq_user_ids = np.unique(all_user_ids)
print(uniq_user_ids.shape)

user_id_emb = tf.keras.layers.Embedding(len(uniq_user_ids) + 1, EMBEDDING_DIM)

<class 'numpy.ndarray'> (100000,)
(943,)


In [15]:
# Create the two towers, the request tower (aka user tower) and the candidate tower (aka movie tower)

# We need to map the string user_ids and movie_titles into their respective idx vals
user_id2idx = tf.keras.layers.experimental.preprocessing.StringLookup(vocabulary=uniq_user_ids, mask_token=None)
movie_title2idx = tf.keras.layers.experimental.preprocessing.StringLookup(vocabulary=uniq_movie_titles, mask_token=None)

# And then pass the indexes through the embedding tables
user_model = tf.keras.Sequential([user_id2idx, user_id_emb])
movie_model = tf.keras.Sequential([movie_title2idx, movie_title_emb])

In [16]:
# Seems like FactorizedTopK needs all the candidate **embeddings**. 
# Remember this is not eager execution (unlike PT) so this compute graph is simply set up at this
# time, there is no danger of getting untrained embeddings out just yet.
metrics = tfrs.metrics.FactorizedTopK(candidates=movie_titles.batch(128).map(movie_model))

### Retrieval Approach

The main approach is a bit unexpected. Lets say the input consists of a batch of 3 <user_id, movie_title> tuples.

$$
\begin{bmatrix}
\text{user}_1 & \text{movie}_1 \\
\text{user}_2 & \text{movie}_2 \\
\text{user}_3 & \text{movie}_3 \\
\end{bmatrix}
$$

The first step is to take all the users and get their embeddings from the `user_model`. Lets say our embedding dimension is 2 and the embedding of $\text{user}_1$ is $u_1 = \left[u_{11} \; u_{12}\right]$ and so on. Do the same for the movies.

$$
U = \begin{bmatrix}
u_{11} & u_{12} \\
u_{21} & u_{22} \\
u_{31} & u_{32} \\
\end{bmatrix}
$$

$$
M = \begin{bmatrix}
m_{11} & m_{12} \\
m_{21} & m_{22} \\
m_{31} & m_{32} \\
\end{bmatrix}
$$

These two embeddings are passed to the `Retrieval` *task* (wonder why they decided to call this layer task?) which interacts the embeddings and calculates the softmax cross entropy loss. The loss is softmax cross entropy instead of binary cross entropy because this layer does something interesting. It uses the input batch of embeddings to create a bunch of negative samples like so.

First it will calculate the dot product of each user with all the three movies, the one movie that the user has seen, and the other two that they haven't! This way we get 1 positive and 2 negative samples.

$$
U.M^T = \begin{bmatrix}
u_1.m_1 & u_1.m_2 & u_1.m_2 \\
u_2.m_1 & u_2.m_2 & u_2.m_2 \\
u_3.m_1 & u_3.m_2 & u_3.m_3 \\
\end{bmatrix}
$$

Then it will calculate the corresponding labels simply as an $I$.

$$
L = \begin{bmatrix}
1 & 0 & 0 \\
0 & 1 & 0 \\
0 & 0 & 1 \\
\end{bmatrix}
$$

Now these two matrices can be thought of as the inputs and labels of a multi-class classification problem whose loss is a softmax loss. 

In this approach, it is possible that $u_1$ and $m_2$, which have been taken as a negative sample, could actually be a positive sample outside of this particular mini-batch. The `Retrieval` class addresses this by zero-ing out the logits of such pairs. At first glance this might seem like a no-op because softmax does not take into account zeros, it only takes the logit corresponding to the 1 in the one-hot label. And the label for $u1^Tm_2$ is $0$, so how does changing its logit matter? However, by zero-ing out the logit of an element in the row gives all the other elements a greater share of the probability distribution thereby increasing the probability of the $u_1^Tm_1$. This is needed because the model might have learnt to give greater dot products for $u_1^Tm_2$ based on other mini-batches. And because of the high value of that element in the logits for this mini-batch, the probability associated with $u_1^Tm_1$, the positive sample in this batch will have a lower probability and a higher cost. This will prevent the model from learning to give a high dot product for these embeddings. The `Retrieval` class takes in a tensor of `candidate_ids` to address the accidental negative hits. However, the exact structure of this tensor is not well documented. I was not able to reverse-engineer its structure even after looking at the source code.

The metrics are calculated based on straight up positive samples. **TODO:** Dig into the metrics calculation code to figure out exactly what is happening.

In [17]:
task = tfrs.tasks.Retrieval(metrics=metrics)

# tfrs has provided a convenience module that calls the forward method using gradient tape.
# This is a very simple model that just takes the 
class MovielensModel(tfrs.Model):
    def __init__(self, user_model, movie_model):
        super().__init__()
        self.movie_model = movie_model
        self.user_model = user_model
        self.task = task
        
    def compute_loss(self, features, training=False):
        user_embeddings = self.user_model(features["user_id"])
        positive_movie_embeddings = self.movie_model(features["movie_title"])
        return self.task(user_embeddings, positive_movie_embeddings)

In [18]:
model = MovielensModel(user_model, movie_model)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

In [19]:
trainloader = trainset.shuffle(100_000).batch(8192).cache()
model.fit(trainloader, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f84280da240>

# Serving

The user and movie models have been trained in such a way that the embeddings they output will have a larger dot product if the user is likely to watch the movie. If $u$ is the user embedding and $m$ is the movie embedding, then the larger $u^Tm$ is, the more likely the user is going to watch the movie. This property can be used during serving time to make retrieval fast. 

At system startup time, all the movie embeddings are calculated. Then, when a request comes in, the user embedding of the requesting user is calculated. Now a simple nearest neighbor algorithm can be used to find the top $k$ movies whose dot product with this user is the highest.

In [20]:
from annoy import AnnoyIndex

In [21]:
index = AnnoyIndex(EMBEDDING_DIM, "dot")

In [22]:
# here we give each movie an "id" which just its index value in the dataset
movie_embeddings = movie_titles.enumerate().map(lambda idx, title: (idx, title, model.movie_model(title)))





In [25]:
movie_id_to_title = dict((idx, title) for idx, title, _ in movie_embeddings.as_numpy_iterator())

# We unbatch the dataset because Annoy accepts only scalar (id, embedding) pairs.
for movie_id, _, movie_embedding in movie_embeddings.as_numpy_iterator():
    index.add_item(movie_id, movie_embedding)

# Build a 10-tree ANN index.
index.build(10)

True

In [27]:
for row in testset.batch(1).take(3):
    query_embedding = model.user_model(row["user_id"])[0]
    candidates = index.get_nns_by_vector(query_embedding, 3)
    print(f"Candidates: {[movie_id_to_title[x] for x in candidates]}.")

Candidates: [b"Kid in King Arthur's Court, A (1995)", b'Now and Then (1995)', b'Kazaam (1996)'].
Candidates: [b'Hot Shots! Part Deux (1993)', b'Heavy Metal (1981)', b'Nightmare Before Christmas, The (1993)'].
Candidates: [b'Once Upon a Time in the West (1969)', b'Spellbound (1945)', b'Notorious (1946)'].
