In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

In [2]:
ratings = tfds.load('movielens/100k-ratings', split='train')

In [3]:
count = 0
for datum in ratings:
    if count > 10:
        break
    user_rating = datum['user_rating']
    movie_title = datum['movie_title']
    user_id = datum['user_id']
    print(f"user {user_id.numpy().decode('utf-8')} rated {movie_title.numpy().decode('utf-8')} with {user_rating.numpy()}")
    count += 1

user 138 rated One Flew Over the Cuckoo's Nest (1975) with 4.0
user 92 rated Strictly Ballroom (1992) with 2.0
user 301 rated Very Brady Sequel, A (1996) with 4.0
user 60 rated Pulp Fiction (1994) with 4.0
user 197 rated Scream 2 (1997) with 3.0
user 601 rated Crash (1996) with 4.0
user 710 rated Aladdin (1992) with 3.0
user 833 rated True Romance (1993) with 2.0
user 916 rated Bob Roberts (1992) with 5.0
user 940 rated Starship Troopers (1997) with 2.0
user 611 rated Sphere (1998) with 1.0


In [4]:
ratings = ratings.map(lambda x: { 'movie_title': x['movie_title'], 'user_id': x['user_id'] })

In [5]:
movies = tfds.load('movielens/100k-movies', split='train')

In [6]:
count = 0
for movie in movies:
    if count > 10:
        break
    tensor = movie['movie_title']
    print(tensor.numpy().decode('utf-8'))
    count += 1

You So Crazy (1994)
Love Is All There Is (1996)
Fly Away Home (1996)
In the Line of Duty 2 (1987)
Niagara, Niagara (1997)
Young Poisoner's Handbook, The (1995)
Age of Innocence, The (1993)
Flirt (1995)
Frisk (1995)
unknown
Girls Town (1996)


In [7]:
movies = movies.map(lambda x: x['movie_title'])

In [24]:
class TwoTowerModel(tfrs.Model):
    def __init__(self):
        super().__init__()
        
        embedding_dim = 32
        num_unique_users = 1000
        num_unique_movies = 1700
        eval_batch_size = 128
        
        # Turn raw user IDs into contiguous integers via a lookup vocab map.
        # Map the result into embedding vectors.
        self.user_model = tf.keras.Sequential([
            tf.keras.layers.experimental.preprocessing.StringLookup(max_tokens=num_unique_users),
            tf.keras.layers.Embedding(num_unique_users, embedding_dim)
        ])
        
        # Same for movies.
        self.movie_model = tf.keras.Sequential([
            tf.keras.layers.experimental.preprocessing.StringLookup(max_tokens=num_unique_movies),
            tf.keras.layers.Embedding(num_unique_movies, embedding_dim)
        ])
        
        # Task object has two purposes:
        # (1) It computes the loss
        # (2) It keeps track of metrics
        
        # The metric is top-k: given a user and a known watched movie, how highly would the model rank
        # the true movie out of all possible movies?
        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(candidates=movies.batch(eval_batch_size).map(self.movie_model))
        )
        
    def compute_loss(self, features, training=False):
        user_embeddings = self.user_model(features['user_id'])
        movie_embeddings = self.movie_model(features['movie_title'])
        return self.task(user_embeddings, movie_embeddings)

In [30]:
model = TwoTowerModel()
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))
model.fit(ratings.batch(4096), verbose=True, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f576d51f0f0>

In [23]:
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
index.index(movies.batch(100).map(model.movie_model), movies)

_, titles = index(tf.constant(['42']))
print(f"Recommendations for user 42: {titles[0, :3]}")

Recommendations for user 42: [b'You So Crazy (1994)' b'Love Is All There Is (1996)'
 b'Fly Away Home (1996)']
