https://www.tensorflow.org/api_docs/python/tf/data/Dataset

In [6]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
from typing import Dict, Text
import pprint
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.data import Dataset
from tensorflow.data.experimental import unique

from tensorflow.keras            import Sequential
from tensorflow.keras.layers     import Embedding
from tensorflow.keras.optimizers import Adagrad
from tensorflow.keras.layers.experimental.preprocessing import StringLookup

from sklearn.model_selection import train_test_split

import tensorflow_recommenders as tfrs
from tensorflow_recommenders.tasks                   import Retrieval
from tensorflow_recommenders.metrics                 import FactorizedTopK
from tensorflow_recommenders.layers.factorized_top_k import BruteForce

In [8]:
def get_ratings(dataset):
    ratings = pd.read_csv(f'./datasets/{dataset}/ratings.csv')
    movies  = pd.read_csv(f'./datasets/{dataset}/movies.csv')

    ratings = pd \
        .merge(ratings, movies, how='inner', on=['movieId']) \
        .filter(items=['userId', 'title']) \
        .rename(columns={"userId": "user_id", "title": "movie_title"}) \
        .astype({'user_id': 'str'})

    ratings = Dataset.from_tensor_slices(dict(ratings))

    ratings = ratings.map(lambda x: {
        "movie_title": x["movie_title"],
        "user_id": x["user_id"]
    })
    return ratings

def show(stream): [pprint.pprint(x) for x in stream.as_numpy_iterator()]

In [9]:
dataset = 'ml-latest-small'
# dataset = 'ml-latest'

ratings = get_ratings(dataset)
ratings

<MapDataset shapes: {movie_title: (), user_id: ()}, types: {movie_title: tf.string, user_id: tf.string}>

In [2]:
class EmbeddingModelFactory:
    @staticmethod
    def create(ds, column, embedding_size=64):
        unique_values = ds.map(lambda x: x[column]).apply(unique())

        lookup_layer = StringLookup(mask_token=None)
        lookup_layer.adapt(unique_values)

        return Sequential([
            lookup_layer,
            Embedding(lookup_layer.vocabulary_size(), embedding_size)
        ])

In [10]:
class CantidatesRetrievalModel(tfrs.Model):
    def __init__(self, ds, query_column, candidate_column, embedding_size=64):
        super().__init__()
        self.query_column  = query_column
        self.candidate_column = candidate_column

        self.query_model = EmbeddingModelFactory.create(
            ds, 
            self.query_column, 
            embedding_size
        )
        
        self.candidate_model = EmbeddingModelFactory.create(
            ds, 
            self.candidate_column, 
            embedding_size
        )

        candidates = ds.map(lambda x: x[self.candidate_column])
        candidate_embedings = candidates \
            .batch(128) \
            .map(self.candidate_model)

        self.task = Retrieval(metrics=FactorizedTopK(candidate_embedings))

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        query_embeddings = self.query_model(features[self.query_column])
        positive_candidate_embeddings = self.candidate_model(features[self.candidate_column])
        
        return self.task(query_embeddings, positive_candidate_embeddings)

In [11]:
# Create a retrieval model.
model = CantidatesRetrievalModel(
    ratings,
    query_column     ='user_id',
    candidate_column ='movie_title'
)
model.compile(optimizer=Adagrad(0.5))

In [12]:
model.fit(ratings.batch(4096), epochs=10)

Epoch 1/10
Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f1824935850>

In [13]:
def show_top_k(queries, candidates):
    index = 0
    print(f'Top K Candidates:')
    for query in queries:
        print(f'\nQuery {query}:')
        for c in candidates[index]:
            print(f'  - {c}')
        index += 1

In [14]:
class CantidateRetriever:
    def __init__(self, model, df, candidate_column,  k=5):
        # Use brute-force search to set up retrieval using the trained representations.
        self.index = BruteForce(model.query_model, k=k)

        cantidates = ratings.map(lambda x: x[candidate_column])
        cantidate_embedings = cantidates \
            .batch(100) \
            .map(model.candidate_model)
        
        self.index.index(cantidate_embedings, cantidates)
    
    def retrieve(self, queries):
        _, cantidates = self.index(np.array(queries))
        return cantidates
    
    def inspect(self, queries):
        candidates = self.retrieve(queries)
        show_top_k(queries, candidates)

In [15]:
retriever = CantidateRetriever(
    model, 
    ratings, 
    candidate_column='movie_title'
)

In [16]:
candidates = retriever.inspect(['42', '70'])

Top K Candidates:

Query 42:
  - b'Varsity Blues (1999)'
  - b'Varsity Blues (1999)'
  - b'Varsity Blues (1999)'
  - b'Varsity Blues (1999)'
  - b'Varsity Blues (1999)'

Query 70:
  - b'Breaking Away (1979)'
  - b'Breaking Away (1979)'
  - b'Breaking Away (1979)'
  - b'Breaking Away (1979)'
  - b'Breaking Away (1979)'
