# Recommending movies: ranking

Adapted from the TensorFlow Recommenders tutorials; see https://www.tensorflow.org/recommenders/examples/quickstart


## Imports


Let's first get our imports out of the way.

In [None]:
!pip install -q --upgrade tensorflow==2.3.0
!pip install -q --upgrade tensorflow-datasets
!pip install -q --upgrade tensorflow-recommenders

In [None]:
import os
import pprint
import tempfile

from typing import Dict, Text

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

In [None]:
import tensorflow_recommenders as tfrs

## Preparing the dataset


In [None]:
# Load the MovieLens 100K ratings and keep only the features used in this
# notebook.
ratings = tfds.load("movie_lens/100k-ratings", split="train").map(
    lambda x: {
        key: x[key]
        for key in ("movie_id", "movie_title", "user_id", "user_rating", "timestamp")
    }
)


# Pretty-print the first ten examples to inspect the schema.
for example in ratings.take(10).as_numpy_iterator():
    pprint.pprint(example)

### Movie ID Model 

In [None]:
import numpy as np
import tensorflow as tf

# Lookup layer for raw movie-id strings; its vocabulary is learned later via adapt().
movie_id_lookup = tf.keras.layers.experimental.preprocessing.StringLookup()

In [None]:
# Learn the movie-id vocabulary from the movie ids found in the ratings.
movie_id_lookup.adapt(ratings.map(lambda x: x["movie_id"]))

# Show the first ten vocabulary entries as a sanity check.
print(f"Vocabulary: {movie_id_lookup.get_vocabulary()[:10]}")

In [None]:
# Run two raw movie ids through the lookup layer and display the result.
movie_id_lookup(["50", "258"])

In [None]:
# Embedding table with 32-dimensional rows for the movie ids.
movie_id_embedding = tf.keras.layers.Embedding(
    # The table size comes from the adapted StringLookup vocabulary; no hashing
    # is involved here.
    input_dim=movie_id_lookup.vocab_size(),
    output_dim=32
)

In [None]:
# Chain the lookup and the embedding into a single model.
movie_id_model = tf.keras.Sequential([movie_id_lookup, movie_id_embedding])

In [None]:
# Embed a single raw movie id to check the pipeline end to end.
movie_id_model(["258"])

### User ID Model 

In [None]:
# NOTE(review): despite its name, no hashing layer is used in this cell; the
# value is only the number of rows of the user embedding table.
num_hashing_bins = 20_000
user_id_lookup = tf.keras.layers.experimental.preprocessing.StringLookup()
user_id_embedding = tf.keras.layers.Embedding(num_hashing_bins, 32)

# Learn the user-id vocabulary from the user ids found in the ratings.
user_id_lookup.adapt(ratings.map(lambda x: x["user_id"]))

# Chain lookup and embedding, then embed one user id as a smoke test.
user_id_model = tf.keras.Sequential([user_id_lookup, user_id_embedding])
user_id_model(["1"])


We'll split the data by putting 80% of the ratings in the train set, and 20% in the test set.

In [None]:
# Fixed seeds so the shuffle, and therefore the split, is reproducible.
tf.random.set_seed(42)
# reshuffle_each_iteration=False keeps one fixed order across iterations, which
# the take()/skip() split below relies on.
shuffled = ratings.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

# First 80,000 shuffled ratings for training, the following 20,000 for testing.
train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

In [None]:
# Print three raw timestamps to see what the feature looks like.
for x in ratings.take(3).as_numpy_iterator():
    print(f"Timestamp: {x['timestamp']}.")

In [None]:
# Find the smallest and largest timestamp in the ratings. The reductions are
# seeded with the int64 extremes (instead of the constants 0 and 1e9) so the
# bounds are correct for any timestamp range, not only for values that happen
# to lie between those two constants.
timestamps = ratings.map(lambda x: x["timestamp"])
int64_limits = np.iinfo(np.int64)

max_timestamp = timestamps.reduce(
    tf.constant(int64_limits.min, dtype=tf.int64), tf.maximum).numpy()
min_timestamp = timestamps.reduce(
    tf.constant(int64_limits.max, dtype=tf.int64), tf.minimum).numpy()

# 1000 evenly spaced bucket boundaries spanning the observed range.
timestamp_buckets = np.linspace(
    min_timestamp, max_timestamp, num=1000)

print(f"Buckets: {timestamp_buckets[:3]}")

In [None]:
# Discretise raw timestamps with the boundaries in timestamp_buckets, then
# embed the resulting bucket index as a 32-dimensional vector.
timestamp_embedding_model = tf.keras.Sequential([
    tf.keras.layers.experimental.preprocessing.Discretization(
        timestamp_buckets.tolist()),
    tf.keras.layers.Embedding(len(timestamp_buckets) + 1, 32),
])

# Push one batched timestamp through the model as a smoke test.
single_timestamp_batches = ratings.take(1).map(lambda x: x["timestamp"]).batch(1)
for timestamp in single_timestamp_batches.as_numpy_iterator():
    print(f"Timestamp embedding: {timestamp_embedding_model(timestamp)}.")

In [None]:
# Text vectorization layer for movie titles; its vocabulary is learned from the
# titles found in the ratings.
title_text = tf.keras.layers.experimental.preprocessing.TextVectorization()
title_text.adapt(ratings.map(lambda x: x["movie_title"]))

In [None]:
# Vectorize the title of a single example and print the result.
for row in ratings.batch(1).map(lambda x: x["movie_title"]).take(1):
    print(title_text(row))

In [None]:
# Display vocabulary entries 40 to 44 of the title vectorizer.
title_text.get_vocabulary()[40:45]

In [None]:
class UserModel(tf.keras.Model):
    """Embeds the user id and the rating timestamp of each example.

    Relies on the notebook-level ``user_id_lookup`` (which must already be
    adapted, because its vocabulary size is read here) and on
    ``timestamp_buckets``.
    """

    def __init__(self):
        super().__init__()

        # Raw user-id string -> vocabulary index -> 32-d embedding.
        self.user_embedding = tf.keras.Sequential([
            user_id_lookup,
            tf.keras.layers.Embedding(user_id_lookup.vocab_size(), 32),
        ])
        # Raw timestamp -> bucket index -> 32-d embedding.
        self.timestamp_embedding = tf.keras.Sequential([
            tf.keras.layers.experimental.preprocessing.Discretization(timestamp_buckets.tolist()),
            tf.keras.layers.Embedding(len(timestamp_buckets) + 2, 32)
        ])

    def call(self, inputs):
        """Return the user-id and timestamp embeddings joined along axis 1.

        Args:
            inputs: feature dictionary providing the ``"user_id"`` and
                ``"timestamp"`` entries.

        Returns:
            The concatenation of the two embeddings.
        """
        # Take the input dictionary, pass it through each input layer,
        # and concatenate the result.
        return tf.concat([
            self.user_embedding(inputs["user_id"]),
            self.timestamp_embedding(inputs["timestamp"]),
        ], axis=1)

In [None]:
class MovieModel(tf.keras.Model):
    """Embeds the movie id and the tokenised movie title of each example.

    Relies on the notebook-level ``movie_id_lookup`` (already adapted) and on
    the ``ratings`` dataset, which is used to learn the title vocabulary.
    """

    def __init__(self):
        super().__init__()

        max_tokens = 10_000

        # Raw movie-id string -> vocabulary index -> 32-d embedding.
        self.id_embedding = tf.keras.Sequential([
            movie_id_lookup,
            tf.keras.layers.Embedding(movie_id_lookup.vocab_size(), 32)
        ])

        # Every instance owns a fresh TextVectorization layer, so its
        # vocabulary has to be learned here. Without this, an instance that is
        # built indirectly (e.g. inside another model) and never adapted by
        # hand would tokenise titles with an empty vocabulary.
        title_vectorizer = tf.keras.layers.experimental.preprocessing.TextVectorization(
            max_tokens=max_tokens)
        title_vectorizer.adapt(ratings.map(lambda x: x["movie_title"]))

        self.title_text_embedding = tf.keras.Sequential([
            title_vectorizer,
            tf.keras.layers.Embedding(max_tokens, 32, mask_zero=True),
            # We average the embedding of individual words to get one embedding
            # vector per title.
            tf.keras.layers.GlobalAveragePooling1D(),
        ])

    def call(self, inputs):
        """Return the movie-id and title embeddings joined along axis 1.

        Args:
            inputs: feature dictionary providing the ``"movie_id"`` and
                ``"movie_title"`` entries.

        Returns:
            The concatenation of the two embeddings.
        """
        return tf.concat([
            self.id_embedding(inputs["movie_id"]),
            self.title_text_embedding(inputs["movie_title"]),
        ], axis=1)

In [None]:
# Vectorize the titles of one batch of ten examples and print the result.
for row in ratings.batch(10).map(lambda x: x["movie_title"]).take(1):
  print(title_text(row))

In [None]:
# Build a standalone movie model.
movie_model = MovieModel()

# Fit the first layer of the title pipeline (the text vectorizer) on the titles.
movie_model.title_text_embedding.layers[0].adapt(
    ratings.map(lambda x: x["movie_title"]))

# Print the first three components of the representation of one example.
for row in ratings.batch(1).take(1):
    print(f"Computed representations: {movie_model(row)[0, :3]}")

## Implementing a model

### Architecture

Ranking models do not face the same efficiency constraints as retrieval models do, and so we have a little bit more freedom in our choice of architectures.

A model composed of multiple stacked dense layers is a relatively common architecture for ranking tasks. We can implement it as follows:

In [None]:
class RankingModel(tf.keras.Model):
    """Predicts a rating from the concatenated user and movie embeddings."""

    def __init__(self):
        super().__init__()

        # Compute embeddings for users.
        self.user_embeddings = UserModel()

        # Compute embeddings for movies.
        self.movie_embeddings = MovieModel()

        # Compute predictions.
        self.ratings = tf.keras.Sequential([
            # Learn multiple dense layers.
            tf.keras.layers.Dense(256, activation="relu"),
            tf.keras.layers.Dense(64, activation="relu"),
            # Make rating predictions in the final layer.
            tf.keras.layers.Dense(1)
        ])

    def call(self, inputs):
        """Return the predicted rating for each example.

        Args:
            inputs: feature dictionary; the same dictionary is handed to both
                the user model and the movie model.

        Returns:
            The output of the final single-unit dense layer.
        """
        user_embedding = self.user_embeddings(inputs)
        movie_embedding = self.movie_embeddings(inputs)
        return self.ratings(tf.concat([user_embedding, movie_embedding], axis=1))

This model takes user ids, timestamps, movie ids and movie titles, and outputs a predicted rating.

### Loss and metrics

The next component is the loss used to train our model. TFRS has several loss layers and tasks to make this easy.

In this instance, we'll make use of the `Ranking` task object: a convenience wrapper that bundles together the loss function and metric computation. 

We'll use it together with the `MeanSquaredError` Keras loss in order to predict the ratings.

```python
task = tfrs.tasks.Ranking(
  loss = tf.keras.losses.MeanSquaredError(),
  metrics=[tf.keras.metrics.RootMeanSquaredError()]
)
```

The task itself is a Keras layer that takes the true and predicted ratings as arguments, and returns the computed loss. We'll use that to implement the model's training loop.

### The full model

We can now put it all together into a model. TFRS exposes a base model class (`tfrs.models.Model`) which streamlines building models: all we need to do is to set up the components in the `__init__` method, and implement the `compute_loss` method, taking in the raw features and returning a loss value.

The base model will then take care of creating the appropriate training loop to fit our model.

In [None]:
class MovielensModel(tfrs.models.Model):
    """TFRS model that trains ``RankingModel`` on the observed user ratings."""

    def __init__(self):
        super().__init__()
        self.ranking_model: tf.keras.Model = RankingModel()
        # Mean squared error as the loss, RMSE as the reported metric.
        self.task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
            loss=tf.keras.losses.MeanSquaredError(),
            metrics=[tf.keras.metrics.RootMeanSquaredError()]
        )

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        """Compute the ranking loss for one batch of features.

        Args:
            features: feature dictionary; it is passed unchanged to the
                ranking model and must also contain ``"user_rating"``, which
                is used as the label.
            training: accepted for interface compatibility; not used here.

        Returns:
            The loss computed by the ranking task.
        """
        rating_predictions = self.ranking_model(features)
        # The task computes the loss and the metrics.
        return self.task(labels=features["user_rating"], predictions=rating_predictions)

## Fitting and evaluating

After defining the model, we can use standard Keras fitting and evaluation routines to fit and evaluate the model.

Let's first instantiate the model.

In [None]:
# Instantiate the full model and compile it with Adam at a 0.001 learning rate.
model = MovielensModel()
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001))

Then shuffle, batch, and cache the training and evaluation data.

In [None]:
# NOTE(review): cache() sits after shuffle(), so presumably the order produced
# on the first pass is what gets cached and replayed on later epochs — confirm
# whether per-epoch reshuffling was intended.
cached_train = train.shuffle(100_000).batch(10_000).cache()
# The test split is only batched and cached.
cached_test = test.batch(4096).cache()

Then train the model:

In [None]:
# Train for 200 epochs on the cached training batches.
model.fit(cached_train, epochs=200)

As the model trains, the loss is falling and the RMSE metric is improving.

Finally, we can evaluate our model on the test set:

In [None]:
# Evaluate on the cached test batches; results come back as a dictionary.
model.evaluate(cached_test, return_dict=True)

In [None]:
import json

import numpy 
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# User whose search results are re-ranked, as an int and as a string id.
rerank_user = 88
user_id = str(rerank_user)

# Pickled dataframes read from the current working directory.
user_item_df = pd.read_pickle("user_item_df.p")
item_df = pd.read_pickle("item_df.p")

# Names of the genre columns used when summarising and describing movies.
genres = [
    'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
    'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]


def plot_heat_map(df, figsize=(10,7)):
    """Draw a heat map of ``df`` after scaling every row to sum to 1.

    Args:
        df: dataframe of numeric values; each row is divided by its own total.
        figsize: size of the newly created figure.
    """
    row_totals = df.sum(axis=1)
    normalised = df.div(row_totals, axis=0)
    # A new figure is created first; the heat map is then drawn on it.
    plt.subplots(figsize=figsize)
    sns.heatmap(normalised)


u_id = user_id

# Take an explicit copy of the filtered rows: adding a column to a slice of
# user_item_df triggers pandas' SettingWithCopyWarning and is not guaranteed
# to write to the frame we go on to use.
tester_df = user_item_df[user_item_df['uid'] == int(u_id)].copy()
# A rating above 3 counts as a positive review.
tester_df['positive'] = tester_df['rating'] > 3

# Sum the genre columns separately for positive and non-positive reviews and
# plot the row-normalised result.
review = tester_df[['positive'] + genres].groupby(['positive']).sum()
plot_heat_map(review, figsize=(10,5))

In [None]:
import getpass
import os

# Connection settings for the search cluster.
# SECURITY: the password must never be committed to the notebook. It is read
# from the ES_MASTER_PASSWORD environment variable, or prompted for when that
# variable is not set. The password that used to be hard-coded in this cell
# should be treated as leaked and rotated.
master_user = os.environ.get("ES_MASTER_USER", "yianc")
master_user_password = os.environ.get("ES_MASTER_PASSWORD") or getpass.getpass(
    "Elasticsearch master user password: ")
elastic_search_endpoint = os.environ.get(
    "ES_ENDPOINT",
    "search-p13n-search-demo-gezwjq5aol2p7u2gvje4ni7oom.us-west-2.es.amazonaws.com",
)

In [None]:
import requests 

# Ask the search index for up to 100 documents matching Drama:1.
r = requests.get(
    'https://{}/movies/movie/_search?q=Drama:1&size=100'.format(elastic_search_endpoint),
    auth=(master_user, master_user_password),
    # Without a timeout an unreachable endpoint would block the notebook forever.
    timeout=30,
)
# Fail here on an HTTP error (e.g. bad credentials) rather than later on a
# missing key in the response body.
r.raise_for_status()
rjson = r.json()
rjson


In [None]:
# Collect the item id ('iid') of every hit, in the order the search returned them.
items_from_search = [hit['_source']['iid'] for hit in rjson['hits']['hits']]



In [None]:
import time 

def get_input_dic_by_movie_user_id(user_id, movie_id, movie_df=item_df):
    """Build a one-example feature dictionary for the ranking model.

    Args:
        user_id: user identifier; converted to ``str`` for the ``user_id``
            feature.
        movie_id: movie identifier; converted to ``int`` to look the movie up
            in ``movie_df`` and to ``str`` for the ``movie_id`` feature.
        movie_df: dataframe with an ``iid`` column and a ``title`` column.

    Returns:
        A dict with one-element tensors under the keys ``movie_id``,
        ``user_id``, ``movie_title`` and ``timestamp``. The timestamp is the
        current time in whole seconds.

    Raises:
        ValueError: if ``movie_id`` cannot be converted to ``int``.
        IndexError: if ``movie_df`` has no row with that ``iid``.
    """
    matches = movie_df[movie_df['iid'] == int(movie_id)]
    if matches.empty:
        raise IndexError(f"no movie with iid={movie_id!r} in movie_df")
    title_str = matches.iloc[0]['title']

    return {
        'movie_id': tf.convert_to_tensor([str(movie_id)]),
        'user_id': tf.convert_to_tensor([str(user_id)]),
        'movie_title': tf.convert_to_tensor([title_str]),
        # The timestamps in the ratings dataset are int64, so the current time
        # is passed with the same dtype instead of as a float.
        'timestamp': tf.convert_to_tensor([int(time.time())], dtype=tf.int64),
    }





def get_movie_by_id(movie_id, movie_df=item_df):
    """Return a display string ``"<title> genres:<g1>,<g2>,..."`` for a movie.

    The lookup is deliberately best-effort: any failure (unknown id, id that
    is not int-convertible, missing column, ...) yields a fixed error string
    instead of an exception, so that building a result table never aborts.

    Args:
        movie_id: movie identifier, as an int or a numeric string; it is
            converted to ``int`` before being compared with the ``iid`` column.
        movie_df: dataframe with ``iid``, ``title`` and one column per entry
            of the notebook-level ``genres`` list.

    Returns:
        The title followed by the genres whose column equals 1, or
        ``"Error obtaining movie info"`` when the lookup fails.
    """
    try:
        c_row = movie_df[movie_df['iid'] == int(movie_id)].iloc[0]
        m_genres = [g for g in genres if c_row[g] == 1]
        return c_row['title'] + " genres:" + ",".join(m_genres)
    except Exception:
        # Exception (not a bare except) so KeyboardInterrupt and SystemExit
        # still propagate.
        return "Error obtaining movie info"

In [None]:
# Describe every search hit, keeping the order returned by the search engine.
rerank_list = [get_movie_by_id(item) for item in items_from_search]
# Single-column table labelled with the user being re-ranked.
rerank_df = pd.DataFrame(rerank_list, columns=[rerank_user])
rerank_df

In [None]:
import numpy 
# Score every search hit for the selected user with the trained ranking model.
ranked_list = []
for item in items_from_search:
    movie = get_movie_by_id(item)
    inf_input = get_input_dic_by_movie_user_id(str(rerank_user), str(item))
    score = model.ranking_model(inf_input)
    # The model output is indexed [0][0] to get the single predicted rating.
    ranked_list.append([movie, score.numpy()[0][0]])

# Highest predicted rating first.
ranked_list = sorted(ranked_list, key=lambda x: x[1], reverse=True)
# Take the descriptions directly from the list; this also works when the
# search returned no hits, unlike slicing a numpy array built from it.
ranked_df = pd.DataFrame([entry[0] for entry in ranked_list], columns=['Re-Ranked'])
# Original search order and re-ranked order side by side.
all_df = pd.concat([rerank_df, ranked_df], axis=1)
# None means "do not truncate"; the former value -1 is deprecated and is
# rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)
all_df

The lower the RMSE metric, the more accurate our model is at predicting ratings.