###  Created by Luis A. Sanchez-Perez (l.alejandro.2011@gmail.com).
<p><span style="color:green"><b>Copyright &#169;</b> Do not distribute or use without authorization from author.</span></p>

In [1]:
import altair as alt
import pandas as pd
import os
import pathlib
import tensorflow as tf
import numpy as np
import json
from sklearn.manifold import TSNE

In [2]:
# Directory of the exported SavedModel whose embeddings are inspected in this notebook
# (relative to the notebook's working directory).
MODEL = 'models/BasicBatchedMF/20210321-091553/1/'
# Root folder of the raw datasets, read from the DATASETS environment variable.
# Raises KeyError when the variable is not set.
DATASETS = pathlib.Path(os.environ['DATASETS'])

In [3]:
# Loads the exported TensorFlow SavedModel from the MODEL directory. Later cells read
# its `movies_embeddings` and `users_embeddings` attributes.
model = tf.saved_model.load(MODEL)



In [4]:
# Loading movies embeddings
def load_movies_embeddings(model, mapping_path='data/ml-100k/splits/random/movies_mapping.json'):
    """Builds a ``{movie id: embedding vector}`` dictionary from a loaded model.

    Args:
        model: Object exposing ``movies_embeddings.numpy()``, which returns the
            embedding table indexed by row.
        mapping_path: JSON file mapping each original movie id (stored as a
            string key) to its row in the embedding table. Defaults to the
            random-split mapping, relative to the current working directory.

    Returns:
        dict: Integer movie id -> the corresponding row of the embedding table.

    Raises:
        OSError: If the mapping file cannot be opened.
    """
    embeddings = model.movies_embeddings.numpy()
    # JSON is UTF-8 by specification; do not depend on the platform default encoding
    with open(mapping_path, encoding='utf-8') as file:
        id_to_row = json.load(file)
    # JSON object keys are always strings, hence the int() conversion
    return {int(key): embeddings[value] for key, value in id_to_row.items()}

In [5]:
# Loading users embeddings
def load_users_embeddings(model, mapping_path='data/ml-100k/splits/random/users_mapping.json'):
    """Builds a ``{user id: embedding vector}`` dictionary from a loaded model.

    Args:
        model: Object exposing ``users_embeddings.numpy()``, which returns the
            embedding table indexed by row.
        mapping_path: JSON file mapping each original user id (stored as a
            string key) to its row in the embedding table. Defaults to the
            random-split mapping, relative to the current working directory.

    Returns:
        dict: Integer user id -> the corresponding row of the embedding table.

    Raises:
        OSError: If the mapping file cannot be opened.
    """
    embeddings = model.users_embeddings.numpy()
    # JSON is UTF-8 by specification; do not depend on the platform default encoding
    with open(mapping_path, encoding='utf-8') as file:
        id_to_row = json.load(file)
    # JSON object keys are always strings, hence the int() conversion
    return {int(key): embeddings[value] for key, value in id_to_row.items()}

In [6]:
# Movie id -> embedding vector; read as a global by the similarity and t-SNE cells below
movie_to_embedding = load_movies_embeddings(model)

In [7]:
# User id -> embedding vector; used below to build the query vector for a user
user_to_embedding = load_users_embeddings(model)

In [8]:
# Gets similarity score of query with all movies based on their embeddings
def get_query_similarity_with_movies(query, measure='cosine', embeddings=None):
    """Scores a query embedding against every movie embedding.

    Args:
        query: Query embedding with D components, either as a flat vector
            ``(D,)`` or as a column ``(D, 1)``; it is flattened internally.
        measure: ``'cosine'`` for cosine similarity or ``'dot'`` for the raw
            dot product.
        embeddings: Optional ``{movie id: vector}`` dictionary. When omitted,
            the notebook-level ``movie_to_embedding`` is used.

    Returns:
        dict: Movie id -> similarity as a Python float, in the iteration
        order of ``embeddings``.

    Raises:
        ValueError: If ``measure`` is not ``'cosine'`` or ``'dot'``.
    """
    # Fail early with a clear message instead of an UnboundLocalError later on
    if measure not in ('cosine', 'dot'):
        raise ValueError(f"Unknown measure {measure!r}; expected 'cosine' or 'dot'")
    if embeddings is None:
        embeddings = movie_to_embedding
    movies_indexes = list(embeddings.keys())
    movies_embeddings = np.vstack(list(embeddings.values()))
    # Flatten so (D,) and (D, 1) queries behave the same; a flat query used to
    # broadcast against the (N, 1) normalizer into an (N, N) matrix
    query = np.asarray(query).reshape(-1)
    movies_similarity = movies_embeddings.dot(query)
    if measure == 'cosine':
        # ||query|| * ||movie|| for every movie, computed as sqrt(q.q * m.m)
        normalizer = np.sqrt(query.dot(query) * (movies_embeddings ** 2).sum(axis=1))
        movies_similarity = movies_similarity / normalizer
    return {key: value.item() for key, value in zip(movies_indexes, movies_similarity)}

In [9]:
# Scores every movie for a given user. The scores are cosine similarities between
# the user embedding and each movie embedding; later cells store them in a column
# named 'rating' and rank movies by them.
user_id = 25
# Adds a trailing axis so the user embedding becomes a column vector
user_embedding = np.expand_dims(user_to_embedding[user_id], axis=-1)
scores = get_query_similarity_with_movies(user_embedding, measure='cosine')

In [10]:
# One row per movie: the similarity score of the selected user, stored under 'rating'
estimation = pd.DataFrame(scores.items(), columns=['movieId', 'rating'])
# Indexed by movieId so it can be merged with the movies metadata on the index
estimation.set_index(keys=['movieId'], inplace=True)
estimation.head()

Unnamed: 0_level_0,rating
movieId,Unnamed: 1_level_1
1,0.524015
2,0.485384
3,0.355721
4,0.560974
5,0.560859


In [11]:
# Names of the one-hot genre flag columns, in the order they are assigned to the
# trailing fields of the movies file
genre_columns = np.array([
    "Unknown", "Action", "Adventure", "Animation", "Children", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"
])
# Full column layout of the movies file: metadata fields followed by the genre flags
movies_columns = [
    'movieId', 'title', 'release_date', 'video_release_date', 'imdb_url'
] + genre_columns.tolist()


def load_movies_data(path=None):
    """Loads the movies metadata and adds a readable ``genre`` column.

    Args:
        path: Location of the pipe-separated, latin-1 encoded movies file.
            When omitted, ``DATASETS / 'recommender/movies/ml-100k/u.item'``
            is used.

    Returns:
        pandas.DataFrame: Indexed by ``movieId`` with the ``title`` column,
        one 0/1 column per entry of ``genre_columns`` and a ``genre`` column
        joining the names of the active flags with ``'|'``. The release
        dates and the IMDb url are dropped.
    """
    if path is None:
        path = DATASETS / 'recommender/movies/ml-100k/u.item'
    movies = pd.read_csv(
        path,
        sep='|', names=movies_columns,
        index_col=['movieId'],
        encoding='latin-1'
    )
    # Boolean mask per movie, used to pick the names of the genres it is tagged with
    genre_flags = movies[genre_columns.tolist()].to_numpy().astype(bool)
    movies['genre'] = ['|'.join(genre_columns[row]) for row in genre_flags]
    # Return a new frame instead of dropping in place
    return movies.drop(columns=['release_date', 'video_release_date', 'imdb_url'])

In [12]:
# Movies metadata indexed by movieId (title, genre flags and the joined 'genre' string)
movies = load_movies_data()
movies.head()

Unnamed: 0_level_0,title,Unknown,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,genre
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),0,0,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,Animation|Children|Comedy
2,GoldenEye (1995),0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,Action|Adventure|Thriller
3,Four Rooms (1995),0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,Thriller
4,Get Shorty (1995),0,1,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,Action|Comedy|Drama
5,Copycat (1995),0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,1,0,0,Crime|Drama|Thriller


In [13]:
# Attaches title and genre to each score by joining on the movieId index; the
# one-hot genre flags are dropped since the joined 'genre' string is kept.
predicted_ratings = estimation.merge(movies, left_index=True, right_index=True).drop(columns=genre_columns)
predicted_ratings.head(5)

Unnamed: 0_level_0,rating,title,genre
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.524015,Toy Story (1995),Animation|Children|Comedy
2,0.485384,GoldenEye (1995),Action|Adventure|Thriller
3,0.355721,Four Rooms (1995),Thriller
4,0.560974,Get Shorty (1995),Action|Comedy|Drama
5,0.560859,Copycat (1995),Crime|Drama|Thriller


In [14]:
# Observed ratings: tab-separated file with userId, movieId, rating and timestamp
# (the timestamp is not used and is dropped).
real_ratings = pd.read_csv(
    DATASETS / 'recommender/movies/ml-100k/u.data', sep='\t', names=['userId', 'movieId', 'rating', 'timestamp']
).drop(columns=['timestamp'])
# Indexed by (userId, movieId) so `real_ratings.loc[user_id]` yields one user's ratings by movieId
real_ratings.set_index(keys=['userId', 'movieId'], inplace=True)
# Movies that have a score but that the selected user has not rated
unrated_movies = set(predicted_ratings.index) - set(real_ratings.loc[user_id].index)

In [15]:
# Number of top recommendations to display
k = 10
# Recent pandas versions reject a set as a `.loc` indexer, so the unrated movie ids
# are passed as a sorted list (which also makes the pre-sort row order deterministic).
# Highest scores first.
recommendations = predicted_ratings.loc[sorted(unrated_movies)].sort_values(by='rating', ascending=False)
recommendations.head(k)

Unnamed: 0_level_0,rating,title,genre
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
847,0.727018,Looking for Richard (1996),Documentary|Drama
1007,0.719956,Waiting for Guffman (1996),Comedy
617,0.703735,"Blue Angel, The (Blaue Engel, Der) (1930)",Drama
1019,0.701465,"Die xue shuang xiong (Killer, The) (1989)",Action|Thriller
1045,0.701316,Fearless (1993),Drama
256,0.697634,When the Cats Away (Chacun cherche son chat) (...,Comedy|Romance
489,0.69555,Notorious (1946),Film-Noir|Romance|Thriller
639,0.694168,"Tin Drum, The (Blechtrommel, Die) (1979)",Drama
645,0.693377,Paris Is Burning (1990),Documentary
622,0.693342,Swiss Family Robinson (1960),Adventure|Children


In [16]:
def plot_watches_by_genre(user_id):
    """Bar chart with the number of movies rated by a user in each genre.

    Args:
        user_id: User whose rows are selected from ``real_ratings``.

    Returns:
        The altair bar chart, with one bar (and one color) per genre. A movie
        tagged with several genres counts once for each of them.
    """
    # Ratings of the user joined with the movies metadata on the movieId index
    user_movies = real_ratings.loc[user_id].merge(movies, left_index=True, right_index=True)
    # Summing the 0/1 genre flags gives the number of rated movies per genre
    genre_totals = user_movies[genre_columns].sum()
    data = genre_totals.to_frame().reset_index()
    data.columns = ['genre', 'count']
    bars = alt.Chart(data).mark_bar()
    return bars.encode(x='genre', y='count', color='genre')

In [17]:
# Genre distribution of the movies rated by the selected user
plot_watches_by_genre(user_id)

In [18]:
def plot_watches_by_rating_and_genre(user_id):
    """Stacked bar chart of a user's rated movies per genre, split by rating value.

    Args:
        user_id: User whose rows are selected from ``real_ratings``.

    Returns:
        The altair bar chart with one bar per genre, stacked and colored by
        rating value (stack order follows ascending rating).
    """
    merged = real_ratings.loc[user_id].merge(movies, left_index=True, right_index=True)
    # Select the genre flags BEFORE aggregating: summing the whole frame would also
    # aggregate the string 'title' and 'genre' columns only to discard them afterwards
    counts = merged.groupby(by='rating')[genre_columns.tolist()].sum()
    ratings = list(counts.index)
    # One row per genre, one column per rating value
    temp = counts.transpose().reset_index()
    temp.columns = ['genre'] + ratings
    # Long format: one (genre, rating, count) row per combination
    temp = temp.melt(id_vars=['genre'], value_vars=ratings)
    temp.columns = ['genre', 'rating', 'count']
    return alt.Chart(temp).mark_bar().encode(
        x='genre',
        y='count',
        order=alt.Order('rating', sort='ascending'),
        color='rating:N'
    )

In [19]:
# Same genre distribution, split by the rating the selected user gave
plot_watches_by_rating_and_genre(user_id)

In [20]:
def show_closest_movies_to_user(user_id, k=30, random_state=None):
    """Plots a 2-D t-SNE projection of the user and their best/worst ranked movies.

    All movie embeddings plus the user's embedding are projected together;
    only the first ``k`` and last ``k`` rows of the notebook-level
    ``recommendations`` frame are drawn.

    NOTE: ``recommendations`` is a global computed for the notebook-level
    ``user_id``; pass that same user here or the plotted movies will not
    correspond to the plotted user.

    Args:
        user_id: User whose embedding (from ``user_to_embedding``) is projected.
        k: Number of movies taken from each end of ``recommendations``.
        random_state: Optional seed forwarded to t-SNE and used for the random
            genre pick. ``None`` keeps the previous unseeded behavior.

    Returns:
        An altair chart: the movie points layered with the user point, stacked
        above a clickable Recommend / Not recommend legend.
    """
    # Prepares data
    movies_data = movies.copy()
    # Embedding of the requested user as a single row, appended after all the movies
    query_embedding = np.asarray(user_to_embedding[user_id]).reshape(1, -1)
    # Trains TSNE
    tsne = TSNE(
        n_components=2, perplexity=40, metric='cosine', early_exaggeration=10.0,
        init='pca', verbose=True, n_iter=400, random_state=random_state
    )
    projection = tsne.fit_transform(
        np.vstack([movie_to_embedding[entry] for entry in movies_data.index] + [query_embedding])
    )
    # Every row but the last one is a movie; the last row is the user
    movies_data['x'] = projection[:-1, 0]
    movies_data['y'] = projection[:-1, 1]
    # y must come from the second projected component (it used to repeat the x value)
    user_data = pd.DataFrame(
        [[user_id, projection[-1, 0], projection[-1, 1]]], columns=['userId', 'x', 'y']
    )
    # Builds dataframe with the top k movies and the bottom k movies
    # (assign returns new frames, so no column is written into a slice)
    positive = movies_data.loc[recommendations.head(k).index].assign(action='Recommend')
    negative = movies_data.loc[recommendations.tail(k).index].assign(action='Not recommend')
    movies_data = pd.concat([positive, negative], axis=0)
    # Assigns a random genre to each movie (from all genres the movie was tagged)
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    movies_data['random'] = [rng.choice(row.split('|')) for row in movies_data['genre']]
    # Removes unused data
    movies_data = movies_data.drop(columns=['genre'] + genre_columns.tolist())
    # Selections & conditions
    genre_filter = alt.selection(type='multi', fields=['random'], bind='legend')
    genre_coloring = alt.condition(genre_filter, "random:N", alt.value('whitesmoke'))
    action_filter = alt.selection(type='multi', fields=['action'])
    action_opacity = alt.condition(action_filter, alt.value(1), alt.value(0.2))
    # Base movies embedding chart
    base = alt.Chart(movies_data).mark_circle().encode(
        x='x',
        y='y',
        tooltip='title',
        color=genre_coloring,
        opacity=action_opacity
    ).add_selection(
        genre_filter
    )
    # Clickable legend that drives the Recommend / Not recommend opacity filter
    legend = alt.Chart(movies_data).mark_circle().encode(
        y=alt.Y('action:N', axis=alt.Axis(orient='right')),
        color=alt.condition(action_filter, alt.value('black'), alt.value('whitesmoke'))
    ).add_selection(
        action_filter
    )
    # The user, drawn as a single black point
    query = alt.Chart(user_data).mark_point().encode(
        x='x:Q',
        y='y:Q',
        color=alt.value('black')
    )
    return alt.layer(base, query) & legend

In [21]:
# Projects the selected user together with their top / bottom ranked unrated movies
show_closest_movies_to_user(user_id)

[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 1683 samples in 0.000s...
[t-SNE] Computed neighbors for 1683 samples in 0.103s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1683
[t-SNE] Computed conditional probabilities for sample 1683 / 1683
[t-SNE] Mean sigma: 0.163590
[t-SNE] KL divergence after 250 iterations with early exaggeration: 65.204964
[t-SNE] KL divergence after 400 iterations: 2.289736


In [22]:
# Nearest neighbors of a movie in the embedding space (cosine similarity)
target = 'Aladdin'
# regex=False: the target is a literal piece of a title, so characters such as
# '(' or '.' must not be interpreted as a regular expression
indexes = movies[movies['title'].str.contains(target, regex=False)].index.values
titles = movies.loc[indexes, 'title'].values
if len(titles) == 0:
    raise ValueError(f"Found no movies with title {target}")
# When several titles match, the first one is used as the query
print(f"Nearest neighbors of: {titles[0]}")
embedding = np.expand_dims(movie_to_embedding[indexes[0]], axis=-1)
# NOTE: this rebinds the notebook-level `scores` that previously held the user scores
scores = get_query_similarity_with_movies(embedding)
# drop() already returns a new frame, so no explicit copy is needed
closest = movies.drop(columns=genre_columns)
closest['score'] = [scores[index] for index in closest.index]
closest.sort_values(by='score', ascending=False).head(10)

Nearest neighbors of: Aladdin (1992)


Unnamed: 0_level_0,title,genre,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
95,Aladdin (1992),Animation|Children|Comedy|Musical,1.0
498,"African Queen, The (1951)",Action|Adventure|Romance|War,0.670543
999,Clean Slate (1994),Comedy,0.639115
925,Unforgettable (1996),Sci-Fi|Thriller,0.638308
604,It Happened One Night (1934),Comedy,0.635312
617,"Blue Angel, The (Blaue Engel, Der) (1930)",Drama,0.630295
76,Carlito's Way (1993),Crime|Drama,0.625785
1007,Waiting for Guffman (1996),Comedy,0.623388
874,Career Girls (1997),Drama,0.616228
36,Mad Love (1995),Drama|Romance,0.614355
