###  Created by Luis A. Sanchez-Perez (l.alejandro.2011@gmail.com).
<p><span style="color:green"><b>Copyright &#169;</b> Do not distribute or use without authorization from author.</span></p>

In [1]:
import os
import pathlib

import numpy as np
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

### Loads and prepares datasets
Loads some of the default and pre-processed datasets (in CSV format) and prepares them for later use.

In [2]:
# Root folder holding the datasets. Set the DATASETS_DIR environment variable to
# point somewhere else; the original local path is kept as the default.
DATASETS = pathlib.Path(os.environ.get('DATASETS_DIR', '/media/alejand/DatasetsT7/datasets/'))

In [3]:
# Loads user ratings, indexed by (userId, movieId).
# The unused `timestamp` column is skipped while parsing instead of being
# read in full and dropped afterwards.
ratings = pd.read_csv(
    DATASETS / 'recommender/movies/ml-20m/ratings.csv',
    usecols=lambda column: column != 'timestamp',
    index_col=['userId', 'movieId'],
)
ratings.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,rating
userId,movieId,Unnamed: 2_level_1
1,2,3.5
1,29,3.5
1,32,3.5
1,47,3.5
1,50,3.5


In [4]:
# Reads the per-user genre embeddings (one row per user, one column per genre)
users = pd.read_csv('users_embeddings.csv').set_index('userId')
users.head()

Unnamed: 0_level_0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0.143005,0.018878,0.035428,0.079131,0.135247,0.022498,0.083786,0.12723,0.041376,0.081717,0.087148,0.033618,0.076804,0.004396,0.0,0.017067,0.005689,0.006982,0.0,0.0
2,0.130573,0.004777,0.004777,0.062102,0.003185,0.036624,0.117834,0.140127,0.007962,0.128981,0.101911,0.028662,0.16879,0.004777,0.0,0.02707,0.009554,0.014331,0.007962,0.0
3,0.103027,0.007324,0.020996,0.103027,0.041992,0.031738,0.119629,0.122559,0.043945,0.104004,0.061523,0.023438,0.181641,0.0,0.001953,0.013672,0.011719,0.006348,0.001465,0.0
4,0.079585,0.027682,0.051903,0.134948,0.038062,0.048443,0.103806,0.15917,0.065744,0.155709,0.0,0.027682,0.051903,0.0,0.0,0.013841,0.027682,0.013841,0.0,0.0
5,0.121951,0.035944,0.05905,0.125802,0.052632,0.080873,0.145058,0.103979,0.037227,0.08344,0.003851,0.008986,0.05905,0.019255,0.0,0.005135,0.044929,0.012837,0.0,0.0


In [5]:
# Reads the encoded movie catalogue, keyed by movieId
movies = pd.read_csv('encoded_movies.csv').set_index('movieId')
# Every column except the first and the last one is treated as a genre
index_to_genres = movies.columns[1:-1]
movies.head()

Unnamed: 0_level_0,title,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,...,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed),year
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story,1,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1995
2,Jumanji,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1995
3,Grumpier Old Men,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1995
4,Waiting to Exhale,0,0,0,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,1995
5,Father of the Bride Part II,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1995


### Perform recommendations using user vectors (embedding)

In [6]:
def find_recommendations_for_user(userId, user_embedding, recommendations=20):
    """Recommend unrated movies whose genre vector best matches a user embedding.

    Every movie the user has not rated is scored with the cosine similarity
    between its genre columns and ``user_embedding``; the best scoring movies
    are returned. Reads the notebook-level ``movies`` and ``ratings`` frames.

    Note: a later cell of this notebook defines another function with this
    same name, which replaces this one once that cell has run.

    Parameters
    ----------
    userId
        Key into the first level of the ``ratings`` index.
    user_embedding : array-like
        One weight per genre column of ``movies`` (all columns except the
        first and the last). Any shape that flattens to that length is
        accepted, e.g. ``(n_genres,)`` or ``(1, n_genres)``.
    recommendations : int, default 20
        Maximum number of movies to return.

    Returns
    -------
    pandas.DataFrame
        Rows of ``movies`` ordered from most to least similar.

    Raises
    ------
    KeyError
        If ``userId`` is not present in ``ratings``.
    """
    # Candidate movies: everything in the catalogue the user has not rated yet
    index_to_movie = list(set(movies.index) - set(ratings.loc[userId].index))
    # Genre indicator matrix of the candidates (first and last columns excluded)
    movies_embeddings = movies.loc[index_to_movie].iloc[:, 1:-1].to_numpy(dtype=float)
    user_vector = np.asarray(user_embedding, dtype=float).ravel()
    # Cosine similarity. A zero-norm movie row or user vector has no defined
    # similarity; it is scored -inf so it ranks last instead of producing NaN
    # (NaN keys make the ordering unpredictable).
    dot_products = movies_embeddings.dot(user_vector)
    denominators = np.linalg.norm(user_vector) * np.linalg.norm(movies_embeddings, axis=1)
    cosine = np.full(dot_products.shape, -np.inf)
    np.divide(dot_products, denominators, out=cosine, where=denominators > 0)
    # Stable descending order: ties keep the candidate list order
    order = np.argsort(-cosine, kind='stable')[:recommendations]
    indices = [index_to_movie[position] for position in order]
    # Return recommendations
    return movies.loc[indices]

In [7]:
# Looks up the stored genre embedding of the chosen user
userId = 1
user_data = users.loc[userId]
# Row vector of shape (1, number of genres)
user_embedding = user_data.to_numpy()[np.newaxis, :]

In [8]:
# Shows this user's genre weights from the embedding, strongest first
ranked_preferences = user_data.sort_values(ascending=False)
print(ranked_preferences)

Adventure             0.143005
Fantasy               0.135247
Action                0.127230
Horror                0.087148
Drama                 0.083786
Thriller              0.081717
Comedy                0.079131
Sci-Fi                0.076804
Crime                 0.041376
Children              0.035428
Mystery               0.033618
Romance               0.022498
Animation             0.018878
War                   0.017067
Western               0.006982
Musical               0.005689
IMAX                  0.004396
Documentary           0.000000
Film-Noir             0.000000
(no genres listed)    0.000000
Name: 1, dtype: float64


In [9]:
# Top 10 unrated movies for this user, ranked by cosine similarity between each
# movie's genre columns and the stored user embedding (assumes the cells above ran in order)
find_recommendations_for_user(userId, user_embedding, recommendations=10)

Unnamed: 0_level_0,title,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,...,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed),year
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2617,"Mummy, The",1,0,0,1,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1999
72165,Cirque du Freak: The Vampire's Assistant,1,0,0,1,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,2009
117646,Dragonheart 2: A New Beginning,1,0,0,1,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,2000
49593,She,1,0,0,0,1,1,1,1,0,...,0,1,0,0,0,0,0,0,0,1965
2429,Mighty Joe Young,1,0,0,0,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,1998
41569,King Kong,1,0,0,0,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,2005
71999,Aelita: The Queen of Mars (Aelita),1,0,0,0,1,1,1,1,0,...,0,1,0,0,0,0,0,0,0,1924
4781,Megiddo: The Omega Code 2,1,0,0,0,1,0,0,1,0,...,0,1,0,0,0,0,0,0,0,2001
120833,Super Capers,1,0,0,1,1,0,0,1,0,...,0,1,0,0,0,0,0,0,0,2009
2366,King Kong,1,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1933


### Perform recommendations using ridge regression to find user embeddings
This is done by trying to predict user rating for a given movie.

In [10]:
# Builds the regression data for the chosen user: one row per rated movie
userId = 1
user_ratings = ratings.loc[userId]
# Genre columns of the rated movies (first and last movie columns excluded)
rated_genres = movies.loc[user_ratings.index].iloc[:, 1:-1].values
# Leading column of ones acts as the intercept term
bias_column = np.ones((len(user_ratings), 1))
X = np.hstack((bias_column, rated_genres))
# Ratings as a column vector
y = user_ratings['rating'].to_numpy()[:, np.newaxis]

In [11]:
# Finds user embedding using ridge regression
regularization = 0.1
# Number of regression weights: one per genre plus the intercept
num_genres = len(index_to_genres) + 1
# Fixed seed so the train/test split, and therefore the scores printed below,
# are the same on every run
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=SPLIT_SEED)
# Ridge as an augmented least-squares problem: appending sqrt(lambda) * I rows
# to X and matching zeros to y adds the penalty lambda * ||w||^2 to the
# squared error being minimized
diagonal = np.sqrt(regularization) * np.eye(num_genres)
# The intercept (first weight) is left unregularized
diagonal[0,0] = 0
XX = np.concatenate((X_train, diagonal), axis=0)
yy = np.concatenate((y_train, np.zeros((num_genres, 1))), axis=0)
# Least-squares solution through the pseudo-inverse
user_embedding = np.linalg.pinv(XX).dot(yy)
# Checking performance after training
y_pred = X_train.dot(user_embedding)
print('Training Perf:', r2_score(y_train, y_pred))
y_pred = X_test.dot(user_embedding)
print('Test Perf:', r2_score(y_test, y_pred))

Training Perf: 0.24351926182820416
Test Perf: -0.25481815292022336


In [12]:
# Ranks the genres by their learned regression weight (intercept skipped), highest first
genre_weights = zip(movies.columns[1:-1], user_embedding[1:].flatten())
values = sorted(genre_weights, key=lambda entry: entry[1], reverse=True)
values

[('IMAX', 0.4455275648436996),
 ('Romance', 0.30233898553813005),
 ('Thriller', 0.17116863895039774),
 ('Adventure', 0.1459695803669943),
 ('Horror', 0.0984920190489432),
 ('War', 0.08249716699862752),
 ('Crime', 0.07976625342228844),
 ('Animation', 0.035825813914280524),
 ('Documentary', 1.750057855898471e-17),
 ('Film-Noir', 0.0),
 ('(no genres listed)', 0.0),
 ('Musical', -0.015742832497620485),
 ('Comedy', -0.03693043270594715),
 ('Sci-Fi', -0.03760622149906234),
 ('Drama', -0.1032206364620229),
 ('Fantasy', -0.1045006063697006),
 ('Action', -0.11376458363154718),
 ('Children', -0.16558012771757336),
 ('Mystery', -0.32830829600783057),
 ('Western', -0.5519119225792692)]

In [13]:
# Lists the movies this user rated that belong to the highest-weighted genre
rated_movie_features = movies.loc[user_ratings.index]
rated_movies_by_user = user_ratings.merge(rated_movie_features, left_index=True, right_index=True)
# `values` is sorted best-first, so its first entry names the top genre
top_genre = values[0][0]
rated_movies_by_user[rated_movies_by_user[top_genre] == 1]

Unnamed: 0_level_0,rating,title,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,...,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed),year
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8368,4.0,Harry Potter and the Prisoner of Azkaban,1,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,2004
8636,4.5,Spider-Man 2,1,0,0,0,0,0,0,1,...,0,1,1,0,0,0,0,0,0,2004


In [14]:
def find_recommendations_for_user(userId, user_embedding, recommendations=20):
    """Recommend the unrated movies with the highest predicted rating.

    Each movie the user has not rated is described by a leading 1 (intercept)
    followed by its genre columns; the predicted rating is the dot product of
    that row with ``user_embedding``. Reads the notebook-level ``movies`` and
    ``ratings`` frames.

    Note: this definition reuses the name of the cosine-similarity version
    defined in an earlier cell and replaces it once this cell has run.

    Parameters
    ----------
    userId
        Key into the first level of the ``ratings`` index.
    user_embedding : array-like
        Regression weights, intercept first and then one weight per genre
        column of ``movies`` (all columns except the first and the last).
    recommendations : int, default 20
        Maximum number of movies to return.

    Returns
    -------
    pandas.DataFrame
        Rows of ``movies`` ordered from highest to lowest predicted rating.
    """
    # Movies in the catalogue that the user has not rated yet
    index_to_movie = list(set(movies.index) - set(ratings.loc[userId].index))
    # Design matrix of the candidates: intercept column followed by the genre columns
    genre_block = movies.loc[index_to_movie].iloc[:, 1:-1].values
    intercept = np.ones((len(index_to_movie), 1))
    design = np.hstack((intercept, genre_block))
    # One predicted rating per candidate
    scores = np.dot(design, user_embedding).reshape(-1)
    # Candidate positions ordered by predicted rating, best first
    ranking = sorted(range(len(index_to_movie)), key=lambda position: scores[position], reverse=True)
    best = [index_to_movie[position] for position in ranking[:recommendations]]
    return movies.loc[best]

In [15]:
# Top 10 unrated movies for this user, ranked by predicted rating. Assuming the cells
# ran in order, `user_embedding` is the ridge-regression solution and the function is
# the predicted-rating version defined in the previous cell.
find_recommendations_for_user(userId, user_embedding, recommendations=10)

Unnamed: 0_level_0,title,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,...,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed),year
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
33,Wings of Courage,1,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,1995
108190,Divergent,1,0,0,0,0,1,0,0,0,...,0,1,1,0,0,0,0,0,0,2014
78772,"Twilight Saga: Eclipse, The",0,0,0,0,1,1,0,0,0,...,0,0,1,0,0,0,0,0,0,2010
100882,Journey to the West: Conquering the Demons (Da...,1,0,0,1,1,1,0,0,0,...,0,0,1,0,0,0,0,0,0,2013
88932,Final Destination 5,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,2011
98203,"Twilight Saga: Breaking Dawn - Part 2, The",1,0,0,0,1,1,1,0,0,...,0,0,1,0,0,0,0,0,0,2012
78105,Prince of Persia: The Sands of Time,1,0,0,0,1,1,0,1,0,...,0,0,1,0,0,0,0,0,0,2010
105504,Captain Phillips,1,0,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,2013
40815,Harry Potter and the Goblet of Fire,1,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,2005
66289,Singapore Sling (Singapore sling: O anthropos ...,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,1,0,1990
