###  Created by Luis A. Sanchez-Perez (l.alejandro.2011@gmail.com).
<p><span style="color:green"><b>Copyright &#169;</b> Do not distribute or use without authorization from author.</span></p>

In [1]:
import os
import pathlib

import numpy as np
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

### Loads and prepares datasets
Loads some of the default and pre-processed datasets (in CSV format) and prepares them for later use.

In [2]:
# Root folder that holds the datasets. Set the DATASETS_DIR environment variable
# to point the notebook at another location; when it is not set, the original
# local path is used, so existing setups keep working unchanged.
DATASETS = pathlib.Path(os.environ.get('DATASETS_DIR', '/media/alejand/DatasetsT7/datasets'))

In [3]:
# Loads user ratings.
# Only the columns that are actually used get parsed: `timestamp` is skipped at
# read time instead of being loaded and dropped right after, and the
# (userId, movieId) index is built directly by the reader.
ratings = pd.read_csv(
    DATASETS / 'recommender/movies/ml-20m/ratings.csv',
    usecols=['userId', 'movieId', 'rating'],
    index_col=['userId', 'movieId'],
)
ratings.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,rating
userId,movieId,Unnamed: 2_level_1
1,2,3.5
1,29,3.5
1,32,3.5
1,47,3.5
1,50,3.5


In [4]:
# Reads the users embeddings table and keys its rows by userId
users = pd.read_csv('users_embeddings.csv').set_index('userId')
users.head(n=5)

Unnamed: 0_level_0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0.143005,0.018878,0.035428,0.079131,0.135247,0.022498,0.083786,0.12723,0.041376,0.081717,0.087148,0.033618,0.076804,0.004396,0.0,0.017067,0.005689,0.006982,0.0,0.0
2,0.130573,0.004777,0.004777,0.062102,0.003185,0.036624,0.117834,0.140127,0.007962,0.128981,0.101911,0.028662,0.16879,0.004777,0.0,0.02707,0.009554,0.014331,0.007962,0.0
3,0.103027,0.007324,0.020996,0.103027,0.041992,0.031738,0.119629,0.122559,0.043945,0.104004,0.061523,0.023438,0.181641,0.0,0.001953,0.013672,0.011719,0.006348,0.001465,0.0
4,0.079585,0.027682,0.051903,0.134948,0.038062,0.048443,0.103806,0.15917,0.065744,0.155709,0.0,0.027682,0.051903,0.0,0.0,0.013841,0.027682,0.013841,0.0,0.0
5,0.121951,0.035944,0.05905,0.125802,0.052632,0.080873,0.145058,0.103979,0.037227,0.08344,0.003851,0.008986,0.05905,0.019255,0.0,0.005135,0.044929,0.012837,0.0,0.0


In [5]:
# Reads the encoded movies table and keys its rows by movieId
movies = pd.read_csv('encoded_movies.csv').set_index('movieId')

# Every column except the first and the last one (positional slice)
index_to_genres = movies.columns[1:-1]

movies.head(n=5)

Unnamed: 0_level_0,title,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,...,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed),year
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story,1,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1995
2,Jumanji,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1995
3,Grumpier Old Men,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1995
4,Waiting to Exhale,0,0,0,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,1995
5,Father of the Bride Part II,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1995


In [6]:
# Number of ratings received by each movie, most-rated first
ratings_count = ratings.groupby(by='movieId').count().sort_values(by='rating', ascending=False)

buffer = 100  # size of the candidate pool: the `buffer` most-rated movies
keep = 10     # number of movies drawn from that pool
seed = 42     # fixed seed so that a fresh Restart & Run All selects the same movies

# Shuffles the pool with a seeded generator and keeps the first `keep` ids,
# i.e. draws `keep` distinct movies among the `buffer` most-rated ones.
rng = np.random.default_rng(seed)
selected = rng.permutation(ratings_count.index[:buffer].to_numpy())[:keep]

# One .loc call (rows, column) instead of chained indexing, so only the
# `title` column is extracted for the selected rows.
movies.loc[selected, 'title']

movieId
1221                              Godfather: Part II, The
1210           Star Wars: Episode VI - Return of the Jedi
1721                                              Titanic
1961                                             Rain Man
316                                              Stargate
110                                            Braveheart
3996     Crouching Tiger, Hidden Dragon (Wo hu cang long)
293     Léon: The Professional (a.k.a. The Professiona...
1193                      One Flew Over the Cuckoo's Nest
367                                             Mask, The
Name: title, dtype: object

In [7]:
# Rating counts of the sampled movies. `selected` already holds their movieIds,
# so it indexes `ratings_count` directly; going through `movies` first only
# reproduced the same ids.
ratings_count.loc[selected]

Unnamed: 0_level_0,rating
movieId,Unnamed: 1_level_1
1221,27398
1210,46839
1721,32238
1961,24591
316,31799
110,53769
3996,25090
293,25804
1193,29932
367,34384
