# Importación de las librerías

In [1]:
! pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 6.7 MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1630132 sha256=2015c3af12689f3794f30fafee0c6605215bc70dbaf454a690674fed81b80e47
  Stored in directory: /root/.cache/pip/wheels/76/44/74/b498c42be47b2406bd27994e16c5188e337c657025ab400c1c
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.1


In [2]:
from pandas import read_csv
from pandas import DataFrame

from plotly.express import scatter
from plotly.express import scatter_matrix

from sklearn.manifold import TSNE

from surprise.reader import Reader
from surprise.dataset import Dataset
from surprise.model_selection.search import RandomizedSearchCV
from surprise.prediction_algorithms.matrix_factorization import NMF
from surprise.prediction_algorithms.matrix_factorization import SVD

# Importación del conjunto de datos

In [3]:
movies = read_csv(filepath_or_buffer='https://raw.githubusercontent.com/andres-alcala-gtz/movielens/main/movies.csv')

movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [4]:
ratings = read_csv(filepath_or_buffer='https://raw.githubusercontent.com/andres-alcala-gtz/movielens/main/ratings.csv')
ratings = ratings[['userId', 'movieId', 'rating']]

ratings

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [5]:
reader = Reader(rating_scale=(ratings['rating'].min(), ratings['rating'].max()))

dataset = Dataset.load_from_df(df=ratings, reader=reader)

# Selección del modelo

## Búsqueda

In [6]:
algorithms = ['NMF', 'SVD']
parameters = ['best_estimator', 'best_score', 'best_params', 'best_index']

measure = 'rmse'

splits = 5

In [7]:
hyperparameters = {
    'n_factors': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
    'n_epochs' : [5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
    'biased'   : [False, True],
    'lr_bu'    : [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'lr_bi'    : [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'reg_bu'   : [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'reg_bi'   : [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
}

In [8]:
models = DataFrame(index=algorithms, columns=parameters)

for algorithm in algorithms:
    searcher = RandomizedSearchCV(algo_class=eval(algorithm), param_distributions=hyperparameters, measures=[measure], cv=splits)
    searcher.fit(data=dataset)

    for parameter in parameters:
        models.loc[algorithm][parameter] = eval(f'searcher.{parameter}')[measure]

models

Unnamed: 0,best_estimator,best_score,best_params,best_index
NMF,<surprise.prediction_algorithms.matrix_factori...,0.963498,"{'n_factors': 15, 'n_epochs': 15, 'biased': Fa...",7
SVD,<surprise.prediction_algorithms.matrix_factori...,0.929547,"{'n_factors': 20, 'n_epochs': 35, 'biased': Fa...",6


In [9]:
model = models[models['best_score'] == models['best_score'].min()]['best_estimator'].item()
model.fit(trainset=dataset.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f3156a0cdd0>

## Resultado

In [10]:
validation = DataFrame(data=searcher.cv_results)

In [11]:
validation

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,split3_test_rmse,split4_test_rmse,mean_test_rmse,std_test_rmse,rank_test_rmse,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_n_factors,param_n_epochs,param_biased,param_lr_bu,param_lr_bi,param_reg_bu,param_reg_bi
0,0.923704,0.936575,0.937781,0.92956,0.925418,0.930608,0.005705,2,2.531059,0.095566,0.231529,0.110011,"{'n_factors': 15, 'n_epochs': 25, 'biased': Fa...",15,25,False,0.0,0.0,1.0,0.9
1,1.352397,1.358366,1.371074,1.363514,1.347586,1.358588,0.008241,10,0.833453,0.026388,0.201266,0.088192,"{'n_factors': 50, 'n_epochs': 5, 'biased': Fal...",50,5,False,1.0,0.0,1.0,1.0
2,0.945489,0.954758,0.958847,0.953767,0.944964,0.951565,0.005451,6,4.106514,0.131318,0.234371,0.087488,"{'n_factors': 50, 'n_epochs': 25, 'biased': Fa...",50,25,False,0.6,0.8,0.1,0.5
3,0.946499,0.954053,0.95133,0.945187,0.944009,0.948216,0.003836,5,5.169261,0.237395,0.204331,0.111892,"{'n_factors': 50, 'n_epochs': 30, 'biased': Fa...",50,30,False,1.0,0.6,0.4,0.4
4,0.932046,0.942472,0.946327,0.932702,0.930903,0.93689,0.006278,3,2.950713,0.047586,0.219459,0.111671,"{'n_factors': 25, 'n_epochs': 25, 'biased': Fa...",25,25,False,0.4,0.8,0.9,1.0
5,0.982413,0.990813,1.009295,0.992276,0.978466,0.990652,0.010645,7,1.548864,0.06888,0.189493,0.087763,"{'n_factors': 15, 'n_epochs': 15, 'biased': Tr...",15,15,True,0.4,0.0,0.5,0.6
6,0.926486,0.935377,0.933392,0.933214,0.919265,0.929547,0.005955,1,3.702473,0.080456,0.213679,0.098242,"{'n_factors': 20, 'n_epochs': 35, 'biased': Fa...",20,35,False,0.6,0.2,0.6,0.7
7,1.048971,1.060103,1.073025,1.066043,1.053984,1.060425,0.008522,8,1.185428,0.048654,0.171984,0.097527,"{'n_factors': 25, 'n_epochs': 10, 'biased': Fa...",25,10,False,0.6,0.0,0.3,0.5
8,0.936803,0.944251,0.95393,0.947828,0.934025,0.943367,0.007247,4,4.338043,0.097987,0.241616,0.081073,"{'n_factors': 40, 'n_epochs': 30, 'biased': Fa...",40,30,False,0.8,0.3,0.9,0.0
9,1.149113,1.147743,1.161971,1.177075,1.151317,1.157444,0.011018,9,2.643151,0.115512,0.180628,0.074069,"{'n_factors': 5, 'n_epochs': 35, 'biased': Tru...",5,35,True,0.0,0.8,1.0,0.1


In [12]:
scatter_matrix(data_frame=validation, dimensions=validation.columns[-len(hyperparameters):], color=f'rank_test_{measure}')

# Recomendación del modelo

## Cruda

In [13]:
model.predict(uid=1, iid=129)

Prediction(uid=1, iid=129, r_ui=None, est=1.8584712881123475, details={'was_impossible': False})

## Procesada

In [14]:
predictions = [model.predict(uid=1, iid=i) for i in movies['movieId']]
predictions = DataFrame(data=predictions, columns=['userId', 'movieId', 'true', 'rating', 'details'])
predictions = predictions[['userId', 'movieId', 'rating']]
predictions = predictions.merge(right=movies, on='movieId')
# predictions = predictions.sort_values(by='rating')

predictions

Unnamed: 0,userId,movieId,rating,title,genres
0,1,1,4.431037,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,2,3.922974,Jumanji (1995),Adventure|Children|Fantasy
2,1,3,3.765999,Grumpier Old Men (1995),Comedy|Romance
3,1,4,2.595924,Waiting to Exhale (1995),Comedy|Drama|Romance
4,1,5,3.010396,Father of the Bride Part II (1995),Comedy
...,...,...,...,...,...
9737,1,193581,3.107760,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,1,193583,2.760496,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,1,193585,2.653317,Flint (2017),Drama
9740,1,193587,2.670106,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [15]:
def recommend_movie(user:int, year:str='', genre:str='', approach:str='random', rating:float=4.5, n:int=10):

    predictions = [model.predict(uid=user, iid=i) for i in movies['movieId']]
    predictions = DataFrame(data=predictions, columns=['userId', 'movieId', 'true', 'rating', 'details'])
    predictions = predictions[['userId', 'movieId', 'rating']]
    predictions = predictions.merge(right=movies, on='movieId')
    predictions = predictions.sort_values(by='rating')

    if year != '':
        predictions = predictions[predictions['title'].str.contains(pat=year, case=False)]

    if genre != '':
        predictions = predictions[predictions['genres'].str.contains(pat=genre, case=False)]

    if approach == 'random':
        predictions = predictions[predictions['rating'] >= rating].sample()

    if approach == 'top':
        predictions = predictions.tail(n=n)

    return predictions

In [16]:
recommend_movie(user=1, year='2010', genre='animation', approach='random', rating=4.5)

Unnamed: 0,userId,movieId,rating,title,genres
7467,1,81847,4.931959,Tangled (2010),Animation|Children|Comedy|Fantasy|Musical|Roma...


In [17]:
recommend_movie(user=1, year='2010', genre='animation', approach='top', n=10)

Unnamed: 0,userId,movieId,rating,title,genres
7380,1,79274,4.322465,Batman: Under the Red Hood (2010),Action|Animation
8464,1,112512,4.396374,Colourful (Karafuru) (2010),Animation|Drama|Fantasy|Mystery
7499,1,83132,4.418979,"Secret World of Arrietty, The (Kari-gurashi no...",Animation|Children|Fantasy
7360,1,78637,4.502113,Shrek Forever After (a.k.a. Shrek: The Final C...,Adventure|Animation|Children|Comedy|Fantasy|IMAX
7455,1,81564,4.62356,Megamind (2010),Action|Animation|Children|Comedy|Sci-Fi|IMAX
7510,1,83803,4.678167,Day & Night (2010),Animation|Children
7467,1,81847,4.931959,Tangled (2010),Animation|Children|Comedy|Fantasy|Musical|Roma...
7371,1,79091,4.935888,Despicable Me (2010),Animation|Children|Comedy|Crime
7302,1,76093,4.98475,How to Train Your Dragon (2010),Adventure|Animation|Children|Fantasy|IMAX
7355,1,78499,5.0,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX


# Visualización del conjunto de datos

## Q

In [18]:
model.qi.shape

(9724, 20)

In [19]:
DataFrame(data=model.qi)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,-0.145044,0.032720,-0.073208,0.036973,-0.067170,0.213885,0.313787,0.197555,0.267784,-0.288871,-0.115848,1.169772,0.018179,0.404046,-0.173397,-0.149277,-0.314819,0.088200,0.354337,-0.962023
1,0.223208,0.149048,-0.105684,-0.215535,-0.249282,0.114324,0.119370,-0.052666,0.624134,-0.253793,-0.018357,0.450307,0.508889,0.023414,-0.108895,0.139254,-0.303782,-0.216920,0.391389,-0.751940
2,0.092816,-0.405265,-0.209048,0.262233,-0.090510,0.501955,-0.208561,-0.032735,0.348726,-0.487484,-0.264698,0.681507,0.327223,0.105783,-0.342009,-0.276337,-0.522646,-0.233151,0.110733,-0.715239
3,0.415038,-0.155932,-0.141131,-0.134655,0.027295,0.510303,-0.167909,-0.207667,0.114269,-0.391989,-0.575992,0.539630,-0.218375,0.402999,-0.233587,-0.496919,-1.118918,-0.048926,0.163720,-0.775700
4,0.484031,-0.366612,-0.235573,0.033672,-0.180801,0.525624,-0.006380,-0.120521,0.038730,0.238122,-0.197231,1.088466,0.245829,0.616597,-0.229083,-0.596060,-0.332884,-0.008820,0.137049,-0.900998
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9719,-0.033741,-0.063854,0.175264,0.229992,-0.138555,0.203189,-0.187211,-0.187820,0.108268,-0.141153,-0.273162,0.348696,0.073147,-0.014671,-0.143268,-0.245527,-0.054972,-0.052259,-0.058156,-0.228761
9720,0.144407,-0.198805,0.012521,0.241206,-0.080070,0.322075,-0.379551,-0.191441,0.312837,-0.206124,-0.469903,0.252209,0.324971,0.040790,0.185978,-0.097496,-0.203215,0.008317,-0.132955,-0.588338
9721,0.019444,-0.151085,-0.020210,-0.049174,-0.058768,0.274352,-0.362345,0.055969,0.362217,-0.211134,-0.353842,0.188946,0.136058,-0.041994,0.154876,-0.168838,-0.192085,0.041147,0.101846,-0.225922
9722,-0.024687,0.095241,-0.044638,-0.069843,0.107003,0.210241,-0.388804,-0.020839,0.366950,-0.274957,-0.280983,0.338432,0.034933,0.005940,-0.065440,-0.219754,-0.231122,-0.122340,-0.179019,-0.368499


## t-SNE

In [20]:
tsne = TSNE(n_components=2, learning_rate='auto', init='pca')
embedding = tsne.fit_transform(X=model.qi)

projection = DataFrame(data=embedding, columns=['x', 'y'])
projection['title'] = movies['title']


The PCA initialization in TSNE will change to have the standard deviation of PC1 equal to 1e-4 in 1.2. This will ensure better convergence.



In [21]:
projection

Unnamed: 0,x,y,title
0,-64.967621,-10.180140,Toy Story (1995)
1,-46.584908,33.025135,Jumanji (1995)
2,-64.609604,5.788824,Grumpier Old Men (1995)
3,-52.346481,-38.540085,Waiting to Exhale (1995)
4,-51.408947,-36.440624,Father of the Bride Part II (1995)
...,...,...,...
9719,49.087154,22.609060,Spiral (2018)
9720,18.942947,20.133127,Mission: Impossible - Fallout (2018)
9721,35.839764,15.500614,SuperFly (2018)
9722,17.554649,-0.272054,Iron Soldier (2010)


In [22]:
scatter(data_frame=projection, x='x', y='y', text='title')