<a href="https://colab.research.google.com/github/cruz-marco/dex_files/blob/main/dinamica_recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

In [2]:
movies, ratings = pd.read_csv('movie.csv'), pd.read_csv('rating.csv')

In [3]:
display(movies.head(), ratings.head())

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [4]:
top10_eval = ratings[['movieId', 'rating']]\
.merge(movies[['title', 'movieId']], how='left', on='movieId')\
.drop(columns=['movieId'])\
.groupby('title')\
.agg({
    'rating': 'sum'
})\
.sort_values('rating', ascending = False)\
.rename(columns={
    'rating': 'eval_sum'
}).head(10)

In [5]:
top10_eval.index

Index(['Shawshank Redemption, The (1994)', 'Pulp Fiction (1994)',
       'Forrest Gump (1994)', 'Silence of the Lambs, The (1991)',
       'Star Wars: Episode IV - A New Hope (1977)', 'Jurassic Park (1993)',
       'Braveheart (1995)', 'Schindler's List (1993)', 'Matrix, The (1999)',
       'Terminator 2: Judgment Day (1991)'],
      dtype='object', name='title')

In [6]:
top_10_most_eval = ratings[['movieId', 'rating']]\
.merge(movies[['title', 'movieId']], how='left', on='movieId')\
.drop(columns=['movieId'])\
.groupby('title')\
.agg({
    'rating': 'count'
})\
.sort_values('rating', ascending = False)\
.rename(columns={
    'rating': 'eval_count'
}).head(10)
display(top_10_most_eval.index)

Index(['Pulp Fiction (1994)', 'Forrest Gump (1994)',
       'Shawshank Redemption, The (1994)', 'Silence of the Lambs, The (1991)',
       'Jurassic Park (1993)', 'Star Wars: Episode IV - A New Hope (1977)',
       'Braveheart (1995)', 'Terminator 2: Judgment Day (1991)',
       'Matrix, The (1999)', 'Schindler's List (1993)'],
      dtype='object', name='title')

In [7]:
meval_list = ratings[['movieId', 'rating']]\
.groupby('movieId')\
.agg({
    'rating': 'count'
})\
.sort_values('rating', ascending = False)\
.rename(columns={
    'rating': 'eval_count'
}).head(1000)
display(meval_list)

Unnamed: 0_level_0,eval_count
movieId,Unnamed: 1_level_1
296,67310
356,66172
318,63366
593,63299
480,59715
...,...
1589,5044
1049,5039
915,5029
38061,5026


In [8]:
relmat = ratings[ratings['movieId'].isin(meval_list.index)]\
.pivot_table(index='movieId', columns='userId', values='rating').fillna(0)

In [9]:
relmat_sc = MinMaxScaler().fit_transform(relmat)

In [10]:
sim_mat = pd.DataFrame(data=cosine_similarity(relmat),
                       index = meval_list.index,
                       columns = meval_list.index)
display(sim_mat)

movieId,296,356,318,593,480,260,110,589,2571,527,...,5015,55765,2528,3683,52973,1589,1049,915,38061,383
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
296,1.000000,0.402873,0.307764,0.302354,0.381024,0.311412,0.382389,0.310818,0.192932,0.295487,...,0.224761,0.192636,0.239985,0.201816,0.185721,0.251637,0.230983,0.187631,0.172379,0.175185
356,0.402873,1.000000,0.233205,0.257734,0.277743,0.229323,0.427468,0.339796,0.119089,0.279807,...,0.179421,0.162633,0.193478,0.178498,0.160806,0.154440,0.166826,0.145707,0.141191,0.126668
318,0.307764,0.233205,1.000000,0.452468,0.280451,0.387538,0.202774,0.254642,0.226483,0.199290,...,0.046561,0.046630,0.047946,0.051373,0.039087,0.043432,0.040264,0.042337,0.037077,0.038609
593,0.302354,0.257734,0.452468,1.000000,0.245149,0.404644,0.205174,0.278951,0.215223,0.160485,...,0.044742,0.044726,0.053887,0.051016,0.043616,0.053157,0.048836,0.050711,0.043310,0.047027
480,0.381024,0.277743,0.280451,0.245149,1.000000,0.266871,0.377400,0.265874,0.260484,0.462643,...,0.168026,0.133015,0.148195,0.134771,0.151773,0.107851,0.151916,0.141393,0.131998,0.118073
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1589,0.251637,0.154440,0.043432,0.053157,0.107851,0.044733,0.105904,0.055019,0.028914,0.111161,...,0.373305,0.382427,0.418368,0.395315,0.373777,1.000000,0.477481,0.434871,0.372905,0.402652
1049,0.230983,0.166826,0.040264,0.048836,0.151916,0.039795,0.130455,0.048400,0.028674,0.151003,...,0.517424,0.424252,0.549799,0.467944,0.538322,0.477481,1.000000,0.501385,0.480616,0.467942
915,0.187631,0.145707,0.042337,0.050711,0.141393,0.047598,0.112821,0.060705,0.040347,0.151700,...,0.415164,0.372265,0.426488,0.395023,0.466563,0.434871,0.501385,1.000000,0.501944,0.480399
38061,0.172379,0.141191,0.037077,0.043310,0.131998,0.037903,0.102484,0.044549,0.034056,0.141981,...,0.401766,0.347660,0.388566,0.352280,0.496458,0.372905,0.480616,0.501944,1.000000,0.452445


In [23]:
def sim_recommender (movieId, sim_matrix):
  if movieId not in sim_matrix.index:
    return None
  else:
    ret = sim_matrix\
        .loc[movieId]\
        .drop(movieId)\
        .sort_values(ascending=False)\
        .to_frame()\
        .head(10)    
    return ret
        

In [25]:
sim_recommender(1883, sim_mat)

Unnamed: 0_level_0,1883
movieId,Unnamed: 1_level_1
5015,0.560347
1049,0.547102
280,0.528974
140,0.526234
52973,0.504311
2528,0.504262
1091,0.503041
3827,0.494346
2410,0.492098
3409,0.491286


In [26]:
sim_recommender('lapatata', sim_mat)