In [1]:
import pandas as pd
from math import sqrt
import numpy as np

from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [21]:
movies = pd.read_csv('gdrive/My Drive/movies.csv')
ratings = pd.read_csv('gdrive/My Drive/ratings.csv')
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62423 entries, 0 to 62422
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  62423 non-null  int64 
 1   title    62423 non-null  object
 2   genres   62423 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.4+ MB


In [3]:
userInput = [{'title':'Everything Everywhere All At Once (2022)', 'rating':5},
             {'title':'How to Lose a Guy in 10 Days (2003)', 'rating':3.5},
             {'title':'Before Sunrise (1995)', 'rating':4},
             {'title':'Spirited Away (2001)', 'rating':4.5},
             {'title':'Pride & Prejudice (2005)', 'rating':4}]
inputMovies = pd.DataFrame(userInput)
print(inputMovies)

                                      title  rating
0  Everything Everywhere All At Once (2022)     5.0
1       How to Lose a Guy in 10 Days (2003)     3.5
2                     Before Sunrise (1995)     4.0
3                      Spirited Away (2001)     4.5
4                  Pride & Prejudice (2005)     4.0


In [4]:
inputId = movies[movies['title'].isin(inputMovies['title'].tolist())]
inputMovies = pd.merge(inputId, inputMovies)
inputMovies = inputMovies.drop('genres', 1)
inputMovies = inputMovies[['movieId','title','rating']]
print(inputMovies)

   movieId                                title  rating
0      215                Before Sunrise (1995)     4.0
1     6155  How to Lose a Guy in 10 Days (2003)     3.5
2    40629             Pride & Prejudice (2005)     4.0


  inputMovies = inputMovies.drop('genres', 1)


In [5]:
userSubset = ratings[ratings['movieId'].isin(inputMovies['movieId'].tolist())]

print(userSubset.groupby('movieId').count())

         userId  rating  timestamp
movieId                           
215        6582    6582       6582
6155       5938    5938       5938
40629      6043    6043       6043


In [6]:
userSubsetGroup = userSubset.groupby(['userId'])

def take_5_elem(x):
    return len(x[1])

userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)

userSubsetGroup = userSubsetGroup[0:100]
print(userSubsetGroup[0:5])

  userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True) # get 5 sorted datas from each user


[(172,        userId  movieId  rating   timestamp
21466     172      215     4.5  1422551150
21507     172     6155     4.0  1422616131
21523     172    40629     4.5  1422551078), (924,         userId  movieId  rating   timestamp
129863     924      215     2.5  1191623997
130145     924     6155     4.0  1180365562
130226     924    40629     5.0  1192440539), (2032,         userId  movieId  rating   timestamp
293797    2032      215     3.5  1225580806
294473    2032     6155     3.5  1225750786
294707    2032    40629     4.0  1225747292), (2775,         userId  movieId  rating   timestamp
400981    2775      215     3.0  1009938293
401909    2775     6155     3.5  1142880574
402180    2775    40629     4.0  1135872353), (3742,         userId  movieId  rating   timestamp
543330    3742      215     5.0  1280554074
543948    3742     6155     1.5  1280623327
544335    3742    40629     4.5  1280779410)]


In [7]:
pearsonCorrelationDict = {}

for name, group in userSubsetGroup:

    group = group.sort_values(by='movieId')
    inputMovies = inputMovies.sort_values(by='movieId')

    nRatings = len(group)

    temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]

    tempRatingList = temp_df['rating'].tolist()

    tempGroupList = group['rating'].tolist()

    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)

    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0

In [8]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['userId'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
print(pearsonDF.head())

   similarityIndex  userId
0         1.000000     172
1        -0.114708     924
2         0.500000    2032
3         0.000000    2775
4         0.991241    3742


In [9]:
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
print(topUsers.head())

    similarityIndex  userId
82              1.0   34164
39              1.0   18628
58              1.0   27638
36              1.0   17352
78              1.0   32426


In [14]:
topUsersRating=topUsers.merge(ratings, left_on='userId', right_on='userId', how='inner')
print(topUsersRating.head(100))

    similarityIndex  userId  movieId  rating   timestamp
0               1.0   34164        2     3.5  1516429399
1               1.0   34164        6     3.5  1406729504
2               1.0   34164        7     3.5  1551936726
3               1.0   34164       10     3.5  1570454536
4               1.0   34164       11     4.0  1495056697
..              ...     ...      ...     ...         ...
95              1.0   34164      665     4.0  1442500833
96              1.0   34164      733     3.5  1459026869
97              1.0   34164      736     3.0  1500474948
98              1.0   34164      750     4.0  1406892572
99              1.0   34164      778     4.0  1406729633

[100 rows x 5 columns]


In [15]:
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
print(topUsersRating.head())

   similarityIndex  userId  movieId  rating   timestamp  weightedRating
0              1.0   34164        2     3.5  1516429399             3.5
1              1.0   34164        6     3.5  1406729504             3.5
2              1.0   34164        7     3.5  1551936726             3.5
3              1.0   34164       10     3.5  1570454536             3.5
4              1.0   34164       11     4.0  1495056697             4.0


In [16]:
tempTopUsersRating = topUsersRating.groupby('movieId').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
print(tempTopUsersRating.head())

         sum_similarityIndex  sum_weightedRating
movieId                                         
1                  37.673433          142.548688
2                  28.298348           80.438428
3                   8.196533           24.796845
4                   3.621873           12.243543
5                  12.995299           34.441949


In [17]:
recommendation_df = pd.DataFrame()

recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['movieId'] = tempTopUsersRating.index
print(recommendation_df.head(10))

         weighted average recommendation score  movieId
movieId                                                
1                                     3.783799        1
2                                     2.842513        2
3                                     3.025284        3
4                                     3.380445        4
5                                     2.650339        5
6                                     4.091523        6
7                                     2.999064        7
8                                     2.490381        8
9                                     2.000000        9
10                                    3.227686       10


In [18]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
print(recommendation_df)

         weighted average recommendation score  movieId
movieId                                                
4277                                       5.0     4277
1156                                       5.0     1156
70649                                      5.0    70649
25901                                      5.0    25901
87697                                      5.0    87697
...                                        ...      ...
132594                                     0.5   132594
197179                                     0.5   197179
197181                                     0.5   197181
197197                                     0.5   197197
141718                                     0.5   141718

[14492 rows x 2 columns]


In [20]:
recommended_movie=movies.loc[movies['movieId'].isin(recommendation_df['movieId'])]

recommended_movie=recommended_movie.loc[~recommended_movie.movieId.isin(userSubset['movieId'])]

print(recommended_movie)

       movieId                               title  \
0            1                    Toy Story (1995)   
1            2                      Jumanji (1995)   
2            3             Grumpier Old Men (1995)   
3            4            Waiting to Exhale (1995)   
4            5  Father of the Bride Part II (1995)   
...        ...                                 ...   
62274   208545                Eminence Hill (2019)   
62283   208581                    30 Nights (2018)   
62313   208715                  Let It Snow (2019)   
62323   208747                The Good Liar (2019)   
62341   208802                Minion Scouts (2019)   

                                            genres  
0      Adventure|Animation|Children|Comedy|Fantasy  
1                       Adventure|Children|Fantasy  
2                                   Comedy|Romance  
3                             Comedy|Drama|Romance  
4                                           Comedy  
...                              