Data Reading

In [129]:
import pandas as pd

# Load the user ratings and the movie metadata from the local CSV files.
ratings_df = pd.read_csv('./resources/ratings.csv')
movies_df = pd.read_csv('./resources/movies.csv')

# Report the size of the ratings table, then preview its first rows.
print(ratings_df.shape)
ratings_df.head()

(100836, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


Let's reduce the number of rows in ratings_df by keeping only the movies popular enough to have received more than 100 ratings, in order to ease computation.

In [130]:
# Number of ratings each movie received, indexed by movieId.
agg_ratings = ratings_df.groupby('movieId')['movieId'].count()

# Movies with strictly more than 100 ratings.
agg_ratings_gt100 = agg_ratings[agg_ratings > 100]

# Keep only the rating rows that belong to those popular movies.
is_popular_movie = ratings_df['movieId'].isin(agg_ratings_gt100.index)
ratings_reduced = ratings_df[is_popular_movie]
print(ratings_reduced.shape)
ratings_reduced.head()

(19788, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
7,1,110,4.0,964982176


To measure how similar users are to one another based on how they rate movies, let's apply Pearson's correlation to a dataframe with userId as the index, movieId as the columns, and each cell holding the rating given by a particular user to a specific movie.

In [131]:
# Reshape the long ratings table into a wide matrix:
# one row per user, one column per movie, NaN where the user did not rate the movie.
movieUserMatrix = ratings_reduced.pivot(
    index='userId',
    columns='movieId',
    values='rating',
)
print(movieUserMatrix.shape)
movieUserMatrix.head()

(597, 134)


movieId,1,2,6,10,32,34,39,47,50,110,...,7153,7361,7438,8961,33794,48516,58559,60069,68954,79132
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,,,5.0,5.0,4.0,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,4.0,4.5,,,4.0
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,2.0,,,2.0,,,...,,,,,,,,,,
5,4.0,,,,,4.0,3.0,,4.0,4.0,...,,,,,,,,,,


Let's normalize movieUserMatrix by subtracting each user's mean rating from that user's ratings

In [132]:
# Mean rating of every user (row-wise mean; NaN cells are skipped).
user_mean_rating = movieUserMatrix.mean(axis='columns')

# Centre each user's ratings around zero by removing that user's own mean.
movieUserMatrix_norm = movieUserMatrix.sub(user_mean_rating, axis='index')
print(movieUserMatrix_norm.shape)
movieUserMatrix_norm.head()

(597, 134)


movieId,1,2,6,10,32,34,39,47,50,110,...,7153,7361,7438,8961,33794,48516,58559,60069,68954,79132
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.392857,,-0.392857,,,,,0.607143,0.607143,-0.392857,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,0.0,0.5,,,0.0
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,-1.382353,,,-1.382353,,,...,,,,,,,,,,
5,0.538462,,,,,0.538462,-0.461538,,0.538462,0.538462,...,,,,,,,,,,


Applying Pearson's correlation to movieUserMatrix_norm (transposed, so that the users become the columns being correlated)

In [133]:
# corr() works column-by-column, so transpose first to put the users on the columns;
# the result is a user x user matrix of Pearson correlations.
ratings_by_movie = movieUserMatrix_norm.transpose()
user_similarity = ratings_by_movie.corr(method='pearson')
print(user_similarity.shape)
user_similarity.head()

(597, 597)


userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,,,0.391797,0.180151,-0.439941,-0.029894,0.464277,1.0,-0.037987,...,0.09157371,0.254514,0.101482,-0.5,0.78002,0.303854,-0.012077,0.242309,-0.175412,0.071553
2,,1.0,,,,,,,,1.0,...,-0.5833333,,-1.0,,,0.583333,,-0.229416,,0.765641
3,,,,,,,,,,,...,,,,,,,,,,
4,0.391797,,,1.0,-0.394823,0.421927,0.704669,0.055442,,0.360399,...,-0.2393249,0.5625,0.162301,-0.158114,0.905134,0.021898,-0.020659,-0.286872,,-0.050868
5,0.180151,,,-0.394823,1.0,-0.006888,0.328889,0.030168,,-0.777714,...,-4.5324670000000006e-17,0.231642,0.131108,0.068621,-0.245026,0.377341,0.228218,0.263139,0.384111,0.040582


Now, let's get the recommended movies for a certain "picked_user"

In [134]:
# The user we want to generate recommendations for.
picked_user = 1

# Remove the picked user's own row so they cannot show up as their own neighbour.
# A non in-place drop with errors='ignore' keeps this cell idempotent: the original
# in-place drop raised a KeyError when the cell was run a second time, because the
# row had already been removed.
user_similarity = user_similarity.drop(index=picked_user, errors='ignore')

Let's find the 10 users most similar to the "picked_user", considering only those whose similarity is above similarity_threshold

In [135]:
# Only users whose correlation with picked_user exceeds this cut-off are considered.
similarity_threshold = 0.3

# Correlation of picked_user with each user still present in user_similarity.
user_similarity_pickedUser = user_similarity[picked_user]

# Keep the users above the threshold (NaN never passes the comparison),
# rank them from most to least similar and retain the first 10.
similar_users = (
    user_similarity_pickedUser
    .loc[user_similarity_pickedUser > similarity_threshold]
    .sort_values(ascending=False)
    .head(10)
)
similar_users

userId
502    1.000000
598    1.000000
550    1.000000
108    1.000000
9      1.000000
401    0.942809
511    0.925820
366    0.872872
154    0.866025
595    0.866025
Name: 1, dtype: float64

Now let's get the movies that have been watched by the "picked_user", and then the movies that have been watched by these 10 similar users

In [136]:
# Select the picked user's row and keep only the movies (columns) that user actually rated.
movies_watched = (
    movieUserMatrix_norm
    .loc[movieUserMatrix_norm.index == picked_user]
    .dropna(axis='columns', how='all')
)
movies_watched

movieId,1,6,47,50,110,223,231,260,296,316,...,2115,2329,2571,2628,2716,2858,2959,3147,3578,3793
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.392857,-0.392857,0.607143,0.607143,-0.392857,-1.392857,0.607143,0.607143,-1.392857,-1.392857,...,0.607143,0.607143,0.607143,-0.392857,0.607143,0.607143,0.607143,0.607143,0.607143,0.607143


In [137]:
# Select the rows of the similar users and discard the movies (columns) none of them rated.
is_similar_user = movieUserMatrix_norm.index.isin(similar_users.index)
similar_movies = (
    movieUserMatrix_norm
    .loc[is_similar_user]
    .dropna(axis='columns', how='all')
)
print(similar_movies.shape)
similar_movies.head()

(10, 62)


movieId,1,50,110,223,318,356,364,480,527,541,...,7153,7361,7438,8961,33794,48516,58559,60069,68954,79132
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9,,,,-0.666667,,,,,,,...,,,,,,,,,,
108,,,,,,,,-0.533333,,0.466667,...,,,,,,,,,,
154,,,,,,0.214286,,,,,...,,,,,,,,,0.214286,0.214286
366,,,-0.205882,,,,,,,,...,0.294118,,-0.205882,-0.205882,-0.205882,0.294118,-0.205882,,,-0.205882
401,0.117647,,,,,0.117647,0.117647,,,,...,-0.382353,,,0.617647,,,,0.617647,0.617647,


Now let's get the movies that have been watched by these 10 users which haven't been watched by "picked_user"

In [140]:
# Keep only candidate movies: remove every column the picked user has already rated.
# drop() returns a new frame, so similar_movies (displayed by the previous cell) is
# left untouched and similar_movies_reduced is a distinct object rather than an alias.
# errors='ignore' skips watched movies that none of the similar users rated.
similar_movies_reduced = similar_movies.drop(columns=movies_watched.columns, errors='ignore')
similar_movies_reduced.head()

movieId,318,364,541,588,589,595,1036,1704,1721,1968,...,7153,7361,7438,8961,33794,48516,58559,60069,68954,79132
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9,,,,,,,,,,,...,,,,,,,,,,
108,,,0.466667,,,,,,-0.533333,-0.533333,...,,,,,,,,,,
154,,,,,,,,0.214286,,,...,,,,,,,,,0.214286,0.214286
366,,,,,-0.205882,,-0.205882,,,,...,0.294118,,-0.205882,-0.205882,-0.205882,0.294118,-0.205882,,,-0.205882
401,,0.117647,,-0.382353,,-0.382353,,,,,...,-0.382353,,,0.617647,,,,0.617647,0.617647,


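Finally, let's compute the recommended movies: each candidate movie is scored by the average of (normalized rating × similarity score) over the similar users who rated it, and the highest-scoring movies are recommended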

In [144]:
# Score every candidate movie as the mean of (normalized rating * similarity score)
# over the similar users who actually rated it.

# Put the rows in the same order as similar_users so ratings and weights line up.
neighbour_ratings = similar_movies_reduced.loc[similar_users.index]

# Weight each similar user's normalized ratings by that user's similarity to picked_user.
weighted_ratings = neighbour_ratings.mul(similar_users, axis=0)

# mean() skips NaN, i.e. it divides by the number of similar users who rated the movie;
# a movie with no ratings at all yields NaN and is given a score of 0.
movie_scores = weighted_ratings.mean(axis=0).fillna(0)

# One row per candidate movie, best score first. A stable sort keeps tied scores
# in a deterministic order between runs.
similarityScore_movies = (
    movie_scores
    .rename_axis('movieId')
    .reset_index(name='score')
    .sort_values('score', ascending=False, kind='mergesort')
)

# Attach title and genres to every scored movie and show the top 10 recommendations.
recommended_movies = pd.merge(similarityScore_movies, movies_df, on='movieId', how='inner')
recommended_movies.head(10)

Unnamed: 0,movieId,score,title,genres
0,7361,1.888889,Eternal Sunshine of the Spotless Mind (2004),Drama|Romance|Sci-Fi
1,5816,1.888889,Harry Potter and the Chamber of Secrets (2002),Adventure|Fantasy
2,5418,0.888889,"Bourne Identity, The (2002)",Action|Mystery|Thriller
3,4963,0.888889,Ocean's Eleven (2001),Crime|Thriller
4,79132,0.587491,Inception (2010),Action|Crime|Drama|Mystery|Sci-Fi|Thriller|IMAX
5,4878,0.466667,Donnie Darko (2001),Drama|Mystery|Sci-Fi|Thriller
6,541,0.466667,Blade Runner (1982),Action|Sci-Fi|Thriller
7,4995,0.466667,"Beautiful Mind, A (2001)",Drama|Romance
8,48516,0.256727,"Departed, The (2006)",Crime|Drama|Thriller
9,318,0.222566,"Shawshank Redemption, The (1994)",Crime|Drama
