Data Reading

In [30]:
import pandas as pd

# Locations of the two CSV inputs, relative to the notebook's working directory.
movies_path = './resources/movies.csv'
ratings_path = './resources/ratings.csv'

# Load both tables; only the ratings table is inspected in this cell.
movies_df = pd.read_csv(movies_path)
ratings_df = pd.read_csv(ratings_path)

# Show the ratings table's dimensions, then preview its first rows.
print(ratings_df.shape)
ratings_df.head()

(100836, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


Let's reduce the number of rows in ratings_df by keeping only the movies relevant enough to have received more than 100 ratings, in order to ease computation.

In [31]:
# Number of rating rows per movieId.
agg_ratings = ratings_df.groupby('movieId')['movieId'].count()

# Keep only the movies whose rating count is strictly greater than 100.
agg_ratings_gt100 = agg_ratings.loc[agg_ratings.gt(100)]

# Restrict the ratings table to rows that belong to those movies.
is_popular_movie = ratings_df['movieId'].isin(agg_ratings_gt100.index)
ratings_reduced = ratings_df.loc[is_popular_movie]

print(ratings_reduced.shape)
ratings_reduced.head()

(19788, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
7,1,110,4.0,964982176


To find how closely users are related to one another based on how they rate movies, let's apply Pearson's correlation to a dataframe with userId as the index, movieId as the columns, and each cell holding the rating given by a particular user to a specific movie.

In [32]:
# Reshape the long ratings table into a wide user x movie matrix.
# List-valued arguments are kept on purpose: passing values=['rating'] makes
# the resulting columns a two-level index ('rating', movieId).
# User/movie pairs without a rating become NaN.
movieUserMatrix = ratings_reduced.pivot(
    index=['userId'],
    columns=['movieId'],
    values=['rating'],
)

print(movieUserMatrix.shape)
movieUserMatrix.head()

(597, 134)


Unnamed: 0_level_0,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating
movieId,1,2,6,10,32,34,39,47,50,110,...,7153,7361,7438,8961,33794,48516,58559,60069,68954,79132
userId,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,4.0,,4.0,,,,,5.0,5.0,4.0,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,4.0,4.5,,,4.0
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,2.0,,,2.0,,,...,,,,,,,,,,
5,4.0,,,,,4.0,3.0,,4.0,4.0,...,,,,,,,,,,


Let's normalize movieUserMatrix by subtracting each user's mean rating from that user's ratings

In [33]:
# Mean rating of each user (row-wise mean; NaN cells are skipped by default).
user_mean_rating = movieUserMatrix.mean(axis="columns")

# Centre every user's ratings around that user's own mean, aligning the means
# on the row index so each row has its own mean subtracted.
movieUserMatrix_norm = movieUserMatrix.sub(user_mean_rating, axis="index")

print(movieUserMatrix_norm.shape)
movieUserMatrix_norm.head()

(597, 134)


Unnamed: 0_level_0,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating
movieId,1,2,6,10,32,34,39,47,50,110,...,7153,7361,7438,8961,33794,48516,58559,60069,68954,79132
userId,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,-0.392857,,-0.392857,,,,,0.607143,0.607143,-0.392857,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,0.0,0.5,,,0.0
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,-1.382353,,,-1.382353,,,...,,,,,,,,,,
5,0.538462,,,,,0.538462,-0.461538,,0.538462,0.538462,...,,,,,,,,,,


Applying Pearson's correlation to the normalized matrix (movieUserMatrix_norm) to obtain the user-to-user similarity

In [34]:
# DataFrame.corr correlates columns, so transpose first to put users in the
# columns; the result is a user x user matrix of Pearson coefficients.
# NOTE(review): correlations are computed pairwise over co-rated movies, so
# pairs that share very few movies can score exactly +/-1.0 -- consider
# passing min_periods if that is undesirable.
users_as_columns = movieUserMatrix_norm.transpose()
user_similarity = users_as_columns.corr(method="pearson")

print(user_similarity.shape)
user_similarity.head()

(597, 597)


userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,,,0.391797,0.180151,-0.439941,-0.029894,0.464277,1.0,-0.037987,...,0.09157371,0.254514,0.101482,-0.5,0.78002,0.303854,-0.012077,0.242309,-0.175412,0.071553
2,,1.0,,,,,,,,1.0,...,-0.5833333,,-1.0,,,0.583333,,-0.229416,,0.765641
3,,,,,,,,,,,...,,,,,,,,,,
4,0.391797,,,1.0,-0.394823,0.421927,0.704669,0.055442,,0.360399,...,-0.2393249,0.5625,0.162301,-0.158114,0.905134,0.021898,-0.020659,-0.286872,,-0.050868
5,0.180151,,,-0.394823,1.0,-0.006888,0.328889,0.030168,,-0.777714,...,-4.5324670000000006e-17,0.231642,0.131108,0.068621,-0.245026,0.377341,0.228218,0.263139,0.384111,0.040582


Now, let's get the recommended movies for a certain "picked_user"

In [35]:
# The user for whom recommendations are generated.
picked_user = 1

# Remove the picked user's own row so they cannot show up as their own most
# similar user. The column for picked_user is kept: it is what the following
# cell reads to look up similarities.
#
# The result is rebound instead of using inplace=True, and errors='ignore' is
# passed, so that re-running this cell is safe: the original in-place drop
# raised KeyError on a second run because the row was already gone.
user_similarity = user_similarity.drop(index=picked_user, errors='ignore')

# Preview the matrix without the picked user's row.
user_similarity.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,,1.0,,,,,,,,1.0,...,-0.5833333,,-1.0,,,0.583333,,-0.229416,,0.765641
3,,,,,,,,,,,...,,,,,,,,,,
4,0.391797,,,1.0,-0.394823,0.421927,0.7046689,0.055442,,0.360399,...,-0.2393249,0.5625,0.162301,-0.158114,0.905134,0.021898,-0.020659,-0.286872,,-0.050868
5,0.180151,,,-0.394823,1.0,-0.006888,0.3288887,0.030168,,-0.777714,...,-4.5324670000000006e-17,0.231642,0.131108,0.068621,-0.245026,0.377341,0.228218,0.263139,0.384111,0.040582
6,-0.439941,,,0.421927,-0.006888,1.0,-7.323043000000001e-18,-0.127385,,0.957427,...,-0.29277,-0.030599,-0.123983,-0.176327,0.063861,-0.468008,0.541386,-0.337129,0.158255,-0.030567


Let's find the 10 users most similar to the "picked_user", considering only users whose similarity is above a threshold of 0.3

In [37]:
# Minimum correlation a user must exceed to count as "similar".
similarity_threshold = 0.3

# Similarity of every remaining user to the picked user (one column of the matrix).
user_similarity_pickedUser = user_similarity[picked_user]

# Keep users strictly above the threshold (NaN similarities compare as False
# and are therefore excluded), order them from most to least similar, and take
# the first ten.
above_threshold = user_similarity_pickedUser.gt(similarity_threshold)
similar_users = (
    user_similarity_pickedUser.loc[above_threshold]
    .sort_values(ascending=False)
    .iloc[:10]
)
similar_users

userId
502    1.000000
598    1.000000
550    1.000000
108    1.000000
9      1.000000
401    0.942809
511    0.925820
366    0.872872
154    0.866025
595    0.866025
Name: 1, dtype: float64

Now let's get the movies that have been watched by the "picked_user", followed by the movies that have been watched by these 10 similar users

In [39]:
# Select the picked user's row from the normalized matrix, then discard every
# movie column that is entirely NaN, leaving only the movies this user rated.
is_picked_user = movieUserMatrix_norm.index == picked_user
movies_watched = movieUserMatrix_norm.loc[is_picked_user].dropna(axis="columns", how='all')
movies_watched

Unnamed: 0_level_0,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating
movieId,1,6,47,50,110,223,231,260,296,316,...,2115,2329,2571,2628,2716,2858,2959,3147,3578,3793
userId,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,-0.392857,-0.392857,0.607143,0.607143,-0.392857,-1.392857,0.607143,0.607143,-1.392857,-1.392857,...,0.607143,0.607143,0.607143,-0.392857,0.607143,0.607143,0.607143,0.607143,0.607143,0.607143


In [43]:
# Normalized ratings of the similar users only. Rows keep the order of
# movieUserMatrix_norm's index, not the similarity ranking.
is_similar_user = movieUserMatrix_norm.index.isin(similar_users.index)
similar_movies = movieUserMatrix_norm.loc[is_similar_user]

print(similar_movies.shape)
similar_movies.head()

(10, 134)


Unnamed: 0_level_0,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating
movieId,1,2,6,10,32,34,39,47,50,110,...,7153,7361,7438,8961,33794,48516,58559,60069,68954,79132
userId,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
9,,,,,,,,,,,...,,,,,,,,,,
108,,,,,,,,,,,...,,,,,,,,,,
154,,,,,,,,,,,...,,,,,,,,,0.214286,0.214286
366,,,,,,,,,,-0.205882,...,0.294118,,-0.205882,-0.205882,-0.205882,0.294118,-0.205882,,,-0.205882
401,0.117647,,,,,,,,,,...,-0.382353,,,0.617647,,,,0.617647,0.617647,


Now let's get the movies that have been watched by these 10 users but that haven't been watched by the "picked_user"

In [45]:
# Columns of movies_watched are exactly the movies the picked user has rated;
# removing them leaves only candidate movies the picked user has not seen.
already_seen = movies_watched.columns
similar_movies_reduced = similar_movies.drop(labels=already_seen, axis="columns")

print(similar_movies_reduced.shape)
similar_movies_reduced.head()

(10, 78)


Unnamed: 0_level_0,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating
movieId,2,10,32,34,39,111,150,153,161,165,...,7153,7361,7438,8961,33794,48516,58559,60069,68954,79132
userId,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
9,,,,,,,,,,,...,,,,,,,,,,
108,,,,,,,,,,,...,,,,,,,,,,
154,,,,,,,,,,,...,,,,,,,,,0.214286,0.214286
366,,,,,,,,,,,...,0.294118,,-0.205882,-0.205882,-0.205882,0.294118,-0.205882,,,-0.205882
401,,,,,,,,,,,...,-0.382353,,,0.617647,,,,0.617647,0.617647,
