Reading data & creating an appropriate dataframe

In [7]:
import pandas as pd

# Load the movie and rating tables from the local resources folder.
movies_df = pd.read_csv('./resources/movies.csv')
ratings_df = pd.read_csv('./resources/ratings.csv')

# Attach each movie's columns to every rating row. The inner join keeps only
# ratings whose movieId appears in both tables.
df = ratings_df.merge(movies_df, how='inner', on='movieId')
print(df.shape)
df.head()

(100836, 6)


Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


Let's narrow the data pool down to movies relevant enough to have been rated more than 100 times

In [17]:
# Count how many ratings each movie received, most-rated movies first.
agg_ratings = (
    ratings_df.groupby('movieId')['movieId']
    .count()
    .sort_values(ascending=False)
)
# Keep only the movies with strictly more than 100 ratings.
agg_ratings_gt100 = agg_ratings.loc[agg_ratings > 100]
# Restrict the merged ratings to those movies.
df_gt100 = df.loc[df['movieId'].isin(agg_ratings_gt100.index)]
print(df_gt100.shape)
df_gt100.head()

(19788, 6)


Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


Let's calculate movie similarities in the form of a matrix using Pearson's correlation

In [21]:
# Movie-by-user table of ratings: one row per movieId, one column per userId.
# Cells are NaN where the user has no rating for that movie.
matrix = df_gt100.pivot(index='movieId', columns='userId', values='rating')
# Normalize: centre every movie's ratings on that movie's own mean rating.
matrix_norm = matrix.sub(matrix.mean(axis='columns'), axis='index')
print(matrix_norm.shape)
matrix_norm.head()

(134, 597)


userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.07907,,,,0.07907,,0.57907,,,,...,0.07907,,0.07907,-0.92093,0.07907,-1.42093,0.07907,-1.42093,-0.92093,1.07907
2,,,,,,0.568182,,0.568182,,,...,,0.568182,,1.568182,0.068182,,,-1.431818,,
6,0.053922,,,,,0.053922,,,,,...,,-0.946078,0.053922,-0.946078,,,,,,1.053922
10,,,,,,-0.496212,,-1.496212,,,...,,-0.496212,,,,,,0.503788,0.503788,
32,,,,-1.983051,,0.016949,,-0.983051,,,...,,-0.983051,-0.983051,0.016949,,0.016949,,-0.483051,,0.516949


In [22]:
# DataFrame.corr correlates columns pairwise, so transpose first to put the
# movies in the columns. The raw ratings are used here rather than
# matrix_norm; Pearson's correlation is unaffected by subtracting a per-movie
# constant, so both give the same movie-by-movie matrix.
similar_movies = matrix.transpose().corr(method='pearson')
print(similar_movies.shape)
similar_movies.head()

(134, 134)


movieId,1,2,6,10,32,34,39,47,50,110,...,7153,7361,7438,8961,33794,48516,58559,60069,68954,79132
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.330978,0.106465,-0.021409,0.120087,0.407493,0.144193,0.222006,0.250781,0.028768,...,0.101284,0.049218,0.093972,0.643301,0.285349,0.326149,0.25529,0.237234,0.355424,0.076195
2,0.330978,1.0,0.16351,0.016626,0.084588,0.330327,0.241754,0.088477,0.045364,0.311665,...,0.133258,0.164373,0.056481,0.460369,0.250228,0.274094,0.025349,-0.003548,0.296362,0.300878
6,0.106465,0.16351,1.0,0.420222,0.184299,-0.005211,-0.169698,0.388253,0.201409,0.327672,...,0.08874,0.103805,-0.174723,0.074621,-0.004642,0.113802,0.143009,0.202412,0.16461,0.565437
10,-0.021409,0.016626,0.420222,1.0,-0.160238,-0.28109,0.076431,0.243948,0.045645,0.060961,...,0.090872,0.346523,-0.041482,0.094992,0.162075,0.088284,0.152039,0.23522,-0.348508,-0.047223
32,0.120087,0.084588,0.184299,-0.160238,1.0,0.238554,-0.308569,0.285574,0.255347,0.32552,...,0.312204,0.096976,0.421061,0.256342,0.217525,0.33601,0.350385,0.513187,0.512313,0.472037


Let's pick a user, `picked_userId`, who will receive the movie recommendations and, for now, just collect all the movies they have already watched (i.e. rated)

In [50]:
picked_userId = 1

# Mean-centred ratings of every movie the picked user has rated, indexed by
# movieId. A movie counts as "watched" when the user's cell in matrix_norm is
# not NaN, so selecting the user's column and dropping the NaNs is enough.
# This replaces a per-movie Python loop plus a `.loc[0:, ...]` label slice
# that only worked because the movieId index is sorted.
watched_movies = matrix_norm[picked_userId].dropna()
print(watched_movies.shape)
watched_movies.head()

(56,)


movieId
1      0.079070
6      0.053922
47     1.024631
50     0.762255
110   -0.031646
Name: 1, dtype: float64

Now, let's find the movies the user has not watched

In [52]:
# The picked user's column restricted to movies that are not in the watched
# set. Every value is NaN, because "watched" was defined as the non-NaN
# entries; the useful part of this Series is its movieId index.
notwatched_movies = matrix_norm[picked_userId].loc[
    ~matrix_norm.index.isin(watched_movies.index)
]
print(notwatched_movies.shape)
notwatched_movies.head()

(78,)


movieId
2    NaN
10   NaN
32   NaN
34   NaN
39   NaN
Name: 1, dtype: float64