In [101]:
import numpy as np
import pandas as pd
from numpy.linalg import svd
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds

In [130]:
# Location of the ratings CSV, relative to the notebook's working directory
PATH = 'data/ratings_small.csv'

# Load the ratings into a DataFrame and preview the first rows
ratings = pd.read_csv(PATH)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [131]:
# Keep only the three columns used from here on (the timestamp is dropped)
ratings = ratings[['userId', 'movieId', 'rating']]

In [132]:
# Preview the frame after the column selection
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [133]:
# Find the active users: those who rated at least 50 movies.
# The result is indexed by userId and holds the rating count in "movieId".
usercount = (
    ratings.groupby("userId")[["movieId"]]
    .count()
    .loc[lambda counts: counts["movieId"] >= 50]
)
print(usercount.head())

        movieId
userId         
2            76
3            51
4           204
5           100
7            88


In [134]:
# Find the popular movies: those rated by at least 50 users.
# The result is indexed by movieId and holds the rating count in "userId".
moviecount = (
    ratings.groupby("movieId")[["userId"]]
    .count()
    .loc[lambda counts: counts["userId"] >= 50]
)
print(moviecount.head())

         userId
movieId        
1           247
2           107
3            59
5            56
6           104


In [135]:
# Keep only the ratings given by active users to popular movies
is_active_user = ratings["userId"].isin(usercount.index)
is_popular_movie = ratings["movieId"].isin(moviecount.index)
ratings = ratings[is_active_user & is_popular_movie]
print(ratings)

        userId  movieId  rating
20           2       10     4.0
21           2       17     5.0
22           2       39     5.0
23           2       47     4.0
24           2       50     4.0
...        ...      ...     ...
99994      671     5952     5.0
99995      671     5989     4.0
99996      671     5991     4.5
99997      671     5995     4.0
100001     671     6365     4.0

[38538 rows x 3 columns]


In [136]:
# Mean rating over the filtered (active users x popular movies) subset
ratings['rating'].mean()

3.7272691888525613

In [137]:
# Pivot the long ratings table into a user x movie matrix (rows: userId,
# columns: movieId). Pairs without a rating are filled with 0, so every
# downstream step treats "not rated" as the numeric value 0.
ratingmatrix = ratings.pivot(index="userId", columns="movieId", values="rating").fillna(0)

In [138]:
# Display the user x movie matrix
ratingmatrix

movieId,1,2,3,5,6,7,10,11,16,17,...,59315,60069,63082,68157,68358,68954,70286,72998,74458,79132
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,3.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
662,0.0,5.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
664,3.5,0.0,0.0,0.0,4.0,0.0,0.0,0.0,4.0,0.0,...,4.0,0.0,4.0,4.0,4.0,4.0,4.5,4.0,4.5,5.0
665,0.0,3.0,3.0,3.0,0.0,0.0,0.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
667,0.0,0.0,0.0,0.0,4.0,0.0,0.0,3.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [76]:
# Shape of the rating matrix above: 427 users (rows) x 453 movies (columns)

In [139]:
def normalize(ratings):
    """Min-max scale ``ratings`` into the range [0, 1].

    The minimum and maximum come from ``ratings.min()`` / ``ratings.max()``,
    so the scaling axis depends on the input type: a pandas DataFrame is
    scaled column by column, while a NumPy array (or a Series) is scaled with
    a single global minimum and maximum.

    A constant input (or a constant DataFrame column) has ``max == min``; the
    plain formula would divide by zero and yield NaN/inf values that
    propagate into everything computed from the result. Such entries are
    mapped to 0 instead.

    Parameters
    ----------
    ratings : pandas.DataFrame, pandas.Series or numpy.ndarray
        Numeric values to rescale.

    Returns
    -------
    Same container type as ``ratings``, holding the rescaled values.
    """
    lowest = ratings.min()
    value_range = ratings.max() - lowest
    if isinstance(value_range, pd.Series):
        # Per-column ranges: swap zero ranges for 1 so (x - min) / 1 == 0.
        value_range = value_range.mask(value_range == 0, 1)
    elif value_range == 0:
        # Scalar range (ndarray / Series input) with all values equal.
        value_range = 1
    return (ratings - lowest) / value_range

In [140]:
# ratingmatrix is a DataFrame, so normalize() scales each movie column
# independently to [0, 1]
ratingmatrix = normalize(ratingmatrix)

In [141]:
# Plain NumPy array of the normalized matrix (row/column labels are dropped)
matrix = ratingmatrix.values

In [142]:
# Truncated SVD of the normalized rating matrix. `svds` comes from
# scipy.sparse.linalg (numpy.linalg only provides the full `svd`).
# It returns (U, s, Vt): S is a 1-D array of singular values and V is already
# the transposed right factor, so U @ diag(S) @ V rebuilds a low-rank
# approximation of `matrix`.
# svds requires k < min(matrix.shape), so the 50 requested factors are capped
# when the filtered matrix is small.
n_factors = min(50, min(matrix.shape) - 1)
U, S, V = svds(matrix, k=n_factors)

In [151]:
# calculate pred ratings
# Build the diagonal matrix under its own name. Overwriting S with np.diag(S)
# made this cell fail when run a second time: np.diag applied to the (now
# 2-D) S extracts the diagonal again instead of building the matrix, and the
# matrix product below then has mismatched shapes.
sigma = np.diag(S)
pred_ratings = np.dot(np.dot(U, sigma), V)
# pred_ratings is a NumPy array, so normalize() rescales it with one global
# min/max rather than column by column.
pred_ratings = normalize(pred_ratings)

# convert to df: label the array like ratingmatrix, then transpose so that
# rows are movies and columns are users
pred_df = pd.DataFrame(
    pred_ratings,
    columns = ratingmatrix.columns,
    index = list(ratingmatrix.index)
).transpose()
pred_df

Unnamed: 0_level_0,2,3,4,5,7,8,12,13,15,17,...,655,656,658,659,660,662,664,665,667,671
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.285151,0.414357,0.323898,0.362357,0.556654,0.344471,0.346158,0.613186,0.447519,0.388773,...,0.338867,0.414626,0.421294,0.435008,0.464933,0.356087,0.713956,0.259846,0.415869,0.840442
2,0.458171,0.299612,0.473428,0.430948,0.353086,0.301978,0.322362,0.408049,0.410652,0.234637,...,0.510983,0.289806,0.343828,0.372211,0.362093,0.507245,0.377450,0.505329,0.354659,0.354122
3,0.292198,0.284207,0.375399,0.405726,0.364457,0.341766,0.302389,0.290505,0.412350,0.298446,...,0.289766,0.290630,0.260555,0.389561,0.326887,0.326036,0.247680,0.416213,0.326846,0.274631
5,0.313502,0.277017,0.303038,0.371781,0.299603,0.378812,0.305891,0.327194,0.463738,0.320694,...,0.322973,0.276111,0.299605,0.351739,0.325611,0.366030,0.217769,0.545554,0.333495,0.308321
6,0.335124,0.308902,0.152583,0.264527,0.226674,0.341806,0.314392,0.331023,0.703429,0.581774,...,0.257800,0.326133,0.410475,0.532510,0.316888,0.307514,0.536322,0.256008,0.481970,0.258211
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68954,0.303435,0.282473,0.291976,0.307949,0.327303,0.191166,0.302724,0.378928,0.624367,0.214152,...,0.388595,0.310976,0.275368,0.269586,0.516306,0.290681,0.525602,0.250240,0.300124,0.354693
70286,0.312932,0.342408,0.287946,0.285311,0.295094,0.246877,0.298366,0.331558,0.666440,0.358847,...,0.369306,0.316374,0.296624,0.274639,0.467508,0.306065,0.614854,0.333426,0.324521,0.231676
72998,0.330439,0.316634,0.256045,0.304575,0.286003,0.266777,0.308931,0.341090,0.607370,0.269224,...,0.468028,0.315959,0.302206,0.279653,0.533121,0.288620,0.609451,0.267389,0.303848,0.280469
74458,0.291922,0.336270,0.287601,0.285052,0.343204,0.228461,0.305929,0.350739,0.537401,0.339096,...,0.341605,0.250505,0.305610,0.255611,0.429318,0.284575,0.580040,0.370994,0.329051,0.326098


In [152]:
def recommend_items(pred_df, user_Id, n_recs, exclude=None):
    """Return the ``n_recs`` highest-scored items for one user.

    Parameters
    ----------
    pred_df : pandas.DataFrame
        Predicted scores with one row per item and one column per user.
    user_Id
        Column label in ``pred_df`` identifying the user.
    n_recs : int
        Number of rows to return (fewer if not enough items are available).
    exclude : iterable, optional
        Item labels to leave out of the ranking, e.g. the items the user has
        already rated. Nothing is excluded by default.

    Returns
    -------
    pandas.DataFrame
        One row per recommended item, best first, indexed 0..n-1. The first
        column holds the item label (named after ``pred_df``'s index, which
        is ``movieId`` in this notebook); the predicted score is in ``sim``.

    Raises
    ------
    KeyError
        If ``user_Id`` is not a column of ``pred_df``.
    """
    if user_Id not in pred_df.columns:
        raise KeyError("user %r has no column in pred_df" % (user_Id,))

    scores = pred_df[user_Id]
    if exclude is not None:
        scores = scores[~scores.index.isin(list(exclude))]

    # A single descending sort suffices; head() then keeps the best n_recs.
    top = scores.sort_values(ascending=False).head(n_recs)
    return top.reset_index().rename(columns={user_Id: 'sim'})

In [153]:
# Top 10 recommendations for the user whose userId is 2
recommend_items(pred_df, 2, 10)

Unnamed: 0,movieId,sim
0,527,0.706132
1,150,0.682342
2,480,0.644489
3,593,0.637515
4,457,0.636927
5,590,0.613949
6,296,0.612612
7,356,0.601221
8,589,0.588282
9,509,0.58596


In [157]:
# Top 10 recommendations for the user whose userId is 655
recommend_items(pred_df, 655, 10)

Unnamed: 0,movieId,sim
0,2959,0.815342
1,2571,0.789485
2,4993,0.76673
3,1197,0.75907
4,7153,0.751199
5,5952,0.739309
6,1136,0.705849
7,296,0.664368
8,2716,0.660705
9,2502,0.636968
