In [10]:
import pandas as pd
from scipy.sparse import csr_matrix
import numpy as np
from scipy.sparse.linalg import svds


In [2]:
# Read the ratings table, keeping only the user, movie and rating columns.
userRatings = pd.read_csv(
    "ml-latest-small/ratings.csv",
    usecols=["userId", "movieId", "rating"],
)
userRatings.head()

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [3]:
# Read the movie catalogue, keeping only the id and the title columns.
movies = pd.read_csv(
    "ml-latest-small/movies.csv",
    usecols=["movieId", "title"],
)
movies.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [4]:
# Inner-join every rating with its movie title. "movieId" is the only column
# the two frames have in common, so it is the join key.
ratings = movies.merge(userRatings, on="movieId")

ratings.head()

Unnamed: 0,movieId,title,userId,rating
0,1,Toy Story (1995),7,3.0
1,1,Toy Story (1995),9,4.0
2,1,Toy Story (1995),13,5.0
3,1,Toy Story (1995),15,2.0
4,1,Toy Story (1995),19,3.0


In [8]:
# Reshape the long ratings table into a user x movie matrix:
# one row per userId, one column per movieId, the rating as the cell value.
# Movies a user has not rated come out as NaN and are replaced by 0.
R_df = (
    ratings
    .pivot(index="userId", columns="movieId", values="rating")
    .fillna(0)
)
R_df.head()


movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# `DataFrame.as_matrix()` was deprecated in pandas 0.23 and removed in 1.0;
# `.values` yields the same NumPy array on both old and current pandas.
R = R_df.values
# Mean of each user's row, taken over every movie column (unrated movies were
# zero-filled in R_df, so they contribute 0 to this mean).
user_ratings_mean = np.mean(R, axis=1)
# Subtract each user's mean from that user's row; the reshape turns the means
# into a column vector so the subtraction broadcasts across the movie columns.
R_demeaned = R - user_ratings_mean.reshape(-1, 1)

In [11]:
# Truncated SVD of the de-meaned ratings matrix, keeping k = 50 singular
# values/vectors. `sigma` comes back from svds as a 1-D array of singular values.
U, sigma, Vt = svds(R_demeaned, k = 50)


In [12]:
# Expand the 1-D vector of singular values into a diagonal matrix so it can be
# used in the matrix product that rebuilds the ratings.
# The ndim guard keeps this cell idempotent: np.diag applied to an already 2-D
# matrix would extract its diagonal back into a vector, and the reconstruction
# would then fail on a shape mismatch if this cell were run twice.
if np.ndim(sigma) == 1:
    sigma = np.diag(sigma)


In [13]:
# Rebuild the ratings from the truncated factors (U * sigma * Vt) and add
# each user's mean back onto that user's row.
low_rank_ratings = U.dot(sigma).dot(Vt)
all_user_predicted_ratings = low_rank_ratings + user_ratings_mean[:, np.newaxis]
# Columns carry the movieId labels of R_df; rows keep the default 0-based index.
preds_df = pd.DataFrame(all_user_predicted_ratings, columns=R_df.columns)

In [14]:
# Preview only the first rows instead of rendering the whole users x movies frame.
preds_df.head(10)

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
0,-0.054239,0.045130,-0.004835,-0.019817,-0.011284,0.041373,-0.007822,-0.017188,0.012246,0.037670,...,-0.005258,-0.005453,0.012369,-0.004991,-0.004639,-0.019055,0.021402,-0.006365,-0.006098,-0.004819
1,0.419835,1.406440,-0.188807,0.156658,0.268032,0.414698,0.052172,0.044728,-0.020198,2.220256,...,-0.005909,-0.003974,-0.012555,-0.003555,-0.002711,-0.071621,-0.016212,0.001047,-0.001468,-0.006577
2,1.345619,0.266505,-0.011962,0.012278,0.079508,0.090960,-0.122094,0.031327,-0.018023,0.141176,...,-0.002647,-0.002364,-0.010153,0.000277,-0.000116,-0.018063,-0.015761,0.010611,0.006792,-0.006357
3,1.133455,1.046982,0.141275,0.081841,-0.339675,-1.484659,-0.263096,-0.169750,-0.021862,1.611664,...,0.020805,0.000410,0.056040,-0.002817,-0.000767,0.159159,0.087519,-0.030854,-0.021279,0.048529
4,1.389578,1.466495,0.605557,-0.029647,0.729380,-0.118539,-0.026017,0.065577,-0.156655,0.307926,...,-0.007422,-0.011810,0.006644,-0.005159,-0.001249,-0.034658,0.016456,0.001710,-0.004166,-0.001864
5,0.351379,0.147783,-0.226190,0.024425,-0.028854,0.052569,-0.095954,-0.013454,-0.050000,0.020672,...,-0.008031,-0.004661,0.003584,-0.004411,-0.003628,0.006245,0.008364,-0.013672,-0.010594,-0.010085
6,2.710704,0.684846,0.702097,0.141979,0.025992,0.079456,0.386194,-0.038723,0.126546,1.075512,...,0.008354,0.003333,0.004745,0.004281,0.003670,0.064590,0.004365,0.016075,0.011847,0.011205
7,1.286794,-0.263419,-0.147525,0.056464,0.612119,0.048448,0.055058,0.004062,-0.090489,0.294803,...,0.009358,0.012997,0.028965,0.009099,0.006378,-0.012434,0.036176,0.020553,0.018148,0.004175
8,1.803813,-0.024808,-0.113694,-0.026134,0.188041,0.660766,0.039552,-0.021128,-0.016371,-0.331059,...,0.007091,0.002046,0.014177,-0.000965,-0.003194,-0.046815,0.019518,0.012264,0.008756,0.010689
9,0.498709,-0.216453,-0.262796,0.001221,-0.233903,-0.085897,-0.181951,-0.047679,-0.105678,0.104707,...,0.016209,0.003547,0.000557,-0.004506,-0.009630,0.042599,-0.002036,0.007153,0.006589,0.026675


In [27]:
def recommend_movies(predictions_df, userID, movies_df, original_ratings_df, num_recommendations=5):
    """Recommend the highest-predicted movies that a user has not rated yet.

    Parameters
    ----------
    predictions_df : pandas.DataFrame
        Predicted ratings, one row per user and one column per movieId.
        Rows are looked up by position: row ``userID - 1`` is used for
        ``userID``, so rows must be ordered by consecutive user ids starting at 1.
    userID : int
        1-based id of the user to recommend for.
    movies_df : pandas.DataFrame
        Movie catalogue; must contain a ``movieId`` column.
    original_ratings_df : pandas.DataFrame
        Observed ratings; must contain ``userId``, ``movieId`` and ``rating``.
    num_recommendations : int, default 5
        Maximum number of movies to return.

    Returns
    -------
    (user_full, recommendations) : tuple of pandas.DataFrame
        ``user_full`` holds the user's existing ratings left-joined with
        ``movies_df`` and sorted by rating, highest first.
        ``recommendations`` holds the rows of ``movies_df`` (same columns) for
        the unrated movies with the highest predicted rating, best first.

    Raises
    ------
    IndexError
        If ``userID`` does not map to a row of ``predictions_df``. This includes
        ``userID <= 0``, which would otherwise wrap around through negative
        positional indexing and silently return another user's predictions.
    """
    # Locate the user's row of predictions.
    user_row_number = userID - 1  # UserID starts at 1, not 0
    if not 0 <= user_row_number < len(predictions_df):
        raise IndexError(
            'userID {0} is out of range for predictions with {1} rows'.format(
                userID, len(predictions_df)
            )
        )
    user_predictions = predictions_df.iloc[user_row_number]

    # Build an explicit (movieId, Predictions) table. Naming the columns here
    # avoids relying on the row label of the selected Series for a later rename.
    predictions_by_movie = pd.DataFrame(
        {'movieId': user_predictions.index, 'Predictions': user_predictions.values},
        columns=['movieId', 'Predictions'],
    )

    # Get the user's data and merge in the movie information.
    user_data = original_ratings_df[original_ratings_df.userId == userID]
    user_full = (
        user_data
        .merge(movies_df, how='left', on='movieId')
        .sort_values(['rating'], ascending=False)
    )

    print('User {0} has already rated {1} movies.'.format(userID, user_full.shape[0]))
    print('Recommending the highest {0} predicted ratings movies not already rated.'.format(num_recommendations))

    # Recommend the highest predicted rating movies that the user hasn't seen yet.
    unrated_movies = movies_df[~movies_df['movieId'].isin(user_full['movieId'])]
    recommendations = (
        unrated_movies
        .merge(predictions_by_movie, how='left', on='movieId')
        .sort_values('Predictions', ascending=False)
        .head(num_recommendations)
        # The helper column is dropped by name so the result has exactly the
        # columns of movies_df, as before.
        .drop('Predictions', axis=1)
    )

    return user_full, recommendations

# Pass the raw `userRatings` table rather than the merged `ratings` frame:
# `ratings` already carries a `title` column, so joining it with `movies` again
# inside recommend_movies would give `already_rated` duplicated
# `title_x` / `title_y` columns.
already_rated, predictions = recommend_movies(
    preds_df, 50, movies, userRatings, num_recommendations=100
)
predictions

User 50 has already rated 46 movies.
Recommending the highest 100 predicted ratings movies not already rated.


Unnamed: 0,movieId,title
482,593,"Silence of the Lambs, The (1991)"
481,592,Batman (1989)
480,588,Aladdin (1992)
303,364,"Lion King, The (1994)"
411,500,Mrs. Doubtfire (1993)
125,153,Batman Forever (1995)
265,318,"Shawshank Redemption, The (1994)"
484,595,Beauty and the Beast (1991)
275,329,Star Trek: Generations (1994)
387,474,In the Line of Fire (1993)
