In [41]:
import pandas as pd
import numpy as np

In [42]:
# Directory holding the MovieLens 1M files; change this one line to relocate the data.
DATA_DIR = 'C:/School Stuff/Senior Research/ml-1m'


def _read_dat(path):
    """Read a '::'-delimited .dat file into a list of per-line field lists."""
    # The context manager closes the file handle; bare open(...).readlines()
    # left it open until garbage collection.
    # NOTE(review): the file is decoded with the platform default encoding,
    # exactly as before - confirm that is right for movies.dat on non-Windows hosts.
    with open(path, 'r') as handle:
        return [line.strip().split("::") for line in handle]


ratings_list = _read_dat(DATA_DIR + '/ratings.dat')
users_list = _read_dat(DATA_DIR + '/users.dat')
movies_list = _read_dat(DATA_DIR + '/movies.dat')

In [43]:
# NumPy copies of the three parsed row lists, built in the same order as before.
ratings, users, movies = (
    np.array(rows) for rows in (ratings_list, users_list, movies_list)
)

In [44]:
# Ratings: every parsed field is a numeric string, so cast the whole frame
# explicitly. (Passing dtype=int to the constructor depends on pandas coercing
# strings for us, which is not reliable across pandas versions.)
ratings_df = pd.DataFrame(
    ratings_list, columns=['UserID', 'MovieID', 'Rating', 'Timestamp']
).astype(int)

# Movies: only MovieID is numeric; Title and Genres stay as strings.
movies_df = pd.DataFrame(movies_list, columns=['MovieID', 'Title', 'Genres'])
# Convert the column in one vectorised call instead of invoking
# pd.to_numeric once per element through .apply.
movies_df['MovieID'] = pd.to_numeric(movies_df['MovieID'])

In [45]:
# Preview the first rows of the movies table.
movies_df.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [46]:
# Preview the first rows of the ratings table.
ratings_df.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [71]:
# Reshape the long ratings table into a wide one: one row per UserID, one
# column per MovieID, holding the Rating. User/movie pairs with no rating
# come out as NaN (they are filled in a later cell).
R_df = ratings_df.pivot(index = 'UserID', columns ='MovieID', values = 'Rating')

In [72]:
# Per-user mean rating (row-wise mean of the user-by-movie table).
users_mean = np.array(R_df.mean(axis=1))
# Mean-centering is currently disabled, so the matrix factorised in the next
# cell holds raw ratings with 0 standing in for "not rated":
#R_demeaned=R_df.sub(users_mean, axis=0)
#R_demeaned=R_demeaned.fillna(0).values
# DataFrame.as_matrix() was deprecated and later removed from pandas; .values
# yields the same ndarray and is available on old and new versions alike.
R_unsparsed = R_df.fillna(0).values

In [73]:
from scipy.sparse.linalg import svds

# Number of singular values/vectors kept by the truncated SVD (the rank of the
# approximation). Named so it can be tuned in one place.
N_LATENT_FACTORS = 50

# Truncated SVD of the zero-filled rating matrix. sigma comes back as a 1-D
# vector of singular values; it is expanded to a diagonal matrix in the next cell.
U, sigma, Vt = svds(R_unsparsed, k=N_LATENT_FACTORS)

In [74]:
# The reconstruction needs the singular values on the diagonal of a square
# matrix. The ndim guard keeps this cell safe to re-run: np.diag applied to an
# already 2-D sigma would extract the diagonal back into a vector.
if np.ndim(sigma) == 1:
    sigma = np.diag(sigma)

In [75]:
# Low-rank reconstruction U . Sigma . Vt, multiplied left to right.
all_user_predicted_ratings = U.dot(sigma).dot(Vt)

In [76]:
# Wrap the reconstructed matrix in a DataFrame whose columns are R_df's MovieID
# labels. No index is passed, so rows get a default 0-based index rather than
# R_df's UserID labels; recommend_movies looks users up by row position.
preds_df = pd.DataFrame(all_user_predicted_ratings, columns = R_df.columns)
preds_df.head()

MovieID,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
0,4.292556,0.164772,-0.184504,-0.018528,0.021516,-0.181793,-0.102936,0.157111,-0.058523,-0.164231,...,0.032475,0.007048,0.03317,-0.007577,-0.075329,0.394578,0.124714,0.051545,0.057349,0.076743
1,0.754817,0.128981,0.341128,0.00954,0.00183,1.31458,0.077427,0.062218,0.163983,1.514572,...,-0.051376,-0.01578,-0.010918,0.055628,-0.016037,0.166941,-0.421144,-0.106675,-0.04965,-0.126564
2,1.844858,0.473855,0.098573,-0.039309,-0.019895,-0.154038,-0.141531,0.111551,0.035977,0.738427,...,0.049994,0.003596,0.022734,0.043377,0.034825,0.120205,0.086553,0.034783,0.029337,-0.121871
3,0.395484,-0.045487,0.033716,0.08389,0.051561,0.260676,-0.081541,0.023891,0.051932,-0.07717,...,0.010847,0.007518,0.004269,0.009527,-0.07963,0.077845,0.051369,-0.017046,0.01898,-0.04758
4,1.557609,-0.0067,-0.04486,0.249014,-0.04332,1.51814,-0.164306,-0.043626,-0.078771,0.424064,...,0.097708,0.013847,-0.024256,-0.043126,-0.065997,-0.025571,0.517685,0.008508,0.10663,0.221219


In [77]:
def recommend_movies(predictions_df, userID, movies_df, original_ratings_df, num_recommendations=5):
    """Recommend the not-yet-rated movies with the highest predicted ratings.

    Parameters
    ----------
    predictions_df : DataFrame
        Predicted ratings, one row per user and one column per MovieID. Rows
        are addressed by position: the row used for ``userID`` is
        ``userID - 1``.
    userID : int
        1-based user identifier.
    movies_df : DataFrame
        Movie metadata containing a ``MovieID`` column.
    original_ratings_df : DataFrame
        Observed ratings containing ``UserID``, ``MovieID`` and ``Rating``.
    num_recommendations : int, default 5
        Number of rows kept in the recommendation table.

    Returns
    -------
    (user_full, recommendations) : tuple of DataFrame
        ``user_full`` holds the user's existing ratings joined with the movie
        metadata, sorted by Rating (highest first). ``recommendations`` holds
        the ``movies_df`` columns for the top unrated movies, ordered by
        predicted rating (highest first).

    Raises
    ------
    IndexError
        If ``userID`` does not map to a row of ``predictions_df``.
    """
    # UserID starts at 1, row positions start at 0.
    user_row_number = userID - 1
    # Reject out-of-range IDs explicitly; a negative position would otherwise
    # wrap around and silently return another user's predictions.
    if not 0 <= user_row_number < len(predictions_df):
        raise IndexError(
            'userID {0} is out of range for a predictions table with {1} rows'.format(
                userID, len(predictions_df)))

    # Use the predictions_df argument (not a notebook-level global) and name the
    # score column directly, so the result does not depend on how the frame's
    # index happens to be labelled.
    user_predictions = (predictions_df.iloc[user_row_number].
                            rename('Predictions').
                            rename_axis('MovieID').
                            reset_index()
                        )

    # Get the user's data and merge in the movie information.
    user_data = original_ratings_df[original_ratings_df.UserID == userID]
    user_full = (user_data.merge(movies_df, how = 'left', on = 'MovieID').
                     sort_values(['Rating'], ascending=False)
                 )

    print ('User {0} has already rated {1} movies.'.format(userID, user_full.shape[0]))
    print ('Recommending highest {0} predicted ratings movies not already rated.'.format(num_recommendations))

    # Recommend the highest predicted rating movies that the user hasn't seen yet.
    unseen_movies = movies_df[~movies_df['MovieID'].isin(user_full['MovieID'])]
    recommendations = (unseen_movies.
         merge(user_predictions, how = 'left', on = 'MovieID').
         sort_values('Predictions', ascending = False).
         iloc[:num_recommendations].
         drop('Predictions', axis = 1)  # scores are used for ranking only
                      )

    return user_full, recommendations

In [81]:
# Top-10 recommendations for user 14, plus that user's existing ratings.
already_rated, predictions = recommend_movies(
    predictions_df=preds_df,
    userID=14,
    movies_df=movies_df,
    original_ratings_df=ratings_df,
    num_recommendations=10,
)

User 14 has already rated 25 movies.
Recommending highest 10 predicted ratings movies not already rated.


In [82]:
# First ten rows of the user's rated movies (recommend_movies returns this
# frame sorted by Rating, highest first).
already_rated.head(10)

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Title,Genres
15,14,296,5,978201244,Pulp Fiction (1994),Crime|Drama
2,14,1263,5,978201280,"Deer Hunter, The (1978)",Drama|War
9,14,2686,5,978200975,"Red Violin, The (Le Violon rouge) (1998)",Drama|Mystery
10,14,2762,5,978201003,"Sixth Sense, The (1999)",Thriller
1,14,2997,5,978200689,Being John Malkovich (1999),Comedy
16,14,2920,5,978200528,Children of Paradise (Les enfants du paradis) ...,Drama|Romance
23,14,2396,4,978201003,Shakespeare in Love (1998),Comedy|Romance
4,14,2731,4,978201317,"400 Blows, The (Les Quatre cents coups) (1959)",Drama
5,14,3033,4,978200320,Spaceballs (1987),Comedy|Sci-Fi
22,14,1225,4,978201317,Amadeus (1984),Drama


In [83]:
# Display the recommendation table returned by recommend_movies.
predictions

Unnamed: 0,MovieID,Title,Genres
3085,3176,"Talented Mr. Ripley, The (1999)",Drama|Mystery|Thriller
2521,2599,Election (1999),Comedy
2635,2716,Ghostbusters (1984),Comedy|Horror
49,50,"Usual Suspects, The (1995)",Crime|Thriller
3699,3793,X-Men (2000),Action|Sci-Fi
2917,3006,"Insider, The (1999)",Drama
2319,2395,Rushmore (1998),Comedy
1571,1617,L.A. Confidential (1997),Crime|Film-Noir|Mystery|Thriller
2823,2908,Boys Don't Cry (1999),Drama
2631,2712,Eyes Wide Shut (1999),Drama
