# MovieLens recommender system using SVD matrix decomposition

In [1]:
import pandas as pd
import numpy as np
import tmdbsimple as tmdb
from sklearn.pipeline import make_pipeline
from sklearn import pipeline, feature_selection, decomposition
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.cluster import DBSCAN, AgglomerativeClustering, Birch
from sklearn.decomposition import PCA, NMF
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity
import pprint
from sklearn.utils.extmath import randomized_svd
from scipy.sparse.linalg import svds

In [3]:
# Directory holding the extracted MovieLens 20M CSV files.
# Kept in one place so the notebook can be pointed at another copy of the
# dataset by editing a single line instead of six.
DATA_DIR = '/Users/alexanderhughes/Documents/Kodak/ml-20m'

# Load every table shipped with the dataset.
links = pd.read_csv(DATA_DIR + '/links.csv')
ratings = pd.read_csv(DATA_DIR + '/ratings.csv')
movies = pd.read_csv(DATA_DIR + '/movies.csv')
tags = pd.read_csv(DATA_DIR + '/tags.csv')
genome_scores = pd.read_csv(DATA_DIR + '/genome-scores.csv')
genome_tags = pd.read_csv(DATA_DIR + '/genome-tags.csv')

In [4]:
# Preview the first rows of the ratings table.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [5]:
# Preview the first rows of the movies table.
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Create a dictionary mapping movie IDs to titles so that we can add titles to the ratings DataFrame:

In [6]:
# Map movieId -> title; only the title column is converted to a dict.
titles_by_movie_id = movies.set_index('movieId')['title']
dictionary = titles_by_movie_id.to_dict()

In [7]:
# Display the movieId -> title mapping.
# NOTE(review): this echoes the entire mapping into the cell output;
# consider previewing only a few entries instead.
dictionary

{1: 'Toy Story (1995)',
 2: 'Jumanji (1995)',
 3: 'Grumpier Old Men (1995)',
 4: 'Waiting to Exhale (1995)',
 5: 'Father of the Bride Part II (1995)',
 6: 'Heat (1995)',
 7: 'Sabrina (1995)',
 8: 'Tom and Huck (1995)',
 9: 'Sudden Death (1995)',
 10: 'GoldenEye (1995)',
 11: 'American President, The (1995)',
 12: 'Dracula: Dead and Loving It (1995)',
 13: 'Balto (1995)',
 14: 'Nixon (1995)',
 15: 'Cutthroat Island (1995)',
 16: 'Casino (1995)',
 17: 'Sense and Sensibility (1995)',
 18: 'Four Rooms (1995)',
 19: 'Ace Ventura: When Nature Calls (1995)',
 20: 'Money Train (1995)',
 21: 'Get Shorty (1995)',
 22: 'Copycat (1995)',
 23: 'Assassins (1995)',
 24: 'Powder (1995)',
 25: 'Leaving Las Vegas (1995)',
 26: 'Othello (1995)',
 27: 'Now and Then (1995)',
 28: 'Persuasion (1995)',
 29: 'City of Lost Children, The (Cité des enfants perdus, La) (1995)',
 30: 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)',
 31: 'Dangerous Minds (1995)',
 32: 'Twelve Monkeys (a.k.a. 12 Monkeys) (199

In [8]:
# Attach the human-readable title to every rating via the movieId lookup.
ratings = ratings.assign(movieName=ratings['movieId'].map(dictionary))

In [9]:
# Preview the ratings table, now including the movieName column.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,movieName
0,1,2,3.5,1112486027,Jumanji (1995)
1,1,29,3.5,1112484676,"City of Lost Children, The (Cité des enfants p..."
2,1,32,3.5,1112484819,Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
3,1,47,3.5,1112484727,Seven (a.k.a. Se7en) (1995)
4,1,50,3.5,1112484580,"Usual Suspects, The (1995)"


Optionally save this new DataFrame to a CSV file for future use (the export below is commented out, so nothing is written unless you uncomment it):

In [10]:
#ratings.to_csv('newDF.csv', sep=',', index=False)

Use only a subset of movies to keep the computation tractable: the 600 movies with the most ratings.

In [11]:
# Number of most-rated movies to keep.
n = 600

# value_counts() sorts by frequency, so the first n labels are the n most-rated movies.
ratings_per_movie = ratings['movieId'].value_counts()
top_n = ratings_per_movie.index[:n]

# Keep only the ratings that belong to those movies.
keep_mask = ratings['movieId'].isin(top_n)
ratings = ratings[keep_mask]
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,movieName
0,1,2,3.5,1112486027,Jumanji (1995)
1,1,29,3.5,1112484676,"City of Lost Children, The (Cité des enfants p..."
2,1,32,3.5,1112484819,Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
3,1,47,3.5,1112484727,Seven (a.k.a. Se7en) (1995)
4,1,50,3.5,1112484580,"Usual Suspects, The (1995)"


Create wide matrix:

In [12]:
# Build the user x movie rating matrix: one row per userId, one column per
# movieId, NaN where the user has not rated the movie.
#
# Pivoting directly with `columns='movieId'` yields flat movieId column labels.
# The previous `index=[userId, movieId]` + `.unstack()` form produces
# MultiIndex columns ('rating', movieId) on pandas versions where pivot_table
# always returns a DataFrame, which breaks the later merge on 'movieId'.
# The string 'mean' is used instead of np.mean, which newer pandas deprecates
# as an aggfunc.
wideMatrix = ratings.pivot_table(values='rating',
                                 index='userId',
                                 columns='movieId',
                                 aggfunc='mean')

In [13]:
# Preview the top-left 5 x 5 corner of the matrix.
# `.ix` was removed from pandas; `.iloc` selects the corner by position and
# works regardless of how the row/column labels are typed.
wideMatrix.iloc[:5, :5]

movieId,1,2,3,5
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,3.5,,
2,,,4.0,
3,4.0,,,
4,,,,
5,,3.0,,


**Fill NaN values with 0:**

In [14]:
# Replace missing (user, movie) ratings with 0 so the matrix is fully numeric;
# wideMatrix itself is left untouched.
wideMatrix2=wideMatrix.fillna(0)

In [15]:
# Preview the zero-filled user x movie matrix.
wideMatrix2.head()

movieId,1,2,3,5,6,7,10,11,16,17,...,55820,56367,58559,59315,60069,63082,68157,68954,72998,79132
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,3.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,3.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**De-mean the matrix (subtract each user's row mean):**

In [16]:
# Plain NumPy view of the zero-filled matrix.
# `DataFrame.as_matrix()` was removed from pandas; `.values` is available in
# both old and current versions.
R = wideMatrix2.values

# Per-user mean over every column (zero-filled entries are included).
user_ratings_mean = R.mean(axis=1)

# Subtract each user's mean from that user's row (column vector broadcasts across movies).
R_demeaned = R - user_ratings_mean[:, np.newaxis]

**SVD decomposition of the de-meaned matrix (keeping 50 singular values):**

In [17]:
# Truncated SVD of the de-meaned matrix, keeping k = 50 singular values/vectors.
# `sigma` comes back as a 1-D array of singular values.
U, sigma, Vt = svds(R_demeaned, k = 50)

In [18]:
# Expand the 1-D array of singular values into a diagonal matrix for the
# matrix products below. Guarded on ndim so that re-running this cell is a
# no-op: calling np.diag on the already-2-D matrix would extract its
# diagonal and silently turn it back into a 1-D array.
if sigma.ndim == 1:
    sigma = np.diag(sigma)

**Compute the product $U \Sigma V^T$ and add the user means back to get the approximation matrix:**

In [19]:
# Low-rank reconstruction U * Sigma * V^T, shifted back by each user's mean
# (the mean was subtracted before the decomposition).
all_user_predicted_ratings = U.dot(sigma).dot(Vt) + user_ratings_mean[:, np.newaxis]

# Wrap the predictions in a DataFrame whose columns match the rating matrix.
preds_df = pd.DataFrame(data=all_user_predicted_ratings,
                        columns=wideMatrix2.columns)

In [20]:
# Preview the filtered ratings table that is passed to the recommender below.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,movieName
0,1,2,3.5,1112486027,Jumanji (1995)
1,1,29,3.5,1112484676,"City of Lost Children, The (Cité des enfants p..."
2,1,32,3.5,1112484819,Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
3,1,47,3.5,1112484727,Seven (a.k.a. Se7en) (1995)
4,1,50,3.5,1112484580,"Usual Suspects, The (1995)"


In [20]:
def recommend_movies(predictions_df, userId, movies_df, original_ratings_df, num_recommendations=5):
    """Recommend the highest-predicted movies that a user has not rated yet.

    Parameters
    ----------
    predictions_df : pandas.DataFrame
        Predicted ratings, one row per user and one column per movieId.
        The user's row is located as follows:

        * if the index is named ``'userId'``, by label;
        * otherwise, if the frame has exactly one row per distinct userId in
          ``original_ratings_df``, by the rank of ``userId`` among those
          sorted ids (the row order a user x movie pivot produces);
        * otherwise, by the legacy ``userId - 1`` offset.
    userId : int
        Id of the user to recommend for.
    movies_df : pandas.DataFrame
        Movie catalogue; must contain a ``movieId`` column.
    original_ratings_df : pandas.DataFrame
        Ratings with at least ``userId``, ``movieId`` and ``rating`` columns.
    num_recommendations : int, default 5
        Number of recommendations to return.

    Returns
    -------
    (user_full, recommendations) : tuple of pandas.DataFrame
        ``user_full`` holds the user's existing ratings joined with the movie
        information, best rated first. ``recommendations`` holds the unrated
        movies with the highest predicted rating, best first; the predicted
        rating column itself is not included.

    Raises
    ------
    KeyError
        If ``userId`` has no row in ``predictions_df`` (label or rank lookup).
    """
    # --- Locate the user's row of predictions -------------------------------
    # A plain `userId - 1` offset is only right when ids are contiguous from 1
    # and every user is present; users dropped by upstream filtering shift all
    # later rows, so prefer a label- or rank-based lookup when possible.
    if predictions_df.index.name == 'userId':
        user_predictions = predictions_df.loc[userId]
    else:
        known_users = np.sort(original_ratings_df['userId'].unique())
        if len(known_users) == len(predictions_df):
            position = int(np.searchsorted(known_users, userId))
            if position >= len(known_users) or known_users[position] != userId:
                raise KeyError('userId {0} has no row in predictions_df'.format(userId))
        else:
            # Row/user correspondence cannot be derived; keep the legacy offset.
            position = userId - 1
        user_predictions = predictions_df.iloc[position]

    # Prediction columns may be a MultiIndex such as ('rating', movieId);
    # keep only the innermost (movieId) level so the merge below lines up.
    if isinstance(user_predictions.index, pd.MultiIndex):
        user_predictions = user_predictions.copy()
        user_predictions.index = user_predictions.index.get_level_values(-1)

    # Name the values explicitly instead of relying on the row label.
    sorted_user_predictions = (user_predictions.sort_values(ascending=False)
                               .rename('Predictions')
                               .rename_axis('movieId')
                               .reset_index())

    # --- The user's existing ratings, with movie information merged in ------
    user_data = original_ratings_df[original_ratings_df.userId == userId]
    user_full = (user_data.merge(movies_df, how='left', on='movieId')
                 .sort_values(['rating'], ascending=False))

    print('User {0} has already rated {1} movies.'.format(userId, user_full.shape[0]))
    print('Recommending the highest {0} predicted ratings movies not already rated.'.format(num_recommendations))

    # --- Highest predicted movies the user has not rated yet -----------------
    not_yet_rated = movies_df[~movies_df['movieId'].isin(user_full['movieId'])]
    recommendations = (not_yet_rated.merge(sorted_user_predictions, how='left', on='movieId')
                       .sort_values('Predictions', ascending=False)
                       # Trailing column is 'Predictions'; it is dropped from the output.
                       .iloc[:num_recommendations, :-1])

    return user_full, recommendations

List of users we can check with SVD and then compare against the Factorization Machine method:

check=[8405, 34576, 59477, 74142, 79159, 82418, 118205, 121535, 125794, 131904]

In [21]:
# Top-10 recommendations for user 125794, plus that user's existing ratings.
already_rated, predictions = recommend_movies(
    predictions_df=preds_df,
    userId=125794,
    movies_df=movies,
    original_ratings_df=ratings,
    num_recommendations=10,
)

User 125794 has already rated 559 movies.
Recommending the highest 10 predicted ratings movies not already rated.


In [22]:
# First ten rows of the movies this user has already rated.
already_rated.head(10)

Unnamed: 0,userId,movieId,rating,timestamp,movieName,title,genres
0,125794,1,5.0,981448353,Toy Story (1995),Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
185,125794,1136,5.0,967620201,Monty Python and the Holy Grail (1975),Monty Python and the Holy Grail (1975),Adventure|Comedy|Fantasy
365,125794,2455,5.0,975250706,"Fly, The (1986)","Fly, The (1986)",Drama|Horror|Sci-Fi|Thriller
190,125794,1196,5.0,965293814,Star Wars: Episode V - The Empire Strikes Back...,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi
189,125794,1193,5.0,967111749,One Flew Over the Cuckoo's Nest (1975),One Flew Over the Cuckoo's Nest (1975),Drama
367,125794,2490,5.0,965387401,Payback (1999),Payback (1999),Action|Thriller
187,125794,1172,5.0,967112014,Cinema Paradiso (Nuovo cinema Paradiso) (1989),Cinema Paradiso (Nuovo cinema Paradiso) (1989),Drama
368,125794,2502,5.0,975251096,Office Space (1999),Office Space (1999),Comedy|Crime
369,125794,2529,5.0,967620502,Planet of the Apes (1968),Planet of the Apes (1968),Action|Drama|Sci-Fi
193,125794,1199,5.0,967619652,Brazil (1985),Brazil (1985),Fantasy|Sci-Fi


In [23]:
# Show the recommended movies returned by recommend_movies.
predictions

Unnamed: 0,movieId,title,genres
9,17,Sense and Sensibility (1995),Drama|Romance
2,7,Sabrina (1995),Comedy|Romance
77,105,"Bridges of Madison County, The (1995)",Drama|Romance
631,783,"Hunchback of Notre Dame, The (1996)",Animation|Children|Drama|Musical|Romance
0,3,Grumpier Old Men (1995),Comedy|Romance
3469,4018,What Women Want (2000),Comedy|Romance
270,345,"Adventures of Priscilla, Queen of the Desert, ...",Comedy|Drama
3475,4025,Miss Congeniality (2000),Comedy|Crime
2113,2572,10 Things I Hate About You (1999),Comedy|Romance
580,724,"Craft, The (1996)",Drama|Fantasy|Horror|Thriller
