In [1]:
import numpy as np
import pandas as pd
import os
from IPython.core.display import display, HTML, clear_output
# Inject a CSS rule that sets the notebook's .container element to 80% width.
display(HTML("<style>.container { width:80% !important; }</style>"))
from scipy.sparse import csc_matrix
from sparsesvd import sparsesvd

# Import ratings data (including user data)

In [2]:
# Read the combined ratings CSV from ../data, relative to the working directory.
cwd = os.getcwd()
ratings_path = os.path.join(cwd, "..", "data", "ratings_combined_ryan.csv")
ratings = pd.read_csv(ratings_path)

In [3]:
# Preview the first five rows of the ratings table.
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


# Create Compressed Sparse Column matrix

In [4]:
# Matrix dimensions: number of distinct users (N1) and distinct movies (N2).
N1, N2 = (ratings[col].nunique() for col in ('userId', 'movieId'))

In [5]:
# Distinct raw ids, in order of first appearance in the ratings table.
uids_raw = pd.unique(ratings['userId'])
iids_raw = pd.unique(ratings['movieId'])

In [6]:
# Contiguous zero-based "inner" ids: 0..N1-1 for users, 0..N2-1 for movies.
uids_inner, iids_inner = np.arange(N1), np.arange(N2)

In [7]:
# Lookup tables between raw ids (as found in the CSV) and inner ids
# (zero-based positions), in both directions for users and for movies.
uid_maptoraw = {inner: raw for inner, raw in zip(uids_inner, uids_raw)}
uid_maptoinner = {raw: inner for raw, inner in zip(uids_raw, uids_inner)}
iid_maptoraw = {inner: raw for inner, raw in zip(iids_inner, iids_raw)}
iid_maptoinner = {raw: inner for raw, inner in zip(iids_raw, iids_inner)}

In [8]:
# Relabel the id columns as "raw" ids; the frame is modified in place.
raw_id_columns = {'userId': 'uid_raw', 'movieId': 'iid_raw'}
ratings.rename(columns=raw_id_columns, inplace=True)

In [9]:
# Confirm the renamed id columns.
ratings.head(n=5)

Unnamed: 0,uid_raw,iid_raw,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


In [10]:
# Translate each raw user id to its inner (zero-based) id.
# Series.map performs the dict lookup column-wise; the previous row-wise
# DataFrame.apply(axis=1) built a Series object per row, which is far slower
# on a ratings table of this size. uid_maptoinner is built from this column's
# own unique values, so every id has an entry.
ratings['uid_inner'] = ratings['uid_raw'].map(uid_maptoinner)

In [11]:
# Translate each raw movie id to its inner (zero-based) id.
# Series.map performs the dict lookup column-wise; the previous row-wise
# DataFrame.apply(axis=1) built a Series object per row, which is far slower
# on a ratings table of this size. iid_maptoinner is built from this column's
# own unique values, so every id has an entry.
ratings['iid_inner'] = ratings['iid_raw'].map(iid_maptoinner)

In [12]:
# Build the N1 x N2 user-by-movie rating matrix in CSC format from
# (value, (row, column)) triplets: rows are inner user ids, columns are
# inner movie ids.
rating_values = ratings['rating']
row_col_index = (ratings['uid_inner'], ratings['iid_inner'])
compressed_matrix = csc_matrix((rating_values, row_col_index), shape=(N1, N2))

# Perform SVD Matrix Factorization

In [13]:
# Factorize the rating matrix, requesting 50 factors from sparsesvd.
n_factors = 50
ut, s, vt = sparsesvd(compressed_matrix, n_factors)

In [14]:
# Show the shape of each factor returned by the decomposition.
tuple(factor.shape for factor in (ut, s, vt))

((50L, 138494L), (50L,), (50L, 26744L))

# Make user predictions

In [15]:
# Select the user with the largest raw id and look up its inner id.
user_uid_raw = ratings['uid_raw'].max()
print('user raw user id is: {}'.format(user_uid_raw))
user_uid_inner = uid_maptoinner[user_uid_raw]
print('user inner user id is: {}'.format(user_uid_inner))

user raw user id is: 138494
user inner user id is: 138493


### Dot product

In [16]:
# Expand the singular values into a diagonal matrix exactly once.
# np.diag on a 2-D array extracts the diagonal instead of building one, so
# re-running an unguarded `s = np.diag(s)` would flip `s` back to 1-D and
# silently change the shape of `preds`. The guard makes this cell idempotent.
if s.ndim == 1:
    s = np.diag(s)
# Predicted ratings for the selected user: (user factors) . S . (item factors).
preds = ut[:, user_uid_inner].dot(s).dot(vt)
preds.shape

(26744L,)

In [17]:
# Turn the prediction vector into a table with one row per movie.
# The positional index of the vector is the inner movie id, so it is exposed
# as a column and then translated back to the raw movie id.
preds = pd.DataFrame(preds, columns=['predicted_rating'])
# Return new frames instead of mutating in place, so the steps read as a chain.
preds = preds.reset_index().rename(columns={'index': 'iid_inner'})
# Column-wise dict lookup replaces the row-wise apply(axis=1).
preds['iid_raw'] = preds['iid_inner'].map(iid_maptoraw)

In [18]:
# Summary statistics of the predicted ratings.
predicted = preds['predicted_rating']
predicted.describe()

count    26744.000000
mean         0.057402
std          0.258901
min         -1.159863
25%         -0.000216
50%          0.002561
75%          0.016158
max          6.194195
Name: predicted_rating, dtype: float64

# Join movie titles and genres

In [19]:
# Read the movie metadata CSV from ../data, relative to the working directory.
cwd = os.getcwd()
movies_path = os.path.join(cwd, "..", "data", "movies.csv")
movies = pd.read_csv(movies_path)

In [20]:
# Attach title and genres to each prediction (inner join on the raw movie id),
# then order the rows from highest to lowest predicted rating.
movie_info = movies[['movieId', 'title', 'genres']]
preds = (
    preds
    .merge(movie_info, left_on='iid_raw', right_on='movieId')
    .sort_values('predicted_rating', ascending=False)
)

In [21]:
# Top 20 predictions, including movies the user may already have rated.
preds.head(n=20)

Unnamed: 0,iid_inner,predicted_rating,iid_raw,movieId,title,genres
2087,2087,6.194195,79132,79132,Inception (2010),Action|Crime|Drama|Mystery|Sci-Fi|Thriller|IMAX
96,96,4.931815,2959,2959,Fight Club (1999),Action|Crime|Drama|Thriller
11,11,4.738602,296,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
1856,1856,4.42631,91529,91529,"Dark Knight Rises, The (2012)",Action|Adventure|Crime|IMAX
1854,1854,4.399259,89745,89745,"Avengers, The (2012)",Action|Adventure|Sci-Fi|IMAX
4,4,4.152606,50,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
158,158,4.079372,7153,7153,"Lord of the Rings: The Return of the King, The...",Action|Adventure|Drama|Fantasy
3081,3081,4.063217,99114,99114,Django Unchained (2012),Action|Drama|Western
227,227,4.044806,1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
142,142,4.002157,5952,5952,"Lord of the Rings: The Two Towers, The (2002)",Adventure|Fantasy


# Filter out movies that the user already rated

In [24]:
# Load the user's own ratings; the first CSV column is used as the index.
profile_csv = 'ryan_profile.csv'
user_profile = pd.read_csv(profile_csv, index_col=0)

In [26]:
# Preview the first five rows of the user's profile.
user_profile.head(n=5)

Unnamed: 0,movieId,rating
0,71518,4
1,73268,5
2,5481,3
3,43679,3
4,4369,3


In [27]:
# Left-join the user's profile onto the predictions by movieId; prediction rows
# with no matching profile row get a null in the joined columns.
preds = preds.merge(user_profile, how='left', on='movieId')

In [28]:
# True where the joined 'rating' is null, i.e. no profile row matched the movie.
havent_seen_it_mask = pd.isnull(preds['rating'])

In [31]:
# Write the first 2000 unrated movies (in the current row order of preds)
# to CSV, without the index column.
recommendation_columns = ['title', 'genres', 'predicted_rating']
unseen_top = preds.loc[havent_seen_it_mask, recommendation_columns].iloc[:2000]
unseen_top.to_csv('movie_recommendations_ryan_SVD.csv', index=False)

In [30]:
# Show the first 30 unrated movies with their predicted ratings.
preds.loc[havent_seen_it_mask, ['title', 'genres', 'predicted_rating']].head(n=30)

Unnamed: 0,title,genres,predicted_rating
12,Avatar (2009),Action|Adventure|Sci-Fi|IMAX,3.883172
20,Inglourious Basterds (2009),Action|Drama|War,3.52595
24,"Hangover, The (2009)",Comedy|Crime,3.373318
25,"King's Speech, The (2010)",Drama,3.317078
31,How to Train Your Dragon (2010),Adventure|Animation|Children|Fantasy|IMAX,3.229924
32,Sherlock Holmes (2009),Action|Crime|Mystery|Thriller,3.186106
38,Skyfall (2012),Action|Adventure|Thriller|IMAX,3.051995
39,Pirates of the Caribbean: The Curse of the Bla...,Action|Adventure|Comedy|Fantasy,3.020261
41,Source Code (2011),Action|Drama|Mystery|Sci-Fi|Thriller,2.995122
49,Zombieland (2009),Action|Comedy|Horror,2.680397
