In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re

from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
# Initialise plotly's offline notebook mode; connected=True loads plotly.js from the CDN
# instead of embedding it in the notebook.
py.init_notebook_mode(connected=True)

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/scipy — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Global matplotlib look: FiveThirtyEight style sheet and an 18x8 default figure size.
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = [18, 8]

In [2]:
# Load the three tab-separated, latin-1 encoded tables, keeping only the columns used below.
CSV_OPTIONS = dict(sep='\t', encoding='latin-1')

reviews = pd.read_csv(
    'ratings.csv',
    usecols=['user_id', 'movie_id', 'rating', 'timestamp'],
    **CSV_OPTIONS
)
users = pd.read_csv(
    'users.csv',
    usecols=['user_id', 'gender', 'age', 'occupation', 'zipcode'],
    **CSV_OPTIONS
)
movies = pd.read_csv(
    'movies.csv',
    usecols=['movie_id', 'title', 'genres'],
    **CSV_OPTIONS
)

In [3]:
# Preview the first rows of the movies table (movie_id, title, genres).
movies.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Preview the first rows of the users table (user_id, gender, age, occupation, zipcode).
users.head()

Unnamed: 0,user_id,gender,age,occupation,zipcode
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [5]:
# Preview the first rows of the ratings table (user_id, movie_id, rating, timestamp).
reviews.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [6]:
# Report (rows, columns) for each of the three loaded tables.
for label, frame in (('Reviews shape:', reviews),
                     ('Users shape:', users),
                     ('Movies shape:', movies)):
    print(label, frame.shape)

Reviews shape: (1000209, 4)
Users shape: (6040, 5)
Movies shape: (3883, 3)


In [7]:
# Drop the columns that are not used anywhere below.
# Re-binding the result (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError on the
# already-removed column.
reviews = reviews.drop(columns=['timestamp'], errors='ignore')
users = users.drop(columns=['zipcode'], errors='ignore')

In [8]:
# Pull the four-digit year out of a trailing "(YYYY)" in the title.
# The year group is optional in the pattern, so a title without one yields NaN.
# The extracted values are strings, not integers.
YEAR_SUFFIX_PATTERN = r'(?:\((\d{4})\))?\s*$'
release_years = movies['title'].str.extract(YEAR_SUFFIX_PATTERN, expand=False)
movies['release_year'] = release_years

In [9]:
# Check that release_year was added to the movies table.
movies.head()

Unnamed: 0,movie_id,title,genres,release_year
0,1,Toy Story (1995),Animation|Children's|Comedy,1995
1,2,Jumanji (1995),Adventure|Children's|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995
3,4,Waiting to Exhale (1995),Comedy|Drama,1995
4,5,Father of the Bride Part II (1995),Comedy,1995


In [10]:
# NOTE(review): identical to the preceding cell (movies is not modified in between);
# this preview is a candidate for removal.
movies.head()

Unnamed: 0,movie_id,title,genres,release_year
0,1,Toy Story (1995),Animation|Children's|Comedy,1995
1,2,Jumanji (1995),Adventure|Children's|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995
3,4,Waiting to Exhale (1995),Comedy|Drama,1995
4,5,Father of the Bride Part II (1995),Comedy,1995


In [11]:
# Attach movie and user attributes to every rating row.
# Left joins keep each rating even if a lookup row is missing. validate='many_to_one' makes
# pandas raise a MergeError if movie_id / user_id is duplicated in the lookup table, which
# would otherwise silently multiply rating rows and skew everything computed downstream.
final_df = (
    reviews
    .merge(movies, on='movie_id', how='left', validate='many_to_one')
    .merge(users, on='user_id', how='left', validate='many_to_one')
)

print('final_df shape:', final_df.shape)

final_df shape: (1000209, 9)


In [12]:
# Preview the merged ratings + movie + user table.
final_df.head()

Unnamed: 0,user_id,movie_id,rating,title,genres,release_year,gender,age,occupation
0,1,1193,5,One Flew Over the Cuckoo's Nest (1975),Drama,1975,F,1,10
1,1,661,3,James and the Giant Peach (1996),Animation|Children's|Musical,1996,F,1,10
2,1,914,3,My Fair Lady (1964),Musical|Romance,1964,F,1,10
3,1,3408,4,Erin Brockovich (2000),Drama,2000,F,1,10
4,1,2355,5,"Bug's Life, A (1998)",Animation|Children's|Comedy,1998,F,1,10


In [13]:
# Count the distinct users and movies that appear in at least one rating.
n_users = final_df.user_id.nunique()
n_movies = final_df.movie_id.nunique()

for caption, count in (('Number of users:', n_users),
                       ('Number of movies:', n_movies)):
    print(caption, count)

Number of users: 6040
Number of movies: 3706


In [14]:
# Reshape the long rating table into a dense user x movie matrix.
# (user, movie) pairs without a rating are filled with 0.
final_df_matrix = (
    final_df
    .pivot(index='user_id', columns='movie_id', values='rating')
    .fillna(0)
)

In [15]:
# Preview the user x movie matrix (rows: user_id, columns: movie_id, 0.0 = no rating).
final_df_matrix.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Centre every user's row on that user's mean rating before factorising.
# NOTE(review): the mean is taken over all movie columns, including the zero-filled
# unrated cells, so it is far below the user's average given rating — confirm this is intended.
rating_values = final_df_matrix.to_numpy()
user_ratings_mean = rating_values.mean(axis=1)
ratings_demeaned = rating_values - user_ratings_mean[:, np.newaxis]

In [17]:
# Check data sparsity: the share of user/movie pairs that have no rating.

sparsity = round(1.0 - final_df.shape[0] / float(n_users * n_movies), 3)
# Use the percent format specifier instead of str(sparsity * 100): multiplying the rounded
# float by 100 can print artefacts such as 95.69999999999999 for other data sets.
print('The sparsity level of MovieLens1M dataset is {:.1%}'.format(sparsity))

The sparsity level of MovieLens1M dataset is 95.5%


In [18]:
from scipy.sparse.linalg import svds

# Truncated SVD of the centred rating matrix, keeping only the largest singular triplets.
N_LATENT_FACTORS = 50  # number of singular values and vectors to compute
U, sigma, Vt = svds(ratings_demeaned, k=N_LATENT_FACTORS)

In [19]:
# svds returns the singular values as a 1-D vector; expand them into a diagonal matrix for
# the reconstruction product in the next cell.
# The ndim guard keeps this cell idempotent: np.diag applied to an already 2-D array would
# extract the diagonal back into a vector, so running the cell twice would break the
# matrix product that follows.
if sigma.ndim == 1:
    sigma = np.diag(sigma)

In [20]:
# Rebuild the low-rank rating estimates, then add each user's mean back onto their row.
low_rank_estimate = (U @ sigma) @ Vt
all_user_predicted_ratings = low_rank_estimate + user_ratings_mean[:, np.newaxis]

In [21]:
# Wrap the reconstructed matrix in a DataFrame whose columns are the movie_ids.
# No index is passed, so rows carry a default 0-based positional index
# (row i corresponds to the i-th row of final_df_matrix).
movie_columns = final_df_matrix.columns
preds = pd.DataFrame(all_user_predicted_ratings, columns=movie_columns)

preds.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
0,4.288861,0.143055,-0.19508,-0.018843,0.012232,-0.176604,-0.07412,0.141358,-0.059553,-0.19595,...,0.027807,0.00164,0.026395,-0.022024,-0.085415,0.403529,0.105579,0.031912,0.05045,0.08891
1,0.744716,0.169659,0.335418,0.000758,0.022475,1.35305,0.051426,0.071258,0.161601,1.567246,...,-0.056502,-0.013733,-0.01058,0.062576,-0.016248,0.15579,-0.418737,-0.101102,-0.054098,-0.140188
2,1.818824,0.456136,0.090978,-0.043037,-0.025694,-0.158617,-0.131778,0.098977,0.030551,0.73547,...,0.040481,-0.005301,0.012832,0.029349,0.020866,0.121532,0.076205,0.012345,0.015148,-0.109956
3,0.408057,-0.07296,0.039642,0.089363,0.04195,0.237753,-0.049426,0.009467,0.045469,-0.11137,...,0.008571,-0.005425,-0.0085,-0.003417,-0.083982,0.094512,0.057557,-0.02605,0.014841,-0.034224
4,1.574272,0.021239,-0.0513,0.246884,-0.032406,1.552281,-0.19963,-0.01492,-0.060498,0.450512,...,0.110151,0.04601,0.006934,-0.01594,-0.05008,-0.052539,0.507189,0.03383,0.125706,0.199244


In [22]:
def recommend_movies(predictions, user_id, movies, reviews, num_recommendations):
    """Return a user's highest-rated movies and the best-predicted movies they have not rated.

    Parameters
    ----------
    predictions : pandas.DataFrame
        Predicted ratings, one row per user and one column per movie_id. Rows are looked
        up by position: the row for ``user_id`` is expected at position ``user_id - 1``.
    user_id : int
        1-based user identifier.
    movies : pandas.DataFrame
        Movie metadata; must contain ``movie_id`` and ``release_year`` columns.
    reviews : pandas.DataFrame
        Ratings; must contain ``user_id``, ``movie_id`` and ``rating`` columns.
    num_recommendations : int
        Number of movies to recommend.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The user's ten highest-rated movies (ratings merged with movie metadata), and the
        ``num_recommendations`` unrated movies with the highest predicted rating, ordered
        by ``release_year`` descending. The predicted score itself is not returned.

    Raises
    ------
    IndexError
        If ``user_id`` does not map to a row of ``predictions``.
    """
    user_row_number = user_id - 1  # User ID starts at 1, not 0

    # Reject ids that fall outside the matrix. Without this check a user_id <= 0 would be
    # turned into a negative position and silently return another user's predictions.
    if not 0 <= user_row_number < len(predictions):
        raise IndexError(
            'user_id {0} is out of range for a prediction matrix with {1} rows'.format(
                user_id, len(predictions)))

    # Take the user's row from the *passed-in* predictions (not a notebook global) and name
    # the score column and the id column explicitly, so nothing depends on the row label
    # or on the name of the column index.
    user_predictions = (predictions.iloc[user_row_number]
                        .rename('Predictions')
                        .rename_axis('movie_id')
                        .sort_values(ascending=False)
                        .reset_index())

    # Get the user's ratings and merge in the movie information, best ratings first.
    user_data = reviews[reviews.user_id == user_id]
    user_full = (user_data
                 .merge(movies, how='left', on='movie_id')
                 .sort_values(['rating'], ascending=False))

    print('User {0} has already rated {1} movies.'.format(user_id, user_full.shape[0]))
    print('Recommending highest {0} predicted ratings movies not already rated.'.format(num_recommendations))

    # Recommend the highest predicted rating movies that the user hasn't rated yet.
    # Movies missing from the prediction matrix get NaN and therefore sort last.
    not_yet_rated = movies[~movies['movie_id'].isin(user_full['movie_id'])]
    recommendations = (not_yet_rated
                       .merge(user_predictions, how='left', on='movie_id')
                       .sort_values('Predictions', ascending=False)
                       .iloc[:num_recommendations]
                       .drop(columns='Predictions'))

    # Present the chosen recommendations newest release first.
    return user_full.head(10), recommendations.sort_values('release_year', ascending=False)

In [23]:
# Produce recommendations for one example user.
TARGET_USER_ID = 1920
TOP_N = 10

user_already_rated, for_recommend = recommend_movies(
    predictions=preds,
    user_id=TARGET_USER_ID,
    movies=movies,
    reviews=reviews,
    num_recommendations=TOP_N)

User 1920 has already rated 601 movies.
Recommending highest 10 predicted ratings movies not already rated.


In [24]:
# Show the example user's ten highest-rated movies returned by recommend_movies.
user_already_rated

Unnamed: 0,user_id,movie_id,rating,title,genres,release_year
292,1920,802,5,Phenomenon (1996),Drama|Romance,1996
341,1920,2194,5,"Untouchables, The (1987)",Action|Crime|Drama,1987
33,1920,592,5,Batman (1989),Action|Adventure|Crime|Drama,1989
550,1920,1210,5,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Romance|Sci-Fi|War,1983
472,1920,3704,5,Mad Max Beyond Thunderdome (1985),Action|Sci-Fi,1985
542,1920,2944,5,"Dirty Dozen, The (1967)",Action|War,1967
255,1920,3688,5,Porky's (1981),Comedy,1981
41,1920,1270,5,Back to the Future (1985),Comedy|Sci-Fi,1985
42,1920,1271,5,Fried Green Tomatoes (1991),Drama,1991
433,1920,182,5,Moonlight and Valentino (1995),Drama|Romance,1995


In [25]:
# Show the recommended movies for the example user, ordered by release_year descending.
for_recommend

Unnamed: 0,movie_id,title,genres,release_year
1117,1372,Star Trek VI: The Undiscovered Country (1991),Action|Adventure|Sci-Fi,1991
1052,1291,Indiana Jones and the Last Crusade (1989),Action|Adventure,1989
1585,1962,Driving Miss Daisy (1989),Drama,1989
1119,1378,Young Guns (1988),Action|Comedy|Western,1988
1043,1275,Highlander (1986),Action|Adventure,1986
1036,1263,"Deer Hunter, The (1978)",Drama|War,1978
2823,3421,Animal House (1978),Comedy,1978
1035,1262,"Great Escape, The (1963)",Adventure|War,1963
2412,2949,Dr. No (1962),Action,1962
1681,2096,Sleeping Beauty (1959),Animation|Children's|Musical,1959
