In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.formula.api import ols
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats.stats import pearsonr
import scipy as sp
from surprise.model_selection import train_test_split
from surprise.prediction_algorithms import knns
from surprise.similarities import cosine, msd, pearson
from surprise import accuracy
from surprise.model_selection import cross_validate
from surprise import Reader, Dataset
from surprise.prediction_algorithms import SVD, SVDpp
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline, KNNWithZScore
from surprise.model_selection import GridSearchCV
from surprise.prediction_algorithms.random_pred import NormalPredictor
from surprise.prediction_algorithms.baseline_only import BaselineOnly
from pyspark.ml.recommendation import ALS

# EDA
number of ratings per movie distribution
number of ratings per user distribution
- x = df.groupby('userId')['rating'].count()#.clip(upper=50)
- df.groupby('userId')['rating'].count().reset_index().sort_values('rating', ascending=False)[:10]


# Import and Fromat Cleaned Data

In [2]:
df = pd.read_pickle('cleaned.pickle')

In [3]:
df.head()

Unnamed: 0,userId,movieId,rating,title,genres,year
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,5,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
2,7,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
3,15,1,2.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
4,17,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995


In [4]:
df.drop(columns = ['genres', 'year', 'title'], axis=1, inplace=True)

In [5]:
df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,5,1,4.0
2,7,1,4.5
3,15,1,2.5
4,17,1,4.5


In [6]:
#transform dataframe to be compatible with surprise
reader = Reader()
data = Dataset.load_from_df(df,reader)

# SVD with GridSearch CV

In [10]:
## gridsearch with SVD
params = {'n_factors': [20, 30, 50, 100],
         'reg_all': [0.02, 0.05, 0.1],
         'n_epochs': [5, 10, 20]}
g_s_svd = GridSearchCV(SVD,param_grid=params,n_jobs=-1)
g_s_svd.fit(data)

In [11]:
print(g_s_svd.best_score)
print(g_s_svd.best_params)

{'rmse': 0.8684097382262618, 'mae': 0.667387809862004}
{'rmse': {'n_factors': 50, 'reg_all': 0.05, 'n_epochs': 20}, 'mae': {'n_factors': 50, 'reg_all': 0.05, 'n_epochs': 20}}


In [25]:
sim_options1 = {'name': 'cosine', 'user_based': True}
results = []
# Iterate over algorithms
for alg in [SVD(), SVDpp(), NormalPredictor(), KNNBaseline(sim_options = sim_options1), KNNBasic(sim_options = sim_options1), KNNWithMeans(sim_options = sim_options1), KNNWithZScore(sim_options = sim_options1)]:
    # cross validate
    algo = cross_validate(alg, data, measures=['RMSE'], cv=3, verbose=False)
    
    # create dataframe with results
    df3 = pd.DataFrame.from_dict(algo).mean(axis=0)
    df3 = df3.append(pd.Series([str(alg).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    results.append(df3)
    
pd.DataFrame(results).set_index('Algorithm').sort_values('test_rmse') 

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity mat

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVDpp,0.86961,341.409628,12.049438
SVD,0.879244,3.799888,0.255014
KNNBaseline,0.885382,0.414946,2.636843
KNNWithZScore,0.906069,0.331403,2.337467
KNNWithMeans,0.906747,0.276943,2.165087
KNNBasic,0.978291,0.256773,1.922483
NormalPredictor,1.425939,0.10331,0.264927


In [27]:
sim_options2 = {'name': 'pearson', 'user_based': True}
results = []
# Iterate over algorithms
for alg in [SVD(), SVDpp(), NormalPredictor(), KNNBaseline(sim_options = sim_options2), KNNBasic(sim_options = sim_options2), KNNWithMeans(sim_options = sim_options2), KNNWithZScore(sim_options = sim_options2)]:
    # cross validate
    algo = cross_validate(alg, data, measures=['RMSE'], cv=3, verbose=False)
    
    # create dataframe with results
    df3 = pd.DataFrame.from_dict(algo).mean(axis=0)
    df3 = df3.append(pd.Series([str(alg).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    results.append(df3)
    
pd.DataFrame(results).set_index('Algorithm').sort_values('test_rmse') 

Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing si

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVDpp,0.867202,326.541624,11.781793
SVD,0.880109,3.710124,0.250061
KNNBaseline,0.888401,0.571064,2.419011
KNNWithZScore,0.903322,0.409304,2.09584
KNNWithMeans,0.908064,0.371157,1.933916
KNNBasic,0.984304,0.358489,1.852557
NormalPredictor,1.423353,0.10435,0.229173


In [8]:
trainset, testset = train_test_split(data, test_size=0.2)
dataset = data.build_full_trainset()
print('Number of users: ', trainset.n_users, '\n')
print('Number of items: ', trainset.n_items)

Number of users:  610 

Number of items:  8873


In [None]:
svd = SVDpp(n_factors=50, n_epochs=100, lr_all=0.005, reg_all=0.1)
svd.fit(trainset)
predictions = svd.test(testset)
print(accuracy.rmse(predictions))

In [None]:
#fit best model to dataset
svd = SVDpp(n_factors=50, n_epochs=50, lr_all=0.005, reg_all=0.1)
svd.fit(dataset)

In [37]:
#predict rating for movie 4 for user 2
svd.predict(2, 4)

Prediction(uid=2, iid=4, r_ui=None, est=2.7844824608391456, details={'was_impossible': False})

In [4]:
#import movie df
df_movie = pd.read_csv('movies.csv')

In [5]:
df_movie.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
#new user function that takes in movie df, number of user ratings, and genre to create a new user
def movie_rater(df_movie,num, genre=None):
    userID = 1005
    rating_list = []
    while num > 0:
        if genre:
            movie = df_movie[df_movie['genres'].str.contains(genre)].sample(1)
        else:
            movie = df_movie.sample(1)
        print(movie)
        rating = input('How do you rate this movie on a scale of 1-5, press n if you have not seen :\n')
        if rating == 'n':
            continue
        else:
            rating_one_movie = {'userId':userID,'movieId':movie['movieId'].values[0],'rating':rating}
            rating_list.append(rating_one_movie) 
            num -= 1
    return rating_list      

In [125]:
user_rating = movie_rater(df_movie, 4, 'Action')

      movieId                           title  \
9710   187595  Solo: A Star Wars Story (2018)   

                                genres  
9710  Action|Adventure|Children|Sci-Fi  
How do you rate this movie on a scale of 1-5, press n if you have not seen :
5
      movieId                                       title              genres
5466    26169  Branded to Kill (Koroshi no rakuin) (1967)  Action|Crime|Drama
How do you rate this movie on a scale of 1-5, press n if you have not seen :
2
      movieId                     title                 genres
4615     6874  Kill Bill: Vol. 1 (2003)  Action|Crime|Thriller
How do you rate this movie on a scale of 1-5, press n if you have not seen :
5
      movieId                            title                        genres
4045     5746  Galaxy of Terror (Quest) (1981)  Action|Horror|Mystery|Sci-Fi
How do you rate this movie on a scale of 1-5, press n if you have not seen :
3


In [107]:
user_rating

[{'userId': 1000, 'movieId': 379, 'rating': '4'},
 {'userId': 1000, 'movieId': 1497, 'rating': '4'},
 {'userId': 1000, 'movieId': 4721, 'rating': '4'},
 {'userId': 1000, 'movieId': 9, 'rating': '4'}]

In [43]:
# add the new user ratings to the original ratings DataFrame (df)
new_ratings_df = df.append(user_rating,ignore_index=True)
new_data = Dataset.load_from_df(new_ratings_df,reader)

In [46]:
# make predictions for the user
list_of_movies = []
for m_id in new_ratings_df['movieId'].unique():
    list_of_movies.append((m_id,svd.predict(1000,m_id)[3]))

In [54]:
# order the predictions from highest to lowest rated
ranked_movies = sorted(list_of_movies, key=lambda x:x[1], reverse=True)

In [102]:
# return the top n recommendations using the 
def recommended_movies(user_ratings,movie_title_df,n):
        for idx, rec in enumerate(user_ratings):
            title = movie_title_df.loc[movie_title_df['movieId'] == int(rec[0])]['title']
            print('Recommendation # ', idx+1, ': ', title, '\n')
            #print(int(rec[0]))
            #print(idx)
            n-= 1
            if n == 0:
                break
            
recommended_movies(ranked_movies,df_movie,5)

Recommendation #  1 :  9618    Three Billboards Outside Ebbing, Missouri (2017)
Name: title, dtype: object 

177593
0
Recommendation #  2 :  4396    Trial, The (Procès, Le) (1962)
Name: title, dtype: object 

6460
1
Recommendation #  3 :  5621    Neon Genesis Evangelion: The End of Evangelion...
Name: title, dtype: object 

27156
2
Recommendation #  4 :  2582    Guess Who's Coming to Dinner (1967)
Name: title, dtype: object 

3451
3
Recommendation #  5 :  4782    Adam's Rib (1949)
Name: title, dtype: object 

7121
4


Try and chain all of the steps together into one function that asks users for ratings for a certain number of movies, then all of the above steps are performed to return the top $n$ recommendations
Make a recommender system that only returns items that come from a specified genre

In [18]:
def top_recs(df_movie, num, genre=None):
    x=movie_rater(df_movie,num, genre=None)
    new_ratings_df = df.append(x,ignore_index=True)
    new_data = Dataset.load_from_df(new_ratings_df,reader)
    # make predictions for the user
    list_of_movies = []
    for m_id in new_ratings_df['movieId'].unique():
        list_of_movies.append( (m_id,svd.predict(1005,m_id)[3]))
    # order the predictions from highest to lowest rated
    ranked_movies = sorted(list_of_movies, key=lambda x:x[1], reverse=True)
    n=5
    for idx, rec in enumerate(ranked_movies):
        title = df_movie.loc[df_movie['movieId'] == int(rec[0])]['title']
        print('Recommendation # ', idx+1, ': ', title, '\n')
        print(rec)
        n-= 1
        if n == 0:
            break

In [19]:
top_recs(df_movie, 4)

      movieId                             title                genres
1183     1580  Men in Black (a.k.a. MIB) (1997)  Action|Comedy|Sci-Fi
How do you rate this movie on a scale of 1-5, press n if you have not seen :
3
      movieId                                              title  \
5858    32657  Man Who Planted Trees, The (Homme qui plantait...   

               genres  
5858  Animation|Drama  
How do you rate this movie on a scale of 1-5, press n if you have not seen :
3
    movieId              title                 genres
20       21  Get Shorty (1995)  Comedy|Crime|Thriller
How do you rate this movie on a scale of 1-5, press n if you have not seen :
3
      movieId                                         title     genres
9601   176051  LEGO DC Super Hero Girls: Brain Drain (2017)  Animation
How do you rate this movie on a scale of 1-5, press n if you have not seen :
3


NameError: name 'svd' is not defined