## Movie Recommendation System Models

In [1]:
#Import necessary libraries
import numpy as np
import pandas as pd
from surprise import SVD
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline
from surprise import Dataset, Reader
from surprise import accuracy
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV


In [2]:
#Import data into a DataFrame and drop unnecessary columns 
df = pd.read_csv('cleaneddata', index_col=False)
df2 = df.drop(['Unnamed: 0', 'title', 'genres', 'year'], axis=1)

In [3]:
#Look at new dataframe 
df2.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [4]:
#Look at the distribution of ratings again 
df2.rating.value_counts()

4.0    26815
3.0    20044
5.0    13207
3.5    13134
4.5     8549
2.0     7550
2.5     5550
1.0     2810
1.5     1791
0.5     1369
Name: rating, dtype: int64

In [6]:
#Instansiate reader and data 
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(df2, reader) 

In [7]:
#Train test split 
trainset, testset = train_test_split(data, test_size=.2)

In [8]:
# Print number of uses and items for the trainset 
print('Number of users in train set : ', trainset.n_users, '\n')
print('Number of items in train set : ', trainset.n_items, '\n')


Number of users in train set :  610 

Number of items in train set :  8972 



### Baseline Model

In [9]:
#Instansiate a vanilla model 
svd = SVD(random_state=20)

In [10]:
#Fit model on the trainset 
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f95a7ebb6d8>

In [11]:
#Make prediction on test set 
predictions = svd.test(testset)

In [12]:
#Check RMSE and MAE results 
accuracy.rmse(predictions)
accuracy.mae(predictions)

RMSE: 0.8702
MAE:  0.6700


0.6700320584374656

In [13]:
#Run 3-fold cross validation on the data and print results 
cv_baseline = cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8817  0.8868  0.8763  0.8816  0.0043  
MAE (testset)     0.6782  0.6814  0.6747  0.6781  0.0027  
Fit time          3.28    3.16    3.21    3.22    0.05    
Test time         0.27    0.26    0.26    0.26    0.00    


In [14]:
# Print out the RMSE score for each fold 
for i in cv_baseline.items():
    print(i)

('test_rmse', array([0.88166901, 0.88680945, 0.87627774]))
('test_mae', array([0.67817046, 0.68135691, 0.67468063]))
('fit_time', (3.283911943435669, 3.159525156021118, 3.2121198177337646))
('test_time', (0.2665388584136963, 0.25997114181518555, 0.26328396797180176))


In [15]:
#Find the average test RMSE from the 3-Fold cross-validation
np.mean(cv_baseline['test_rmse'])

0.8815854004038183

### Model 1

In [16]:
#Set parameters for GridSearch on SVD model 
parameters = {'n_factors': [50, 100, 150],
             'reg_all': [0.02, 0.05, 0.1],
             'n_epochs': [10, 20, 30, 40],
             'lr_all': [.005, .075, .01]}
gridsvd = GridSearchCV(SVD, param_grid=parameters, n_jobs=-1)

In [17]:
#Fit SVD model on data
gridsvd.fit(data)

In [18]:
#Print best score and best parameters from the GridSearch 
print(gridsvd.best_score)
print(gridsvd.best_params)

{'rmse': 0.8510081324268611, 'mae': 0.6525188360781956}
{'rmse': {'n_factors': 150, 'reg_all': 0.1, 'n_epochs': 40, 'lr_all': 0.01}, 'mae': {'n_factors': 150, 'reg_all': 0.1, 'n_epochs': 40, 'lr_all': 0.01}}


In [25]:
#Reinstantiate the model with the best parameters fromGridSearch 
svdtuned = SVD(n_factors=150,
               reg_all=0.1,
               n_epochs=40,
               lr_all=0.01)

In [26]:
#Fit and predict the model 
svdtuned.fit(trainset)
svdpreds = svdtuned.test(testset)

In [27]:
#Print RMSE and MAE results 
accuracy.rmse(svdpreds)
accuracy.mae(svdpreds)

RMSE: 0.8474
MAE:  0.6502


0.650227969777596

In [28]:
#Perform 3-Fold cross validation for SVD tuned model
cv_svd_tuned = cross_validate(svdtuned, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8584  0.8608  0.8605  0.8599  0.0011  
MAE (testset)     0.6592  0.6624  0.6580  0.6599  0.0018  
Fit time          8.53    8.80    9.83    9.05    0.56    
Test time         0.23    0.28    0.33    0.28    0.04    


In [29]:
#Display the results for all 3-folds 
for i in cv_svd_tuned.items():
    print(i)

('test_rmse', array([0.85842497, 0.86079009, 0.86052599]))
('test_mae', array([0.65917945, 0.66238588, 0.65803631]))
('fit_time', (8.528038740158081, 8.800540924072266, 9.826272010803223))
('test_time', (0.23207378387451172, 0.2771940231323242, 0.33236193656921387))


In [30]:
# Print out the average RMSE score for the test set
np.mean(cv_svd_tuned['test_rmse'])

0.859913685461386

### Model 2

In [44]:
# Set parameters to be used in KNN models 
knn_params = {'name': ['cosine', 'pearson'],
              'user_based':[True, False], 
              'min_support':[True, False],
            'min_k' : [1, 2]}

In [45]:
# Apply GridSearch to the KNN Basic model to identify the best parameters
gsknnbasic = GridSearchCV(KNNBasic, knn_params, measures=['rmse', 'mae'], cv=3)
gsknnbasic.fit(data)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [49]:
#Display the best scores and parameters from GridSearch
print(gsknnbasic.best_score)
print(gsknnbasic.best_params)

{'rmse': 0.9499509634510471, 'mae': 0.730209417090112}
{'rmse': {'name': 'cosine', 'user_based': True, 'min_support': True, 'min_k': 2}, 'mae': {'name': 'cosine', 'user_based': True, 'min_support': True, 'min_k': 2}}


In [52]:
#Reinstantiate the model with the best parameters from GridSearch 
knnbasic_tuned = KNNBasic(sim_options={'name': 'cosine', 
                                       'user_based': True, 
                                       'min_support':True, 
                                       'min_k':2, })

In [56]:
#Fit on the train set and predict on the test set 
knnbasic_tuned.fit(trainset)
knnbpreds = knnbasic_tuned.test(testset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


In [57]:
#Print RMSE and MAE results 
accuracy.rmse(knnbpreds)
accuracy.mae(knnbpreds)

RMSE: 0.9708
MAE:  0.7474


0.7474414650698246

Another way to evalute the model is to perform a cross validation and print the resulting scores. We will explore this below:

In [97]:
#Conduct cross validation for the KNNBasic tuned model 
cv_knn_basic = cross_validate(knnbasic_tuned, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9799  0.9789  0.9790  0.9793  0.0005  
MAE (testset)     0.7555  0.7548  0.7543  0.7549  0.0005  
Fit time          0.25    0.25    0.24    0.25    0.01    
Test time         2.23    2.03    1.92    2.06    0.13    


In [98]:
# Print out results from the cross-valdiatoin 
for i in cv_knn_basic.items():
    print(i)

('test_rmse', array([0.97994813, 0.97885244, 0.97897875]))
('test_mae', array([0.75553416, 0.75476358, 0.75434169]))
('fit_time', (0.24974584579467773, 0.24862909317016602, 0.23802495002746582))
('test_time', (2.22851824760437, 2.0293898582458496, 1.920827865600586))


In [99]:
# Print out the average RMSE score for the test set
np.mean(cv_knn_basic['test_rmse'])

0.9792597736461263

This average of test RMSE results in our cross validation is approximately 0.97, similar to the RMSE we found above. This is a significant improvement from our baseline model which had an RMSE of 0.873.

### Model 3

In [48]:
#Apply KNN GridSearch parameters on the KNNBaseline model 
gsknnbaseline = GridSearchCV(KNNBaseline, knn_params, measures=['rmse', 'mae'], cv=3)
gsknnbaseline.fit(data)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matr

In [50]:
#Display the best score and the best parameters 
print(gsknnbaseline.best_score)
print(gsknnbaseline.best_params)

{'rmse': 0.8741180875425969, 'mae': 0.6696693048950011}
{'rmse': {'name': 'cosine', 'user_based': True, 'min_support': True, 'min_k': 2}, 'mae': {'name': 'cosine', 'user_based': True, 'min_support': True, 'min_k': 2}}


In [65]:
#Reinstantiate the model with the best parameters from GridSearch 
knnbaseline_tuned = KNNBaseline(sim_options={'name': 'cosine', 
                                       'user_based': True, 
                                       'min_support':True, 
                                       'min_k':2, })

In [66]:
#Fit the trainset and predict on the test set 
knnbaseline_tuned.fit(trainset)
knnbaselinepreds = knnbaseline_tuned.test(testset)

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.


In [67]:
#Print the RMSE and MAE scores 
accuracy.rmse(knnbaselinepreds)
accuracy.mae(knnbaselinepreds)

RMSE: 0.8778
MAE:  0.6700


0.6699751099064485

In [100]:
#Perform 3 fold cross validation 
cv_knn_baseline = cross_validate(knnbaseline_tuned, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBaseline on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8887  0.8883  0.8862  0.8877  0.0011  
MAE (testset)     0.6811  0.6814  0.6758  0.6794  0.0026  
Fit time          0.36    0.36    0.36    0.36    0.00    
Test time         2.77    2.57    2.84    2.73    0.11    


In [101]:
#Show the mean RMSE score for the test set 
np.mean(cv_knn_baseline['test_rmse'])

0.8877304861884762

### Model 4

In [46]:
#Apply GridSearch to the KNNWithMeans model 
gsknnWM = GridSearchCV(KNNWithMeans, knn_params, measures=['rmse', 'mae'], cv=3)
gsknnWM.fit(data)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [51]:
#Display the best score and best parameters from GridSearch 
print(gsknnWM.best_score)
print(gsknnWM.best_params)

{'rmse': 0.8985852405639799, 'mae': 0.6872647488734684}
{'rmse': {'name': 'cosine', 'user_based': True, 'min_support': True, 'min_k': 2}, 'mae': {'name': 'cosine', 'user_based': True, 'min_support': True, 'min_k': 2}}


In [71]:
#Reinstansiate the model with the best parameters 
knnwm_tuned = KNNWithMeans(sim_options={'name': 'cosine', 
                                       'user_based': True, 
                                       'min_support':True, 
                                       'min_k':2, })

In [72]:
#Fit on the trainset, predict on the testset 
knnwm_tuned.fit(trainset)
knnwmpreds = knnwm_tuned.test(testset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


In [73]:
#Print RMSE and MAE results
accuracy.rmse(knnwmpreds)
accuracy.mae(knnwmpreds)

RMSE: 0.9005
MAE:  0.6873


0.6873119082836627

In [102]:
#Perform 3-Fold cross validation on KNNWithMeans model 
cv_knn_wm = cross_validate(knnwm_tuned, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9089  0.9098  0.9123  0.9103  0.0014  
MAE (testset)     0.6952  0.6968  0.6967  0.6962  0.0007  
Fit time          0.26    0.26    0.28    0.26    0.01    
Test time         2.20    2.40    2.31    2.30    0.08    


In [103]:
#Print the average RMSE score for the test set 
np.mean(cv_knn_wm['test_rmse'])

0.9103409705314892

### All results

In [107]:
#Create a dictionary for each models' results 
baselineresult = {'model': 'baseline','RMSE': accuracy.rmse(predictions), 'MAE': accuracy.mae(predictions), 'CV': np.mean(cv_baseline['test_rmse'])}
svdresult = {'model':'svd', 'RMSE': accuracy.rmse(svdpreds), 'MAE': accuracy.mae(svdpreds), 'CV': np.mean(cv_svd['test_rmse'])}
knnbasicresult = {'model':'knnbasic','RMSE': accuracy.rmse(knnbpreds), 'MAE': accuracy.mae(knnbpreds), 'CV': np.mean(cv_knn_basic['test_rmse'])}
knnbaselineresult = {'model':'knnbaseline','RMSE': accuracy.rmse(knnbaselinepreds), 'MAE': accuracy.mae(knnbaselinepreds), 'CV': np.mean(cv_knn_baseline['test_rmse'])}
knnwmresult = {'model':'knnwm','RMSE': accuracy.rmse(knnwmpreds), 'MAE': accuracy.mae(knnwmpreds), 'CV': np.mean(cv_knn_wm['test_rmse'])}

RMSE: 0.8730
MAE:  0.6698
RMSE: 0.8603
MAE:  0.6586
RMSE: 0.9708
MAE:  0.7474
RMSE: 0.8778
MAE:  0.6700
RMSE: 0.9005
MAE:  0.6873


In [108]:
#Combine all the results into a list 
result_list = [baselineresult, svdresult, knnbasicresult, knnbaselineresult, knnwmresult]

In [109]:
#Transform the results lists into a DataFrame 
df_results_updated = pd.DataFrame.from_dict(result_list, orient='columns')
df_results_updated = df_results_updated.set_index('model')

In [110]:
#Display the results for all of the models 
df_results_updated

Unnamed: 0_level_0,RMSE,MAE,CV
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
baseline,0.873039,0.669792,0.880247
svd,0.860286,0.658641,0.870431
knnbasic,0.970818,0.747441,0.97926
knnbaseline,0.877813,0.669975,0.88773
knnwm,0.900529,0.687312,0.910341


## Generating New Ratings

In [53]:
#Create function that generates new ratings for moves in our dataset 

def movie_rater(movie_df,num, genre=None):
    userID = 1000
    rating_list = []
    while num > 0:
        if genre:
            movie = movie_df[movie_df['genres'].str.contains(genre)].sample(1)
        else:
            movie = movie_df.sample(1)
        print(movie)
        rating = input('How would you rate this movie on a scale of 0-5, press n if you have not seen :\n')
        if rating == 'n':
            continue
        else:
            rating_one_movie = {'userId':userID,'movieId':movie['movieId'].values[0],'rating':rating}
            rating_list.append(rating_one_movie) 
            num -= 1
    return rating_list   
        

In [59]:
#Take original dataframe and drop unncessary columns 
df.drop(['Unnamed: 0'], axis=1, inplace=True)

In [60]:
#Re-investigate the original dataframe that included titles and genres for films 
df.head()

Unnamed: 0,userId,movieId,rating,title,genres,year
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,1,3,4.0,Grumpier Old Men (1995),Comedy|Romance,1995
2,1,6,4.0,Heat (1995),Action|Crime|Thriller,1995
3,1,47,5.0,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,1995
4,1,50,5.0,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,1995


In [None]:
#Apply the function to obtain new ratings for Action films 
userrating = movie_rater(df, 3, 'Action')

In [62]:
#Display the new user ratings 
userrating

[{'userId': 1000, 'movieId': 2887, 'rating': '2'},
 {'userId': 1000, 'movieId': 785, 'rating': '1'}]