Initial Modeling

In [3]:
!pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 5.0 MB/s eta 0:00:01     |████████████████████████████▋   | 10.5 MB 5.0 MB/s eta 0:00:01
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25ldone
[?25h  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp36-cp36m-macosx_10_7_x86_64.whl size=819626 sha256=40b4f86aac0390e5e2cf65ee99ac7913476276e16b39a7729abf59f79fde597d
  Stored in directory: /Users/MZhang/Library/Caches/pip/wheels/de/9a/41/6a57bf37eb7b50de7f8c7ca9d7053bebe0ea7c7c9bae9fa293
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.1 surprise-0.1


In [4]:
#Import relevant libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from surprise import SVD
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline
from surprise import Dataset, Reader
from surprise import accuracy
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV

In [6]:
# Load the cleaned ratings table; index_col=False tells pandas not to use the
# first column as the row index.
df = pd.read_csv('cleaneddata', index_col=False)

# Drop the saved index column and the movie metadata (title, genres, year),
# leaving only the columns needed for modelling.
df1 = df.drop(columns=['Unnamed: 0', 'title', 'genres', 'year'])

In [10]:
# Preview the first two rows of the full frame, including the metadata columns
df.head(2)

Unnamed: 0.1,Unnamed: 0,userId,movieId,rating,title,genres,year
0,0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,1,1,3,4.0,Grumpier Old Men (1995),Comedy|Romance,1995


In [7]:
# Preview the first two rows of the trimmed frame (metadata columns dropped)
df1.head(2)

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0


In [46]:
# Keep only the ratings of users who have more than 200 ratings: compute each
# row's per-user rating count and use it as a boolean row mask.
df2 = df1[
    df1.groupby('userId')['userId'].transform('size') > 200
]

In [47]:
# Sparsity of the user-item matrix: the fraction of (user, movie) pairs
# that have no rating in df2.
numratings = df2['rating'].size
numusers = df2['userId'].nunique(dropna=False)
numitems = df2['movieId'].nunique(dropna=False)
sparse = 1 - numratings / (numusers * numitems)
sparse

0.94456034242643

In [48]:
# Summarise the filtered frame: row count, column dtypes and non-null counts
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 68271 entries, 0 to 100818
Data columns (total 3 columns):
userId     68271 non-null int64
movieId    68271 non-null int64
rating     68271 non-null float64
dtypes: float64(1), int64(2)
memory usage: 2.1 MB


In [49]:
# Wrap the filtered ratings in a Surprise Dataset.
# Reader and Dataset are already imported in the notebook's import cell, so they
# are not re-imported here.
# rating_scale gives the lowest and highest possible rating.
reader = Reader(rating_scale=(0.5, 5))
# load_from_df reads columns by position (user, item, rating), so select them
# explicitly instead of relying on the frame's current column order.
data = Dataset.load_from_df(df2[['userId', 'movieId', 'rating']], reader)

In [50]:
# Hold out 20% of the ratings for testing. Fixing random_state makes the split
# (and every metric computed on it) repeatable across kernel restarts.
trainset, testset = train_test_split(data, test_size=.2, random_state=42)

In [51]:
# Print the number of users and items in the trainset
print('Number of users in train set : ', trainset.n_users, '\n')
print('Number of items in train set : ', trainset.n_items)

Number of users in train set :  133 

Number of items in train set :  8513


In [52]:
# Baseline model: SVD with Surprise's default hyperparameters.
# random_state fixes the RNG used for initialisation so that re-running the
# notebook produces the same fitted model.
svd = SVD(random_state=42)

svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x129d9d080>

In [53]:
# Run the fitted baseline on the held-out test set to get its predictions
predictions = svd.test(testset)

In [54]:
# Report RMSE and MAE for the baseline predictions. Each call prints its metric;
# the MAE value is also echoed as the cell output because it is the last expression.
accuracy.rmse(predictions)
accuracy.mae(predictions)

RMSE: 0.8628
MAE:  0.6638


0.6638319488885648

In [55]:
# 3-fold cross-validation of the baseline SVD on the full dataset,
# reporting RMSE and MAE for each fold.
cv_baseline = cross_validate(
    algo=svd,
    data=data,
    measures=['RMSE', 'MAE'],
    cv=3,
    verbose=True,
)

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8629  0.8549  0.8688  0.8622  0.0057  
MAE (testset)     0.6652  0.6628  0.6682  0.6654  0.0022  
Fit time          2.42    2.48    2.47    2.46    0.03    
Test time         0.13    0.21    0.13    0.16    0.04    


In [None]:
#Mean cross-validated RMSE on the test folds: .8622

In [None]:
#Original, sparse matrix mean: .8792

Model 1 with GridSearchCV for SVD

In [56]:
# Hyperparameter grid for the SVD search; GridSearchCV evaluates every
# combination of the values listed below.
parameters = {
    'n_factors': [20, 50, 80],
    'reg_all': [0.04, 0.06],
    'n_epochs': [10, 20, 30],
    'lr_all': [0.002, 0.005, 0.01],
}
# n_jobs=-1 lets the search use all available CPUs.
svdgrid = GridSearchCV(algo_class=SVD, param_grid=parameters, n_jobs=-1)

In [57]:
# Run the grid search over `parameters` on the full dataset
svdgrid.fit(data)

In [58]:
# Best score and the parameter set that achieved it, each keyed by metric ('rmse', 'mae')
print(svdgrid.best_score)
print(svdgrid.best_params)

{'rmse': 0.8340906483373122, 'mae': 0.6408937850326492}
{'rmse': {'n_factors': 80, 'reg_all': 0.06, 'n_epochs': 30, 'lr_all': 0.01}, 'mae': {'n_factors': 80, 'reg_all': 0.06, 'n_epochs': 30, 'lr_all': 0.01}}


In [59]:
# Build the tuned SVD from the RMSE-optimal hyperparameters found by the grid
# search. Reading them from svdgrid.best_params keeps this cell in sync with the
# search instead of relying on hand-copied values.
svd1 = SVD(**svdgrid.best_params['rmse'])

# Train on the same split as the baseline and predict the held-out ratings.
svd1.fit(trainset)
svdpreds = svd1.test(testset)

In [60]:
# Report RMSE and MAE for the tuned model's predictions. Each call prints its
# metric; the MAE value is also echoed as the cell output (last expression).
accuracy.rmse(svdpreds)
accuracy.mae(svdpreds)

RMSE: 0.8404
MAE:  0.6463


0.6463279607784326

In [None]:
#RMSE of 0.8556

In [None]:
#New RMSE of 0.8404

In [61]:
# 3-fold cross-validation of the tuned SVD on the full dataset,
# reporting RMSE and MAE for each fold.
svd1_cv = cross_validate(
    algo=svd1,
    data=data,
    measures=['RMSE', 'MAE'],
    cv=3,
    verbose=True,
)

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8483  0.8447  0.8404  0.8445  0.0032  
MAE (testset)     0.6527  0.6512  0.6469  0.6502  0.0024  
Fit time          3.25    3.31    3.31    3.29    0.03    
Test time         0.13    0.22    0.12    0.16    0.04    


In [None]:
#Mean RMSE of .8645

In [None]:
#New Mean RMSE of .8445