In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from scipy.spatial.distance import cosine, correlation

### We won't use all of these, but they are imported for future extensions
from surprise import Dataset
from surprise import Reader
from surprise import KNNWithMeans, KNNWithZScore, KNNBaseline
from surprise import SVD, SVDpp, NMF, SlopeOne, CoClustering
from surprise import Reader, similarities, Dataset, accuracy
from surprise.model_selection import cross_validate, KFold, ShuffleSplit
from surprise.model_selection import GridSearchCV

pd.options.display.float_format='{:,.3f}'.format

In [2]:
reviews = pd.read_csv("yelp-core5/review.csv", names=['userID', 'businessID', 'rating'])
print('Rows:', reviews.shape[0], '; Columns:', reviews.shape[1], '\n')

reviews.head()

Rows: 141454 ; Columns: 3 



Unnamed: 0,userID,businessID,rating
0,10000,0,5
1,10001,1,5
2,10002,2,5
3,10003,3,4
4,10004,4,5


In [3]:
print('No. of Unique Users    :', reviews.userID.nunique())
print('No. of Unique Business :', reviews.businessID.nunique())
print('No. of Unique Ratings  :', reviews.rating.nunique())

No. of Unique Users    : 8043
No. of Unique Business : 5199
No. of Unique Ratings  : 5


In [4]:
# Set Rating Scale
reader = Reader(rating_scale=(1, 5))

# Load data with rating scale
data = Dataset.load_from_df(reviews[['userID', 'businessID', 'rating']], reader)

In [5]:
knnbasic_cv = cross_validate(KNNBaseline(), data, cv=5, n_jobs=5, verbose=False)
knnmeans_cv = cross_validate(KNNWithMeans(), data, cv=5, n_jobs=5, verbose=False)
knnz_cv = cross_validate(KNNWithZScore(), data, cv=5, n_jobs=5, verbose=False)

In [6]:
svd_cv = cross_validate(SVD(), data, cv=5, n_jobs=5, verbose=False)
svdpp_cv = cross_validate(SVDpp(), data, cv=5, n_jobs=5, verbose=False)
nmf_cv = cross_validate(NMF(), data, cv=5, n_jobs=5, verbose=False)

In [7]:
cocluster_cv = cross_validate(CoClustering(), data, cv=5, n_jobs=5, verbose=False)
slope_cv = cross_validate(SlopeOne(), data, cv=5, n_jobs=5, verbose=False)

In [8]:
print('Algorithm\t RMSE\t\t MAE')
print()
print('KNN Basic', '\t', knnbasic_cv['test_rmse'].mean(), '\t', round(knnbasic_cv['test_mae'].mean(), 4))
print('KNN Means', '\t', knnmeans_cv['test_rmse'].mean(), '\t', round(knnmeans_cv['test_mae'].mean(), 4))
print('KNN ZScore', '\t', knnz_cv['test_rmse'].mean(), '\t', round(knnz_cv['test_mae'].mean(), 4))
print()
print('SVD', '\t\t', svd_cv['test_rmse'].mean(), '\t', round(svd_cv['test_mae'].mean(), 4))
print('SVDpp', '\t\t', svdpp_cv['test_rmse'].mean(), '\t', round(svdpp_cv['test_mae'].mean(), 4))
print('NMF', '\t\t', nmf_cv['test_rmse'].mean(), '\t', round(nmf_cv['test_mae'].mean(), 4))
print()
print('SlopeOne', '\t', slope_cv['test_rmse'].mean(), '\t', round(slope_cv['test_mae'].mean(), 4))
print('CoClustering', '\t', cocluster_cv['test_rmse'].mean(), '\t', round(cocluster_cv['test_mae'].mean(), 4))
print()

Algorithm	 RMSE		 MAE

KNN Basic 	 1.0582624142131039 	 0.8173
KNN Means 	 1.0874373431698918 	 0.8327
KNN ZScore 	 1.0932388731301008 	 0.8301

SVD 		 1.02305465129364 	 0.8004
SVDpp 		 1.0228442314989463 	 0.7987
NMF 		 1.1247162986674097 	 0.8733

SlopeOne 	 1.1149503037826456 	 0.8546
CoClustering 	 1.0835357589909593 	 0.8341



In [13]:
from time import time
t0 = time()

svd_param_grid = {'n_factors': [5,10,20], 
                  'lr_all': [0.001,0.002,0.003],
                  'reg_all': [0.01, 0.02]}

svdpp_gs = GridSearchCV(SVDpp, svd_param_grid, measures=['rmse'], cv=5, n_jobs=5)
svdpp_gs.fit(data)

print("done in %0.3fs." % (time() - t0))

done in 5240.639s.


In [15]:
print('SVDpp - RMSE:', svdpp_gs.best_score['rmse'])

SVDpp - RMSE: 1.0180776897448272


In [14]:
1-1

0