In [1]:
from surprise import accuracy, Dataset, SVD, similarities,AlgoBase
from surprise.model_selection import cross_validate
from surprise import BaselineOnly, Dataset, Reader
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
import datetime
import math
from surprise import Dataset
from surprise import Reader
from surprise import SVD, KNNBasic,KNNWithMeans,KNNWithZScore,KNNBaseline,SVDpp,NMF,CoClustering,accuracy,NormalPredictor
from surprise.model_selection import cross_validate
from surprise.model_selection import KFold,RepeatedKFold
from surprise.model_selection import GridSearchCV

#### Input dataset
#### Train on u1.base and test on u1.test

In [29]:
movie100k = Dataset.load_builtin('ml-100k')

In [2]:
import os

from surprise import accuracy, Dataset, Reader, SVD
from surprise.model_selection import PredefinedKFold

# path to dataset folder
files_dir = os.path.expanduser("~/.surprise_data/ml-100k/ml-100k/")

# This time, we'll use the built-in reader.
reader = Reader("ml-100k")

# folds_files is a list of tuples containing file paths:
# [(u1.base, u1.test), (u2.base, u2.test), ... (u5.base, u5.test)]
train_file = files_dir + "u%d.base"
test_file = files_dir + "u%d.test"
folds_files = [(train_file % i, test_file % i) for i in (1,)]

data = Dataset.load_from_folds(folds_files, reader=reader)
pkf = PredefinedKFold()

#Dataset formatted train and test sets for CV
trainset = Dataset.load_from_file('~/.surprise_data/ml-100k/ml-100k/u1.base',reader=reader)
testset = Dataset.load_from_file('~/.surprise_data/ml-100k/ml-100k/u1.test',reader=reader)

#Trainset formatted train and test sets for predictions
for train,test in pkf.split(data):
    trainset_final = train
    testset_final = test

In [3]:
trainset

<surprise.dataset.DatasetAutoFolds at 0x280b2bcfe50>

In [39]:
trainset_final

<surprise.trainset.Trainset at 0x210a7f9d460>

## Hyperparameter Tuning using GridSearchCV
### Description of algorithms: https://surprise.readthedocs.io/en/stable/prediction_algorithms_package.html

### Tuning KNN-inspired algorithms (User-based)

In [4]:
param_grid = {"k":[25,100,500], "min_k":[1,5,10],"verbose":[False],'random_state':[42],'sim_options':{"user_based": [True],"name":['cosine','msd','pearson','pearson_baseline']}}
classes = (KNNBasic,KNNWithMeans,KNNWithZScore,KNNBaseline)
for model in classes:
    start = datetime.datetime.now()
    print(model)
    best_score = []
    best_params = [] 
    gs = GridSearchCV(model,param_grid, measures=["rmse", "mae"], cv=5)
    gs.fit(trainset)
    print("Runtime",str(datetime.datetime.now() - start)[:-3])
    print("RMSE: ",round(gs.best_score["rmse"],3))
    print("Best parameters: ",gs.best_params["rmse"])

<class 'surprise.prediction_algorithms.knns.KNNBasic'>
Runtime 0:09:10.157
RMSE:  0.986
Best parameters:  {'k': 25, 'min_k': 5, 'verbose': False, 'random_state': 42, 'sim_options': {'user_based': True, 'name': 'msd'}}
<class 'surprise.prediction_algorithms.knns.KNNWithMeans'>
Runtime 0:25:46.517
RMSE:  0.948
Best parameters:  {'k': 100, 'min_k': 5, 'verbose': False, 'random_state': 42, 'sim_options': {'user_based': True, 'name': 'pearson_baseline'}}
<class 'surprise.prediction_algorithms.knns.KNNWithZScore'>
Runtime 0:13:22.582
RMSE:  0.949
Best parameters:  {'k': 100, 'min_k': 5, 'verbose': False, 'random_state': 42, 'sim_options': {'user_based': True, 'name': 'pearson_baseline'}}
<class 'surprise.prediction_algorithms.knns.KNNBaseline'>
Runtime 0:11:57.466
RMSE:  0.927
Best parameters:  {'k': 100, 'min_k': 10, 'verbose': False, 'random_state': 42, 'sim_options': {'user_based': True, 'name': 'pearson_baseline'}}


### Tuning KNN-inspired algorithms (Item-based)

In [7]:
param_grid = {"k":[25,100,500], "min_k":[1,5,10],"verbose":[False],'random_state':[42],'sim_options':{"user_based": [False],"name":['cosine','msd','pearson','pearson_baseline']}}
classes = (KNNBasic,KNNWithMeans,KNNWithZScore,KNNBaseline)
for model in classes:
    start = datetime.datetime.now()
    print(model)
    best_score = []
    best_params = [] 
    gs = GridSearchCV(model,param_grid, measures=["rmse", "mae"], cv=5)
    gs.fit(trainset)
    print("Runtime",str(datetime.datetime.now() - start)[:-3])
    print("RMSE: ",round(gs.best_score["rmse"],3))
    print("Best parameters: ",gs.best_params["rmse"])

<class 'surprise.prediction_algorithms.knns.KNNBasic'>
Runtime 0:09:12.452
RMSE:  0.988
Best parameters:  {'k': 25, 'min_k': 1, 'verbose': False, 'random_state': 42, 'sim_options': {'user_based': False, 'name': 'msd'}}
<class 'surprise.prediction_algorithms.knns.KNNWithMeans'>
Runtime 0:09:58.193
RMSE:  0.934
Best parameters:  {'k': 100, 'min_k': 5, 'verbose': False, 'random_state': 42, 'sim_options': {'user_based': False, 'name': 'pearson_baseline'}}
<class 'surprise.prediction_algorithms.knns.KNNWithZScore'>
Runtime 0:11:03.419
RMSE:  0.937
Best parameters:  {'k': 100, 'min_k': 5, 'verbose': False, 'random_state': 42, 'sim_options': {'user_based': False, 'name': 'pearson_baseline'}}
<class 'surprise.prediction_algorithms.knns.KNNBaseline'>
Runtime 0:11:41.153
RMSE:  0.924
Best parameters:  {'k': 100, 'min_k': 10, 'verbose': False, 'random_state': 42, 'sim_options': {'user_based': False, 'name': 'pearson_baseline'}}


### Tune Matrix Factorization algorithms

#### SVD

In [5]:
# param_grid = {"n_factors":[2,5,10,20,50,100], "n_epochs": [5,10,20,50,100,200], "lr_all": [0.001,0.002,0.005,0.01,0.02,0.05], "reg_all": [0.01,0.05,0.1,0.2,0.3,0.4,0.5],'random_state':[42],'biased':[True,False]}
# param_grid = {"n_factors":[5,10,50,100], "n_epochs": [100,200], "lr_all": [0.001,0.01], "reg_all": [0.01,0.1],'random_state':[42],'biased':[True,]}
param_grid = {"n_factors":[10, 50, 100, 200, 500], "n_epochs": [100, 200], "lr_all": [0.01, 0.001], "reg_all": [0.1, 0.001],'random_state':[42],'biased':[True,False]}
#SVD
start = datetime.datetime.now()
best_score = []
best_params = []   
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=5)
gs.fit(trainset)

print("SVD")
print("Runtime",str(datetime.datetime.now() - start)[:-3])  
print("RMSE: ",round(gs.best_score["rmse"],3))
print("Best parameters: ",gs.best_params["rmse"])

SVD
Runtime 1:43:31.935
RMSE:  0.919
Best parameters:  {'n_factors': 500, 'n_epochs': 100, 'lr_all': 0.01, 'reg_all': 0.1, 'random_state': 42, 'biased': True}


#### Try smaller number of factors and epochs

In [19]:
# param_grid = {"n_factors":[2,5,10,20,50,100], "n_epochs": [5,10,20,50,100,200], "lr_all": [0.001,0.002,0.005,0.01,0.02,0.05], "reg_all": [0.01,0.05,0.1,0.2,0.3,0.4,0.5],'random_state':[42],'biased':[True,False]}
# param_grid = {"n_factors":[5,10,50,100], "n_epochs": [100,200], "lr_all": [0.001,0.01], "reg_all": [0.01,0.1],'random_state':[42],'biased':[True,]}
param_grid = {"n_factors":[500], "n_epochs": [20,50,100], "lr_all": [0.01, 0.001], "reg_all": [0.1, 0.001],'random_state':[42],'biased':[True,False]}
#SVD
start = datetime.datetime.now()
best_score = []
best_params = []   
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=5)
gs.fit(trainset)

print("SVD")
print("Runtime",str(datetime.datetime.now() - start)[:-3])  
print("RMSE: ",round(gs.best_score["rmse"],3))
print("Best parameters: ",gs.best_params["rmse"])

SVD
Runtime 0:10:26.759
RMSE:  0.92
Best parameters:  {'n_factors': 500, 'n_epochs': 50, 'lr_all': 0.01, 'reg_all': 0.1, 'random_state': 42, 'biased': True}


In [20]:
# param_grid = {"n_factors":[2,5,10,20,50,100], "n_epochs": [5,10,20,50,100,200], "lr_all": [0.001,0.002,0.005,0.01,0.02,0.05], "reg_all": [0.01,0.05,0.1,0.2,0.3,0.4,0.5],'random_state':[42],'biased':[True,False]}
# param_grid = {"n_factors":[5,10,50,100], "n_epochs": [100,200], "lr_all": [0.001,0.01], "reg_all": [0.01,0.1],'random_state':[42],'biased':[True,]}
param_grid = {"n_factors":[200], "n_epochs": [20,50,100], "lr_all": [0.01, 0.001], "reg_all": [0.1, 0.001],'random_state':[42],'biased':[True,False]}
#SVD
start = datetime.datetime.now()
best_score = []
best_params = []   
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=5)
gs.fit(trainset)

print("SVD")
print("Runtime",str(datetime.datetime.now() - start)[:-3])  
print("RMSE: ",round(gs.best_score["rmse"],3))
print("Best parameters: ",gs.best_params["rmse"])

SVD
Runtime 0:05:20.620
RMSE:  0.919
Best parameters:  {'n_factors': 200, 'n_epochs': 50, 'lr_all': 0.01, 'reg_all': 0.1, 'random_state': 42, 'biased': True}


In [21]:
# param_grid = {"n_factors":[2,5,10,20,50,100], "n_epochs": [5,10,20,50,100,200], "lr_all": [0.001,0.002,0.005,0.01,0.02,0.05], "reg_all": [0.01,0.05,0.1,0.2,0.3,0.4,0.5],'random_state':[42],'biased':[True,False]}
# param_grid = {"n_factors":[5,10,50,100], "n_epochs": [100,200], "lr_all": [0.001,0.01], "reg_all": [0.01,0.1],'random_state':[42],'biased':[True,]}
param_grid = {"n_factors":[100], "n_epochs": [20,50,100], "lr_all": [0.01, 0.001], "reg_all": [0.1, 0.001],'random_state':[42],'biased':[True,False]}
#SVD
start = datetime.datetime.now()
best_score = []
best_params = []   
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=5)
gs.fit(trainset)

print("SVD")
print("Runtime",str(datetime.datetime.now() - start)[:-3])  
print("RMSE: ",round(gs.best_score["rmse"],3))
print("Best parameters: ",gs.best_params["rmse"])

SVD
Runtime 0:03:38.959
RMSE:  0.92
Best parameters:  {'n_factors': 100, 'n_epochs': 50, 'lr_all': 0.01, 'reg_all': 0.1, 'random_state': 42, 'biased': True}


In [22]:
# param_grid = {"n_factors":[2,5,10,20,50,100], "n_epochs": [5,10,20,50,100,200], "lr_all": [0.001,0.002,0.005,0.01,0.02,0.05], "reg_all": [0.01,0.05,0.1,0.2,0.3,0.4,0.5],'random_state':[42],'biased':[True,False]}
# param_grid = {"n_factors":[5,10,50,100], "n_epochs": [100,200], "lr_all": [0.001,0.01], "reg_all": [0.01,0.1],'random_state':[42],'biased':[True,]}
param_grid = {"n_factors":[50], "n_epochs": [20,50,100], "lr_all": [0.01, 0.001], "reg_all": [0.1, 0.001],'random_state':[42],'biased':[True,False]}
#SVD
start = datetime.datetime.now()
best_score = []
best_params = []   
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=5)
gs.fit(trainset)

print("SVD")
print("Runtime",str(datetime.datetime.now() - start)[:-3])  
print("RMSE: ",round(gs.best_score["rmse"],3))
print("Best parameters: ",gs.best_params["rmse"])

SVD
Runtime 0:02:48.683
RMSE:  0.921
Best parameters:  {'n_factors': 50, 'n_epochs': 50, 'lr_all': 0.01, 'reg_all': 0.1, 'random_state': 42, 'biased': True}


In [24]:
# param_grid = {"n_factors":[2,5,10,20,50,100], "n_epochs": [5,10,20,50,100,200], "lr_all": [0.001,0.002,0.005,0.01,0.02,0.05], "reg_all": [0.01,0.05,0.1,0.2,0.3,0.4,0.5],'random_state':[42],'biased':[True,False]}
# param_grid = {"n_factors":[5,10,50,100], "n_epochs": [100,200], "lr_all": [0.001,0.01], "reg_all": [0.01,0.1],'random_state':[42],'biased':[True,]}
param_grid = {"n_factors":[25], "n_epochs": [20,50,100], "lr_all": [0.01, 0.001], "reg_all": [0.1, 0.001],'random_state':[42],'biased':[True,False]}
#SVD
start = datetime.datetime.now()
best_score = []
best_params = []   
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=5)
gs.fit(trainset)

print("SVD")
print("Runtime",str(datetime.datetime.now() - start)[:-3])  
print("RMSE: ",round(gs.best_score["rmse"],3))
print("Best parameters: ",gs.best_params["rmse"])

SVD
Runtime 0:02:21.578
RMSE:  0.924
Best parameters:  {'n_factors': 25, 'n_epochs': 50, 'lr_all': 0.01, 'reg_all': 0.1, 'random_state': 42, 'biased': True}


In [23]:
# param_grid = {"n_factors":[2,5,10,20,50,100], "n_epochs": [5,10,20,50,100,200], "lr_all": [0.001,0.002,0.005,0.01,0.02,0.05], "reg_all": [0.01,0.05,0.1,0.2,0.3,0.4,0.5],'random_state':[42],'biased':[True,False]}
# param_grid = {"n_factors":[5,10,50,100], "n_epochs": [100,200], "lr_all": [0.001,0.01], "reg_all": [0.01,0.1],'random_state':[42],'biased':[True,]}
param_grid = {"n_factors":[10], "n_epochs": [20,50,100], "lr_all": [0.01, 0.001], "reg_all": [0.1, 0.001],'random_state':[42],'biased':[True,False]}
#SVD
start = datetime.datetime.now()
best_score = []
best_params = []   
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=5)
gs.fit(trainset)

print("SVD")
print("Runtime",str(datetime.datetime.now() - start)[:-3])  
print("RMSE: ",round(gs.best_score["rmse"],3))
print("Best parameters: ",gs.best_params["rmse"])

SVD
Runtime 0:02:04.502
RMSE:  0.929
Best parameters:  {'n_factors': 10, 'n_epochs': 50, 'lr_all': 0.01, 'reg_all': 0.1, 'random_state': 42, 'biased': True}


In [11]:
dir(movie100k)
movie100k.has_been_split

False

#### SVD++

In [13]:
#SVD++
# param_grid = {"n_epochs": [5,10,20,50,100,200], "lr_all": [0.001,0.002,0.005,0.01,0.02,0.05], "reg_all": [0.01,0.05,0.1,0.2,0.3,0.4,0.5],'random_state':[42]}
# param_grid = {"n_factors":[2,5,10,20,50,100], "n_epochs": [5,50,100,200], "lr_all": [0.001,0.005,0.01,0.05], "reg_all": [0.01,0.1,0.3,0.5],'random_state':[42]}
# param_grid = {"n_factors":[5, 10, 50, 100], "n_epochs": [100, 200], "lr_all": [0.01, 0.001], "reg_all": [0.1, 0.001],'random_state':[42]}
param_grid = {"n_factors":[500], "n_epochs": [1], "lr_all": [0.01], "reg_all": [0.1],'random_state':[42]}
start = datetime.datetime.now()
best_score = []
best_params = [] 
gs = GridSearchCV(SVDpp, param_grid, measures=["rmse", "mae"], cv=5)
gs.fit(trainset)

print('SVD++')
print("Runtime",str(datetime.datetime.now() - start)[:-3])
print("RMSE: ",round(gs.best_score["rmse"],3))
print("Best parameters: ",gs.best_params["rmse"])


SVD++
Runtime 0:01:00.518
RMSE:  0.996
Best parameters:  {'n_factors': 500, 'n_epochs': 1, 'lr_all': 0.01, 'reg_all': 0.1, 'random_state': 42}


#### NMF

In [4]:
#Nonegative Matrix Factorization (NMF)
param_grid = {"n_factors":[10,100,500], 'n_epochs':[50,100],'reg_pu':[0.01,0.06,0.2],'reg_qi':[0.01,0.06,0.2],'biased':[True,False]}
start = datetime.datetime.now()
   
gs = GridSearchCV(NMF, param_grid, measures=["rmse", "mae"], cv=5)
gs.fit(trainset)

print('NMF')
print("Runtime",str(datetime.datetime.now() - start)[:-3])
print("RMSE: ",round(gs.best_score["rmse"],3))
print("Best parameters: ",gs.best_params["rmse"])

NMF
Runtime 1:06:35.587
RMSE:  0.921
Best parameters:  {'n_factors': 500, 'n_epochs': 100, 'reg_pu': 0.06, 'reg_qi': 0.06, 'biased': False}


#### 'Biased' is default True for SVD and False for NMF
#### Select n_factors=500, n_epochs=100, lr_all=0.01, reg_all=0.1

In [3]:
#Nonegative Matrix Factorization (NMF)
param_grid = {"n_factors":[200], 'n_epochs':[20,50,100],'reg_pu':[0.01,0.06,0.2],'reg_qi':[0.01,0.06,0.2],'biased':[True,False]}
start = datetime.datetime.now()
   
gs = GridSearchCV(NMF, param_grid, measures=["rmse", "mae"], cv=5)
gs.fit(trainset)

print('NMF')
print("Runtime",str(datetime.datetime.now() - start)[:-3])
print("RMSE: ",round(gs.best_score["rmse"],3))
print("Best parameters: ",gs.best_params["rmse"])

NMF
Runtime 0:23:53.549
RMSE:  0.927
Best parameters:  {'n_factors': 200, 'n_epochs': 100, 'reg_pu': 0.06, 'reg_qi': 0.06, 'biased': False}


In [4]:
#Nonegative Matrix Factorization (NMF)
param_grid = {"n_factors":[100], 'n_epochs':[20,50,100],'reg_pu':[0.01,0.06,0.2],'reg_qi':[0.01,0.06,0.2],'biased':[True,False]}
start = datetime.datetime.now()
   
gs = GridSearchCV(NMF, param_grid, measures=["rmse", "mae"], cv=5)
gs.fit(trainset)

print('NMF')
print("Runtime",str(datetime.datetime.now() - start)[:-3])
print("RMSE: ",round(gs.best_score["rmse"],3))
print("Best parameters: ",gs.best_params["rmse"])

NMF
Runtime 0:20:37.452
RMSE:  0.933
Best parameters:  {'n_factors': 100, 'n_epochs': 100, 'reg_pu': 0.06, 'reg_qi': 0.2, 'biased': False}


In [5]:
#Nonegative Matrix Factorization (NMF)
param_grid = {"n_factors":[50], 'n_epochs':[20,50,100],'reg_pu':[0.01,0.06,0.2],'reg_qi':[0.01,0.06,0.2],'biased':[True,False]}
start = datetime.datetime.now()
   
gs = GridSearchCV(NMF, param_grid, measures=["rmse", "mae"], cv=5)
gs.fit(trainset)

print('NMF')
print("Runtime",str(datetime.datetime.now() - start)[:-3])
print("RMSE: ",round(gs.best_score["rmse"],3))
print("Best parameters: ",gs.best_params["rmse"])

NMF
Runtime 0:09:26.016
RMSE:  0.935
Best parameters:  {'n_factors': 50, 'n_epochs': 100, 'reg_pu': 0.06, 'reg_qi': 0.2, 'biased': False}


In [6]:
#Nonegative Matrix Factorization (NMF)
param_grid = {"n_factors":[25], 'n_epochs':[20,50,100],'reg_pu':[0.01,0.06,0.2],'reg_qi':[0.01,0.06,0.2],'biased':[True,False]}
start = datetime.datetime.now()
   
gs = GridSearchCV(NMF, param_grid, measures=["rmse", "mae"], cv=5)
gs.fit(trainset)

print('NMF')
print("Runtime",str(datetime.datetime.now() - start)[:-3])
print("RMSE: ",round(gs.best_score["rmse"],3))
print("Best parameters: ",gs.best_params["rmse"])

NMF
Runtime 0:07:07.410
RMSE:  0.941
Best parameters:  {'n_factors': 25, 'n_epochs': 100, 'reg_pu': 0.06, 'reg_qi': 0.2, 'biased': False}


In [7]:
#Nonegative Matrix Factorization (NMF)
param_grid = {"n_factors":[10], 'n_epochs':[20,50,100],'reg_pu':[0.01,0.06,0.2],'reg_qi':[0.01,0.06,0.2],'biased':[True,False]}
start = datetime.datetime.now()
   
gs = GridSearchCV(NMF, param_grid, measures=["rmse", "mae"], cv=5)
gs.fit(trainset)

print('NMF')
print("Runtime",str(datetime.datetime.now() - start)[:-3])
print("RMSE: ",round(gs.best_score["rmse"],3))
print("Best parameters: ",gs.best_params["rmse"])

NMF
Runtime 0:05:00.476
RMSE:  0.936
Best parameters:  {'n_factors': 10, 'n_epochs': 100, 'reg_pu': 0.06, 'reg_qi': 0.2, 'biased': True}


### Tune co-clustering algorithms

In [9]:
param_grid = {'n_epochs':[10,20,50],'n_cltr_u':[3,10,50],'n_cltr_i':[3,10,50]}
start = datetime.datetime.now()
best_score = []
best_params = []
   
gs = GridSearchCV(CoClustering, param_grid, measures=["rmse", "mae"], cv=5)
gs.fit(trainset)

print('Co-clustering')
print("Runtime",str(datetime.datetime.now() - start)[:-3])
print("RMSE: ",round(gs.best_score["rmse"],3))
print("Best parameters: ",gs.best_params["rmse"])

Co-clustering
Runtime 0:14:15.648
RMSE:  0.967
Best parameters:  {'n_epochs': 50, 'n_cltr_u': 10, 'n_cltr_i': 3}


#### Select n_epochs=20, n_cltr_u=2, n_cltr_i=2

## Compare model performance on test set (u1.test)

In [4]:
CVResults = pd.DataFrame(columns = ['Model','RMSE','MAE','Timespan'])

class PredictMean(AlgoBase):
    def __init__(self):
        AlgoBase.__init__(self)
    def estimate(self, u, i):
        return 3.530

Naive_model = PredictMean()
NormalPredictor_model = NormalPredictor()
BaselineOnly_model = BaselineOnly(verbose=False)

KNNBasic_ub_model = KNNBasic(k=25, min_k=5,verbose=False,random_state=42,sim_options={'user_based':True,'name':'msd'})
KNNWithMeans_ub_model = KNNWithMeans(k=100, min_k=5,verbose=False,random_state=42,sim_options={'user_based':True,'name':'pearson_baseline'})
KNNWithZScore_ub_model = KNNWithZScore(k=100, min_k=5,verbose=False,random_state=42,sim_options={'user_based':True,'name':'pearson_baseline'})
KNNBaseline_ub_model = KNNBaseline(k=100, min_k=10,verbose=False,random_state=42,sim_options={'user_based':True,'name':'pearson_baseline'})
KNNBasic_ib_model = KNNBasic(k=25, min_k=1,verbose=False,random_state=42,sim_options={'user_based':False,'name':'msd'})
KNNWithMeans_ib_model = KNNWithMeans(k=100, min_k=5,verbose=False,random_state=42,sim_options={'user_based':False,'name':'pearson_baseline'})
KNNWithZScore_ib_model = KNNWithZScore(k=100, min_k=5,verbose=False,random_state=42,sim_options={'user_based':False,'name':'pearson_baseline'})
KNNBaseline_ib_model = KNNBaseline(k=100, min_k=10,verbose=False,random_state=42,sim_options={'user_based':False,'name':'pearson_baseline'})

SVD500_model = SVD(n_factors=500, n_epochs=50, lr_all=0.01, reg_all=0.1, verbose=False,random_state=42,biased=True)
SVD50_model = SVD(n_factors=50, n_epochs=50, lr_all=0.01, reg_all=0.1, verbose=False,random_state=42,biased=True)
# SVDpp_model = SVDpp(n_factors=100, n_epochs=50, lr_all=0.01, reg_all=0.3, verbose=False,random_state=42)
SVDpp500_model = SVDpp(n_factors=500, n_epochs=50, lr_all=0.01, reg_all=0.1, verbose=False,random_state=42)
SVDpp50_model = SVDpp(n_factors=50, n_epochs=50, lr_all=0.01, reg_all=0.1, verbose=False,random_state=42)
NMF500_model = NMF(n_factors=500,n_epochs=100,reg_pu=0.06,reg_qi=0.06,verbose=False,random_state=42,biased=False)
NMF50_model = NMF(n_factors=50,n_epochs=100,reg_pu=0.06,reg_qi=0.06,verbose=False,random_state=42,biased=False)
CoClustering_model = CoClustering(n_epochs=50, n_cltr_u=10, n_cltr_i=3,verbose=False,random_state=42)

classes = [Naive_model,NormalPredictor_model,BaselineOnly_model,KNNBasic_ub_model,KNNBaseline_ub_model,KNNWithMeans_ub_model,KNNWithZScore_ub_model,KNNBasic_ib_model,KNNBaseline_ib_model,KNNWithMeans_ib_model,KNNWithZScore_ib_model,SVD500_model,SVD50_model,SVDpp500_model,SVDpp50_model,NMF500_model,NMF50_model,CoClustering_model]
for model in classes:
    start = datetime.datetime.now()
    out = cross_validate(model, data, ['rmse', 'mae'], pkf)
    mean_rmse = np.mean(out['test_rmse'])
    mean_mae = np.mean(out['test_mae'])  
    cv_time = str(datetime.datetime.now() - start)[:-3]
    CVResults = CVResults.append({'Model': model, 'RMSE': mean_rmse, 'MAE': mean_mae, 'Timespan': cv_time}, ignore_index=True)

CVResults['Model'] = ['Naive','NormalPredictor','BaselineOnly','UB KNNBasic','UB KNNBaseline','UB KNNWithMeans','UB KNNWithZScore','IB KNNBasic','IB KNNBaseline','IB KNNWithMeans','IB KNNWithZScore','SVD (500 factors)','SVD (50 factors)','SVDpp (500 factors)','SVDpp (50 factors)','NMF (500 factors)','NMF (50 factors)','CoClustering']
CVResults

  CVResults = CVResults.append({'Model': model, 'RMSE': mean_rmse, 'MAE': mean_mae, 'Timespan': cv_time}, ignore_index=True)
  CVResults = CVResults.append({'Model': model, 'RMSE': mean_rmse, 'MAE': mean_mae, 'Timespan': cv_time}, ignore_index=True)
  CVResults = CVResults.append({'Model': model, 'RMSE': mean_rmse, 'MAE': mean_mae, 'Timespan': cv_time}, ignore_index=True)
  CVResults = CVResults.append({'Model': model, 'RMSE': mean_rmse, 'MAE': mean_mae, 'Timespan': cv_time}, ignore_index=True)
  CVResults = CVResults.append({'Model': model, 'RMSE': mean_rmse, 'MAE': mean_mae, 'Timespan': cv_time}, ignore_index=True)
  CVResults = CVResults.append({'Model': model, 'RMSE': mean_rmse, 'MAE': mean_mae, 'Timespan': cv_time}, ignore_index=True)
  CVResults = CVResults.append({'Model': model, 'RMSE': mean_rmse, 'MAE': mean_mae, 'Timespan': cv_time}, ignore_index=True)
  CVResults = CVResults.append({'Model': model, 'RMSE': mean_rmse, 'MAE': mean_mae, 'Timespan': cv_time}, ignore_index=True)


Unnamed: 0,Model,RMSE,MAE,Timespan
0,Naive,1.153666,0.967845,0:00:00.424
1,NormalPredictor,1.541677,1.237153,0:00:00.798
2,BaselineOnly,0.959944,0.761583,0:00:00.630
3,UB KNNBasic,0.992456,0.785038,0:00:04.329
4,UB KNNBaseline,0.935851,0.736315,0:00:06.534
5,UB KNNWithMeans,0.956649,0.747031,0:00:04.679
6,UB KNNWithZScore,0.954556,0.743314,0:00:05.083
7,IB KNNBasic,0.998139,0.787119,0:00:02.679
8,IB KNNBaseline,0.932976,0.734099,0:00:05.291
9,IB KNNWithMeans,0.936551,0.733992,0:00:04.750


### Compare with naive estimator (just the mean)

In [50]:
class PredictMean(AlgoBase):
    def __init__(self):
        AlgoBase.__init__(self)
    def estimate(self, u, i):
        return 3.530

predictions = PredictMean().fit(trainset_final).test(testset_final)
print('MAE:',round(accuracy.mae(predictions),3))
print('RMSE:',round(accuracy.rmse(predictions),3))

MAE:  0.9678
MAE: 0.968
RMSE: 1.1537
RMSE: 1.154


#### MSE with best KNN is (0.933/1.154)^2 = 65.37% of the naive estimator
#### MSE with SVD is (0.924/1.154)^2 = 64.11% of the naive estimator
#### MSE with NMF is (0.918/1.154)^2 = 63.28% of the naive estimator

### Analyze accuracy of KNN predictions on population segments

In [43]:
knn_predictions = KNNBaseline(k=100, min_k=10, verbose=False, random_state=42, sim_options={'user_based': False, 'name': 'pearson_baseline'}).fit(trainset_final).test(testset_final)
knn_ResultCatcher = pd.DataFrame(knn_predictions, columns=['User', 'Restaurant', 'Real_Rating', 'Estimated_Rating', 'details'])
knn_ResultCatcher.drop(['details'], axis=1, inplace=True)
knn_ResultComparison = pd.DataFrame({'Count': knn_ResultCatcher.groupby(['Real_Rating']).size(),
        'Avg_Est': knn_ResultCatcher.groupby(['Real_Rating']).mean()['Estimated_Rating']
        }).reset_index()
knn_ResultComparison

  'Avg_Est': knn_ResultCatcher.groupby(['Real_Rating']).mean()['Estimated_Rating']


Unnamed: 0,Real_Rating,Count,Avg_Est
0,1.0,1391,2.764331
1,2.0,2192,3.122309
2,3.0,5182,3.407245
3,4.0,6778,3.743099
4,5.0,4457,4.061909


### Analyze accuracy of SVD predictions on population segments

In [3]:
svd_predictions = SVD(n_factors=500, n_epochs=100, lr_all=0.01, reg_all=0.1, verbose=False,random_state=42,biased=True).fit(trainset_final).test(testset_final)
svd_ResultCatcher = pd.DataFrame(svd_predictions, columns=['User', 'Restaurant', 'Real_Rating', 'Estimated_Rating', 'details'])
svd_ResultCatcher.drop(['details'], axis=1, inplace=True)
svd_ResultComparison = pd.DataFrame({'Count': svd_ResultCatcher.groupby(['Real_Rating']).size(),
        'Avg_Est': svd_ResultCatcher.groupby(['Real_Rating']).mean()['Estimated_Rating']
        }).reset_index()
svd_ResultComparison

  'Avg_Est': svd_ResultCatcher.groupby(['Real_Rating']).mean()['Estimated_Rating']


Unnamed: 0,Real_Rating,Count,Avg_Est
0,1.0,1391,2.684416
1,2.0,2192,3.069825
2,3.0,5182,3.371561
3,4.0,6778,3.705844
4,5.0,4457,4.011789


In [4]:
svd_predictions = SVD(n_factors=50, n_epochs=50, lr_all=0.01, reg_all=0.1, verbose=False,random_state=42,biased=True).fit(trainset_final).test(testset_final)
svd_ResultCatcher = pd.DataFrame(svd_predictions, columns=['User', 'Restaurant', 'Real_Rating', 'Estimated_Rating', 'details'])
svd_ResultCatcher.drop(['details'], axis=1, inplace=True)
svd_ResultComparison = pd.DataFrame({'Count': svd_ResultCatcher.groupby(['Real_Rating']).size(),
        'Avg_Est': svd_ResultCatcher.groupby(['Real_Rating']).mean()['Estimated_Rating']
        }).reset_index()
svd_ResultComparison

  'Avg_Est': svd_ResultCatcher.groupby(['Real_Rating']).mean()['Estimated_Rating']


Unnamed: 0,Real_Rating,Count,Avg_Est
0,1.0,1391,2.623781
1,2.0,2192,3.055239
2,3.0,5182,3.364365
3,4.0,6778,3.703791
4,5.0,4457,4.018016


#### Tends to overestimate low ratings and underestimate high ratings

### Confirm that we may predict missing values

In [77]:
#algo = SVD(n_epochs=5, lr_all=0.001, reg_all=0.2, verbose=False, random_state=42).fit(trainset)
#pred = algo.predict(uid=2, iid='V1', verbose=True)

user: 2          item: V1         r_ui = None   est = 3.26   {'was_impossible': False}


## Next Steps:
### Confirm missing values were handled correctly
### Figure out how to submit predictions

#### Evaluate on the entire dataset

In [6]:
#all_dataset = data.build_full_trainset()
#model = SVD(n_epochs=200, lr_all=0.005, reg_all=0.3, verbose=False, random_state=42).fit(all_dataset)

In [8]:
#feedback_pred = feedback_orig.copy()

In [9]:
#for v in feedback_pred.columns:
#    for i in feedback_pred.index:
#        val = feedback_pred.iloc[i][v]
#        if pd.isnull(val):
#            feedback_pred.iloc[i][v] = model.predict(uid=i+1, iid=v, verbose=False).est

In [10]:
#feedback_pred

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15
0,3.0,2.915125,3.096318,3.0,3.491074,3.0,3.0,2.99817,3.231718,3.0,3.0,3.488151,4.0,3.223453,3.0
1,3.904623,3.0,3.0,3.359263,3.775469,3.227723,3.0,3.0,3.6738,3.095216,3.0,3.760955,5.0,3.567399,3.0
2,3.337861,2.854548,3.0,3.0,3.0,3.0,3.0,3.0,3.10534,2.873284,2.855822,4.0,3.0,3.0,2.964254
3,4.0,4.0,3.24725,3.316952,3.0,3.164566,3.411835,3.0,3.56733,2.0,3.078502,4.0,3.792723,4.0,3.199202
4,5.0,4.366082,5.0,5.0,4.787959,4.307359,5.0,4.262825,5.0,4.14857,4.263347,5.0,4.852598,4.464221,5.0
5,1.0,2.931339,4.0,2.0,3.117857,2.658271,4.0,2.827705,4.0,2.0,2.931245,3.0,3.08472,2.660475,2.87049
6,3.637396,2.853405,3.0,4.0,3.565103,3.0,3.053452,3.007353,3.0,2.901351,1.0,5.0,3.842731,3.489402,3.0
7,2.0,2.216213,2.491548,2.196496,4.0,2.475243,2.594261,2.0,1.0,2.0,2.455727,3.0,1.0,2.347648,2.0
8,3.0,2.960113,3.0,3.162362,3.640746,5.0,3.334216,3.0,3.0,3.299491,3.0,3.604268,3.0,4.0,3.0
9,4.535697,4.0,4.217504,5.0,4.543128,4.090026,4.273468,4.026073,5.0,3.927606,3.953727,5.0,4.65846,4.0,5.0


In [11]:
#feedback_pred.to_csv('Feedback_pred.csv')