In [1]:
!pip install tabulate



In [2]:
'''This module runs a 5-Fold CV for all the algorithms (default parameters) on
the movielens datasets, and reports average RMSE, MAE, and total computation
time.'''

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
import time
import datetime
import random

import numpy as np
import pandas as pd
import six
from tabulate import tabulate

from surprise import accuracy
from surprise.model_selection import train_test_split

from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise.model_selection import KFold
from surprise import NormalPredictor
from surprise import BaselineOnly
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNWithZScore
from surprise import KNNBaseline
from surprise import SVD
from surprise import SVDpp
from surprise import NMF
from surprise import SlopeOne
from surprise import CoClustering

# Prediction algorithm classes to benchmark, in the order they are reported.
classes = (
    NormalPredictor,
    BaselineOnly,
    KNNBasic,
    KNNWithMeans,
    KNNWithZScore,
    KNNBaseline,
    SVD,
    SVDpp,
    NMF,
    SlopeOne,
    CoClustering,
)

# Human-readable labels for the result tables.
# Keys are algorithm class names (looked up via `klass.__name__`) plus the
# dataset identifier, whose label is used as the first column header.
LINK = {
    'NormalPredictor': 'Random',
    'BaselineOnly': 'Baseline',
    'KNNBasic': 'k-NN',
    'KNNWithMeans': 'Centered k-NN',
    'KNNWithZScore': 'k-NN with z-score normalization',
    'KNNBaseline': 'k-NN Baseline',
    'SVD': 'SVD',
    'SVDpp': 'SVD++',
    'NMF': 'NMF',
    'SlopeOne': 'Slope One',
    'CoClustering': 'Co-Clustering',
    # The table headers evaluate LINK[dataset]; without an entry for the
    # dataset name that lookup raises KeyError.
    'ml-100k': '[Movielens 100k](http://grouplens.org/datasets/movielens/100k)',
}

# Seed Python's and NumPy's global RNGs before any data splitting or model
# fitting so that a fresh Restart & Run All reproduces the same numbers.
random.seed(0)
np.random.seed(0)

# Name of the built-in dataset to benchmark on, and the loaded dataset object.
dataset = 'ml-100k'
data = Dataset.load_builtin(dataset)

In [5]:
# Echo the loaded dataset object as a quick check that loading succeeded.
data

<surprise.dataset.DatasetAutoFolds at 0x109e75198>

# Unakrsna validacija

In [28]:
# Build the fold iterator once, with a fixed seed, so that every algorithm is
# scored on exactly the same 5 train/test partitions. Passing the bare integer
# 5 instead would draw a new random split for each algorithm, making the
# averages below not directly comparable.
kf = KFold(n_splits=5, random_state=0)

table = []
for klass in classes:
    start = time.time()
    # Default-parameter instance of the algorithm, evaluated on the shared folds.
    out = cross_validate(klass(), data, ['rmse', 'mae'], kf)
    # Wall-clock time for the whole cross-validation, truncated to seconds.
    cv_time = str(datetime.timedelta(seconds=int(time.time() - start)))
    link = LINK[klass.__name__]
    mean_rmse = '{:.3f}'.format(np.mean(out['test_rmse']))
    mean_mae = '{:.3f}'.format(np.mean(out['test_mae']))

    new_line = [link, mean_rmse, mean_mae, cv_time]
    table.append(new_line)

# Fall back to the raw dataset name when LINK has no label for it, instead of
# failing with KeyError after all the cross-validation work is done.
header = [LINK.get(dataset, dataset),
          'RMSE',
          'MAE',
          'Time'
          ]
print(tabulate(table, header, tablefmt="pipe"))

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Computing the msd similarity matrix..

# Evaluacija na skupu podataka za testiranje

In [6]:
# Hold out 30% of the ratings for testing. The fixed random_state makes the
# split, and therefore the reported scores, reproducible across runs.
trainset, testset = train_test_split(data, test_size=.3, random_state=0)
table = []

for klass in classes:
    start = time.time()
    # Train a default-parameter instance and predict the held-out ratings.
    algo = klass()
    algo.fit(trainset)
    predictions = algo.test(testset)
    # accuracy.rmse returns the score (and also prints an "RMSE: ..." line).
    rmse = '{:.3f}'.format(accuracy.rmse(predictions))
    # Wall-clock time for fit + predict + scoring, truncated to seconds.
    algo_time = str(datetime.timedelta(seconds=int(time.time() - start)))
    link = LINK[klass.__name__]

    new_line = [link, rmse, algo_time]
    table.append(new_line)

# Fall back to the raw dataset name when LINK has no label for it, instead of
# failing with KeyError after every model has already been trained.
header = [LINK.get(dataset, dataset),
          'RMSE',
          'Time'
          ]
print(tabulate(table, header, tablefmt="pipe"))

RMSE: 1.5188
Estimating biases using als...
RMSE: 0.9477
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9909
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9561
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9556
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9353
RMSE: 0.9426
RMSE: 0.9259
RMSE: 0.9701
RMSE: 0.9480
RMSE: 0.9764
| [Movielens 100k](http://grouplens.org/datasets/movielens/100k)   |   RMSE | Time    |
|:-----------------------------------------------------------------|-------:|:--------|
| Random                                                           |  1.519 | 0:00:00 |
| Baseline                                                         |  0.948 | 0:00:00 |
| k-NN                                                             |  0.991 | 0:00:06 |
| Centered k-NN                                                

In [7]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import GridSearchCV

In [8]:
# Candidate SVD hyper-parameter values: 2 x 2 x 2 = 8 combinations.
param_grid = {
    'n_epochs': [5, 10],
    'lr_all': [0.002, 0.005],
    'reg_all': [0.4, 0.6],
}

# Score every combination with 3-fold cross-validation on RMSE and MAE.
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)

# Best RMSE score found by the search.
print(gs.best_score['rmse'])

# Parameter combination that produced that best RMSE score.
print(gs.best_params['rmse'])

# Take the algorithm configured with the best-RMSE parameters and train it
# on the complete dataset.
algo = gs.best_estimator['rmse']
algo.fit(data.build_full_trainset())

0.963040573288
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x10ea4bf98>

In [25]:
# Grid over the neighbourhood size `k` and the similarity options only.
# Bound to its own name so the second grid below does not overwrite it
# before it can be used.
param_grid_sim = {'k': [10, 20],
                  'sim_options': {'name': ['msd', 'cosine'],
                                  'min_support': [1, 5],
                                  'user_based': [False]}
                  }

# Grid that additionally tunes the baseline-estimate options (`bsl_options`).
# This is the value `param_grid` holds once the cell has run.
# NOTE(review): no visible cell passes either grid to GridSearchCV; `gs` is
# still the SVD search fitted earlier. Confirm which search was intended.
param_grid = {'bsl_options': {'method': ['als', 'sgd'],
                              'reg': [1, 2]},
              'k': [2, 3],
              'sim_options': {'name': ['msd', 'cosine'],
                              'min_support': [1, 5],
                              'user_based': [False]}
              }

In [31]:
# Tabulate the per-combination cross-validation results of the grid search.
results_df = pd.DataFrame.from_dict(gs.cv_results)

# Take the algorithm configured with the best-RMSE parameters and train it
# on the complete dataset.
algo = gs.best_estimator['rmse']
algo.fit(data.build_full_trainset())

# Last expression of the cell: render the results table.
results_df

Unnamed: 0,mean_fit_time,mean_test_mae,mean_test_rmse,mean_test_time,param_lr_all,param_n_epochs,param_reg_all,params,rank_test_mae,rank_test_rmse,split0_test_mae,split0_test_rmse,split1_test_mae,split1_test_rmse,split2_test_mae,split2_test_rmse,std_fit_time,std_test_mae,std_test_rmse,std_test_time
0,2.797858,0.806048,0.997176,0.874351,0.002,5,0.4,"{'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.4}",7,7,0.800957,0.989023,0.810723,1.003678,0.806464,0.998827,0.026331,0.003998,0.006096,0.052703
1,3.619428,0.814685,1.003301,0.892896,0.002,5,0.6,"{'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.6}",8,8,0.809535,0.995234,0.819103,1.009578,0.815417,1.005091,0.571589,0.00394,0.005991,0.004187
2,3.799063,0.78186,0.973705,1.0317,0.005,5,0.4,"{'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.4}",2,2,0.776645,0.965741,0.786605,0.980983,0.782329,0.97439,0.064236,0.00408,0.006241,0.148419
3,3.665977,0.792739,0.982412,1.126099,0.005,5,0.6,"{'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.6}",5,5,0.787362,0.974326,0.797471,0.98952,0.793385,0.983391,0.104564,0.004152,0.006241,0.110935
4,5.821176,0.786237,0.97817,0.888134,0.002,10,0.4,"{'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.4}",4,4,0.780877,0.97006,0.79117,0.985374,0.786662,0.979075,0.165295,0.004213,0.006284,0.057338
5,5.787429,0.796599,0.986196,0.99759,0.002,10,0.6,"{'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.6}",6,6,0.791374,0.978264,0.801201,0.992967,0.797224,0.987357,0.112496,0.004036,0.006059,0.05766
6,5.926009,0.772617,0.963841,0.874462,0.005,10,0.4,"{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}",1,1,0.767252,0.95562,0.777604,0.971638,0.772994,0.964266,0.152206,0.004234,0.006546,0.034523
7,5.654495,0.78437,0.973707,0.868866,0.005,10,0.6,"{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.6}",3,3,0.779016,0.965575,0.789216,0.981166,0.784879,0.97438,0.024825,0.004179,0.006383,0.115431


# Display the raw cross-validation results stored on the grid-search object.
gs.cv_results