## 1 Installation of Surprise in Google Colab

In [1]:
!pip install surprise

Collecting surprise
  Downloading https://files.pythonhosted.org/packages/61/de/e5cba8682201fcf9c3719a6fdda95693468ed061945493dea2dd37c5618b/surprise-0.1-py2.py3-none-any.whl
Collecting scikit-surprise
[?25l  Downloading https://files.pythonhosted.org/packages/97/37/5d334adaf5ddd65da99fc65f6507e0e4599d092ba048f4302fe8775619e8/scikit-surprise-1.1.1.tar.gz (11.8MB)
[K     |████████████████████████████████| 11.8MB 273kB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp36-cp36m-linux_x86_64.whl size=1618271 sha256=4c8b1eb09f9ec343fcf82ac554abb2ce9b2759a2ab8ff290fa930aec438ae511
  Stored in directory: /root/.cache/pip/wheels/78/9c/3d/41b419c9d2aff5b6e2b4c0fc8d25c538202834058f9ed110d0
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.1 surprise-0.1


## 2 Import necessary libraries

In [2]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise import accuracy

## 3 Load Data (Movies Dataset from Surprise) 

In [3]:
data = Dataset.load_builtin('ml-100k')

Dataset ml-100k could not be found. Do you want to download it? [Y/n] Y
Trying to download dataset from http://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to /root/.surprise_data/ml-100k


## 4 Build Model using SVD

### 4.1 Split data in train and test

In [4]:
train, test = train_test_split(data, test_size=0.25, random_state=42)

In [5]:
train.n_users, train.n_items

(943, 1644)

### 4.2 Train Model

In [6]:
# From surprise import SVD
model_SVD = SVD()

In [7]:
model_SVD.fit(train)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f06a0d00860>

### 4.3 Make Prediction

In [8]:
predictions = model_SVD.test(test)

In [9]:
predictions

[Prediction(uid='391', iid='591', r_ui=4.0, est=3.335669524343874, details={'was_impossible': False}),
 Prediction(uid='181', iid='1291', r_ui=1.0, est=1.5681825567781469, details={'was_impossible': False}),
 Prediction(uid='637', iid='268', r_ui=2.0, est=2.879937431043156, details={'was_impossible': False}),
 Prediction(uid='332', iid='451', r_ui=5.0, est=3.917108586984055, details={'was_impossible': False}),
 Prediction(uid='271', iid='204', r_ui=4.0, est=3.6637728667167466, details={'was_impossible': False}),
 Prediction(uid='27', iid='286', r_ui=3.0, est=3.56646835247723, details={'was_impossible': False}),
 Prediction(uid='387', iid='663', r_ui=4.0, est=3.4721003020223242, details={'was_impossible': False}),
 Prediction(uid='92', iid='722', r_ui=3.0, est=3.027970793102507, details={'was_impossible': False}),
 Prediction(uid='820', iid='347', r_ui=4.0, est=3.5103032480286696, details={'was_impossible': False}),
 Prediction(uid='479', iid='1444', r_ui=1.0, est=2.715845833703309, det

### 4.4 Evaluation

In [10]:
accuracy.rmse(predictions)

RMSE: 0.9422


0.9422377857356765

In [11]:
accuracy.mae(predictions)

MAE:  0.7425


0.7424685450783365

### 4.5 Cross Validation

In [12]:
cross_validate(model_SVD, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9383  0.9361  0.9381  0.9341  0.9418  0.9377  0.0025  
MAE (testset)     0.7394  0.7375  0.7400  0.7356  0.7395  0.7384  0.0016  
Fit time          4.07    4.05    4.03    4.05    4.06    4.05    0.01    
Test time         0.12    0.18    0.12    0.18    0.12    0.14    0.03    


{'fit_time': (4.071376800537109,
  4.0538084506988525,
  4.030364751815796,
  4.046112537384033,
  4.056187868118286),
 'test_mae': array([0.7394473 , 0.73747447, 0.74001503, 0.73559976, 0.73948018]),
 'test_rmse': array([0.93825299, 0.93605329, 0.93809253, 0.93413806, 0.94178145]),
 'test_time': (0.11727261543273926,
  0.18349361419677734,
  0.1197512149810791,
  0.17941570281982422,
  0.11533975601196289)}

# Use of Pipeline and build two models

## 5 Using KNNwithMeans algorithm

#### 5.1 Check best score and best params to build models

In [13]:
from surprise import KNNWithMeans
# from surprise import Dataset
from surprise.model_selection import GridSearchCV

# data = Dataset.load_builtin("ml-100k")
sim_options = {
    "name": ["msd", "cosine"],
    "min_support": [3, 4, 5],
    "user_based": [False, True],
}
param_grid = {"sim_options": sim_options}

gs = GridSearchCV(KNNWithMeans, param_grid, measures=["rmse", "mae"], cv=3)
gs.fit(data)

print(gs.best_score["rmse"])
print(gs.best_params["rmse"])

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

#### 5.2 Build Model

In [14]:
# Using best params from above data
sim_options = {
    "name": "msd",
    "user_based": False,  # Compute  similarities between items
}
model_KNNWithMeans = KNNWithMeans(sim_options=sim_options)
model_KNNWithMeans.fit(train)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7f069b632898>

### 5.3 Prediction

In [15]:
pred_KNNWithMeans = model_KNNWithMeans.test(test)

In [16]:
pred_KNNWithMeans

[Prediction(uid='391', iid='591', r_ui=4.0, est=3.4081618213595783, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='181', iid='1291', r_ui=1.0, est=1.6957735725954333, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='637', iid='268', r_ui=2.0, est=2.9879049464497935, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='332', iid='451', r_ui=5.0, est=4.113439566471616, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='271', iid='204', r_ui=4.0, est=3.8430048887370565, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='27', iid='286', r_ui=3.0, est=3.527979423991914, details={'actual_k': 15, 'was_impossible': False}),
 Prediction(uid='387', iid='663', r_ui=4.0, est=3.8230888217722683, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='92', iid='722', r_ui=3.0, est=2.5644448440139858, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='820', iid='347', r_ui=4.0,

### 5.4 Accuracy

In [17]:
accuracy.rmse(pred_KNNWithMeans)

RMSE: 0.9377


0.9376561891346782

In [18]:
accuracy.mae(pred_KNNWithMeans)

MAE:  0.7355


0.7355240537993545

### 5.5 Cross Validate

In [19]:
cross_validate(model_KNNWithMeans, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9342  0.9323  0.9360  0.9308  0.9386  0.9344  0.0027  
MAE (testset)     0.7340  0.7342  0.7353  0.7279  0.7365  0.7336  0.0030  
Fit time          0.39    0.40    0.45    0.40    0.40    0.41    0.02    
Test time         3.04    3.08    3.09    3.16    3.26    3.13    0.08    


{'fit_time': (0.39069151878356934,
  0.4045116901397705,
  0.4527461528778076,
  0.3969156742095947,
  0.40216946601867676),
 'test_mae': array([0.73395681, 0.73416171, 0.73527573, 0.72792687, 0.73649539]),
 'test_rmse': array([0.93415215, 0.93232342, 0.93599398, 0.93076482, 0.93858647]),
 'test_time': (3.0405125617980957,
  3.0809500217437744,
  3.0906777381896973,
  3.163834810256958,
  3.264052629470825)}

## 6 KNNBaseLine Model

### 6.1 Check best score and best params to build models

In [20]:
from surprise import KNNBaseline
# from surprise import Dataset
from surprise.model_selection import GridSearchCV

# data = Dataset.load_builtin("ml-100k")
sim_options = {
    "name": ["msd", "pearson_baseline"],
    "min_support": [3, 4, 5],
    "user_based": [False, True],
}
param_grid = {"sim_options": sim_options}

gs = GridSearchCV(KNNBaseline, param_grid, measures=["rmse", "mae"], cv=3)
gs.fit(data)

print(gs.best_score["rmse"])
print(gs.best_params["rmse"])

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matr

#### 6.2 Build Model

In [21]:
# Using best params from above data
sim_options = {
    "name": "pearson_baseline",
    "user_based": False,  # Compute  similarities between items
}
model_KNNBaseline = KNNBaseline(sim_options=sim_options)
model_KNNBaseline.fit(train)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x7f069b632128>

#### 6.3 Prediction

In [22]:
pred_KnnBaseline = model_KNNBaseline.test(test)

In [23]:
pred_KnnBaseline

[Prediction(uid='391', iid='591', r_ui=4.0, est=3.4911051665539237, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='181', iid='1291', r_ui=1.0, est=1.9867724998882346, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='637', iid='268', r_ui=2.0, est=2.496550303571375, details={'actual_k': 39, 'was_impossible': False}),
 Prediction(uid='332', iid='451', r_ui=5.0, est=4.096959193713625, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='271', iid='204', r_ui=4.0, est=3.7886256278442043, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='27', iid='286', r_ui=3.0, est=3.802355488363607, details={'actual_k': 5, 'was_impossible': False}),
 Prediction(uid='387', iid='663', r_ui=4.0, est=3.837743659689841, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='92', iid='722', r_ui=3.0, est=2.8117369976706046, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='820', iid='347', r_ui=4.0, es

#### 6.4 Accuracy

In [24]:
accuracy.rmse(pred_KnnBaseline)

RMSE: 0.9238


0.9237660818560733

In [25]:
accuracy.mae(pred_KnnBaseline)

MAE:  0.7230


0.7229722085621993

#### 6.5 Cross Validate

In [26]:
cross_validate(model_KNNBaseline, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBaseline on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9172  0.9112  0.9237  0.9123  0.9203  0.9169  0.0047  
MAE (testset)     0.7189  0.7162  0.7228  0.7148  0.7232  0.7192  0.0034  
Fit time          1.12    1.12    1.16    1.13    1.11    1.13    0.02    
Test time         3.44    3.47    3.42    3.42    3.3

{'fit_time': (1.122544765472412,
  1.1162028312683105,
  1.1645176410675049,
  1.1258575916290283,
  1.1126768589019775),
 'test_mae': array([0.71889039, 0.71618161, 0.72279423, 0.71475042, 0.7232163 ]),
 'test_rmse': array([0.91716857, 0.91119558, 0.9236505 , 0.91230033, 0.92027441]),
 'test_time': (3.435586452484131,
  3.473083257675171,
  3.4206838607788086,
  3.416865348815918,
  3.372520923614502)}

## 7 Benchmarking

In [27]:
import time
import datetime
import numpy as np

# classes = ['SVD', 'KNNWithMeans', 'KNNBaseline']
classes =(model_SVD, model_KNNWithMeans, model_KNNBaseline)
table = []
# cv = 5
for klass in classes:
    start = time.time()
    out = cross_validate(klass, data, ['rmse', 'mae'], cv=5, verbose=True)
    cv_time = str(datetime.timedelta(seconds=int(time.time() - start)))
    mean_rmse = '{:.3f}'.format(np.mean(out['test_rmse']))
    mean_mae = '{:.3f}'.format(np.mean(out['test_mae']))
    new_line = [klass, mean_rmse, mean_mae, cv_time]
    table.append(new_line)



Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9323  0.9344  0.9367  0.9359  0.9376  0.9354  0.0019  
MAE (testset)     0.7335  0.7352  0.7398  0.7379  0.7393  0.7371  0.0024  
Fit time          3.96    3.91    3.89    3.86    3.88    3.90    0.03    
Test time         0.12    0.22    0.12    0.12    0.21    0.16    0.05    
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9336  0.9305  0.9313  0.9336  0.9451  0.9348  0.

In [44]:
import pandas as pd
header = ['Movielens 100K',
          'RMSE',
          'MAE',
          'Time'
          ]
movie_df = pd.DataFrame(table, columns=header, index=['SVD', 'KNNWIthMeans', 'KNNBaseline'])

In [45]:
movie_df.head()

Unnamed: 0,Movielens 100K,RMSE,MAE,Time
SVD,<surprise.prediction_algorithms.matrix_factori...,0.935,0.737,0:00:20
KNNWIthMeans,<surprise.prediction_algorithms.knns.KNNWithMe...,0.935,0.734,0:00:17
KNNBaseline,<surprise.prediction_algorithms.knns.KNNBaseli...,0.916,0.719,0:00:23
