## 1 Installation of Surprise in Google Colab

In [1]:
!pip install surprise

Collecting surprise
  Downloading https://files.pythonhosted.org/packages/61/de/e5cba8682201fcf9c3719a6fdda95693468ed061945493dea2dd37c5618b/surprise-0.1-py2.py3-none-any.whl
Collecting scikit-surprise
[?25l  Downloading https://files.pythonhosted.org/packages/97/37/5d334adaf5ddd65da99fc65f6507e0e4599d092ba048f4302fe8775619e8/scikit-surprise-1.1.1.tar.gz (11.8MB)
[K     |████████████████████████████████| 11.8MB 4.0MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp36-cp36m-linux_x86_64.whl size=1618267 sha256=845e6aaebd938cdfc0dcd9df6017ebef2d9a8529c48eee815e88a6cf7e1b93fa
  Stored in directory: /root/.cache/pip/wheels/78/9c/3d/41b419c9d2aff5b6e2b4c0fc8d25c538202834058f9ed110d0
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.1 surprise-0.1


## 2 Import necessary libraries

In [2]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise import accuracy

## 3 Load Data (Movies Dataset from Surprise) 

In [3]:
data = Dataset.load_builtin('ml-100k')

Dataset ml-100k could not be found. Do you want to download it? [Y/n] Y
Trying to download dataset from http://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to /root/.surprise_data/ml-100k


## 4 Build Model using SVD

### 4.1 Split data in train and test

In [4]:
train, test = train_test_split(data, test_size=0.25, random_state=42)

In [5]:
train.n_users, train.n_items

(943, 1644)

### 4.2 Train Model

In [6]:
# From surprise import SVD
model = SVD()

In [7]:
model.fit(train)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f59398cd588>

### 4.3 Make Prediction

In [8]:
predictions = model.test(test)

In [9]:
predictions

[Prediction(uid='391', iid='591', r_ui=4.0, est=3.467493848882146, details={'was_impossible': False}),
 Prediction(uid='181', iid='1291', r_ui=1.0, est=1.4865244198608778, details={'was_impossible': False}),
 Prediction(uid='637', iid='268', r_ui=2.0, est=3.2433927566013114, details={'was_impossible': False}),
 Prediction(uid='332', iid='451', r_ui=5.0, est=4.163027499214295, details={'was_impossible': False}),
 Prediction(uid='271', iid='204', r_ui=4.0, est=3.9328619673667014, details={'was_impossible': False}),
 Prediction(uid='27', iid='286', r_ui=3.0, est=3.832629940188108, details={'was_impossible': False}),
 Prediction(uid='387', iid='663', r_ui=4.0, est=3.8251985172172596, details={'was_impossible': False}),
 Prediction(uid='92', iid='722', r_ui=3.0, est=2.663507707454304, details={'was_impossible': False}),
 Prediction(uid='820', iid='347', r_ui=4.0, est=2.7983560787401522, details={'was_impossible': False}),
 Prediction(uid='479', iid='1444', r_ui=1.0, est=3.1131615950828038, 

### 4.4 Evaluation

In [10]:
accuracy.rmse(predictions)

RMSE: 0.9433


0.9432801082871652

In [11]:
accuracy.mae(predictions=predictions)

MAE:  0.7430


0.7429533260706718

### 4.5 Cross Validation

In [12]:
cross_validate(model, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9337  0.9282  0.9365  0.9412  0.9396  0.9359  0.0046  
MAE (testset)     0.7367  0.7339  0.7377  0.7409  0.7388  0.7376  0.0023  
Fit time          4.99    4.96    5.09    5.01    5.04    5.02    0.04    
Test time         0.16    0.23    0.15    0.23    0.15    0.18    0.04    


{'fit_time': (4.991088628768921,
  4.961717367172241,
  5.092529535293579,
  5.005552530288696,
  5.0397868156433105),
 'test_mae': array([0.73666743, 0.73392067, 0.73767903, 0.74087246, 0.7387511 ]),
 'test_rmse': array([0.93374641, 0.92824767, 0.9364928 , 0.94124124, 0.93964979]),
 'test_time': (0.16019821166992188,
  0.22687125205993652,
  0.1538243293762207,
  0.2311713695526123,
  0.14871597290039062)}

# Use of Pipeline and build two models

## 5 Using KNNwithMeans algorithm

#### 5.1 Check best score and best params to build models

In [13]:
from surprise import KNNWithMeans
# from surprise import Dataset
from surprise.model_selection import GridSearchCV

# data = Dataset.load_builtin("ml-100k")
sim_options = {
    "name": ["msd", "cosine"],
    "min_support": [3, 4, 5],
    "user_based": [False, True],
}
param_grid = {"sim_options": sim_options}

gs = GridSearchCV(KNNWithMeans, param_grid, measures=["rmse", "mae"], cv=3)
gs.fit(data)

print(gs.best_score["rmse"])
print(gs.best_params["rmse"])

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

#### 5.2 Build Model

In [14]:
# Using best params from above data
sim_options = {
    "name": "msd",
    "user_based": False,  # Compute  similarities between items
}
model_1 = KNNWithMeans(sim_options=sim_options)
model_1.fit(train)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7f5934c40128>

### 5.3 Prediction

In [15]:
pred_KNNWithMeans = model_1.test(test)

In [16]:
pred_KNNWithMeans

[Prediction(uid='391', iid='591', r_ui=4.0, est=3.4081618213595783, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='181', iid='1291', r_ui=1.0, est=1.6957735725954333, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='637', iid='268', r_ui=2.0, est=2.9879049464497935, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='332', iid='451', r_ui=5.0, est=4.113439566471616, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='271', iid='204', r_ui=4.0, est=3.8430048887370565, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='27', iid='286', r_ui=3.0, est=3.527979423991914, details={'actual_k': 15, 'was_impossible': False}),
 Prediction(uid='387', iid='663', r_ui=4.0, est=3.8230888217722683, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='92', iid='722', r_ui=3.0, est=2.5644448440139858, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='820', iid='347', r_ui=4.0,

### 5.4 Accuracy

In [17]:
accuracy.rmse(pred_KNNWithMeans)

RMSE: 0.9377


0.9376561891346782

In [18]:
accuracy.mae(pred_KNNWithMeans)

MAE:  0.7355


0.7355240537993545

### 5.5 Cross Validate

In [19]:
cross_validate(model_1, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9393  0.9275  0.9351  0.9340  0.9339  0.9340  0.0038  
MAE (testset)     0.7361  0.7282  0.7352  0.7327  0.7350  0.7334  0.0029  
Fit time          0.70    0.67    0.72    0.69    0.66    0.69    0.02    
Test time         4.50    4.61    4.51    4.28    4.23    4.43    0.15    


{'fit_time': (0.7018787860870361,
  0.6725223064422607,
  0.721268892288208,
  0.6885206699371338,
  0.6578302383422852),
 'test_mae': array([0.73614695, 0.72815834, 0.73523837, 0.73266705, 0.73500619]),
 'test_rmse': array([0.93925839, 0.92751255, 0.93511264, 0.93400179, 0.93388438]),
 'test_time': (4.5047242641448975,
  4.614814519882202,
  4.514279842376709,
  4.279648780822754,
  4.228545427322388)}

## 6 KNNBaseLine Model

### 6.1 Check best score and best params to build models

In [20]:
from surprise import KNNBaseline
# from surprise import Dataset
from surprise.model_selection import GridSearchCV

# data = Dataset.load_builtin("ml-100k")
sim_options = {
    "name": ["msd", "pearson_baseline"],
    "min_support": [3, 4, 5],
    "user_based": [False, True],
}
param_grid = {"sim_options": sim_options}

gs = GridSearchCV(KNNBaseline, param_grid, measures=["rmse", "mae"], cv=3)
gs.fit(data)

print(gs.best_score["rmse"])
print(gs.best_params["rmse"])

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matr

#### 6.2 Build Model

In [21]:
# Using best params from above data
sim_options = {
    "name": "pearson_baseline",
    "user_based": False,  # Compute  similarities between items
}
model_2 = KNNBaseline(sim_options=sim_options)
model_2.fit(train)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x7f5934c40470>

#### 6.3 Prediction

In [22]:
pred_KnnBaseline = model_2.test(test)

In [23]:
pred_KnnBaseline

[Prediction(uid='391', iid='591', r_ui=4.0, est=3.4911051665539237, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='181', iid='1291', r_ui=1.0, est=1.9867724998882346, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='637', iid='268', r_ui=2.0, est=2.496550303571375, details={'actual_k': 39, 'was_impossible': False}),
 Prediction(uid='332', iid='451', r_ui=5.0, est=4.096959193713625, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='271', iid='204', r_ui=4.0, est=3.7886256278442043, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='27', iid='286', r_ui=3.0, est=3.802355488363607, details={'actual_k': 5, 'was_impossible': False}),
 Prediction(uid='387', iid='663', r_ui=4.0, est=3.837743659689841, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='92', iid='722', r_ui=3.0, est=2.8117369976706046, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='820', iid='347', r_ui=4.0, es

#### 6.4 Accuracy

In [24]:
accuracy.rmse(pred_KnnBaseline)

RMSE: 0.9238


0.9237660818560733

In [25]:
accuracy.mae(pred_KnnBaseline)

MAE:  0.7230


0.7229722085621993

#### 6.5 Cross Validate

In [26]:
cross_validate(model_2, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBaseline on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9123  0.9244  0.9100  0.9223  0.9098  0.9157  0.0063  
MAE (testset)     0.7130  0.7268  0.7166  0.7202  0.7136  0.7180  0.0051  
Fit time          1.79    1.91    1.86    1.82    1.81    1.84    0.04    
Test time         4.54    4.75    4.60    4.59    4.5

{'fit_time': (1.7928225994110107,
  1.9083902835845947,
  1.8568685054779053,
  1.8235752582550049,
  1.8070549964904785),
 'test_mae': array([0.71299708, 0.72681797, 0.71659784, 0.72021942, 0.71361566]),
 'test_rmse': array([0.91227881, 0.92436789, 0.90995739, 0.92231725, 0.90977928]),
 'test_time': (4.538878440856934,
  4.748149394989014,
  4.602680921554565,
  4.586146354675293,
  4.539069652557373)}

#### Conclusion