# TODO
- Implement the evaluate and benchmarking_pipeline functions
- Extract all the notebook functions in a python script
- Create a new notebook where you will use the extracted benchmarking_pipeline function to do the benchamrking
- Do the benchmarking of the 5 already used models along with NMF and SVD.

http://surpriselib.com

# Load data

## From surprise

In [4]:
from surprise import Dataset

ratings = Dataset.load_builtin('ml-100k')
ratings

<surprise.dataset.DatasetAutoFolds at 0x7fbf2510f2e0>

## From file

In [2]:
from pathlib import Path
from surprise import Reader

ratings_filepath = Path('../data/movielens/ml-latest-small/ratings.csv')
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
ratings = Dataset.load_from_file(ratings_filepath, reader)
ratings

<surprise.dataset.DatasetAutoFolds at 0x7fbf1f1918b0>

## Modular function

In [8]:
from surprise.dataset import DatasetAutoFolds
from pathlib import Path

def load_ratings_from_surprise() -> DatasetAutoFolds:
    ratings = Dataset.load_builtin('ml-100k')
    return ratings

def load_ratings_from_file(ratings_filepath : Path) -> DatasetAutoFolds:
    reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
    ratings = Dataset.load_from_file(ratings_filepath, reader)
    return ratings


def get_ratings(load_from_surprise : bool = True, ratings_filepath : Path = None) -> DatasetAutoFolds:
    if load_from_surprise:
        ratings = load_ratings_from_surprise()
    else:
        ratings = load_ratings_from_file(ratings_filepath)
    return ratings

ratings = get_ratings(load_from_surprise=True)
ratings

<surprise.dataset.DatasetAutoFolds at 0x7fbf24eb5610>

# Manual pipeline

## Split data in train and test

In [17]:
from surprise.model_selection import train_test_split

trainset, testset = train_test_split(ratings, test_size=0.2, random_state=42)
trainset

<surprise.trainset.Trainset at 0x7fbf0ab97a30>

## Train model

In [13]:
from surprise import KNNBasic

model = KNNBasic(sim_options={'user_based': True})

In [18]:
model.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7fbf0ab72400>

In [19]:
from surprise.prediction_algorithms.algo_base import AlgoBase
from surprise.trainset import Trainset

def train(model_class: AlgoBase, model_arguments: dict, trainset: Trainset) -> AlgoBase:
    model = model_class(model_arguments)
    model.fit(trainset)
    return model

train(KNNBasic, {'user_based': True}, trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7fbf252fa580>

In [20]:
from surprise.prediction_algorithms.matrix_factorization import NMF

train(NMF, {'n_factors' : 10}, trainset)

TypeError: 'dict' object cannot be interpreted as an integer

## Evaluation

In [12]:
from surprise import accuracy

accuracy.rmse(predictions=predictions)

RMSE: 0.9378


0.9378456428063894

In [13]:
accuracy.mae(predictions=predictions)

MAE:  0.7395


0.7395408044495279

## Modular code

# Benchmarking

In [21]:
def benchmarking_pipeline():
    pass

In [None]:
model_dict_list = [
    {
        'model_name' : 'user based CF with cosine',
        'model_class': KNNBasic,
        'model_arguments' : {}
    }
]