In [35]:
!pip install scikit-surprise



In [0]:
import pandas as pd
import numpy as np

## **Data Loading**

In [0]:
from surprise import Dataset

# Load the movielens-100k dataset (download it if needed),
movie_data = Dataset.load_builtin('ml-100k')

We then split it with a training and a test set. 

In [0]:
from surprise.model_selection import train_test_split
trainset, testset = train_test_split(movie_data, test_size=.25)

## **Algo Building** : With Surprise



### With SVD

In [19]:
from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import KFold


# define a cross-validation iterator
kf = KFold(n_splits=5)

algo = SVD()

for trainset, testset in kf.split(data):

    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)

RMSE: 0.9341
RMSE: 0.9300
RMSE: 0.9396
RMSE: 0.9285
RMSE: 0.9421


### With KNN-Basic

In [39]:
from surprise import KNNBasic


kf = KFold(n_splits=5)

algo = KNNBasic()

for trainset, testset in kf.split(data):


    algo.fit(trainset)
    predictions = algo.test(testset)
    accuracy.rmse(predictions, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9739
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9777
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9831
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9737
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9793


### With KNN-with-means

In [21]:
from surprise import KNNWithMeans


kf = KFold(n_splits=5)

algo = KNNWithMeans()

for trainset, testset in kf.split(data):

    algo.fit(trainset)
    predictions = algo.test(testset)
    accuracy.rmse(predictions, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9622
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9491
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9469
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9529
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9449


### With SVD++

In [22]:
from surprise import SVDpp


kf = KFold(n_splits=5)

algo = SVDpp()

for trainset, testset in kf.split(data):

    algo.fit(trainset)
    predictions = algo.test(testset)
    accuracy.rmse(predictions, verbose=True)

RMSE: 0.9253
RMSE: 0.9175
RMSE: 0.9173
RMSE: 0.9178
RMSE: 0.9243


### With Co-Clustering

In [23]:
from surprise import CoClustering 


kf = KFold(n_splits=5)

algo = CoClustering()

for trainset, testset in kf.split(data):

    algo.fit(trainset)
    predictions = algo.test(testset)
    accuracy.rmse(predictions, verbose=True)

RMSE: 0.9629
RMSE: 0.9561
RMSE: 0.9645
RMSE: 0.9700
RMSE: 0.9757


### With NMF

In [24]:
from surprise import NMF


kf = KFold(n_splits=5)

algo = NMF()

for trainset, testset in kf.split(data):

    algo.fit(trainset)
    predictions = algo.test(testset)
    accuracy.rmse(predictions, verbose=True)

RMSE: 0.9555
RMSE: 0.9617
RMSE: 0.9657
RMSE: 0.9664
RMSE: 0.9583


### Predictions with KNNBasic
KNNBasic gave the best results here. 

Let's do a prediction for a particular couple of user, movie. Let's take user 10 and movie 100. 

In [41]:
user_id = str(1)
movie_id = str(10)

prediction = algo.predict(user_id,movie_id)
prediction

Prediction(uid='1', iid='10', r_ui=None, est=4.078636050542991, details={'actual_k': 40, 'was_impossible': False})