In [1]:
from surprise import KNNWithMeans
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import KFold

import pandas as pd

In [2]:
# загрузаить данные по фильтма и рейтингам
movies = pd.read_csv('movies.dat',  sep="::", names = ['movieId', 'title', 'genres'],encoding='latin-1',engine='python')
ratings = pd.read_csv('ratings.dat',  sep="::", names = ["userId", "movieId", "rating", "Timestamp"],encoding='latin-1',engine='python')
# ratings.head()
# объеденить данные в единый dataframe
movies_with_ratings = movies.join(ratings.set_index('movieId'), on='movieId').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)

In [4]:
# создать df с которым может работать surprise
dataset = pd.DataFrame({
    'uid': movies_with_ratings.userId,
    'iid': movies_with_ratings.title,
    'rating': movies_with_ratings.rating
})
# dataset.head()

Unnamed: 0,uid,iid,rating
0,1.0,Toy Story (1995),5.0
1,6.0,Toy Story (1995),4.0
2,8.0,Toy Story (1995),4.0
3,9.0,Toy Story (1995),5.0
4,10.0,Toy Story (1995),5.0


In [5]:
# создать dataset для работы surprise
rating_min = ratings.rating.min()
rating_max = ratings.rating.max()
reader = Reader(rating_scale=(rating_min, rating_max))
data = Dataset.load_from_df(dataset, reader)

In [6]:
# для разделения данных используем простой cross-validation с 5 фолдами
splitter = KFold(n_splits=5)
data_splitted = splitter.split(data)

In [7]:
# создаем инстанс алгоритма
# при использовании user-based подхода - ошибка была выше 0.88
# algo = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': True})
# в результате примерения item-based подхода, удалось добиться ошибки на уровне 0.8575
algo = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': False})

In [8]:
# цикл обучения и теста в рамках кросс-валидации
for trainset, testset in data_splitted:
    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8583
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8620
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8585
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8605
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8575
