In [1]:
from surprise import KNNWithMeans, KNNBasic
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split
from surprise.model_selection import KFold
import pandas as pd
import numpy as np
from statistics import mean

In [2]:
movies = pd.read_csv('movies.dat', sep='::',
                     header=None, names=['movie_id', 'movie_title', 'genres'],
                     engine='python')

In [3]:
movies.shape

(3883, 3)

In [4]:
movies.head()

Unnamed: 0,movie_id,movie_title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
ratings = pd.read_csv('ratings.dat', sep='::',
                      names=['user_id', 'movie_id', 'rating', 'rating_timestamp'],
                      engine='python'
                      ).sort_values("rating_timestamp") 

In [6]:
ratings.shape

(1000209, 4)

In [7]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,rating_timestamp
1000138,6040,858,4,956703932
1000153,6040,2384,4,956703954
999873,6040,593,5,956703954
1000007,6040,1961,4,956703977
1000192,6040,2019,5,956703977


In [8]:
movies_with_ratings = movies.join(ratings.set_index('movie_id'), on = 'movie_id')

In [9]:
movies_with_ratings.dropna(inplace = True)

In [10]:
movies_with_ratings.head()

Unnamed: 0,movie_id,movie_title,genres,user_id,rating,rating_timestamp
0,1,Toy Story (1995),Animation|Children's|Comedy,6035.0,4.0,956712849.0
0,1,Toy Story (1995),Animation|Children's|Comedy,6032.0,4.0,956718127.0
0,1,Toy Story (1995),Animation|Children's|Comedy,6022.0,5.0,956755763.0
0,1,Toy Story (1995),Animation|Children's|Comedy,6021.0,3.0,956757147.0
0,1,Toy Story (1995),Animation|Children's|Comedy,6016.0,4.0,956778750.0


In [11]:
dataset = pd.DataFrame({
    'uid': movies_with_ratings.user_id,
    'iid': movies_with_ratings.movie_title,
    'rating': movies_with_ratings.rating
})

In [12]:
dataset.head()

Unnamed: 0,uid,iid,rating
0,6035.0,Toy Story (1995),4.0
0,6032.0,Toy Story (1995),4.0
0,6022.0,Toy Story (1995),5.0
0,6021.0,Toy Story (1995),3.0
0,6016.0,Toy Story (1995),4.0


In [13]:
min_rating = dataset.rating.min()
max_rating = dataset.rating.max()

In [14]:
reader = Reader(rating_scale = (min_rating, max_rating))
data = Dataset.load_from_df(dataset, reader)

In [15]:
algo = KNNWithMeans(k = 50, sim_options = {'name' : 'pearson_baseline', 'user_based' : False})

In [16]:
kf = KFold(n_splits=5)

In [17]:
ib_rmse = []

In [18]:
for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)

    ib_rmse.append(accuracy.rmse(predictions, verbose = True))

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8564
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8607
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8599
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8591
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8594


In [19]:
ib_rmse

[0.856437913888122,
 0.8607266094498102,
 0.8599143935735855,
 0.8590637951506488,
 0.8593956638087886]

In [20]:
round(mean(ib_rmse), 3)

0.859

Среднее значение RMSE при оценке на 5 подмножествах данных составляет 0,859, что меньше 0,870, следовательно, цель выполнения домашнего задания можно считать достигнутой.