In [1]:
!pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise (from surprise)
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=2811596 sha256=b6107c038bea5ecda807b3995add4044501cc7813e388b08ae142ef8a4164ae8
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.3 surprise-0.1


In [2]:
!wget https://files.grouplens.org/datasets/movielens/ml-latest.zip
!unzip ml-latest.zip

--2023-08-29 09:50:22--  https://files.grouplens.org/datasets/movielens/ml-latest.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 350896731 (335M) [application/zip]
Saving to: ‘ml-latest.zip’


2023-08-29 09:50:25 (132 MB/s) - ‘ml-latest.zip’ saved [350896731/350896731]

Archive:  ml-latest.zip
   creating: ml-latest/
  inflating: ml-latest/tags.csv      
  inflating: ml-latest/links.csv     
  inflating: ml-latest/README.txt    
  inflating: ml-latest/ratings.csv   
  inflating: ml-latest/genome-tags.csv  
  inflating: ml-latest/genome-scores.csv  
  inflating: ml-latest/movies.csv    


In [3]:
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
import pandas as pd
import numpy as np

**Для экономии памяти в процессе обучения моделей я возьму не полный датасет с рейтингами, а 50% от него.**

In [4]:
# Load the movie catalogue and a 50% random sample of the ratings.
# The sample is seeded so that the subset of ratings (and therefore every
# number computed from it) is the same on each run of the notebook.
movies = pd.read_csv('ml-latest/movies.csv')
ratings = pd.read_csv('ml-latest/ratings.csv').sample(frac=0.50, random_state=42)

# Attach the sampled ratings to each movie by movieId, then discard rows that
# contain any missing value. Written as one chained expression (same order of
# operations as before: join -> reset_index -> dropna) instead of mutating the
# frame in place.
movies_with_ratings = (
    movies
    .join(ratings.set_index('movieId'), on='movieId')
    .reset_index(drop=True)
    .dropna()
)

In [6]:
# Three-column frame (user id, item id, rating) in the column order that
# Surprise's Dataset.load_from_df reads them.
dataset = (
    movies_with_ratings[['userId', 'movieId', 'rating']]
    .rename(columns={'userId': 'uid', 'movieId': 'iid'})
)

# The rating scale is taken from the observed minimum and maximum rating.
rating_bounds = (ratings.rating.min(), ratings.rating.max())
reader = Reader(rating_scale=rating_bounds)
data = Dataset.load_from_df(dataset, reader)

# Hold out 20% of the ratings for evaluation; the split is seeded.
trainset, testset = train_test_split(data, random_state=42, test_size=0.2)

**Для построения гибридной модели я обучу пять различных алгоритмов и в качестве итогового значения буду брать среднее их предсказаний.**

In [None]:
from surprise import SVD, BaselineOnly, NMF, SVDpp, CoClustering

# Candidate algorithms, stored as classes (not instances) so that each one is
# freshly constructed in the loop and its __name__ can be used as a label.
algos = [SVD, BaselineOnly, NMF, SVDpp, CoClustering]

# preds[i] holds the estimated ratings of algos[i] for every test-set entry,
# in test-set order.
preds = np.zeros((len(algos), len(testset)))

# Several of these algorithms initialise their parameters randomly. With no
# explicit random_state they draw from NumPy's global RNG, so seeding it here
# makes the fitted models and their predictions repeatable between runs.
np.random.seed(42)

for i, algo_cls in enumerate(algos):
    # Keep the class and the fitted instance under different names instead of
    # rebinding the loop variable.
    model = algo_cls()
    model.fit(trainset)
    preds[i] = [prediction.est for prediction in model.test(testset)]

**Для определения комбинации моделей, дающей наилучшее усреднённое предсказание, я посчитаю метрику RMSE для каждой возможной комбинации, включая одиночные алгоритмы. Комбинация с самым низким значением RMSE будет оптимальным гибридом выбранных алгоритмов в базовом виде.**

In [8]:
from itertools import combinations
from sklearn.metrics import mean_squared_error

# True ratings: element 2 of every test-set entry, in the same order as the
# columns of `preds`.
y_test = [x[2] for x in testset]

min_rmse = np.inf
best_combo = None      # prediction rows of the winning combination
best_indices = ()      # positions (into `algos` / `preds`) of the winning combination

# Exhaustively score every non-empty subset of algorithms by the RMSE of the
# plain average of their predictions.
# NOTE: the winner is selected and scored on the same test set, so the printed
# RMSE is an optimistic estimate of the hybrid's performance on unseen data.
for size in range(1, len(algos) + 1):
    # Iterate over index tuples rather than over the prediction arrays, so the
    # winning algorithms are identified by position instead of by comparing
    # whole arrays afterwards (which mislabels if two rows are identical).
    for indices in combinations(range(len(algos)), size):
        blended = preds[list(indices)].mean(axis=0)
        # Root of the MSE computed explicitly: the `squared=False` flag of
        # mean_squared_error is deprecated and absent in newer scikit-learn.
        rmse = np.sqrt(mean_squared_error(y_test, blended))
        if rmse < min_rmse:
            min_rmse = rmse
            best_indices = indices
            best_combo = tuple(preds[j] for j in indices)

best_hybrid = [algos[j].__name__ for j in best_indices]

print('Best combination:', best_hybrid)
print('Best RMSE score:', min_rmse)

Best combination: ['SVD', 'BaselineOnly', 'SVDpp']
Best RMSE score: 0.811704197140381
