## Данные

In [2]:
!pip install scikit-surprise

!gdown 1Hliaee7Y584-7lMoGeGjGabRRubzdoYn
!gdown 1JYpk5GTzK7GWT3mtDb9fNiwaVTzk8BBy

import pandas as pd
import numpy as np
import pickle
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split as surprise_train_test_split, GridSearchCV
from surprise import accuracy
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m153.6/154.4 kB[0m [31m5.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357290 sha256=877142c821c7b39a34e24de0dd02925f53ef266b37e072d00c770980c531be86
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a

Чтение данных

In [3]:
books = pd.read_csv("Books.csv", encoding='utf-8', low_memory=False)
ratings = pd.read_csv("Ratings.csv", encoding='utf-8', low_memory=False)

## Обработка данных

Приводим год к числу, убираем некорректные

In [4]:
books["Year-Of-Publication"] = pd.to_numeric(books["Year-Of-Publication"], errors='coerce')
books = books[(books["Year-Of-Publication"] >= 1) & (books["Year-Of-Publication"] <= 2024)]

Убираем пропуски по обязательным для нас столбцам

In [5]:
books = books.dropna(subset=["Book-Author", "Publisher", "Book-Title"])

Удаляем колонки с адресами картинок (не нужны)

In [6]:
cols_to_drop = ["Image-URL-S", "Image-URL-M", "Image-URL-L"]
books.drop(columns=cols_to_drop, inplace=True, errors='ignore')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  books.drop(columns=cols_to_drop, inplace=True, errors='ignore')


Считаем, что рейтинг = 0 -> пользователь не оценивал книгу (убираем из train)

In [7]:
ratings = ratings[ratings["Book-Rating"] > 0].copy()

Убираем книги/пользователей, которым/которыми поставлена только 1 оценка

In [8]:
book_counts = ratings["ISBN"].value_counts()
valid_books = book_counts[book_counts > 1].index
user_counts = ratings["User-ID"].value_counts()
valid_users = user_counts[user_counts > 1].index

ratings = ratings[ratings["ISBN"].isin(valid_books)]
ratings = ratings[ratings["User-ID"].isin(valid_users)]

Сужаем таблицу books до тех ISBN, которые остались после фильтрации

In [9]:
books = books[books["ISBN"].isin(ratings["ISBN"].unique())]

## Обучение SVD

In [10]:
data = Dataset.load_from_df(
    ratings[["User-ID", "ISBN", "Book-Rating"]],
    Reader(rating_scale=(1, 10))
)

trainset, testset = surprise_train_test_split(data, test_size=0.2, random_state=42)

Подберём гиперпараметры пошире, чтобы добиться MAE < 1.3

In [11]:
param_grid = {
    'n_factors': [50, 100, 150, 200],
    'lr_all':    [0.002, 0.005, 0.007, 0.01],
    'reg_all':   [0.02, 0.05, 0.1, 0.2]
}
gs = GridSearchCV(SVD, param_grid, measures=['mae'], cv=3, n_jobs=-1, joblib_verbose=0)
gs.fit(data)

best_params = gs.best_params['mae']
print("Best params SVD:", best_params)

model_svd = SVD(
    n_factors=best_params['n_factors'],
    lr_all=best_params['lr_all'],
    reg_all=best_params['reg_all'],
    random_state=42
)
model_svd.fit(trainset)

predictions = model_svd.test(testset)
mae_svd = accuracy.mae(predictions)
print("SVD MAE on our test:", mae_svd)

Best params SVD: {'n_factors': 50, 'lr_all': 0.007, 'reg_all': 0.1}
MAE:  1.2460
SVD MAE on our test: 1.2459537247522734


Сохраним обученную модель SVD

In [22]:
with open("svd.pkl", "wb") as f:
    pickle.dump(model_svd, f)

## Обучение линейной регрессии



Считаем для каждой книги средний рейтинг

In [13]:
book_mean_ratings = ratings.groupby("ISBN")["Book-Rating"].mean().reset_index(name="mean_rating")
books_merge = pd.merge(books, book_mean_ratings, on="ISBN", how="inner")

Год -> числовая колонка (уже приведен выше), но дальше может понадобиться масштабирование

In [14]:
year_col = books_merge["Year-Of-Publication"].fillna(0).astype(float)

Автор и Издатель переводим в ID

In [15]:
authors_unique = books_merge["Book-Author"].unique().tolist()
authors_dict = {a: i for i, a in enumerate(authors_unique)}
author_id_col = books_merge["Book-Author"].map(authors_dict).fillna(-1).astype(int)

publishers_unique = books_merge["Publisher"].unique().tolist()
publishers_dict = {p: i for i, p in enumerate(publishers_unique)}
publisher_id_col = books_merge["Publisher"].map(publishers_dict).fillna(-1).astype(int)

TF-IDF для названий (макс 1000 фич)

In [16]:
vectorizer = TfidfVectorizer(stop_words="english", max_features=1000)
title_vectors = vectorizer.fit_transform(books_merge["Book-Title"].fillna(""))

Собираем X так, чтобы было ровно 1003 колонки: [year, author_id, publisher_id] + 1000 tf-idf

In [17]:
num_data = pd.DataFrame({
    "year": year_col.values,
    "author_id": author_id_col.values,
    "publisher_id": publisher_id_col.values
})

Прогоняем через StandardScaler, чтобы нормировать только эти три числовых признака

In [18]:
scaler = StandardScaler()
num_data_scaled = scaler.fit_transform(num_data)

Далее конкатенируем с TF-IDF (shape=(N,1000)) => итого (N, 1003)

In [19]:
from scipy.sparse import hstack, csr_matrix

num_data_sparse = csr_matrix(num_data_scaled)
X = hstack([num_data_sparse, title_vectors], format='csr')  # (N, 3 + 1000) = (N, 1003)
y = books_merge["mean_rating"].values

Делим на train/test (для локальной оценки), обучаем SGDRegressor

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

linreg = SGDRegressor(
    random_state=42,
    max_iter=1000,
    tol=1e-3
)
linreg.fit(X_train, y_train)

y_pred = linreg.predict(X_test)
mae_linreg = mean_absolute_error(y_test, y_pred)
print("LinReg MAE on our test:", mae_linreg)

LinReg MAE on our test: 0.9967102016706337


Сохраняем всё

In [23]:
with open("linreg.pkl", "wb") as f:
    pickle.dump(linreg, f)

with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

with open("vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

with open("authors_dict.pkl", "wb") as f:
    pickle.dump(authors_dict, f)

with open("publishers_dict.pkl", "wb") as f:
    pickle.dump(publishers_dict, f)