## Данные

In [1]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━[0m [32m81.9/154.4 kB[0m [31m2.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357275 sha256=8656f1b14cf2f7e718e2efd7e996af69c9b47210773a0684c3f8c4ad198f4491
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df

In [2]:
!gdown 1Hliaee7Y584-7lMoGeGjGabRRubzdoYn
!gdown 1JYpk5GTzK7GWT3mtDb9fNiwaVTzk8BBy

Downloading...
From: https://drive.google.com/uc?id=1Hliaee7Y584-7lMoGeGjGabRRubzdoYn
To: /content/Ratings.csv
100% 22.6M/22.6M [00:00<00:00, 57.6MB/s]
Downloading...
From: https://drive.google.com/uc?id=1JYpk5GTzK7GWT3mtDb9fNiwaVTzk8BBy
To: /content/Books.csv
100% 73.3M/73.3M [00:00<00:00, 79.6MB/s]


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.sparse import csr_matrix, hstack
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_absolute_error
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate, train_test_split as surprise_train_test_split

import joblib

In [4]:
ratings = pd.read_csv("Ratings.csv")
books = pd.read_csv("Books.csv")

  books = pd.read_csv("Books.csv")


## Обработка данных

In [5]:
# Преобразуем тип года к числовому
books["Year-Of-Publication"] = pd.to_numeric(books["Year-Of-Publication"], errors='coerce')

# Удалим строки с нереалистичными годами
books = books[(books["Year-Of-Publication"] > 0) & (books["Year-Of-Publication"] <= 2024)]

# Удалим строки с пропусками в авторах/издателях
books = books.dropna(subset=["Book-Author", "Publisher"])

# Удалим колонки с картинками
cols_to_drop = ["Image-URL-S", "Image-URL-M", "Image-URL-L"]
for c in cols_to_drop:
    if c in books.columns:
        books.drop(columns=[c], inplace=True, errors='ignore')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  books.drop(columns=[c], inplace=True, errors='ignore')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  books.drop(columns=[c], inplace=True, errors='ignore')


In [6]:
# Проверим наличие пропусков
books.isnull().sum()


Unnamed: 0,0
ISBN,0
Book-Title,0
Book-Author,0
Year-Of-Publication,0
Publisher,0


In [7]:
# Отберем рейтинги > 0
train_ratings = ratings[ratings["Book-Rating"] > 0].copy()

# Посчитаем количество оценок на книгу
book_counts = train_ratings["ISBN"].value_counts()
valid_books = book_counts[book_counts > 1].index

# Посчитаем количество оценок на пользователя
user_counts = train_ratings["User-ID"].value_counts()
valid_users = user_counts[user_counts > 1].index

train_ratings = train_ratings[train_ratings["ISBN"].isin(valid_books)]
train_ratings = train_ratings[train_ratings["User-ID"].isin(valid_users)]

# Отфильтруем таблицу книг под те ISBN, что остались
books = books[books["ISBN"].isin(train_ratings["ISBN"].unique())]

## Обучение SVD

In [8]:
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(train_ratings[["User-ID", "ISBN", "Book-Rating"]], reader)

trainset, testset = surprise_train_test_split(data, test_size=0.2, random_state=42)

model_svd = SVD(random_state=42)
model_svd.fit(trainset)

predictions = model_svd.test(testset)

# Оценим качество по MAE
from surprise import accuracy
mae = accuracy.mae(predictions)
print("MAE SVD:", mae)

# Требование: MAE < 1.3

MAE:  1.2541
MAE SVD: 1.2541457963741103


In [9]:
joblib.dump(model_svd, "svd_model.pkl")

['svd_model.pkl']

## Обучение линейной регрессии

Для обучения предсказанию среднего рейтинга книги нужны:

- Автор
- Издатель
- Год издания
- Векторизованное название


Сначала вычислим средний рейтинг для каждой книги

In [10]:
book_mean_ratings = train_ratings.groupby("ISBN")["Book-Rating"].mean().reset_index(name="mean_rating")
books_merge = pd.merge(books, book_mean_ratings, on="ISBN", how="inner")

books_merge.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,mean_rating
0,2005018,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,7.5
1,60973129,Decision in Normandy,Carlo D'Este,1991.0,HarperPerennial,7.5
2,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999.0,Farrar Straus Giroux,7.833333
3,399135782,The Kitchen God's Wife,Amy Tan,1991.0,Putnam Pub Group,8.285714
4,1558746218,A Second Chicken Soup for the Woman's Soul (Ch...,Jack Canfield,1998.0,Health Communications,10.0


Векторизация названий

In [11]:
vectorizer = TfidfVectorizer(stop_words="english", max_features=1000)
title_vectors = vectorizer.fit_transform(books_merge["Book-Title"].fillna(""))

Преобразование авторов и издателей

In [12]:
authors = pd.get_dummies(books_merge["Book-Author"], prefix="author", sparse=True)
publishers = pd.get_dummies(books_merge["Publisher"], prefix="pub", sparse=True)

authors_sparse = authors.sparse.to_coo()
publishers_sparse = publishers.sparse.to_coo()

Признак года издания

In [13]:
years = books_merge["Year-Of-Publication"].values.reshape(-1, 1)

scaler = StandardScaler()
years_scaled = scaler.fit_transform(years)

Объединение всех фич

In [14]:
years_sparse = csr_matrix(years_scaled)
X_sparse = hstack([title_vectors, authors_sparse, publishers_sparse, years_sparse], format='csr')

y = books_merge["mean_rating"].values

Разбиение на трейн/тест

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X_sparse, y, test_size=0.2, random_state=42)

Обучение линейной модели

In [16]:
linreg = SGDRegressor(random_state=42, max_iter=1000, tol=1e-3)
linreg.fit(X_train, y_train)

y_pred = linreg.predict(X_test)
mae_linreg = mean_absolute_error(y_test, y_pred)
print("MAE LinReg:", mae_linreg)

# Требование: MAE < 1.5

MAE LinReg: 0.9945953571060524


Сохранение модели и масштабатора

In [17]:
joblib.dump(linreg, "linreg_model.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(vectorizer, "vectorizer.pkl")
joblib.dump(authors.columns, "authors_columns.pkl")
joblib.dump(publishers.columns, "publishers_columns.pkl")

['publishers_columns.pkl']