In [None]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import MiniBatchKMeans

import ast
import re

from scipy import sparse

from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split

In [11]:
ratings = pd.read_csv('datasets/ratings_small.csv', low_memory=False)
movies_metadata = pd.read_csv('datasets/movies_metadata.csv', low_memory=False)
links = pd.read_csv('datasets/links_small.csv', low_memory=False)

In [12]:
movies_metadata['id'] = pd.to_numeric(movies_metadata['id'], errors='coerce')
movies_metadata = movies_metadata.dropna(subset=['id'])
movies_metadata['id'] = movies_metadata['id'].astype(int)

In [13]:
# Удаляем повторяющиеся строки
movies_metadata = movies_metadata.drop_duplicates()
links = links.drop_duplicates()
ratings = ratings.drop_duplicates()

In [None]:
links = links.dropna(subset=['tmdbId'])
links['tmdbId'] = links['tmdbId'].astype(int)

In [16]:
ratings = ratings[['userId', 'movieId', 'rating']].merge(links[['tmdbId', 'movieId']], on='movieId', how='left')
ratings = ratings.dropna(subset=['tmdbId'])

In [17]:
ratings['tmdbId'] = ratings['tmdbId'].astype(int)
ratings

Unnamed: 0,userId,movieId,rating,tmdbId
0,1,31,2.5,9909
1,1,1029,3.0,11360
2,1,1061,3.0,819
3,1,1129,2.0,1103
4,1,1172,4.0,11216
...,...,...,...,...
99999,671,6268,2.5,25461
100000,671,6269,4.0,51927
100001,671,6365,4.0,604
100002,671,6385,2.5,1088


In [18]:
# Создадим эмбеддинги для фильмов (первая попытка - взять более или менее информативн.фичи по максимуму)
movies_metadata['popularity'] = pd.to_numeric(movies_metadata['popularity'], errors='coerce')
movies_metadata['release_date'] = pd.to_datetime(movies_metadata['release_date'], errors='coerce')
# Вряд ли кого-то интересует прям конкретная дата выхода, оставим год
movies_metadata['release_year'] = movies_metadata['release_date'].dt.year

In [None]:
cols_to_scale = ['vote_average', 'popularity', 'runtime', 'release_year']

for col in cols_to_scale:
    median_val = movies_metadata[col].median()
    movies_metadata[col] = movies_metadata[col].fillna(median_val)

scaler = StandardScaler()

In [None]:
movies_metadata[cols_to_scale] = scaler.fit_transform(movies_metadata[cols_to_scale])

In [19]:
# Текстовые фичи тоже добавляем в эмбеддинги (вообще, они тут самые ценные: жанры, описание фильма, лозунг, + возможно название)
movies_metadata['genres'] = movies_metadata['genres'].apply(ast.literal_eval)

genres_count = (
    movies_metadata['genres']
    .explode()
    .dropna()
    .apply(lambda x: x.get('name') if isinstance(x, dict) else None)
    .dropna()
    .value_counts()
)

genres = genres_count[genres_count > 1].index.to_list()

genres_sets = movies_metadata['genres'].apply(
    lambda genres: {genre.get('name') for genre in genres
                    if isinstance(genre, dict) and genre.get('name')
                }
)

for g in genres:
    movies_metadata[f'genre_{g}'] = genres_sets.apply(lambda genre: int(g in genre))

movies_metadata['genres'] = genres_sets

In [None]:
stemmer = SnowballStemmer('english')
stop_words = set(stopwords.words('english'))

In [20]:
def clean_text(text: str) -> str:
    if not isinstance(text, str):
        return ''
    
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    words = text.split()
    cleaned_words = [stemmer.stem(w) for w in words if w not in stop_words]

    return ' '.join(cleaned_words)

In [None]:
movies_metadata['overview'] = movies_metadata['overview'].fillna('')
movies_metadata['tagline'] = movies_metadata['tagline'].fillna('')
movies_metadata['title'] = movies_metadata['title'].fillna('')

In [None]:
movies_metadata['overview_clean'] = movies_metadata['overview'].apply(clean_text)
movies_metadata['tagline_clean'] = movies_metadata['tagline'].apply(clean_text)
movies_metadata['title_clean'] = movies_metadata['title'].apply(clean_text)

In [None]:
movies_metadata['soup'] = (
    movies_metadata['overview_clean'] + ' ' + 
    movies_metadata['tagline_clean'] + ' ' +
    movies_metadata['title_clean']
)

In [22]:
tfidf = TfidfVectorizer(
    max_features=1000,
    strip_accents='unicode',
    ngram_range=(1, 2)
)
movies_metadata = movies_metadata.reset_index(drop=True)

In [None]:
tfidf_matrix = tfidf.fit_transform(movies_metadata['soup']).toarray()

In [23]:
genres_cols = [f'genre_{genre}' for genre in genres]
cols_to_add = genres_cols + cols_to_scale

In [None]:
additional_features = movies_metadata[cols_to_add].values
movies_matrix = np.hstack([tfidf_matrix, additional_features])

In [None]:
# Также рассчитываем эмбеддинги для пользователей

In [None]:
tmdbId_to_idx = dict(zip(movies_metadata['id'], movies_metadata.index))

In [None]:
# Проверяем, чтобы фильм был в основной таблице
ratings_filtered = ratings[ratings['tmdbId'].isin(tmdbId_to_idx.keys())].copy()

In [24]:
# Работаем по принципу взвешенного среднего, для этого находим всех уникальных пользователей
unique_users = sorted(ratings_filtered['userId'].unique())
userId_to_idx = {uid: i for i, uid in enumerate(unique_users)}
num_users = len(unique_users)

In [None]:
# Потом находим все его оценки
ratings_filtered['user_idx'] = ratings_filtered['userId'].map(userId_to_idx)
ratings_filtered['movie_idx'] = ratings_filtered['tmdbId'].map(tmdbId_to_idx)
ratings_clean = ratings_filtered.dropna(subset=['user_idx', 'movie_idx'])
ratings_clean['user_idx'] = ratings_clean['user_idx'].astype(int)
ratings_clean['movie_idx'] = ratings_clean['movie_idx'].astype(int)

In [25]:
weight_matrix = sparse.csr_matrix(
    (ratings_clean['rating'].values,
    (ratings_clean['user_idx'].values, ratings_clean['movie_idx'].values)),
    shape=(num_users, movies_matrix.shape[0])
)

row_sums = np.array(weight_matrix.sum(axis=1)).flatten()
row_sums[row_sums == 0] = 1
norm_weights = sparse.diags(1.0 / row_sums) @ weight_matrix

user_matrix = norm_weights @ movies_matrix

In [27]:
class MovieRatingDataset(Dataset):
    def __init__(self, ratings_df, user_matrix, movies_matrix,
                 userId_to_idx, tmdbId_to_idx):
        self.user_vecs = []
        self.movie_vecs = []
        self.targets = []

        for _, row in ratings_df.iterrows():
            uid = userId_to_idx.get(row['userId'])
            mid = tmdbId_to_idx.get(int(row['tmdbId']))

            if uid is not None and mid is not None:
                self.user_vecs.append(user_matrix[uid])
                self.movie_vecs.append(movies_matrix[mid])
                self.targets.append(row['rating'])

        self.user_vecs = torch.tensor(np.array(self.user_vecs), dtype=torch.float32)
        self.movie_vecs = torch.tensor(np.array(self.movie_vecs), dtype=torch.float32)
        self.targets = torch.tensor(np.array(self.targets), dtype=torch.float32)

    def __len__(self):
        return len(self.targets)
    
    def __getitem__(self, index):
        return self.user_vecs[index], self.movie_vecs[index], self.targets[index]

In [42]:
class RecommenderNet(nn.Module):
    def __init__(self, user_dim, movie_dim):
        super().__init__()
        input_dim = user_dim + movie_dim

        self.network = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Dropout(0.3),

            nn.Linear(512, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Dropout(0.2),

            nn.Linear(256, 128),
            nn.ReLU(),

            nn.Linear(128, 1),
            nn.Sigmoid()
        )

        self.min_rating = 0.5
        self.max_rating = 5.0

    def forward(self, user_emb, movie_emb):
        x = torch.cat([user_emb, movie_emb], dim=-1)
        out = self.network(x)
        out = self.min_rating + out * (self.max_rating - self.min_rating)
        return out

In [58]:
train_df, test_val_df = train_test_split(ratings_filtered, test_size=0.3, random_state=42)

test_df, val_df = train_test_split(test_val_df, test_size=0.5, random_state=42)

train_dataset = MovieRatingDataset(
    train_df, user_matrix, movies_matrix, userId_to_idx, tmdbId_to_idx
)
val_dataset = MovieRatingDataset(
    val_df, user_matrix, movies_matrix, userId_to_idx, tmdbId_to_idx
)
test_dataset = MovieRatingDataset(
    test_df, user_matrix, movies_matrix, userId_to_idx, tmdbId_to_idx
)

In [None]:
train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=1024, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=1024, shuffle=False)

In [None]:
import torch.optim as optim

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

user_dim = user_matrix.shape[1]
movie_dim = movies_matrix.shape[1]

model = RecommenderNet(user_dim, movie_dim).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, factor=0.5)

In [None]:
num_epochs = 10

In [59]:
for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    train_correct = 0
    train_total = 0

    for user_emb, movie_emb, target in train_loader:
        user_emb = user_emb.to(device)
        movie_emb = movie_emb.to(device)
        target = target.to(device)

        pred = model(user_emb, movie_emb).squeeze()
        loss = criterion(pred, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item() * len(target)

    train_loss /= len(train_dataset)
    train_rmse = train_loss ** 0.5

    model.eval()
    val_loss = 0
    val_correct = 0
    val_total = 0

    with torch.no_grad():
        for user_emb, movie_emb, target in val_loader:
            user_emb = user_emb.to(device)
            movie_emb = movie_emb.to(device)
            target = target.to(device)

            pred = model(user_emb, movie_emb).squeeze()
            loss = criterion(pred, target)

            val_loss += loss.item() * len(target)
            val_correct += ((pred - target).abs() <= 0.5).sum().item()
            val_total += len(target)

    val_loss /= len(val_dataset)
    val_rmse = val_loss ** 0.5

    scheduler.step(val_loss)

    print(f"Epoch {epoch+1:>2}/{num_epochs} | "
          f"Train RMSE: {train_rmse:.4f} | "
          f"Val RMSE: {val_rmse:.4f}")

Epoch  1/10 | Train RMSE: 0.9860 | Val RMSE: 0.9324
Epoch  2/10 | Train RMSE: 0.9185 | Val RMSE: 0.9082
Epoch  3/10 | Train RMSE: 0.8929 | Val RMSE: 0.9085
Epoch  4/10 | Train RMSE: 0.8688 | Val RMSE: 0.8942
Epoch  5/10 | Train RMSE: 0.8488 | Val RMSE: 0.8890
Epoch  6/10 | Train RMSE: 0.8301 | Val RMSE: 0.8915
Epoch  7/10 | Train RMSE: 0.8170 | Val RMSE: 0.8863
Epoch  8/10 | Train RMSE: 0.8054 | Val RMSE: 0.8910
Epoch  9/10 | Train RMSE: 0.7916 | Val RMSE: 0.8955
Epoch 10/10 | Train RMSE: 0.7837 | Val RMSE: 0.8846


In [62]:
model.eval()

test_loss = 0
test_correct = 0
test_total = 0
all_preds = []
all_targets = []

with torch.no_grad():
    for user_emb, movie_emb, target in test_loader:
        user_emb = user_emb.to(device)
        movie_emb = movie_emb.to(device)
        target = target.to(device)

        pred = model(user_emb, movie_emb).squeeze()
        loss = criterion(pred, target)

        test_loss += loss.item() * len(target)
        test_correct += ((pred - target).abs() <= 0.5).sum().item()
        test_total += len(target)

        all_preds.extend(pred.cpu().numpy())
        all_targets.extend(target.cpu().numpy())

test_loss /= len(test_dataset)
test_rmse = test_loss ** 0.5

all_preds = np.array(all_preds)
all_targets = np.array(all_targets)
test_mae = np.abs(all_preds - all_targets).mean()

print(f"Test RMSE: {test_rmse:.4f}")
print(f"Test MAE:  {test_mae:.4f}")
print(f"Примеров:  {test_total:,}")


Test RMSE: 0.8786
Test MAE:  0.6752
Примеров:  14,971


Надо попробовать убрать некоторые фичи, возможно, некоторые из них мешают. По моим предположениям: основные фичи для обучения данной модельки - описание фильма (overview), лозунг, жанр и, возможно, название. Возможно выручка, бюджет и популярность - так себе показатели для выбора фильма пользователем. Возможно, это повысит точность.