In [None]:
import math
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch.amp import GradScaler, autocast
from torch.utils.data import Dataset, DataLoader

In [None]:
# Load the three raw tables: per-user attributes, per-item attributes and the
# user-item interaction log.
user_features, item_features, events = map(
    pd.read_csv,
    ('data/user_features.csv', 'data/item_features.csv', 'data/events.csv'),
)

In [None]:

# One-hot encode gender; the first level is dropped to avoid a redundant column.
user_features = pd.get_dummies(user_features, columns=['gender'], drop_first=True)

# Age is optional in the user table: standardise it only when it is present.
if 'age' in user_features.columns:
    scaler_user_age = StandardScaler()
    user_features['age'] = scaler_user_age.fit_transform(user_features[['age']])

# Derive calendar fields from the epoch-seconds timestamp.
event_time = pd.to_datetime(events['timestamp'], unit='s')
events['datetime'] = event_time
events['hour'] = event_time.dt.hour
events['day_of_week'] = event_time.dt.dayofweek

# Chronological order within each user.
events_sorted = events.sort_values(by=['user_id', 'timestamp'])

# Dense 0..N-1 indices for users and items, in order of first appearance in the
# feature tables.
user_ids = user_features['user_id'].unique()
item_ids = item_features['item_id'].unique()

user_id_map = dict(zip(user_ids, range(len(user_ids))))
item_id_map = dict(zip(item_ids, range(len(item_ids))))

# Events whose user/item is absent from the feature tables get NaN here.
events_sorted['user_idx'] = events_sorted['user_id'].map(user_id_map)
events_sorted['item_idx'] = events_sorted['item_id'].map(item_id_map)


In [None]:

def _merge_user_stats(frame, stats, fill_value=0):
    """Left-join per-user ``stats`` onto ``frame`` using the 'user_idx' key.

    Only the columns contributed by ``stats`` are NaN-filled with ``fill_value``
    (users that have no events), so missing values in unrelated columns of
    ``frame`` are never silently overwritten. Row order of ``frame`` is kept.
    """
    added_cols = [col for col in stats.columns if col != 'user_idx']
    merged = frame.merge(stats, on='user_idx', how='left')
    if added_cols:
        merged[added_cols] = merged[added_cols].fillna(fill_value)
    return merged


# Explicit join key: does not rely on user_features having a default RangeIndex.
user_features['user_idx'] = user_features['user_id'].map(user_id_map)

# --- Interaction volume -----------------------------------------------------
user_total_interactions = events_sorted.groupby('user_idx').size().reset_index(name='user_total_interactions')
user_features = _merge_user_stats(user_features, user_total_interactions)
user_features['user_total_interactions'] = user_features['user_total_interactions'].astype(int)

# The continuous-feature list is assembled as columns are actually created, so a
# dataset without a 'rating' column no longer raises KeyError at scaling time.
user_continuous_features = ['user_total_interactions']

# --- Mean rating (optional column) ------------------------------------------
if 'rating' in events_sorted.columns:
    user_avg_rating = events_sorted.groupby('user_idx')['rating'].mean().reset_index(name='user_avg_rating')
    user_features = _merge_user_stats(user_features, user_avg_rating)
    user_continuous_features.append('user_avg_rating')

# --- Genre profile: per-user sum of the genre flags of consumed items -------
genre_columns = [col for col in item_features.columns if 'genre_' in col]
user_genres = pd.merge(events_sorted, item_features[['item_id'] + genre_columns], on='item_id', how='left')
user_genres = user_genres.groupby('user_idx')[genre_columns].sum().reset_index()
user_features = _merge_user_stats(user_features, user_genres)

# --- Recency: days between a user's last event and the newest event overall -
last_interaction = events_sorted.groupby('user_idx')['datetime'].max().reset_index(name='last_interaction_time')
current_time = events_sorted['datetime'].max()
last_interaction['days_since_last_interaction'] = (current_time - last_interaction['last_interaction_time']).dt.days
user_features = _merge_user_stats(user_features, last_interaction[['user_idx', 'days_since_last_interaction']])
user_continuous_features.append('days_since_last_interaction')

# --- Frequency: mean number of events per active week -----------------------
# Weeks are keyed by (ISO year, ISO week); the week number alone would merge
# e.g. week 5 of two different years into a single bucket.
iso_calendar = events_sorted['datetime'].dt.isocalendar()
events_sorted['iso_year'] = iso_calendar['year']
events_sorted['week'] = iso_calendar['week']
user_weekly_interactions = (
    events_sorted.groupby(['user_idx', 'iso_year', 'week']).size().reset_index(name='weekly_interactions')
)
user_freq = user_weekly_interactions.groupby('user_idx')['weekly_interactions'].mean().reset_index(name='avg_weekly_interactions')
user_features = _merge_user_stats(user_features, user_freq)
user_continuous_features.append('avg_weekly_interactions')

scaler_user = StandardScaler()
user_features[user_continuous_features] = scaler_user.fit_transform(user_features[user_continuous_features])

# --- Tenure: days since the user's first event ------------------------------
# Users without events get current_time, i.e. a tenure of 0 days.
first_interaction = events_sorted.groupby('user_idx')['datetime'].min().reset_index(name='first_interaction_time')
user_features = _merge_user_stats(user_features, first_interaction, fill_value=current_time)
user_features['days_since_first_interaction'] = (current_time - user_features['first_interaction_time']).dt.days
user_features = user_features.drop(['first_interaction_time'], axis=1)

# --- Mean gap (whole days) between consecutive events of the same user ------
events_sorted_user = events_sorted.sort_values(['user_idx', 'datetime'])
events_sorted_user['prev_datetime'] = events_sorted_user.groupby('user_idx')['datetime'].shift(1)
events_sorted_user['time_diff'] = (events_sorted_user['datetime'] - events_sorted_user['prev_datetime']).dt.days
user_time_diff = events_sorted_user.groupby('user_idx')['time_diff'].mean().reset_index(name='avg_time_between_interactions')
user_features = _merge_user_stats(user_features, user_time_diff)

# A dedicated scaler is used here: re-fitting scaler_user on these columns would
# discard the statistics it learned for user_continuous_features.
additional_user_continuous_features = ['days_since_first_interaction', 'avg_time_between_interactions']
scaler_user_additional = StandardScaler()
user_features[additional_user_continuous_features] = scaler_user_additional.fit_transform(
    user_features[additional_user_continuous_features]
)


In [None]:

def _merge_item_stats(frame, stats, fill_value=0):
    """Left-join per-item ``stats`` onto ``frame`` using the 'item_idx' key.

    Only the columns contributed by ``stats`` are NaN-filled with ``fill_value``
    (items that have no events); other columns of ``frame`` are left untouched.
    Row order of ``frame`` is kept.
    """
    added_cols = [col for col in stats.columns if col != 'item_idx']
    merged = frame.merge(stats, on='item_idx', how='left')
    if added_cols:
        merged[added_cols] = merged[added_cols].fillna(fill_value)
    return merged


# Explicit join key: does not rely on item_features having a default RangeIndex.
item_features['item_idx'] = item_features['item_id'].map(item_id_map)

# --- Popularity: number of events per item ----------------------------------
item_total_interactions = events_sorted.groupby('item_idx').size().reset_index(name='item_total_interactions')
item_features = _merge_item_stats(item_features, item_total_interactions)
item_features['item_total_interactions'] = item_features['item_total_interactions'].astype(int)

# Built incrementally so that a dataset without 'rating' does not hit a KeyError
# when the continuous columns are scaled.
item_continuous_features = ['item_total_interactions']

# --- Mean rating (optional column) ------------------------------------------
if 'rating' in events_sorted.columns:
    item_avg_rating = events_sorted.groupby('item_idx')['rating'].mean().reset_index(name='item_avg_rating')
    item_features = _merge_item_stats(item_features, item_avg_rating)
    item_continuous_features.append('item_avg_rating')

# --- Number of genre flags set on the item ----------------------------------
item_features['genre_count'] = item_features[genre_columns].sum(axis=1)
item_continuous_features.append('genre_count')

# --- is_recent: 1 when the item has an event in the last 30 days of the log -
recent_threshold = current_time - pd.Timedelta(days=30)
recent_mask = events_sorted['datetime'] >= recent_threshold
recent_items = events_sorted.loc[recent_mask, ['item_idx']].dropna().drop_duplicates()
recent_items['is_recent'] = 1
item_features = _merge_item_stats(item_features, recent_items[['item_idx', 'is_recent']])
item_features['is_recent'] = item_features['is_recent'].astype(int)

scaler_item = StandardScaler()
item_features[item_continuous_features] = scaler_item.fit_transform(item_features[item_continuous_features])


In [None]:
# The "training" slice is the first 80% of rows of events_sorted (which is ordered
# by user_id, then timestamp), not a chronological split.
n_train_events = int(0.8 * len(events_sorted))
train_events = events_sorted.iloc[:n_train_events]

# NOTE(review): both forests below are fitted against a constant all-ones target,
# so the importances presumably carry no signal and the median threshold may keep
# every column -- confirm, and replace with a real label if selection is intended.

# ---- user-side feature selection ----
train_user_ids = train_events['user_id'].unique()
train_user_indices = [user_id_map[uid] for uid in train_user_ids if uid in user_id_map]

user_selection_columns = user_continuous_features + genre_columns + additional_user_continuous_features
X_train_users = user_features.iloc[train_user_indices][user_selection_columns].values
y_train_users = np.ones(X_train_users.shape[0])

rf_user = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_user.fit(X_train_users, y_train_users)

selector_user = SelectFromModel(rf_user, prefit=True, threshold='median')
X_train_users_selected = selector_user.transform(X_train_users)

# Apply the fitted selector to every user, not only the training slice.
user_features_selected = selector_user.transform(user_features[user_selection_columns].values)

# ---- item-side feature selection ----
train_item_ids = train_events['item_id'].unique()
train_item_indices = [item_id_map[iid] for iid in train_item_ids if iid in item_id_map]

X_train_items = item_features.iloc[train_item_indices][item_continuous_features].values
y_train_items = np.ones(X_train_items.shape[0])

rf_item = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_item.fit(X_train_items, y_train_items)

selector_item = SelectFromModel(rf_item, prefit=True, threshold='median')
X_train_items_selected = selector_item.transform(X_train_items)

# Apply the fitted selector to every item.
item_features_selected = selector_item.transform(item_features[item_continuous_features].values)



In [None]:
# Standardise the selected feature matrices (fit on all rows), then expose them as
# float32 tensors that can be indexed by user_idx / item_idx.
scaler_user_selected, scaler_item_selected = StandardScaler(), StandardScaler()

user_features_selected = scaler_user_selected.fit(user_features_selected).transform(user_features_selected)
item_features_selected = scaler_item_selected.fit(item_features_selected).transform(item_features_selected)

user_features_tensor = torch.tensor(user_features_selected, dtype=torch.float)
item_features_tensor = torch.tensor(item_features_selected, dtype=torch.float)

In [None]:
# Per-user item history; events_sorted is already ordered by timestamp within a user.
user_sequences = events_sorted.groupby('user_id')['item_id'].apply(list).to_dict()

# Encode raw item ids as 0..N-1; tokens are shifted by +1 below so 0 can act as PAD.
item_encoder = LabelEncoder()
all_item_ids = events_sorted['item_id'].unique()
item_encoder.fit(all_item_ids)
num_items = len(item_encoder.classes_) + 1  # +1 for the PAD token

all_X = []
all_y = []
all_user_ids = []

# Every proper prefix of a user's history becomes one sample whose target is the
# item that follows it; users with fewer than two events contribute nothing.
for user_id, seq in user_sequences.items():
    encoded_seq = item_encoder.transform(seq) + 1  # +1 keeps 0 free for PAD
    n_events = len(encoded_seq)
    if n_events >= 2:
        cut_points = range(1, n_events)
        all_X.extend(encoded_seq[:cut] for cut in cut_points)
        all_y.extend(encoded_seq[cut] for cut in cut_points)
        all_user_ids.extend([user_id] * (n_events - 1))


In [None]:
# Length of the longest prefix sample.
max_sequence_length = max(map(len, all_X))

# Right-pad every prefix with the PAD token (0) up to the longest one.
sequence_tensors = [torch.tensor(seq, dtype=torch.long) for seq in all_X]
padded_X = nn.utils.rnn.pad_sequence(sequence_tensors, batch_first=True, padding_value=0)

X = padded_X
y = torch.tensor(all_y, dtype=torch.long)

# One user id per sample, aligned with the rows of X / y.
user_ids_list = all_user_ids

In [None]:
# Split by user so that all samples of a user land on the same side; only the
# first of the five folds is used.
groups = np.asarray(user_ids_list)
gkf = GroupKFold(n_splits=5)
train_idx, val_idx = next(gkf.split(X, y, groups))

X_train, y_train = X[train_idx], y[train_idx]
X_val, y_val = X[val_idx], y[val_idx]

# Per-sample user ids, aligned with the rows of X_train / X_val.
train_user_ids = [user_ids_list[pos] for pos in train_idx]
val_user_ids = [user_ids_list[pos] for pos in val_idx]

In [None]:
class InteractionDataset(Dataset):
    """Next-item samples enriched with user and item side features.

    Each sample is ``(sequence, target, user_feature, item_feature)``:

    * ``sequence`` -- row of ``X``: item tokens (encoder class index + 1),
      right-padded with 0.
    * ``target`` -- element of ``y``: token of the item to predict.
    * ``user_feature`` -- row of ``user_features`` for the sample's user, or zeros
      when the user is unknown.
    * ``item_feature`` -- row of ``item_features`` for the most recent item in
      ``sequence`` (the last non-PAD token), or zeros when it cannot be resolved.

    The item features are deliberately taken from the history and not from the
    target: feeding the target item's own features to the model would leak the
    label into the input and make validation metrics meaningless.

    Args:
        X: LongTensor of padded token sequences, one row per sample.
        y: LongTensor of target tokens, aligned with ``X``.
        user_ids: sequence of raw user ids, aligned with ``X``.
        user_features: 2-D float tensor indexed by ``user_id_map`` values.
        item_features: 2-D float tensor indexed by ``item_id_map`` values.
        user_id_map: dict raw user id -> row in ``user_features``.
        item_id_map: dict raw item id -> row in ``item_features``.
        item_encoder: fitted encoder; only its ``classes_`` attribute is read.
    """

    def __init__(self, X, y, user_ids, user_features, item_features, user_id_map, item_id_map, item_encoder):
        self.X = X
        self.y = y
        self.user_ids = user_ids
        self.user_features = user_features
        self.item_features = item_features
        self.user_id_map = user_id_map
        self.item_id_map = item_id_map
        self.item_encoder = item_encoder

        # Token -> item feature row, resolved once here instead of calling
        # inverse_transform for every sample. Slot 0 is PAD; -1 marks "no features".
        n_item_rows = item_features.size(0)
        self._token_to_item_row = [-1] * (len(item_encoder.classes_) + 1)
        for class_index, raw_item_id in enumerate(item_encoder.classes_):
            row = item_id_map.get(raw_item_id, -1)
            if 0 <= row < n_item_rows:
                self._token_to_item_row[class_index + 1] = row

    def __len__(self):
        """Number of samples (rows of ``X``)."""
        return self.X.size(0)

    def __getitem__(self, idx):
        """Return ``(sequence, target, user_feature, item_feature)`` for sample ``idx``."""
        sequence = self.X[idx]
        target = self.y[idx]

        # User side: unknown or out-of-range users fall back to a zero vector.
        user_row = self.user_id_map.get(self.user_ids[idx], -1)
        if 0 <= user_row < self.user_features.size(0):
            user_feature = self.user_features[user_row]
        else:
            user_feature = torch.zeros(self.user_features.size(1))

        # Item side: features of the last real (non-PAD) item in the history.
        item_row = -1
        history = sequence[sequence != 0]
        if history.numel() > 0:
            last_token = int(history[-1])
            if 0 < last_token < len(self._token_to_item_row):
                item_row = self._token_to_item_row[last_token]

        if item_row >= 0:
            item_feature = self.item_features[item_row]
        else:
            item_feature = torch.zeros(self.item_features.size(1))

        return sequence, target, user_feature, item_feature


In [None]:
# Both splits share the same feature tensors, id maps and encoder; only the
# samples and their per-sample user ids differ.
shared_dataset_args = (
    user_features_tensor,
    item_features_tensor,
    user_id_map,
    item_id_map,
    item_encoder,
)

train_dataset = InteractionDataset(X_train, y_train, train_user_ids, *shared_dataset_args)
val_dataset = InteractionDataset(X_val, y_val, val_user_ids, *shared_dataset_args)


In [None]:
# Mini-batch loaders: training batches are reshuffled each epoch, validation
# batches keep a fixed order.
batch_size = 256
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)


In [None]:
class Seq2SeqRecommender(nn.Module):
    """LSTM next-item recommender that also consumes user and item side features.

    The interaction history is embedded and run through an LSTM; the hidden state
    at the last real (non-padding) position summarises the sequence. That summary
    is added to a linear projection of the user features, concatenated with a
    linear projection of the item features, and mapped to one logit per item.

    Args:
        num_items: vocabulary size (number of item tokens including PAD); also the
            number of output logits.
        embedding_dim: size of the item embeddings.
        hidden_dim: LSTM hidden size and width of the feature projections.
        user_feature_dim: number of user side features.
        item_feature_dim: number of item side features.
        padding_idx: token used for right-padding, or None if sequences are unpadded.
    """

    def __init__(self, num_items, embedding_dim, hidden_dim, user_feature_dim, item_feature_dim, padding_idx=0):
        super().__init__()
        self.embedding = nn.Embedding(num_items, embedding_dim, padding_idx=padding_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)

        self.user_fc = nn.Linear(user_feature_dim, hidden_dim)
        self.item_fc = nn.Linear(item_feature_dim, hidden_dim)

        # Parameter shapes are unchanged: [sequence + user context, item context].
        self.fc = nn.Linear(hidden_dim * 2, num_items)
        self.dropout = nn.Dropout(0.5)

    def forward(self, sequences, user_features, item_features):
        """Compute raw (un-normalised) logits of shape ``[batch, num_items]``.

        Args:
            sequences: LongTensor ``[batch, seq_len]`` of item tokens, right-padded.
            user_features: FloatTensor ``[batch, user_feature_dim]``.
            item_features: FloatTensor ``[batch, item_feature_dim]``.
        """
        embedded = self.embedding(sequences)
        lstm_out, _ = self.lstm(embedded)

        # Read the LSTM output at the last real token of every row. With right
        # padding, h_n would be the state after the PAD steps; because the LSTM is
        # unidirectional, the output at the last real step is unaffected by them.
        pad_token = self.embedding.padding_idx
        if pad_token is None:
            last_step = torch.full(
                (sequences.size(0),), sequences.size(1) - 1,
                dtype=torch.long, device=sequences.device,
            )
        else:
            # Rows that are entirely padding fall back to position 0.
            last_step = ((sequences != pad_token).sum(dim=1) - 1).clamp(min=0)
        batch_index = torch.arange(sequences.size(0), device=sequences.device)
        seq_emb = lstm_out[batch_index, last_step]

        # Previously the sequence state was overwritten by the user projection and
        # never reached the classifier; now both contribute to the user context.
        user_emb = seq_emb + self.user_fc(user_features)
        item_emb = self.item_fc(item_features)

        combined = torch.cat([user_emb, item_emb], dim=1)
        combined = self.dropout(combined)
        logits = self.fc(combined)

        return logits  # no activation; CrossEntropyLoss expects raw logits

In [None]:

# Model hyper-parameters; the side-feature widths are read off the feature tensors.
embedding_dim, hidden_dim = 100, 128
user_feature_dim = user_features_tensor.size(1)
item_feature_dim = item_features_tensor.size(1)

model = Seq2SeqRecommender(
    num_items,
    embedding_dim,
    hidden_dim,
    user_feature_dim,
    item_feature_dim,
    padding_idx=0,
)


In [None]:
class EarlyStopping:
    """Stop training when the validation loss has not improved for ``patience`` calls.

    Call the instance once per epoch with the validation loss and the model. On
    every improvement the model's ``state_dict`` is saved to ``path`` and the
    counter is reset; otherwise the counter is incremented and ``early_stop`` is
    set to True once it reaches ``patience``.

    Args:
        patience: number of consecutive non-improving calls tolerated.
        verbose: print a message on every counter increment and checkpoint save.
        delta: minimum increase of the score (negative loss) that counts as an
            improvement.
        path: file the best ``state_dict`` is written to.
    """

    def __init__(self, patience=5, verbose=False, delta=0.0, path='checkpoint.pt'):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # np.inf instead of the np.Inf alias, which was removed in NumPy 2.0.
        self.val_loss_min = np.inf
        self.delta = delta
        self.path = path

    def __call__(self, val_loss, model):
        """Record ``val_loss`` for this epoch and update the stopping state."""
        score = -val_loss  # higher is better

        if self.best_score is None:
            # First call: always counts as the best result so far.
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            if self.verbose:
                print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        """Save the model's state_dict to ``path`` and remember ``val_loss`` as the best loss."""
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

early_stopping = EarlyStopping(patience=5, verbose=True)

In [None]:
# Run on the GPU when one is visible to PyTorch, otherwise on the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device('cuda') if use_cuda else torch.device('cpu')
model.to(device)

In [None]:
# Loss over raw logits and the optimiser (Adam with a small L2 penalty).
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

# Mixed-precision loss scaling is only meaningful on CUDA; on CPU the scaler is
# created disabled so scale()/step()/update() become plain pass-throughs.
# GradScaler comes from the notebook's import cell (torch.amp).
use_amp = device.type == 'cuda'
scaler_amp = GradScaler(device.type, enabled=use_amp)

# Training length and per-epoch metric history.
num_epochs = 3
train_losses = []
val_losses = []
val_accuracies = []

In [None]:
# Mixed precision is enabled only when training on CUDA; on CPU autocast and the
# scaler are disabled and the loop runs in full precision.
use_amp = device.type == 'cuda'
scaler_amp = GradScaler(device.type, enabled=use_amp)

for epoch in range(num_epochs):
    # ------------------------------ training ------------------------------
    model.train()
    running_loss = 0.0
    for sequences, targets, user_feats, item_feats in train_loader:
        sequences = sequences.to(device)
        targets = targets.to(device) - 1  # tokens are 1-based (0 is PAD); classes are 0-based
        user_feats = user_feats.to(device)
        item_feats = item_feats.to(device)

        optimizer.zero_grad()

        # The forward/backward/step below must run for EVERY batch. It used to sit
        # outside this loop, so only the last batch of each epoch was trained on.
        with autocast(device_type=device.type, enabled=use_amp):
            logits = model(sequences, user_feats, item_feats)
            loss = criterion(logits, targets)

        scaler_amp.scale(loss).backward()
        scaler_amp.step(optimizer)
        scaler_amp.update()

        running_loss += loss.item() * sequences.size(0)

    epoch_loss = running_loss / len(train_loader.dataset)
    train_losses.append(epoch_loss)

    # ----------------------------- validation -----------------------------
    model.eval()
    val_running_loss = 0.0
    correct = 0
    total = 0
    all_targets = []
    all_preds = []
    all_probs = []
    with torch.no_grad():
        for sequences, targets, user_feats, item_feats in val_loader:
            sequences = sequences.to(device)
            targets = targets.to(device) - 1
            user_feats = user_feats.to(device)
            item_feats = item_feats.to(device)

            # torch.amp.autocast requires an explicit device_type.
            with autocast(device_type=device.type, enabled=use_amp):
                logits = model(sequences, user_feats, item_feats)  # [batch_size, num_items]
                loss = criterion(logits, targets)

            val_running_loss += loss.item() * sequences.size(0)

            # Softmax in float32: under autocast the logits may be half precision.
            probs = torch.softmax(logits.float(), dim=1)  # [batch_size, num_items]
            preds = torch.argmax(probs, dim=1)            # [batch_size]
            correct += (preds == targets).sum().item()
            total += targets.size(0)

            all_targets.extend(targets.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())
            all_probs.extend(probs.cpu().numpy())

    val_epoch_loss = val_running_loss / len(val_loader.dataset)
    val_losses.append(val_epoch_loss)
    val_accuracy = correct / total
    val_accuracies.append(val_accuracy)

    precision = precision_score(all_targets, all_preds, average='weighted', zero_division=0)
    recall = recall_score(all_targets, all_preds, average='weighted', zero_division=0)
    f1 = f1_score(all_targets, all_preds, average='weighted', zero_division=0)
    try:
        # NOTE(review): get_dummies only yields columns for classes present in the
        # validation targets, while all_probs has one column per logit; this likely
        # raises ValueError and falls back to 0.0 -- confirm before trusting ROC-AUC.
        auc = roc_auc_score(pd.get_dummies(all_targets), all_probs, average='weighted', multi_class='ovr')
    except ValueError:
        auc = 0.0  # AUC could not be computed

    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {epoch_loss:.6f}, Val Loss: {val_epoch_loss:.6f}, Val Acc: {val_accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}, ROC-AUC: {auc:.4f}')

    # Early-stopping check (also checkpoints the model on improvement)
    early_stopping(val_epoch_loss, model)

    if early_stopping.early_stop:
        print("Раннее остановка активирована.")
        break


In [None]:
# Restore the best weights saved by early stopping. map_location lets a checkpoint
# written on a GPU be loaded in a CPU-only session; weights_only restricts
# unpickling to tensors and plain containers, which is all a state_dict holds.
model.load_state_dict(torch.load(early_stopping.path, map_location=device, weights_only=True))

In [None]:
# Sanity check for the group split: no user should appear on both sides.
train_user_set, val_user_set = set(train_user_ids), set(val_user_ids)
overlap = train_user_set & val_user_set
print(f'Количество перекрывающихся пользователей: {len(overlap)}')

In [None]:
# train_idx / val_idx index the prefix samples in X, not the rows of events_sorted,
# so they cannot be used with events_sorted.iloc. The items of each split are the
# items found in the events of that split's users.
train_event_mask = events_sorted['user_id'].isin(set(train_user_ids))
val_event_mask = events_sorted['user_id'].isin(set(val_user_ids))

train_item_set = set(events_sorted.loc[train_event_mask, 'item_id'].unique())
val_item_set = set(events_sorted.loc[val_event_mask, 'item_id'].unique())
overlap_items = train_item_set.intersection(val_item_set)
print(f'Количество перекрывающихся элементов: {len(overlap_items)}')