In [None]:
!pip install implicit

In [None]:
import os
import pickle
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
from sklearn.preprocessing import LabelEncoder, StandardScaler
from catboost import CatBoostRanker, Pool
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import BertConfig, BertForMaskedLM, AdamW

In [None]:
# Constants
K = 10  # Number of recommendations per user
# DATA_PATH = '.'  # Current directory
DATA_PATH = "/kaggle/working/"
INPUT_PATH = "/kaggle/input/sber-recsys-hack"

# Directory for the saved models; created up front if it does not exist yet
MODEL_PATH = os.path.join(DATA_PATH, 'models')
os.makedirs(MODEL_PATH, exist_ok=True)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Candidate generation with ALS
def generate_candidates_als(train, user_encoder, item_encoder):
    """Fit an ALS matrix-factorisation model on the encoded interactions.

    Args:
        train: DataFrame with 'user_id_enc', 'item_id_enc' and 'rating'
            columns; the encoded ids are used directly as row / column indices.
        user_encoder: Not used by this function; kept so existing callers
            keep working.
        item_encoder: Not used by this function; kept so existing callers
            keep working.

    Returns:
        The fitted AlternatingLeastSquares model.
    """
    # Rows are users, columns are items. Duplicate (user, item) pairs are
    # summed by the sparse constructor.
    user_item_matrix = csr_matrix(
        (train['rating'], (train['user_id_enc'], train['item_id_enc']))
    )
    als_model = AlternatingLeastSquares(
        factors=64,
        regularization=0.1,
        iterations=15,
    )
    # implicit>=0.5 expects the users x items matrix in CSR format here.
    # Passing the transpose (as older releases required) silently swaps the
    # roles of users and items, so recommend() would index the wrong factors.
    als_model.fit(user_item_matrix)
    return als_model

In [None]:
# Dataset for BERT4Rec
class BERT4RecDataset(Dataset):
    """Masked-item dataset for BERT4Rec-style training.

    Token layout: 0 is the padding token and ``item_num + 1`` is the mask
    token, giving a vocabulary of ``item_num + 2`` tokens. That leaves ids
    1..item_num for real items; a sequence element equal to 0 is treated as
    padding and is never masked.

    Args:
        sequences: Indexable collection of per-user item-token sequences.
        item_num: Number of distinct items.
        max_seq_length: Sequences are truncated to their last
            ``max_seq_length`` tokens and right-padded to that length.
        mask_prob: Probability of masking each non-padding position.
    """

    def __init__(self, sequences, item_num, max_seq_length=50, mask_prob=0.15):
        self.sequences = sequences
        self.max_seq_length = max_seq_length
        self.mask_prob = mask_prob
        self.item_num = item_num
        self.special_tokens = {
            'pad': 0,
            'mask': item_num + 1
        }
        self.vocab_size = item_num + 2

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return ``(input_ids, labels)`` for one sequence.

        ``input_ids`` has masked positions replaced by the mask token;
        ``labels`` holds the original token at masked positions and -100
        everywhere else.
        """
        pad = self.special_tokens['pad']
        # list() lets tuples / arrays be padded by concatenation as well.
        history = list(self.sequences[idx][-self.max_seq_length:])
        padded = history + [pad] * (self.max_seq_length - len(history))
        tokens = torch.tensor(padded, dtype=torch.long)

        input_ids = tokens.clone()
        labels = tokens.clone()

        real_positions = tokens != pad
        probability_matrix = torch.full(labels.shape, self.mask_prob)
        masked_indices = torch.bernoulli(probability_matrix).bool() & real_positions

        # Short sequences often end up with nothing masked, which gives an
        # all-ignored label vector (and a NaN loss if the whole batch is like
        # that). Guarantee one target by masking the last real item.
        if self.mask_prob > 0 and not masked_indices.any() and real_positions.any():
            last_real = real_positions.nonzero(as_tuple=True)[0][-1]
            masked_indices[last_real] = True

        labels[~masked_indices] = -100
        input_ids[masked_indices] = self.special_tokens['mask']

        return input_ids, labels

In [None]:
# BERT4Rec training
def train_bert4rec(train_sequences, item_num, max_seq_length=50, epochs=3, batch_size=50):
    """Train a small BERT masked-LM over item sequences (BERT4Rec).

    Args:
        train_sequences: Per-user item-token sequences, passed to BERT4RecDataset.
        item_num: Number of distinct items; determines the vocabulary size.
        max_seq_length: Length every sequence is truncated / padded to.
        epochs: Number of passes over the data.
        batch_size: Mini-batch size.

    Returns:
        Tuple ``(model, special_tokens)``: the trained BertForMaskedLM and the
        dataset's dict holding the 'pad' and 'mask' token ids.
    """
    dataset = BERT4RecDataset(train_sequences, item_num, max_seq_length)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    pad_token = dataset.special_tokens['pad']

    # BERT4Rec configuration
    config = BertConfig(
        vocab_size=dataset.vocab_size,
        max_position_embeddings=max_seq_length,
        hidden_size=256,
        num_hidden_layers=2,
        num_attention_heads=4,
        intermediate_size=512,
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        pad_token_id=pad_token
    )

    model = BertForMaskedLM(config)
    model.to(device)

    # transformers.AdamW is deprecated in favour of the PyTorch optimizer.
    # eps / weight_decay are set to the values the transformers version used
    # by default, so the hyper-parameters are unchanged.
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, eps=1e-6, weight_decay=0.0)

    model.train()
    for epoch in range(epochs):
        epoch_loss = 0.0
        counted_batches = 0
        for input_ids, labels in tqdm(dataloader, desc=f"Epoch {epoch + 1}/{epochs}"):
            input_ids = input_ids.to(device)
            labels = labels.to(device)

            # A batch without a single masked position has no targets; its
            # loss would be NaN and the update would corrupt the weights.
            if not (labels != -100).any():
                continue

            # Keep the model from attending to padding positions.
            attention_mask = (input_ids != pad_token).long()

            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
            counted_batches += 1
        print(f'Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss / max(counted_batches, 1)}')
    return model, dataset.special_tokens

In [None]:
# CatBoost as the ranking model
def train_ranking_model(train_df, user_features, item_features):
    """Train a CatBoostRanker on interactions joined with user / item features.

    Args:
        train_df: Interactions with 'user_id', 'item_id', 'rating', 'domain',
            'day_of_week' and 'hour_of_day' columns (other columns are ignored).
        user_features: Per-user features keyed by 'user_id'; must provide
            'user_rating_mean' and 'user_rating_count'.
        item_features: Per-item features keyed by 'item_id' (and 'domain' when
            present); must provide 'item_rating_mean' and 'item_rating_count'.

    Returns:
        The fitted CatBoostRanker.
    """
    cat_features = ['domain', 'day_of_week', 'hour_of_day']
    # Exactly the columns (and order) the prediction cell builds for its
    # candidates; any other column would make predict() disagree with the
    # trained model.
    feature_columns = [
        'user_rating_mean', 'user_rating_count',
        'item_rating_mean', 'item_rating_count',
        'day_of_week', 'hour_of_day', 'domain',
    ]

    # Join on 'domain' too when both sides carry it; merging on 'item_id'
    # alone would produce 'domain_x' / 'domain_y' and lose the 'domain' column.
    item_keys = ['item_id']
    if 'domain' in train_df.columns and 'domain' in item_features.columns:
        item_keys.append('domain')

    merged = (
        train_df.merge(user_features, on='user_id')
        .merge(item_features, on=item_keys)
        # CatBoost requires all rows of one group to be contiguous.
        .sort_values('user_id', kind='stable')
        .reset_index(drop=True)
    )

    y_train = merged['rating']
    group_id = merged['user_id']
    X_train = merged[feature_columns].copy()
    print(X_train.head(1))

    # Categorical features are passed to CatBoost as strings
    for col in cat_features:
        X_train[col] = X_train[col].astype(str)

    train_pool = Pool(
        data=X_train,
        label=y_train,
        group_id=group_id,
        cat_features=cat_features
    )
    model = CatBoostRanker(
        iterations=100,
        depth=6,
        learning_rate=0.1,
        # Train on the GPU only when one is actually present.
        task_type='GPU' if torch.cuda.is_available() else 'CPU',
        verbose=10
    )
    model.fit(train_pool)
    return model

In [None]:
# Preprocessing
def preprocess_data(train, test):
    """Jointly preprocess the train and test interactions.

    Missing values are filled with 0, 'timestamp' is parsed into a 'datetime'
    column, 'day_of_week' and 'hour_of_day' are derived from it and the
    original 'timestamp' column is dropped.

    Args:
        train: Training interactions with a 'timestamp' column.
        test: Test interactions with a 'timestamp' column.

    Returns:
        Tuple ``(train, test)`` of new DataFrames; the inputs are not modified.
    """
    # Tag the rows on copies (assign) so the split can be undone later without
    # adding a helper column to the caller's frames.
    combined = pd.concat(
        [train.assign(is_test=0), test.assign(is_test=1)],
        ignore_index=True,
    )

    # Fill missing values (just in case)
    combined = combined.fillna(0)

    # Parse the timestamp into a proper datetime column
    combined['datetime'] = pd.to_datetime(combined['timestamp'])

    # Time-based features
    combined['day_of_week'] = combined['datetime'].dt.dayofweek
    combined['hour_of_day'] = combined['datetime'].dt.hour

    # The raw column is no longer needed
    combined = combined.drop(columns=['timestamp'])

    # Split back into train and test; explicit copies so that later column
    # assignments on the results never touch `combined`.
    train_out = combined[combined['is_test'] == 0].drop(columns=['is_test']).copy()
    test_out = combined[combined['is_test'] == 1].drop(columns=['is_test']).copy()

    return train_out, test_out

In [None]:
# Load the train and test tables of both domains from the parquet files
train_zvuk = pq.read_table(os.path.join(INPUT_PATH, 'train_zvuk.parquet')).to_pandas()
test_zvuk = pq.read_table(os.path.join(INPUT_PATH, 'test_zvuk.parquet')).to_pandas()
train_smm = pq.read_table(os.path.join(INPUT_PATH, 'train_smm.parquet')).to_pandas()
test_smm = pq.read_table(os.path.join(INPUT_PATH, 'test_smm.parquet')).to_pandas()

In [None]:
# Add a domain indicator column to every frame
train_zvuk['domain'] = 'zvuk'
test_zvuk['domain'] = 'zvuk'
train_smm['domain'] = 'smm'
test_smm['domain'] = 'smm'

In [None]:
# Shift the MegaMarket item ids so that they never coincide with Zvuk ids.
# The largest Zvuk id is taken over train AND test: a Zvuk item that occurs
# only in the test set could otherwise share an id with a shifted smm item.
# NOTE: this cell modifies the frames in place; re-running it without
# reloading the data shifts the ids a second time.
max_item_id_zvuk = max(train_zvuk['item_id'].max(), test_zvuk['item_id'].max())
train_smm['item_id'] += max_item_id_zvuk + 1
test_smm['item_id'] += max_item_id_zvuk + 1

In [None]:
# Combine the train and the test data of the two domains
train = pd.concat([train_zvuk, train_smm], ignore_index=True)
test = pd.concat([test_zvuk, test_smm], ignore_index=True)

In [None]:
# Preprocessing: fill missing values and derive the datetime features
train, test = preprocess_data(train, test)

In [None]:
# Encode user_id as integer labels (fitted on train, applied to test)
# NOTE(review): the encoder only knows train users; presumably every test user
# also appears in train, otherwise transform() is expected to fail — confirm.
user_encoder = LabelEncoder()
train['user_id_enc'] = user_encoder.fit_transform(train['user_id'])
test['user_id_enc'] = user_encoder.transform(test['user_id'])

In [None]:
# Encode item_id as integer labels (fitted on train, applied to test)
# NOTE(review): the encoder only knows train items; presumably every test item
# also appears in train, otherwise transform() is expected to fail — confirm.
item_encoder = LabelEncoder()
train['item_id_enc'] = item_encoder.fit_transform(train['item_id'])
test['item_id_enc'] = item_encoder.transform(test['item_id'])

In [None]:
# Train the ALS model on the encoded (user, item, rating) triples of both domains
print("Обучение модели ALS...")
als_model = generate_candidates_als(train[['user_id_enc', 'item_id_enc', 'rating']], user_encoder, item_encoder)

In [None]:
# Save the ALS model (pickled into MODEL_PATH)
als_model_path = os.path.join(MODEL_PATH, 'als_model_combined.pkl')
with open(als_model_path, 'wb') as f:
    pickle.dump(als_model, f)
print(f"Модель ALS сохранена в {als_model_path}")

In [None]:
# Prepare the per-user item sequences for BERT4Rec (chronological order).
# BERT4RecDataset reserves token 0 for padding and item_num + 1 for the mask,
# so real items must occupy 1..item_num. The encoded item ids start at 0,
# hence the +1 shift: without it item 0 is indistinguishable from padding and
# is never used as a prediction target. To map a model token back to an
# encoded item id, subtract 1.
train_sequences = (
    train.assign(item_token=train['item_id_enc'] + 1)
    .sort_values('datetime')
    .groupby('user_id_enc')['item_token']
    .apply(list)
    .values
)
item_num = train['item_id_enc'].nunique()

In [None]:
# Train the BERT4Rec model on the per-user sequences
print("Обучение модели BERT4Rec...")
bert_model, special_tokens = train_bert4rec(train_sequences, item_num)

In [None]:
# Save the BERT4Rec model (state dict only, so loading requires rebuilding the model first)
bert_model_path = os.path.join(MODEL_PATH, 'bert_model_combined.pth')
torch.save(bert_model.state_dict(), bert_model_path)
print(f"Модель BERT4Rec сохранена в {bert_model_path}")

In [None]:
# Feature generation
def _first_mode(values):
    """Return the most frequent value of a series (the first one on ties)."""
    return values.mode()[0]


# User features: rating statistics plus the most typical day / hour of activity
user_features = (
    train.groupby('user_id')
    .agg(
        user_rating_mean=('rating', 'mean'),
        user_rating_count=('rating', 'count'),
        user_day_of_week_mode=('day_of_week', _first_mode),
        user_hour_of_day_mode=('hour_of_day', _first_mode),
    )
    .reset_index()
)

# Item features: the same statistics per (item, domain) pair
item_features = (
    train.groupby(['item_id', 'domain'])
    .agg(
        item_rating_mean=('rating', 'mean'),
        item_rating_count=('rating', 'count'),
        item_day_of_week_mode=('day_of_week', _first_mode),
        item_hour_of_day_mode=('hour_of_day', _first_mode),
    )
    .reset_index()
)

In [None]:
# Train the CatBoost ranking model
print("Обучение модели CatBoost...")
catboost_model = train_ranking_model(train, user_features, item_features)

In [None]:
# Save the CatBoost model
catboost_model_path = os.path.join(MODEL_PATH, 'catboost_model_combined.cbm')
catboost_model.save_model(catboost_model_path)
print(f"Модель CatBoost сохранена в {catboost_model_path}")

In [None]:
# Generate recommendations for each domain

# Users x items interaction matrix. It is the same for both domains, so it is
# built once instead of once per domain.
user_item_matrix = csr_matrix(
    (train['rating'], (train['user_id_enc'], train['item_id_enc']))
)

# Feature columns, in the order the ranker is fed: user / item rating
# statistics followed by the three categorical features.
ranker_feature_columns = [
    'user_rating_mean', 'user_rating_count',
    'item_rating_mean', 'item_rating_count',
    'day_of_week', 'hour_of_day', 'domain',
]

# Offset that was added to the raw item ids of each domain before training.
# It is removed again before saving so the submission holds the original ids.
item_id_offsets = {'zvuk': 0, 'smm': max_item_id_zvuk + 1}

for domain in ['zvuk', 'smm']:
    print(f"Генерация рекомендаций для {domain}...")

    # Test users of the current domain
    test_domain = test[test['domain'] == domain]

    # Generate 50*K ALS candidates per user. The candidates span both domains;
    # the ones from the other domain are dropped below.
    candidates = []
    for user in tqdm(test_domain['user_id_enc'].unique(), desc=f"Генерация кандидатов для {domain}"):
        # implicit>=0.5 wants exactly one row of liked items per requested
        # user, so only this user's row is passed (not the whole matrix).
        item_ids, _scores = als_model.recommend(
            user, user_item_matrix[user], N=K * 50, filter_already_liked_items=True
        )
        candidates.extend((user, item) for item in item_ids)

    candidate_df = pd.DataFrame(candidates, columns=['user_id_enc', 'item_id_enc'])
    candidate_df = candidate_df.drop_duplicates()
    candidate_df['user_id'] = user_encoder.inverse_transform(candidate_df['user_id_enc'])
    candidate_df['item_id'] = item_encoder.inverse_transform(candidate_df['item_id_enc'])

    # Attach the item features (they carry the item's domain) and keep only
    # candidates that belong to the current domain
    candidate_df = candidate_df.merge(item_features, on='item_id', how='left')
    candidate_df = candidate_df[candidate_df['domain'] == domain]

    # Attach the user features
    candidate_df = candidate_df.merge(user_features, on='user_id', how='left')

    # No interaction time exists for a candidate, so the item's most typical
    # day / hour stand in for the categorical time features. They go through
    # int first so the strings look like the ones produced by str() on integer
    # columns ('3', not '3.0').
    X_candidate = candidate_df.assign(
        day_of_week=candidate_df['item_day_of_week_mode'].astype(int).astype(str),
        hour_of_day=candidate_df['item_hour_of_day_mode'].astype(int).astype(str),
    )[ranker_feature_columns]

    # Score the candidates and restore the original item ids of the domain
    candidate_df = candidate_df.assign(
        score=catboost_model.predict(X_candidate),
        item_id=candidate_df['item_id'] - item_id_offsets[domain],
    )

    # Top-K items per user, best score first. Users whose candidates were all
    # from the other domain do not appear in the result.
    recommendations = (
        candidate_df.sort_values('score', ascending=False)
        .groupby('user_id')['item_id']
        .apply(lambda items: items.tolist()[:K])
        .reset_index()
    )
    recommendations.columns = ['user_id', 'item_ids']

    # Save the results
    submission_file = f'submission_{domain}.parquet'
    recommendations.to_parquet(os.path.join(DATA_PATH, submission_file), index=False)
    print(f"Рекомендации для {domain} Сохранены в {submission_file}")