In [None]:
# %mkdir notebooks
# %cd notebooks

In [None]:
import os
import torch
import random
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from torch import nn, optim
from sklearn.metrics import f1_score
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

## **Settings**

In [None]:
def set_all_seeds(seed=42):
    """Seed every random number generator used in this notebook.

    Args:
        seed: Integer seed applied to Python's ``random`` module, NumPy and
            PyTorch (CPU generator and all CUDA devices).
    """
    # Python's built-in generator.
    random.seed(seed)
    # Exported for any child process started later. Hash randomisation of the
    # interpreter that is already running is fixed at start-up and is NOT
    # changed by this assignment.
    os.environ['PYTHONHASHSEED'] = str(seed)
    # NumPy's legacy global generator.
    np.random.seed(seed)

    # PyTorch default generator.
    torch.manual_seed(seed)
    # Seed every CUDA device, not only the current one; this call is silently
    # ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    # Ask cuDNN for deterministic kernels and disable its auto-tuner, which
    # may otherwise pick different algorithms from run to run.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


set_all_seeds()

## Data

### **Initial**

In [None]:
# Read the three raw tables (train, test, new test) in one pass; the generator
# variable does not leak into the notebook namespace.
train_df, test_df, new_test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/ml_ozon_сounterfeit_train.csv",
        "../data/ml_ozon_сounterfeit_test.csv",
        "../data/ml_ozon_сounterfeit_new_test.csv",
    )
)

In [None]:
# Targets for the training rows come straight from the training table.
train_labels = train_df.loc[:, "resolution"]
# Labels for the first test table are read from the 'prediction' column of a
# previously saved submission file.
best_submission = pd.read_csv("../submissions/best.csv")
test_labels = best_submission['prediction']

In [None]:
# Stack train (without its target column), test and new-test rows into one frame.
# ignore_index=True gives the result a clean 0..N-1 index; without it every part
# keeps its own 0-based labels and the combined frame carries duplicate labels,
# which breaks any later label-aligned operation.
df = pd.concat(
    [train_df.drop(columns=["resolution"]), test_df, new_test_df],
    ignore_index=True,
)
# Remember how many rows each part contributed before the originals are deleted.
train_size = len(train_df)
test_size = len(test_df)
new_test_size = len(new_test_df)
# Free the separate frames; only the combined one is kept.
del train_df, test_df, new_test_df

In [None]:
# Numeric input columns, in the exact order they are fed to the scaler and model.
num_features = [
    # review / rating counters
    'rating_1_count', 'rating_2_count', 'rating_3_count', 'rating_4_count', 'rating_5_count',
    'comments_published_count', 'photos_published_count', 'videos_published_count',
    # price and item age
    'PriceDiscounted', 'item_time_alive',
    # item-level counters over 7 / 30 / 90 windows
    'item_count_fake_returns7', 'item_count_fake_returns30', 'item_count_fake_returns90',
    'item_count_sales7', 'item_count_sales30', 'item_count_sales90',
    'item_count_returns7', 'item_count_returns30', 'item_count_returns90',
    # totals over 7 / 30 / 90 windows
    'GmvTotal7', 'GmvTotal30', 'GmvTotal90',
    'ExemplarAcceptedCountTotal7', 'ExemplarAcceptedCountTotal30', 'ExemplarAcceptedCountTotal90',
    'OrderAcceptedCountTotal7', 'OrderAcceptedCountTotal30', 'OrderAcceptedCountTotal90',
    'ExemplarReturnedCountTotal7', 'ExemplarReturnedCountTotal30', 'ExemplarReturnedCountTotal90',
    'ExemplarReturnedValueTotal7', 'ExemplarReturnedValueTotal30', 'ExemplarReturnedValueTotal90',
    # assortment, stock and seller age
    'ItemVarietyCount', 'ItemAvailableCount', 'seller_time_alive',
]
# Missing numeric values are replaced with zero.
df[num_features] = df[num_features].fillna(value=0)

In [None]:
# Rescale every numeric column to the [-1, 1] range. The scaler is fitted on the
# full combined frame and then applied to the same rows.
scaler = MinMaxScaler(feature_range=(-1, 1))
scaled_values = scaler.fit(df[num_features]).transform(df[num_features])
df[num_features] = scaled_values.astype(float)

In [None]:
# Raw categorical columns and the names of their integer-encoded counterparts.
cat_raw_features = ['SellerID', 'CommercialTypeName4', 'brand_name']
cat_features = [f'{col}_enc' for col in cat_raw_features]
# Each column gets its own freshly fitted LabelEncoder; the encoded values are
# stored next to the raw column under the '<name>_enc' key.
for col, encoded_col in zip(cat_raw_features, cat_features):
    df[encoded_col] = LabelEncoder().fit_transform(df[col])

### **Text & Image Embeddings**

In [None]:
# Precomputed embedding matrices loaded from disk.
image_embeddings = np.load('../data/embeddings/image_clip_large_half_embeddings.npy')
text_embeddings = np.load('../data/embeddings/text_rubert_tiny2_half_embeddings.npy')

# Store one embedding vector per row; iterating the matrix yields its rows, and
# assigning a list places them positionally.
df['image_embedding'] = [vector for vector in image_embeddings]
df['text_embedding'] = [vector for vector in text_embeddings]

### **Split**

In [None]:
# Cut the combined frame back into its three parts by row position.
test_end = train_size + test_size
train_df = df.iloc[:train_size]
test_df = df.iloc[train_size:test_end]
new_test_df = df.iloc[test_end:]

In [None]:
# Join the train and test rows (and their labels, in the same order) into one
# table with a fresh 0..N-1 index so features and labels line up by position.
train_test_df = pd.concat(objs=(train_df, test_df), axis=0, ignore_index=True)
train_test_labels = pd.concat(objs=(train_labels, test_labels), axis=0, ignore_index=True)

In [None]:
from sklearn.model_selection import StratifiedGroupKFold

# Stratified folds that keep all rows of one seller in the same fold.
n_splits = 10
skf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=42)
# split() returns a one-shot generator. Materialise it so that the cell which
# iterates over the folds can be re-run without first re-running this cell;
# an exhausted generator would silently yield zero folds.
splits = list(skf.split(train_test_df, train_test_labels, groups=train_test_df['SellerID']))

### **Class Dataset**

In [None]:
class MultimodalDataset(Dataset):
    """Map-style dataset that yields one multimodal sample per DataFrame row.

    Each sample is a dict with the keys ``'image'``, ``'text'``, ``'cat'``,
    ``'num'`` and, when labels were supplied, ``'labels'``.

    Args:
        df: Frame holding the columns ``'image_embedding'`` and
            ``'text_embedding'`` (expected to hold one vector per row) plus the
            numeric and categorical feature columns.
        labels: Optional sequence of integer class labels, aligned with ``df``
            by position. When omitted, samples carry no ``'labels'`` key.
        num_cols: Names of the numeric feature columns. Defaults to the
            notebook-level ``num_features`` list.
        cat_cols: Names of the integer-encoded categorical columns. Defaults
            to the notebook-level ``cat_features`` list.
    """

    def __init__(self, df, labels=None, num_cols=None, cat_cols=None):
        self.df = df.reset_index(drop=True)
        self.labels = np.array(labels) if labels is not None else None

        num_cols = list(num_features if num_cols is None else num_cols)
        cat_cols = list(cat_features if cat_cols is None else cat_cols)

        # Pull the per-row data out of the frame once. Building a pandas row
        # object and re-selecting columns on every __getitem__ call is far
        # slower than indexing pre-extracted NumPy arrays.
        self._image = self.df['image_embedding'].to_numpy()
        self._text = self.df['text_embedding'].to_numpy()
        self._num = self.df[num_cols].to_numpy(dtype=float)
        self._cat = self.df[cat_cols].to_numpy().astype(np.int64)

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        # Embeddings are returned as stored; numeric features become a float64
        # tensor and categorical codes a long tensor.
        sample = {
            'image': self._image[idx],
            'text': self._text[idx],
            'cat': torch.tensor(self._cat[idx], dtype=torch.long),
            'num': torch.tensor(self._num[idx]),
        }

        # Attach the label only when labels were provided.
        if self.labels is not None:
            sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)

        return sample


## **Model**

In [None]:
class MultimodalModel(nn.Module):
    """Late-fusion classifier over text, image, categorical and numeric inputs.

    Every modality is projected separately, the four results are concatenated
    and passed through a two-layer head that outputs class logits.

    Args:
        num_classes: Number of output logits.
        cat_embed_dim: Embedding width used for every categorical feature.
        hidden_dim: Width of the text, image and numeric projections and of
            the hidden layer in the classification head.
        text_dim: Length of the incoming text embedding vectors.
        image_dim: Length of the incoming image embedding vectors.
        num_dim: Number of numeric features. Defaults to
            ``len(num_features)`` from the notebook namespace.
        cat_cardinalities: Number of distinct values for each categorical
            feature, in column order. Defaults to ``df[f].nunique()`` for each
            ``f`` in the notebook-level ``cat_features``.
    """

    def __init__(self, num_classes=2, cat_embed_dim=128, hidden_dim=256,
                 text_dim=624, image_dim=1024, num_dim=None, cat_cardinalities=None):
        super().__init__()
        if cat_cardinalities is None:
            cat_cardinalities = [df[feature].nunique() for feature in cat_features]
        else:
            cat_cardinalities = list(cat_cardinalities)
        if num_dim is None:
            num_dim = len(num_features)

        # One embedding table per categorical feature; each table has one
        # more row than the number of distinct values.
        self.cat_embed = nn.ModuleList([
            nn.Embedding(num_embeddings=cardinality + 1, embedding_dim=cat_embed_dim)
            for cardinality in cat_cardinalities
        ])
        cat_out_dim = cat_embed_dim * len(cat_cardinalities)
        self.cat_norm = nn.LayerNorm(cat_out_dim)
        self.num_layers = nn.Sequential(
            nn.Linear(num_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2)
        )
        self.text_proj = nn.Sequential(
            nn.Linear(text_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.ReLU()
        )
        self.image_proj = nn.Sequential(
            nn.Linear(image_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.ReLU()
        )

        # The head consumes text + image + numeric projections (hidden_dim
        # each) plus the concatenated categorical embeddings. Computing the
        # width here keeps the layer valid for any hidden_dim / cat_embed_dim;
        # with the defaults and three categorical features it equals 1152.
        self.fc = nn.Sequential(
            nn.Linear(3 * hidden_dim + cat_out_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim, num_classes)
        )

    def forward(self, sample):
        """Return class logits for a batch.

        Args:
            sample: Dict with ``'cat'`` (integer tensor indexed as
                ``[:, feature]``), ``'num'``, ``'text'`` and ``'image'``
                tensors. The last three are cast to float32 before use.
        """
        cat_embeds = torch.cat([
            table(sample['cat'][:, column]) for column, table in enumerate(self.cat_embed)
        ], dim=1)
        cat_embeds = self.cat_norm(cat_embeds)
        num_outputs = self.num_layers(sample['num'].to(torch.float32))
        text_embeds = self.text_proj(sample['text'].to(torch.float32))
        image_embeds = self.image_proj(sample['image'].to(torch.float32))
        concat = torch.cat([text_embeds, image_embeds, cat_embeds, num_outputs], dim=1)
        logits = self.fc(concat)
        return logits

### **Train**

In [None]:
# One model is trained per cross-validation fold and collected here.
training_models = []
device = 'cuda' if torch.cuda.is_available() else 'cpu'

for split_idx, (train_idx, valid_idx) in enumerate(splits):
    # Fold indices are positional, hence iloc for both features and labels.
    split_train_df = train_test_df.iloc[train_idx]
    split_valid_df = train_test_df.iloc[valid_idx]
    split_train_labels = train_test_labels.iloc[train_idx]
    split_valid_labels = train_test_labels.iloc[valid_idx]

    train_set = MultimodalDataset(split_train_df, split_train_labels)
    valid_set = MultimodalDataset(split_valid_df, split_valid_labels)

    # The last incomplete training batch is dropped; validation keeps every row.
    train_loader = DataLoader(train_set, batch_size=64, shuffle=True, drop_last=True)
    valid_loader = DataLoader(valid_set, batch_size=64, shuffle=False)

    # Fresh model, loss and optimizer for every fold.
    model = MultimodalModel().to(device)
    loss_fn = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=3e-4)

    num_epochs = 3
    for epoch in range(1, num_epochs + 1):
        # ---------- training pass ----------
        y_true, y_pred = [], []
        model.train()
        progress_bar = tqdm(train_loader, desc="Training")
        for i, batch in enumerate(progress_bar):
            # Move every tensor of the collated batch to the target device.
            batch = {key: value.to(device) for key, value in batch.items()}

            optimizer.zero_grad()
            outputs = model(batch)
            loss = loss_fn(outputs, batch['labels'])
            loss.backward()
            optimizer.step()

            y_true += batch['labels'].cpu().tolist()
            y_pred += torch.argmax(outputs, dim=1).cpu().tolist()

            # Refresh the running macro-F1 in the progress bar every 10th batch.
            if not i % 10:
                progress_bar.set_postfix(loss=loss.item(), score=f1_score(y_true, y_pred, average='macro'))

        # ---------- validation pass ----------
        y_true, y_pred = [], []
        model.eval()
        with torch.no_grad():
            progress_bar = tqdm(valid_loader, desc="Validating")
            for i, batch in enumerate(progress_bar):
                batch = {key: value.to(device) for key, value in batch.items()}

                outputs = model(batch)
                loss = loss_fn(outputs, batch['labels'])

                y_true += batch['labels'].cpu().tolist()
                y_pred += torch.argmax(outputs, dim=1).cpu().tolist()

                if not i % 10:
                    progress_bar.set_postfix(loss=loss.item(), score=f1_score(y_true, y_pred, average='macro'))

        # Macro-F1 over the whole validation fold for this epoch.
        epoch_score = f1_score(y_true, y_pred, average='macro')
        print(f"Split {split_idx + 1} - Epoch {epoch} - Score: {epoch_score}\n")

    training_models.append(model)

In [None]:
# torch.save does not create missing parent directories, so make sure the
# target folder exists before writing the checkpoints.
os.makedirs("../models", exist_ok=True)
# One state-dict file per fold model, numbered from 1.
for idx, model in enumerate(training_models):
    torch.save(model.state_dict(), f"../models/multimodal_{idx + 1}.pth")

In [None]:
# training_models = [MultimodalModel() for _ in range(n_splits)]
# for idx, model in enumerate(training_models):
#     model.load_state_dict(torch.load(f"../models/multimodal_{idx + 1}.pth"))
#     model.to(device)

## Submission

In [None]:
new_test_set = MultimodalDataset(new_test_df)
new_test_loader = DataLoader(new_test_set, batch_size=64, shuffle=False)

# Put every fold model into inference mode explicitly. Freshly constructed or
# re-loaded modules start in training mode, where Dropout is active and
# BatchNorm uses batch statistics, which would distort the predictions.
for model in training_models:
    model.eval()

probabilities = []
with torch.no_grad():
    progress_bar = tqdm(new_test_loader, desc="Testing")
    for batch in progress_bar:
        # Move every tensor of the collated batch to the models' device.
        batch = {key: value.to(device) for key, value in batch.items()}

        # Positive-class probability from each fold model: after vstack the
        # array is (n_models, batch_size).
        batch_probabilities = [
            torch.softmax(model(batch), dim=1).cpu().numpy()[:, 1]
            for model in training_models
        ]
        batch_probabilities = np.vstack(batch_probabilities)
        probabilities.append(batch_probabilities)

# Join the batches along the sample axis, then average over the models.
probabilities = np.hstack(probabilities)
probabilities = np.mean(probabilities, axis=0)

In [None]:
all_test_df = pd.concat([test_df, new_test_df], ignore_index=True)
# Prepend the labels of the first test table to the new-test probabilities.
# The length guard makes this cell safe to re-run: once `probabilities` already
# covers both tables, the labels are not prepended a second time.
if len(probabilities) == len(new_test_df):
    probabilities = np.concatenate([test_labels, probabilities])
assert len(probabilities) == len(all_test_df), (
    "probabilities must have one entry per row of all_test_df"
)

In [None]:
# Rows whose score reaches the threshold are predicted as class 1.
threshold = 0.5
is_positive = probabilities >= threshold
prediction = is_positive.astype(int)
# Share of rows predicted as class 1 (displayed as the cell output).
prediction.mean()

In [None]:
# Two-column table: row id and hard 0/1 prediction.
submission = all_test_df[['id']].assign(prediction=prediction)
submission.to_csv('../submissions/multimodal.csv', index=False)
submission.head()

In [None]:
# Same layout as the hard-label file, but with the raw scores kept.
submission_probs = all_test_df[['id']].assign(probability=probabilities)
submission_probs.to_csv('../submissions/multimodal_probs.csv', index=False)
submission_probs.head()