In [57]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from PIL import Image
import os
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
train = pd.read_csv("/kaggle/input/russian-ai-olympiad-2022-b/Train.csv", sep=';')
                     # encoding='utf-8', engine='python', quotechar="'",
                     # on_bad_lines='skip')
test = pd.read_csv("/kaggle/input/russian-ai-olympiad-2022-b/Test.csv", sep=';')

subm = pd.read_csv("/kaggle/input/russian-ai-olympiad-2022-b/submission.csv", sep=';')

In [4]:
class2idx = {
    'Развлечения и юмор': 0,
    'Кулинария': 1,
    'Торговля и объявления': 2,
    'СМИ': 3,
    'Философия и религия': 4,
    'Животные': 5,
    'Творчество и дизайн': 6,
    'Путешествия': 7,
}

idx2class = {v: k for k, v in class2idx.items()}

In [5]:
print(f"Before: {len(train)}")
train = train[train['label'].isin(list(class2idx.keys()))]
print(f"After: {len(train)}")

train['label'] = train['label'].map(class2idx)
train = train.fillna('')
test = test.fillna('')

Before: 6401
After: 4947


In [6]:
# from sentence_transformers import SentenceTransformer

# model = SentenceTransformer("intfloat/multilingual-e5-large").to(device)

# sentences = [
#     "The weather is lovely today.",
#     "It's so sunny outside!",
#     "He drove to the stadium."
# ]
# embeddings = model.encode(sentences)

In [7]:
# from transformers import pipeline

# pipe = pipeline("image-feature-extraction", model="facebook/dinov2-base", use_fast=True, return_tensors='pt', device=device)

In [8]:
# images = [f"/kaggle/input/russian-ai-olympiad-2022-b/Data/Train/{i}" for i in train['id']]

# embeddings_img = []

# for image in tqdm(images):
#     embeddings_img.append(pipe(image)[:, 0, :])

# embeddings_img = torch.cat(embeddings_img)
# embeddings_img.shape

In [9]:
# torch.save(embeddings_img, 'embeddings_img.pt')

In [10]:
# images = [f"/kaggle/input/russian-ai-olympiad-2022-b/Data/Test/{i}" for i in test['id']]

# embeddings_img = []

# for image in tqdm(images):
#     embeddings_img.append(pipe(image)[:, 0, :])

# embeddings_img = torch.cat(embeddings_img)
# embeddings_img.shape

In [11]:
# torch.save(embeddings_img, 'embeddings_img_test.pt')

In [12]:
# batch_size = 32

# embeddings_text = []

# for i in tqdm(range(0, len(train), batch_size)):
#     sentences = train['description'][i:i+batch_size].tolist()
#     embeddings_text.append(torch.tensor(model.encode(sentences)))

# embeddings_text = torch.cat(embeddings_text)
# embeddings_text.shape

In [13]:
# torch.save(embeddings_text, 'embeddings_text.pt')

In [14]:
# batch_size = 32

# embeddings_text = []

# for i in tqdm(range(0, len(test), batch_size)):
#     sentences = test['description'][i:i+batch_size].tolist()
#     embeddings_text.append(torch.tensor(model.encode(sentences)))

# embeddings_text = torch.cat(embeddings_text)
# embeddings_text.shape

In [15]:
# torch.save(embeddings_text, 'embeddings_text_test.pt')

In [16]:
embeddings_img = torch.load("/kaggle/input/russian-ai-olympiad-2022-b/embeddings_img.pt")
embeddings_img_test = torch.load("/kaggle/input/russian-ai-olympiad-2022-b/embeddings_img_test.pt")

embeddings_text = torch.load("/kaggle/input/russian-ai-olympiad-2022-b/embeddings_text.pt")
embeddings_text_test = torch.load("/kaggle/input/russian-ai-olympiad-2022-b/embeddings_text_test.pt")

In [18]:
embeddings = torch.cat([embeddings_img, embeddings_text], dim=1)
embeddings_test = torch.cat([embeddings_img_test, embeddings_text_test], dim=1)

In [61]:
from sklearn.model_selection import train_test_split

embeddings_train, embeddings_valid, labels_train, labels_valid = train_test_split(embeddings, train['label'], stratify=train['label'], random_state=42, test_size=0.1)
labels_train = labels_train.reset_index(drop=True)
labels_valid = labels_valid.reset_index(drop=True)

In [62]:
BATCH_SIZE = 64

class EmbeddingsDataset(Dataset):
    def __init__(self, embeddings, labels=None):
        super().__init__()
        self.embeddings = embeddings
        self.labels = labels
    def __len__(self):
        return len(self.embeddings)
    def __getitem__(self, idx):
        if self.labels is not None:
            return self.embeddings[idx], torch.tensor(self.labels[idx])
        return self.embeddings[idx]

train_ds = EmbeddingsDataset(embeddings=embeddings_train, labels=labels_train)
valid_ds = EmbeddingsDataset(embeddings=embeddings_valid, labels=labels_valid)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_ds, batch_size=BATCH_SIZE, shuffle=True)

In [63]:
train['label'].value_counts().sort_index().values

array([1396,  795,  689,  518,  424,  440,  331,  354])

In [64]:
weights = 1 / torch.tensor(train['label'].value_counts().sort_index().values, dtype=torch.float)
weights = weights / weights.sum() * len(class2idx)

In [76]:
class EmbeddingsNetwork(nn.Module):
    def __init__(self, input_size=768+1024, num_classes=8):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_size, 1024),
            nn.ReLU(),
            nn.Dropout(0.3),

            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Dropout(0.4),

            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.3),

            nn.Linear(256, num_classes)
        )
    def forward(self, x):
        return self.layers(x)

epochs = 30
log_rate = 5

model = EmbeddingsNetwork().to(device)
loss_fn = nn.CrossEntropyLoss(weight=weights.to(device))
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5, weight_decay=1e-3)

In [77]:
for epoch in tqdm(range(epochs), desc='Training'):
    model.train()
    running_train_loss = 0
    i = 0
    for X, y in (pbar := tqdm(train_loader, leave=False, desc='Train DataLoader')):
        i += 1
        X, y = X.to(device), y.to(device)
        logits = model(X)
        loss = loss_fn(logits, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_train_loss += loss.item()
        pbar.set_postfix({'loss': f'{running_train_loss/i:.5f}'})

    if (epoch+1)%log_rate==0:
        print(f"Epoch {epoch+1}/{epochs} | Train Loss: {running_train_loss/i:.5f}")
    
    model.eval()
    all_preds, all_targets = [], []
    i = 0
    for X, y in (pbar := tqdm(valid_loader, leave=False, desc='Valid DataLoader')):
        i += 1
        X, y = X.to(device), y.to(device)
        with torch.no_grad():
            logits = model(X)
        probs = torch.softmax(logits, dim=-1)
        preds = torch.argmax(probs, dim=-1)

        all_preds.extend(preds.detach().cpu().numpy())
        all_targets.extend(y.detach().cpu().numpy())
    
    macro_f1 = f1_score(all_targets, all_preds, average='macro')

    if (epoch+1)%log_rate==0:
        print(f"Macro F1: {macro_f1:.5f}")

Training:   0%|          | 0/30 [00:00<?, ?it/s]

Train DataLoader:   0%|          | 0/70 [00:00<?, ?it/s]

Valid DataLoader:   0%|          | 0/8 [00:00<?, ?it/s]

Train DataLoader:   0%|          | 0/70 [00:00<?, ?it/s]

Valid DataLoader:   0%|          | 0/8 [00:00<?, ?it/s]

Train DataLoader:   0%|          | 0/70 [00:00<?, ?it/s]

Valid DataLoader:   0%|          | 0/8 [00:00<?, ?it/s]

Train DataLoader:   0%|          | 0/70 [00:00<?, ?it/s]

Valid DataLoader:   0%|          | 0/8 [00:00<?, ?it/s]

Train DataLoader:   0%|          | 0/70 [00:00<?, ?it/s]

Epoch 5/30 | Train Loss: 1.58224


Valid DataLoader:   0%|          | 0/8 [00:00<?, ?it/s]

Macro F1: 0.58978


Train DataLoader:   0%|          | 0/70 [00:00<?, ?it/s]

Valid DataLoader:   0%|          | 0/8 [00:00<?, ?it/s]

Train DataLoader:   0%|          | 0/70 [00:00<?, ?it/s]

Valid DataLoader:   0%|          | 0/8 [00:00<?, ?it/s]

Train DataLoader:   0%|          | 0/70 [00:00<?, ?it/s]

Valid DataLoader:   0%|          | 0/8 [00:00<?, ?it/s]

Train DataLoader:   0%|          | 0/70 [00:00<?, ?it/s]

Valid DataLoader:   0%|          | 0/8 [00:00<?, ?it/s]

Train DataLoader:   0%|          | 0/70 [00:00<?, ?it/s]

Epoch 10/30 | Train Loss: 0.80806


Valid DataLoader:   0%|          | 0/8 [00:00<?, ?it/s]

Macro F1: 0.67782


Train DataLoader:   0%|          | 0/70 [00:00<?, ?it/s]

Valid DataLoader:   0%|          | 0/8 [00:00<?, ?it/s]

Train DataLoader:   0%|          | 0/70 [00:00<?, ?it/s]

Valid DataLoader:   0%|          | 0/8 [00:00<?, ?it/s]

Train DataLoader:   0%|          | 0/70 [00:00<?, ?it/s]

Valid DataLoader:   0%|          | 0/8 [00:00<?, ?it/s]

Train DataLoader:   0%|          | 0/70 [00:00<?, ?it/s]

Valid DataLoader:   0%|          | 0/8 [00:00<?, ?it/s]

Train DataLoader:   0%|          | 0/70 [00:00<?, ?it/s]

Epoch 15/30 | Train Loss: 0.58357


Valid DataLoader:   0%|          | 0/8 [00:00<?, ?it/s]

Macro F1: 0.70645


Train DataLoader:   0%|          | 0/70 [00:00<?, ?it/s]

Valid DataLoader:   0%|          | 0/8 [00:00<?, ?it/s]

Train DataLoader:   0%|          | 0/70 [00:00<?, ?it/s]

Valid DataLoader:   0%|          | 0/8 [00:00<?, ?it/s]

Train DataLoader:   0%|          | 0/70 [00:00<?, ?it/s]

Valid DataLoader:   0%|          | 0/8 [00:00<?, ?it/s]

Train DataLoader:   0%|          | 0/70 [00:00<?, ?it/s]

Valid DataLoader:   0%|          | 0/8 [00:00<?, ?it/s]

Train DataLoader:   0%|          | 0/70 [00:00<?, ?it/s]

Epoch 20/30 | Train Loss: 0.45055


Valid DataLoader:   0%|          | 0/8 [00:00<?, ?it/s]

Macro F1: 0.71439


Train DataLoader:   0%|          | 0/70 [00:00<?, ?it/s]

Valid DataLoader:   0%|          | 0/8 [00:00<?, ?it/s]

Train DataLoader:   0%|          | 0/70 [00:00<?, ?it/s]

Valid DataLoader:   0%|          | 0/8 [00:00<?, ?it/s]

Train DataLoader:   0%|          | 0/70 [00:00<?, ?it/s]

Valid DataLoader:   0%|          | 0/8 [00:00<?, ?it/s]

Train DataLoader:   0%|          | 0/70 [00:00<?, ?it/s]

Valid DataLoader:   0%|          | 0/8 [00:00<?, ?it/s]

Train DataLoader:   0%|          | 0/70 [00:00<?, ?it/s]

Epoch 25/30 | Train Loss: 0.34856


Valid DataLoader:   0%|          | 0/8 [00:00<?, ?it/s]

Macro F1: 0.72619


Train DataLoader:   0%|          | 0/70 [00:00<?, ?it/s]

Valid DataLoader:   0%|          | 0/8 [00:00<?, ?it/s]

Train DataLoader:   0%|          | 0/70 [00:00<?, ?it/s]

Valid DataLoader:   0%|          | 0/8 [00:00<?, ?it/s]

Train DataLoader:   0%|          | 0/70 [00:00<?, ?it/s]

Valid DataLoader:   0%|          | 0/8 [00:00<?, ?it/s]

Train DataLoader:   0%|          | 0/70 [00:00<?, ?it/s]

Valid DataLoader:   0%|          | 0/8 [00:00<?, ?it/s]

Train DataLoader:   0%|          | 0/70 [00:00<?, ?it/s]

Epoch 30/30 | Train Loss: 0.26979


Valid DataLoader:   0%|          | 0/8 [00:00<?, ?it/s]

Macro F1: 0.72420


In [82]:
test_ds = EmbeddingsDataset(embeddings=embeddings_test)

test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)

In [83]:
all_preds = []

for X in test_loader:
    X = X.to(device)
    with torch.no_grad():
        logits = model(X)
    probs = torch.softmax(logits, dim=-1)
    preds = torch.argmax(probs, dim=-1)
    all_preds.extend(preds.detach().cpu().numpy())

len(all_preds), len(subm)

(2565, 2565)

In [91]:
subm['label'] = list(map(idx2class.get, all_preds))

subm.to_csv("submission.csv", index=False, sep=';')

subm.head()

Unnamed: 0,id,label
0,909340245742,Развлечения и юмор
1,909342962411,Развлечения и юмор
2,909343087161,Путешествия
3,909344193109,Торговля и объявления
4,909346841420,Кулинария
