In [1]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    AdamW,
    get_linear_schedule_with_warmup
)
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm
import random
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [3]:
train[['CommercialTypeName4', 'description', 'brand_name', 'name_rus']] = train[['CommercialTypeName4', 'description', 'brand_name', 'name_rus']].fillna('Пусто')
test[['CommercialTypeName4', 'description', 'brand_name', 'name_rus']] = test[['CommercialTypeName4', 'description', 'brand_name', 'name_rus']].fillna('Пусто')

In [4]:
train['PriceDiscounted'] = 2**(train.PriceDiscounted/69.6606)
test['PriceDiscounted'] = 2**(test.PriceDiscounted/69.6606)

In [5]:
all_texts = ('Тип товара:\n' + train['CommercialTypeName4'] + '\n\nНазвание товара:\n' + train['name_rus'] + '\n\nБренд:\n' + train['brand_name'] + '\n\nОписание:\n' + train['description'] + '\n\nЦена:\n' + train['PriceDiscounted'].round().map(int).map(str) + ' рублей').values.tolist()

In [6]:
labels = train.resolution.values.tolist()

In [7]:
print(all_texts[0])

Тип товара:
Пылесборник

Название товара:
Мешки для пылесоса PHILIPS TRIATLON, синтетические, многослойные, тип: HR 6947

Бренд:
ACTRUM

Описание:
Мешки пылесборники для пылесоса PHILIPS, 10 шт., синтетические, многослойные, бренд: ACTRUM, арт. AK-10/10, тип оригинального мешка: HR 6947.Подходят для пылесосов:PHILIPS: HR6955, HR6947, HR6888, HR6844 TRIATHLON, HR6843 TRIATHLON, HR6842 TRIATHLON, HR6841 TRIATHLON, HR6840 TRIATHLON, HR6839 TRIATHLON, HR6838 TRIATHLON, HR6837 TRIATHLON, HR6836 TRIATHLON, HR6835 TRIATHLON, HR6834 TRIATHLON, HR6833 TRIATHLON, HR6832 TRIATHLON, HR6831 TRIATHLON, HR6830 TRIATHLON, HR6829 TRIATHLON, HR6828 TRIATHLON, HR6827 TRIATHLON, HR6826 TRIATHLON, HR6825 TRIATHLON, HR6824 TRIATHLON, HR6823 TRIATHLON, HR6822 TRIATHLON, HR6821 TRIATHLON, HR6820 TRIATHLON, HR6819 TRIATHLON, HR6818 TRIATHLON, HR6817 TRIATHLON, HR6816 TRIATHLON, HR6815 TRIATHLON, HR6814 - HR6845 TRIATHLON, FC6844 TRIATHLON, FC6843 TRIATHLON, FC6842 TRIATHLON, FC6841 - FC6845 TRIATHLONОдноразовы

In [8]:
MODEL_NAME = "sergeyzh/BERTA"
BATCH_SIZE = 64
EPOCHS = 3
LEARNING_RATE = 2e-5
MAX_LENGTH = 256

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [10]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, 
    num_labels=2  # Бинарная классификация
)
model.to(device);

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sergeyzh/BERTA and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
class TextClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [12]:
def calculate_f1(preds, labels):
    preds = np.argmax(preds, axis=1)
    return f1_score(labels, preds)

In [13]:
def train_epoch(model, data_loader, optimizer, scheduler, device):
    model.train()
    losses = []
    correct_predictions = 0
    
    for batch in tqdm(data_loader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        optimizer.zero_grad()
        
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        
        loss = outputs.loss
        logits = outputs.logits
        
        _, preds = torch.max(logits, dim=1)
        correct_predictions += torch.sum(preds == labels)
        
        losses.append(loss.item())
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
    
    return np.mean(losses), correct_predictions.double() / len(data_loader.dataset)

In [14]:
def eval_model(model, data_loader, device):
    model.eval()
    losses = []
    correct_predictions = 0
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Validation"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            
            loss = outputs.loss
            logits = outputs.logits
            
            _, preds = torch.max(logits, dim=1)
            correct_predictions += torch.sum(preds == labels)
            
            losses.append(loss.item())
            all_preds.extend(logits.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    accuracy = correct_predictions.double() / len(data_loader.dataset)
    return np.mean(losses), accuracy, all_preds, all_labels

In [15]:
def predict(text, model, tokenizer, device, max_length=256):
    model.eval()
    
    encoding = tokenizer(
        text,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt'
    )
    
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=1)
        prediction = torch.argmax(probabilities, dim=1).cpu().numpy()[0]
        confidence = probabilities.cpu().numpy()[0][prediction]
    
    return prediction, confidence

In [16]:
test[['CommercialTypeName4', 'description', 'brand_name', 'name_rus']] = test[['CommercialTypeName4', 'description', 'brand_name', 'name_rus']].fillna('Пусто')

In [17]:
test = test.fillna(0)

In [18]:
all_texts_test = ('Тип товара:\n' + test['CommercialTypeName4'] + '\n\nНазвание товара:\n' + test['name_rus'] + '\n\nБренд:\n' + test['brand_name'] + '\n\nОписание:\n' + test['description'] + '\n\nЦена:\n' + test['PriceDiscounted'].round().map(int).map(str) + ' рублей').values.tolist()

In [19]:
from sklearn.model_selection import StratifiedKFold

n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
preds_test = np.zeros(len(test))

for fold, (train_idx, test_idx) in enumerate(skf.split(all_texts, labels)):
    print("-" * 50)
    print(f"Fold: {fold+1}")
    print("-" * 50)
    train_texts, val_texts = pd.Series(all_texts)[train_idx].tolist(),  pd.Series(all_texts)[test_idx].tolist()
    train_labels, val_labels = pd.Series(labels)[train_idx].tolist(), pd.Series(labels)[test_idx].tolist()

    train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, MAX_LENGTH)
    val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, MAX_LENGTH)

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, correct_bias=False)
    total_steps = len(train_loader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )

    # Основной цикл обучения
    best_f1 = 0
    for epoch in range(EPOCHS):
        print(f"Epoch {epoch + 1}/{EPOCHS}")
        print("-" * 50)
        
        train_loss, train_acc = train_epoch(
            model, train_loader, optimizer, scheduler, device
        )
        
        val_loss, val_acc, val_preds, val_labels = eval_model(
            model, val_loader, device
        )
        
        val_f1 = calculate_f1(val_preds, val_labels)
        
        print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")
        print(f"Val Loss: {val_loss:.4f}, Val F1: {val_f1:.4f}")
        
        # Сохранение лучшей модели
        if val_f1 > best_f1:
            best_f1 = val_f1
            torch.save(model.state_dict(), f'berta_{fold+1}.pth')
            print("Model saved!")
    
    model.save_pretrained(f"./berta_binary_classifier_{fold}")
    tokenizer.save_pretrained(f"./berta_binary_classifier_{fold}")
    print("Model and tokenizer saved!")

    preds = [predict(text, model, tokenizer, device) for text in tqdm(all_texts_test)]
    
    probas = []

    for pred, conf in preds:
        if pred == 0:
            probas.append(1 - conf)
        else:
            probas.append(conf)
    probas = np.array(probas)

    preds_test += probas / n_splits



--------------------------------------------------
Fold: 1
--------------------------------------------------
Epoch 1/3
--------------------------------------------------


Training: 100%|██████████| 2774/2774 [28:43<00:00,  1.61it/s] 
Validation: 100%|██████████| 309/309 [01:07<00:00,  4.57it/s]


Train Loss: 0.0905, Train Acc: 0.9667
Val Loss: 0.0752, Val F1: 0.7790
Model saved!
Epoch 2/3
--------------------------------------------------


Training: 100%|██████████| 2774/2774 [28:02<00:00,  1.65it/s]
Validation: 100%|██████████| 309/309 [01:07<00:00,  4.60it/s]


Train Loss: 0.0635, Train Acc: 0.9766
Val Loss: 0.0712, Val F1: 0.8048
Model saved!
Epoch 3/3
--------------------------------------------------


Training: 100%|██████████| 2774/2774 [28:02<00:00,  1.65it/s]
Validation: 100%|██████████| 309/309 [01:06<00:00,  4.63it/s]


Train Loss: 0.0522, Train Acc: 0.9811
Val Loss: 0.0705, Val F1: 0.8115
Model saved!
Model and tokenizer saved!


100%|██████████| 22760/22760 [02:30<00:00, 151.27it/s]


--------------------------------------------------
Fold: 2
--------------------------------------------------
Epoch 1/3
--------------------------------------------------


Training: 100%|██████████| 2774/2774 [27:11<00:00,  1.70it/s]
Validation: 100%|██████████| 309/309 [01:06<00:00,  4.65it/s]


Train Loss: 0.0653, Train Acc: 0.9763
Val Loss: 0.0545, Val F1: 0.8447
Model saved!
Epoch 2/3
--------------------------------------------------


Training: 100%|██████████| 2774/2774 [27:04<00:00,  1.71it/s]
Validation: 100%|██████████| 309/309 [01:06<00:00,  4.66it/s]


Train Loss: 0.0482, Train Acc: 0.9823
Val Loss: 0.0542, Val F1: 0.8457
Model saved!
Epoch 3/3
--------------------------------------------------


Training: 100%|██████████| 2774/2774 [27:03<00:00,  1.71it/s]
Validation: 100%|██████████| 309/309 [01:06<00:00,  4.66it/s]


Train Loss: 0.0392, Train Acc: 0.9857
Val Loss: 0.0578, Val F1: 0.8530
Model saved!
Model and tokenizer saved!


100%|██████████| 22760/22760 [02:28<00:00, 153.66it/s]


--------------------------------------------------
Fold: 3
--------------------------------------------------
Epoch 1/3
--------------------------------------------------


Training: 100%|██████████| 2774/2774 [27:01<00:00,  1.71it/s]
Validation: 100%|██████████| 309/309 [01:06<00:00,  4.66it/s]


Train Loss: 0.0543, Train Acc: 0.9803
Val Loss: 0.0424, Val F1: 0.8807
Model saved!
Epoch 2/3
--------------------------------------------------


Training: 100%|██████████| 2774/2774 [27:03<00:00,  1.71it/s]
Validation: 100%|██████████| 309/309 [01:06<00:00,  4.66it/s]


Train Loss: 0.0385, Train Acc: 0.9858
Val Loss: 0.0415, Val F1: 0.8889
Model saved!
Epoch 3/3
--------------------------------------------------


Training: 100%|██████████| 2774/2774 [27:10<00:00,  1.70it/s]
Validation: 100%|██████████| 309/309 [01:06<00:00,  4.66it/s]


Train Loss: 0.0304, Train Acc: 0.9889
Val Loss: 0.0427, Val F1: 0.8886
Model and tokenizer saved!


100%|██████████| 22760/22760 [02:27<00:00, 154.18it/s]


--------------------------------------------------
Fold: 4
--------------------------------------------------
Epoch 1/3
--------------------------------------------------


Training: 100%|██████████| 2774/2774 [27:07<00:00,  1.70it/s]
Validation: 100%|██████████| 309/309 [01:06<00:00,  4.66it/s]


Train Loss: 0.0456, Train Acc: 0.9835
Val Loss: 0.0357, Val F1: 0.9040
Model saved!
Epoch 2/3
--------------------------------------------------


Training: 100%|██████████| 2774/2774 [27:11<00:00,  1.70it/s]
Validation: 100%|██████████| 309/309 [01:06<00:00,  4.65it/s]


Train Loss: 0.0310, Train Acc: 0.9886
Val Loss: 0.0360, Val F1: 0.9080
Model saved!
Epoch 3/3
--------------------------------------------------


Training: 100%|██████████| 2774/2774 [27:45<00:00,  1.67it/s]
Validation: 100%|██████████| 309/309 [01:07<00:00,  4.58it/s]


Train Loss: 0.0243, Train Acc: 0.9909
Val Loss: 0.0377, Val F1: 0.9094
Model saved!
Model and tokenizer saved!


100%|██████████| 22760/22760 [02:30<00:00, 151.49it/s]


--------------------------------------------------
Fold: 5
--------------------------------------------------
Epoch 1/3
--------------------------------------------------


Training: 100%|██████████| 2774/2774 [27:33<00:00,  1.68it/s]
Validation: 100%|██████████| 309/309 [01:07<00:00,  4.57it/s]


Train Loss: 0.0397, Train Acc: 0.9860
Val Loss: 0.0276, Val F1: 0.9273
Model saved!
Epoch 2/3
--------------------------------------------------


Training: 100%|██████████| 2774/2774 [27:30<00:00,  1.68it/s]
Validation: 100%|██████████| 309/309 [01:07<00:00,  4.57it/s]


Train Loss: 0.0265, Train Acc: 0.9904
Val Loss: 0.0279, Val F1: 0.9291
Model saved!
Epoch 3/3
--------------------------------------------------


Training: 100%|██████████| 2774/2774 [27:30<00:00,  1.68it/s]
Validation: 100%|██████████| 309/309 [01:07<00:00,  4.57it/s]


Train Loss: 0.0200, Train Acc: 0.9924
Val Loss: 0.0291, Val F1: 0.9270
Model and tokenizer saved!


100%|██████████| 22760/22760 [02:29<00:00, 152.00it/s]


--------------------------------------------------
Fold: 6
--------------------------------------------------
Epoch 1/3
--------------------------------------------------


Training: 100%|██████████| 2774/2774 [27:32<00:00,  1.68it/s]
Validation: 100%|██████████| 309/309 [01:07<00:00,  4.57it/s]


Train Loss: 0.0346, Train Acc: 0.9874
Val Loss: 0.0258, Val F1: 0.9262
Model saved!
Epoch 2/3
--------------------------------------------------


Training: 100%|██████████| 2774/2774 [27:30<00:00,  1.68it/s]
Validation: 100%|██████████| 309/309 [01:07<00:00,  4.57it/s]


Train Loss: 0.0221, Train Acc: 0.9918
Val Loss: 0.0274, Val F1: 0.9275
Model saved!
Epoch 3/3
--------------------------------------------------


Training: 100%|██████████| 2774/2774 [27:30<00:00,  1.68it/s]
Validation: 100%|██████████| 309/309 [01:07<00:00,  4.57it/s]


Train Loss: 0.0167, Train Acc: 0.9938
Val Loss: 0.0305, Val F1: 0.9284
Model saved!
Model and tokenizer saved!


100%|██████████| 22760/22760 [02:30<00:00, 151.39it/s]


--------------------------------------------------
Fold: 7
--------------------------------------------------
Epoch 1/3
--------------------------------------------------


Training: 100%|██████████| 2774/2774 [27:29<00:00,  1.68it/s]
Validation: 100%|██████████| 309/309 [01:07<00:00,  4.58it/s]


Train Loss: 0.0313, Train Acc: 0.9892
Val Loss: 0.0178, Val F1: 0.9468
Model saved!
Epoch 2/3
--------------------------------------------------


Training: 100%|██████████| 2774/2774 [27:27<00:00,  1.68it/s]
Validation: 100%|██████████| 309/309 [01:07<00:00,  4.57it/s]


Train Loss: 0.0200, Train Acc: 0.9928
Val Loss: 0.0195, Val F1: 0.9445
Epoch 3/3
--------------------------------------------------


Training: 100%|██████████| 2774/2774 [27:26<00:00,  1.69it/s]
Validation: 100%|██████████| 309/309 [01:07<00:00,  4.60it/s]


Train Loss: 0.0149, Train Acc: 0.9943
Val Loss: 0.0208, Val F1: 0.9465
Model and tokenizer saved!


100%|██████████| 22760/22760 [02:29<00:00, 152.43it/s]


--------------------------------------------------
Fold: 8
--------------------------------------------------
Epoch 1/3
--------------------------------------------------


Training: 100%|██████████| 2774/2774 [27:11<00:00,  1.70it/s]
Validation: 100%|██████████| 309/309 [01:06<00:00,  4.62it/s]


Train Loss: 0.0283, Train Acc: 0.9900
Val Loss: 0.0182, Val F1: 0.9453
Model saved!
Epoch 2/3
--------------------------------------------------


Training: 100%|██████████| 2774/2774 [27:13<00:00,  1.70it/s]
Validation: 100%|██████████| 309/309 [01:06<00:00,  4.62it/s]


Train Loss: 0.0176, Train Acc: 0.9936
Val Loss: 0.0181, Val F1: 0.9507
Model saved!
Epoch 3/3
--------------------------------------------------


Training: 100%|██████████| 2774/2774 [27:10<00:00,  1.70it/s]
Validation: 100%|██████████| 309/309 [01:06<00:00,  4.62it/s]


Train Loss: 0.0123, Train Acc: 0.9953
Val Loss: 0.0215, Val F1: 0.9504
Model and tokenizer saved!


100%|██████████| 22760/22760 [02:29<00:00, 152.63it/s]


--------------------------------------------------
Fold: 9
--------------------------------------------------
Epoch 1/3
--------------------------------------------------


Training: 100%|██████████| 2774/2774 [27:13<00:00,  1.70it/s]
Validation: 100%|██████████| 309/309 [01:06<00:00,  4.62it/s]


Train Loss: 0.0253, Train Acc: 0.9911
Val Loss: 0.0132, Val F1: 0.9579
Model saved!
Epoch 2/3
--------------------------------------------------


Training: 100%|██████████| 2774/2774 [27:14<00:00,  1.70it/s]
Validation: 100%|██████████| 309/309 [01:07<00:00,  4.60it/s]


Train Loss: 0.0158, Train Acc: 0.9941
Val Loss: 0.0130, Val F1: 0.9642
Model saved!
Epoch 3/3
--------------------------------------------------


Training: 100%|██████████| 2774/2774 [27:10<00:00,  1.70it/s]
Validation: 100%|██████████| 309/309 [01:06<00:00,  4.62it/s]


Train Loss: 0.0113, Train Acc: 0.9957
Val Loss: 0.0141, Val F1: 0.9648
Model saved!
Model and tokenizer saved!


100%|██████████| 22760/22760 [02:28<00:00, 152.77it/s]


--------------------------------------------------
Fold: 10
--------------------------------------------------
Epoch 1/3
--------------------------------------------------


Training: 100%|██████████| 2774/2774 [27:12<00:00,  1.70it/s]
Validation: 100%|██████████| 309/309 [01:07<00:00,  4.61it/s]


Train Loss: 0.0240, Train Acc: 0.9916
Val Loss: 0.0141, Val F1: 0.9583
Model saved!
Epoch 2/3
--------------------------------------------------


Training: 100%|██████████| 2774/2774 [27:08<00:00,  1.70it/s]
Validation: 100%|██████████| 309/309 [01:06<00:00,  4.63it/s]


Train Loss: 0.0139, Train Acc: 0.9949
Val Loss: 0.0138, Val F1: 0.9557
Epoch 3/3
--------------------------------------------------


Training: 100%|██████████| 2774/2774 [27:13<00:00,  1.70it/s]
Validation: 100%|██████████| 309/309 [01:06<00:00,  4.62it/s]


Train Loss: 0.0101, Train Acc: 0.9962
Val Loss: 0.0173, Val F1: 0.9593
Model saved!
Model and tokenizer saved!


100%|██████████| 22760/22760 [02:28<00:00, 153.01it/s]


In [20]:
multimodal_probs = pd.read_csv('multimodal_proba.csv').proba

In [None]:
submission = pd.DataFrame({
    'id': test['id'], 
    # 'prediction': (preds_test * 0.4 + multimodal_probs * 0.6 >= 0.6).astype(int) best
    # 'prediction': (np.array([max([a, b]) for a, b in zip(preds_test, multimodal_probs)]) >= 0.8).astype(int)
    # 'prediction': (preds_test >= 0.7).astype(int)
    'prediction': (preds_test * 0.4 + multimodal_probs * 0.6 >= 0.6).astype(int)

})
submission.to_csv('berta_kfolds.csv', index=False)
submission.head()

Unnamed: 0,id,prediction
0,17384,0
1,260316,0
2,10610,0
3,205236,0
4,308655,0


In [41]:
submission.prediction.mean()

np.float64(0.044200351493848856)