In [1]:
import os
import sys
from collections import Counter

import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import f1_score

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, get_cosine_schedule_with_warmup, AdamW

In [2]:
train = pd.read_csv('/kaggle/input/vtoroy-pilot/train.csv')

CLASSES = list(train['answer_class'].unique())
CLASSES

[11,
 15,
 27,
 21,
 25,
 10,
 2,
 6,
 22,
 3,
 12,
 24,
 26,
 0,
 5,
 17,
 1,
 4,
 14,
 13,
 23,
 8,
 20,
 7,
 18,
 9,
 29,
 16,
 28,
 19]

### Manual cleaning of text from job offers and advertisements


In [3]:
labels = dict(zip(CLASSES, range(len(CLASSES))))

class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, phase='train'):
        self.phase = phase
        
        if self.phase == 'train':
            self.labels = [labels[label] for label in df['answer_class']]
        elif self.phase == 'test':
            self.oid = [oid for oid in df['oid']]
            
        self.texts = [tokenizer(text, 
                               padding='max_length', max_length = 512, truncation=True,
                                return_tensors="pt") for text in df['text']]

    def classes(self):
        return self.labels

    def __len__(self):
        if self.phase == 'train':
            return len(self.labels)
        elif self.phase == 'test':
            return len(self.oid)

    def get_batch_labels(self, idx):
        return np.array(self.labels[idx])
    
    def get_batch_oid(self, idx):
        return np.array(self.oid[idx])

    def get_batch_texts(self, idx):
        return self.texts[idx]

    def __getitem__(self, idx):
        if self.phase == 'train':
            batch_texts = self.get_batch_texts(idx)
            batch_y = self.get_batch_labels(idx)
            return batch_texts, batch_y
        elif self.phase == 'test':
            batch_texts = self.get_batch_texts(idx)
            batch_oid = self.get_batch_oid(idx)
            return batch_texts, batch_oid
   

In [8]:
class BertClassifier:
    def __init__(self, model_path, tokenizer_path, data, n_classes=13, epochs=5):
        self.model = BertForSequenceClassification.from_pretrained(model_path)
        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
        self.data = data
        self.device = torch.device('cuda')
        self.max_len = 512
        self.epochs = epochs
        self.out_features = self.model.bert.encoder.layer[1].output.dense.out_features
        self.model.classifier = torch.nn.Linear(self.out_features, n_classes).to(self.device)
        self.model = self.model.to(self.device)

    
    def preparation(self):
        self.df_train, self.df_val = np.split(self.data.sample(frac=1, random_state=42), 
                                     [int(.85*len(self.data))])
        
        self.train, self.val = CustomDataset(self.df_train, self.tokenizer, phase='train'), CustomDataset(self.df_val, self.tokenizer, phase='train')
        self.train_dataloader = torch.utils.data.DataLoader(self.train, batch_size=4, shuffle=True)
        self.val_dataloader = torch.utils.data.DataLoader(self.val, batch_size=4)
    
       
        self.optimizer = AdamW(self.model.parameters(), lr=2e-5, correct_bias=False)
        self.scheduler = get_cosine_schedule_with_warmup(
                self.optimizer,
                num_warmup_steps=0,
                num_training_steps=len(self.train_dataloader) * self.epochs
            )
        self.loss_fn = torch.nn.CrossEntropyLoss().to(self.device)
            
    def fit(self):
        self.model = self.model.train()
        
        for epoch_num in range(self.epochs):
            total_acc_train = 0
            total_loss_train = 0
            for train_input, train_label in tqdm(self.train_dataloader):
                train_label = train_label.to(self.device)
                mask = train_input['attention_mask'].to(self.device)
                input_id = train_input['input_ids'].squeeze(1).to(self.device)
                output = self.model(input_id.to(self.device), mask.to(self.device))

                batch_loss = self.loss_fn(output[0], train_label.long())
                total_loss_train += batch_loss.item()

                acc = (output[0].argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                self.model.zero_grad()
                batch_loss.backward()
                self.optimizer.step()
                self.scheduler.step()
            total_acc_val, total_loss_val, f1 = self.eval()
           
            print(
            f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(self.df_train): .3f} \
            | Train Accuracy: {total_acc_train / len(self.df_train): .3f} \
            | Val Loss: {total_loss_val / len(self.df_val): .3f} \
            | Val Accuracy: {total_acc_val / len(self.df_val): .3f} \
            | Val F1: {f1: .3f}')

            
            os.makedirs('checkpoint', exist_ok=True)
            torch.save(self.model, f'checkpoint/BertClassifier{epoch_num}.pt')

        return total_acc_train, total_loss_train
    
    def eval(self):
        self.model = self.model.eval()
        total_acc_val = 0
        total_loss_val = 0
        y_true = []
        y_pred = []

        with torch.no_grad():
            for val_input, val_label in tqdm(self.val_dataloader):
                val_label = val_label.to(self.device)
                mask = val_input['attention_mask'].to(self.device)
                input_id = val_input['input_ids'].squeeze(1).to(self.device)

                output = self.model(input_id.to(self.device), mask.to(self.device))

                batch_loss = self.loss_fn(output[0], val_label.long())
                total_loss_val += batch_loss.item()

                acc = (output[0].argmax(dim=1) == val_label).sum().item()
                pred_label = output[0].argmax(dim=1)
                y_pred.extend(pred_label)
                y_true.extend(val_label)
                total_acc_val += acc
        y_true_tensor = torch.tensor(y_true)
        y_pred_tensor = torch.tensor(y_pred)
        f1 = f1_score(y_true_tensor, y_pred_tensor, average='micro')
        
        return total_acc_val, total_loss_val, f1
    

In [15]:
model_path = 'cointegrated/rubert-tiny2'
tokenizer_path = 'cointegrated/rubert-tiny2'
bert_tiny = BertClassifier(model_path, tokenizer_path, train,n_classes = 30, epochs=15)

Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not 

In [16]:
%%time
bert_tiny.preparation()

CPU times: user 483 ms, sys: 18.7 ms, total: 502 ms
Wall time: 488 ms




In [17]:
bert_tiny.fit()

100%|██████████| 118/118 [00:03<00:00, 37.62it/s]
100%|██████████| 21/21 [00:00<00:00, 138.03it/s]


Epochs: 1 | Train Loss:  0.718             | Train Accuracy:  0.465             | Val Loss:  0.544             | Val Accuracy:  0.783             | Val F1:  0.783


100%|██████████| 118/118 [00:02<00:00, 40.04it/s]
100%|██████████| 21/21 [00:00<00:00, 138.36it/s]


Epochs: 2 | Train Loss:  0.415             | Train Accuracy:  0.883             | Val Loss:  0.376             | Val Accuracy:  0.819             | Val F1:  0.819


100%|██████████| 118/118 [00:02<00:00, 40.22it/s]
100%|██████████| 21/21 [00:00<00:00, 138.31it/s]


Epochs: 3 | Train Loss:  0.272             | Train Accuracy:  0.947             | Val Loss:  0.280             | Val Accuracy:  0.867             | Val F1:  0.867


100%|██████████| 118/118 [00:02<00:00, 40.19it/s]
100%|██████████| 21/21 [00:00<00:00, 136.00it/s]


Epochs: 4 | Train Loss:  0.193             | Train Accuracy:  0.979             | Val Loss:  0.226             | Val Accuracy:  0.880             | Val F1:  0.880


100%|██████████| 118/118 [00:02<00:00, 40.14it/s]
100%|██████████| 21/21 [00:00<00:00, 137.90it/s]


Epochs: 5 | Train Loss:  0.146             | Train Accuracy:  0.983             | Val Loss:  0.192             | Val Accuracy:  0.904             | Val F1:  0.904


100%|██████████| 118/118 [00:02<00:00, 40.28it/s]
100%|██████████| 21/21 [00:00<00:00, 137.69it/s]


Epochs: 6 | Train Loss:  0.118             | Train Accuracy:  0.987             | Val Loss:  0.170             | Val Accuracy:  0.904             | Val F1:  0.904


100%|██████████| 118/118 [00:02<00:00, 40.21it/s]
100%|██████████| 21/21 [00:00<00:00, 138.22it/s]


Epochs: 7 | Train Loss:  0.097             | Train Accuracy:  0.991             | Val Loss:  0.155             | Val Accuracy:  0.916             | Val F1:  0.916


100%|██████████| 118/118 [00:02<00:00, 40.21it/s]
100%|██████████| 21/21 [00:00<00:00, 137.41it/s]


Epochs: 8 | Train Loss:  0.083             | Train Accuracy:  0.994             | Val Loss:  0.142             | Val Accuracy:  0.928             | Val F1:  0.928


100%|██████████| 118/118 [00:02<00:00, 40.27it/s]
100%|██████████| 21/21 [00:00<00:00, 137.39it/s]


Epochs: 9 | Train Loss:  0.075             | Train Accuracy:  0.996             | Val Loss:  0.137             | Val Accuracy:  0.928             | Val F1:  0.928


100%|██████████| 118/118 [00:02<00:00, 40.18it/s]
100%|██████████| 21/21 [00:00<00:00, 136.20it/s]


Epochs: 10 | Train Loss:  0.069             | Train Accuracy:  0.996             | Val Loss:  0.131             | Val Accuracy:  0.940             | Val F1:  0.940


100%|██████████| 118/118 [00:02<00:00, 40.17it/s]
100%|██████████| 21/21 [00:00<00:00, 136.42it/s]


Epochs: 11 | Train Loss:  0.065             | Train Accuracy:  0.994             | Val Loss:  0.128             | Val Accuracy:  0.940             | Val F1:  0.940


100%|██████████| 118/118 [00:02<00:00, 40.29it/s]
100%|██████████| 21/21 [00:00<00:00, 137.05it/s]


Epochs: 12 | Train Loss:  0.062             | Train Accuracy:  0.994             | Val Loss:  0.126             | Val Accuracy:  0.940             | Val F1:  0.940


100%|██████████| 118/118 [00:02<00:00, 40.22it/s]
100%|██████████| 21/21 [00:00<00:00, 136.65it/s]


Epochs: 13 | Train Loss:  0.061             | Train Accuracy:  0.996             | Val Loss:  0.125             | Val Accuracy:  0.940             | Val F1:  0.940


100%|██████████| 118/118 [00:02<00:00, 40.21it/s]
100%|██████████| 21/21 [00:00<00:00, 138.09it/s]


Epochs: 14 | Train Loss:  0.060             | Train Accuracy:  0.996             | Val Loss:  0.125             | Val Accuracy:  0.940             | Val F1:  0.940


100%|██████████| 118/118 [00:02<00:00, 40.20it/s]
100%|██████████| 21/21 [00:00<00:00, 138.47it/s]


Epochs: 15 | Train Loss:  0.060             | Train Accuracy:  0.996             | Val Loss:  0.125             | Val Accuracy:  0.940             | Val F1:  0.940


(467, 27.967378929257393)

In [None]:
test_dataset = CustomDataset(test, bert_tiny.tokenizer, phase='test')
test_dataloader = DataLoader(test_dataset, batch_size=4)

In [None]:
def inference(model, dataloader):
    all_oid = []
    all_labels = []
    label_prob = []
    
    model.cuda()
    model.eval()
    with torch.no_grad():
        for test_input, test_oid in tqdm(dataloader):
            test_oid = test_oid.cuda()
            mask = test_input['attention_mask'].cuda()
            input_id = test_input['input_ids'].squeeze(1).cuda()
            output = model(input_id, mask)
            all_oid.extend(test_oid)
            all_labels.extend(torch.argmax(output[0].softmax(1), dim=1))
            
            for prob in output[0].softmax(1):
                label_prob.append(prob)
        return ([oid.item() for oid in all_oid], [CLASSES[labels] for labels in all_labels], label_prob)

In [None]:
inference_model = torch.load(f'/kaggle/working/checkpoint/BertClassifier{bert_tiny.epochs-1}.pt')
inference_result = inference(inference_model, test_dataloader)

In [None]:
oid = [i for i in inference_result[0]]
labels = [i for i in inference_result[1]]
prob = [i for i in inference_result[2]]
print(len(dict(zip(oid, labels))))
print(len(set(oid) & set(test['oid'].unique())))
print(len(set(oid) & set(test['oid'].unique())))

In [None]:
detached_prob = []
for i in prob:
    detached_prob.append(i.cpu().numpy())

In [None]:
data = {'oid':oid, 'category':labels, 'probs':detached_prob}
submit = pd.DataFrame(data)
submit['label_int'] = submit['category'].apply(lambda x: CLASSES.index(x))

label_int = submit['label_int'].to_list()
probs = submit['probs'].to_list()
res = []
for indx, tensor in enumerate(probs):

    res.append(tensor[label_int[indx]])
submit['prob'] = res
del submit['probs'], submit['label_int']
tmp_submit = pd.DataFrame(submit.groupby(by=['oid', 'category']).sum().reset_index())

oid = tmp_submit['oid'].to_list()
category = tmp_submit['category'].to_list()
prob = tmp_submit['prob'].to_list()

res = {}
for indx, id in enumerate(oid):
    if id not in res:
        res[id] = (category[indx], prob[indx])
        
submit_data = {k:v[0] for k,v in res.items()}
oid = list(submit_data.keys())
category = list(submit_data.values())
pd.DataFrame({'oid':oid, 'category':category}).to_csv('submission.csv', index=False)