In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import StepLR
import random
import os
from sklearn.metrics import f1_score

# Pin every random number generator used in this notebook (Python's `random`,
# PyTorch, NumPy) to the same fixed seed so repeated runs start from the same
# random state.
random.seed(0)
torch.manual_seed(0)
np.random.seed(0)

In [2]:
raw_model = "sberbank-ai/ruRoberta-large"

# Pick the GPU when one is available and fall back to CPU otherwise, so the
# cell does not raise on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Two output labels: the rest of the notebook encodes the target as a
# two-element vector per sample.
model = RobertaForSequenceClassification.from_pretrained(raw_model, num_labels = 2).to(device)
tokenizer = RobertaTokenizer.from_pretrained(raw_model)

Downloading:   0%|          | 0.00/674 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32G [00:00<?, ?B/s]

Some weights of the model checkpoint at sberbank-ai/ruRoberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at sberbank-ai/ruRoberta-large and are newly initialized: ['classifier.out_proj.bias', 'classifier.out

Downloading:   0%|          | 0.00/1.73M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

In [3]:
def transform_df(df):
    """Return a copy of ``df`` whose ``is_fake`` column is one-hot encoded.

    A scalar label equal to 1 becomes ``[0, 1]``; any other scalar becomes
    ``[1, 0]``. Values that are already lists/tuples are passed through
    unchanged, so applying the function twice (e.g. when a notebook cell is
    re-run) no longer collapses every label to ``[1, 0]``.

    Args:
        df: DataFrame with an ``is_fake`` column.

    Returns:
        A new DataFrame; the input frame is left unmodified.
    """
    def _one_hot(label):
        # Already-encoded labels are kept as they are (copied to a fresh list).
        if isinstance(label, (list, tuple)):
            return list(label)
        return [0, 1] if label == 1 else [1, 0]

    # Work on a copy so the caller's frame is not mutated as a side effect.
    encoded = df.copy()
    encoded['is_fake'] = encoded['is_fake'].apply(_one_hot)
    return encoded

In [4]:
class PairsDataset(Dataset):
    """Map-style dataset that pairs each input with the label at the same index."""

    def __init__(self, x, y):
        # Both containers are stored as given; they are indexed in parallel.
        self.x, self.y = x, y

    def __len__(self):
        # The dataset size is defined by the inputs container.
        return len(self.x)

    def __getitem__(self, idx):
        sample = self.x[idx]
        target = self.y[idx]
        return sample, target

def data_collator(batch):
    """Collate ``(text, label)`` pairs into model-ready tensors.

    Uses the notebook-level ``tokenizer`` and ``model``; both outputs are
    placed on ``model.device``.

    Args:
        batch: sequence of ``(text, label)`` pairs, as returned by
            ``PairsDataset.__getitem__``.

    Returns:
        ``(x, y)`` where ``x`` is the tokenizer output for the batch texts
        (PyTorch tensors, padded) and ``y`` is a float32 tensor built from
        the batch labels.
    """
    texts = [pair[0] for pair in batch]
    labels = [pair[1] for pair in batch]

    # torch.tensor with an explicit dtype replaces the legacy torch.Tensor
    # constructor, and builds the labels directly on the target device
    # instead of creating them on the CPU and copying.
    y = torch.tensor(labels, dtype=torch.float32, device=model.device)

    # NOTE(review): no truncation is requested here, so inputs longer than the
    # model's maximum sequence length are not shortened — confirm the texts
    # are short enough.
    x = tokenizer(texts, return_tensors='pt', padding=True).to(model.device)
    return (x, y)

In [5]:
# Input folders: the training split is read from one folder, validation and
# test splits from another.
PATH_TO_TRAIN_FOLDER = "../input/augmented-train"
PATH_TO_VALID_TEST_FOLDER = "../input/kontyp-parsed-train-test"

train = pd.read_feather(os.path.join(PATH_TO_TRAIN_FOLDER, 'train.feather'))

# Validation and test files sit side by side, so read them in one pass
# (validation first, then test).
valid, test = (
    pd.read_feather(os.path.join(PATH_TO_VALID_TEST_FOLDER, file_name))
    for file_name in ('validate.feather', 'test.feather')
)

In [6]:
BATCH_SIZE = 32


def _build_split(frame, shuffle):
    """Wrap a frame's ``title``/``is_fake`` columns in a dataset and a batched loader."""
    dataset = PairsDataset(frame.title.values, frame.is_fake.values)
    loader = DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        drop_last=False,
        shuffle=shuffle,
        collate_fn=data_collator,
    )
    return dataset, loader


# Encode the labels of every split (train, then valid, then test).
train, valid, test = transform_df(train), transform_df(valid), transform_df(test)

# Only the training loader shuffles; validation and test keep file order.
train_dataset, train_dataloader = _build_split(train, shuffle=True)
valid_dataset, valid_dataloader = _build_split(valid, shuffle=False)
test_dataset, test_dataloader = _build_split(test, shuffle=False)

In [7]:
def evaluate_model(model, test_dataloader):
    """Compute the sample-weighted mean loss and micro-averaged F1 over a dataloader.

    The function runs the forward passes under ``torch.no_grad()`` but does
    not switch the model between train/eval mode; the caller is responsible
    for calling ``model.eval()`` beforehand.

    Args:
        model: model called with ``input_ids``, ``attention_mask``, ``labels``
            and ``return_dict=True``; its output must expose ``loss`` and
            ``logits``.
        test_dataloader: iterable yielding ``(x, y)`` batches, where ``x`` has
            ``input_ids`` and ``attention_mask`` attributes and ``y`` is a
            tensor with one row of per-class label values per sample.

    Returns:
        ``(val_loss, f1_valid)``: the mean loss weighted by the number of
        samples in each batch, and the micro-averaged F1 score between the
        argmax of the labels and the argmax of the logits.

    Raises:
        ZeroDivisionError: if the dataloader yields no samples.
    """
    loss_sum = 0.0
    n_samples = 0
    y_true = list()
    y_pred = list()
    for x, y in test_dataloader:
        with torch.no_grad():
            output = model(
                input_ids=x.input_ids,
                attention_mask=x.attention_mask,
                labels=y,
                return_dict=True
            )

            # Weight each batch loss by its number of samples. The label
            # tensor's first dimension is used because len() of the
            # dict-like tokenizer output counts its keys, not the samples.
            batch_size = y.shape[0]
            loss_sum += batch_size * output.loss.item()
            n_samples += batch_size

            y_pred.extend(torch.argmax(output.logits, 1).tolist())
            y_true.extend(torch.argmax(y, 1).tolist())

    val_loss = loss_sum / n_samples
    # NOTE(review): with single-label predictions, micro-averaged F1 is
    # expected to coincide with plain accuracy — confirm this is the metric
    # intended here.
    f1_valid = f1_score(y_true, y_pred, average = 'micro')
    return val_loss, f1_valid

In [8]:
def train_loop(
    model, train_dataloader, val_dataloader, 
    max_epochs=10, 
    lr=1e-5,
    eval_steps = 50,
    save_path=None
):
    """Fine-tune ``model`` and checkpoint the weights with the best validation F1.

    Every ``eval_steps`` batches (including batch 0 of every epoch) the model
    is evaluated on ``val_dataloader``; whenever the validation F1 improves on
    the best value seen so far, the model's ``state_dict`` is saved.

    Args:
        model: model to train; called with ``input_ids``, ``attention_mask``,
            ``labels`` and ``return_dict=True`` and expected to return an
            object with a ``loss`` attribute.
        train_dataloader: iterable of ``(x, y)`` training batches.
        val_dataloader: iterable of ``(x, y)`` batches passed to
            ``evaluate_model``.
        max_epochs: number of passes over ``train_dataloader``.
        lr: initial learning rate for Adam.
        eval_steps: evaluation interval in batches; also the window size for
            the reported running training loss.
        save_path: where to write the best checkpoint. When ``None`` the
            notebook-level ``SAVE_PATH`` constant is used, which preserves
            the previous behaviour.

    Returns:
        None. Progress is printed; the best weights are written to disk.
    """
    # Resolve the destination up front so that a missing SAVE_PATH fails
    # before any training work is done rather than at the first checkpoint.
    if save_path is None:
        save_path = SAVE_PATH

    model.train()
    optimizer = torch.optim.Adam(params = [p for p in model.parameters() if p.requires_grad], lr=lr)
    # The scheduler is stepped once per epoch, so the learning rate is halved
    # every 3 epochs.
    scheduler = StepLR(optimizer, step_size = 3, gamma=0.5)
    best_f1 = float('-inf')
    
    for epoch in range(max_epochs):
        print('EPOCH', epoch)
        # Per-epoch loss history; reset so the running mean never mixes epochs.
        losses = list()
        for i, (x, y) in enumerate(train_dataloader):
            output = model(
                input_ids=x.input_ids,
                attention_mask=x.attention_mask,
                labels=y,
                return_dict=True
            )
            loss = output.loss
            
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            
            losses.append(loss.item())
            
            if i % eval_steps == 0:
                # Switch to eval mode for validation, then back to training.
                model.eval()
                train_loss = np.mean(losses[-eval_steps:])
                eval_loss, eval_f1 = evaluate_model(model, val_dataloader)
                if eval_f1 > best_f1:
                    best_f1 = eval_f1
                    torch.save(model.state_dict(), save_path)
                print(f'step {i} train_loss: {train_loss:.3} eval_loss: {eval_loss:.3} eval_f1: {eval_f1:.3}')
                model.train()
        scheduler.step()

In [9]:
# File that receives the state_dict of the best-scoring model during training.
SAVE_PATH = 'ruroberta_model'

# Launch fine-tuning: 12 epochs, initial learning rate 2e-5, validation every
# 100 batches.
train_loop(
    model,
    train_dataloader,
    valid_dataloader,
    max_epochs=12,
    lr=2e-5,
    eval_steps=100,
)

EPOCH 0
step 0 train_loss: 0.705 eval_loss: 0.692 eval_f1: 0.504
step 100 train_loss: 0.517 eval_loss: 0.252 eval_f1: 0.896
step 200 train_loss: 0.381 eval_loss: 0.211 eval_f1: 0.917
step 300 train_loss: 0.366 eval_loss: 0.451 eval_f1: 0.763
step 400 train_loss: 0.448 eval_loss: 0.238 eval_f1: 0.902
step 500 train_loss: 0.347 eval_loss: 0.206 eval_f1: 0.926
step 600 train_loss: 0.33 eval_loss: 0.2 eval_f1: 0.932
step 700 train_loss: 0.31 eval_loss: 0.186 eval_f1: 0.928
step 800 train_loss: 0.305 eval_loss: 0.236 eval_f1: 0.905
step 900 train_loss: 0.286 eval_loss: 0.164 eval_f1: 0.935
step 1000 train_loss: 0.295 eval_loss: 0.187 eval_f1: 0.923
step 1100 train_loss: 0.335 eval_loss: 0.208 eval_f1: 0.924
EPOCH 1
step 0 train_loss: 0.447 eval_loss: 0.304 eval_f1: 0.87
step 100 train_loss: 0.261 eval_loss: 0.189 eval_f1: 0.928
step 200 train_loss: 0.261 eval_loss: 0.224 eval_f1: 0.924
step 300 train_loss: 0.357 eval_loss: 0.22 eval_f1: 0.911
step 400 train_loss: 0.303 eval_loss: 0.183 eval

In [None]:
# Restore the checkpoint stored at SAVE_PATH. map_location pins the loaded
# tensors to the model's current device, so a checkpoint saved during a GPU
# run can also be restored in a session without CUDA.
model.load_state_dict(torch.load(SAVE_PATH, map_location=model.device))
# Switch to inference mode; the trailing ';' keeps the notebook from printing
# the full module repr as the cell output.
model.eval();

In [11]:
# Score the current model on the held-out test split and report both metrics.
test_loss, f1 = evaluate_model(model=model, test_dataloader=test_dataloader)
print(f'Test loss: {test_loss}\nTest F1: {f1}')

Test loss: 0.23154260992834513
Test F1: 0.9403409090909091
