In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import StepLR
from torch.nn.functional import softmax
from sklearn.model_selection import KFold
import random
import os
from sklearn.metrics import f1_score

# Pin the three random number generators used in this notebook (Python's
# `random`, PyTorch, NumPy) to seed 0 so that reruns start from the same state.
# The NumPy call is kept last: it returns None, so the cell shows no output.
random.seed(0)
torch.manual_seed(0)
np.random.seed(0)

Загрузка токенайзера для модели. Сама модель заново инициализируется для каждого фолда, поэтому она загружается ниже, внутри цикла по фолдам.

In [2]:
# Name of the pretrained checkpoint. It is used here for the tokenizer and again
# later when a fresh classification model is instantiated for every fold.
raw_model = "sberbank-ai/ruRoberta-large"
# Tokenizer for that checkpoint; `data_collator` reads it as a module-level global.
tokenizer = RobertaTokenizer.from_pretrained(raw_model)

Downloading:   0%|          | 0.00/1.73M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/674 [00:00<?, ?B/s]

One-hot encoding целевой переменной для подсчета loss

In [3]:
def transform_df(df):
    """One-hot encode the ``is_fake`` target column.

    A label equal to ``1`` becomes ``[0, 1]``; every other value becomes
    ``[1, 0]``.

    The input frame is not modified. The previous implementation overwrote the
    column in place, so calling it a second time on the same frame compared the
    already-encoded lists against ``1`` and silently turned every label into
    ``[1, 0]``.

    Args:
        df: DataFrame containing an ``is_fake`` column of scalar labels.

    Returns:
        A new DataFrame equal to ``df`` except that ``is_fake`` holds the
        two-element list encoding.
    """
    encoded = df['is_fake'].apply(lambda label: [0, 1] if label == 1 else [1, 0])
    # `assign` builds a new frame, leaving the caller's object untouched.
    return df.assign(is_fake=encoded)

In [4]:
class PairsDataset(Dataset):
    """Map-style dataset over two parallel, index-aligned containers."""

    def __init__(self, x, y):
        # Both containers are stored as given, without copying or validation.
        self.x, self.y = x, y

    def __len__(self):
        """Number of samples, taken from the length of ``x``."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return the ``(x[idx], y[idx])`` pair as a tuple."""
        sample, target = self.x[idx], self.y[idx]
        return sample, target

def data_collator(batch):
    """Collate a list of ``(text, label)`` samples into one batch.

    Reads the module-level ``tokenizer`` and ``model``: the texts are tokenized
    with padding and returned as PyTorch tensors, the labels are packed into a
    ``torch.Tensor``, and both are moved to ``model.device``.

    Args:
        batch: list of two-element samples, text first and label second.

    Returns:
        Tuple ``(x, y)``: the tokenizer output and the label tensor.
    """
    device = model.device
    labels = torch.Tensor([sample[1] for sample in batch]).to(device)
    texts = [sample[0] for sample in batch]
    encoded = tokenizer(texts, return_tensors='pt', padding=True).to(device)
    return encoded, labels

In [5]:
# Folder that holds the tab-separated train and test files.
PATH_TO_DATA_FOLDER = "../input/fake-news-task"

# Read each split and pass it straight through `transform_df`, which encodes
# the `is_fake` column.
data = transform_df(pd.read_csv(os.path.join(PATH_TO_DATA_FOLDER, 'train.tsv'), sep='\t'))
test = transform_df(pd.read_csv(os.path.join(PATH_TO_DATA_FOLDER, 'test.tsv'), sep='\t'))

In [6]:
def get_dataloaders(train, valid):
    """Build the training and validation loaders for one fold.

    Both loaders wrap the ``title`` and ``is_fake`` columns in a
    ``PairsDataset``, use the module-level ``BATCH_SIZE`` and ``data_collator``,
    and keep the last incomplete batch. Only the training loader shuffles.

    Args:
        train: DataFrame with the training rows of the fold.
        valid: DataFrame with the validation rows of the fold.

    Returns:
        Tuple ``(train_dataloader, valid_dataloader)``.
    """
    def _make_loader(frame, shuffle):
        # One loader per split; everything except `shuffle` is shared.
        dataset = PairsDataset(frame.title.values, frame.is_fake.values)
        return DataLoader(
            dataset,
            batch_size=BATCH_SIZE,
            drop_last=False,
            shuffle=shuffle,
            collate_fn=data_collator,
        )

    return _make_loader(train, True), _make_loader(valid, False)

In [7]:
def evaluate_model(model, test_dataloader):
    """Score ``model`` on every batch of ``test_dataloader`` without gradients.

    The function does not switch the model between train and eval mode; the
    caller is expected to do that.

    Args:
        model: callable accepting ``input_ids``, ``attention_mask``, ``labels``
            and ``return_dict`` and returning an object with ``loss`` and
            ``logits`` attributes.
        test_dataloader: iterable of ``(x, y)`` batches, where ``x`` exposes
            ``input_ids`` and ``attention_mask`` and ``y`` is a tensor of
            one-hot labels with the batch on the first axis.

    Returns:
        Tuple ``(val_loss, f1_valid, y_pred_prob)``: the sample-weighted mean
        loss, the micro-averaged F1 score, and a list with the softmax
        probability of class 1 for every sample, in loader order.

    Raises:
        ZeroDivisionError: if the loader yields no samples.
    """
    weighted_loss_sum = 0.0
    n_samples = 0
    y_true = list()
    y_pred = list()
    y_pred_prob = list()

    with torch.no_grad():
        for x, y in test_dataloader:
            output = model(
                input_ids=x.input_ids,
                attention_mask=x.attention_mask,
                labels=y,
                return_dict=True
            )

            # Weight each batch loss by the number of samples in the batch.
            # The size is taken from the label tensor: `x` is the dict-like
            # tokenizer output, so `len(x)` counts its fields, not its rows,
            # which made a short final batch weigh as much as a full one.
            batch_size = y.size(0)
            weighted_loss_sum += batch_size * output.loss.item()
            n_samples += batch_size

            # Collect predicted classes, class-1 probabilities and true labels.
            y_pred.extend(torch.argmax(output.logits, 1).tolist())
            y_pred_prob.extend(softmax(output.logits, dim=1)[:, 1].tolist())
            y_true.extend(torch.argmax(y, 1).tolist())

    val_loss = weighted_loss_sum / n_samples
    # NOTE(review): with average='micro' on single-label predictions this value
    # equals plain accuracy; confirm that this is the intended selection metric.
    f1_valid = f1_score(y_true, y_pred, average='micro')
    return val_loss, f1_valid, y_pred_prob

In [8]:
def train_loop(
    model, train_dataloader, val_dataloader, 
    max_epochs=10, 
    lr=1e-5,
    eval_steps = 50
):
    """Fine-tune ``model`` and checkpoint the state with the best validation F1.

    Training uses Adam with learning rate ``lr``; a ``StepLR`` scheduler halves
    the learning rate after every 3 epochs. The model is not put into training
    mode on entry, so the caller is expected to have done that.

    Whenever the batch index within an epoch is a multiple of ``eval_steps``
    (this includes the first batch of every epoch), the model is evaluated on
    ``val_dataloader``. If the F1 score is strictly higher than the best seen
    so far, the ``state_dict`` is written to the module-level ``SAVE_PATH``.

    Args:
        model: model called with ``input_ids``, ``attention_mask``, ``labels``
            and ``return_dict``; its output must expose ``loss``.
        train_dataloader: iterable of ``(x, y)`` training batches.
        val_dataloader: loader passed to ``evaluate_model``.
        max_epochs: number of passes over ``train_dataloader``.
        lr: initial learning rate for Adam.
        eval_steps: evaluation period, in batches, within an epoch.

    Returns:
        None. Results are the printed log lines and the checkpoint file.
    """
    optimizer = torch.optim.Adam(params = model.parameters(), lr=lr)
    # The scheduler is stepped once per epoch below, so step_size is in epochs.
    scheduler = StepLR(optimizer, step_size = 3, gamma=0.5)
    # -inf guarantees that the first evaluation always writes a checkpoint.
    best_f1 = float('-inf')
    
    for epoch in range(max_epochs):
        print('EPOCH', epoch)
        # Batch losses of the current epoch only; reset at every epoch start.
        losses = list()
        for i, (x, y) in enumerate(train_dataloader):
            output = model(
                input_ids=x.input_ids,
                attention_mask=x.attention_mask,
                labels=y,
                return_dict=True
            )
            loss = output.loss
            
            loss.backward()
            optimizer.step()
            # Gradients are cleared after the update, ready for the next batch.
            optimizer.zero_grad()
            
            losses.append(loss.item())
            
            if i % eval_steps == 0:
                # Switch to eval mode for validation and back to train mode after.
                model.eval()
                # Mean over at most the last `eval_steps` batches of this epoch;
                # at i == 0 this is the loss of a single batch.
                train_loss = np.mean(losses[-eval_steps:])
                eval_loss, eval_f1, _ = evaluate_model(model, val_dataloader)
                if eval_f1 > best_f1:
                    best_f1 = eval_f1
                    torch.save(model.state_dict(), SAVE_PATH)
                print(f'step {i} train_loss: {train_loss:.3} eval_loss: {eval_loss:.3} eval_f1: {eval_f1:.3}')
                model.train()
        scheduler.step()

In [9]:
# Run configuration. BATCH_SIZE and SAVE_PATH are read as globals by
# `get_dataloaders` and `train_loop` respectively.
N_SPLITS = 5
BATCH_SIZE = 32
EPOCHS = 6
# A single checkpoint file; every fold overwrites it with its own best state.
SAVE_PATH = 'ruroberta_model'

kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=0)
# One list of class-1 probabilities on the test set per fold.
test_results = list()

# The test loader is built once and reused by every fold (shuffle=False keeps
# the row order stable so that fold predictions can be averaged position-wise).
test_dataset = PairsDataset(test.title.values, test.is_fake.values)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, drop_last=False, shuffle=False, collate_fn=data_collator)

# NOTE: `test_index` holds the held-out (validation) rows of the fold, not rows
# of the `test` frame.
for i, (train_index, test_index) in enumerate(kf.split(data)):
    print(f"=====  FOLD {i}  =====")
    # Data for this fold
    train, valid = data.iloc[train_index], data.iloc[test_index]
    train_dataloader, valid_dataloader = get_dataloaders(train, valid)
    
    # Start every fold from the pretrained checkpoint. The name `model` must stay
    # global because `data_collator` reads `model.device`.
    model = RobertaForSequenceClassification.from_pretrained(raw_model, num_labels = 2).cuda();
    model.train()
    
    train_loop(model, train_dataloader, valid_dataloader, max_epochs=EPOCHS, lr=2e-5, eval_steps = 50)
    
    # Restore the model state that reached the highest F1 on validation
    model.load_state_dict(torch.load(SAVE_PATH))
    model.eval()
    
    # Collect predictions on the test set
    _, _, test_probability = evaluate_model(model, test_dataloader)
    test_results.append(test_probability)

=====  FOLD 0  =====


Downloading:   0%|          | 0.00/1.32G [00:00<?, ?B/s]

Some weights of the model checkpoint at sberbank-ai/ruRoberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at sberbank-ai/ruRoberta-large and are newly initialized: ['classifier.out_proj.bias', 'classifier.out

EPOCH 0
step 0 train_loss: 0.706 eval_loss: 0.675 eval_f1: 0.648
step 50 train_loss: 0.472 eval_loss: 0.292 eval_f1: 0.88
step 100 train_loss: 0.32 eval_loss: 0.27 eval_f1: 0.894
EPOCH 1
step 0 train_loss: 0.385 eval_loss: 0.267 eval_f1: 0.891
step 50 train_loss: 0.157 eval_loss: 0.21 eval_f1: 0.922
step 100 train_loss: 0.158 eval_loss: 0.238 eval_f1: 0.912
EPOCH 2
step 0 train_loss: 0.0386 eval_loss: 0.243 eval_f1: 0.908
step 50 train_loss: 0.104 eval_loss: 0.228 eval_f1: 0.923
step 100 train_loss: 0.0905 eval_loss: 0.205 eval_f1: 0.919
EPOCH 3
step 0 train_loss: 0.0698 eval_loss: 0.248 eval_f1: 0.918
step 50 train_loss: 0.0333 eval_loss: 0.234 eval_f1: 0.924
step 100 train_loss: 0.0238 eval_loss: 0.245 eval_f1: 0.931
EPOCH 4
step 0 train_loss: 0.0118 eval_loss: 0.233 eval_f1: 0.929
step 50 train_loss: 0.0154 eval_loss: 0.247 eval_f1: 0.932
step 100 train_loss: 0.00977 eval_loss: 0.279 eval_f1: 0.925
EPOCH 5
step 0 train_loss: 0.00519 eval_loss: 0.3 eval_f1: 0.924
step 50 train_loss: 

Some weights of the model checkpoint at sberbank-ai/ruRoberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at sberbank-ai/ruRoberta-large and are newly initialized: ['classifier.out_proj.bias', 'classifier.out

EPOCH 0
step 0 train_loss: 0.68 eval_loss: 0.686 eval_f1: 0.527
step 50 train_loss: 0.512 eval_loss: 0.344 eval_f1: 0.859
step 100 train_loss: 0.341 eval_loss: 0.297 eval_f1: 0.878
EPOCH 1
step 0 train_loss: 0.331 eval_loss: 0.244 eval_f1: 0.909
step 50 train_loss: 0.138 eval_loss: 0.225 eval_f1: 0.918
step 100 train_loss: 0.204 eval_loss: 0.278 eval_f1: 0.885
EPOCH 2
step 0 train_loss: 0.061 eval_loss: 0.222 eval_f1: 0.918
step 50 train_loss: 0.0605 eval_loss: 0.334 eval_f1: 0.902
step 100 train_loss: 0.0736 eval_loss: 0.267 eval_f1: 0.92
EPOCH 3
step 0 train_loss: 0.019 eval_loss: 0.225 eval_f1: 0.922
step 50 train_loss: 0.0474 eval_loss: 0.23 eval_f1: 0.926
step 100 train_loss: 0.0144 eval_loss: 0.276 eval_f1: 0.926
EPOCH 4
step 0 train_loss: 0.00534 eval_loss: 0.272 eval_f1: 0.93
step 50 train_loss: 0.0162 eval_loss: 0.293 eval_f1: 0.925
step 100 train_loss: 0.0171 eval_loss: 0.307 eval_f1: 0.923
EPOCH 5
step 0 train_loss: 0.0022 eval_loss: 0.336 eval_f1: 0.919
step 50 train_loss: 

Some weights of the model checkpoint at sberbank-ai/ruRoberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at sberbank-ai/ruRoberta-large and are newly initialized: ['classifier.out_proj.bias', 'classifier.out

EPOCH 0
step 0 train_loss: 0.72 eval_loss: 0.73 eval_f1: 0.473
step 50 train_loss: 0.486 eval_loss: 0.294 eval_f1: 0.883
step 100 train_loss: 0.3 eval_loss: 0.281 eval_f1: 0.891
EPOCH 1
step 0 train_loss: 0.155 eval_loss: 0.308 eval_f1: 0.872
step 50 train_loss: 0.143 eval_loss: 0.232 eval_f1: 0.918
step 100 train_loss: 0.147 eval_loss: 0.215 eval_f1: 0.918
EPOCH 2
step 0 train_loss: 0.203 eval_loss: 0.225 eval_f1: 0.918
step 50 train_loss: 0.0795 eval_loss: 0.244 eval_f1: 0.92
step 100 train_loss: 0.0669 eval_loss: 0.275 eval_f1: 0.923
EPOCH 3
step 0 train_loss: 0.0137 eval_loss: 0.247 eval_f1: 0.928
step 50 train_loss: 0.0323 eval_loss: 0.26 eval_f1: 0.93
step 100 train_loss: 0.018 eval_loss: 0.303 eval_f1: 0.923
EPOCH 4
step 0 train_loss: 0.00557 eval_loss: 0.296 eval_f1: 0.927
step 50 train_loss: 0.0162 eval_loss: 0.331 eval_f1: 0.921
step 100 train_loss: 0.0157 eval_loss: 0.271 eval_f1: 0.933
EPOCH 5
step 0 train_loss: 0.00235 eval_loss: 0.326 eval_f1: 0.922
step 50 train_loss: 0.

Some weights of the model checkpoint at sberbank-ai/ruRoberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at sberbank-ai/ruRoberta-large and are newly initialized: ['classifier.out_proj.bias', 'classifier.out

EPOCH 0
step 0 train_loss: 0.69 eval_loss: 0.666 eval_f1: 0.68
step 50 train_loss: 0.432 eval_loss: 0.279 eval_f1: 0.882
step 100 train_loss: 0.319 eval_loss: 0.216 eval_f1: 0.917
EPOCH 1
step 0 train_loss: 0.177 eval_loss: 0.205 eval_f1: 0.92
step 50 train_loss: 0.17 eval_loss: 0.216 eval_f1: 0.913
step 100 train_loss: 0.155 eval_loss: 0.218 eval_f1: 0.927
EPOCH 2
step 0 train_loss: 0.28 eval_loss: 0.196 eval_f1: 0.931
step 50 train_loss: 0.0763 eval_loss: 0.255 eval_f1: 0.925
step 100 train_loss: 0.108 eval_loss: 0.209 eval_f1: 0.935
EPOCH 3
step 0 train_loss: 0.0241 eval_loss: 0.208 eval_f1: 0.934
step 50 train_loss: 0.032 eval_loss: 0.252 eval_f1: 0.929
step 100 train_loss: 0.0409 eval_loss: 0.231 eval_f1: 0.938
EPOCH 4
step 0 train_loss: 0.0219 eval_loss: 0.229 eval_f1: 0.937
step 50 train_loss: 0.0367 eval_loss: 0.216 eval_f1: 0.93
step 100 train_loss: 0.0523 eval_loss: 0.319 eval_f1: 0.913
EPOCH 5
step 0 train_loss: 0.0244 eval_loss: 0.257 eval_f1: 0.928
step 50 train_loss: 0.01

Some weights of the model checkpoint at sberbank-ai/ruRoberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at sberbank-ai/ruRoberta-large and are newly initialized: ['classifier.out_proj.bias', 'classifier.out

EPOCH 0
step 0 train_loss: 0.665 eval_loss: 0.715 eval_f1: 0.471
step 50 train_loss: 0.429 eval_loss: 0.29 eval_f1: 0.882
step 100 train_loss: 0.329 eval_loss: 0.249 eval_f1: 0.899
EPOCH 1
step 0 train_loss: 0.222 eval_loss: 0.238 eval_f1: 0.905
step 50 train_loss: 0.164 eval_loss: 0.277 eval_f1: 0.9
step 100 train_loss: 0.157 eval_loss: 0.225 eval_f1: 0.923
EPOCH 2
step 0 train_loss: 0.112 eval_loss: 0.293 eval_f1: 0.897
step 50 train_loss: 0.0588 eval_loss: 0.247 eval_f1: 0.919
step 100 train_loss: 0.0886 eval_loss: 0.251 eval_f1: 0.916
EPOCH 3
step 0 train_loss: 0.0553 eval_loss: 0.307 eval_f1: 0.894
step 50 train_loss: 0.037 eval_loss: 0.28 eval_f1: 0.924
step 100 train_loss: 0.0339 eval_loss: 0.257 eval_f1: 0.93
EPOCH 4
step 0 train_loss: 0.00275 eval_loss: 0.251 eval_f1: 0.931
step 50 train_loss: 0.0157 eval_loss: 0.286 eval_f1: 0.927
step 100 train_loss: 0.0106 eval_loss: 0.283 eval_f1: 0.93
EPOCH 5
step 0 train_loss: 0.00169 eval_loss: 0.309 eval_f1: 0.933
step 50 train_loss: 0

In [10]:
# Sanity check: every fold produced one probability per test row.
assert all(len(fold_probs) == len(test) for fold_probs in test_results)

# Average the class-1 probabilities across folds, threshold at 0.5 and write
# the labelled test frame as a tab-separated file.
predictions = np.mean(test_results, axis=0)
test['is_fake'] = [int(prob >= 0.5) for prob in predictions]
test.to_csv('predictions.tsv', sep='\t', index=False)