In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import StepLR
import random
import os
from sklearn.metrics import f1_score

# Seed PyTorch, Python's `random` and NumPy with one shared value so that
# repeated runs of the notebook start from the same RNG state.
SEED = 0
torch.manual_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

Сначала 3 эпохи обучим модель на английских данных, затем дообучим её на русскоязычных данных.

In [2]:
# Input folders, relative to the notebook's working directory.
PATH_TO_VALID_TEST_FOLDER = "../input/kontyp-parsed-train-test"
PATH_TO_ENG_DATA_FOLDER = "../input/fake-news-eng"
PATH_TO_TRAIN_FOLDER = "../input/augmented-train"


def _read_split(folder, filename):
    """Read the Feather file at ``folder/filename`` into a DataFrame."""
    return pd.read_feather(os.path.join(folder, filename))


# Main training frame, English training frame, validation frame, test frame.
train = _read_split(PATH_TO_TRAIN_FOLDER, 'train.feather')
train_eng = _read_split(PATH_TO_ENG_DATA_FOLDER, 'fake_dataset_eng.feather')
valid = _read_split(PATH_TO_VALID_TEST_FOLDER, 'validate.feather')
test = _read_split(PATH_TO_VALID_TEST_FOLDER, 'test.feather')

In [3]:
# Checkpoint identifier passed to `from_pretrained` for both model and tokenizer.
raw_model = 'cointegrated/rut5-base-multitask'

# Prefer the GPU, but fall back to CPU so the notebook does not crash on
# machines without CUDA. Later cells place tensors via `model.device`, so they
# follow whichever device is chosen here.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = T5ForConditionalGeneration.from_pretrained(raw_model).to(device)
tokenizer = T5Tokenizer.from_pretrained(raw_model)

Downloading:   0%|          | 0.00/726 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/932M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/260 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/808k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Для каждого датафрейма изменим title, добавив префикс для обучения под задачу классификации фейковых новостей, и заменим числовые значения в столбце is_fake на «Да» и «Нет».

In [4]:
def transform_df(df):
    """Return a copy of ``df`` reformatted for the text-to-text fake-news task.

    The input frame is left untouched; the original implementation modified
    the caller's frame in place as well as returning it.

    Args:
        df: DataFrame with a string ``title`` column and an ``is_fake`` column.

    Returns:
        A new DataFrame in which every ``title`` is prefixed with ``'Fake | '``
        and ``is_fake`` holds ``'Да'`` where the value equals 1 and ``'Нет'``
        otherwise.

    Note:
        The function is not idempotent: applying it to an already transformed
        frame adds the prefix again and maps every label to ``'Нет'``
        (because ``'Да' == 1`` is false).
    """
    out = df.copy()
    out['title'] = out['title'].apply(lambda x: 'Fake | ' + x)
    out['is_fake'] = out['is_fake'].apply(lambda x: 'Да' if x == 1 else 'Нет')
    return out

In [5]:
class PairsDataset(Dataset):
    """Map-style dataset over two parallel, index-aligned sequences.

    Item ``i`` is the tuple ``(x[i], y[i])``. The length is taken from ``x``
    only; ``y`` is not checked to have the same length.
    """

    def __init__(self, x, y):
        # Both sequences are stored as given (no copy is made).
        self.x, self.y = x, y

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        source, target = self.x[idx], self.y[idx]
        return (source, target)

def data_collator(batch):
    """Tokenize a list of ``(text, answer)`` pairs into model-ready tensors.

    Relies on the module-level ``tokenizer`` and ``model``; tensors are moved
    to ``model.device``.

    Args:
        batch: list of 2-item pairs; item 0 is the input text and item 1 is
            the target answer string.

    Returns:
        Tuple ``(x, y)`` of tokenizer outputs. In ``y.input_ids`` the padding
        positions are replaced with -100 so they are ignored by the loss, and
        ``y['labels']`` holds the raw target strings for later scoring.
    """
    texts = [pair[0] for pair in batch]
    y_vals = [pair[1] for pair in batch]
    x = tokenizer(texts, return_tensors='pt', padding=True).to(model.device)
    y = tokenizer(y_vals, return_tensors='pt', padding=True).to(model.device)
    # Use the tokenizer's own pad id instead of a hard-coded 0.
    y.input_ids[y.input_ids == tokenizer.pad_token_id] = -100
    # Attached only after `.to(...)` so the list of strings is never handed to
    # the device transfer.
    y['labels'] = y_vals
    return (x, y)

In [6]:
BATCH_SIZE = 32


def _build_loader(frame, shuffle):
    """Wrap a frame's ``title``/``is_fake`` columns in a dataset and a loader."""
    dataset = PairsDataset(frame.title.values, frame.is_fake.values)
    loader = DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        drop_last=False,
        shuffle=shuffle,
        collate_fn=data_collator,
    )
    return dataset, loader


# NOTE: the frames are rebound to their transformed versions, so re-running
# this cell applies transform_df a second time to already transformed data.
train = transform_df(train)
train_eng = transform_df(train_eng)
valid = transform_df(valid)
test = transform_df(test)

# Training loaders are shuffled; validation and test loaders keep row order.
train_dataset, train_dataloader = _build_loader(train, shuffle=True)
train_eng_dataset, train_eng_dataloader = _build_loader(train_eng, shuffle=True)
valid_dataset, valid_dataloader = _build_loader(valid, shuffle=False)
test_dataset, test_dataloader = _build_loader(test, shuffle=False)

In [7]:
@torch.no_grad()
def generate_answer(inputs, **kwargs):
    """Generate and decode answers for an already tokenized batch.

    Uses the module-level ``model`` and ``tokenizer``.

    Args:
        inputs: mapping unpacked into ``model.generate`` as keyword arguments.
        **kwargs: extra keyword arguments forwarded to ``model.generate``.

    Returns:
        List of decoded strings with special tokens removed.
    """
    generated_ids = model.generate(**inputs, **kwargs)
    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

In [8]:
def evaluate_model(model, test_dataloader):
    """Compute the mean loss and a classification score over a dataloader.

    Args:
        model: model called with ``input_ids``, ``attention_mask``, ``labels``
            and ``decoder_attention_mask``; its output must expose ``.loss``.
        test_dataloader: iterable of ``(x, y)`` batches where ``x`` and ``y``
            expose ``input_ids`` / ``attention_mask`` and ``y['labels']`` holds
            the reference answer strings.

    Returns:
        Tuple ``(val_loss, f1_valid)``. ``val_loss`` is the per-example
        weighted mean of the batch losses (NaN when the dataloader yields no
        batches). ``f1_valid`` is 0.0 unless the predictions contain exactly
        two distinct strings.

    Notes:
        * Predictions come from ``generate_answer``, which uses the
          module-level ``model`` rather than the ``model`` argument.
        * The caller is responsible for putting the model in eval mode.
    """
    num = 0
    den = 0
    y_true = list()
    y_pred = list()
    f1_valid = .0
    for x, y in test_dataloader:
        with torch.no_grad():
            loss = model(
                input_ids=x.input_ids,
                attention_mask=x.attention_mask,
                labels=y.input_ids,
                decoder_attention_mask=y.attention_mask,
                return_dict=True
            ).loss
        # `len(x)` on the tokenizer output counts its keys, not its rows, so
        # take the batch size from the tensor's first dimension instead.
        batch_size = x.input_ids.shape[0]
        num += batch_size * loss.item()
        den += batch_size

        y_pred.extend(generate_answer(x))
        y_true.extend(y['labels'])

    # Guard against an empty dataloader instead of raising ZeroDivisionError.
    val_loss = num / den if den else float('nan')
    # The score is computed only when exactly two distinct strings were
    # generated; any generated string other than 'Да' counts as class 0.
    if len(set(y_pred)) == 2:
        y_true = [1 if x == 'Да' else 0 for x in y_true]
        y_pred = [1 if x == 'Да' else 0 for x in y_pred]
        # NOTE(review): with average='micro' over two classes this value
        # coincides with accuracy rather than positive-class F1 - confirm
        # that this is the intended metric.
        f1_valid = f1_score(y_true, y_pred, average = 'micro')
    return val_loss, f1_valid

In [9]:
def train_loop(
    model, train_dataloader, val_dataloader,
    max_epochs=10,
    lr=1e-5,
    eval_steps = 50,
    save_path=None,
):
    """Fine-tune ``model`` and checkpoint the weights with the best eval score.

    Args:
        model: model to train; called with ``input_ids``, ``attention_mask``,
            ``labels`` and ``decoder_attention_mask``.
        train_dataloader: iterable of ``(x, y)`` training batches.
        val_dataloader: dataloader passed to ``evaluate_model``.
        max_epochs: number of passes over ``train_dataloader``.
        lr: initial Adam learning rate; halved every 3 epochs by ``StepLR``.
        eval_steps: evaluate whenever the in-epoch step index is a multiple of
            this value (this includes step 0 of every epoch).
        save_path: where to store the best ``state_dict``. Defaults to the
            module-level ``SAVE_PATH`` when omitted, matching the original
            behaviour.

    Returns:
        None. Progress is printed, and the best weights are written to
        ``save_path``. The model is left in training mode.
    """
    if save_path is None:
        # Resolved at call time so SAVE_PATH may be defined in a later cell.
        save_path = SAVE_PATH

    model.train()
    optimizer = torch.optim.Adam(params = [p for p in model.parameters() if p.requires_grad], lr=lr)
    scheduler = StepLR(optimizer, step_size = 3, gamma=0.5)
    # Starts at -inf, so the first evaluation of every call saves a checkpoint
    # and overwrites any file already at save_path.
    best_f1 = float('-inf')

    for epoch in range(max_epochs):
        print('EPOCH', epoch)
        losses = list()
        for i, (x, y) in enumerate(train_dataloader):
            loss = model(
                input_ids=x.input_ids,
                attention_mask=x.attention_mask,
                labels=y.input_ids,
                decoder_attention_mask=y.attention_mask,
                return_dict=True
            ).loss

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            losses.append(loss.item())

            if i % eval_steps == 0:
                model.eval()
                # Mean over the most recent (up to) eval_steps training losses
                # of the current epoch.
                train_loss = np.mean(losses[-eval_steps:])
                eval_loss, eval_f1 = evaluate_model(model, val_dataloader)
                if eval_f1 > best_f1:
                    best_f1 = eval_f1
                    torch.save(model.state_dict(), save_path)
                print(f'step {i} train_loss: {train_loss:.3} eval_loss: {eval_loss:.3} eval_f1: {eval_f1:.3}')
                model.train()
        # The learning-rate schedule advances once per epoch.
        scheduler.step()

In [10]:
# File that train_loop writes the best-scoring weights to.
SAVE_PATH = 't5_model'

# Stage 1: three epochs on the English loader. With eval_steps=9999 the
# recorded run evaluated only at step 0 of each epoch.
train_loop(
    model,
    train_eng_dataloader,
    valid_dataloader,
    max_epochs=3,
    lr=2e-5,
    eval_steps=9999,
)

EPOCH 0
step 0 train_loss: 6.05 eval_loss: 5.56 eval_f1: 0.0
EPOCH 1
step 0 train_loss: 0.00777 eval_loss: 2.38 eval_f1: 0.0
EPOCH 2
step 0 train_loss: 0.00319 eval_loss: 3.15 eval_f1: 0.0


In [11]:
# Stage 2: continue training the same model on the main training loader,
# evaluating on the validation loader every 100 steps.
train_loop(
    model,
    train_dataloader,
    valid_dataloader,
    max_epochs=15,
    lr=2e-5,
    eval_steps=100,
)

EPOCH 0
step 0 train_loss: 2.79 eval_loss: 3.1 eval_f1: 0.46
step 100 train_loss: 0.56 eval_loss: 0.316 eval_f1: 0.659
step 200 train_loss: 0.38 eval_loss: 0.274 eval_f1: 0.738
step 300 train_loss: 0.333 eval_loss: 0.229 eval_f1: 0.792
step 400 train_loss: 0.306 eval_loss: 0.22 eval_f1: 0.805
step 500 train_loss: 0.278 eval_loss: 0.212 eval_f1: 0.805
step 600 train_loss: 0.274 eval_loss: 0.183 eval_f1: 0.851
step 700 train_loss: 0.265 eval_loss: 0.194 eval_f1: 0.83
step 800 train_loss: 0.252 eval_loss: 0.216 eval_f1: 0.8
step 900 train_loss: 0.258 eval_loss: 0.233 eval_f1: 0.778
step 1000 train_loss: 0.243 eval_loss: 0.162 eval_f1: 0.866
step 1100 train_loss: 0.233 eval_loss: 0.158 eval_f1: 0.868
EPOCH 1
step 0 train_loss: 0.268 eval_loss: 0.157 eval_f1: 0.866
step 100 train_loss: 0.243 eval_loss: 0.155 eval_f1: 0.877
step 200 train_loss: 0.221 eval_loss: 0.179 eval_f1: 0.847
step 300 train_loss: 0.233 eval_loss: 0.148 eval_f1: 0.885
step 400 train_loss: 0.225 eval_loss: 0.171 eval_f1:

In [None]:
# Restore the weights stored at SAVE_PATH and switch the model to eval mode.
# The loaded state dict is passed straight through so no extra reference to
# it is kept alive.
model.load_state_dict(
    torch.load(SAVE_PATH)
)
model.eval()

In [13]:
# Final evaluation on the test loader.
test_metrics = evaluate_model(model, test_dataloader)
test_loss, f1 = test_metrics
print(f'Test loss: {test_loss}\nTest F1: {f1}')

Test loss: 0.12564725361087106
Test F1: 0.8948863636363636
