In [1]:
!pip install -q accelerate

In [2]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch import optim

from transformers import RobertaTokenizer, RobertaForSequenceClassification, AutoTokenizer, AutoModelForSequenceClassification
from accelerate import Accelerator

import random
import os
from functools import partial
from tqdm.auto import tqdm

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold

In [4]:
class BertDataset(Dataset):
    def __init__(self, df, tokenizer):
        super(BertDataset, self).__init__()
        self.texts = df['excerpt']
        self.targets = df['target']
        self.tokenizer = tokenizer
        
    def __getitem__(self, idx):
        text = self.texts[idx]
        target = torch.tensor(self.targets[idx], dtype=torch.float)
        enc_inp = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=256 ,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_token_type_ids=True,
            return_tensors='pt'
            
        )
        
        return {
            "input_id": enc_inp['input_ids'].squeeze(0),
            "mask": enc_inp['attention_mask'].squeeze(0),
            "token_type_id": enc_inp['token_type_ids'],
            "target": target.unsqueeze(-1)
        }
        
    def __len__(self): 
        return len(self.texts)

In [5]:
class Trainer:
    def __init__(self, force_use_cpu):
        self.tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")
        self.model = AutoModelForSequenceClassification.from_pretrained(
                        'albert-base-v2', 
                         num_labels=1,
                    )
        self.opt = optim.AdamW(self.model.parameters(), lr=2e-5)
        
        self.accelerator = Accelerator(cpu=force_use_cpu)
        self.model, self.optim = self.accelerator.prepare(self.model, self.opt)
        self.device = self.accelerator.device
        
    def get_dataloader(self, train_df, val_df):
        train_ds = BertDataset(train_df, self.tokenizer)
        train_dl = DataLoader(
            train_ds, batch_size=self.bs, 
            pin_memory=True, num_workers=4,
            shuffle=True
        )
        
        val_ds = BertDataset(val_df, self.tokenizer)
        val_dl = DataLoader(
            val_ds, batch_size=self.bs, 
            pin_memory=True, num_workers=4,
            shuffle=False
        )
        train_dl, val_dl = self.accelerator.prepare(train_dl, val_dl)
        
        return train_dl, val_dl
        
    def criterion(self, y_pred, y_true): 
        return torch.sqrt(F.mse_loss(y_true, y_pred)) # RMSE
    
    def get_batch_loss(self, batch):
        input_ids = batch['input_id']
        mask = batch['mask']
        target = batch['target']
        
        out = self.model(input_ids, mask)
        loss = self.criterion(out['logits'], target)
        return out, loss
    
    def create_folds(self, data, splits):
        data.loc[:, 'fold'] = -1
        data = data.sample(frac=1).reset_index(drop=True)

        n_bins = int(np.floor(1+ np.log2(len(data))))
        data.loc[:, 'bins'] = pd.cut(data['target'], bins=n_bins, labels=False)

        skf = StratifiedKFold(splits)

        for fold, (_, v_split) in enumerate(skf.split(X=data, y=data['bins'])):
            data.loc[v_split, 'fold'] = fold

        data.drop('bins', axis=1, inplace=True)
        return data
    
    def fit(self, df, bs, epochs, splits):
        self.df = self.create_folds(df, splits)
        
        self.epochs = epochs
        self.bs = bs
        self.scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(
                            self.opt, T_0=bs*epochs, eta_min=1e-7
                        )
        
        curr_rmse = 10000
        for fold in range(splits):
            tqdm.write(f"\nFold {fold+1}")
            train_df = self.df[self.df['fold']!=fold].reset_index(drop=True)
            val_df = self.df[self.df['fold']==fold].reset_index(drop=True)
            train_dl, val_dl = self.get_dataloader(train_df, val_df)
        
            for epoch in range(epochs):
                self.model.train()
                batch_loss = []
                for batch_step, train_batch in tqdm(enumerate(train_dl), total=len(train_dl), leave=False, desc=f"Epoch {epoch}(train)"):
                    self.opt.zero_grad()

                    train_out, train_loss = self.get_batch_loss(train_batch)
                    self.accelerator.backward(train_loss)

                    self.opt.step()
                    self.scheduler.step()
                    batch_loss.append(train_loss.item())
                    
                with torch.no_grad():
                    self.model.eval()
                    for val_batch in tqdm(val_dl, total=len(val_dl), leave=False, desc=f"Epoch {epoch}(val)"):
                        val_out, val_loss = self.get_batch_loss(val_batch)
                if (np.mean(batch_loss)<curr_rmse):
                    curr_rmse = np.mean(batch_loss)
                    self.save_model()
                    batch_loss = []
                    tqdm.write(f"Model saved with rmse {curr_rmse}")
                            
            tqdm.write(f"Epoch: {epoch+1}  train_loss: {train_loss.item()}  val_loss: {val_loss.item()}")
            
    def save_model(self):
        torch.save(self.model.state_dict(), 'albert-model.pt')

In [6]:
df = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
trainer = Trainer(force_use_cpu=False)

Downloading:   0%|          | 0.00/684 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/760k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.bias', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You sho

In [7]:
splits = 5
bs = 8
epochs = 3

trainer.fit(df, bs, epochs, splits)


Fold 1


Epoch 0(train):   0%|          | 0/284 [00:00<?, ?it/s]

Epoch 0(val):   0%|          | 0/71 [00:00<?, ?it/s]

Model saved with rmse 0.47844522098787656


Epoch 1(train):   0%|          | 0/284 [00:00<?, ?it/s]

Epoch 1(val):   0%|          | 0/71 [00:00<?, ?it/s]

Model saved with rmse 0.2931218529907121


Epoch 2(train):   0%|          | 0/284 [00:00<?, ?it/s]

Epoch 2(val):   0%|          | 0/71 [00:00<?, ?it/s]

Model saved with rmse 0.2159011657808868
Epoch: 3  train_loss: 0.13211125135421753  val_loss: 0.5963354110717773

Fold 2


Epoch 0(train):   0%|          | 0/284 [00:00<?, ?it/s]

Epoch 0(val):   0%|          | 0/71 [00:00<?, ?it/s]

Model saved with rmse 0.20156448742401012


Epoch 1(train):   0%|          | 0/284 [00:00<?, ?it/s]

Epoch 1(val):   0%|          | 0/71 [00:00<?, ?it/s]

Model saved with rmse 0.12073121289394691


Epoch 2(train):   0%|          | 0/284 [00:00<?, ?it/s]

Epoch 2(val):   0%|          | 0/71 [00:00<?, ?it/s]

Model saved with rmse 0.08834628725398175
Epoch: 3  train_loss: 0.46776124835014343  val_loss: 0.2386566698551178

Fold 3


Epoch 0(train):   0%|          | 0/284 [00:00<?, ?it/s]

Epoch 0(val):   0%|          | 0/71 [00:00<?, ?it/s]

Epoch 1(train):   0%|          | 0/284 [00:00<?, ?it/s]

Epoch 1(val):   0%|          | 0/71 [00:00<?, ?it/s]

Model saved with rmse 0.07429640975938191


Epoch 2(train):   0%|          | 0/284 [00:00<?, ?it/s]

Epoch 2(val):   0%|          | 0/71 [00:00<?, ?it/s]

Model saved with rmse 0.04709660207269005
Epoch: 3  train_loss: 0.011561772786080837  val_loss: 0.18188317120075226

Fold 4


Epoch 0(train):   0%|          | 0/284 [00:00<?, ?it/s]

Epoch 0(val):   0%|          | 0/71 [00:00<?, ?it/s]

Epoch 1(train):   0%|          | 0/284 [00:00<?, ?it/s]

Epoch 1(val):   0%|          | 0/71 [00:00<?, ?it/s]

Epoch 2(train):   0%|          | 0/284 [00:00<?, ?it/s]

Epoch 2(val):   0%|          | 0/71 [00:00<?, ?it/s]

Model saved with rmse 0.038988227869095414
Epoch: 3  train_loss: 0.012468180619180202  val_loss: 0.03518177941441536

Fold 5


Epoch 0(train):   0%|          | 0/284 [00:00<?, ?it/s]

Epoch 0(val):   0%|          | 0/71 [00:00<?, ?it/s]

Model saved with rmse 0.037072740698521824


Epoch 1(train):   0%|          | 0/284 [00:00<?, ?it/s]

Epoch 1(val):   0%|          | 0/71 [00:00<?, ?it/s]

Epoch 2(train):   0%|          | 0/284 [00:00<?, ?it/s]

Epoch 2(val):   0%|          | 0/71 [00:00<?, ?it/s]

Epoch: 3  train_loss: 0.008025831542909145  val_loss: 0.008805619552731514


In [8]:
torch.save(trainer.model.state_dict(), './albert-model_final.pt')