In [34]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.cuda import amp
import transformers
from transformers import BertForSequenceClassification, AutoTokenizer, AutoConfig
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm.notebook import tqdm
from collections import defaultdict
import plotly.graph_objects as go
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, KFold
import warnings
warnings.filterwarnings("ignore")


In [35]:
MODEL_NAME = "bert-base-uncased"

In [36]:
class Config:
    seed = 42
    max_len = 314
    train_batch = 16
    val_batch = 32
    epochs = 10
    n_tokens = 20
    learning_rate = 1e-5
    n_splits = 8
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [37]:
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=1)
model.to(Config.device)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.save_pretrained('./tokenizer')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


('./tokenizer/tokenizer_config.json',
 './tokenizer/special_tokens_map.json',
 './tokenizer/vocab.txt',
 './tokenizer/added_tokens.json',
 './tokenizer/tokenizer.json')

In [65]:
class TrainDataset(Dataset):
    def __init__(self, sentences, targets, tokenizer):
        self.sentences = sentences
        self.targets = targets
        self.tokenizer = tokenizer
        self.n_tokens = Config.n_tokens

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx].strip()
        target = self.targets[idx]
        
        encoding = self.tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=Config.max_len,
            padding='max_length',
            return_attention_mask=True,
            # return_tensors='pt',
            truncation=True
        )

        encoding['input_ids'] = torch.cat(
            (
                torch.full((1,self.n_tokens), 500).resize(20), 
                torch.tensor(encoding['input_ids'], dtype=torch.long)
            )
        )
        encoding['attention_mask'] = torch.cat(
            (
                torch.full((1,self.n_tokens), 1).resize(20), 
                torch.tensor(encoding['attention_mask'], dtype=torch.long)
            )
        )

        return {
            'input_ids': encoding['input_ids'],
            'attention_mask': encoding['attention_mask'],
            'targets': torch.tensor(target, dtype=torch.float)
        }
        
        # return {
        #     'input_ids': torch.tensor(encoding['input_ids'], dtype=torch.long).flatten(),
        #     'attention_mask': torch.tensor(encoding['attention_mask'], dtype=torch.long).flatten(),
        #     'targets': torch.tensor(target, dtype=torch.float)
        # }


class EvalDataset(Dataset):
    def __init__(self, sentences, tokenizer, ids):
        self.sentences = sentences
        self.tokenizer = tokenizer
        self.ids = ids
        self.n_tokens = Config.n_tokens

    def __len__(self):
        return len(self.ids)
    
    def __getitem__(self, idx):
        sentence = self.sentences[idx].strip()
        
        encoding = self.tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=Config.max_len,
            padding='max_length',
            return_attention_mask=True,
            truncation=True
        )

        encoding['input_ids'] = torch.cat(
            (
                torch.full((1,self.n_tokens), 500).resize(20), 
                torch.tensor(encoding['input_ids'], dtype=torch.long)
            )
        )
        encoding['attention_mask'] = torch.cat(
            (
                torch.full((1,self.n_tokens), 1).resize(20), 
                torch.tensor(encoding['attention_mask'], dtype=torch.long)
            )
        )

        return {
            'input_ids': encoding['input_ids'],
            'attention_mask': encoding['attention_mask'],
            'ids': self.ids[idx]
        
        }

        # return {
        #     'input_ids': encoding['input_ids'].flatten(),
        #     'attention_mask': encoding['attention_mask'].flatten(),
        #     'ids': self.ids[idx]
        # }



In [39]:
def create_folds(data, num_splits):
    # we create a new column called kfold and fill it with -1
    data["kfold"] = -1
    
    data = data.sample(frac=1).reset_index(drop=True)

    num_bins = int(np.floor(1 + np.log2(len(data))))
    
    data.loc[:, "bins"] = pd.cut(
        data["target"], bins=num_bins, labels=False
    )
    
    kf = StratifiedKFold(n_splits=num_splits)

    for f, (t_, v_) in enumerate(kf.split(X=data, y=data.bins.values)):
        data.loc[v_, 'kfold'] = f
    
    data = data.drop("bins", axis=1)
    return data


In [40]:
df_test = pd.read_csv('/home/alexuvarovskyi/course_competition/data/test.csv')

In [41]:
df = pd.read_csv('/home/alexuvarovskyi/course_competition/data/train.csv')
df.head()

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845


In [42]:
df = create_folds(df, Config.n_splits)
df.head()

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error,kfold
0,c85db4cc7,,,Another striking difference between monkeys an...,0.004187,0.481514,0
1,c9eea4fc6,https://simple.wikipedia.org/wiki/Patriotism,CC BY-SA 3.0 and GFDL,Patriotism means loyalty of person to his/her ...,-0.641552,0.467047,0
2,20758ef4a,,,The deepest and most regularly worked was the ...,-2.217568,0.481423,0
3,a3f2f2a4b,,,The class had assembled again in Professor Gra...,-0.64977,0.485501,0
4,695bb0ad0,https://simple.wikipedia.org/wiki/Radar,CC BY-SA 3.0 and GFDL,Radar is a machine that uses radio waves for e...,-1.572898,0.442499,0


In [43]:

fold = 0
df_train = df[df.kfold != fold].reset_index(drop=True)
df_valid = df[df.kfold == fold].reset_index(drop=True)

train_dataset = TrainDataset(
    sentences=df_train.excerpt.values,
    targets=df_train.target.values,
    tokenizer=tokenizer
)
train_loader = DataLoader(
    train_dataset,
    batch_size=Config.train_batch,
    shuffle=True,
    pin_memory=True
)

val_dataset = TrainDataset(
    sentences=df_valid.excerpt.values,
    targets=df_valid.target.values,
    tokenizer=tokenizer,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=Config.val_batch,
    pin_memory=True
)

In [44]:
len(train_dataset), len(val_dataset)

(2479, 355)

In [45]:
optimizer = AdamW(model.parameters(), lr=Config.learning_rate)
lr_scheduler  = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=0, 
    num_training_steps=len(train_loader)*Config.epochs
)

# def loss_fn(output,target):
#     return torch.sqrt(nn.MSELoss()(output,target))

def loss_fn(output,target):
    return nn.MSELoss()(output,target)
scaler = amp.GradScaler()

In [46]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, scaler):
    model.train()
    losses = []
    for data in tqdm(data_loader, total=len(data_loader)):
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        targets = data['targets'].to(device)

        optimizer.zero_grad()

        with amp.autocast():
            outputs = model(input_ids, attention_mask=attention_mask)#, labels=targets)
            loss = torch.sqrt(loss_fn(outputs.logits.view(-1), targets))
        
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        lr_scheduler.step()

        losses.append(loss.item())
    
    return np.mean(losses)

def eval_model(model, data_loader, loss_fn, device):
    model.eval()
    losses = []
    with torch.no_grad():
        for data in tqdm(data_loader, total=len(data_loader)):
            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)
            targets = data['targets'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=targets)
            loss = torch.sqrt(loss_fn(outputs.logits.view(-1), targets))

            losses.append(loss.item())
    
    return np.mean(losses)

In [47]:
def train_model(model, train_loader, val_loader, loss_fn, optimizer, device, scaler, epochs):
    model.to(device)
    history = defaultdict(list)
    best_loss = np.inf
    for epoch in tqdm(range(epochs)):
        print(f'Epoch {epoch + 1}/{epochs}')
        print('-' * 10)
        train_loss = train_epoch(model, train_loader, loss_fn, optimizer, device, scaler)
        print(f'Train loss {train_loss}')
        val_loss = eval_model(model, val_loader, loss_fn, device)
        print(f'Val loss {val_loss}')
        print()
        history['train_loss'].append(train_loss)
        history['val_loss'].append(val_loss)
        if val_loss < best_loss:
            torch.save(model.state_dict(), f'best_model_{fold}.bin')
            best_loss = val_loss
            print('Model saved', f"model loss {best_loss}")
    return history

In [48]:
train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    loss_fn=loss_fn,
    optimizer=optimizer,
    scaler=scaler,
    device=Config.device,
    epochs=Config.epochs
)

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1/10
----------


  0%|          | 0/155 [00:00<?, ?it/s]

Train loss 0.7206798234293538


  0%|          | 0/12 [00:00<?, ?it/s]

Val loss 0.6901050060987473

Model saved model loss 0.6901050060987473
Epoch 2/10
----------


  0%|          | 0/155 [00:00<?, ?it/s]

Train loss 0.5453683495521545


  0%|          | 0/12 [00:00<?, ?it/s]

Val loss 0.7303712169329325

Epoch 3/10
----------


  0%|          | 0/155 [00:00<?, ?it/s]

Train loss 0.46970849460171116


  0%|          | 0/12 [00:00<?, ?it/s]

Val loss 0.7213120460510254

Epoch 4/10
----------


  0%|          | 0/155 [00:00<?, ?it/s]

Train loss 0.40655503359533124


  0%|          | 0/12 [00:00<?, ?it/s]

Val loss 0.7504084606965383

Epoch 5/10
----------


  0%|          | 0/155 [00:00<?, ?it/s]

Train loss 0.3647599451003536


  0%|          | 0/12 [00:00<?, ?it/s]

Val loss 0.6622582376003265

Model saved model loss 0.6622582376003265
Epoch 6/10
----------


  0%|          | 0/155 [00:00<?, ?it/s]

Train loss 0.31514873033569707


  0%|          | 0/12 [00:00<?, ?it/s]

Val loss 0.7429147611061732

Epoch 7/10
----------


  0%|          | 0/155 [00:00<?, ?it/s]

Train loss 0.291682082030081


  0%|          | 0/12 [00:00<?, ?it/s]

Val loss 0.7248704880475998

Epoch 8/10
----------


  0%|          | 0/155 [00:00<?, ?it/s]

Train loss 0.257558881371252


  0%|          | 0/12 [00:00<?, ?it/s]

Val loss 0.7508323987325033

Epoch 9/10
----------


  0%|          | 0/155 [00:00<?, ?it/s]

Train loss 0.24510130863035878


  0%|          | 0/12 [00:00<?, ?it/s]

Val loss 0.7123468369245529

Epoch 10/10
----------


  0%|          | 0/155 [00:00<?, ?it/s]

Train loss 0.23323919128987097


  0%|          | 0/12 [00:00<?, ?it/s]

Val loss 0.6956585496664047



defaultdict(list,
            {'train_loss': [0.7206798234293538,
              0.5453683495521545,
              0.46970849460171116,
              0.40655503359533124,
              0.3647599451003536,
              0.31514873033569707,
              0.291682082030081,
              0.257558881371252,
              0.24510130863035878,
              0.23323919128987097],
             'val_loss': [0.6901050060987473,
              0.7303712169329325,
              0.7213120460510254,
              0.7504084606965383,
              0.6622582376003265,
              0.7429147611061732,
              0.7248704880475998,
              0.7508323987325033,
              0.7123468369245529,
              0.6956585496664047]})

In [49]:
eval_model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=1)
eval_model.load_state_dict(torch.load(f'best_model_{fold}.bin'))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<All keys matched successfully>

In [66]:
def predict(df, model):
    dataset = EvalDataset(
        sentences=df.excerpt.values,
        tokenizer=tokenizer,
        ids=df.id.values
    )
    data_loader = DataLoader(
        dataset,
        batch_size=Config.val_batch,
        pin_memory=True
    )

    model.to(Config.device)
    model.eval()
    preds = []
    with torch.no_grad():
        for data in tqdm(data_loader, total=len(data_loader)):
            input_ids = data['input_ids'].to(Config.device)
            attention_mask = data['attention_mask'].to(Config.device)

            outputs = model(input_ids, attention_mask=attention_mask)
            preds.extend(outputs.logits.view(-1).cpu().numpy())
    
    return preds

In [67]:
predicts = predict(df_test, eval_model)
df_test['target'] = predicts
df_test[['id', 'target']].to_csv('submission.csv', index=False)

  0%|          | 0/1 [00:00<?, ?it/s]

: 