In [None]:
# Install the `transformers` package with pip, then clear this cell's output
# so the installer log is not kept in the notebook.
from IPython.display import clear_output
!pip install transformers
clear_output()

In [None]:
import re
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
from torch import nn, optim
from torch.utils.data import  DataLoader, Dataset
import transformers

import plotly.graph_objects as go
from plotly.subplots import make_subplots

import matplotlib.pyplot as plt

from collections import defaultdict

from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


# Locations of the competition CSV files (train, test, sample submission).
path_tr = '/content/drive/MyDrive/CommonLit/input/train.csv'
path_test = '/content/drive/MyDrive/CommonLit/input/test.csv'
path_sub = '/content/drive/MyDrive/CommonLit/input/sample_submission.csv'

# Training data; the inference section reads the test file separately.
df = pd.read_csv(path_tr)

# Seed NumPy's and PyTorch's global random generators.
SEED =13
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

def clean_text(txt):
    """Lower-case ``txt`` and replace each run of non-letters with one space.

    ``txt`` is converted with ``str()`` first, so non-string values are
    accepted. Only ASCII letters ``a-z`` survive; digits, punctuation and
    whitespace runs each collapse into a single space.
    """
    lowered = str(txt).lower()
    return re.sub('[^A-Za-z]+', ' ', lowered)
# Store the cleaned version of every excerpt in a new 'txt' column.
df['txt'] = df['excerpt'].apply(lambda x: clean_text(x))

# Preview the first two rows.
df.head(2)

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error,txt
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009,when the young people returned to the ballroom...
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805,all through dinner time mrs fayre was somewhat...


In [None]:
def make_folds(data: pd.DataFrame, split: int = 5) -> pd.DataFrame:
    """Return a shuffled copy of ``data`` with a stratified ``kfold`` column.

    The continuous ``target`` column is cut into equal-width bins and those
    bin ids are used as the stratification labels for ``StratifiedKFold``.

    Args:
        data: Frame containing a numeric ``target`` column. It is not modified.
        split: Number of folds.

    Returns:
        A new, row-shuffled DataFrame with a fresh 0..n-1 index and an integer
        ``kfold`` column holding the validation fold (0 .. split-1) of each row.
    """
    # sample() returns a new frame, so everything below works on a copy and
    # the caller's DataFrame no longer gets a stray 'kfold' column.
    shuffled = data.sample(frac=1).reset_index(drop=True)
    shuffled['kfold'] = -1

    # Number of bins: floor(1 + log2(n)) (Sturges' rule).
    num_bins = int(np.floor(1 + np.log2(len(shuffled))))
    # labels=False yields integer bin ids; they are kept in a local Series
    # instead of a temporary column on the frame.
    bins = pd.cut(shuffled['target'], bins=num_bins, labels=False)

    kf = model_selection.StratifiedKFold(n_splits=split)
    for f, (_, v_) in enumerate(kf.split(X=shuffled, y=bins.values)):
        # v_ holds positional indices, which equal the labels after reset_index.
        shuffled.loc[v_, 'kfold'] = f
    return shuffled

In [None]:
# Assign every training row to one of 5 stratified folds, then show how many
# rows each fold received.
df_folds = make_folds(df, 5)
df_folds.kfold.value_counts()

3    567
1    567
2    567
0    567
4    566
Name: kfold, dtype: int64

In [None]:
class CL_Dataset(Dataset):
    """Dataset that tokenizes the ``txt`` column of a DataFrame on access.

    Each item is a dict with ``input_ids`` and ``attention_mask`` (long
    tensors built from the tokenizer output) and ``target`` (float tensor).
    """

    def __init__(
        self,
        data: pd.DataFrame,
        token,
        max_len: int = 256,
        test: bool = False
        ) -> None:
        """Store the frame and tokenization settings.

        Args:
            data: Frame with a ``txt`` column, plus ``target`` unless ``test``.
            token: Callable tokenizer; it is invoked as
                ``token(text, truncation=..., max_length=..., ...)`` and must
                return a mapping with ``input_ids`` and ``attention_mask``.
            max_len: Length every sequence is padded / truncated to.
            test: When True no ``target`` column is read and 0 is returned.

        Raises:
            TypeError: If ``token`` is not callable.
        """
        # Fail here rather than on the first __getitem__ call: swapping the
        # positional arguments (e.g. passing max_len as token) is easy to do.
        if not callable(token):
            raise TypeError(
                f"token must be a callable tokenizer, got {type(token).__name__}"
            )
        self.data = data
        self.max_len = max_len
        self.test = test
        self.token = token

    def __len__(self) -> int:
        """Number of rows in the underlying frame."""
        return self.data.shape[0]

    def __getitem__(self, idx: int) -> dict:
        """Tokenize row ``idx`` (positional) and return it as tensors."""
        text = self.data.txt.iloc[idx]
        encode = self.token(
            text,
            truncation=True,
            max_length=self.max_len,
            padding='max_length',
            add_special_tokens=True,
            return_attention_mask=True,
            return_token_type_ids=False
            )
        # Test data has no label; a constant 0 keeps the item layout uniform.
        target = 0 if self.test else self.data.target.iloc[idx]

        return {
            "input_ids": torch.tensor(encode["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(encode["attention_mask"], dtype=torch.long),
            'target': torch.tensor(target, dtype=torch.float)
        }


class CL_model(nn.Module):
    """Pretrained ``bert-base-uncased`` encoder with a dropout + linear head."""

    def __init__(self, dim_out:int = 1):
        """Load the pretrained encoder and create a ``dim_out``-wide output layer."""
        super().__init__()
        self.dim_out = dim_out
        self.bert = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.drop = nn.Dropout(p=0.3)
        hidden_size = self.bert.config.hidden_size
        self.out = nn.Linear(hidden_size, self.dim_out)

    def forward(self, input_ids, attention_mask):
        """Run the encoder and map its ``pooler_output`` through dropout and the linear head."""
        encoded = self.bert(input_ids, attention_mask)
        pooled = encoded['pooler_output']
        return self.out(self.drop(pooled))

In [None]:
def loss_fn(output,target):
    """Root-mean-squared error between ``output`` and ``target``."""
    criterion = nn.MSELoss()
    mse = criterion(output, target)
    return torch.sqrt(mse)


def viz_curve(data: dict, title: str) -> None:
    """Draw the training history as a Plotly figure and display it.

    Layout: the top row (spanning both columns) shows train and validation
    loss, the bottom-left panel shows validation RMSE and the bottom-right
    panel shows the learning rate.

    Args:
        data: History mapping with the keys ``train_loss``, ``valid_loss``,
            ``valid_rms`` and ``lr``, each a per-epoch sequence.
        title: Figure title.
    """
    # The x axis is derived from the history itself rather than from the
    # global EPOCH, so a history recorded under a different EPOCH value (or
    # plotted after EPOCH was changed) still lines up with its data.
    epochs = list(range(1, len(data['train_loss']) + 1))
    fig = make_subplots(
        rows=2, cols=2,
        specs=[
            [{"colspan": 2}, None],
            [{}, {}],
        ],
        vertical_spacing=0.09,
        subplot_titles=('Loss Curve', 'RMSE', 'LR')
    )
    # (history key, trace name, subplot row, subplot column)
    traces = (
        ('train_loss', 'Train Loss', 1, 1),
        ('valid_loss', 'Valid Loss', 1, 1),
        ('valid_rms', 'RMSE', 2, 1),
        ('lr', 'LR', 2, 2),
    )
    for key, name, row, col in traces:
        fig.add_trace(
            go.Scatter(
                x=epochs,
                y=data[key],
                mode='lines+markers',
                name=name),
            row=row, col=col
        )
    fig.update_layout(
        height=600,
        width=600,
        showlegend=False,
        title=title,
        margin=dict(l=10, r=10, t=30, b=20),
        template="plotly_dark"
    )
    fig.show()


def update_result(
    data: dict,
    predict: torch.Tensor,
    target: torch.Tensor
) -> torch.Tensor:
    """Compute the batch loss and record the batch in ``data``.

    Args:
        data: Accumulator whose ``losses``, ``all_pred`` and ``all_target``
            entries are lists; one NumPy value is appended to each.
        predict: Model output; its last dimension is squeezed before use.
        target: Ground-truth values for the batch.

    Returns:
        The loss tensor from ``loss_fn``, still attached to the autograd
        graph so the caller can call ``backward()`` on it.
    """
    flat_pred = predict.squeeze(-1)
    loss = loss_fn(flat_pred, target)
    # Only detached CPU copies are stored, so the accumulator does not keep
    # the computation graph or device memory alive.
    data['losses'].append(loss.detach().cpu().numpy())
    data['all_pred'].append(flat_pred.detach().cpu().numpy())
    data['all_target'].append(target.detach().cpu().numpy())
    return loss


def train(
    model:nn.Module,
    loader: DataLoader,
    optimizer: optim.Optimizer,
    schedule: optim.lr_scheduler.LambdaLR,
    batch: int,
    max_lenght: int
) -> tuple:
    """Run one training epoch over ``loader``.

    For every batch: zero the gradients, run the forward pass, back-propagate
    the loss, clip the gradient norm to 1.0, then step the optimizer and the
    learning-rate scheduler.

    Args:
        model: Network called as ``model(**inputs)`` with every batch entry
            except ``target``.
        loader: Yields dicts of tensors that include a ``target`` entry.
        optimizer: Optimizer over ``model``'s parameters.
        schedule: Learning-rate scheduler, stepped once per batch.
        batch: Not used by this function; kept so existing calls still work.
        max_lenght: Not used by this function; kept so existing calls still work.

    Returns:
        ``(mean of the per-batch losses, RMSE over all samples of the epoch)``.
    """
    model.train()
    torch.backends.cudnn.benchmark = True
    results = defaultdict(list)
    for sample in loader:
        optimizer.zero_grad()
        target = sample['target'].to(device)
        # Use a dedicated local name: the original rebound the `batch`
        # parameter here, and named the loop variable after the `input` builtin.
        inputs = {k: v.to(device) for k, v in sample.items() if k != 'target'}
        out = model(**inputs)
        # With BertForSequenceClassification use `out = out.logits` instead.
        loss = update_result(results, out, target)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        schedule.step()
    losses = np.mean(results['losses'])
    allt = np.concatenate(results['all_target'])
    allp = np.concatenate(results['all_pred'])
    rmse = np.sqrt(mean_squared_error(allt, allp))
    return losses, rmse


def validate(
    model:nn.Module,
    loader: DataLoader,
    batch: int,
    max_lenght: int
) -> tuple:
    """Evaluate ``model`` on ``loader`` without updating any weights.

    Args:
        model: Network called as ``model(**inputs)`` with every batch entry
            except ``target``.
        loader: Yields dicts of tensors that include a ``target`` entry.
        batch: Not used by this function; kept so existing calls still work.
        max_lenght: Not used by this function; kept so existing calls still work.

    Returns:
        ``(mean of the per-batch losses, RMSE over all samples)``.
    """
    model.eval()
    results = defaultdict(list)
    # No gradients are needed for evaluation; disabling autograd avoids
    # building a graph for every forward pass and the memory that goes with it.
    with torch.no_grad():
        for sample in loader:
            target = sample['target'].to(device)
            inputs = {k: v.to(device) for k, v in sample.items() if k != 'target'}
            out = model(**inputs)
            # With BertForSequenceClassification use `out = out.logits` instead.
            update_result(results, out, target)
    losses = np.mean(results['losses'])
    allt = np.concatenate(results['all_target'])
    allp = np.concatenate(results['all_pred'])
    rmse = np.sqrt(mean_squared_error(allt, allp))
    return losses, rmse


def showtime(model:nn.Module, data: pd.DataFrame, tokenizer:transformers.AutoTokenizer, fold: int) -> dict:
    """Train ``model`` with ``fold`` held out for validation and return the history.

    Reads the module-level settings ``MAX_LEN``, ``BATCH``, ``EPOCH``,
    ``MODEL`` and ``device``. Whenever the validation RMSE improves, the
    model's state dict is written to ``{MODEL}_model_{fold}.pth``.

    Returns:
        Mapping with per-epoch lists under ``train_loss``, ``valid_loss``,
        ``valid_rms`` and ``lr``.
    """
    model = model.to(device)

    # Rows of the requested fold are validation data, all others are training data.
    train_df = data[data.kfold != fold].reset_index(drop=True)
    valid_df = data[data.kfold == fold].reset_index(drop=True)
    print(f'Fold: {fold + 1}, --- {train_df.shape, valid_df.shape}')

    train_loader = DataLoader(
        CL_Dataset(train_df, tokenizer, MAX_LEN), batch_size=BATCH, shuffle=True
    )
    valid_loader = DataLoader(
        CL_Dataset(valid_df, tokenizer, MAX_LEN), batch_size=BATCH, shuffle=False
    )

    optimizer = transformers.AdamW(model.parameters(), lr=2e-5, correct_bias=True)
    # The scheduler is stepped once per batch, so the total is batches * epochs.
    total_steps = len(train_loader) * EPOCH
    scheduler = transformers.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=1,
        num_training_steps=total_steps
    )

    history = defaultdict(list)
    best_rmse = np.inf
    for epoch in tqdm(range(EPOCH)):
        train_loss, train_rmse = train(model, train_loader, optimizer, scheduler, BATCH, MAX_LEN)
        valid_loss, valid_rmse = validate(model, valid_loader, BATCH, MAX_LEN)
        current_lr = optimizer.param_groups[0]['lr']

        epoch_stats = (
            ('train_loss', train_loss),
            ('valid_loss', valid_loss),
            ('valid_rms', valid_rmse),
            ('lr', current_lr),
        )
        for key, value in epoch_stats:
            history[key].append(value)

        print(f'Epoch: {epoch}, lr: {current_lr}, train rmse: {train_rmse}, vl rmse: {valid_rmse}, vl loss: {valid_loss}')
        # Keep the weights of the best epoch so far (lowest validation RMSE).
        if valid_rmse < best_rmse:
            print(f'Save rmse: {valid_rmse}')
            torch.save(model.state_dict(), f'{MODEL}_model_{fold}.pth')
            best_rmse = valid_rmse
    return history

In [None]:
# Run settings; showtime(), viz_curve() and inference() read these as globals.
EPOCH = 1
BATCH = 8
MAX_LEN = 314
model = CL_model()
# Alternative model (train/validate must then read `out.logits`):
# model = transformers.BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)
# The class name becomes the prefix of the checkpoint file written by showtime().
MODEL = model.__class__.__name__
# Index of the fold held out for validation in this run.
fold = 0
title = f'Fold: {fold + 1}'
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
# Train on every fold except `fold`, then plot the recorded history.
history = showtime(model, df_folds, tokenizer, fold)
viz_curve(history, title)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  0%|          | 0/1 [00:00<?, ?it/s]

Fold: 1, --- ((2267, 8), (567, 8))
Epoch: 0, lr: 0.0, train rmse: 0.7412992715835571, vl rmse: 0.6628739833831787, vl loss: 0.6422673463821411
Save rmse: 0.6628739833831787


100%|██████████| 1/1 [02:29<00:00, 149.27s/it]


In [None]:
# NOTE(review): `stop` is not defined anywhere in this notebook, so this cell
# raises NameError — presumably on purpose, to halt "Run All" before the
# inference section. Confirm before removing.
stop

# Inference

In [None]:
# Run a full garbage-collection pass before inference; the returned count of
# unreachable objects is assigned to `_` so the cell shows no output.
import gc
_ = gc.collect()

In [None]:
# Load the test CSV and preview its first row.
test = pd.read_csv(path_test)
test.head(1)

In [None]:
@torch.no_grad()
def inference(model:nn.Module, data: pd.DataFrame, tokenizer=None, max_len=None, batch_size: int = 1):
    """Predict a score for every row of ``data``.

    Args:
        model: Trained network, called as
            ``model(input_ids=..., attention_mask=...)``.
        data: Frame with a cleaned ``txt`` column; no ``target`` is required.
        tokenizer: Callable tokenizer handed to ``CL_Dataset``. Defaults to
            the pretrained ``bert-base-uncased`` tokenizer, the same one the
            training cell uses.
        max_len: Padded sequence length. Defaults to the global ``MAX_LEN``.
        batch_size: Rows per forward pass.

    Returns:
        1-D NumPy array with one prediction per row, in row order.
    """
    if tokenizer is None:
        tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
    if max_len is None:
        max_len = MAX_LEN

    model.to(device)
    model.eval()
    # CL_Dataset's positional order is (data, token, max_len, test). The
    # original call CL_Dataset(data, MAX_LEN, True) therefore passed MAX_LEN
    # as the tokenizer and True as max_len, and failed on the first item.
    dataset = CL_Dataset(data, tokenizer, max_len, test=True)
    loader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False
    )
    all_pred = []
    for sample in tqdm(loader):
        # The loader already yields (batch, max_len) tensors, so no reshape
        # is needed and any batch size works.
        ii = sample['input_ids'].to(device)
        am = sample['attention_mask'].to(device)
        out = model(input_ids=ii, attention_mask=am)
        all_pred.append(out.squeeze(-1).detach().cpu().numpy())
    if not all_pred:
        # An empty frame yields no batches; np.concatenate would raise on [].
        return np.empty(0, dtype=np.float32)
    return np.concatenate(all_pred)
    

# Rebuild the architecture and restore the fine-tuned weights. map_location
# remaps the stored tensors onto the current device, so a checkpoint saved
# from a GPU run also loads on a CPU-only runtime.
model = CL_model()
model.load_state_dict(
    torch.load('/content/CL_model_model_0_0.5428774356842041.pth', map_location=device)
)

# Apply the same cleaning as for the training data. clean_text is already
# defined in the setup cell, so the identical second definition is dropped.
test['txt'] = test['excerpt'].apply(clean_text)
pred = inference(model, test)

In [None]:
pred

# Save to load locally

In [None]:
# Download the pretrained tokenizer and write its files to a local directory.
token = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
token.save_pretrained("./model_uncase_bert")
# Download the pretrained BertModel and save it into the same directory.
bert = transformers.BertModel.from_pretrained('bert-base-uncased')
bert.save_pretrained("./model_uncase_bert")

In [None]:
# Same export for the sequence-classification variant with a single output.
# NOTE(review): this rebinds the global `model` and saves into the same
# "./model_uncase_bert" directory as the previous cell — presumably it
# overwrites the BertModel files saved there; confirm that is intended.
model = transformers.BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)
token = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
token.save_pretrained("./model_uncase_bert")
# Save the classification model next to the tokenizer files.
model.save_pretrained("./model_uncase_bert")

# Experiments
EPOCH = 10, BATCH = 8

- (?) clear_txt, max_length = 314 = score - 0.56
- (best train 0.5789) clear_txt, max_length = 256 = score - 0.608
- (best train 0.5466) clear_txt, max_length = 255 = score - 0.574
- (best train 0.5595) clear_txt, max_length = 356  = score -0.583
- (best train 0.5294) clear_txt, max_length = 314  = score -0.534
- (best train 0.5572 ) clear_txt, max_length = 192  = score -0.582

change bert to classifier (BertForSequenceClassification)
- (best train 0.5596) clear_txt, max_length = 255  = score -0.593

autonlp
- (best train 0.6111) clear_txt, max_length = 255  = score -0.605

t5 -fast
- (best train 0.6111) clear_txt, max_length = 255  = score -0.605

batch up 8 --> 16
- clear_txt, max_length= score=





In [None]:
# TODO: make auto nlp
# TODO: check score