In [1]:
from IPython.display import clear_output
!pip install transformers
clear_output()

In [2]:
import re
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
from torch import nn, optim
from torch.utils.data import  DataLoader, Dataset
import transformers
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


path_tr = '/content/drive/MyDrive/CommonLit/input/train.csv'
path_test = '/content/drive/MyDrive/CommonLit/input/test.csv'
path_sub = '/content/drive/MyDrive/CommonLit/input/sample_submission.csv'

df = pd.read_csv(path_tr)

SEED =13
np.random.seed(SEED)
torch.manual_seed(SEED)

device = 'cuda' if torch.cuda.is_available() else 'cpu'

def clean_text(txt):
    return re.sub('[^A-Za-z]+', ' ',str(txt).lower())
df['txt'] = df['excerpt'].apply(lambda x: clean_text(x))

df.head(2)

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error,txt
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009,when the young people returned to the ballroom...
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805,all through dinner time mrs fayre was somewhat...


# Model

In [20]:
class CL_Dataset(Dataset):

    def __init__(
        self,
        data: pd.DataFrame,
        max_len: int = 255,
        test: bool = False
        ) -> dict:
        self.data = data 
        self.max_len = max_len
        self.test = test
        self.token = transformers.BertTokenizer.from_pretrained('bert-base-uncased')

    def __len__(self):
        return self.data.shape[0]

    def  __getitem__(self, idx: int):
        text = self.data.txt.iloc[idx]
        encode = self.token.encode_plus(
            text,
            truncation=True,
            max_length=self.max_len,
            padding='max_length',
            add_special_tokens=True,            
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt'
            )
        if self.test:
            target = 0
        else:
            target = self.data.target.iloc[idx]                    
        return {
                'input_ids': encode['input_ids'],
                'attention_mask': encode['attention_mask'],
                'target': torch.tensor(target, dtype = torch.float)  
                }

class CL_model(nn.Module):

    def __init__(self, dim_out:int = 1):
        super(CL_model, self).__init__()
        self.dim_out = dim_out
        self.bert = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.drop = nn.Dropout(p = 0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, self.dim_out)

    def forward(self, input_ids, attention_mask):
        out_model = self.bert(input_ids, attention_mask)
        d1 = self.drop(out_model['pooler_output'])
        out = self.out(d1)
        return out

In [4]:
def loss_fn(output,target):
    return torch.sqrt(nn.MSELoss()(output,target))


def train(
    model:nn.Module, 
    loader: DataLoader,
    optimizer: transformers.AdamW,
    schedule: transformers.get_linear_schedule_with_warmup,
    batch: int,
    max_lenght: int
) -> list:
    model.train()
    torch.backends.cudnn.benchmark = True
    all_pred, all_target, losses = [], [], []
    for input in tqdm(loader):
        optimizer.zero_grad()
        ii = input['input_ids'].squeeze().to(device) # view(batch, max_lenght).to(device)
        am = input['attention_mask'].squeeze().to(device) # .view(batch, max_lenght).to(device)
        target = input['target'].to(device)
        out = model(input_ids =ii, attention_mask = am)
        loss = loss_fn(out.squeeze(-1), target)
        losses.append(loss.detach().cpu().numpy())
        all_pred.append(out.squeeze(-1).detach().cpu().numpy())
        all_target.append(target.detach().cpu().numpy())
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm = 1.0)
        optimizer.step()    
        schedule.step()
    losses = np.mean(losses)
    allt = np.concatenate(all_target)
    allp = np.concatenate(all_pred)
    rmse = np.sqrt(mean_squared_error(allt, allp))  
    return losses, rmse


def validate(
    model:nn.Module, 
    loader: DataLoader, 
    batch: int,
    max_lenght: int
) -> list:
    model.eval()
    all_pred, all_target, losses = [], [], []
    for input in tqdm(loader):
        ii = input['input_ids'].squeeze().to(device) # view(batch, max_lenght).to(device)
        am = input['attention_mask'].squeeze().to(device) # .view(batch, max_lenght).to(device)
        target = input['target'].to(device)
        out = model(input_ids =ii, attention_mask = am)
        loss = loss_fn(out.squeeze(-1), target)
        losses.append(loss.detach().cpu().numpy())
        all_pred.append(out.squeeze(-1).detach().cpu().numpy())
        all_target.append(target.detach().cpu().numpy())  
    losses = np.mean(losses)
    allt = np.concatenate(all_target)
    allp = np.concatenate(all_pred)
    rmse = np.sqrt(mean_squared_error(allt, allp)) 
    return losses, rmse


def showtime(model:nn.Module, data: pd.DataFrame):
    
    model = model.to(device)
    X_train, X_test = train_test_split(
        data.head(24),
        test_size=0.2,
        random_state=SEED
    )# ((2267, 6), (567, 6))
    tr = CL_Dataset(X_train, MAX_LEN)
    vl = CL_Dataset(X_test,  MAX_LEN)
    tr_loader = DataLoader(
        tr,
        batch_size=BATCH,
        shuffle=True
    )
    vl_loader = DataLoader(
        vl,
        batch_size=BATCH,
        shuffle=False
    )
    optimizer = transformers.AdamW(model.parameters(), lr=2e-5, correct_bias=True)
    steps = len(tr_loader) * EPOCH
    lin_schedule = transformers.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=1,
        num_training_steps=steps
    )
    fold = 0
    best_rmse = 100
    for epoch in tqdm(range(EPOCH)):
        tr_loss, tr_rmse = train(model, tr_loader, optimizer, lin_schedule, BATCH, MAX_LEN)
        vl_loss, vl_rmse = validate(model, vl_loader, BATCH, MAX_LEN)
        lr = optimizer.param_groups[0]['lr']
        print(f'Epoch: {epoch}, lr: {lr}, train rmse: {tr_rmse}, vl rmse: {vl_rmse}, vl loss: {vl_loss}')
        if vl_rmse < best_rmse:
            print(f'Save rmse: {vl_rmse}')
            torch.save(model.state_dict(), f'{MODEL}_model_{fold}_{vl_rmse}.pth')

In [7]:
EPOCH = 10
BATCH = 8
MAX_LEN = 255
model = CL_model()
MODEL = model.__class__.__name__
showtime(model, df)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  0%|          | 0/10 [00:00<?, ?it/s]
  0%|          | 0/3 [00:00<?, ?it/s][A

torch.Size([8, 1, 255])



 33%|███▎      | 1/3 [00:00<00:00,  2.07it/s][A

torch.Size([8, 1, 255])



 67%|██████▋   | 2/3 [00:00<00:00,  2.11it/s][A
100%|██████████| 3/3 [00:01<00:00,  2.65it/s]

  0%|          | 0/1 [00:00<?, ?it/s][A

torch.Size([3, 1, 255])



100%|██████████| 1/1 [00:00<00:00,  9.16it/s]


Epoch: 0, lr: 1.8620689655172415e-05, train rmse: 1.2684041261672974, vl rmse: 0.7171961665153503, vl loss: 0.7171961665153503
Save rmse: 0.7171961665153503


 10%|█         | 1/10 [00:02<00:22,  2.55s/it]
  0%|          | 0/3 [00:00<?, ?it/s][A

torch.Size([8, 1, 255])



 33%|███▎      | 1/3 [00:00<00:00,  2.25it/s][A

torch.Size([8, 1, 255])



 67%|██████▋   | 2/3 [00:00<00:00,  2.20it/s][A
100%|██████████| 3/3 [00:01<00:00,  2.68it/s]

  0%|          | 0/1 [00:00<?, ?it/s][A

torch.Size([3, 1, 255])


100%|██████████| 1/1 [00:00<00:00, 10.26it/s]


Epoch: 1, lr: 1.6551724137931037e-05, train rmse: 0.9221476912498474, vl rmse: 0.7022027373313904, vl loss: 0.7022027373313904
Save rmse: 0.7022027373313904


 20%|██        | 2/10 [00:04<00:20,  2.51s/it]
  0%|          | 0/3 [00:00<?, ?it/s][A

torch.Size([8, 1, 255])



 33%|███▎      | 1/3 [00:00<00:00,  2.16it/s][A

torch.Size([8, 1, 255])



 67%|██████▋   | 2/3 [00:00<00:00,  2.14it/s][A
100%|██████████| 3/3 [00:01<00:00,  2.63it/s]

  0%|          | 0/1 [00:00<?, ?it/s][A

torch.Size([3, 1, 255])


100%|██████████| 1/1 [00:00<00:00, 10.00it/s]


Epoch: 2, lr: 1.4482758620689657e-05, train rmse: 0.9372828602790833, vl rmse: 0.7080178260803223, vl loss: 0.7080178260803223
Save rmse: 0.7080178260803223


 30%|███       | 3/10 [00:07<00:17,  2.51s/it]
  0%|          | 0/3 [00:00<?, ?it/s][A

torch.Size([8, 1, 255])



 33%|███▎      | 1/3 [00:00<00:00,  2.18it/s][A

torch.Size([8, 1, 255])



 67%|██████▋   | 2/3 [00:00<00:00,  2.15it/s][A
100%|██████████| 3/3 [00:01<00:00,  2.63it/s]

  0%|          | 0/1 [00:00<?, ?it/s][A

torch.Size([3, 1, 255])


100%|██████████| 1/1 [00:00<00:00, 10.59it/s]


Epoch: 3, lr: 1.2413793103448277e-05, train rmse: 0.8971720933914185, vl rmse: 0.6894130706787109, vl loss: 0.6894130110740662
Save rmse: 0.6894130706787109


 40%|████      | 4/10 [00:12<00:18,  3.14s/it]
  0%|          | 0/3 [00:00<?, ?it/s][A

torch.Size([8, 1, 255])



 33%|███▎      | 1/3 [00:00<00:00,  2.03it/s][A

torch.Size([8, 1, 255])



 67%|██████▋   | 2/3 [00:00<00:00,  2.10it/s][A
100%|██████████| 3/3 [00:01<00:00,  2.63it/s]

  0%|          | 0/1 [00:00<?, ?it/s][A

torch.Size([3, 1, 255])


100%|██████████| 1/1 [00:00<00:00, 10.22it/s]


Epoch: 4, lr: 1.0344827586206898e-05, train rmse: 0.9165782928466797, vl rmse: 0.6776334643363953, vl loss: 0.6776334643363953
Save rmse: 0.6776334643363953


 50%|█████     | 5/10 [00:23<00:27,  5.56s/it]
  0%|          | 0/3 [00:00<?, ?it/s][A

torch.Size([8, 1, 255])



 33%|███▎      | 1/3 [00:00<00:00,  2.00it/s][A

torch.Size([8, 1, 255])



 67%|██████▋   | 2/3 [00:00<00:00,  2.05it/s][A
100%|██████████| 3/3 [00:01<00:00,  2.58it/s]

  0%|          | 0/1 [00:00<?, ?it/s][A

torch.Size([3, 1, 255])


100%|██████████| 1/1 [00:00<00:00, 10.63it/s]


Epoch: 5, lr: 8.275862068965518e-06, train rmse: 0.9626761078834534, vl rmse: 0.670048713684082, vl loss: 0.6700486540794373
Save rmse: 0.670048713684082


 60%|██████    | 6/10 [00:34<00:28,  7.16s/it]
  0%|          | 0/3 [00:00<?, ?it/s][A

torch.Size([8, 1, 255])



 33%|███▎      | 1/3 [00:00<00:00,  2.02it/s][A

torch.Size([8, 1, 255])



 67%|██████▋   | 2/3 [00:00<00:00,  2.08it/s][A
100%|██████████| 3/3 [00:01<00:00,  2.62it/s]

  0%|          | 0/1 [00:00<?, ?it/s][A

torch.Size([3, 1, 255])



100%|██████████| 1/1 [00:00<00:00,  9.00it/s]


Epoch: 6, lr: 6.206896551724138e-06, train rmse: 0.9040706753730774, vl rmse: 0.6629596948623657, vl loss: 0.6629596948623657
Save rmse: 0.6629596948623657


 70%|███████   | 7/10 [00:46<00:26,  8.70s/it]
  0%|          | 0/3 [00:00<?, ?it/s][A

torch.Size([8, 1, 255])



 33%|███▎      | 1/3 [00:00<00:00,  2.05it/s][A

torch.Size([8, 1, 255])



 67%|██████▋   | 2/3 [00:00<00:00,  2.10it/s][A
100%|██████████| 3/3 [00:01<00:00,  2.64it/s]

  0%|          | 0/1 [00:00<?, ?it/s][A

torch.Size([3, 1, 255])


100%|██████████| 1/1 [00:00<00:00, 10.43it/s]


Epoch: 7, lr: 4.137931034482759e-06, train rmse: 0.8878311514854431, vl rmse: 0.6576987504959106, vl loss: 0.6576987504959106
Save rmse: 0.6576987504959106


 80%|████████  | 8/10 [00:56<00:18,  9.16s/it]
  0%|          | 0/3 [00:00<?, ?it/s][A

torch.Size([8, 1, 255])



 33%|███▎      | 1/3 [00:00<00:01,  1.97it/s][A

torch.Size([8, 1, 255])



 67%|██████▋   | 2/3 [00:00<00:00,  2.03it/s][A
100%|██████████| 3/3 [00:01<00:00,  2.58it/s]

  0%|          | 0/1 [00:00<?, ?it/s][A

torch.Size([3, 1, 255])



100%|██████████| 1/1 [00:00<00:00,  8.79it/s]


Epoch: 8, lr: 2.0689655172413796e-06, train rmse: 0.8991280794143677, vl rmse: 0.6550197005271912, vl loss: 0.6550197005271912
Save rmse: 0.6550197005271912


 90%|█████████ | 9/10 [01:07<00:09,  9.69s/it]
  0%|          | 0/3 [00:00<?, ?it/s][A

torch.Size([8, 1, 255])



 33%|███▎      | 1/3 [00:00<00:00,  2.06it/s][A

torch.Size([8, 1, 255])



 67%|██████▋   | 2/3 [00:00<00:00,  2.09it/s][A
100%|██████████| 3/3 [00:01<00:00,  2.61it/s]

  0%|          | 0/1 [00:00<?, ?it/s][A

torch.Size([3, 1, 255])


100%|██████████| 1/1 [00:00<00:00, 10.34it/s]


Epoch: 9, lr: 0.0, train rmse: 0.8459253311157227, vl rmse: 0.6537800431251526, vl loss: 0.6537800431251526
Save rmse: 0.6537800431251526


100%|██████████| 10/10 [01:20<00:00,  8.01s/it]


In [8]:
import gc
# del method not need
_ = gc.collect()

In [28]:
test = pd.read_csv(path_test)
test.head(1)

Unnamed: 0,id,url_legal,license,excerpt
0,c0f722661,,,My hope lay in Jack's promise that he would ke...


In [27]:
@torch.no_grad()
def inference(model:nn.Module, data: pd.DataFrame):
    model.to(device)
    model.eval()
    test = CL_Dataset(data, MAX_LEN, True)
    test_loader = DataLoader(
        test,
        batch_size=1,
        shuffle=True
    )
    all_pred = []
    for input in tqdm(test_loader):
        ii = input['input_ids'].view(1, 255).to(device)
        am = input['attention_mask'].view(1, 255).to(device)
        out = model(input_ids =ii, attention_mask = am)
        all_pred.append(out.squeeze(-1).detach().cpu().numpy())
    return np.concatenate(all_pred)
    

model = CL_model()
model.load_state_dict(torch.load('/content/CL_model_model_0_0.6537800431251526.pth'))
def clean_text(txt):
    return re.sub('[^A-Za-z]+', ' ',str(txt).lower())


test['txt'] = test['excerpt'].apply(lambda x: clean_text(x))
pred = inference(model, test)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/7 [00:00<?, ?it/s][A[A

 43%|████▎     | 3/7 [00:00<00:00, 26.00it/s][A[A

torch.Size([1, 255])
torch.Size([1, 255])
torch.Size([1, 255])
torch.Size([1, 255])
torch.Size([1, 255])
torch.Size([1, 255])




100%|██████████| 7/7 [00:00<00:00, 27.31it/s]

torch.Size([1, 255])





In [25]:
pred

array([-0.83763963, -0.77272505, -0.75076175, -0.8542761 , -0.87986195,
       -0.7421036 , -0.85180527], dtype=float32)