In [1]:
from IPython.display import clear_output
!pip install transformers
clear_output()

In [2]:
import re
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
from torch import nn, optim
from torch.utils.data import  DataLoader, Dataset
import transformers
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


path_tr = '/content/drive/MyDrive/CommonLit/input/train.csv'
path_test = '/content/drive/MyDrive/CommonLit/input/test.csv'
path_sub = '/content/drive/MyDrive/CommonLit/input/sample_submission.csv'

df = pd.read_csv(path_tr)

SEED =13
np.random.seed(SEED)
torch.manual_seed(SEED)

device = 'cuda' if torch.cuda.is_available() else 'cpu'

def clean_text(txt):
    return re.sub('[^A-Za-z]+', ' ',str(txt).lower())
df['txt'] = df['excerpt'].apply(lambda x: clean_text(x))

df.head(2)

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error,txt
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009,when the young people returned to the ballroom...
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805,all through dinner time mrs fayre was somewhat...


# Model

In [3]:
class CL_Dataset(Dataset):

    def __init__(
        self,
        data: pd.DataFrame,
        max_len: int = 255,
        test: bool = False
        ) -> dict:
        self.data = data 
        self.max_len = max_len
        self.test = test
        self.token = transformers.BertTokenizer.from_pretrained('bert-base-uncased')

    def __len__(self):
        return self.data.shape[0]

    def  __getitem__(self, idx: int):
        text = self.data.txt.iloc[idx]
        encode = self.token.encode_plus(
            text,
            truncation=True,
            max_length=self.max_len,
            padding='max_length',
            add_special_tokens=True,            
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt'
            )
        if self.test:
            target = 0
        else:
            target = self.data.target.iloc[idx]                    
        return {
                'input_ids': encode['input_ids'],
                'attention_mask': encode['attention_mask'],
                'target': torch.tensor(target, dtype = torch.float)  
                }

class CL_model(nn.Module):

    def __init__(self, dim_out:int = 1):
        super(CL_model, self).__init__()
        self.dim_out = dim_out
        self.bert = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.drop = nn.Dropout(p = 0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, self.dim_out)

    def forward(self, input_ids, attention_mask):
        out_model = self.bert(input_ids, attention_mask)
        d1 = self.drop(out_model['pooler_output'])
        out = self.out(d1)
        return out

In [4]:
def loss_fn(output,target):
    return torch.sqrt(nn.MSELoss()(output,target))


def train(
    model:nn.Module, 
    loader: DataLoader,
    optimizer: transformers.AdamW,
    schedule: transformers.get_linear_schedule_with_warmup,
    batch: int,
    max_lenght: int
) -> list:
    model.train()
    torch.backends.cudnn.benchmark = True
    all_pred, all_target, losses = [], [], []
    for input in loader:
        optimizer.zero_grad()
        ii = input['input_ids'].squeeze().to(device) # view(batch, max_lenght).to(device)
        am = input['attention_mask'].squeeze().to(device) # .view(batch, max_lenght).to(device)
        target = input['target'].to(device)
        out = model(input_ids =ii, attention_mask = am)
        loss = loss_fn(out.squeeze(-1), target)
        losses.append(loss.detach().cpu().numpy())
        all_pred.append(out.squeeze(-1).detach().cpu().numpy())
        all_target.append(target.detach().cpu().numpy())
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm = 1.0)
        optimizer.step()    
        schedule.step()
    losses = np.mean(losses)
    allt = np.concatenate(all_target)
    allp = np.concatenate(all_pred)
    rmse = np.sqrt(mean_squared_error(allt, allp))  
    return losses, rmse


def validate(
    model:nn.Module, 
    loader: DataLoader, 
    batch: int,
    max_lenght: int
) -> list:
    model.eval()
    all_pred, all_target, losses = [], [], []
    for input in loader:
        ii = input['input_ids'].squeeze().to(device) # view(batch, max_lenght).to(device)
        am = input['attention_mask'].squeeze().to(device) # .view(batch, max_lenght).to(device)
        target = input['target'].to(device)
        out = model(input_ids =ii, attention_mask = am)
        loss = loss_fn(out.squeeze(-1), target)
        losses.append(loss.detach().cpu().numpy())
        all_pred.append(out.squeeze(-1).detach().cpu().numpy())
        all_target.append(target.detach().cpu().numpy())  
    losses = np.mean(losses)
    allt = np.concatenate(all_target)
    allp = np.concatenate(all_pred)
    rmse = np.sqrt(mean_squared_error(allt, allp)) 
    return losses, rmse


def showtime(model:nn.Module, data: pd.DataFrame):
    
    model = model.to(device)
    X_train, X_test = train_test_split(
        data,
        test_size=0.2,
        random_state=SEED
    )# ((2267, 6), (567, 6))
    tr = CL_Dataset(X_train, MAX_LEN)
    vl = CL_Dataset(X_test,  MAX_LEN)
    tr_loader = DataLoader(
        tr,
        batch_size=BATCH,
        shuffle=True
    )
    vl_loader = DataLoader(
        vl,
        batch_size=BATCH,
        shuffle=False
    )
    optimizer = transformers.AdamW(model.parameters(), lr=2e-5, correct_bias=True)
    steps = len(tr_loader) * EPOCH
    lin_schedule = transformers.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=1,
        num_training_steps=steps
    )
    fold = 0
    best_rmse = 100
    for epoch in tqdm(range(EPOCH)):
        tr_loss, tr_rmse = train(model, tr_loader, optimizer, lin_schedule, BATCH, MAX_LEN)
        vl_loss, vl_rmse = validate(model, vl_loader, BATCH, MAX_LEN)
        lr = optimizer.param_groups[0]['lr']
        print(f'Epoch: {epoch}, lr: {lr}, train rmse: {tr_rmse}, vl rmse: {vl_rmse}, vl loss: {vl_loss}')
        if vl_rmse < best_rmse:
            print(f'Save rmse: {vl_rmse}')
            torch.save(model.state_dict(), f'{MODEL}_model_{fold}_{vl_rmse}.pth')

In [6]:
EPOCH = 10
BATCH = 8
MAX_LEN = 314#, 255
model = CL_model()
MODEL = model.__class__.__name__
showtime(model, df)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  0%|          | 0/10 [00:00<?, ?it/s]

Epoch: 0, lr: 1.8006340260655162e-05, train rmse: 0.7601271867752075, vl rmse: 0.5724765658378601, vl loss: 0.5519900918006897
Save rmse: 0.5724765658378601


 10%|█         | 1/10 [02:36<23:32, 157.00s/it]

Epoch: 1, lr: 1.6005635787249034e-05, train rmse: 0.5292499661445618, vl rmse: 0.5428774356842041, vl loss: 0.5211071372032166
Save rmse: 0.5428774356842041


 20%|██        | 2/10 [05:21<21:12, 159.12s/it]

Epoch: 2, lr: 1.4004931313842904e-05, train rmse: 0.41651758551597595, vl rmse: 0.7206393480300903, vl loss: 0.7050365805625916
Save rmse: 0.7206393480300903


 30%|███       | 3/10 [08:04<18:43, 160.48s/it]

Epoch: 3, lr: 1.2004226840436775e-05, train rmse: 0.33295121788978577, vl rmse: 0.7406666278839111, vl loss: 0.7245230674743652
Save rmse: 0.7406666278839111


 40%|████      | 4/10 [10:48<16:08, 161.35s/it]

Epoch: 4, lr: 1.0003522367030645e-05, train rmse: 0.2692682147026062, vl rmse: 0.6737424731254578, vl loss: 0.6561287045478821
Save rmse: 0.6737424731254578


 50%|█████     | 5/10 [13:31<13:30, 162.08s/it]

Epoch: 5, lr: 8.002817893624517e-06, train rmse: 0.21980290114879608, vl rmse: 0.6019706726074219, vl loss: 0.5849028825759888
Save rmse: 0.6019706726074219


 60%|██████    | 6/10 [16:15<10:49, 162.44s/it]

Epoch: 6, lr: 6.002113420218388e-06, train rmse: 0.18207767605781555, vl rmse: 0.6223962306976318, vl loss: 0.6051130294799805
Save rmse: 0.6223962306976318


 70%|███████   | 7/10 [18:58<08:08, 162.77s/it]

Epoch: 7, lr: 4.001408946812258e-06, train rmse: 0.16161870956420898, vl rmse: 0.605218231678009, vl loss: 0.5864432454109192
Save rmse: 0.605218231678009


 80%|████████  | 8/10 [21:42<05:26, 163.03s/it]

Epoch: 8, lr: 2.000704473406129e-06, train rmse: 0.1446959525346756, vl rmse: 0.608823299407959, vl loss: 0.5910805463790894
Save rmse: 0.608823299407959


 90%|█████████ | 9/10 [24:25<02:43, 163.21s/it]

Epoch: 9, lr: 0.0, train rmse: 0.1320406198501587, vl rmse: 0.606290340423584, vl loss: 0.5884116888046265
Save rmse: 0.606290340423584


100%|██████████| 10/10 [27:09<00:00, 162.94s/it]


#inference

In [7]:
import gc
_ = gc.collect()

In [8]:
test = pd.read_csv(path_test)
test.head(1)

Unnamed: 0,id,url_legal,license,excerpt
0,c0f722661,,,My hope lay in Jack's promise that he would ke...


In [9]:
@torch.no_grad()
def inference(model:nn.Module, data: pd.DataFrame):
    model.to(device)
    model.eval()
    test = CL_Dataset(data, MAX_LEN, True)
    test_loader = DataLoader(
        test,
        batch_size=1,
        shuffle=False
    )
    all_pred = []
    for input in tqdm(test_loader):
        ii = input['input_ids'].view(1, MAX_LEN).to(device)
        am = input['attention_mask'].view(1, MAX_LEN).to(device)
        out = model(input_ids =ii, attention_mask = am)
        all_pred.append(out.squeeze(-1).detach().cpu().numpy())
    return np.concatenate(all_pred)
    

model = CL_model()
model.load_state_dict(torch.load('/content/CL_model_model_0_0.5428774356842041.pth'))
def clean_text(txt):
    return re.sub('[^A-Za-z]+', ' ',str(txt).lower())


test['txt'] = test['excerpt'].apply(lambda x: clean_text(x))
pred = inference(model, test)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 7/7 [00:00<00:00, 27.32it/s]


In [10]:
pred

array([-2.1042218 , -0.4232107 , -1.8214164 , -0.3670986 , -0.6991254 ,
        0.61048067, -0.3709323 ], dtype=float32)

#save to load local

In [None]:
# save token
token = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
token.save_pretrained("./model_uncase_bert")
#save model
bert = transformers.BertModel.from_pretrained('bert-base-uncased')
bert.save_pretrained("./model_uncase_bert")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
change max_length 314
add token_type_ids
change bert to clasifir
change drop out
change lr to