In [1]:
from IPython.display import clear_output
!pip install transformers
clear_output()

In [2]:
import re
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
from torch import nn, optim
from torch.utils.data import  DataLoader, Dataset
import transformers
import matplotlib.pyplot as plt

from collections import defaultdict

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


# Absolute locations of the CommonLit CSV files (paths under /content/drive,
# presumably a Colab Google Drive mount).
path_tr = '/content/drive/MyDrive/CommonLit/input/train.csv'
path_test = '/content/drive/MyDrive/CommonLit/input/test.csv'
path_sub = '/content/drive/MyDrive/CommonLit/input/sample_submission.csv'

# Training frame; its `excerpt` and `target` columns are used further down.
df = pd.read_csv(path_tr)

# Seed the NumPy and PyTorch generators; SEED is also passed to
# train_test_split inside showtime().
SEED =13
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

def clean_text(txt):
    """Lower-case ``txt`` and collapse every run of non-letters into one space.

    The value is passed through ``str()`` first, so non-string inputs are
    converted rather than rejected.
    """
    lowered = str(txt).lower()
    return re.sub('[^A-Za-z]+', ' ', lowered)
# Normalised copy of every excerpt; CL_Dataset reads this `txt` column.
df['txt'] = df['excerpt'].apply(clean_text)

df.head(2)

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error,txt
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009,when the young people returned to the ballroom...
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805,all through dinner time mrs fayre was somewhat...


# Model

In [3]:
class CL_Dataset(Dataset):
    """Torch dataset that tokenises the ``txt`` column of a CommonLit frame.

    Args:
        data: frame with a ``txt`` column and, unless ``test`` is True,
            a ``target`` column.
        max_len: every excerpt is truncated / padded to this many tokens.
        test: when True no target is read and a constant 0 is returned instead.
        tokenizer_name: name or local directory handed to
            ``BertTokenizer.from_pretrained``; the default keeps the original
            ``'bert-base-uncased'`` behaviour.
    """

    def __init__(
        self,
        data: pd.DataFrame,
        max_len: int = 256,
        test: bool = False,
        tokenizer_name: str = 'bert-base-uncased'
        ) -> None:
        self.data = data
        self.max_len = max_len
        self.test = test
        self.token = transformers.BertTokenizer.from_pretrained(tokenizer_name)

    def __len__(self) -> int:
        """Number of rows in the wrapped frame."""
        return self.data.shape[0]

    def __getitem__(self, idx: int) -> dict:
        """Return the encoded excerpt at position ``idx``.

        The returned dict holds ``input_ids`` and ``attention_mask`` exactly as
        produced by the tokenizer (with ``return_tensors='pt'`` they carry a
        leading axis of size 1, which the training / inference loops flatten)
        and ``target`` as a float tensor.
        """
        text = self.data.txt.iloc[idx]
        encode = self.token.encode_plus(
            text,
            truncation=True,
            max_length=self.max_len,
            padding='max_length',
            add_special_tokens=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt'
            )
        # Test frames have no target column, so a placeholder keeps the item
        # structure identical for both modes.
        if self.test:
            target = 0
        else:
            target = self.data.target.iloc[idx]
        return {
                'input_ids': encode['input_ids'],
                'attention_mask': encode['attention_mask'],
                'target': torch.tensor(target, dtype = torch.float)
                }

class CL_model(nn.Module):
    """BERT encoder followed by dropout and a single linear regression head."""

    def __init__(self, dim_out:int = 1):
        """Build the encoder and a head with ``dim_out`` output units."""
        super().__init__()
        self.dim_out = dim_out
        self.bert = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.drop = nn.Dropout(p=0.3)
        hidden_size = self.bert.config.hidden_size
        self.out = nn.Linear(hidden_size, self.dim_out)

    def forward(self, input_ids, attention_mask):
        """Map token ids and their mask to the head's output."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Only the pooled representation feeds the head.
        pooled = encoded['pooler_output']
        return self.out(self.drop(pooled))

In [4]:
def loss_fn(output,target):
    """Root-mean-squared error between ``output`` and ``target``."""
    mse = nn.functional.mse_loss(output, target)
    return torch.sqrt(mse)


def train(
    model:nn.Module,
    loader: DataLoader,
    optimizer: optim.Optimizer,
    schedule: optim.lr_scheduler.LambdaLR,
    batch: int,
    max_lenght: int
) -> tuple:
    """Run one training epoch over ``loader``.

    Args:
        model: network called as ``model(input_ids=..., attention_mask=...)``.
        loader: yields dicts with ``input_ids``, ``attention_mask``, ``target``.
        optimizer: stepped once per batch.
        schedule: learning-rate scheduler, stepped once per batch.
        batch, max_lenght: not used by the body; kept so existing calls work.

    Returns:
        ``(mean_batch_loss, epoch_rmse)`` where the RMSE is computed over all
        predictions of the epoch at once.
    """
    model.train()
    torch.backends.cudnn.benchmark = True
    all_pred, all_target, losses = [], [], []
    for sample in loader:
        optimizer.zero_grad()
        # Flatten to (batch, seq_len). A bare .squeeze() would also drop the
        # batch axis whenever a batch holds a single sample.
        ii = sample['input_ids']
        ii = ii.reshape(-1, ii.size(-1)).to(device)
        am = sample['attention_mask']
        am = am.reshape(-1, am.size(-1)).to(device)
        target = sample['target'].to(device)
        out = model(input_ids=ii, attention_mask=am)
        # For BertForSequenceClassification use `out = out.logits` here.
        pred = out.squeeze(-1)
        loss = loss_fn(pred, target)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm = 1.0)
        optimizer.step()
        schedule.step()
        losses.append(loss.item())
        all_pred.append(pred.detach().cpu().numpy())
        all_target.append(target.detach().cpu().numpy())
    losses = np.mean(losses)
    allt = np.concatenate(all_target)
    allp = np.concatenate(all_pred)
    rmse = np.sqrt(mean_squared_error(allt, allp))
    return losses, rmse


def validate(
    model:nn.Module,
    loader: DataLoader,
    batch: int,
    max_lenght: int
) -> tuple:
    """Evaluate ``model`` on ``loader`` without updating any weights.

    Args:
        model: network called as ``model(input_ids=..., attention_mask=...)``.
        loader: yields dicts with ``input_ids``, ``attention_mask``, ``target``.
        batch, max_lenght: not used by the body; kept so existing calls work.

    Returns:
        ``(mean_batch_loss, rmse)`` where the RMSE is computed over all
        predictions at once.
    """
    model.eval()
    all_pred, all_target, losses = [], [], []
    # No gradients are needed for evaluation; skipping the autograd graph
    # saves memory and time.
    with torch.no_grad():
        for sample in loader:
            # Flatten to (batch, seq_len). A bare .squeeze() would also drop
            # the batch axis whenever a batch holds a single sample.
            ii = sample['input_ids']
            ii = ii.reshape(-1, ii.size(-1)).to(device)
            am = sample['attention_mask']
            am = am.reshape(-1, am.size(-1)).to(device)
            target = sample['target'].to(device)
            out = model(input_ids=ii, attention_mask=am)
            # For BertForSequenceClassification use `out = out.logits` here.
            pred = out.squeeze(-1)
            loss = loss_fn(pred, target)
            losses.append(loss.item())
            all_pred.append(pred.cpu().numpy())
            all_target.append(target.cpu().numpy())
    losses = np.mean(losses)
    allt = np.concatenate(all_target)
    allp = np.concatenate(all_pred)
    rmse = np.sqrt(mean_squared_error(allt, allp))
    return losses, rmse


def showtime(model:nn.Module, data: pd.DataFrame) -> dict:
    """Train ``model`` on an 80/20 split of ``data`` and checkpoint improvements.

    Reads the notebook globals ``MAX_LEN``, ``BATCH``, ``EPOCH``, ``MODEL``,
    ``SEED`` and ``device``.

    Args:
        model: network to fine-tune; it is moved to ``device``.
        data: frame accepted by ``CL_Dataset`` (``txt`` and ``target`` columns).

    Returns:
        Mapping with per-epoch lists under ``train_loss``, ``valid_loss``,
        ``valid_rms`` and ``lr``.
    """
    history = defaultdict(list)
    model = model.to(device)
    X_train, X_test = train_test_split(
        data,
        test_size=0.2,
        random_state=SEED
    )  # original author's note on the resulting shapes: ((2267, 6), (567, 6))
    tr = CL_Dataset(X_train, MAX_LEN)
    vl = CL_Dataset(X_test,  MAX_LEN)
    tr_loader = DataLoader(
        tr,
        batch_size=BATCH,
        shuffle=True
    )
    vl_loader = DataLoader(
        vl,
        batch_size=BATCH,
        shuffle=False
    )
    # transformers.AdamW is deprecated and absent from recent releases, so the
    # PyTorch optimizer is used. eps and weight_decay are set explicitly to
    # the values the transformers implementation used by default.
    optimizer = optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
    steps = len(tr_loader) * EPOCH
    lin_schedule = transformers.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=1,
        num_training_steps=steps
    )
    fold = 0
    best_rmse = float('inf')
    for epoch in tqdm(range(EPOCH)):
        tr_loss, tr_rmse = train(model, tr_loader, optimizer, lin_schedule, BATCH, MAX_LEN)
        vl_loss, vl_rmse = validate(model, vl_loader, BATCH, MAX_LEN)
        lr = optimizer.param_groups[0]['lr']
        history['train_loss'].append(tr_loss)
        history['valid_loss'].append(vl_loss)
        history['valid_rms'].append(vl_rmse)
        history['lr'].append(lr)
        print(f'Epoch: {epoch}, lr: {lr}, train rmse: {tr_rmse}, vl rmse: {vl_rmse}, vl loss: {vl_loss}')
        if vl_rmse < best_rmse:
            # Remember the best score so that only real improvements are
            # saved; without this update every epoch wrote a checkpoint.
            best_rmse = vl_rmse
            print(f'Save rmse: {vl_rmse}')
            torch.save(model.state_dict(), f'{MODEL}_model_{fold}_{vl_rmse}.pth')
    return history

In [None]:
# Hyper-parameters that showtime() reads as globals.
EPOCH, BATCH, MAX_LEN = 10, 8, 314

model = CL_model()
# Alternative model tried by the author:
# model = transformers.BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)

# Class name of the model; showtime() uses it as the checkpoint file prefix.
MODEL = type(model).__name__
showtime(model, df)

In [None]:
# NOTE(review): `stop` is not defined anywhere in the visible notebook, so
# running this cell raises NameError — presumably a deliberate guard that
# halts "Run all" before the inference section. Confirm before removing.
stop

# Inference

In [6]:
import gc
# Force a garbage-collection pass; binding the result to `_` keeps the
# returned count out of the cell output.
_ = gc.collect()

In [7]:
# Read the test CSV and preview its first row.
test = pd.read_csv(path_test)
test.head(1)

Unnamed: 0,id,url_legal,license,excerpt
0,c0f722661,,,My hope lay in Jack's promise that he would ke...


In [None]:
@torch.no_grad()
def inference(model:nn.Module, data: pd.DataFrame, batch_size: int = 1):
    """Predict a score for every row of ``data``.

    Reads the notebook globals ``MAX_LEN`` and ``device``.

    Args:
        model: trained network; it is moved to ``device`` and put in eval mode.
        data: frame with a ``txt`` column (no target needed).
        batch_size: rows per forward pass. The default of 1 matches the
            original behaviour; larger values only speed up the loop.

    Returns:
        1-D NumPy array with one prediction per row, in the order of ``data``.
    """
    model.to(device)
    model.eval()
    dataset = CL_Dataset(data, MAX_LEN, True)
    loader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False
    )
    all_pred = []
    for sample in tqdm(loader):
        # Flatten to (batch, seq_len) from the tensor itself, so the loop
        # works for any batch size and does not depend on MAX_LEN here.
        ii = sample['input_ids']
        ii = ii.reshape(-1, ii.size(-1)).to(device)
        am = sample['attention_mask']
        am = am.reshape(-1, am.size(-1)).to(device)
        out = model(input_ids=ii, attention_mask=am)
        all_pred.append(out.squeeze(-1).detach().cpu().numpy())
    if not all_pred:
        # np.concatenate raises on an empty list; an empty frame gives an
        # empty prediction array instead.
        return np.empty(0, dtype=np.float32)
    return np.concatenate(all_pred)
    

model = CL_model()
# map_location lets a checkpoint written on a GPU runtime load on a CPU-only
# runtime as well.
model.load_state_dict(
    torch.load('/content/CL_model_model_0_0.5428774356842041.pth', map_location=device)
)

# clean_text() from the setup cell is reused here instead of being defined a
# second time.
test['txt'] = test['excerpt'].apply(clean_text)
pred = inference(model, test)

In [None]:
# Display the array returned by inference().
pred

# Save for local loading

In [None]:
# Load the pretrained tokenizer and write its files to ./model_uncase_bert.
token = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
token.save_pretrained("./model_uncase_bert")
# Load the bare BertModel and save it into the same directory.
bert = transformers.BertModel.from_pretrained('bert-base-uncased')
bert.save_pretrained("./model_uncase_bert")

In [None]:
# Same export with the BertForSequenceClassification variant (num_labels=1).
# This rebinds the global `model`.
# NOTE(review): the target directory is the same ./model_uncase_bert used for
# the BertModel export in the previous cell, so those model files are
# presumably overwritten — confirm this is intended.
model = transformers.BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)
token = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
token.save_pretrained("./model_uncase_bert")
# Save the classification model.
model.save_pretrained("./model_uncase_bert")

# Experiments
EPOCH = 10, BATCH = 8

- (?) clear_txt, max_length = 314 = score - 0.56
- (best train 0.5789) clear_txt, max_length = 256 = score - 0.608
- (best train 0.5466) clear_txt, max_length = 255 = score - 0.574
- (best train 0.5595) clear_txt, max_length = 356  = score -0.583
- (best train 0.5294) clear_txt, max_length = 314  = score -0.534
- (best train 0.5572 ) clear_txt, max_length = 192  = score -0.582

Change BERT to a classifier
- (best train 0.5596) clear_txt, max_length = 255  = score -0.593

autonlp
- (best train 0.6111) clear_txt, max_length = 255  = score -0.605

t5 -fast
- (best train 0.6111) clear_txt, max_length = 255  = score -0.605

batch up 8 --> 16
- clear_txt, max_length= score=





In [None]:
# TODO:
# - Make folds ("abusheck" in the original note; Abhishek-style folds?).
# - Train by folds on Kaggle with a GPU; maybe use a large model.
# - Use two Kaggle accounts.
#
# - Plot the training results.
#
# - Try AutoNLP.
# - Check the score.