In [1]:
# Colab-only setup: mount Google Drive so the data / checkpoint paths under
# /content/drive/ used by the later cells resolve.
from google.colab import drive
drive.mount('/content/drive/')
# Install third-party packages into the runtime (versions are not pinned here).
# NOTE(review): torchsummary is not imported by any visible cell — confirm it is still needed.
!pip install torchsummary
!pip install -U albumentations
!pip install transformers

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
Requirement already up-to-date: albumentations in /usr/local/lib/python3.7/dist-packages (0.5.2)


In [2]:
import math
from tqdm import tqdm
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Visuals and CV2
import cv2

# albumentations for augs
import albumentations
from albumentations.pytorch.transforms import ToTensorV2

from sklearn.model_selection import KFold, train_test_split

#torch
import torch
import torch.nn as nn
from torch.nn import Parameter
from torch.nn import functional as F
from torch.utils.data import Dataset,DataLoader
from torch.optim import Adam
from torch.optim.lr_scheduler import _LRScheduler
from torch.optim import Adam, lr_scheduler
from sklearn.metrics import mean_squared_error

import transformers
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup,get_cosine_schedule_with_warmup
from transformers import get_cosine_with_hard_restarts_schedule_with_warmup


import warnings
warnings.simplefilter('ignore')

In [3]:
# Read the training table and keep only the four columns the notebook uses,
# in this fixed order; the trailing expression previews the first rows.
df = pd.read_csv(
    "/content/drive/MyDrive/kaggle/furugori/CommonLit Readability Prize/data/train.csv"
).loc[:, ['id', 'excerpt', 'target', 'standard_error']]
df.head()

Unnamed: 0,id,excerpt,target,standard_error
0,c12129c31,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,Once upon a time there were Three Bears who li...,0.247197,0.510845


In [4]:
# Load the "stopword" variant of the training table from Drive and preview it.
# NOTE(review): cdf is not referenced by any later visible cell — confirm it is still needed.
cdf = pd.read_csv("/content/drive/MyDrive/kaggle/furugori/CommonLit Readability Prize/data/df_train_stopword.csv")
# Column selection left disabled; the recorded preview shows only excerpt/target
# columns, so selecting 'id'/'standard_error' would presumably fail — verify before re-enabling.
#cdf = cdf[['id','excerpt','target','standard_error']]
cdf.head()

Unnamed: 0,excerpt,target
0,When the young people returned to the ballroom...,-0.340259
1,"All through dinner time, Mrs. Fayre was somewh...",-0.315372
2,"As Roger had predicted, the snow departed as q...",-0.580118
3,And outside before the palace a great garden w...,-1.054013
4,Once upon a time there were Three Bears who li...,0.247197


In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error

# Fixed seed so the shuffled fold assignment is identical on every run.
RANDOM_STATE = 42

kfold = KFold(shuffle=True, n_splits=5, random_state=RANDOM_STATE)
skfold = StratifiedKFold(shuffle=True, n_splits=5, random_state=RANDOM_STATE)

# Walk the fold generator and stop once the second fold (index 1) has been
# materialised, so df_train / df_test end up holding that fold's split.
splits = kfold.split(df)
for i, (train_index, test_index) in enumerate(splits):
    print(train_index.shape, test_index.shape)
    # Positional row selection, then a fresh 0..n-1 index for each part.
    df_train = df.iloc[train_index, :].reset_index(drop=True)
    df_test = df.iloc[test_index, :].reset_index(drop=True)
    if i >= 1:
        break

(2267,) (567,)
(2267,) (567,)


In [6]:
class cfg:
    """Namespace of run settings read by the other notebook cells."""

    # Pretrained checkpoint name and the padded/truncated token length.
    transformer_model = "distilroberta-base"
    max_len = 256

    # Training-loop settings.
    EPOCHS = 20
    TRAIN_BATCH_SIZE = 8
    LR = 1e-6

    # Data-loading and seeding settings.
    NUM_WORKERS = 2
    SEED = 2020
    
# Use the GPU when CUDA reports one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Turn on the cuDNN benchmark flag for this process.
torch.backends.cudnn.benchmark = True
# Tokenizer for the checkpoint named in cfg; shared by every dataset instance.
TOKENIZER = transformers.AutoTokenizer.from_pretrained(
    cfg.transformer_model
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=480.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355863.0, style=ProgressStyle(descript…




In [7]:
class traindataset(Dataset):
    """Map-style dataset that tokenizes one excerpt per item.

    Args:
        df: DataFrame providing an ``excerpt`` (text) and a ``target`` column.
        tokenizer: Callable invoked as ``tokenizer(text, padding='max_length',
            truncation=True, max_length=..., return_tensors="pt")`` that returns
            a mapping with ``"input_ids"`` and ``"attention_mask"`` entries.
        max_len: Length passed to the tokenizer as ``max_length``.
    """

    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.max_len = max_len
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of rows in the backing DataFrame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return ``(features, target)`` for the row at position ``index``.

        ``features`` is a dict with ``"input_ids"`` and ``"attention_mask"``
        tensors; ``target`` is the row's ``target`` value wrapped in a tensor.
        """
        row = self.df.iloc[index]
        encoded = self.tokenizer(
            row.excerpt,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )
        # The tokenizer output carries a leading axis; take its first entry.
        # It is already a tensor, so copy it with clone().detach() instead of
        # torch.tensor(...), which warns when given an existing tensor.
        features = {
            "input_ids": encoded["input_ids"][0].clone().detach(),
            "attention_mask": encoded["attention_mask"][0].clone().detach(),
        }
        return features, torch.tensor(row.target)

In [8]:
def train_fn(loader,model,criterion,optimizer,device,scheduler,epoch):
    """Train ``model`` for one pass over ``loader`` and report the epoch RMSE.

    Args:
        loader: Iterable (with ``len``) yielding ``(data, target)`` batches,
            where ``data`` is a dict of tensors passed to the model as keyword
            arguments.
        model: Module whose call result exposes a ``logits`` attribute.
        criterion: Loss callable applied as ``criterion(output, target)``.
        optimizer: Optimizer; zeroed and stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: LR scheduler stepped after every batch with the fractional
            epoch ``epoch + b_idx / len(loader)``, or ``None`` to skip stepping.
        epoch: Index of the current epoch, used to build the fractional epoch.

    Returns:
        float: RMSE between all targets and predictions collected this epoch.
    """
    model.train()

    allpreds = []
    alltargets = []
    num_batches = len(loader)

    for b_idx, (data, target) in enumerate(loader):
        data = {key: value.to(device) for key, value in data.items()}
        target = target.to(device).float()

        optimizer.zero_grad()
        output = model(**data).logits.squeeze(-1)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        # reshape(-1) keeps both arrays 1-D even for a batch of one sample;
        # squeezing a length-1 target would give a 0-d array that
        # np.concatenate cannot handle.
        allpreds.append(output.detach().cpu().numpy().reshape(-1))
        alltargets.append(target.detach().cpu().numpy().reshape(-1))
        if b_idx % 50 == 0:
            # Read the lr from the optimizer so logging also works when
            # scheduler is None and reflects the value actually in use.
            print(b_idx, num_batches, loss.item(), optimizer.param_groups[0]["lr"])
        if scheduler is not None:
            scheduler.step(epoch + b_idx / num_batches)

    allpreds = np.concatenate(allpreds)
    alltargets = np.concatenate(alltargets)
    train_rmse = np.sqrt(mean_squared_error(alltargets, allpreds))
    print(f"rmse_score:{train_rmse}")
    return float(train_rmse)
        
def val_fn(loader,model,criterion,optimizer,device,scheduler):
    """Evaluate ``model`` on ``loader`` without gradients and report the RMSE.

    Args:
        loader: Iterable (with ``len``) yielding ``(data, target)`` batches,
            where ``data`` is a dict of tensors passed to the model as keyword
            arguments.
        model: Module whose call result exposes a ``logits`` attribute.
        criterion: Not used by this function.
        optimizer: Not used by this function.
        device: Device the input tensors are moved to.
        scheduler: Not used by this function.

    Returns:
        float: RMSE between all targets and predictions in ``loader``.
    """
    model.eval()

    allpreds = []
    alltargets = []
    num_batches = len(loader)

    with torch.no_grad():
        for b_idx, (data, target) in enumerate(loader):
            data = {key: value.to(device) for key, value in data.items()}
            output = model(**data).logits.squeeze(-1)

            # reshape(-1) keeps both arrays 1-D even for a batch of one sample;
            # squeezing a length-1 target would give a 0-d array that
            # np.concatenate cannot handle.
            allpreds.append(output.detach().cpu().numpy().reshape(-1))
            alltargets.append(target.detach().cpu().numpy().reshape(-1))
            if b_idx % 20 == 0:
                print(b_idx, num_batches)

    allpreds = np.concatenate(allpreds)
    alltargets = np.concatenate(alltargets)
    val_rmse = np.sqrt(mean_squared_error(alltargets, allpreds))
    print(f"rmse_score:{val_rmse}")
    return float(val_rmse)

In [9]:
def run():
    """Fine-tune the configured checkpoint on ``df_train`` and validate on ``df_test``.

    Reads the notebook-level globals ``cfg``, ``device``, ``TOKENIZER``,
    ``df_train`` and ``df_test``. Validates once before training, then for each
    of ``cfg.EPOCHS`` epochs trains, validates and writes the model state dict
    to ``bertmodel_{epoch}.pt`` in the checkpoint directory on Drive.
    """
    # cfg.SEED was declared but never applied; seed torch and numpy so the
    # newly initialised head and the shuffling order repeat across runs.
    torch.manual_seed(cfg.SEED)
    np.random.seed(cfg.SEED)

    model = transformers.AutoModelForSequenceClassification.from_pretrained(
        cfg.transformer_model, num_labels=1
    ).to(device)

    def make_loader(frame, shuffle):
        """Wrap ``frame`` in a traindataset and a DataLoader with the cfg settings."""
        dataset = traindataset(df=frame, tokenizer=TOKENIZER, max_len=cfg.max_len)
        return torch.utils.data.DataLoader(dataset,
                                           batch_size=cfg.TRAIN_BATCH_SIZE,
                                           num_workers=cfg.NUM_WORKERS,
                                           pin_memory=True,
                                           shuffle=shuffle)

    train_loader = make_loader(df_train, shuffle=True)
    val_loader = make_loader(df_test, shuffle=False)

    # Plain Adam over every parameter. The base lr is the literal below (not
    # cfg.LR) and is scaled per step by connect().
    optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

    def connect(epoch):
        """Return the lr multiplier for a (fractional) epoch.

        Rises linearly from 0.01 to 2.0 over the first ``middle`` epochs, then
        falls linearly from 2.0 towards 1e-5 over ``cfg.EPOCHS`` epochs.
        """
        middle = 0.1
        if epoch < middle:
            return float((2.0*float(epoch) + 0.01 * (middle-float(epoch)))/middle)
        epoch_c = epoch - middle
        return float((1e-5*float(epoch_c) + 2.0 * (cfg.EPOCHS-float(epoch_c)))/cfg.EPOCHS)

    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=connect)
    criterion = nn.MSELoss()

    # Make sure the checkpoint directory exists before spending time training.
    path = "/content/drive/MyDrive/kaggle/furugori/CommonLit Readability Prize/inference_weight/"
    os.makedirs(path, exist_ok=True)

    # Baseline validation score of the untrained head.
    val_fn(val_loader,model,criterion,optimizer,device,scheduler)
    for epoch in range(cfg.EPOCHS):
        train_fn(train_loader,model,criterion,optimizer,device,scheduler,epoch)
        val_fn(val_loader,model,criterion,optimizer,device,scheduler)
        torch.save(model.state_dict(), path + f"bertmodel_{epoch}.pt")
        print(f"save epoch_{epoch}")

In [None]:
# Start training; run() takes no arguments and reads its settings and data
# from the notebook-level globals defined in the cells above.
run()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=331070498.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.weight'

0 71
20 71
40 71
60 71
rmse_score:1.390767436970072
0 284 2.8716819286346436 2.0000000000000002e-07
50 284 0.3924112319946289 3.985493030281691e-05
100 284 0.30212724208831787 3.9502819387323944e-05
150 284 0.28890928626060486 3.915070847183099e-05
200 284 0.24104228615760803 3.8798597556338026e-05
250 284 0.5455853343009949 3.844648664084507e-05
rmse_score:0.746259868144989
0 71
20 71
40 71
60 71
rmse_score:0.6043894306050664
save epoch_0
0 284 0.4780074954032898 3.8207051218309855e-05
50 284 0.2848656177520752 3.78549403028169e-05
100 284 0.3503337800502777 3.7502829387323944e-05
150 284 0.10189233720302582 3.7150718471830995e-05
200 284 0.2428523153066635 3.679860755633803e-05
250 284 0.3027803897857666 3.644649664084507e-05
rmse_score:0.5141832232475281
0 71
20 71
40 71
60 71
rmse_score:0.5579882016164975
save epoch_1
0 284 0.11700780689716339 3.6207061218309855e-05
50 284 0.22928500175476074 3.5854950302816906e-05
100 284 0.03778675198554993 3.550283938732394e-05
150 284 0.1106559