# Versions

* Version 1: CV score 0.4875, LB score not recorded

# Imports

In [1]:
#General
import numpy as np 
import pandas as pd 
from tqdm import tqdm
from glob import glob
import os
import gc

#Sklearn
from sklearn.model_selection import train_test_split , KFold

#Pytorch
import torch 
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.nn.functional as F

#Pytorch Lightning
import pytorch_lightning as pl
from pytorch_lightning import seed_everything , Trainer
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

#Hugging Face
from transformers import AutoModel , AutoConfig , AutoTokenizer ,  AdamW, get_linear_schedule_with_warmup,get_constant_schedule_with_warmup,get_cosine_schedule_with_warmup

In [2]:
# Load every competition file by its explicit name. Relying on the position of
# a file inside a sorted glob silently swaps the frames as soon as another CSV
# appears in the input directory.
DATA_DIR = "../input/commonlitreadabilityprize"

# Kept for reference / backward compatibility with cells that inspect `paths`.
paths = sorted(glob(os.path.join(DATA_DIR, "*.csv")))

df_ss = pd.read_csv(os.path.join(DATA_DIR, "sample_submission.csv"))
df_test = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))
df_train = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))

# Configs

In [3]:
class config:
    """Namespace of notebook-wide settings; read as `config.<name>`, never instantiated."""

    # --- reproducibility / hardware ---
    seed = 123
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # --- backbone ---
    transformer_name = "roberta-base"
    transformer_path = "../input/huggingface-roberta/roberta-base"
    max_len = 250          # token length every excerpt is padded / truncated to

    # --- optimisation ---
    batch_size = 16
    epochs = 20
    learning_rate = 2e-5

    # --- outputs ---
    save_dir = "./result"

# Apply the configured seed through Lightning's seed_everything helper
# (the value echoed as the cell output is the seed that was applied).
seed_everything(config.seed)

123

In [4]:
# Create the output directory if needed; exist_ok makes the cell safe to re-run
# and avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(config.save_dir, exist_ok=True)

# Dataset

In [5]:
class CLRDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one excerpt per item.

    Args:
        name: Tokenizer identifier or local path handed to
            ``AutoTokenizer.from_pretrained``.
        dataset: Table with an ``'excerpt'`` column and, optionally, a
            ``'target'`` column (column access must return an object with
            ``to_numpy()``).

    Each item is a dict with ``'input_ids'`` and ``'attention_mask'`` (long
    tensors of length ``config.max_len``). When the table has a ``'target'``
    column the item also carries ``'target'``, a float tensor of shape ``(1,)``
    so that a collated batch lines up with the ``(batch, 1)`` regression output
    that the training / validation steps compare it against.
    """

    def __init__(self, name , dataset):
        self.tokenizer = AutoTokenizer.from_pretrained(name)
        self.max_len = config.max_len

        self.excerpt = dataset['excerpt'].to_numpy()
        # Labels exist only for labelled splits; the test split has none.
        self.targets = dataset['target'].to_numpy() if 'target' in dataset else None

    def __len__(self):
        return len(self.excerpt)

    def __getitem__(self , idx):
        text = self.excerpt[idx]
        # Pad / truncate to a fixed length so default collation can stack items.
        tokenized_text = self.tokenizer(text, truncation = True , padding = "max_length" , max_length= self.max_len )

        item = {'input_ids': torch.tensor(tokenized_text['input_ids'], dtype = torch.long),
                'attention_mask' : torch.tensor(tokenized_text['attention_mask'] , dtype = torch.long)}

        if self.targets is not None:
            item['target'] = torch.tensor([self.targets[idx]], dtype = torch.float)

        return item

In [6]:
# Settings shared by both loaders.
loader_kwargs = dict(batch_size=config.batch_size, num_workers=4, pin_memory=False)

# Training split: shuffled every epoch.
train_data = df_train.loc[:]
train = CLRDataset(config.transformer_path, train_data)
train_dataloader = DataLoader(train, shuffle=True, **loader_kwargs)

# Test split: order must be preserved so predictions line up with the rows.
test_data = df_test.loc[:]
test = CLRDataset(config.transformer_path, test_data)
test_dataloader = DataLoader(test, shuffle=False, **loader_kwargs)

# Model

In [7]:
def loss_function(output,target):
    """Root-mean-squared error between `output` and `target` (mean over all elements)."""
    mean_squared_error = F.mse_loss(output, target)
    return mean_squared_error.sqrt()

In [8]:
class AttentionHead(nn.Module):
    """Scores each position of a 768-wide sequence and normalises the scores over dim 1.

    Input of shape (batch, seq, 768) yields weights of shape (batch, seq, 1)
    that sum to one along the sequence axis.
    """

    def __init__(self):
        super().__init__()
        # Attribute names and registration order are part of the checkpoint
        # layout (state-dict keys), so they are kept exactly as they were.
        self.linear = nn.Linear(in_features=768, out_features=512)
        self.tanh = nn.Tanh()
        self.linear2 = nn.Linear(in_features=512, out_features=1)
        self.softmax = nn.Softmax(dim=1)

    def forward(self,input_tensors):
        # project -> squash -> one scalar score per position -> normalise
        scores = self.linear2(self.tanh(self.linear(input_tensors)))
        return self.softmax(scores)

class TransformerModel(nn.Module):
    """Pretrained transformer backbone that returns its last hidden-state tensor."""

    def __init__(self,name):
        super().__init__()
        # Overrides applied on top of the pretrained configuration.
        overrides = {
            "output_hidden_states": True,
            "hidden_dropout_prob": 0.0,
            "layer_norm_eps": 1e-7,
        }
        self.transformer_config = AutoConfig.from_pretrained(name)
        self.transformer_config.update(overrides)
        self.transformer_model = AutoModel.from_pretrained(name, config=self.transformer_config)

    def forward(self,input_ids, attention_mask):
        outputs = self.transformer_model(input_ids=input_ids, attention_mask=attention_mask)
        # hidden_states holds one entry per layer; the final entry is the top layer.
        return outputs.hidden_states[-1]

class RegressionHead(nn.Module):
    """Single linear layer mapping a 768-wide vector to one scalar prediction."""

    def __init__(self):
        super().__init__()
        # Attribute name is a state-dict key of saved checkpoints; keep it.
        self.linear = nn.Linear(in_features=768, out_features=1)

    def forward(self,input_tensors):
        return self.linear(input_tensors)

class CLRModel(pl.LightningModule):
    """Lightning module: transformer backbone -> attention pooling -> linear regressor.

    Args:
        name: Identifier / path of the pretrained transformer passed to
            ``TransformerModel``.
        train: Dataloader returned by ``train_dataloader()``.
        validation: Dataloader returned by both ``val_dataloader()`` and
            ``test_dataloader()``.
    """

    def __init__(self,name , train , validation):
        super().__init__()
        self._train_dataloader = train
        self._val_dataloader = validation
        self._test_dataloader = validation
        self.name = name
        # Sub-module attribute names are state-dict key prefixes of the saved
        # checkpoints, so names and registration order are left untouched.
        self.transformer_model = TransformerModel(self.name)
        self.regression_head = RegressionHead()
        self.attention_head = AttentionHead()
        self.predictions = []
        # Stores the constructor arguments in the checkpoint; the inference cell
        # relies on this to call load_from_checkpoint(path) without arguments.
        self.save_hyperparameters()

    def forward(self,input_ids , attention_mask):
        """Return one prediction per sequence, shape (batch, 1)."""
        last_layer_hidden_states = self.transformer_model(input_ids , attention_mask)
        # NOTE(review): the pooling weights are not masked, so padded positions
        # also contribute. Left as is because the existing checkpoints were
        # presumably trained with this exact forward pass - confirm before changing.
        weights = self.attention_head(last_layer_hidden_states)
        context_vector = torch.sum(weights * last_layer_hidden_states, dim=1)
        output = self.regression_head(context_vector)

        return output

    def _compute_loss(self, batch):
        """Run the model on `batch` and return the RMSE against batch['target']."""
        output = self.forward(batch['input_ids'], batch['attention_mask'])
        target = batch['target'].to(output.dtype)
        # Flatten both sides: a (batch, 1) prediction against a (batch,) target
        # would otherwise broadcast to (batch, batch) and yield a wrong loss.
        return loss_function(output.reshape(-1), target.reshape(-1))

    def training_step(self,batch,batch_idx):
        loss = self._compute_loss(batch)
        self.log('train_loss', loss , prog_bar=True)

        return {'loss': loss}

    def training_epoch_end(self, outputs):
        """Print the mean training loss of the finished epoch.

        This was previously named ``train_epoch_end``, which is not a Lightning
        hook and therefore never ran. Returns nothing, as Lightning requires
        for this hook.
        """
        avg_loss = torch.stack([x['loss'] for x in outputs]).mean()
        # self.current_epoch replaces the former lookup of a global `trainer`.
        print(f'epoch {self.current_epoch} training loss {avg_loss}')

    # Old (never-invoked) name kept so any direct reference keeps resolving.
    train_epoch_end = training_epoch_end

    def validation_step(self,batch,batch_idx):
        loss = self._compute_loss(batch)
        self.log('val_loss', loss, prog_bar=True)

        return {'val_loss': loss}

    def validation_epoch_end(self, outputs):
        """Print and return the mean validation loss of the finished epoch."""
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        print(f'epoch {self.current_epoch} validation loss {avg_loss}')
        return {'val_loss': avg_loss}

    def test_step(self, batch,batch_idx):
        """Append this batch's predictions to self.predictions and return that list."""
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']

        output = self.forward(input_ids , attention_mask)
        self.predictions.append(output)
        return self.predictions

    def train_dataloader(self):
        return self._train_dataloader

    def val_dataloader(self):
        return self._val_dataloader

    def test_dataloader(self):
        return self._test_dataloader

    def configure_optimizers(self):
        """Build AdamW with per-tensor learning rates and a cosine schedule.

        Backbone tensors get lr 2e-5 / 5e-5 / 1e-4 depending on their position
        in the backbone's parameter list, and weight decay 0.01 (0.0 when the
        name contains "bias"). Both heads use the optimizer's defaults.
        """
        # Use this module's own parameters instead of a global `model`.
        named_parameters = list(self.named_parameters())

        # Select by name rather than by positional slices. The former slices
        # ([:197], [199:203], [203:]) skipped the two tensors at indices
        # 197-198 (presumably the backbone's unused pooler for roberta-base -
        # verify if the backbone changes) and split the head tensors without
        # regard to which head they belong to.
        roberta_parameters = [(name, params) for (name, params) in named_parameters
                              if name.startswith("transformer_model.") and ".pooler." not in name]
        attention_group = [params for (name, params) in named_parameters
                           if name.startswith("attention_head.")]
        regressor_group = [params for (name, params) in named_parameters
                           if name.startswith("regression_head.")]

        parameters = []
        parameters.append({"params": attention_group})
        parameters.append({"params": regressor_group})

        for layer_num, (name, params) in enumerate(roberta_parameters):
            weight_decay = 0.0 if "bias" in name else 0.01

            # Positional thresholds; for roberta-base they presumably mark the
            # start of encoder layers 4 and 8 - re-derive for other backbones.
            lr = 2e-5

            if layer_num >= 69:
                lr = 5e-5

            if layer_num >= 133:
                lr = 1e-4

            parameters.append({"params": params,
                               "weight_decay": weight_decay,
                               "lr": lr})

        optimizer = AdamW(parameters)
        scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            # Length of this module's own loader, not of a global dataloader.
            num_training_steps= config.epochs * len(self._train_dataloader),
            num_warmup_steps=50)

        # The schedule is sized in optimizer steps (epochs * batches), so it has
        # to advance every batch; Lightning's default interval is once per
        # epoch, which would leave the run inside the 50-step warmup.
        return [optimizer], [{"scheduler": scheduler, "interval": "step"}]
    
    


In [9]:
def predict(data_loader, model):
    """Run `model` over every batch of `data_loader` and collect its outputs.

    Args:
        data_loader: Iterable of dict batches whose values are tensors.
        model: Callable module taking the batch entries as keyword arguments.

    Returns:
        Flat list with one scalar per predicted element, in loader order.
    """
    model.to(config.device)
    model.eval()
    model.zero_grad()

    predictions = []
    # Inference only: without no_grad every batch would build (and keep alive)
    # an autograd graph for the whole backbone.
    with torch.no_grad():
        for batch in tqdm(data_loader):
            # A label, if the loader carries one, is not a model input.
            inputs = {key:val.reshape(val.shape[0], -1).to(config.device)
                      for key,val in batch.items() if key != 'target'}
            outputs = model(**inputs)
            predictions.extend(outputs.detach().cpu().numpy().ravel())

    return predictions

In [10]:
model_weight = sorted(glob("../input/robertaweightspytorch/result/*.ckpt"))

# Fail loudly: with no checkpoints the mean below would be NaN and the
# submission would silently be filled with NaN.
if not model_weight:
    raise FileNotFoundError("No checkpoints found matching ../input/robertaweightspytorch/result/*.ckpt")

fold_predictions = []
for fold, path in enumerate(model_weight, start=1):
    model = CLRModel.load_from_checkpoint(path)
    print(f'*** fold {fold}: {path} ***')
    y_pred = predict(test_dataloader, model)
    fold_predictions.append(y_pred)

    # Free memory before the next fold's weights are loaded
    del model
    gc.collect()

# Ensemble: average the per-fold predictions row by row.
predictions = np.mean(fold_predictions, axis=0)

Some weights of the model checkpoint at ../input/huggingface-roberta/roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


*** fold 1: ../input/robertaweightspytorch/result/fold_1.ckpt ***


100%|██████████| 1/1 [00:01<00:00,  1.14s/it]
Some weights of the model checkpoint at ../input/huggingface-roberta/roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  0%|          | 0/1 [00:00<?, ?it/s]

*** fold 2: ../input/robertaweightspytorch/result/fold_2.ckpt ***


100%|██████████| 1/1 [00:00<00:00,  3.24it/s]
Some weights of the model checkpoint at ../input/huggingface-roberta/roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  0%|          | 0/1 [00:00<?, ?it/s]

*** fold 3: ../input/robertaweightspytorch/result/fold_3.ckpt ***


100%|██████████| 1/1 [00:00<00:00,  2.93it/s]
Some weights of the model checkpoint at ../input/huggingface-roberta/roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  0%|          | 0/1 [00:00<?, ?it/s]

*** fold 4: ../input/robertaweightspytorch/result/fold_4.ckpt ***


100%|██████████| 1/1 [00:00<00:00,  2.08it/s]
Some weights of the model checkpoint at ../input/huggingface-roberta/roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  0%|          | 0/1 [00:00<?, ?it/s]

*** fold 5: ../input/robertaweightspytorch/result/fold_5.ckpt ***


100%|██████████| 1/1 [00:00<00:00,  2.92it/s]


In [11]:
# Overwrite the placeholder targets of the sample submission with the
# fold-averaged predictions, write the file, and preview the first rows.
submission_path = 'submission.csv'
df_ss = df_ss.assign(target=predictions)
df_ss.to_csv(submission_path, index=False)
df_ss.head()

Unnamed: 0,id,target
0,c0f722661,-0.314052
1,f0953f0a5,-0.507427
2,0df072751,-0.495707
3,04caf4e0c,-2.372509
4,0e63f8bea,-1.79546
