In [1]:
import numpy as np 
import pandas as pd 
from tqdm import tqdm
import joblib
import gc
import time
import copy
import os
import random

import torch
import torch.nn as nn
from torch.optim import AdamW, RAdam

import transformers
transformers.logging.set_verbosity_error()

# Suppress warnings
import warnings
warnings.filterwarnings("ignore")

from transformers import AutoTokenizer,AutoModel,AutoModelForSequenceClassification,AutoConfig
from transformers import DataCollatorWithPadding

from transformers import (get_cosine_schedule_with_warmup,
                         get_linear_schedule_with_warmup,
                         get_cosine_with_hard_restarts_schedule_with_warmup)
from torch.utils.data import DataLoader
from torch.optim import lr_scheduler

from sklearn import datasets
from sklearn import model_selection

pd.set_option('max_columns',None)

In [2]:
in_=torch.tensor([1.,2.,3.])
in2_=torch.tensor([1.,3.,2.])

t_=torch.tensor([2.,3.,3.])


loss = nn.KLDivLoss(reduction = 'mean')
loss2 = nn.MSELoss()
loss3=nn.SmoothL1Loss()

print(loss(in_, t_),loss(in2_, t_))
print(loss2(in_, t_),loss2(in2_, t_))
print(loss3(in_, t_),loss3(in2_, t_))

tensor(-3.0073) tensor(-3.0073)
tensor(0.6667) tensor(0.6667)
tensor(0.3333) tensor(0.3333)


In [3]:
class CONFIG:
    seed=42
    train_path='../input/fp3-data/train_folds.csv'
    test_path='../input/feedback-prize-english-language-learning/test.csv'
    chkpt='microsoft/deberta-v3-base'
    hf_model='hf_model'
    trainable=True
    max_length=512
    n_fold=5
    epochs=3
    train_batch_size=8
    val_batch_size=16
    device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    n_accumulate= 1
    learning_rate= 1e-5
    weight_decay= 1e-3
    eps= 1e-6
    betas= [0.9, 0.999]
    min_lr= 1e-7
    T_max= 500
    replace_newline='[SEP]'
    

In [4]:
def set_seed(seed=42):
    '''Sets the seed of the entire notebook so results are the same every time we run.
    This is for REPRODUCIBILITY.'''
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed
    os.environ['PYTHONHASHSEED'] = str(seed)
    
set_seed(CONFIG.seed)

In [5]:
df_train=pd.read_csv(CONFIG.train_path)
df_train['full_text']=df_train['full_text'].apply(lambda x:x.strip().lstrip().rstrip())
df_train.head()

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions,kfold
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0,1
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5,0
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5,4
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0,3
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5,1


In [6]:
# def clean_text(text):
#     if CONFIG.replace_newline:
#         text = text.replace('\n\n-\n\n',CONFIG.replace_newline)
#         text = text.replace('\n\n',CONFIG.replace_newline)
#         text=text.replace('\r\n\r\n',CONFIG.replace_newline)
#         text=text.replace('\r\n',CONFIG.replace_newline)
#         text=text.replace('STUDENT_NAME','student')
#         text=text.replace('SCHOOL_NAME','school')
#         text=text.replace('Generic_Name','Paul')
#         text=text.replace('TEACHER_NAME','teacher')
#     return text

# df_train['full_text']=df_train['full_text'].apply(clean_text)

In [7]:
# all_special_tokens = []
# if CONFIG.replace_newline:
#     all_special_tokens.append(CONFIG.replace_newline)
    
tokenizer = AutoTokenizer.from_pretrained(CONFIG.chkpt,
                                          use_fast=True, 
#                                           additional_special_tokens=all_special_tokens, 
                                          return_special_tokens_mask=True)

# tokenizer.model_max_length = CONFIG.max_length
tokenizer.save_pretrained(f'{CONFIG.hf_model}')

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

('hf_model/tokenizer_config.json',
 'hf_model/special_tokens_map.json',
 'hf_model/spm.model',
 'hf_model/added_tokens.json',
 'hf_model/tokenizer.json')

In [8]:
class MakeTorchDataset(torch.utils.data.Dataset):
    def __init__(self,df,tokenizer):
        self.df=df
        self.tokenizer=tokenizer
        self.max_length=CONFIG.max_length
        self.full_text=self.df['full_text'].values
        self.labels=self.df[['cohesion', 'syntax', 'vocabulary','phraseology', 'grammar', 'conventions']].values.tolist()
    def __len__(self):
        return len(self.df)

  
    def __getitem__(self,idx):

        text=self.tokenizer.encode_plus(self.full_text[idx],
                        add_special_tokens=True,
                        truncation=True,
                        padding='max_length',
                        max_length=self.max_length,
                        )

        ids=text['input_ids']
        masks=text['attention_mask']
        masks_sep_cls=[1 if i in [self.tokenizer.cls_token_id,self.tokenizer.sep_token_id] else 0 for i in ids]
        
        return {
            'input_ids':ids,
            'attention_masks':masks,
            'masks_sep_cls':masks_sep_cls,
            'labels':self.labels[idx]
        }


In [9]:
class MeanPooling(torch.nn.Module):
    def __init__(self):
        super(MeanPooling,self).__init__()
    
    def forward(self,last_hidden_state,attention_mask,masks_sep_cls):
        input_mask_expanded=attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

In [10]:
class FeedModel(torch.nn.Module):
    def __init__(self):
        super(FeedModel,self).__init__()
        
        self.CONFIG=CONFIG    
        if self.CONFIG.trainable:
            self.chkpt=self.CONFIG.chkpt
            self.config=AutoConfig.from_pretrained(self.chkpt)
        else:
            self.chkpt=self.CONFIG.hf_model
            self.config=AutoConfig.from_pretrained(self.CONFIG.hf_model)
            
        self.config.hidden_dropout = 0.
        self.config.hidden_dropout_prob = 0.
        self.config.attention_dropout = 0.
        self.config.attention_probs_dropout_prob = 0.
            
#         self.config.max_position_embeddings=self.CONFIG['max_length']
#         self.config.position_biased_input=True #default is true
#         self.config.relative_attention=False#default is false

        self.mean_pooler=MeanPooling()
#         self.weight_layer_pooler=WeightedLayerPooling(num_hidden_layers=self.config.num_hidden_layers,
#                                                      layer_start=6)
#         self.bilstm = nn.LSTM(self.config.hidden_size, (self.config.hidden_size) // 2, num_layers=2, 
#                               dropout=self.config.hidden_dropout_prob, batch_first=True,
#                               bidirectional=True)
        
        self.dropout1=nn.Dropout(0.1)
        self.dropout2=nn.Dropout(0.2)
        self.dropout3=nn.Dropout(0.3)
        self.dropout4=nn.Dropout(0.4)
        self.dropout5=nn.Dropout(0.5)
        
        self.fc=nn.Linear(self.config.hidden_size,6)
        self.model=self.create_model()
        self._init_weights(self.fc)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
    

    def create_model(self):
        if self.CONFIG.trainable:
            model=AutoModel.from_pretrained(self.chkpt,output_hidden_states=True)
#             import torch.utils.checkpoint
#             model.gradient_checkpointing_enable()
            model.save_pretrained(CONFIG.hf_model)
        else:
            model=AutoModel.from_pretrained(CONFIG.hf_model)
        return model

    def forward(self,input_ids,attention_mask,masks_sep_cls):
        
#         # simple CLS
#         transformer_out=self.model(input_ids,attention_mask)
#         transformer_out = transformer_out[0][:, 0, :]
        
        #simple mean pooling
        transformer_out=self.model(input_ids,attention_mask)['last_hidden_state']
        transformer_out=self.mean_pooler(transformer_out,attention_mask,masks_sep_cls)
        
#        #weighted layer mean pooling foloowed by mean pooling
#         transformer_out=self.model(input_ids,attention_mask)['hidden_states']
#         transformer_out=torch.stack(transformer_out)
#         transformer_out=self.weight_layer_pooler(transformer_out)
#         transformer_out=self.mean_pooler(transformer_out,attention_mask)
        
#         transformer_out=self.dropout1(transformer_out)
        logits=self.fc(transformer_out)
        

#         logits1=self.fc(self.dropout1(transformer_out))
#         logits2=self.fc(self.dropout2(transformer_out))
#         logits3=self.fc(self.dropout3(transformer_out))
#         logits4=self.fc(self.dropout4(transformer_out))
#         logits5=self.fc(self.dropout5(transformer_out))
#         logits=(logits1+logits2+logits3+logits4+logits5)/5
        
        return logits

In [11]:
class Collate:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, batch):
        output = dict()
        output["input_ids"] = [sample["input_ids"] for sample in batch]
        output["attention_masks"] = [sample["attention_masks"] for sample in batch]
        output["masks_sep_cls"] = [sample["masks_sep_cls"] for sample in batch]
        output["labels"] = [sample["labels"] for sample in batch]

        # calculate max token length of this batch
        batch_max = max([len(ids) for ids in output["input_ids"]])

        # add padding
        if self.tokenizer.padding_side == "right":
            output["input_ids"] = [s + (batch_max - len(s)) * [self.tokenizer.pad_token_id] for s in output["input_ids"]]
            output["attention_masks"] = [s + (batch_max - len(s)) * [0] for s in output["attention_masks"]]
            output["masks_sep_cls"] = [s + (batch_max - len(s)) * [0] for s in output["masks_sep_cls"]]
        else:
            output["input_ids"] = [(batch_max - len(s)) * [self.tokenizer.pad_token_id] + s for s in output["input_ids"]]
            output["attention_masks"] = [(batch_max - len(s)) * [0] + s for s in output["attention_masks"]]
            output["masks_sep_cls"] = [s + (batch_max - len(s)) * [0] for s in output["masks_sep_cls"]]

        # convert to tensors
        output["input_ids"] = torch.tensor(output["input_ids"], dtype=torch.long)
        output["attention_masks"] = torch.tensor(output["attention_masks"], dtype=torch.long)
        output["masks_sep_cls"] = torch.tensor(output["masks_sep_cls"], dtype=torch.long)
        output["labels"] = torch.tensor(output["labels"], dtype=torch.float)

        return output

In [12]:
collate_fn = Collate(tokenizer)

def get_data_loader(df,fold,tokenizer,train_batch_size,val_batch_size,max_length):
    
    df_train=df[df['kfold']!=fold].reset_index(drop=True)
    df_val=df[df['kfold']==fold].reset_index(drop=True)
    
    train_torch_dataset=MakeTorchDataset(df_train,
                                  tokenizer=tokenizer)

    val_torch_dataset=MakeTorchDataset(df_val,
                                tokenizer=tokenizer)
    
    
    train_loader = DataLoader(train_torch_dataset, 
                            batch_size=train_batch_size,
                            collate_fn=collate_fn, 
                            shuffle=True,
                            drop_last=True,
                           pin_memory=True,
                           num_workers=2)


    val_loader = DataLoader(val_torch_dataset,
                          batch_size=val_batch_size,
                          collate_fn=collate_fn,
                          shuffle=False,  
                         pin_memory=True,
                         num_workers=2,)
    return train_loader,val_loader

In [13]:
t=df_train.head(16).copy()
data=MakeTorchDataset(t,tokenizer)

In [14]:
def train_one_epoch(model,optimizer,scheduler,dataloader,device,epoch,fold):
    model.train()

    dataset_size=0
    running_loss=0
    running_fp_metr=0
    preds=[]
    labels_all=[]
    bar = tqdm(enumerate(dataloader), total=len(dataloader))

    for i,data in bar:
        input_ids=data['input_ids'].to(device)
        attention_masks=data['attention_masks'].to(device)
        masks_sep_cls=data['masks_sep_cls'].to(device)
        labels=data['labels'].to(device)

        batch_size=input_ids.size(0)

        outputs=model(input_ids=input_ids,attention_mask=attention_masks,masks_sep_cls=masks_sep_cls)
        loss=criterion(outputs,labels)
        loss = loss / CONFIG.n_accumulate
        loss.backward()
    
        if (i + 1) % CONFIG.n_accumulate == 0:
            optimizer.step()
            optimizer.zero_grad()

            if scheduler is not None:
                scheduler.step()

        running_loss+=(loss.item()*batch_size)
        dataset_size+=batch_size
        epoch_loss=running_loss/dataset_size
        preds.append(outputs.detach().cpu().numpy())
        labels_all.append(labels.detach().cpu().numpy())
#         fp_metr=monitor_metrics(outputs,labels)
#         running_fp_metr+=(fp_metr.item()*batch_size)
#         epoch_fp_metr=running_fp_metr/dataset_size

        bar.set_postfix(Epoch=epoch,Fold=fold, Train_Loss=epoch_loss,LR=optimizer.param_groups[0]['lr'])
    
    predictions = np.concatenate(preds)   
    labels_all = np.concatenate(labels_all) 
    gc.collect()
    return epoch_loss,predictions,labels_all

In [15]:
@torch.no_grad()
def valid_one_epoch(model,dataloader,device,epoch,fold):

    model.eval()

    running_loss=0
    running_fp_metr=0
    dataset_size=0
    preds=[]
    labels_all=[]
    bar = tqdm(enumerate(dataloader), total=len(dataloader))

    for i, data in bar:

        input_ids=data['input_ids'].to(device)
        attention_masks=data['attention_masks'].to(device)
        masks_sep_cls=data['masks_sep_cls'].to(device)
        labels=data['labels'].to(device)
        batch_size=input_ids.size(0)

        outputs=model(input_ids=input_ids,attention_mask=attention_masks,masks_sep_cls=masks_sep_cls)
        
        loss=criterion(outputs,labels)
        running_loss+=(loss.item()*batch_size)
        dataset_size+=batch_size
        epoch_loss=running_loss/dataset_size
        
        preds.append(outputs.detach().cpu().numpy())
        labels_all.append(labels.detach().cpu().numpy())
#         fp_metr=monitor_metrics(outputs,labels)
#         running_fp_metr+=(fp_metr.item()*batch_size)
#         epoch_fp_metr=running_fp_metr/dataset_size
        
        bar.set_postfix(Epoch=epoch,Folf=fold, Val_Loss=epoch_loss)
    gc.collect()
    predictions = np.concatenate(preds)
    labels_all = np.concatenate(labels_all)
    return epoch_loss,predictions,labels_all

In [16]:
import copy
def start_training(nepochs,fold,model,optimizer,scheduler,train_loader,val_loader,device):
    best_val_loss=np.inf
    patience=0
    best_model_wts = copy.deepcopy(model.state_dict())
    for epoch in range(nepochs):
        start=time.time()
        gc.collect()

        train_epoch_loss,preds_tr,labels_tr=train_one_epoch(model,
                                      optimizer=optimizer,
                                      scheduler=scheduler,
                                      dataloader=train_loader,
                                      device=device,
                                      epoch=epoch,
                                     fold=fold)

        val_epoch_loss,preds_va,labels_va=valid_one_epoch(model,
                                  dataloader=val_loader,
                                  device=device,
                                  epoch=epoch,
                                  fold=fold)
        end=time.time()
        
#         train_labels=train_loader['labels']
#         val_labels=val_loader['labels']
        
        fp_metric_tr=get_score(preds_tr, labels_tr)
        fp_metric_va=get_score(preds_va, labels_va)
        
        print(f"Fold {fold} Epoch {epoch}: \
        train_loss {train_epoch_loss:.6f} \
        val_loss {val_epoch_loss:.6f}  \
        fp_metric_tr {np.round(fp_metric_tr,4)} \
        fp_metric_va  {np.round(fp_metric_va,4)} \
        time {(end-start):.1f} seconds")

        if val_epoch_loss<=best_val_loss:
            print(f"Validation Loss Improved ({best_val_loss} ---> {val_epoch_loss})")
            best_val_loss=val_epoch_loss
            PATH = f"{CONFIG.epochs}-{fold}.bin"
            best_model_wts = copy.deepcopy(model.state_dict())
            torch.save(model.state_dict(), PATH)
            print(f"Model Saved")
        else:
            patience+=1

        print()
    
     # load best model weights
    model.load_state_dict(best_model_wts)
    
    return model

In [17]:
def get_optimizer_scheduler(model,num_training_steps):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {
                "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
                "weight_decay": CONFIG.weight_decay,
            },
            {
                "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        
        
        opt = AdamW(optimizer_parameters,
                    lr=CONFIG.learning_rate,
                    weight_decay=CONFIG.weight_decay,
#                     betas=CONFIG.betas,
#                     eps=CONFIG.eps
                   )
    
        
        opt2=RAdam(optimizer_parameters,
                   lr=CONFIG.learning_rate,
                    betas=CONFIG.betas,
                    eps=CONFIG.eps)
        
        sch1 = get_linear_schedule_with_warmup(
            opt,
            num_training_steps=num_training_steps,
            num_warmup_steps=0.1*num_training_steps,
            last_epoch=-1,
        )
        
        sch2 = get_cosine_schedule_with_warmup(
            opt,
            num_warmup_steps=0.1*num_training_steps,#0.1*num_training_steps
            num_training_steps=num_training_steps,
            num_cycles=0.5,
            last_epoch=-1,
        )
        
        sch3= get_cosine_with_hard_restarts_schedule_with_warmup(
            opt,
            num_warmup_steps=0,
            num_training_steps=num_training_steps,
            num_cycles=3,
            last_epoch=-1
        )
        
        sch4 = lr_scheduler.CosineAnnealingLR(opt,
                                            T_max=CONFIG.T_max, 
                                            eta_min=CONFIG.min_lr)
        
        return opt, sch1

In [18]:
from sklearn import metrics
def criterion(outputs, targets):
#     loss_fct = nn.MSELoss()
    loss_fct=nn.SmoothL1Loss(reduction='mean', beta=1)#slight better lb score due to  beta=0.51
    loss = loss_fct(outputs, targets)
    return loss

def monitor_metrics(outputs, targets):
#     device = targets.get_device()
#     outputs = outputs.detach().cpu().numpy()
#     targets = targets.detach().cpu().numpy()
    mcrmse = []
    for i in range(6):
        mcrmse.append(
            metrics.mean_squared_error(
                targets[:, i],
                outputs[:, i],
                squared=False,
            ),
        )
        
    return np.mean(mcrmse)

def get_score(outputs, targets):
    mcrmse_score = monitor_metrics(outputs, targets)
    return mcrmse_score

In [19]:
temp_df=df_train.sample(n=300).copy()

In [None]:
device=CONFIG.device

for fold in range(5):

    train_loader,val_loader=get_data_loader(df_train,#temp_df
                                            fold=fold,
                                            tokenizer=tokenizer,
                                            train_batch_size=CONFIG.train_batch_size,
                                            val_batch_size=CONFIG.val_batch_size,
                                            max_length=CONFIG.max_length)
    model=FeedModel()
    
    
    
    num_training_steps = int(CONFIG.epochs * len(train_loader)/ CONFIG.n_accumulate)
    optimizer,scheduler=get_optimizer_scheduler(model,num_training_steps)
    
    model=model.to(device)
 
    model=start_training(CONFIG.epochs,
                   fold,
                   model,
                   optimizer,
                   scheduler,
                   train_loader,val_loader,
                   device)
    
    del model
    gc.collect()
    
    break

Downloading:   0%|          | 0.00/354M [00:00<?, ?B/s]

100%|██████████| 391/391 [04:22<00:00,  1.49it/s, Epoch=0, Fold=0, LR=7.41e-6, Train_Loss=0.419]
100%|██████████| 49/49 [00:21<00:00,  2.31it/s, Epoch=0, Folf=0, Val_Loss=0.135]


Fold 0 Epoch 0:         train_loss 0.418695         val_loss 0.135280          fp_metric_tr 1.1304999589920044         fp_metric_va  0.5216000080108643         time 284.8 seconds
Validation Loss Improved (inf ---> 0.13527973628867312)
Model Saved



100%|██████████| 391/391 [04:21<00:00,  1.50it/s, Epoch=1, Fold=0, LR=3.7e-6, Train_Loss=0.109] 
100%|██████████| 49/49 [00:21<00:00,  2.31it/s, Epoch=1, Folf=0, Val_Loss=0.166]


Fold 0 Epoch 1:         train_loss 0.109352         val_loss 0.166114          fp_metric_tr 0.46869999170303345         fp_metric_va  0.5810999870300293         time 283.3 seconds



100%|██████████| 391/391 [04:21<00:00,  1.50it/s, Epoch=2, Fold=0, LR=0, Train_Loss=0.1]         
100%|██████████| 49/49 [00:21<00:00,  2.31it/s, Epoch=2, Folf=0, Val_Loss=0.149]


Fold 0 Epoch 2:         train_loss 0.100215         val_loss 0.148652          fp_metric_tr 0.448199987411499         fp_metric_va  0.5480999946594238         time 283.2 seconds



100%|██████████| 391/391 [04:21<00:00,  1.50it/s, Epoch=0, Fold=1, LR=7.41e-6, Train_Loss=0.408]
100%|██████████| 49/49 [00:21<00:00,  2.31it/s, Epoch=0, Folf=1, Val_Loss=0.21] 


Fold 1 Epoch 0:         train_loss 0.408492         val_loss 0.209863          fp_metric_tr 1.111899971961975         fp_metric_va  0.6536999940872192         time 283.1 seconds
Validation Loss Improved (inf ---> 0.2098631973391444)
Model Saved



 95%|█████████▌| 372/391 [04:09<00:12,  1.50it/s, Epoch=1, Fold=1, LR=3.87e-6, Train_Loss=0.112]

In [None]:
#0.113 2nd valdata