In [1]:
import os
from accelerate import Accelerator, notebook_launcher
from accelerate.utils import set_seed
from tqdm.auto import tqdm
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
import torch
from sklearn.model_selection import StratifiedKFold
import time
import gc

# import warnings
# warnings.filterwarnings("ignore")
#torch.multiprocessing.set_start_method('spawn')
# Deterministic cuDNN: disable the autotuner and request deterministic kernels.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
# The original `CUDA_LAUNCH_BLOCKING=1` only bound a Python variable and had no
# effect. The CUDA runtime reads this from the process environment, so it must
# be exported there (before CUDA is initialised) to take effect. Note that it
# forces synchronous kernel launches: useful for debugging, slower for training.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [2]:
# Directory that receives notebook artefacts (e.g. the saved tokenizer).
OUTPUT_DIR = './'
# Create it when missing; an already existing directory is left untouched.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# CFG and Set Seed

In [3]:
def seed_everything(seed: int):
    """Seed every RNG used by the notebook and force deterministic cuDNN.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and configures cuDNN for determinism.

    Args:
        seed: Integer seed applied to all generators.
    """
    import random  # not imported at notebook level, so kept function-local

    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one (training uses 2 GPUs).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The autotuner may pick different kernels between runs, which defeats
    # `deterministic = True`; it must stay off for reproducible results.
    torch.backends.cudnn.benchmark = False


seed_everything(seed=42)

In [4]:
class cfg:
    """Static configuration namespace for the fine-tuning notebook.

    Values are read as plain class attributes (``cfg.batch_size`` etc.); the
    class is not instantiated in the visible cells.
    """
    # --- model selection -------------------------------------------------
    select = 'base'
    model_name = f'/kaggle/input/deberta-v3-{select}/deberta-v3-{select}'
    only_model_name = f'deberta-v3-{select}'

    # --- training loop ---------------------------------------------------
    accum_iter = 8          # batches accumulated per optimizer step
    fold = 4                # number of StratifiedKFold splits
    split = 5               # NOTE(review): not referenced in the visible cells
    seed = 42
    batch_size = 4
    max_len = 1024          # tokenizer max_length / padding length
    num_epoch = 4
    T_max= 1334             # NOTE(review): not referenced in the visible cells
    hidden_dropout_prob=0.005
    attention_probs_dropout_prob=0.005

    # --- optimisation ----------------------------------------------------
    scheduler = 'CosineAnnealingLR'
    min_lr = 1e-6
    freezing = False
    pooling = 'GemText'
    # `weight_decay` used to be assigned twice (1e-6, then 1e-2). Only the
    # later value was ever visible, so the dead first assignment was removed.
    weight_decay = 1e-2
    encoder_lr = 1e-5
    decoder_lr = 1e-5
    eps = 1e-6
    betas = (0.9, 0.999)
    
def MCRMSE(y_trues, y_preds):
    """Mean column-wise root mean squared error.

    Implemented with NumPy: the previous version called
    ``mean_squared_error``, which this notebook never imports, with the
    ``squared=False`` argument that newer scikit-learn releases removed.

    Args:
        y_trues: Array-like of shape (n_samples, n_targets) with ground truth.
        y_preds: Array-like of the same shape with predictions.

    Returns:
        Tuple ``(mcrmse_score, scores)`` where ``scores`` is a list with one
        RMSE per target column and ``mcrmse_score`` is their mean.

    Raises:
        ValueError: If the inputs are not 2-D or their shapes differ.
    """
    y_trues = np.asarray(y_trues, dtype=float)
    y_preds = np.asarray(y_preds, dtype=float)
    if y_trues.ndim != 2 or y_trues.shape != y_preds.shape:
        raise ValueError(
            f"expected two 2-D arrays of equal shape, got {y_trues.shape} and {y_preds.shape}"
        )
    # RMSE per target column (axis 0 runs over samples).
    scores = np.sqrt(np.mean((y_trues - y_preds) ** 2, axis=0)).tolist()
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores

def score_loss(y_trues, y_preds):
    """Bundle the MCRMSE result into a dict of named scores.

    Column 0 of the per-target scores is reported as ``Content_score`` and
    column 1 as ``Wording_score``.
    """
    overall, per_target = MCRMSE(y_trues, y_preds)
    report = {'mcrmse_score': overall}
    report['Content_score'] = per_target[0]
    report['Wording_score'] = per_target[1]
    return report

# Data Loading

In [5]:
# Competition CSVs: prompt metadata and student summaries, train and test.
prompts_train = pd.read_csv(
    filepath_or_buffer='/kaggle/input/commonlit-evaluate-student-summaries/prompts_train.csv'
)
prompts_test = pd.read_csv(
    filepath_or_buffer='/kaggle/input/commonlit-evaluate-student-summaries/prompts_test.csv'
)
summary_train = pd.read_csv(
    filepath_or_buffer='/kaggle/input/commonlit-evaluate-student-summaries/summaries_train.csv'
)
summary_test = pd.read_csv(
    filepath_or_buffer='/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv'
)

# Attach the prompt columns to every training summary via the shared key.
train = pd.merge(prompts_train, summary_train, on="prompt_id")

In [6]:
# Start with a sentinel so that any row missed by the splitter is visible.
train['fold'] = -1
fold = StratifiedKFold(shuffle=True, random_state=cfg.seed, n_splits=cfg.fold)
# Stratify on prompt_id; each row is stamped with the fold it validates in.
fold_splits = fold.split(train, train['prompt_id'])
for n, (train_index, val_index) in enumerate(fold_splits):
    train.loc[val_index, 'fold'] = n
train['fold'] = train['fold'].astype(int)

# Tokenizer load

In [7]:
# Load the tokenizer matching the checkpoint, keep a copy next to the outputs,
# and expose it through cfg so the dataset classes can pick it up.
tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)
tokenizer.save_pretrained(os.path.join(OUTPUT_DIR, 'tokenizer/'))
setattr(cfg, 'tokenizer', tokenizer)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Custom Dataset

In [8]:
class ContentDataset(torch.utils.data.Dataset):
    """Dataset yielding tokenized summaries with the ``content`` score as target.

    Each item is built from ``prompt_title``, ``prompt_question`` and ``text``
    joined by the tokenizer's separator token, then padded/truncated to
    ``max_len`` tokens.

    Args:
        df: DataFrame with columns ``prompt_text``, ``prompt_question``,
            ``prompt_title``, ``text`` and ``content``.
        tokenizer: Optional tokenizer; defaults to ``cfg.tokenizer``.
        max_len: Optional sequence length; defaults to ``cfg.max_len``.
    """

    def __init__(self, df, tokenizer=None, max_len=None):
        self.df = df
        # Fall back to the notebook-wide configuration when not overridden.
        self.tokenizer = cfg.tokenizer if tokenizer is None else tokenizer
        self.max_len = cfg.max_len if max_len is None else max_len
        self.fp = df['prompt_text'].values  # kept as an attribute; not used for the model input
        self.pq = df['prompt_question'].values
        self.title = df['prompt_title'].values
        self.text = df['text'].values
        self.targets = df['content'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return ``(features, target)`` for row ``index``.

        ``features`` is a dict with ``input_ids`` and ``attention_mask`` as
        long tensors; ``target`` is a float tensor holding the score.
        """
        sep = self.tokenizer.sep_token
        full_text = self.title[index] + sep + self.pq[index] + sep + self.text[index]

        # Calling the tokenizer directly replaces the deprecated `encode_plus`.
        inputs = self.tokenizer(
            full_text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
        )

        features = {
            'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
        }
        return features, torch.tensor(self.targets[index], dtype=torch.float)

def collate(inputs):
    """Trim every tensor in the batch to the longest non-padded sequence.

    The length is the largest row sum of ``attention_mask``. The dict is
    modified in place and also returned.
    """
    longest = int(inputs["attention_mask"].sum(axis=1).max())
    for key in list(inputs.keys()):
        inputs[key] = inputs[key][:, :longest]
    return inputs

class WordingDataset(torch.utils.data.Dataset):
    """Dataset yielding tokenized summaries with the ``wording`` score as target.

    Each item is built from ``prompt_title``, ``prompt_question`` and ``text``
    joined by the tokenizer's separator token, then padded/truncated to
    ``max_len`` tokens.

    Args:
        df: DataFrame with columns ``prompt_text``, ``prompt_question``,
            ``prompt_title``, ``text`` and ``wording``.
        tokenizer: Optional tokenizer; defaults to ``cfg.tokenizer``.
        max_len: Optional sequence length; defaults to ``cfg.max_len``.
    """

    def __init__(self, df, tokenizer=None, max_len=None):
        self.df = df
        # Fall back to the notebook-wide configuration when not overridden.
        self.tokenizer = cfg.tokenizer if tokenizer is None else tokenizer
        self.max_len = cfg.max_len if max_len is None else max_len
        self.fp = df['prompt_text'].values  # kept as an attribute; not used for the model input
        self.pq = df['prompt_question'].values
        self.title = df['prompt_title'].values
        self.text = df['text'].values
        self.targets = df['wording'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return ``(features, target)`` for row ``index``.

        ``features`` is a dict with ``input_ids`` and ``attention_mask`` as
        long tensors; ``target`` is a float tensor holding the score.
        """
        sep = self.tokenizer.sep_token
        full_text = self.title[index] + sep + self.pq[index] + sep + self.text[index]

        # Calling the tokenizer directly replaces the deprecated `encode_plus`.
        inputs = self.tokenizer(
            full_text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
        )

        features = {
            'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
        }
        return features, torch.tensor(self.targets[index], dtype=torch.float)

def collate(inputs):
    """Cut batch tensors down to the longest attended length, in place.

    The cut-off is the maximum per-row sum of ``attention_mask``; the same
    dict object that was passed in is returned.
    """
    keep = int(inputs["attention_mask"].sum(axis=1).max())
    inputs.update({name: tensor[:, :keep] for name, tensor in inputs.items()})
    return inputs

# Define Train Function

In [9]:
def train_run(model, criterion, optimizer, dataloader, accelerator):
    """Train ``model`` for one epoch with manual gradient accumulation.

    Args:
        model: Model whose output exposes ``.logits``.
        criterion: Loss callable taking ``(predictions, targets)``.
        optimizer: Optimizer stepped every ``cfg.accum_iter`` batches and on
            the final batch.
        dataloader: Iterable of ``(features_dict, labels)`` batches.
        accelerator: Object providing ``backward(loss)``.

    Returns:
        Sample-weighted mean of the per-batch loss over the epoch.
    """
    gc.collect()
    model.train()
    running_loss = 0.0
    dataset_size = 0.0
    num_batches = len(dataloader)
    bar = tqdm(enumerate(dataloader), total=num_batches)
    optimizer.zero_grad()
    for batch_idx, (data, labels) in bar:
        inputs = collate(data)

        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']

        batch_size = input_ids.size(0)

        outputs = model(input_ids, attention_mask)

        # With a single regression label the logits are (batch, 1) while the
        # targets are (batch,). Without the squeeze the loss broadcasts to a
        # (batch, batch) grid and compares every prediction with every target.
        logits = outputs.logits.squeeze(-1)
        loss = criterion(logits, labels)

        # Scale only the tensor that is back-propagated; the logged value
        # below keeps the true per-batch loss.
        # NOTE(review): Accelerator.backward additionally divides by the
        # accelerator's own gradient_accumulation_steps, so that setting
        # should stay at 1 when accumulation is handled here.
        accelerator.backward(loss / cfg.accum_iter)
        if ((batch_idx + 1) % cfg.accum_iter == 0) or (batch_idx + 1 == num_batches):
            optimizer.step()
            optimizer.zero_grad()
        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size
    epoch_loss = running_loss/dataset_size

    return epoch_loss

# Define Validation Function

In [10]:
@torch.no_grad()
def valid_run(model , criterion, dataloader, accelerator):
    """Evaluate ``model`` on ``dataloader`` without gradient tracking.

    Args:
        model: Model whose output exposes ``.logits``.
        criterion: Loss callable taking ``(predictions, targets)``.
        dataloader: Iterable of ``(features_dict, labels)`` batches.
        accelerator: Object providing ``gather_for_metrics``.

    Returns:
        Tuple ``(epoch_loss, predictions, y_labels)``: the sample-weighted
        mean loss, plus the gathered predictions and labels as NumPy arrays
        concatenated over batches (1-D when the model emits a single logit
        per sample).
    """
    model.eval()

    running_loss = 0.0
    dataset_size = 0.0

    predictions = []
    y_labels = []

    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for batch_idx, (data , labels) in bar:
        inputs = collate(data)
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']

        outputs = model(input_ids, attention_mask)
        # The model returns an output object; the tensor lives in `.logits`.
        # Squeeze the trailing label axis so it lines up with (batch,) targets
        # instead of broadcasting inside the loss.
        logits = outputs.logits.squeeze(-1)
        logits, targets = accelerator.gather_for_metrics((logits, labels))

        loss = criterion(logits, targets)

        # Weight by the gathered batch size, since the loss covers all of it.
        batch_size = targets.size(0)
        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size

        predictions.append(logits.detach().to('cpu').numpy())
        # Store the gathered targets so labels stay aligned with predictions.
        y_labels.append(targets.detach().to('cpu').numpy())

    predictions = np.concatenate(predictions)
    y_labels    = np.concatenate(y_labels)
    epoch_loss = running_loss / dataset_size

    return epoch_loss , predictions , y_labels

# Define other utils

In [11]:
def prepare_fold(n_fold):
    """Build the train/valid DataLoaders for one cross-validation fold.

    Rows whose ``fold`` column equals ``n_fold`` form the validation set; all
    other rows form the training set.

    Returns:
        ``(train_content_loader, valid_content_loader,
        train_Wording_loader, valid_Wording_loader)``.
    """
    in_valid = train['fold'] == n_fold
    dftrain = train[~in_valid]
    dfvalid = train[in_valid]

    def make_loader(dataset, shuffle):
        # All four loaders share the same settings except for shuffling.
        return torch.utils.data.DataLoader(
            dataset,
            batch_size=cfg.batch_size,
            num_workers=2,
            shuffle=shuffle,
            pin_memory=True,
        )

    train_dataset_content = ContentDataset(dftrain)
    valid_dataset_content = ContentDataset(dfvalid)
    train_content_loader = make_loader(train_dataset_content, shuffle=True)
    valid_content_loader = make_loader(valid_dataset_content, shuffle=False)

    train_dataset_Wording = WordingDataset(dftrain)
    valid_dataset_Wording = WordingDataset(dfvalid)
    train_Wording_loader = make_loader(train_dataset_Wording, shuffle=True)
    valid_Wording_loader = make_loader(valid_dataset_Wording, shuffle=False)

    return train_content_loader, valid_content_loader, train_Wording_loader, valid_Wording_loader

def oof_df(n_fold , true , pred):
    """Combine ground truth and predictions into one out-of-fold DataFrame.

    Args:
        n_fold: Fold identifier; accepted for call compatibility, not used.
        true: Two-column array-like with the content and wording targets.
        pred: Two-column array-like with the corresponding predictions.

    Returns:
        DataFrame with columns ``content``, ``wording``, ``pred_content``
        and ``pred_wording``.
    """
    df_pred = pd.DataFrame(pred, columns=['pred_content', 'pred_wording'])
    df_real = pd.DataFrame(true, columns=['content', 'wording'])

    # `axis` must be passed by keyword: pandas 2.x rejects it positionally.
    df = pd.concat([df_real, df_pred], axis=1)
    return df

def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
    """Split parameters into three optimizer groups.

    Group 0: parameters of ``model.model`` whose name matches none of the
    no-decay tags (``encoder_lr``, ``weight_decay``).
    Group 1: parameters of ``model.model`` whose name matches a no-decay tag
    (``encoder_lr``, no weight decay).
    Group 2: parameters of ``model`` whose full name does not contain
    ``"model"`` (``decoder_lr``, no weight decay).
    """
    no_decay_tags = ("bias", "LayerNorm.bias", "LayerNorm.weight")

    def skips_decay(name):
        return any(tag in name for tag in no_decay_tags)

    decayed, undecayed = [], []
    for name, param in model.model.named_parameters():
        bucket = undecayed if skips_decay(name) else decayed
        bucket.append(param)

    head = [param for name, param in model.named_parameters() if "model" not in name]

    return [
        {'params': decayed, 'lr': encoder_lr, 'weight_decay': weight_decay},
        {'params': undecayed, 'lr': encoder_lr, 'weight_decay': 0.0},
        {'params': head, 'lr': decoder_lr, 'weight_decay': 0.0},
    ]

# Train

In [12]:
def MCRMSE(y_trues, y_preds):
    """Mean column-wise root mean squared error.

    Implemented with NumPy: the previous version called
    ``mean_squared_error``, which this notebook never imports, with the
    ``squared=False`` argument that newer scikit-learn releases removed.

    Args:
        y_trues: Array-like of shape (n_samples, n_targets) with ground truth.
        y_preds: Array-like of the same shape with predictions.

    Returns:
        Tuple ``(mcrmse_score, scores)`` where ``scores`` is a list with one
        RMSE per target column and ``mcrmse_score`` is their mean.

    Raises:
        ValueError: If the inputs are not 2-D or their shapes differ.
    """
    y_trues = np.asarray(y_trues, dtype=float)
    y_preds = np.asarray(y_preds, dtype=float)
    if y_trues.ndim != 2 or y_trues.shape != y_preds.shape:
        raise ValueError(
            f"expected two 2-D arrays of equal shape, got {y_trues.shape} and {y_preds.shape}"
        )
    # RMSE per target column (axis 0 runs over samples).
    scores = np.sqrt(np.mean((y_trues - y_preds) ** 2, axis=0)).tolist()
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores

def score_loss(y_trues, y_preds):
    """Return the MCRMSE and its first two per-target RMSEs as a dict.

    Keys are ``mcrmse_score``, ``Content_score`` (target column 0) and
    ``Wording_score`` (target column 1).
    """
    result = MCRMSE(y_trues, y_preds)
    mean_rmse = result[0]
    column_rmse = result[1]
    return {
        'mcrmse_score': mean_rmse,
        'Content_score': column_rmse[0],
        'Wording_score': column_rmse[1],
    }

In [13]:
def content_loop(mixed_precision='fp16', seed=42, n_fold=0):
    """Fine-tune the content-score regressor on one fold and keep the best epoch.

    Intended to be started through ``notebook_launcher`` (one call per
    process).

    Args:
        mixed_precision: Accepted for call compatibility. It is deliberately
            not forwarded to ``Accelerator`` (the original code had it
            commented out), so training runs in full precision.
        seed: Seed passed to ``accelerate.utils.set_seed``.
        n_fold: Fold whose rows are held out for validation.
    """
    set_seed(seed)

    # Gradient accumulation is done manually inside `train_run`, which already
    # divides the loss by cfg.accum_iter. Passing gradient_accumulation_steps
    # here as well would make Accelerator.backward divide the loss a second
    # time, so the Accelerator is left at its default of 1.
    accelerator = Accelerator()

    # `prepare_fold` takes the fold as `n_fold`; only the content loaders are
    # needed in this loop.
    train_content_loader, valid_content_loader, _, _ = prepare_fold(n_fold)

    model_config = AutoConfig.from_pretrained(cfg.model_name)
    model_config.update({
            "hidden_dropout_prob": cfg.hidden_dropout_prob,
            "attention_probs_dropout_prob": cfg.attention_probs_dropout_prob,
            "num_labels": 1,
            "problem_type": "regression"
        })

    with accelerator.main_process_first():
        model = AutoModelForSequenceClassification.from_pretrained(cfg.model_name, config=model_config)

    optimizer = torch.optim.AdamW(model.parameters(), lr=cfg.encoder_lr, eps=cfg.eps, betas=cfg.betas)

    model, optimizer, train_loader, valid_loader = accelerator.prepare(
        model, optimizer, train_content_loader, valid_content_loader
    )
    criterion = torch.nn.SmoothL1Loss(reduction='mean')
    best_epoch_score = np.inf

    for epoch in range(cfg.num_epoch):
        # Iterate the *prepared* loaders: they place batches on the right
        # device and shard the data across processes.
        train_loss = train_run(model, criterion, optimizer, train_loader, accelerator)
        valid_loss, valid_preds, valid_labels = valid_run(model, criterion, valid_loader, accelerator)

        # RMSE on the validation predictions (replaces the undefined `score`).
        residual = np.asarray(valid_preds).reshape(-1) - np.asarray(valid_labels).reshape(-1)
        valid_rmse = float(np.sqrt(np.mean(residual ** 2)))

        accelerator.print(f'train_loss : {train_loss}, valid_loss : {valid_loss}, valid_rmse : {valid_rmse}')
        if valid_loss < best_epoch_score:
            best_epoch_score = valid_loss
            accelerator.wait_for_everyone()
            # A single writer avoids several processes racing on the same file.
            if accelerator.is_main_process:
                model_for_save = accelerator.unwrap_model(model)
                torch.save(model_for_save.state_dict(),f'/kaggle/working/deberta-v3-base-Fold_{n_fold}.pth')

In [14]:
# Train one model per cross-validation fold. The fold count and seed come from
# cfg so that they cannot drift from the values used to build the splits.
for n_fold in range(cfg.fold):
    print(n_fold)
    notebook_launcher(content_loop, ('fp16', cfg.seed, n_fold), num_processes=2)

0
Launching training on 2 GPUs.
1
Launching training on 2 GPUs.
2
Launching training on 2 GPUs.
3
Launching training on 2 GPUs.
