In [1]:
# Install the offline wheels shipped as Kaggle datasets. `%pip` (rather than
# `!pip`) guarantees the packages land in the environment of the running kernel.
%pip install /kaggle/input/bitsandbytes/bitsandbytes-0.41.1-py3-none-any.whl
%pip install /kaggle/input/peft-whl/peft-0.4.0-py3-none-any.whl

Processing /kaggle/input/bitsandbytes/bitsandbytes-0.41.1-py3-none-any.whl
Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.41.1
Processing /kaggle/input/peft-whl/peft-0.4.0-py3-none-any.whl
Installing collected packages: peft
Successfully installed peft-0.4.0


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from tqdm import tqdm, trange
import bitsandbytes
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
import warnings
import torch
from transformers import AutoModel, AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
import os
import gc
warnings.filterwarnings('ignore')

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [3]:
def seed_everything(seed: int):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's legacy global RNG and PyTorch (CPU and
    all CUDA devices), exports ``PYTHONHASHSEED`` and configures cuDNN for
    deterministic behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    import random, os
    import numpy as np
    import torch

    random.seed(seed)
    # Only affects Python processes started after this point (e.g. workers).
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one; a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN auto-tune and pick different kernels from run
    # to run, which defeats the deterministic flag above, so it must be off.
    torch.backends.cudnn.benchmark = False


seed_everything(seed=42)

In [4]:
# Directory for notebook artefacts; created if missing. `exist_ok=True` makes
# the cell idempotent without a separate (racy) existence check.
OUTPUT_DIR = './'
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [5]:
class cfg:
    """Static experiment configuration, read as class attributes (never instantiated)."""

    # --- backbone -----------------------------------------------------------
    select = 'large'                                    # DeBERTa-v3 size tag
    model_name = f'/kaggle/input/deberta-v3-{select}-hf-weights'  # local weights dir
    only_model_name = f'deberta-v3-{select}'            # short name, not referenced in the visible cells

    # --- data ---------------------------------------------------------------
    max_len = 512       # tokenizer max_length used by CustomDataset
    batch_size = 8      # DataLoader batch size for both train and validation

    # --- optimisation -------------------------------------------------------
    num_epoch = 5       # epochs run by train_with_MP
    accum_iter = 4      # batches accumulated before each optimizer step
    lr = 1e-4           # only referenced by the commented-out optimizer cell
    hidden_dropout_prob = 0.005             # only referenced by commented-out config cells
    attention_probs_dropout_prob = 0.005    # only referenced by commented-out config cells

    # --- bookkeeping --------------------------------------------------------
    fold = 4            # not referenced in the visible cells
    seed = 42           # not referenced in the visible cells (seed_everything is called with a literal)


In [6]:
# Read the four competition CSVs (prompts and student summaries, train and test).
summary_train = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/summaries_train.csv')
summary_test = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv')
prompts_train = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/prompts_train.csv')
prompts_test = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/prompts_test.csv')

# Attach the prompt columns to every training summary; a left join keeps
# every summary row even if its prompt_id had no match.
train = pd.merge(summary_train, prompts_train, on="prompt_id", how='left')

In [7]:
# One fold per distinct prompt: each prompt_id gets the index of its first
# appearance, and every row inherits the index of its prompt.
prompt_dict = {
    prompt_id: fold_idx
    for fold_idx, prompt_id in enumerate(train['prompt_id'].unique())
}
train['fold'] = train['prompt_id'].map(lambda prompt_id: prompt_dict[prompt_id])

In [8]:
# Load the tokenizer that matches the backbone and stash it on cfg, which is
# where CustomDataset looks it up; keep the module-level alias as well.
cfg.tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)
tokenizer = cfg.tokenizer

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset turning one summary row into tokenized model inputs.

    Each item is built from a fixed system message, the prompt question and the
    student's text, joined by the tokenizer's separator token. The regression
    target is the ``(content, wording)`` score pair.

    Args:
        df: DataFrame with columns ``prompt_text``, ``prompt_question``,
            ``prompt_title``, ``text``, ``content`` and ``wording``.
    """

    # Constant instruction prefix prepended to every example.
    SYSTEM_MESSAGE = 'System Message: You are an AI assistant that scores summarized text based on provided questions and text summaries.'

    def __init__(self, df):
        self.df = df
        self.tokenizer = cfg.tokenizer
        self.max_len = cfg.max_len
        # prompt_text and prompt_title are kept as attributes but are not part
        # of the model input assembled in __getitem__.
        self.fp = df['prompt_text'].values
        self.pq = df['prompt_question'].values
        self.title = df['prompt_title'].values
        self.text = df['text'].values
        self.targets = df[['content','wording']].values

    def __len__(self):
        """Number of rows in the underlying DataFrame."""
        return len(self.df)

    def __getitem__(self , index):
        """Return ``(inputs, target)`` for row ``index``.

        ``inputs`` is a dict with long tensors ``input_ids`` and
        ``attention_mask``; ``target`` is a float tensor built from the row's
        ``[content, wording]`` values.
        """
        sep = self.tokenizer.sep_token
        full_text = self.SYSTEM_MESSAGE + sep + self.pq[index] + sep + self.text[index]

        # Truncation and padding to max_len are both requested from the tokenizer.
        inputs = self.tokenizer.encode_plus(
                        full_text,
                        truncation=True,
                        add_special_tokens=True,
                        max_length=self.max_len,
                        padding='max_length'
                    )

        features = {
            'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
        }
        return features, torch.tensor(self.targets[index], dtype=torch.float)

def collate(inputs):
    """Trim every tensor in ``inputs`` to the longest attended sequence.

    The cut-off is the largest per-row sum of ``inputs["attention_mask"]``.
    All entries are sliced along their second dimension in place, and the
    same mapping object is returned.
    """
    longest = int(inputs["attention_mask"].sum(axis=1).max())
    for key in list(inputs.keys()):
        inputs[key] = inputs[key][:, :longest]
    return inputs

In [10]:
def prepare_fold(df, n_fold):
    """Build the train/validation DataLoaders for one fold.

    Rows whose ``fold`` column equals ``n_fold`` form the validation set; all
    other rows form the training set.

    Args:
        df: DataFrame carrying a ``fold`` column plus the columns CustomDataset reads.
        n_fold: Fold id to hold out.

    Returns:
        ``(train_loader, valid_loader)``; only the training loader shuffles.
    """
    held_out = df['fold'] == n_fold

    train_dataset = CustomDataset(df[~held_out])
    valid_dataset = CustomDataset(df[held_out])

    # Settings shared by both loaders; only `shuffle` differs.
    loader_kwargs = dict(batch_size=cfg.batch_size, num_workers=0, pin_memory=True)
    train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, **loader_kwargs)
    valid_loader = torch.utils.data.DataLoader(valid_dataset, shuffle=False, **loader_kwargs)

    return train_loader, valid_loader

In [11]:
def train_with_MP(cfg, n_fold, model, optimizer, train_loader, val_loader, scheduler):
    """Train ``model`` with mixed precision and gradient accumulation.

    After every epoch the model is evaluated with ``validation`` and, whenever
    the validation loss is at least as good as the best seen so far, saved via
    ``model.save_pretrained``.

    Args:
        cfg: Config object providing ``num_epoch`` and ``accum_iter``.
        n_fold: Fold id, used only in the checkpoint directory name.
        model: Model whose forward accepts ``labels=`` and returns an object
            with a ``.loss`` attribute, and which offers ``save_pretrained``.
        optimizer: Optimizer stepped every ``cfg.accum_iter`` batches.
        train_loader: Yields ``(dict_of_tensors, labels)`` training batches.
        val_loader: Loader handed to ``validation``.
        scheduler: LR scheduler stepped once per epoch.

    Returns:
        None. Progress is printed and checkpoints are written to disk.
    """
    scaler = torch.cuda.amp.GradScaler()
    step = 0  # global batch counter; deliberately not reset between epochs
    # Start at +inf so the first epoch always yields a checkpoint.
    best_loss = float("inf")
    gc.collect()
    for epoch in range(1, cfg.num_epoch + 1):
        model.train()
        train_loss = []
        for batch, labels in tqdm(iter(train_loader)):
            batch = {key: tensor.to("cuda") for key, tensor in batch.items()}
            labels = labels.cuda()

            with torch.cuda.amp.autocast():
                output = model(**batch, labels = labels)
                # Scale down so the accumulated gradient matches one big batch.
                loss = output.loss / cfg.accum_iter

            scaler.scale(loss).backward()
            step += 1
            if step % cfg.accum_iter == 0:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
            # Log the un-scaled loss so it is comparable with the validation loss.
            train_loss.append(output.loss.item())

        _val_loss, scores = validation(model, val_loader)
        _train_loss = np.mean(train_loss)
        scheduler.step()
        print(
            f"Epoch [{epoch}], Train Loss : [{_train_loss:.5f}] Val Loss : [{_val_loss:.5f}] Val mcrmse : [{scores['mcrmse']:.5f}]"
        )

        if best_loss >= _val_loss:
            # Remember the new best so later, worse epochs cannot overwrite it.
            best_loss = _val_loss
            # NOTE: the directory name says "base" regardless of cfg.select; it
            # is kept because the evaluation cell loads from this exact path.
            model.save_pretrained(f"/kaggle/working/deberta-v3-base-Fold_{n_fold}.pth")

In [12]:
def validation(model, val_loader):
    """Evaluate ``model`` on ``val_loader`` without tracking gradients.

    Args:
        model: Model whose forward accepts ``labels=`` and returns an object
            exposing ``.loss`` and ``.logits``.
        val_loader: Yields ``(dict_of_tensors, labels)`` batches.

    Returns:
        ``(mean_batch_loss, scores)`` where ``scores`` is the dict produced by
        ``compute_mcrmse`` over all predictions, reshaped to two columns.
    """
    model.eval()
    batch_losses, all_preds, all_labels = [], [], []
    with torch.no_grad():
        for features, targets in tqdm(iter(val_loader)):
            features = {name: tensor.to("cuda") for name, tensor in features.items()}
            targets = targets.cuda()

            result = model(**features, labels=targets)

            batch_losses.append(result.loss.item())
            all_preds.extend(result.logits.detach().cpu().tolist())
            all_labels.extend(targets.detach().cpu().tolist())

        mean_loss = np.mean(batch_losses)
        # Two columns: one per regression target.
        preds_2d = np.array(all_preds).reshape(-1, 2)
        labels_2d = np.array(all_labels).reshape(-1, 2)
        scores = compute_mcrmse(preds_2d, labels_2d)

    return mean_loss, scores

In [13]:
def compute_metrics(eval_pred):
    """Return the column-averaged RMSE between predictions and labels.

    Computed with NumPy instead of ``mean_squared_error(..., squared=False)``,
    whose ``squared`` argument is deprecated and removed in newer scikit-learn
    releases. The RMSE is taken per output column and then averaged, which is
    what recent scikit-learn versions compute for that call.

    Args:
        eval_pred: ``(predictions, labels)`` pair of array-likes with matching
            shapes, either 1-D or ``(n_samples, n_outputs)``.

    Returns:
        ``{"rmse": float}``.

    Raises:
        ValueError: If the inputs are empty or their shapes do not match.
    """
    predictions, labels = eval_pred
    predictions = np.asarray(predictions, dtype=float)
    labels = np.asarray(labels, dtype=float)
    # Treat 1-D input as a single output column.
    if predictions.ndim == 1:
        predictions = predictions.reshape(-1, 1)
    if labels.ndim == 1:
        labels = labels.reshape(-1, 1)
    if predictions.shape != labels.shape:
        raise ValueError(
            f"predictions and labels have different shapes: {predictions.shape} vs {labels.shape}"
        )
    if predictions.size == 0:
        raise ValueError("predictions and labels must contain at least one sample")

    per_column_rmse = np.sqrt(np.mean((labels - predictions) ** 2, axis=0))
    rmse = float(np.mean(per_column_rmse))
    return {"rmse": rmse}

def compute_mcrmse(preds, labels):
    """Mean column-wise RMSE for the two regression targets.

    Args:
        preds: Array of predictions with the targets along the second axis.
        labels: Array of ground-truth values, same layout as ``preds``.

    Returns:
        Dict with the RMSE of column 0 (``content_rmse``), of column 1
        (``wording_rmse``) and the mean over all columns (``mcrmse``).
    """
    squared_error = (preds - labels) ** 2
    per_column = np.sqrt(squared_error.mean(axis=0))
    result = {
        "content_rmse": per_column[0],
        "wording_rmse": per_column[1],
    }
    result["mcrmse"] = np.mean(per_column)
    return result

def compt_score(content_true, content_pred, wording_true, wording_pred):
    """Average of the content RMSE and the wording RMSE.

    Each RMSE is the square root of ``mean_squared_error`` for that target.
    """
    target_pairs = (
        (content_true, content_pred),
        (wording_true, wording_pred),
    )
    rmse_per_target = [
        mean_squared_error(truth, guess) ** (1 / 2) for truth, guess in target_pairs
    ]
    return sum(rmse_per_target) / 2

In [14]:
# model_config = AutoConfig.from_pretrained(cfg.model_name)
# model_config.update({
#         "hidden_dropout_prob": cfg.hidden_dropout_prob,
#         "attention_probs_dropout_prob": cfg.attention_probs_dropout_prob,
#         "num_labels": 2,
#         "problem_type": "regression"
#     })
# model = AutoModelForSequenceClassification.from_pretrained(cfg.model_name,config = model_config)

# peft_config = LoraConfig(
#     task_type=TaskType.SEQ_CLS, r=16, lora_alpha=16, target_modules=['query_proj','value_proj'], lora_dropout=0.1, bias="all", modules_to_save=['classifier','pooler']
# )
# peft_model = get_peft_model(model,peft_config)

# peft_model.print_trainable_parameters()

In [15]:
# peft_model.cuda()
# optimizer = torch.optim.AdamW(params = peft_model.parameters(), lr = cfg.lr)
# cosine_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max = 10,eta_min = 1e-5)
# reduce_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=2, min_lr=1e-5)

In [16]:
# Evaluate the saved fold-0 LoRA checkpoint on the fold-0 validation split.
# This cell needs /kaggle/working/deberta-v3-base-Fold_0.pth to already exist;
# the training call that would create it is commented out in this notebook.

# Base model configured as a 2-output regression head.
model_config = AutoConfig.from_pretrained(cfg.model_name)
model_config.update({
        "num_labels": 2,
        "problem_type": "regression"
    })
model = AutoModelForSequenceClassification.from_pretrained(cfg.model_name,config = model_config)
# NOTE(review): `lora_config` is not used anywhere below.
lora_config = LoraConfig.from_pretrained(f'/kaggle/working/deberta-v3-base-Fold_0.pth')
# Wrap the base model with the adapter weights stored in the checkpoint directory.
peft_model = PeftModel.from_pretrained(model,f'/kaggle/working/deberta-v3-base-Fold_0.pth')
peft_model.print_trainable_parameters()
# NOTE(review): `pref_model` looks like a typo for `peft_model`; the returned
# object is never used below, and validation runs on `peft_model` instead.
# Presumably only the side effects of merge_and_unload() matter here - confirm.
pref_model = peft_model.merge_and_unload()
peft_model = peft_model.to('cuda')
# NOTE(review): `train_loader` is unused in this cell; only the validation loader is needed.
train_loader , valid_loader = prepare_fold(train, 0)
_val_loss, scores = validation(peft_model, valid_loader)
print(scores)

Some weights of the model checkpoint at /kaggle/input/deberta-v3-large-hf-weights were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassificat

In [None]:
# Build the loaders for the fold that the (currently disabled) training call would use.
n_fold = 0
train_loader, valid_loader = prepare_fold(df=train, n_fold=n_fold)

# print(len(train_loader))
# train_with_MP(cfg, n_fold, peft_model, optimizer, train_loader, valid_loader, cosine_scheduler)

In [None]:
# model_config = AutoConfig.from_pretrained(cfg.model_name)
# model_config.update({
#         "hidden_dropout_prob": cfg.hidden_dropout_prob,
#         "attention_probs_dropout_prob": cfg.attention_probs_dropout_prob,
#         "num_labels": 2,
#         "problem_type": "regression"
#     })
# model = AutoModelForSequenceClassification.from_pretrained(cfg.model_name,config = model_config)
# lora_config = LoraConfig.from_pretrained('/kaggle/working/deberta-v3-base-Fold_0.pth')
# peft_model = PeftModel.from_pretrained(model,'/kaggle/working/deberta-v3-base-Fold_0.pth')
# peft_model.print_trainable_parameters()
# pref_model = peft_model.merge_and_unload()

In [None]:
# peft_model = peft_model.to('cuda')
# train_loader, valid_loader = prepare_fold(train, 0)
# _val_loss, scores =validation(peft_model,valid_loader)
# print(scores)