In [1]:
    # ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
from dataclasses import dataclass, field
from typing import Tuple, List
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

os.system('pip install iterative-stratification==0.1.7')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

os.system('pip install -q transformers')
os.system('pip install -q tokenizers')
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Collecting iterative-stratification==0.1.7
  Downloading iterative_stratification-0.1.7-py3-none-any.whl (8.5 kB)
Installing collected packages: iterative-stratification
Successfully installed iterative-stratification-0.1.7
tokenizers.__version__: 0.13.3
transformers.__version__: 4.30.2
env: TOKENIZERS_PARALLELISM=true


In [2]:
@dataclass
class Config:
    experiment_name: str = 'default-starter-training/'
    model: str = field(default="microsoft/deberta-v3-base")
    batch_size: int = field(default=8)
    max_len: int = field(default=512)
    epochs: int = field(default=4)
    encoder_lr: float = field(default=2e-5)
    decoder_lr: float = field(default=2e-5)
    min_lr: float = field(default=1e-6)
    scheduler: str = field(default='cosine')  # ['linear', 'cosine']
    debug: bool = field(default=False)
    apex: bool = field(default=True)
    print_freq: int = field(default=20)
    num_workers: int = field(default=4)
    gradient_checkpointing: bool = field(default=True)
    batch_scheduler: bool = field(default=True)
    num_cycles: float = field(default=0.5)
    num_warmup_steps: int = field(default=0)
    eps: float = field(default=1e-6)
    betas: Tuple[float, float] = field(default=(0.9, 0.999))
    weight_decay: float = field(default=0.01)
    gradient_accumulation_steps: int = field(default=1)
    max_grad_norm: int = field(default=1000)
    target_cols: List[str] = field(default_factory=lambda: ['content', 'wording'])
    seed: int = field(default=42)
    n_fold: int = field(default=4)
    trn_fold: List[int] = field(default_factory=lambda: [0, 1, 2, 3])
    train: bool = field(default=True)

In [3]:
config = Config()
OUTPUT_DIR = config.experiment_name
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

In [4]:
# ====================================================
# Utils
# ====================================================
def MCRMSE(y_trues, y_preds):
    scores = []
    idxes = y_trues.shape[1]
    for i in range(idxes):
        y_true = y_trues[:,i]
        y_pred = y_preds[:,i]
        score = mean_squared_error(y_true, y_pred, squared=False) # RMSE
        scores.append(score)
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores


def get_score(y_trues, y_preds):
    mcrmse_score, scores = MCRMSE(y_trues, y_preds)
    return mcrmse_score, scores


def get_logger(filename=OUTPUT_DIR+'train'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()


def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)

In [5]:
# ====================================================
# Data Loading
# ====================================================
train = pd.read_csv('../input/commonlit-evaluate-student-summaries/summaries_train.csv')
test = pd.read_csv('../input/commonlit-evaluate-student-summaries/summaries_test.csv')
submission = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/sample_submission.csv')

print(f"train.shape: {train.shape}")
display(train.head())
print(f"test.shape: {test.shape}")
display(test.head())
print(f"submission.shape: {submission.shape}")
display(submission.head())

train.shape: (7165, 5)


Unnamed: 0,student_id,prompt_id,text,content,wording
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757


test.shape: (4, 3)


Unnamed: 0,student_id,prompt_id,text
0,000000ffffff,abc123,Example text 1
1,111111eeeeee,def789,Example text 2
2,222222cccccc,abc123,Example text 3
3,333333dddddd,def789,Example text 4


submission.shape: (4, 3)


Unnamed: 0,student_id,content,wording
0,000000ffffff,0.0,0.0
1,111111eeeeee,0.0,0.0
2,222222cccccc,0.0,0.0
3,333333dddddd,0.0,0.0


In [6]:
# ====================================================
# CV split
# ====================================================
Fold = MultilabelStratifiedKFold(n_splits=config.n_fold, shuffle=True, random_state=config.seed)
for n, (train_index, val_index) in enumerate(Fold.split(train, train[config.target_cols])):
    train.loc[val_index, 'fold'] = int(n)
    
train['fold'] = train['fold'].astype(int)
display(train.groupby('fold').size())

fold
0    1791
1    1791
2    1792
3    1791
dtype: int64

In [7]:
if config.debug:
    display(train.groupby('fold').size())
    train = train.sample(n=1000, random_state=0).reset_index(drop=True)
    display(train.groupby('fold').size())

In [8]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(config, text):
    inputs = config.tokenizer.encode_plus(
        text, 
        return_tensors=None, 
        add_special_tokens=True, 
        max_length=config.max_len,
        pad_to_max_length=True,
        truncation=True
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, config, df):
        self.config = config
        self.texts = df['text'].values
        self.labels = df[config.target_cols].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        inputs = prepare_input(self.config, self.texts[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label
    

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

In [9]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings
    

class CustomModel(nn.Module):
    def __init__(self, config, config_path=None, pretrained=False):
        super().__init__()
        self.config = config
        if config_path is None:
            self.config = AutoConfig.from_pretrained(config.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(config.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
#         if self.config.gradient_checkpointing:
#             self.model.gradient_checkpointing_enable()
        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, 2)
        self._init_weights(self.fc)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        output = self.fc(feature)
        return output

In [10]:
# ====================================================
# Loss
# ====================================================
class RMSELoss(nn.Module):
    def __init__(self, reduction='mean', eps=1e-9):
        super().__init__()
        self.mse = nn.MSELoss(reduction='none')
        self.reduction = reduction
        self.eps = eps

    def forward(self, y_pred, y_true):
        loss = torch.sqrt(self.mse(y_pred, y_true) + self.eps)
        if self.reduction == 'none':
            loss = loss
        elif self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()
        return loss

In [11]:
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


In [12]:
def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=config.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=config.apex):
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)
        if config.gradient_accumulation_steps > 1:
            loss = loss / config.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)
        if (step + 1) % config.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if config.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % config.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)
        if config.gradient_accumulation_steps > 1:
            loss = loss / config.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.to('cpu').numpy())
        end = time.time()
        if step % config.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    return losses.avg, predictions

In [13]:
# ====================================================
# train loop
# ====================================================
def train_loop(model_config, train_df, fold):
    dir_output = os.path.join(OUTPUT_DIR, model_config.model.replace('/', '-'))

    tokenizer = AutoTokenizer.from_pretrained(config.model)
    tokenizer.save_pretrained(os.path.join(dir_output, 'tokenizer'))
    model_config.tokenizer = tokenizer
    
    lengths = []
    tk0 = tqdm(train_df['text'].fillna("").values, total=len(train_df))
    
    for text in tk0:
        length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
        lengths.append(length)
        
    model_config.max_len = max(lengths) + 2 # cls & sep
    LOGGER.info(f"max_len: {config.max_len}")
    LOGGER.info(f"========== fold: {fold} training ==========")
    

    # ====================================================
    # loader
    # ====================================================
    train_folds = train_df[train_df['fold'] != fold].reset_index(drop=True)
    valid_folds = train_df[train_df['fold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds[config.target_cols].values
    
    train_dataset = TrainDataset(model_config, train_folds)
    valid_dataset = TrainDataset(model_config, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=model_config.batch_size,
                              shuffle=True,
                              num_workers=model_config.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=model_config.batch_size * 2,
                              shuffle=False,
                              num_workers=model_config.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(model_config, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=model_config.encoder_lr, 
                                                decoder_lr=model_config.decoder_lr,
                                                weight_decay=model_config.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=model_config.encoder_lr, eps=model_config.eps, betas=model_config.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(model_config, optimizer, num_train_steps):
        if model_config.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=model_config.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif model_config.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=model_config.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=model_config.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / model_config.batch_size * model_config.epochs)
    scheduler = get_scheduler(config, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.SmoothL1Loss(reduction='mean') # RMSELoss(reduction="mean")
    
    best_score = np.inf

    for epoch in range(model_config.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        
        # scoring
        score, scores = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}  Scores: {scores}')
        
        if best_score > score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                         os.path.join(dir_output, f"{model_config.model.replace('/', '-')}_fold{fold}_best.pth"))

    predictions = torch.load(os.path.join(dir_output, f"{model_config.model.replace('/', '-')}_fold{fold}_best.pth"), 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds[[f"pred_{c}" for c in model_config.target_cols]] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [14]:
def get_result(oof_df):
    labels = oof_df[config.target_cols].values
    preds = oof_df[[f"pred_{c}" for c in config.target_cols]].values
    score, scores = get_score(labels, preds)
    LOGGER.info(f'Score: {score:<.4f}  Scores: {scores}')
    
config_list = [
    {
    'experiment_name': "base-deberta",
    'model': 'microsoft/deberta-v3-base',
    'batch_size': 8,
    }, 
    {
    'experiment_name': "base-roberta",
    'model': 'roberta-base',
    'batch_size': 8,
    }, 
    {
    'experiment_name': "albert-base-v2",
    'model': 'albert-base-v2',
    'batch_size': 8,
    }, 

]


for config_dict in config_list:
    # Create an instance of Config using the dictionary
    model_config = Config(**config_dict)

    oof_df = pd.DataFrame()
    for fold in range(config.n_fold):
        print(f"Fold: {fold} Experiment: {model_config.experiment_name}")
        if fold in config.trn_fold:
            _oof_df = train_loop(model_config, train, fold)
            oof_df = pd.concat([oof_df, _oof_df])
            LOGGER.info(f"========== fold: {fold} result ==========")
            get_result(_oof_df)
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_DIR+'oof_df.pkl')

Fold: 0 Experiment: base-deberta


Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/7165 [00:00<?, ?it/s]

max_len: 512
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.30.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}



Downloading pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.weight', 'mask_predictions.classifier.bias', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/671] Elapsed 0m 2s (remain 25m 26s) Loss: 0.9646(0.9646) Grad: inf  LR: 0.00002000  
Epoch: [1][20/671] Elapsed 0m 7s (remain 3m 37s) Loss: 0.7998(0.5248) Grad: 28477.8379  LR: 0.00002000  
Epoch: [1][40/671] Elapsed 0m 11s (remain 3m 1s) Loss: 0.2424(0.4071) Grad: 26165.3359  LR: 0.00001999  
Epoch: [1][60/671] Elapsed 0m 16s (remain 2m 45s) Loss: 0.2788(0.3422) Grad: 30115.7930  LR: 0.00001997  
Epoch: [1][80/671] Elapsed 0m 22s (remain 2m 43s) Loss: 0.2076(0.3032) Grad: 30670.9824  LR: 0.00001996  
Epoch: [1][100/671] Elapsed 0m 27s (remain 2m 36s) Loss: 0.1905(0.2720) Grad: 30739.7461  LR: 0.00001993  
Epoch: [1][120/671] Elapsed 0m 32s (remain 2m 27s) Loss: 0.1273(0.2548) Grad: 12413.3271  LR: 0.00001990  
Epoch: [1][140/671] Elapsed 0m 36s (remain 2m 18s) Loss: 0.2331(0.2422) Grad: 41199.8398  LR: 0.00001986  
Epoch: [1][160/671] Elapsed 0m 41s (remain 2m 12s) Loss: 0.0710(0.2325) Grad: 9542.8809  LR: 0.00001982  
Epoch: [1][180/671] Elapsed 0m 46s (remain 2m 6s) Los

Epoch 1 - avg_train_loss: 0.1678  avg_val_loss: 0.1190  time: 192s
Epoch 1 - Score: 0.4970  Scores: [0.43478886423301266, 0.5592748602553492]
Epoch 1 - Save Best Score: 0.4970 Model


EVAL: [111/112] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0865(0.1190) 
Epoch: [2][0/671] Elapsed 0m 0s (remain 5m 25s) Loss: 0.0417(0.0417) Grad: inf  LR: 0.00001707  
Epoch: [2][20/671] Elapsed 0m 5s (remain 2m 53s) Loss: 0.0600(0.0982) Grad: 29518.3359  LR: 0.00001690  
Epoch: [2][40/671] Elapsed 0m 10s (remain 2m 38s) Loss: 0.0370(0.0898) Grad: 19615.6055  LR: 0.00001673  
Epoch: [2][60/671] Elapsed 0m 14s (remain 2m 29s) Loss: 0.0644(0.0868) Grad: 31380.6309  LR: 0.00001656  
Epoch: [2][80/671] Elapsed 0m 19s (remain 2m 25s) Loss: 0.1407(0.0883) Grad: 55148.3203  LR: 0.00001638  
Epoch: [2][100/671] Elapsed 0m 25s (remain 2m 21s) Loss: 0.1262(0.0878) Grad: 31479.7734  LR: 0.00001620  
Epoch: [2][120/671] Elapsed 0m 30s (remain 2m 17s) Loss: 0.0645(0.0897) Grad: 22207.4219  LR: 0.00001601  
Epoch: [2][140/671] Elapsed 0m 35s (remain 2m 14s) Loss: 0.2540(0.0917) Grad: 45000.5352  LR: 0.00001582  
Epoch: [2][160/671] Elapsed 0m 41s (remain 2m 11s) Loss: 0.0923(0.0922) Grad: 37021.3594  L

Epoch 2 - avg_train_loss: 0.0892  avg_val_loss: 0.1140  time: 190s
Epoch 2 - Score: 0.4824  Scores: [0.41748048304465085, 0.547367119767985]
Epoch 2 - Save Best Score: 0.4824 Model


EVAL: [111/112] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0639(0.1140) 
Epoch: [3][0/671] Elapsed 0m 0s (remain 7m 57s) Loss: 0.1636(0.1636) Grad: inf  LR: 0.00001001  
Epoch: [3][20/671] Elapsed 0m 5s (remain 2m 55s) Loss: 0.0454(0.0645) Grad: 54203.9492  LR: 0.00000977  
Epoch: [3][40/671] Elapsed 0m 9s (remain 2m 32s) Loss: 0.0349(0.0580) Grad: 25290.9336  LR: 0.00000954  
Epoch: [3][60/671] Elapsed 0m 15s (remain 2m 35s) Loss: 0.0734(0.0610) Grad: 39819.9531  LR: 0.00000930  
Epoch: [3][80/671] Elapsed 0m 20s (remain 2m 27s) Loss: 0.0419(0.0610) Grad: 53522.3164  LR: 0.00000907  
Epoch: [3][100/671] Elapsed 0m 25s (remain 2m 23s) Loss: 0.0460(0.0617) Grad: 42439.7500  LR: 0.00000884  
Epoch: [3][120/671] Elapsed 0m 30s (remain 2m 17s) Loss: 0.1109(0.0632) Grad: 56431.8945  LR: 0.00000861  
Epoch: [3][140/671] Elapsed 0m 35s (remain 2m 12s) Loss: 0.0758(0.0654) Grad: 49448.9297  LR: 0.00000838  
Epoch: [3][160/671] Elapsed 0m 40s (remain 2m 7s) Loss: 0.0247(0.0645) Grad: 20055.3418  LR:

Epoch 3 - avg_train_loss: 0.0595  avg_val_loss: 0.1057  time: 189s
Epoch 3 - Score: 0.4624  Scores: [0.39352794482055337, 0.5313026693500256]
Epoch 3 - Save Best Score: 0.4624 Model


EVAL: [111/112] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0696(0.1057) 
Epoch: [4][0/671] Elapsed 0m 0s (remain 4m 34s) Loss: 0.0252(0.0252) Grad: 142171.5938  LR: 0.00000294  
Epoch: [4][20/671] Elapsed 0m 5s (remain 2m 39s) Loss: 0.0446(0.0446) Grad: 39119.7227  LR: 0.00000278  
Epoch: [4][40/671] Elapsed 0m 9s (remain 2m 28s) Loss: 0.0436(0.0454) Grad: 42712.3711  LR: 0.00000262  
Epoch: [4][60/671] Elapsed 0m 14s (remain 2m 26s) Loss: 0.0421(0.0442) Grad: 54246.7695  LR: 0.00000246  
Epoch: [4][80/671] Elapsed 0m 20s (remain 2m 29s) Loss: 0.0558(0.0476) Grad: 40351.3789  LR: 0.00000231  
Epoch: [4][100/671] Elapsed 0m 26s (remain 2m 27s) Loss: 0.0428(0.0486) Grad: 55406.3008  LR: 0.00000216  
Epoch: [4][120/671] Elapsed 0m 30s (remain 2m 20s) Loss: 0.0743(0.0481) Grad: 56595.5625  LR: 0.00000202  
Epoch: [4][140/671] Elapsed 0m 36s (remain 2m 16s) Loss: 0.0465(0.0480) Grad: 31243.8672  LR: 0.00000188  
Epoch: [4][160/671] Elapsed 0m 40s (remain 2m 8s) Loss: 0.0371(0.0480) Grad: 39197.1

Epoch 4 - avg_train_loss: 0.0466  avg_val_loss: 0.1059  time: 189s
Epoch 4 - Score: 0.4623  Scores: [0.3907715855277538, 0.5337474935285607]
Epoch 4 - Save Best Score: 0.4623 Model


EVAL: [111/112] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0766(0.1059) 


Score: 0.4623  Scores: [0.3907715855277538, 0.5337474935285607]
Score: 0.4623  Scores: [0.3907715855277538, 0.5337474935285607]


Fold: 1 Experiment: base-deberta


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/7165 [00:00<?, ?it/s]

max_len: 512
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.30.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing De

Epoch: [1][0/671] Elapsed 0m 0s (remain 4m 16s) Loss: 0.3736(0.3736) Grad: inf  LR: 0.00002000  
Epoch: [1][20/671] Elapsed 0m 5s (remain 2m 54s) Loss: 0.2846(0.3196) Grad: 105256.9062  LR: 0.00002000  
Epoch: [1][40/671] Elapsed 0m 10s (remain 2m 44s) Loss: 0.2152(0.2704) Grad: 20439.1914  LR: 0.00001999  
Epoch: [1][60/671] Elapsed 0m 16s (remain 2m 42s) Loss: 0.2072(0.2503) Grad: 33971.5664  LR: 0.00001997  
Epoch: [1][80/671] Elapsed 0m 21s (remain 2m 33s) Loss: 0.2527(0.2445) Grad: 33160.0430  LR: 0.00001996  
Epoch: [1][100/671] Elapsed 0m 26s (remain 2m 27s) Loss: 0.0958(0.2258) Grad: 60984.6602  LR: 0.00001993  
Epoch: [1][120/671] Elapsed 0m 30s (remain 2m 18s) Loss: 0.3466(0.2235) Grad: 105138.1719  LR: 0.00001990  
Epoch: [1][140/671] Elapsed 0m 35s (remain 2m 13s) Loss: 0.0517(0.2140) Grad: 30091.6133  LR: 0.00001986  
Epoch: [1][160/671] Elapsed 0m 40s (remain 2m 7s) Loss: 0.1912(0.2071) Grad: 108952.7812  LR: 0.00001982  
Epoch: [1][180/671] Elapsed 0m 44s (remain 2m 1s) 

Epoch 1 - avg_train_loss: 0.1575  avg_val_loss: 0.1238  time: 188s
Epoch 1 - Score: 0.5012  Scores: [0.4312315981840947, 0.571094900589673]
Epoch 1 - Save Best Score: 0.5012 Model


EVAL: [111/112] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0889(0.1238) 
Epoch: [2][0/671] Elapsed 0m 0s (remain 6m 49s) Loss: 0.1641(0.1641) Grad: inf  LR: 0.00001707  
Epoch: [2][20/671] Elapsed 0m 5s (remain 2m 56s) Loss: 0.0894(0.1058) Grad: 103648.1406  LR: 0.00001690  
Epoch: [2][40/671] Elapsed 0m 12s (remain 3m 7s) Loss: 0.0573(0.0941) Grad: 44718.2695  LR: 0.00001673  
Epoch: [2][60/671] Elapsed 0m 17s (remain 2m 57s) Loss: 0.1022(0.0916) Grad: 51567.9297  LR: 0.00001656  
Epoch: [2][80/671] Elapsed 0m 22s (remain 2m 42s) Loss: 0.0586(0.0932) Grad: 46527.0547  LR: 0.00001638  
Epoch: [2][100/671] Elapsed 0m 27s (remain 2m 33s) Loss: 0.0422(0.0920) Grad: 23642.1855  LR: 0.00001620  
Epoch: [2][120/671] Elapsed 0m 31s (remain 2m 24s) Loss: 0.1009(0.0921) Grad: 33985.0742  LR: 0.00001601  
Epoch: [2][140/671] Elapsed 0m 36s (remain 2m 16s) Loss: 0.0561(0.0930) Grad: 16955.5859  LR: 0.00001582  
Epoch: [2][160/671] Elapsed 0m 41s (remain 2m 11s) Loss: 0.0644(0.0922) Grad: 15149.9863  L

Epoch 2 - avg_train_loss: 0.0988  avg_val_loss: 0.1065  time: 189s
Epoch 2 - Score: 0.4612  Scores: [0.3913691319159277, 0.531015153989365]
Epoch 2 - Save Best Score: 0.4612 Model


EVAL: [111/112] Elapsed 0m 19s (remain 0m 0s) Loss: 0.1179(0.1065) 
Epoch: [3][0/671] Elapsed 0m 0s (remain 4m 25s) Loss: 0.0713(0.0713) Grad: inf  LR: 0.00001001  
Epoch: [3][20/671] Elapsed 0m 5s (remain 2m 53s) Loss: 0.1107(0.0725) Grad: 54772.5000  LR: 0.00000977  
Epoch: [3][40/671] Elapsed 0m 11s (remain 3m 0s) Loss: 0.0380(0.0704) Grad: 54666.0938  LR: 0.00000954  
Epoch: [3][60/671] Elapsed 0m 16s (remain 2m 44s) Loss: 0.0927(0.0699) Grad: 83195.6953  LR: 0.00000930  
Epoch: [3][80/671] Elapsed 0m 22s (remain 2m 42s) Loss: 0.0633(0.0685) Grad: 30393.9922  LR: 0.00000907  
Epoch: [3][100/671] Elapsed 0m 26s (remain 2m 31s) Loss: 0.1310(0.0683) Grad: 45537.4375  LR: 0.00000884  
Epoch: [3][120/671] Elapsed 0m 31s (remain 2m 24s) Loss: 0.0540(0.0714) Grad: 60687.6758  LR: 0.00000861  
Epoch: [3][140/671] Elapsed 0m 36s (remain 2m 18s) Loss: 0.0659(0.0693) Grad: 76643.0625  LR: 0.00000838  
Epoch: [3][160/671] Elapsed 0m 41s (remain 2m 13s) Loss: 0.0806(0.0699) Grad: 72636.6719  LR

Epoch 3 - avg_train_loss: 0.0623  avg_val_loss: 0.1059  time: 191s
Epoch 3 - Score: 0.4604  Scores: [0.3920193611457706, 0.5288446706629243]
Epoch 3 - Save Best Score: 0.4604 Model


EVAL: [111/112] Elapsed 0m 19s (remain 0m 0s) Loss: 0.1202(0.1059) 
Epoch: [4][0/671] Elapsed 0m 0s (remain 4m 11s) Loss: 0.0494(0.0494) Grad: 162650.5312  LR: 0.00000294  
Epoch: [4][20/671] Elapsed 0m 5s (remain 2m 45s) Loss: 0.0411(0.0501) Grad: 33161.7773  LR: 0.00000278  
Epoch: [4][40/671] Elapsed 0m 10s (remain 2m 46s) Loss: 0.0785(0.0540) Grad: 41894.7148  LR: 0.00000262  
Epoch: [4][60/671] Elapsed 0m 15s (remain 2m 36s) Loss: 0.0277(0.0518) Grad: 51501.4922  LR: 0.00000246  
Epoch: [4][80/671] Elapsed 0m 20s (remain 2m 29s) Loss: 0.0527(0.0511) Grad: 29750.0059  LR: 0.00000231  
Epoch: [4][100/671] Elapsed 0m 25s (remain 2m 25s) Loss: 0.0268(0.0493) Grad: 48245.8516  LR: 0.00000216  
Epoch: [4][120/671] Elapsed 0m 30s (remain 2m 16s) Loss: 0.1211(0.0487) Grad: 41899.0312  LR: 0.00000202  
Epoch: [4][140/671] Elapsed 0m 35s (remain 2m 14s) Loss: 0.0508(0.0497) Grad: 43730.3242  LR: 0.00000188  
Epoch: [4][160/671] Elapsed 0m 40s (remain 2m 9s) Loss: 0.0491(0.0481) Grad: 49097.

Epoch 4 - avg_train_loss: 0.0471  avg_val_loss: 0.1059  time: 191s
Epoch 4 - Score: 0.4604  Scores: [0.39033256855935816, 0.5303833135291249]
Epoch 4 - Save Best Score: 0.4604 Model


EVAL: [111/112] Elapsed 0m 19s (remain 0m 0s) Loss: 0.1172(0.1059) 


Score: 0.4604  Scores: [0.39033256855935816, 0.5303833135291249]
Score: 0.4613  Scores: [0.3905521387305545, 0.532068062430999]


Fold: 2 Experiment: base-deberta


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/7165 [00:00<?, ?it/s]

max_len: 512
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.30.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing De

Epoch: [1][0/671] Elapsed 0m 0s (remain 5m 24s) Loss: 0.6931(0.6931) Grad: inf  LR: 0.00002000  
Epoch: [1][20/671] Elapsed 0m 5s (remain 2m 38s) Loss: 0.2607(0.3963) Grad: 34166.5312  LR: 0.00002000  
Epoch: [1][40/671] Elapsed 0m 10s (remain 2m 45s) Loss: 0.1663(0.3288) Grad: 56802.9688  LR: 0.00001999  
Epoch: [1][60/671] Elapsed 0m 15s (remain 2m 32s) Loss: 0.3093(0.2822) Grad: 78728.9766  LR: 0.00001997  
Epoch: [1][80/671] Elapsed 0m 20s (remain 2m 31s) Loss: 0.2004(0.2586) Grad: 62329.9023  LR: 0.00001996  
Epoch: [1][100/671] Elapsed 0m 25s (remain 2m 24s) Loss: 0.1041(0.2396) Grad: 25687.9512  LR: 0.00001993  
Epoch: [1][120/671] Elapsed 0m 30s (remain 2m 19s) Loss: 0.1133(0.2303) Grad: 60985.9922  LR: 0.00001990  
Epoch: [1][140/671] Elapsed 0m 35s (remain 2m 14s) Loss: 0.1640(0.2160) Grad: 53361.0312  LR: 0.00001986  
Epoch: [1][160/671] Elapsed 0m 41s (remain 2m 10s) Loss: 0.1054(0.2122) Grad: 25874.8711  LR: 0.00001982  
Epoch: [1][180/671] Elapsed 0m 45s (remain 2m 4s) Lo

Epoch 1 - avg_train_loss: 0.1573  avg_val_loss: 0.1371  time: 189s
Epoch 1 - Score: 0.5340  Scores: [0.5093164178882102, 0.5587244422472643]
Epoch 1 - Save Best Score: 0.5340 Model


EVAL: [111/112] Elapsed 0m 20s (remain 0m 0s) Loss: 0.1642(0.1371) 
Epoch: [2][0/671] Elapsed 0m 0s (remain 6m 30s) Loss: 0.0848(0.0848) Grad: inf  LR: 0.00001707  
Epoch: [2][20/671] Elapsed 0m 4s (remain 2m 16s) Loss: 0.0519(0.1131) Grad: 73867.8594  LR: 0.00001690  
Epoch: [2][40/671] Elapsed 0m 8s (remain 2m 14s) Loss: 0.0425(0.1026) Grad: 70432.1250  LR: 0.00001673  
Epoch: [2][60/671] Elapsed 0m 13s (remain 2m 19s) Loss: 0.1236(0.1009) Grad: 53404.1016  LR: 0.00001655  
Epoch: [2][80/671] Elapsed 0m 19s (remain 2m 22s) Loss: 0.0729(0.0945) Grad: 55186.5625  LR: 0.00001637  
Epoch: [2][100/671] Elapsed 0m 24s (remain 2m 20s) Loss: 0.0805(0.0962) Grad: 67305.9922  LR: 0.00001619  
Epoch: [2][120/671] Elapsed 0m 29s (remain 2m 12s) Loss: 0.1945(0.0968) Grad: 43047.0664  LR: 0.00001601  
Epoch: [2][140/671] Elapsed 0m 33s (remain 2m 7s) Loss: 0.1304(0.0974) Grad: 66137.1094  LR: 0.00001582  
Epoch: [2][160/671] Elapsed 0m 38s (remain 2m 3s) Loss: 0.0714(0.0968) Grad: 65904.9531  LR: 

Epoch 2 - avg_train_loss: 0.0965  avg_val_loss: 0.1177  time: 189s
Epoch 2 - Score: 0.4898  Scores: [0.4400002764251101, 0.5396139143549323]
Epoch 2 - Save Best Score: 0.4898 Model


EVAL: [111/112] Elapsed 0m 20s (remain 0m 0s) Loss: 0.1405(0.1177) 
Epoch: [3][0/671] Elapsed 0m 0s (remain 5m 3s) Loss: 0.0589(0.0589) Grad: inf  LR: 0.00001000  
Epoch: [3][20/671] Elapsed 0m 5s (remain 2m 46s) Loss: 0.0756(0.0803) Grad: 49253.4688  LR: 0.00000977  
Epoch: [3][40/671] Elapsed 0m 10s (remain 2m 39s) Loss: 0.0405(0.0832) Grad: 54439.0391  LR: 0.00000953  
Epoch: [3][60/671] Elapsed 0m 15s (remain 2m 36s) Loss: 0.0919(0.0812) Grad: 73664.6250  LR: 0.00000930  
Epoch: [3][80/671] Elapsed 0m 20s (remain 2m 27s) Loss: 0.0578(0.0761) Grad: 44390.3320  LR: 0.00000907  
Epoch: [3][100/671] Elapsed 0m 26s (remain 2m 27s) Loss: 0.1746(0.0741) Grad: 76095.8828  LR: 0.00000883  
Epoch: [3][120/671] Elapsed 0m 31s (remain 2m 22s) Loss: 0.0396(0.0740) Grad: 44297.1289  LR: 0.00000860  
Epoch: [3][140/671] Elapsed 0m 36s (remain 2m 15s) Loss: 0.0581(0.0729) Grad: 73263.3438  LR: 0.00000837  
Epoch: [3][160/671] Elapsed 0m 40s (remain 2m 9s) Loss: 0.0853(0.0718) Grad: 104783.0000  LR

Epoch 3 - avg_train_loss: 0.0666  avg_val_loss: 0.1114  time: 189s
Epoch 3 - Score: 0.4728  Scores: [0.4038102725430311, 0.5417295610993825]
Epoch 3 - Save Best Score: 0.4728 Model


EVAL: [111/112] Elapsed 0m 20s (remain 0m 0s) Loss: 0.1390(0.1114) 
Epoch: [4][0/671] Elapsed 0m 0s (remain 5m 54s) Loss: 0.0552(0.0552) Grad: 213525.4688  LR: 0.00000293  
Epoch: [4][20/671] Elapsed 0m 5s (remain 2m 51s) Loss: 0.0669(0.0507) Grad: 34846.8125  LR: 0.00000277  
Epoch: [4][40/671] Elapsed 0m 11s (remain 2m 51s) Loss: 0.0486(0.0513) Grad: 69962.7891  LR: 0.00000261  
Epoch: [4][60/671] Elapsed 0m 16s (remain 2m 42s) Loss: 0.1099(0.0507) Grad: 46841.7422  LR: 0.00000245  
Epoch: [4][80/671] Elapsed 0m 20s (remain 2m 31s) Loss: 0.0376(0.0491) Grad: 54808.8203  LR: 0.00000230  
Epoch: [4][100/671] Elapsed 0m 25s (remain 2m 25s) Loss: 0.0309(0.0485) Grad: 26338.7246  LR: 0.00000216  
Epoch: [4][120/671] Elapsed 0m 31s (remain 2m 23s) Loss: 0.0553(0.0490) Grad: 32472.2051  LR: 0.00000201  
Epoch: [4][140/671] Elapsed 0m 37s (remain 2m 19s) Loss: 0.0636(0.0482) Grad: 27649.2207  LR: 0.00000187  
Epoch: [4][160/671] Elapsed 0m 41s (remain 2m 11s) Loss: 0.0297(0.0482) Grad: 36675

Epoch 4 - avg_train_loss: 0.0501  avg_val_loss: 0.1124  time: 189s
Epoch 4 - Score: 0.4751  Scores: [0.4066128578264426, 0.543614456503248]


EVAL: [111/112] Elapsed 0m 20s (remain 0m 0s) Loss: 0.1379(0.1124) 


Score: 0.4728  Scores: [0.4038102725430311, 0.5417295610993825]
Score: 0.4652  Scores: [0.3950226161753696, 0.5353091394955385]


Fold: 3 Experiment: base-deberta


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/7165 [00:00<?, ?it/s]

max_len: 512
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.30.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing De

Epoch: [1][0/671] Elapsed 0m 0s (remain 4m 51s) Loss: 0.5084(0.5084) Grad: inf  LR: 0.00002000  
Epoch: [1][20/671] Elapsed 0m 5s (remain 2m 41s) Loss: 0.3440(0.4188) Grad: 32330.7324  LR: 0.00002000  
Epoch: [1][40/671] Elapsed 0m 10s (remain 2m 33s) Loss: 0.2335(0.3669) Grad: 32973.9922  LR: 0.00001999  
Epoch: [1][60/671] Elapsed 0m 16s (remain 2m 41s) Loss: 0.2589(0.3135) Grad: 57234.6953  LR: 0.00001997  
Epoch: [1][80/671] Elapsed 0m 21s (remain 2m 37s) Loss: 0.2748(0.2782) Grad: 42711.0977  LR: 0.00001996  
Epoch: [1][100/671] Elapsed 0m 26s (remain 2m 31s) Loss: 0.0847(0.2516) Grad: 22631.1270  LR: 0.00001993  
Epoch: [1][120/671] Elapsed 0m 31s (remain 2m 24s) Loss: 0.2397(0.2343) Grad: 41193.8594  LR: 0.00001990  
Epoch: [1][140/671] Elapsed 0m 36s (remain 2m 18s) Loss: 0.2183(0.2286) Grad: 32892.6055  LR: 0.00001986  
Epoch: [1][160/671] Elapsed 0m 41s (remain 2m 11s) Loss: 0.0753(0.2203) Grad: 15538.1494  LR: 0.00001982  
Epoch: [1][180/671] Elapsed 0m 46s (remain 2m 6s) Lo

Epoch 1 - avg_train_loss: 0.1583  avg_val_loss: 0.1367  time: 189s
Epoch 1 - Score: 0.5327  Scores: [0.4654786759729373, 0.5998466990821216]
Epoch 1 - Save Best Score: 0.5327 Model


EVAL: [111/112] Elapsed 0m 20s (remain 0m 0s) Loss: 0.1551(0.1367) 
Epoch: [2][0/671] Elapsed 0m 0s (remain 8m 6s) Loss: 0.2238(0.2238) Grad: inf  LR: 0.00001707  
Epoch: [2][20/671] Elapsed 0m 4s (remain 2m 25s) Loss: 0.0710(0.1148) Grad: 52178.6836  LR: 0.00001690  
Epoch: [2][40/671] Elapsed 0m 9s (remain 2m 27s) Loss: 0.0972(0.1087) Grad: 70040.0000  LR: 0.00001673  
Epoch: [2][60/671] Elapsed 0m 14s (remain 2m 26s) Loss: 0.2513(0.1041) Grad: 77911.4922  LR: 0.00001656  
Epoch: [2][80/671] Elapsed 0m 19s (remain 2m 24s) Loss: 0.1597(0.1007) Grad: 53435.2539  LR: 0.00001638  
Epoch: [2][100/671] Elapsed 0m 25s (remain 2m 21s) Loss: 0.0815(0.0986) Grad: 37518.0586  LR: 0.00001620  
Epoch: [2][120/671] Elapsed 0m 30s (remain 2m 20s) Loss: 0.1188(0.0988) Grad: 53573.0820  LR: 0.00001601  
Epoch: [2][140/671] Elapsed 0m 36s (remain 2m 16s) Loss: 0.0256(0.1000) Grad: 23392.3555  LR: 0.00001582  
Epoch: [2][160/671] Elapsed 0m 41s (remain 2m 10s) Loss: 0.0897(0.0976) Grad: 28707.9062  LR:

Epoch 2 - avg_train_loss: 0.0997  avg_val_loss: 0.1169  time: 189s
Epoch 2 - Score: 0.4878  Scores: [0.418081505780061, 0.5574248344261303]
Epoch 2 - Save Best Score: 0.4878 Model


EVAL: [111/112] Elapsed 0m 20s (remain 0m 0s) Loss: 0.0864(0.1169) 
Epoch: [3][0/671] Elapsed 0m 0s (remain 5m 7s) Loss: 0.0437(0.0437) Grad: inf  LR: 0.00001001  
Epoch: [3][20/671] Elapsed 0m 6s (remain 3m 10s) Loss: 0.0583(0.0708) Grad: 66851.6016  LR: 0.00000977  
Epoch: [3][40/671] Elapsed 0m 11s (remain 2m 56s) Loss: 0.0886(0.0659) Grad: 78661.3672  LR: 0.00000954  
Epoch: [3][60/671] Elapsed 0m 16s (remain 2m 45s) Loss: 0.1080(0.0729) Grad: 68518.3750  LR: 0.00000930  
Epoch: [3][80/671] Elapsed 0m 21s (remain 2m 34s) Loss: 0.0370(0.0705) Grad: 38602.2930  LR: 0.00000907  
Epoch: [3][100/671] Elapsed 0m 26s (remain 2m 28s) Loss: 0.0673(0.0700) Grad: 47594.4688  LR: 0.00000884  
Epoch: [3][120/671] Elapsed 0m 31s (remain 2m 25s) Loss: 0.0483(0.0703) Grad: 34136.6367  LR: 0.00000861  
Epoch: [3][140/671] Elapsed 0m 37s (remain 2m 21s) Loss: 0.0953(0.0722) Grad: 78984.4219  LR: 0.00000838  
Epoch: [3][160/671] Elapsed 0m 42s (remain 2m 13s) Loss: 0.0713(0.0718) Grad: 74498.6094  LR

Epoch 3 - avg_train_loss: 0.0642  avg_val_loss: 0.1100  time: 189s
Epoch 3 - Score: 0.4721  Scores: [0.4064078885423127, 0.537845010209294]
Epoch 3 - Save Best Score: 0.4721 Model


EVAL: [111/112] Elapsed 0m 20s (remain 0m 0s) Loss: 0.0986(0.1100) 
Epoch: [4][0/671] Elapsed 0m 0s (remain 4m 37s) Loss: 0.0180(0.0180) Grad: 112857.5469  LR: 0.00000294  
Epoch: [4][20/671] Elapsed 0m 4s (remain 2m 25s) Loss: 0.0464(0.0480) Grad: 48286.6055  LR: 0.00000278  
Epoch: [4][40/671] Elapsed 0m 9s (remain 2m 31s) Loss: 0.0134(0.0497) Grad: 26515.8164  LR: 0.00000262  
Epoch: [4][60/671] Elapsed 0m 14s (remain 2m 28s) Loss: 0.0305(0.0485) Grad: 28089.3906  LR: 0.00000246  
Epoch: [4][80/671] Elapsed 0m 20s (remain 2m 26s) Loss: 0.0677(0.0490) Grad: 53605.9492  LR: 0.00000231  
Epoch: [4][100/671] Elapsed 0m 25s (remain 2m 22s) Loss: 0.0135(0.0474) Grad: 27189.6641  LR: 0.00000216  
Epoch: [4][120/671] Elapsed 0m 30s (remain 2m 17s) Loss: 0.0230(0.0470) Grad: 20783.8340  LR: 0.00000202  
Epoch: [4][140/671] Elapsed 0m 34s (remain 2m 11s) Loss: 0.0212(0.0469) Grad: 33439.4102  LR: 0.00000188  
Epoch: [4][160/671] Elapsed 0m 39s (remain 2m 4s) Loss: 0.0261(0.0468) Grad: 36437.8

Epoch 4 - avg_train_loss: 0.0483  avg_val_loss: 0.1099  time: 190s
Epoch 4 - Score: 0.4713  Scores: [0.4054812785432832, 0.5371189779973361]
Epoch 4 - Save Best Score: 0.4713 Model


EVAL: [111/112] Elapsed 0m 20s (remain 0m 0s) Loss: 0.0937(0.1099) 


Score: 0.4713  Scores: [0.4054812785432832, 0.5371189779973361]
Score: 0.4667  Scores: [0.397662702749927, 0.5357621090834028]


Fold: 0 Experiment: base-roberta


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/7165 [00:00<?, ?it/s]

max_len: 512


Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.30.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}



Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
