In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
!pip install transformers
!pip install datasets
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m52.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m105.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.1 tokenizers-0.13.2 transformers-4.26.1
Looking in indexes: https://pypi.org/simpl

In [3]:
!nvidia-smi

Fri Feb 17 13:57:10 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    51W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:

import os
import gc
import math
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import re
import html

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam, SGD, AdamW, RAdam
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,GroupKFold
from sklearn.metrics import log_loss,f1_score, recall_score, accuracy_score

from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:

# ====================================================
# CFG
# ====================================================
class CFG:
    debug=False
    apex=True
    print_freq=100
    num_workers=4
    model="microsoft/deberta-v3-base"
    # model='microsoft/deberta-base'
    # model='roberta-base'
    # model='roberta-large'
    # model='roberta-large-mnli'
    # model='xlnet-large-cased'
    # model='albert-xxlarge-v2'
    # model="microsoft/deberta-large"
    # model="microsoft/deberta-v3-large"
    # model='microsoft/deberta-v2-xlarge'
    # model='funnel-transformer/large'
    # model='funnel-transformer/medium'
    # model='albert-base-v2'
    # model='albert-large-v2'
    # model='google/electra-large-discriminator'
    # model='google/electra-base-discriminator'
    # model="facebook/bart-large-mnli"
    # model="facebook/bart-large"
    # model="facebook/bart-base"
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=10
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=16
    fc_dropout=0.2
    target_size=1
    max_len=256
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    seed=42
    n_fold=10
    trn_fold=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    train=True
    nth_awp_start_epoch=1
    gradient_checkpointing = True
    freezing = True
    clean_content = True

if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0, 1]

In [6]:
DIR = "/content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測"
INPUT_DIR = os.path.join(DIR,"input")
OUTPUT_DIR = os.path.join(DIR,"output")
OUTPUT_EXP_DIR = DIR + '/output/EXP009/'
if not os.path.exists(OUTPUT_EXP_DIR):
    os.makedirs(OUTPUT_EXP_DIR)

In [7]:
def get_score(labels, outputs):
    thresh = 0.5
    y_pred = outputs
    y_true = labels
    return recall_score(y_true, (y_pred>thresh).astype(int))

def get_acc_score(labels, outputs):
    y_pred = outputs
    y_true = labels
    best_score = 0
    best_thresh = 0.5
    for thresh in np.arange(0.1, 0.70, 0.01):
        thresh = np.round(thresh, 2)
        score = accuracy_score(y_true, (y_pred>thresh).astype(int))
        #print("Accuracy score at threshold {0} is {1}".format(thresh, score))
        if score > best_score:
          best_score = score
          best_thresh = thresh
    return accuracy_score(y_true, (y_pred>best_thresh).astype(int))


def get_logger(filename=OUTPUT_EXP_DIR+'train'):
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=CFG.seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=CFG.seed)

In [8]:
def freeze(module):
    """
    Freezes module's parameters.
    """
    
    for parameter in module.parameters():
        parameter.requires_grad = False
        
def get_freezed_parameters(module):
    """
    Returns names of freezed parameters of the given module.
    """
    
    freezed_parameters = []
    for name, parameter in module.named_parameters():
        if not parameter.requires_grad:
            freezed_parameters.append(name)
            
    return freezed_parameters

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """
    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930
    """
    
    embedding_types = ("word", "position", "token_type")
    for embedding_type in embedding_types:
        attr_name = f"{embedding_type}_embeddings"
        
        if hasattr(embeddings_path, attr_name): 
            bnb.optim.GlobalOptimManager.get_instance().register_module_override(
                getattr(embeddings_path, attr_name), 'weight', {'optim_bits': optim_bits}
            )

In [9]:
import pandas as pd
import numpy as np

train = pd.read_csv(os.path.join(INPUT_DIR,"train_data.csv"))
test = pd.read_csv(os.path.join(INPUT_DIR,"test_data.csv"))
sample_sub = pd.read_csv(os.path.join(INPUT_DIR,"submission.csv"))

print(train.shape)
display(train.head(3))

print(test.shape)
display(test.head(3))

print(sample_sub.shape)
display(sample_sub.head(3))

(4974, 6)


Unnamed: 0,id,title,year,abstract,keywords,y
0,1,Hierarchical Adversarially Learned Inference,2018,We propose a novel hierarchical generative mod...,"generative, hierarchical, unsupervised, semisu...",0
1,2,Learning to Compute Word Embeddings On the Fly,2018,Words in natural language follow a Zipfian dis...,"NLU, word embeddings, representation learning",0
2,3,Graph2Seq: Scalable Learning Dynamics for Graphs,2018,Neural networks are increasingly used as a gen...,,0


(6393, 5)


Unnamed: 0,id,title,year,abstract,keywords
0,1,StyleAlign: Analysis and Applications of Align...,2022,"In this paper, we perform an in-depth study of...","StyleGAN, transfer learning, fine tuning, mode..."
1,2,Embedding a random graph via GNN: mean-field i...,2021,We develop a theory for embedding a random gra...,"Graph neural network, graph embedding, multi-r..."
2,3,BBRefinement: an universal scheme to improve p...,2021,We present a conceptually simple yet powerful ...,"object detection, deep neural networks, refine..."


(6393, 2)


Unnamed: 0,id,y
0,1,0
1,2,0
2,3,0


In [10]:
train["texts"] = train["abstract"]  

In [11]:
skf = StratifiedKFold(n_splits=CFG.n_fold,shuffle=True,random_state=CFG.seed)
for fold, ( _, val_) in enumerate(skf.split(train, train.y)):
    train.loc[val_ , "kfold"] = int(fold)
    
train["kfold"] = train["kfold"].astype(int)

if CFG.debug:
    display(train.groupby('kfold').size())
    train = train.sample(n=500, random_state=0).reset_index(drop=True)
    display(train.groupby('kfold').size())

In [12]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_EXP_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading (…)"spm.model";:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
# ====================================================
# Define max_len
# ====================================================
lengths = []
tk0 = tqdm(train['texts'].fillna("").values, total=len(train))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
    lengths.append(length)
CFG.max_len = max(lengths) + 1 # cls
LOGGER.info(f"max_len: {CFG.max_len}")

100%|██████████| 4974/4974 [00:03<00:00, 1381.60it/s]
max_len: 510
INFO:__main__:max_len: 510


In [14]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer(text,
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           padding="max_length",
                           return_offsets_mapping=False,
                           truncation=True)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.inputs = df['texts'].values
        self.labels = df['y'].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.inputs[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

#collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [15]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

class MaxPooling(nn.Module):
    def __init__(self):
        super(MaxPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        embeddings = last_hidden_state.clone()
        embeddings[input_mask_expanded == 0] = -1e4
        max_embeddings, _ = torch.max(embeddings, dim=1)
        return max_embeddings
    

class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # Freezing
        if cfg.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            cfg.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())

        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)
        self._init_weights(self.fc)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        output = self.fc(feature)
        return output

In [16]:

# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
        loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))

    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
        loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    predictions = np.concatenate(predictions)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

In [17]:

# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['kfold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['kfold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds['y'].values
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)


    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size*2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_EXP_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.BCEWithLogitsLoss(reduction="mean")
    
    best_score = -1.

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        
        # scoring
        score = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')

        
        if best_score < score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_EXP_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")

    predictions = torch.load(OUTPUT_EXP_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds['pred'] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [18]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        labels = oof_df['y'].values
        preds = oof_df['pred'].values
        score = get_score(labels, preds)
        acc_score = get_acc_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}')
        LOGGER.info(f'ACC BEST Score: {acc_score:<.4f}')
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_EXP_DIR+'oof_df.pkl')

DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.26.1",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__:DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "at

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'mask_predictions.classifier.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/279] Elapsed 0m 4s (remain 20m 20s) Loss: 0.9441(0.9441) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 17s (remain 0m 31s) Loss: 0.6226(0.6394) Grad: 1.0363  LR: 0.00001994  
Epoch: [1][200/279] Elapsed 0m 31s (remain 0m 12s) Loss: 0.5173(0.6280) Grad: 3.7887  LR: 0.00001975  
Epoch: [1][278/279] Elapsed 0m 41s (remain 0m 0s) Loss: 0.4559(0.6195) Grad: 6.1773  LR: 0.00001951  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.4111(0.4111) 


Epoch 1 - avg_train_loss: 0.6195  avg_val_loss: 0.5883  time: 45s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6195  avg_val_loss: 0.5883  time: 45s
Epoch 1 - Score: 0.0855
INFO:__main__:Epoch 1 - Score: 0.0855
Epoch 1 - Save Best Score: 0.0855 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.0855 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.9451(0.5883) 
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 24s) Loss: 0.4446(0.4446) Grad: nan  LR: 0.00001951  
Epoch: [2][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.6124(0.5694) Grad: 3.5148  LR: 0.00001910  
Epoch: [2][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.3922(0.5703) Grad: 1.4912  LR: 0.00001858  
Epoch: [2][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.6534(0.5699) Grad: 2.3198  LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.4529(0.4529) 


Epoch 2 - avg_train_loss: 0.5699  avg_val_loss: 0.6056  time: 41s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5699  avg_val_loss: 0.6056  time: 41s
Epoch 2 - Score: 0.5066
INFO:__main__:Epoch 2 - Score: 0.5066
Epoch 2 - Save Best Score: 0.5066 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.5066 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8596(0.6056) 
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 36s) Loss: 0.4481(0.4481) Grad: nan  LR: 0.00001809  
Epoch: [3][100/279] Elapsed 0m 14s (remain 0m 24s) Loss: 1.0812(0.4449) Grad: 15.9016  LR: 0.00001738  
Epoch: [3][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.2747(0.4322) Grad: 3.3652  LR: 0.00001658  
Epoch: [3][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.5016(0.4222) Grad: 5.3500  LR: 0.00001590  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.1658(0.1658) 


Epoch 3 - avg_train_loss: 0.4222  avg_val_loss: 0.7006  time: 41s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4222  avg_val_loss: 0.7006  time: 41s
Epoch 3 - Score: 0.2763
INFO:__main__:Epoch 3 - Score: 0.2763


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.9082(0.7006) 
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 26s) Loss: 0.1654(0.1654) Grad: nan  LR: 0.00001589  
Epoch: [4][100/279] Elapsed 0m 13s (remain 0m 23s) Loss: 0.1069(0.1550) Grad: 7.8932  LR: 0.00001494  
Epoch: [4][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.2883(0.1413) Grad: 11.9086  LR: 0.00001394  
Epoch: [4][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0775(0.1349) Grad: 9.7213  LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.0593(0.0593) 


Epoch 4 - avg_train_loss: 0.1349  avg_val_loss: 1.1343  time: 41s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1349  avg_val_loss: 1.1343  time: 41s
Epoch 4 - Score: 0.2895
INFO:__main__:Epoch 4 - Score: 0.2895


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 3.1517(1.1343) 
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 24s) Loss: 0.0152(0.0152) Grad: nan  LR: 0.00001311  
Epoch: [5][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.0814(0.0411) Grad: 8.8571  LR: 0.00001202  
Epoch: [5][200/279] Elapsed 0m 26s (remain 0m 10s) Loss: 0.0222(0.0312) Grad: 4.0073  LR: 0.00001091  
Epoch: [5][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0107(0.0282) Grad: 2.2989  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.1485(0.1485) 


Epoch 5 - avg_train_loss: 0.0282  avg_val_loss: 1.3848  time: 41s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0282  avg_val_loss: 1.3848  time: 41s
Epoch 5 - Score: 0.2105
INFO:__main__:Epoch 5 - Score: 0.2105


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 4.1589(1.3848) 
Epoch: [6][0/279] Elapsed 0m 0s (remain 1m 33s) Loss: 0.0029(0.0029) Grad: nan  LR: 0.00001003  
Epoch: [6][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.0046(0.0105) Grad: 0.4610  LR: 0.00000891  
Epoch: [6][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.1111(0.0076) Grad: 20.4136  LR: 0.00000780  
Epoch: [6][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0016(0.0066) Grad: 0.0542  LR: 0.00000695  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.3472(0.3472) 


Epoch 6 - avg_train_loss: 0.0066  avg_val_loss: 1.4982  time: 41s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0066  avg_val_loss: 1.4982  time: 41s
Epoch 6 - Score: 0.3553
INFO:__main__:Epoch 6 - Score: 0.3553


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 3.2575(1.4982) 
Epoch: [7][0/279] Elapsed 0m 0s (remain 1m 29s) Loss: 0.0032(0.0032) Grad: nan  LR: 0.00000694  
Epoch: [7][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.0017(0.0019) Grad: 0.0982  LR: 0.00000590  
Epoch: [7][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0019(0.0032) Grad: 0.2290  LR: 0.00000490  
Epoch: [7][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0009(0.0027) Grad: 0.0303  LR: 0.00000417  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.2818(0.2818) 


Epoch 7 - avg_train_loss: 0.0027  avg_val_loss: 1.5867  time: 41s
INFO:__main__:Epoch 7 - avg_train_loss: 0.0027  avg_val_loss: 1.5867  time: 41s
Epoch 7 - Score: 0.3158
INFO:__main__:Epoch 7 - Score: 0.3158


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 3.8780(1.5867) 
Epoch: [8][0/279] Elapsed 0m 0s (remain 1m 23s) Loss: 0.0008(0.0008) Grad: nan  LR: 0.00000416  
Epoch: [8][100/279] Elapsed 0m 13s (remain 0m 23s) Loss: 0.0011(0.0013) Grad: 0.0439  LR: 0.00000328  
Epoch: [8][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0012(0.0013) Grad: 0.0668  LR: 0.00000250  
Epoch: [8][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0014(0.0012) Grad: 0.0941  LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.3071(0.3071) 


Epoch 8 - avg_train_loss: 0.0012  avg_val_loss: 1.6227  time: 41s
INFO:__main__:Epoch 8 - avg_train_loss: 0.0012  avg_val_loss: 1.6227  time: 41s
Epoch 8 - Score: 0.3158
INFO:__main__:Epoch 8 - Score: 0.3158


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 3.9190(1.6227) 
Epoch: [9][0/279] Elapsed 0m 0s (remain 1m 23s) Loss: 0.0009(0.0009) Grad: nan  LR: 0.00000194  
Epoch: [9][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.0007(0.0010) Grad: 0.0410  LR: 0.00000133  
Epoch: [9][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0015(0.0011) Grad: 0.1499  LR: 0.00000082  
Epoch: [9][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0008(0.0011) Grad: 0.0370  LR: 0.00000051  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.3071(0.3071) 


Epoch 9 - avg_train_loss: 0.0011  avg_val_loss: 1.6381  time: 41s
INFO:__main__:Epoch 9 - avg_train_loss: 0.0011  avg_val_loss: 1.6381  time: 41s
Epoch 9 - Score: 0.3158
INFO:__main__:Epoch 9 - Score: 0.3158


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 3.9756(1.6381) 
Epoch: [10][0/279] Elapsed 0m 0s (remain 1m 30s) Loss: 0.0012(0.0012) Grad: nan  LR: 0.00000051  
Epoch: [10][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.0011(0.0011) Grad: 0.0608  LR: 0.00000022  
Epoch: [10][200/279] Elapsed 0m 26s (remain 0m 10s) Loss: 0.0022(0.0011) Grad: 0.1573  LR: 0.00000005  
Epoch: [10][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0014(0.0010) Grad: 0.0588  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.3104(0.3104) 


Epoch 10 - avg_train_loss: 0.0010  avg_val_loss: 1.6399  time: 41s
INFO:__main__:Epoch 10 - avg_train_loss: 0.0010  avg_val_loss: 1.6399  time: 41s
Epoch 10 - Score: 0.3158
INFO:__main__:Epoch 10 - Score: 0.3158


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 3.9697(1.6399) 


Score: 0.5066
INFO:__main__:Score: 0.5066
ACC BEST Score: 0.7149
INFO:__main__:ACC BEST Score: 0.7149
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.26.1",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main

Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 44s) Loss: 0.5901(0.5901) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 13s (remain 0m 23s) Loss: 0.6114(0.6055) Grad: 1.9105  LR: 0.00001994  
Epoch: [1][200/279] Elapsed 0m 26s (remain 0m 10s) Loss: 0.7350(0.5995) Grad: 5.0241  LR: 0.00001975  
Epoch: [1][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.5725(0.5990) Grad: 1.8346  LR: 0.00001951  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.2206(0.2206) 


Epoch 1 - avg_train_loss: 0.5990  avg_val_loss: 0.5897  time: 41s
INFO:__main__:Epoch 1 - avg_train_loss: 0.5990  avg_val_loss: 0.5897  time: 41s
Epoch 1 - Score: 0.0065
INFO:__main__:Epoch 1 - Score: 0.0065
Epoch 1 - Save Best Score: 0.0065 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.0065 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.3888(0.5897) 
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 38s) Loss: 0.3924(0.3924) Grad: nan  LR: 0.00001951  
Epoch: [2][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.7110(0.5272) Grad: 7.6147  LR: 0.00001910  
Epoch: [2][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.4740(0.5034) Grad: 5.2017  LR: 0.00001858  
Epoch: [2][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.7112(0.5042) Grad: 9.4747  LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.2854(0.2854) 


Epoch 2 - avg_train_loss: 0.5042  avg_val_loss: 0.5804  time: 41s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5042  avg_val_loss: 0.5804  time: 41s
Epoch 2 - Score: 0.2549
INFO:__main__:Epoch 2 - Score: 0.2549
Epoch 2 - Save Best Score: 0.2549 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.2549 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.2151(0.5804) 
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 31s) Loss: 0.3329(0.3329) Grad: nan  LR: 0.00001809  
Epoch: [3][100/279] Elapsed 0m 14s (remain 0m 24s) Loss: 0.0902(0.2201) Grad: 3.6986  LR: 0.00001738  
Epoch: [3][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.2544(0.2403) Grad: 9.5521  LR: 0.00001658  
Epoch: [3][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.0850(0.2294) Grad: 6.6846  LR: 0.00001590  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.0678(0.0678) 


Epoch 3 - avg_train_loss: 0.2294  avg_val_loss: 1.0566  time: 41s
INFO:__main__:Epoch 3 - avg_train_loss: 0.2294  avg_val_loss: 1.0566  time: 41s
Epoch 3 - Score: 0.1569
INFO:__main__:Epoch 3 - Score: 0.1569


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 3.2085(1.0566) 
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 27s) Loss: 0.0819(0.0819) Grad: nan  LR: 0.00001589  
Epoch: [4][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.0802(0.0548) Grad: 10.0032  LR: 0.00001494  
Epoch: [4][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0056(0.0458) Grad: 0.2951  LR: 0.00001394  
Epoch: [4][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0376(0.0403) Grad: 6.9007  LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.6505(0.6505) 


Epoch 4 - avg_train_loss: 0.0403  avg_val_loss: 1.1453  time: 41s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0403  avg_val_loss: 1.1453  time: 41s
Epoch 4 - Score: 0.4314
INFO:__main__:Epoch 4 - Score: 0.4314
Epoch 4 - Save Best Score: 0.4314 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.4314 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.9990(1.1453) 
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 32s) Loss: 0.0048(0.0048) Grad: nan  LR: 0.00001311  
Epoch: [5][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.0024(0.0047) Grad: 0.1829  LR: 0.00001202  
Epoch: [5][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0015(0.0047) Grad: 0.1095  LR: 0.00001091  
Epoch: [5][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.0028(0.0043) Grad: 0.2448  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.5199(0.5199) 


Epoch 5 - avg_train_loss: 0.0043  avg_val_loss: 1.3432  time: 42s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0043  avg_val_loss: 1.3432  time: 42s
Epoch 5 - Score: 0.3791
INFO:__main__:Epoch 5 - Score: 0.3791


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 2.8562(1.3432) 
Epoch: [6][0/279] Elapsed 0m 0s (remain 1m 35s) Loss: 0.0013(0.0013) Grad: nan  LR: 0.00001003  
Epoch: [6][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.0015(0.0013) Grad: 0.1402  LR: 0.00000891  
Epoch: [6][200/279] Elapsed 0m 26s (remain 0m 10s) Loss: 0.0009(0.0011) Grad: 0.0464  LR: 0.00000780  
Epoch: [6][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0012(0.0011) Grad: 0.0612  LR: 0.00000695  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.5965(0.5965) 


Epoch 6 - avg_train_loss: 0.0011  avg_val_loss: 1.4263  time: 41s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0011  avg_val_loss: 1.4263  time: 41s
Epoch 6 - Score: 0.3922
INFO:__main__:Epoch 6 - Score: 0.3922


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 2.9329(1.4263) 
Epoch: [7][0/279] Elapsed 0m 0s (remain 1m 33s) Loss: 0.0009(0.0009) Grad: nan  LR: 0.00000694  
Epoch: [7][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.0006(0.0008) Grad: 0.0377  LR: 0.00000590  
Epoch: [7][200/279] Elapsed 0m 26s (remain 0m 10s) Loss: 0.0009(0.0007) Grad: 0.0761  LR: 0.00000490  
Epoch: [7][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0004(0.0007) Grad: 0.0270  LR: 0.00000417  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.5770(0.5770) 


Epoch 7 - avg_train_loss: 0.0007  avg_val_loss: 1.4766  time: 41s
INFO:__main__:Epoch 7 - avg_train_loss: 0.0007  avg_val_loss: 1.4766  time: 41s
Epoch 7 - Score: 0.3791
INFO:__main__:Epoch 7 - Score: 0.3791


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 3.1160(1.4766) 
Epoch: [8][0/279] Elapsed 0m 0s (remain 1m 40s) Loss: 0.0003(0.0003) Grad: nan  LR: 0.00000416  
Epoch: [8][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.0005(0.0006) Grad: 0.0358  LR: 0.00000328  
Epoch: [8][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0002(0.0006) Grad: 0.0172  LR: 0.00000250  
Epoch: [8][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0005(0.0006) Grad: 0.0303  LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.5821(0.5821) 


Epoch 8 - avg_train_loss: 0.0006  avg_val_loss: 1.5022  time: 41s
INFO:__main__:Epoch 8 - avg_train_loss: 0.0006  avg_val_loss: 1.5022  time: 41s
Epoch 8 - Score: 0.3791
INFO:__main__:Epoch 8 - Score: 0.3791


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 3.1796(1.5022) 
Epoch: [9][0/279] Elapsed 0m 0s (remain 1m 33s) Loss: 0.0007(0.0007) Grad: nan  LR: 0.00000194  
Epoch: [9][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.0002(0.0005) Grad: 0.0219  LR: 0.00000133  
Epoch: [9][200/279] Elapsed 0m 26s (remain 0m 10s) Loss: 0.0006(0.0005) Grad: 0.0551  LR: 0.00000082  
Epoch: [9][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0006(0.0005) Grad: 0.0569  LR: 0.00000051  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.5809(0.5809) 


Epoch 9 - avg_train_loss: 0.0005  avg_val_loss: 1.5132  time: 41s
INFO:__main__:Epoch 9 - avg_train_loss: 0.0005  avg_val_loss: 1.5132  time: 41s
Epoch 9 - Score: 0.3791
INFO:__main__:Epoch 9 - Score: 0.3791


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 3.2147(1.5132) 
Epoch: [10][0/279] Elapsed 0m 0s (remain 1m 32s) Loss: 0.0005(0.0005) Grad: nan  LR: 0.00000051  
Epoch: [10][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.0002(0.0005) Grad: 0.0164  LR: 0.00000022  
Epoch: [10][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0002(0.0005) Grad: 0.0329  LR: 0.00000005  
Epoch: [10][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0004(0.0005) Grad: 0.0257  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.5818(0.5818) 


Epoch 10 - avg_train_loss: 0.0005  avg_val_loss: 1.5146  time: 41s
INFO:__main__:Epoch 10 - avg_train_loss: 0.0005  avg_val_loss: 1.5146  time: 41s
Epoch 10 - Score: 0.3791
INFO:__main__:Epoch 10 - Score: 0.3791


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 3.2172(1.5146) 


Score: 0.4314
INFO:__main__:Score: 0.4314
ACC BEST Score: 0.6928
INFO:__main__:ACC BEST Score: 0.6928
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.26.1",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main

Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 37s) Loss: 0.8085(0.8085) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.6897(0.6314) Grad: 2.3622  LR: 0.00001994  
Epoch: [1][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.5108(0.6258) Grad: 4.2958  LR: 0.00001975  
Epoch: [1][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.6219(0.6189) Grad: 0.9405  LR: 0.00001951  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.4241(0.4241) 


Epoch 1 - avg_train_loss: 0.6189  avg_val_loss: 0.5914  time: 41s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6189  avg_val_loss: 0.5914  time: 41s
Epoch 1 - Score: 0.0000
INFO:__main__:Epoch 1 - Score: 0.0000
Epoch 1 - Save Best Score: 0.0000 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.0000 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.9949(0.5914) 
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 42s) Loss: 0.4896(0.4896) Grad: nan  LR: 0.00001951  
Epoch: [2][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.4980(0.5707) Grad: 1.5132  LR: 0.00001910  
Epoch: [2][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.3711(0.5699) Grad: 1.5114  LR: 0.00001858  
Epoch: [2][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.4811(0.5684) Grad: 5.8560  LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.4540(0.4540) 


Epoch 2 - avg_train_loss: 0.5684  avg_val_loss: 0.5670  time: 41s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5684  avg_val_loss: 0.5670  time: 41s
Epoch 2 - Score: 0.4118
INFO:__main__:Epoch 2 - Score: 0.4118
Epoch 2 - Save Best Score: 0.4118 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.4118 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8554(0.5670) 
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 37s) Loss: 0.4489(0.4489) Grad: nan  LR: 0.00001809  
Epoch: [3][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.4729(0.4467) Grad: 3.5947  LR: 0.00001738  
Epoch: [3][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.1783(0.4322) Grad: 3.0901  LR: 0.00001658  
Epoch: [3][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.3750(0.4288) Grad: 5.8486  LR: 0.00001590  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.2527(0.2527) 


Epoch 3 - avg_train_loss: 0.4288  avg_val_loss: 0.6409  time: 41s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4288  avg_val_loss: 0.6409  time: 41s
Epoch 3 - Score: 0.2222
INFO:__main__:Epoch 3 - Score: 0.2222


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.8588(0.6409) 
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 43s) Loss: 0.1245(0.1245) Grad: nan  LR: 0.00001589  
Epoch: [4][100/279] Elapsed 0m 13s (remain 0m 23s) Loss: 0.0811(0.1366) Grad: 7.4815  LR: 0.00001494  
Epoch: [4][200/279] Elapsed 0m 26s (remain 0m 10s) Loss: 0.0474(0.1278) Grad: 4.5302  LR: 0.00001394  
Epoch: [4][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.4438(0.1233) Grad: 29.5140  LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.9055(0.9055) 


Epoch 4 - avg_train_loss: 0.1233  avg_val_loss: 0.9412  time: 41s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1233  avg_val_loss: 0.9412  time: 41s
Epoch 4 - Score: 0.6144
INFO:__main__:Epoch 4 - Score: 0.6144
Epoch 4 - Save Best Score: 0.6144 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.6144 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.5424(0.9412) 
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 49s) Loss: 0.0112(0.0112) Grad: nan  LR: 0.00001311  
Epoch: [5][100/279] Elapsed 0m 14s (remain 0m 24s) Loss: 0.0113(0.0313) Grad: 2.0209  LR: 0.00001202  
Epoch: [5][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0071(0.0246) Grad: 0.6149  LR: 0.00001091  
Epoch: [5][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0010(0.0212) Grad: 0.0936  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3498(0.3498) 


Epoch 5 - avg_train_loss: 0.0212  avg_val_loss: 1.2312  time: 41s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0212  avg_val_loss: 1.2312  time: 41s
Epoch 5 - Score: 0.2876
INFO:__main__:Epoch 5 - Score: 0.2876


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 3.6340(1.2312) 
Epoch: [6][0/279] Elapsed 0m 0s (remain 2m 0s) Loss: 0.0010(0.0010) Grad: nan  LR: 0.00001003  
Epoch: [6][100/279] Elapsed 0m 13s (remain 0m 23s) Loss: 0.0019(0.0026) Grad: 0.1307  LR: 0.00000891  
Epoch: [6][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0013(0.0021) Grad: 0.0771  LR: 0.00000780  
Epoch: [6][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0021(0.0021) Grad: 0.1604  LR: 0.00000695  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5538(0.5538) 


Epoch 6 - avg_train_loss: 0.0021  avg_val_loss: 1.2938  time: 41s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0021  avg_val_loss: 1.2938  time: 41s
Epoch 6 - Score: 0.3660
INFO:__main__:Epoch 6 - Score: 0.3660


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 3.4959(1.2938) 
Epoch: [7][0/279] Elapsed 0m 0s (remain 1m 32s) Loss: 0.0010(0.0010) Grad: nan  LR: 0.00000694  
Epoch: [7][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.0015(0.0012) Grad: 0.0772  LR: 0.00000590  
Epoch: [7][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0009(0.0010) Grad: 0.0698  LR: 0.00000490  
Epoch: [7][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0007(0.0010) Grad: 0.0386  LR: 0.00000417  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.6302(0.6302) 


Epoch 7 - avg_train_loss: 0.0010  avg_val_loss: 1.3456  time: 41s
INFO:__main__:Epoch 7 - avg_train_loss: 0.0010  avg_val_loss: 1.3456  time: 41s
Epoch 7 - Score: 0.3791
INFO:__main__:Epoch 7 - Score: 0.3791


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 3.5437(1.3456) 
Epoch: [8][0/279] Elapsed 0m 0s (remain 1m 32s) Loss: 0.0008(0.0008) Grad: nan  LR: 0.00000416  
Epoch: [8][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.0012(0.0007) Grad: 0.0802  LR: 0.00000328  
Epoch: [8][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0005(0.0008) Grad: 0.0335  LR: 0.00000250  
Epoch: [8][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0005(0.0007) Grad: 0.0291  LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.6460(0.6460) 


Epoch 8 - avg_train_loss: 0.0007  avg_val_loss: 1.3757  time: 41s
INFO:__main__:Epoch 8 - avg_train_loss: 0.0007  avg_val_loss: 1.3757  time: 41s
Epoch 8 - Score: 0.3791
INFO:__main__:Epoch 8 - Score: 0.3791


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 3.6102(1.3757) 
Epoch: [9][0/279] Elapsed 0m 0s (remain 1m 28s) Loss: 0.0012(0.0012) Grad: nan  LR: 0.00000194  
Epoch: [9][100/279] Elapsed 0m 14s (remain 0m 24s) Loss: 0.0005(0.0008) Grad: 0.0487  LR: 0.00000133  
Epoch: [9][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0007(0.0007) Grad: 0.0416  LR: 0.00000082  
Epoch: [9][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0003(0.0006) Grad: 0.0216  LR: 0.00000051  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6469(0.6469) 


Epoch 9 - avg_train_loss: 0.0006  avg_val_loss: 1.3880  time: 41s
INFO:__main__:Epoch 9 - avg_train_loss: 0.0006  avg_val_loss: 1.3880  time: 41s
Epoch 9 - Score: 0.3791
INFO:__main__:Epoch 9 - Score: 0.3791


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 3.6463(1.3880) 
Epoch: [10][0/279] Elapsed 0m 0s (remain 1m 31s) Loss: 0.0004(0.0004) Grad: nan  LR: 0.00000051  
Epoch: [10][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.0008(0.0006) Grad: 0.0634  LR: 0.00000022  
Epoch: [10][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0007(0.0006) Grad: 0.0329  LR: 0.00000005  
Epoch: [10][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0003(0.0006) Grad: 0.0138  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.6480(0.6480) 


Epoch 10 - avg_train_loss: 0.0006  avg_val_loss: 1.3896  time: 41s
INFO:__main__:Epoch 10 - avg_train_loss: 0.0006  avg_val_loss: 1.3896  time: 41s
Epoch 10 - Score: 0.3791
INFO:__main__:Epoch 10 - Score: 0.3791


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 3.6496(1.3896) 


Score: 0.6144
INFO:__main__:Score: 0.6144
ACC BEST Score: 0.6888
INFO:__main__:ACC BEST Score: 0.6888
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.26.1",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main

Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 38s) Loss: 0.8458(0.8458) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.4634(0.6571) Grad: 5.0727  LR: 0.00001994  
Epoch: [1][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.5098(0.6370) Grad: 6.3033  LR: 0.00001975  
Epoch: [1][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.6291(0.6286) Grad: 1.9398  LR: 0.00001951  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.4494(0.4494) 


Epoch 1 - avg_train_loss: 0.6286  avg_val_loss: 0.6139  time: 41s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6286  avg_val_loss: 0.6139  time: 41s
Epoch 1 - Score: 0.0000
INFO:__main__:Epoch 1 - Score: 0.0000
Epoch 1 - Save Best Score: 0.0000 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.0000 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.9919(0.6139) 
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 49s) Loss: 0.5907(0.5907) Grad: nan  LR: 0.00001951  
Epoch: [2][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.5936(0.5931) Grad: 0.6107  LR: 0.00001910  
Epoch: [2][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.5517(0.5892) Grad: 2.8547  LR: 0.00001858  
Epoch: [2][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.6157(0.5954) Grad: 7.9228  LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.5521(0.5521) 


Epoch 2 - avg_train_loss: 0.5954  avg_val_loss: 0.6309  time: 42s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5954  avg_val_loss: 0.6309  time: 42s
Epoch 2 - Score: 0.3725
INFO:__main__:Epoch 2 - Score: 0.3725
Epoch 2 - Save Best Score: 0.3725 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.3725 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7610(0.6309) 
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 39s) Loss: 0.6583(0.6583) Grad: nan  LR: 0.00001809  
Epoch: [3][100/279] Elapsed 0m 14s (remain 0m 24s) Loss: 0.3317(0.5483) Grad: 3.2685  LR: 0.00001738  
Epoch: [3][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.5585(0.5456) Grad: 3.0125  LR: 0.00001658  
Epoch: [3][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.4044(0.5339) Grad: 1.7682  LR: 0.00001590  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.2149(0.2149) 


Epoch 3 - avg_train_loss: 0.5339  avg_val_loss: 0.6044  time: 41s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5339  avg_val_loss: 0.6044  time: 41s
Epoch 3 - Score: 0.0523
INFO:__main__:Epoch 3 - Score: 0.0523


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.3911(0.6044) 
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 26s) Loss: 0.4595(0.4595) Grad: nan  LR: 0.00001589  
Epoch: [4][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.1921(0.3606) Grad: 3.3636  LR: 0.00001494  
Epoch: [4][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.3251(0.3330) Grad: 11.7310  LR: 0.00001394  
Epoch: [4][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.4327(0.3257) Grad: 10.1146  LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.2738(0.2738) 


Epoch 4 - avg_train_loss: 0.3257  avg_val_loss: 0.7019  time: 41s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3257  avg_val_loss: 0.7019  time: 41s
Epoch 4 - Score: 0.2680
INFO:__main__:Epoch 4 - Score: 0.2680


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.4791(0.7019) 
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 36s) Loss: 0.0540(0.0540) Grad: nan  LR: 0.00001311  
Epoch: [5][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.0519(0.0545) Grad: 6.3684  LR: 0.00001202  
Epoch: [5][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0633(0.0650) Grad: 5.8982  LR: 0.00001091  
Epoch: [5][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0191(0.0597) Grad: 3.6595  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2988(0.2988) 


Epoch 5 - avg_train_loss: 0.0597  avg_val_loss: 1.1385  time: 41s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0597  avg_val_loss: 1.1385  time: 41s
Epoch 5 - Score: 0.3072
INFO:__main__:Epoch 5 - Score: 0.3072


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 2.5203(1.1385) 
Epoch: [6][0/279] Elapsed 0m 0s (remain 1m 31s) Loss: 0.0175(0.0175) Grad: nan  LR: 0.00001003  
Epoch: [6][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.0035(0.0059) Grad: 0.2625  LR: 0.00000891  
Epoch: [6][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0013(0.0053) Grad: 0.0535  LR: 0.00000780  
Epoch: [6][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0018(0.0051) Grad: 0.1824  LR: 0.00000695  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.5005(0.5005) 


Epoch 6 - avg_train_loss: 0.0051  avg_val_loss: 1.3753  time: 41s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0051  avg_val_loss: 1.3753  time: 41s
Epoch 6 - Score: 0.3399
INFO:__main__:Epoch 6 - Score: 0.3399


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 2.3650(1.3753) 
Epoch: [7][0/279] Elapsed 0m 0s (remain 1m 34s) Loss: 0.0028(0.0028) Grad: nan  LR: 0.00000694  
Epoch: [7][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.0014(0.0020) Grad: 0.1008  LR: 0.00000590  
Epoch: [7][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0010(0.0016) Grad: 0.0832  LR: 0.00000490  
Epoch: [7][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0007(0.0015) Grad: 0.0457  LR: 0.00000417  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.3896(0.3896) 


Epoch 7 - avg_train_loss: 0.0015  avg_val_loss: 1.5050  time: 41s
INFO:__main__:Epoch 7 - avg_train_loss: 0.0015  avg_val_loss: 1.5050  time: 41s
Epoch 7 - Score: 0.3007
INFO:__main__:Epoch 7 - Score: 0.3007


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 3.0384(1.5050) 
Epoch: [8][0/279] Elapsed 0m 0s (remain 1m 26s) Loss: 0.0013(0.0013) Grad: nan  LR: 0.00000416  
Epoch: [8][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.0005(0.0012) Grad: 0.0337  LR: 0.00000328  
Epoch: [8][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0004(0.0011) Grad: 0.0248  LR: 0.00000250  
Epoch: [8][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0014(0.0010) Grad: 0.0846  LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.4053(0.4053) 


Epoch 8 - avg_train_loss: 0.0010  avg_val_loss: 1.5489  time: 41s
INFO:__main__:Epoch 8 - avg_train_loss: 0.0010  avg_val_loss: 1.5489  time: 41s
Epoch 8 - Score: 0.3007
INFO:__main__:Epoch 8 - Score: 0.3007


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 3.1309(1.5489) 
Epoch: [9][0/279] Elapsed 0m 0s (remain 1m 27s) Loss: 0.0007(0.0007) Grad: nan  LR: 0.00000194  
Epoch: [9][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.0010(0.0009) Grad: 0.0471  LR: 0.00000133  
Epoch: [9][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0007(0.0009) Grad: 0.0352  LR: 0.00000082  
Epoch: [9][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0005(0.0009) Grad: 0.0323  LR: 0.00000051  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.4160(0.4160) 


Epoch 9 - avg_train_loss: 0.0009  avg_val_loss: 1.5649  time: 41s
INFO:__main__:Epoch 9 - avg_train_loss: 0.0009  avg_val_loss: 1.5649  time: 41s
Epoch 9 - Score: 0.3007
INFO:__main__:Epoch 9 - Score: 0.3007


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 3.1527(1.5649) 
Epoch: [10][0/279] Elapsed 0m 0s (remain 1m 38s) Loss: 0.0005(0.0005) Grad: nan  LR: 0.00000051  
Epoch: [10][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.0009(0.0008) Grad: 0.1033  LR: 0.00000022  
Epoch: [10][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0006(0.0008) Grad: 0.0266  LR: 0.00000005  
Epoch: [10][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0002(0.0008) Grad: 0.0249  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.4204(0.4204) 


Epoch 10 - avg_train_loss: 0.0008  avg_val_loss: 1.5663  time: 41s
INFO:__main__:Epoch 10 - avg_train_loss: 0.0008  avg_val_loss: 1.5663  time: 41s
Epoch 10 - Score: 0.3007
INFO:__main__:Epoch 10 - Score: 0.3007


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 3.1446(1.5663) 


Score: 0.3725
INFO:__main__:Score: 0.3725
ACC BEST Score: 0.7048
INFO:__main__:ACC BEST Score: 0.7048
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.26.1",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main

Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 38s) Loss: 0.6623(0.6623) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.5144(0.6198) Grad: 4.9023  LR: 0.00001994  
Epoch: [1][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.5240(0.6172) Grad: 9.4712  LR: 0.00001975  
Epoch: [1][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.6373(0.6106) Grad: 1.1889  LR: 0.00001951  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.5191(0.5191) 


Epoch 1 - avg_train_loss: 0.6106  avg_val_loss: 0.6087  time: 41s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6106  avg_val_loss: 0.6087  time: 41s
Epoch 1 - Score: 0.2829
INFO:__main__:Epoch 1 - Score: 0.2829
Epoch 1 - Save Best Score: 0.2829 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.2829 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7525(0.6087) 
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 41s) Loss: 0.6279(0.6279) Grad: nan  LR: 0.00001951  
Epoch: [2][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.6139(0.5382) Grad: 3.4415  LR: 0.00001910  
Epoch: [2][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.3408(0.5363) Grad: 4.1973  LR: 0.00001858  
Epoch: [2][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.3721(0.5321) Grad: 4.4738  LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.2981(0.2981) 


Epoch 2 - avg_train_loss: 0.5321  avg_val_loss: 0.5819  time: 41s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5321  avg_val_loss: 0.5819  time: 41s
Epoch 2 - Score: 0.1645
INFO:__main__:Epoch 2 - Score: 0.1645


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1747(0.5819) 
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 29s) Loss: 0.5098(0.5098) Grad: nan  LR: 0.00001809  
Epoch: [3][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.3512(0.3325) Grad: 7.2442  LR: 0.00001739  
Epoch: [3][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.2565(0.3319) Grad: 6.3050  LR: 0.00001658  
Epoch: [3][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.1238(0.3157) Grad: 5.8186  LR: 0.00001590  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.6004(0.6004) 


Epoch 3 - avg_train_loss: 0.3157  avg_val_loss: 0.7448  time: 41s
INFO:__main__:Epoch 3 - avg_train_loss: 0.3157  avg_val_loss: 0.7448  time: 41s
Epoch 3 - Score: 0.4211
INFO:__main__:Epoch 3 - Score: 0.4211
Epoch 3 - Save Best Score: 0.4211 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.4211 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.9156(0.7448) 
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 41s) Loss: 0.0483(0.0483) Grad: nan  LR: 0.00001589  
Epoch: [4][100/279] Elapsed 0m 14s (remain 0m 24s) Loss: 0.0279(0.0523) Grad: 2.8438  LR: 0.00001495  
Epoch: [4][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0309(0.0562) Grad: 3.8549  LR: 0.00001394  
Epoch: [4][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.3842(0.0550) Grad: 6.2685  LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.6418(0.6418) 


Epoch 4 - avg_train_loss: 0.0550  avg_val_loss: 1.1332  time: 41s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0550  avg_val_loss: 1.1332  time: 41s
Epoch 4 - Score: 0.3092
INFO:__main__:Epoch 4 - Score: 0.3092


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 2.0113(1.1332) 
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 32s) Loss: 0.0088(0.0088) Grad: nan  LR: 0.00001311  
Epoch: [5][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.0202(0.0079) Grad: 4.2258  LR: 0.00001203  
Epoch: [5][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0025(0.0067) Grad: 0.2006  LR: 0.00001092  
Epoch: [5][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0025(0.0058) Grad: 0.3907  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.8633(0.8633) 


Epoch 5 - avg_train_loss: 0.0058  avg_val_loss: 1.3597  time: 41s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0058  avg_val_loss: 1.3597  time: 41s
Epoch 5 - Score: 0.3684
INFO:__main__:Epoch 5 - Score: 0.3684


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 2.4556(1.3597) 
Epoch: [6][0/279] Elapsed 0m 0s (remain 1m 43s) Loss: 0.0018(0.0018) Grad: nan  LR: 0.00001003  
Epoch: [6][100/279] Elapsed 0m 13s (remain 0m 23s) Loss: 0.0010(0.0020) Grad: 0.0637  LR: 0.00000891  
Epoch: [6][200/279] Elapsed 0m 26s (remain 0m 10s) Loss: 0.0013(0.0016) Grad: 0.0810  LR: 0.00000781  
Epoch: [6][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0010(0.0015) Grad: 0.1032  LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.8532(0.8532) 


Epoch 6 - avg_train_loss: 0.0015  avg_val_loss: 1.4433  time: 41s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0015  avg_val_loss: 1.4433  time: 41s
Epoch 6 - Score: 0.3618
INFO:__main__:Epoch 6 - Score: 0.3618


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 2.7354(1.4433) 
Epoch: [7][0/279] Elapsed 0m 0s (remain 1m 34s) Loss: 0.0014(0.0014) Grad: nan  LR: 0.00000695  
Epoch: [7][100/279] Elapsed 0m 14s (remain 0m 24s) Loss: 0.0004(0.0008) Grad: 0.0355  LR: 0.00000590  
Epoch: [7][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0004(0.0008) Grad: 0.0314  LR: 0.00000491  
Epoch: [7][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0009(0.0008) Grad: 0.0906  LR: 0.00000417  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.8310(0.8310) 


Epoch 7 - avg_train_loss: 0.0008  avg_val_loss: 1.4979  time: 41s
INFO:__main__:Epoch 7 - avg_train_loss: 0.0008  avg_val_loss: 1.4979  time: 41s
Epoch 7 - Score: 0.3553
INFO:__main__:Epoch 7 - Score: 0.3553


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 2.9258(1.4979) 
Epoch: [8][0/279] Elapsed 0m 0s (remain 1m 46s) Loss: 0.0012(0.0012) Grad: nan  LR: 0.00000416  
Epoch: [8][100/279] Elapsed 0m 13s (remain 0m 23s) Loss: 0.0005(0.0007) Grad: 0.0361  LR: 0.00000329  
Epoch: [8][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0007(0.0006) Grad: 0.0790  LR: 0.00000250  
Epoch: [8][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0004(0.0006) Grad: 0.0296  LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.8222(0.8222) 


Epoch 8 - avg_train_loss: 0.0006  avg_val_loss: 1.5269  time: 41s
INFO:__main__:Epoch 8 - avg_train_loss: 0.0006  avg_val_loss: 1.5269  time: 41s
Epoch 8 - Score: 0.3553
INFO:__main__:Epoch 8 - Score: 0.3553


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 3.0228(1.5269) 
Epoch: [9][0/279] Elapsed 0m 0s (remain 1m 46s) Loss: 0.0002(0.0002) Grad: nan  LR: 0.00000195  
Epoch: [9][100/279] Elapsed 0m 14s (remain 0m 24s) Loss: 0.0004(0.0005) Grad: 0.0402  LR: 0.00000133  
Epoch: [9][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0005(0.0005) Grad: 0.0319  LR: 0.00000083  
Epoch: [9][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0005(0.0005) Grad: 0.0407  LR: 0.00000051  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.8311(0.8311) 


Epoch 9 - avg_train_loss: 0.0005  avg_val_loss: 1.5374  time: 41s
INFO:__main__:Epoch 9 - avg_train_loss: 0.0005  avg_val_loss: 1.5374  time: 41s
Epoch 9 - Score: 0.3553
INFO:__main__:Epoch 9 - Score: 0.3553


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 3.0329(1.5374) 
Epoch: [10][0/279] Elapsed 0m 0s (remain 1m 36s) Loss: 0.0004(0.0004) Grad: nan  LR: 0.00000051  
Epoch: [10][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.0002(0.0005) Grad: 0.0157  LR: 0.00000022  
Epoch: [10][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0007(0.0005) Grad: 0.0375  LR: 0.00000005  
Epoch: [10][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0001(0.0005) Grad: 0.0161  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.8302(0.8302) 


Epoch 10 - avg_train_loss: 0.0005  avg_val_loss: 1.5392  time: 41s
INFO:__main__:Epoch 10 - avg_train_loss: 0.0005  avg_val_loss: 1.5392  time: 41s
Epoch 10 - Score: 0.3553
INFO:__main__:Epoch 10 - Score: 0.3553


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 3.0400(1.5392) 


Score: 0.4211
INFO:__main__:Score: 0.4211
ACC BEST Score: 0.6901
INFO:__main__:ACC BEST Score: 0.6901
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.26.1",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main

Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 32s) Loss: 0.6257(0.6257) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.6679(0.6135) Grad: 5.4943  LR: 0.00001994  
Epoch: [1][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.5924(0.6035) Grad: 1.0335  LR: 0.00001975  
Epoch: [1][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.4453(0.6014) Grad: 2.8897  LR: 0.00001951  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.3763(0.3763) 


Epoch 1 - avg_train_loss: 0.6014  avg_val_loss: 0.6017  time: 41s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6014  avg_val_loss: 0.6017  time: 41s
Epoch 1 - Score: 0.0592
INFO:__main__:Epoch 1 - Score: 0.0592
Epoch 1 - Save Best Score: 0.0592 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.0592 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.0526(0.6017) 
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 49s) Loss: 0.5062(0.5062) Grad: nan  LR: 0.00001951  
Epoch: [2][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.6628(0.5345) Grad: 3.3481  LR: 0.00001910  
Epoch: [2][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.9752(0.5294) Grad: 9.8305  LR: 0.00001858  
Epoch: [2][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.4942(0.5249) Grad: 3.1766  LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.2489(0.2489) 


Epoch 2 - avg_train_loss: 0.5249  avg_val_loss: 0.6262  time: 41s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5249  avg_val_loss: 0.6262  time: 41s
Epoch 2 - Score: 0.0789
INFO:__main__:Epoch 2 - Score: 0.0789
Epoch 2 - Save Best Score: 0.0789 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.0789 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.4267(0.6262) 
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 33s) Loss: 0.3220(0.3220) Grad: nan  LR: 0.00001809  
Epoch: [3][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.2712(0.3098) Grad: 11.5610  LR: 0.00001739  
Epoch: [3][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.3651(0.3003) Grad: 16.3451  LR: 0.00001658  
Epoch: [3][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.1926(0.3008) Grad: 5.4255  LR: 0.00001590  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.5940(0.5940) 


Epoch 3 - avg_train_loss: 0.3008  avg_val_loss: 0.7239  time: 41s
INFO:__main__:Epoch 3 - avg_train_loss: 0.3008  avg_val_loss: 0.7239  time: 41s
Epoch 3 - Score: 0.4342
INFO:__main__:Epoch 3 - Score: 0.4342
Epoch 3 - Save Best Score: 0.4342 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.4342 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.9103(0.7239) 
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 44s) Loss: 0.0777(0.0777) Grad: nan  LR: 0.00001589  
Epoch: [4][100/279] Elapsed 0m 14s (remain 0m 24s) Loss: 0.0842(0.0555) Grad: 7.4505  LR: 0.00001495  
Epoch: [4][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0120(0.0609) Grad: 0.8143  LR: 0.00001394  
Epoch: [4][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0075(0.0543) Grad: 0.6247  LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 1.1531(1.1531) 


Epoch 4 - avg_train_loss: 0.0543  avg_val_loss: 1.1684  time: 41s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0543  avg_val_loss: 1.1684  time: 41s
Epoch 4 - Score: 0.4342
INFO:__main__:Epoch 4 - Score: 0.4342


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.5475(1.1684) 
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 33s) Loss: 0.0075(0.0075) Grad: nan  LR: 0.00001311  
Epoch: [5][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.0030(0.0109) Grad: 0.2309  LR: 0.00001203  
Epoch: [5][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0027(0.0084) Grad: 0.1765  LR: 0.00001092  
Epoch: [5][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.1365(0.0100) Grad: 17.2464  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3760(0.3760) 


Epoch 5 - avg_train_loss: 0.0100  avg_val_loss: 1.3623  time: 41s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0100  avg_val_loss: 1.3623  time: 41s
Epoch 5 - Score: 0.1908
INFO:__main__:Epoch 5 - Score: 0.1908


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 2.9222(1.3623) 
Epoch: [6][0/279] Elapsed 0m 0s (remain 1m 32s) Loss: 0.0039(0.0039) Grad: nan  LR: 0.00001003  
Epoch: [6][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.0010(0.0027) Grad: 0.0526  LR: 0.00000891  
Epoch: [6][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0013(0.0027) Grad: 0.0814  LR: 0.00000781  
Epoch: [6][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0008(0.0023) Grad: 0.0297  LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.6465(0.6465) 


Epoch 6 - avg_train_loss: 0.0023  avg_val_loss: 1.4591  time: 41s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0023  avg_val_loss: 1.4591  time: 41s
Epoch 6 - Score: 0.2566
INFO:__main__:Epoch 6 - Score: 0.2566


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 2.6722(1.4591) 
Epoch: [7][0/279] Elapsed 0m 0s (remain 1m 28s) Loss: 0.0007(0.0007) Grad: nan  LR: 0.00000695  
Epoch: [7][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.0005(0.0010) Grad: 0.0242  LR: 0.00000590  
Epoch: [7][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0011(0.0010) Grad: 0.0492  LR: 0.00000491  
Epoch: [7][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0006(0.0010) Grad: 0.0369  LR: 0.00000417  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.7932(0.7932) 


Epoch 7 - avg_train_loss: 0.0010  avg_val_loss: 1.5154  time: 41s
INFO:__main__:Epoch 7 - avg_train_loss: 0.0010  avg_val_loss: 1.5154  time: 41s
Epoch 7 - Score: 0.2829
INFO:__main__:Epoch 7 - Score: 0.2829


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 2.6053(1.5154) 
Epoch: [8][0/279] Elapsed 0m 0s (remain 1m 29s) Loss: 0.0007(0.0007) Grad: nan  LR: 0.00000416  
Epoch: [8][100/279] Elapsed 0m 13s (remain 0m 23s) Loss: 0.0005(0.0008) Grad: 0.0299  LR: 0.00000329  
Epoch: [8][200/279] Elapsed 0m 26s (remain 0m 10s) Loss: 0.0009(0.0008) Grad: 0.0520  LR: 0.00000250  
Epoch: [8][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0004(0.0008) Grad: 0.0373  LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.7958(0.7958) 


Epoch 8 - avg_train_loss: 0.0008  avg_val_loss: 1.5514  time: 41s
INFO:__main__:Epoch 8 - avg_train_loss: 0.0008  avg_val_loss: 1.5514  time: 41s
Epoch 8 - Score: 0.2763
INFO:__main__:Epoch 8 - Score: 0.2763


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 2.7056(1.5514) 
Epoch: [9][0/279] Elapsed 0m 0s (remain 1m 29s) Loss: 0.0002(0.0002) Grad: nan  LR: 0.00000195  
Epoch: [9][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.0006(0.0007) Grad: 0.0337  LR: 0.00000133  
Epoch: [9][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0009(0.0007) Grad: 0.0534  LR: 0.00000083  
Epoch: [9][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0015(0.0007) Grad: 0.2387  LR: 0.00000051  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.7946(0.7946) 


Epoch 9 - avg_train_loss: 0.0007  avg_val_loss: 1.5663  time: 41s
INFO:__main__:Epoch 9 - avg_train_loss: 0.0007  avg_val_loss: 1.5663  time: 41s
Epoch 9 - Score: 0.2763
INFO:__main__:Epoch 9 - Score: 0.2763


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 2.7514(1.5663) 
Epoch: [10][0/279] Elapsed 0m 0s (remain 1m 29s) Loss: 0.0009(0.0009) Grad: nan  LR: 0.00000051  
Epoch: [10][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.0010(0.0007) Grad: 0.0716  LR: 0.00000022  
Epoch: [10][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0004(0.0006) Grad: 0.0209  LR: 0.00000005  
Epoch: [10][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0010(0.0007) Grad: 0.0726  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.8005(0.8005) 


Epoch 10 - avg_train_loss: 0.0007  avg_val_loss: 1.5678  time: 41s
INFO:__main__:Epoch 10 - avg_train_loss: 0.0007  avg_val_loss: 1.5678  time: 41s
Epoch 10 - Score: 0.2763
INFO:__main__:Epoch 10 - Score: 0.2763


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 2.7457(1.5678) 


Score: 0.4342
INFO:__main__:Score: 0.4342
ACC BEST Score: 0.6640
INFO:__main__:ACC BEST Score: 0.6640
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.26.1",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main

Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 36s) Loss: 0.8643(0.8643) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.5742(0.6481) Grad: 2.9423  LR: 0.00001994  
Epoch: [1][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.7707(0.6251) Grad: 4.8847  LR: 0.00001975  
Epoch: [1][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.5540(0.6228) Grad: 6.7533  LR: 0.00001951  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4778(0.4778) 


Epoch 1 - avg_train_loss: 0.6228  avg_val_loss: 0.6050  time: 41s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6228  avg_val_loss: 0.6050  time: 41s
Epoch 1 - Score: 0.0000
INFO:__main__:Epoch 1 - Score: 0.0000
Epoch 1 - Save Best Score: 0.0000 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.0000 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8973(0.6050) 
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 47s) Loss: 0.5965(0.5965) Grad: nan  LR: 0.00001951  
Epoch: [2][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.6590(0.5819) Grad: 4.5439  LR: 0.00001910  
Epoch: [2][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.6967(0.5883) Grad: 1.4676  LR: 0.00001858  
Epoch: [2][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.5758(0.5910) Grad: 3.6243  LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4737(0.4737) 


Epoch 2 - avg_train_loss: 0.5910  avg_val_loss: 0.5938  time: 42s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5910  avg_val_loss: 0.5938  time: 42s
Epoch 2 - Score: 0.0461
INFO:__main__:Epoch 2 - Score: 0.0461
Epoch 2 - Save Best Score: 0.0461 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.0461 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.9034(0.5938) 
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 42s) Loss: 0.5662(0.5662) Grad: nan  LR: 0.00001809  
Epoch: [3][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.4155(0.5393) Grad: 2.8896  LR: 0.00001739  
Epoch: [3][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.6478(0.5306) Grad: 4.7710  LR: 0.00001658  
Epoch: [3][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.6236(0.5224) Grad: 5.1784  LR: 0.00001590  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3716(0.3716) 


Epoch 3 - avg_train_loss: 0.5224  avg_val_loss: 0.5653  time: 41s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5224  avg_val_loss: 0.5653  time: 41s
Epoch 3 - Score: 0.2829
INFO:__main__:Epoch 3 - Score: 0.2829
Epoch 3 - Save Best Score: 0.2829 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.2829 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1090(0.5653) 
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 42s) Loss: 0.3076(0.3076) Grad: nan  LR: 0.00001589  
Epoch: [4][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.1548(0.3696) Grad: 4.9580  LR: 0.00001495  
Epoch: [4][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.4666(0.3448) Grad: 9.2167  LR: 0.00001394  
Epoch: [4][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.4874(0.3459) Grad: 14.5198  LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.1993(0.1993) 


Epoch 4 - avg_train_loss: 0.3459  avg_val_loss: 0.6493  time: 42s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3459  avg_val_loss: 0.6493  time: 42s
Epoch 4 - Score: 0.2368
INFO:__main__:Epoch 4 - Score: 0.2368


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.6715(0.6493) 
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 37s) Loss: 0.2195(0.2195) Grad: nan  LR: 0.00001311  
Epoch: [5][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.0509(0.0965) Grad: 5.4632  LR: 0.00001203  
Epoch: [5][200/279] Elapsed 0m 26s (remain 0m 10s) Loss: 0.0330(0.0986) Grad: 3.3203  LR: 0.00001092  
Epoch: [5][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0966(0.0936) Grad: 9.2521  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 1.0634(1.0634) 


Epoch 5 - avg_train_loss: 0.0936  avg_val_loss: 0.8980  time: 41s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0936  avg_val_loss: 0.8980  time: 41s
Epoch 5 - Score: 0.5855
INFO:__main__:Epoch 5 - Score: 0.5855
Epoch 5 - Save Best Score: 0.5855 Model
INFO:__main__:Epoch 5 - Save Best Score: 0.5855 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.6154(0.8980) 
Epoch: [6][0/279] Elapsed 0m 0s (remain 1m 40s) Loss: 0.0206(0.0206) Grad: nan  LR: 0.00001003  
Epoch: [6][100/279] Elapsed 0m 14s (remain 0m 24s) Loss: 0.0240(0.0237) Grad: 2.4421  LR: 0.00000891  
Epoch: [6][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0160(0.0193) Grad: 1.8684  LR: 0.00000781  
Epoch: [6][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0040(0.0191) Grad: 0.1864  LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6352(0.6352) 


Epoch 6 - avg_train_loss: 0.0191  avg_val_loss: 1.0573  time: 41s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0191  avg_val_loss: 1.0573  time: 41s
Epoch 6 - Score: 0.3618
INFO:__main__:Epoch 6 - Score: 0.3618


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 2.5664(1.0573) 
Epoch: [7][0/279] Elapsed 0m 0s (remain 1m 42s) Loss: 0.0032(0.0032) Grad: nan  LR: 0.00000695  
Epoch: [7][100/279] Elapsed 0m 13s (remain 0m 23s) Loss: 0.0035(0.0069) Grad: 0.1933  LR: 0.00000590  
Epoch: [7][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0020(0.0051) Grad: 0.0735  LR: 0.00000491  
Epoch: [7][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0015(0.0048) Grad: 0.0620  LR: 0.00000417  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.8483(0.8483) 


Epoch 7 - avg_train_loss: 0.0048  avg_val_loss: 1.1574  time: 41s
INFO:__main__:Epoch 7 - avg_train_loss: 0.0048  avg_val_loss: 1.1574  time: 41s
Epoch 7 - Score: 0.3947
INFO:__main__:Epoch 7 - Score: 0.3947


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 2.6076(1.1574) 
Epoch: [8][0/279] Elapsed 0m 0s (remain 1m 30s) Loss: 0.0036(0.0036) Grad: nan  LR: 0.00000416  
Epoch: [8][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.0025(0.0023) Grad: 0.1546  LR: 0.00000329  
Epoch: [8][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0015(0.0026) Grad: 0.1199  LR: 0.00000250  
Epoch: [8][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0018(0.0032) Grad: 0.0736  LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.8807(0.8807) 


Epoch 8 - avg_train_loss: 0.0032  avg_val_loss: 1.2012  time: 41s
INFO:__main__:Epoch 8 - avg_train_loss: 0.0032  avg_val_loss: 1.2012  time: 41s
Epoch 8 - Score: 0.3947
INFO:__main__:Epoch 8 - Score: 0.3947


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 2.7086(1.2012) 
Epoch: [9][0/279] Elapsed 0m 0s (remain 1m 33s) Loss: 0.0019(0.0019) Grad: nan  LR: 0.00000195  
Epoch: [9][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.0015(0.0028) Grad: 0.0840  LR: 0.00000133  
Epoch: [9][200/279] Elapsed 0m 26s (remain 0m 10s) Loss: 0.0018(0.0030) Grad: 0.0732  LR: 0.00000083  
Epoch: [9][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0012(0.0026) Grad: 0.0790  LR: 0.00000051  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.8544(0.8544) 


Epoch 9 - avg_train_loss: 0.0026  avg_val_loss: 1.2159  time: 41s
INFO:__main__:Epoch 9 - avg_train_loss: 0.0026  avg_val_loss: 1.2159  time: 41s
Epoch 9 - Score: 0.3816
INFO:__main__:Epoch 9 - Score: 0.3816


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 2.7757(1.2159) 
Epoch: [10][0/279] Elapsed 0m 0s (remain 1m 27s) Loss: 0.0011(0.0011) Grad: nan  LR: 0.00000051  
Epoch: [10][100/279] Elapsed 0m 13s (remain 0m 23s) Loss: 0.0012(0.0018) Grad: 0.0516  LR: 0.00000022  
Epoch: [10][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0021(0.0023) Grad: 0.0834  LR: 0.00000005  
Epoch: [10][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0021(0.0025) Grad: 0.1474  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.8532(0.8532) 


Epoch 10 - avg_train_loss: 0.0025  avg_val_loss: 1.2185  time: 41s
INFO:__main__:Epoch 10 - avg_train_loss: 0.0025  avg_val_loss: 1.2185  time: 41s
Epoch 10 - Score: 0.3816
INFO:__main__:Epoch 10 - Score: 0.3816


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 2.7850(1.2185) 


Score: 0.5855
INFO:__main__:Score: 0.5855
ACC BEST Score: 0.6881
INFO:__main__:ACC BEST Score: 0.6881
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.26.1",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main

Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 37s) Loss: 0.9076(0.9076) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.4770(0.6337) Grad: 1.0541  LR: 0.00001994  
Epoch: [1][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.6065(0.6131) Grad: 1.7285  LR: 0.00001975  
Epoch: [1][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.8436(0.6101) Grad: 7.4279  LR: 0.00001951  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.3022(0.3022) 


Epoch 1 - avg_train_loss: 0.6101  avg_val_loss: 0.5888  time: 41s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6101  avg_val_loss: 0.5888  time: 41s
Epoch 1 - Score: 0.0000
INFO:__main__:Epoch 1 - Score: 0.0000
Epoch 1 - Save Best Score: 0.0000 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.0000 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1865(0.5888) 
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 41s) Loss: 0.5918(0.5918) Grad: nan  LR: 0.00001951  
Epoch: [2][100/279] Elapsed 0m 14s (remain 0m 24s) Loss: 0.4386(0.5332) Grad: 4.7818  LR: 0.00001910  
Epoch: [2][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.5129(0.5402) Grad: 2.5437  LR: 0.00001858  
Epoch: [2][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.5607(0.5384) Grad: 4.9772  LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.3112(0.3112) 


Epoch 2 - avg_train_loss: 0.5384  avg_val_loss: 0.5855  time: 41s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5384  avg_val_loss: 0.5855  time: 41s
Epoch 2 - Score: 0.2566
INFO:__main__:Epoch 2 - Score: 0.2566
Epoch 2 - Save Best Score: 0.2566 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.2566 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1095(0.5855) 
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 52s) Loss: 0.2758(0.2758) Grad: nan  LR: 0.00001809  
Epoch: [3][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.2072(0.3213) Grad: 3.9870  LR: 0.00001739  
Epoch: [3][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.1839(0.3178) Grad: 7.9369  LR: 0.00001658  
Epoch: [3][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.3992(0.3162) Grad: 11.7917  LR: 0.00001590  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.5940(0.5940) 


Epoch 3 - avg_train_loss: 0.3162  avg_val_loss: 0.8024  time: 41s
INFO:__main__:Epoch 3 - avg_train_loss: 0.3162  avg_val_loss: 0.8024  time: 41s
Epoch 3 - Score: 0.5592
INFO:__main__:Epoch 3 - Score: 0.5592
Epoch 3 - Save Best Score: 0.5592 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.5592 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8154(0.8024) 
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 38s) Loss: 0.0927(0.0927) Grad: nan  LR: 0.00001589  
Epoch: [4][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.0171(0.0663) Grad: 1.2450  LR: 0.00001495  
Epoch: [4][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0671(0.0598) Grad: 8.1861  LR: 0.00001394  
Epoch: [4][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.0604(0.0682) Grad: 7.8026  LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.9916(0.9916) 


Epoch 4 - avg_train_loss: 0.0682  avg_val_loss: 1.1865  time: 41s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0682  avg_val_loss: 1.1865  time: 41s
Epoch 4 - Score: 0.5197
INFO:__main__:Epoch 4 - Score: 0.5197


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.3205(1.1865) 
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 37s) Loss: 0.0145(0.0145) Grad: nan  LR: 0.00001311  
Epoch: [5][100/279] Elapsed 0m 14s (remain 0m 24s) Loss: 0.0030(0.0136) Grad: 0.1811  LR: 0.00001203  
Epoch: [5][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0160(0.0125) Grad: 2.8428  LR: 0.00001092  
Epoch: [5][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.0020(0.0106) Grad: 0.1931  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.7384(0.7384) 


Epoch 5 - avg_train_loss: 0.0106  avg_val_loss: 1.3595  time: 41s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0106  avg_val_loss: 1.3595  time: 41s
Epoch 5 - Score: 0.3882
INFO:__main__:Epoch 5 - Score: 0.3882


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 2.8483(1.3595) 
Epoch: [6][0/279] Elapsed 0m 0s (remain 1m 35s) Loss: 0.0021(0.0021) Grad: nan  LR: 0.00001003  
Epoch: [6][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.0033(0.0025) Grad: 0.3162  LR: 0.00000891  
Epoch: [6][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0011(0.0023) Grad: 0.0352  LR: 0.00000781  
Epoch: [6][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0012(0.0021) Grad: 0.0625  LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.9201(0.9201) 


Epoch 6 - avg_train_loss: 0.0021  avg_val_loss: 1.4708  time: 41s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0021  avg_val_loss: 1.4708  time: 41s
Epoch 6 - Score: 0.4079
INFO:__main__:Epoch 6 - Score: 0.4079


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 2.8352(1.4708) 
Epoch: [7][0/279] Elapsed 0m 0s (remain 1m 32s) Loss: 0.0010(0.0010) Grad: nan  LR: 0.00000695  
Epoch: [7][100/279] Elapsed 0m 13s (remain 0m 23s) Loss: 0.0009(0.0014) Grad: 0.0397  LR: 0.00000590  
Epoch: [7][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0008(0.0012) Grad: 0.0526  LR: 0.00000491  
Epoch: [7][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0008(0.0012) Grad: 0.0466  LR: 0.00000417  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.9452(0.9452) 


Epoch 7 - avg_train_loss: 0.0012  avg_val_loss: 1.5263  time: 41s
INFO:__main__:Epoch 7 - avg_train_loss: 0.0012  avg_val_loss: 1.5263  time: 41s
Epoch 7 - Score: 0.3947
INFO:__main__:Epoch 7 - Score: 0.3947


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 3.0087(1.5263) 
Epoch: [8][0/279] Elapsed 0m 0s (remain 1m 30s) Loss: 0.0011(0.0011) Grad: nan  LR: 0.00000416  
Epoch: [8][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.0005(0.0009) Grad: 0.0276  LR: 0.00000329  
Epoch: [8][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0008(0.0010) Grad: 0.0386  LR: 0.00000250  
Epoch: [8][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0011(0.0009) Grad: 0.0816  LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.9777(0.9777) 


Epoch 8 - avg_train_loss: 0.0009  avg_val_loss: 1.5572  time: 41s
INFO:__main__:Epoch 8 - avg_train_loss: 0.0009  avg_val_loss: 1.5572  time: 41s
Epoch 8 - Score: 0.4013
INFO:__main__:Epoch 8 - Score: 0.4013


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 3.0628(1.5572) 
Epoch: [9][0/279] Elapsed 0m 0s (remain 1m 29s) Loss: 0.0013(0.0013) Grad: nan  LR: 0.00000195  
Epoch: [9][100/279] Elapsed 0m 13s (remain 0m 23s) Loss: 0.0008(0.0007) Grad: 0.0423  LR: 0.00000133  
Epoch: [9][200/279] Elapsed 0m 26s (remain 0m 10s) Loss: 0.0007(0.0007) Grad: 0.0421  LR: 0.00000083  
Epoch: [9][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0011(0.0008) Grad: 0.0917  LR: 0.00000051  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.9824(0.9824) 


Epoch 9 - avg_train_loss: 0.0008  avg_val_loss: 1.5686  time: 41s
INFO:__main__:Epoch 9 - avg_train_loss: 0.0008  avg_val_loss: 1.5686  time: 41s
Epoch 9 - Score: 0.4013
INFO:__main__:Epoch 9 - Score: 0.4013


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 3.0993(1.5686) 
Epoch: [10][0/279] Elapsed 0m 0s (remain 1m 26s) Loss: 0.0009(0.0009) Grad: nan  LR: 0.00000051  
Epoch: [10][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.0010(0.0007) Grad: 0.0726  LR: 0.00000022  
Epoch: [10][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0008(0.0008) Grad: 0.0376  LR: 0.00000005  
Epoch: [10][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0004(0.0008) Grad: 0.0319  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.9849(0.9849) 


Epoch 10 - avg_train_loss: 0.0008  avg_val_loss: 1.5704  time: 41s
INFO:__main__:Epoch 10 - avg_train_loss: 0.0008  avg_val_loss: 1.5704  time: 41s
Epoch 10 - Score: 0.4013
INFO:__main__:Epoch 10 - Score: 0.4013


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 3.1015(1.5704) 


Score: 0.5592
INFO:__main__:Score: 0.5592
ACC BEST Score: 0.6579
INFO:__main__:ACC BEST Score: 0.6579
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.26.1",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main

Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 51s) Loss: 0.6728(0.6728) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 13s (remain 0m 23s) Loss: 0.6460(0.6230) Grad: 3.5298  LR: 0.00001994  
Epoch: [1][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.5618(0.6147) Grad: 1.6153  LR: 0.00001975  
Epoch: [1][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.6465(0.6059) Grad: 1.8756  LR: 0.00001951  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.5189(0.5189) 


Epoch 1 - avg_train_loss: 0.6059  avg_val_loss: 0.5858  time: 41s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6059  avg_val_loss: 0.5858  time: 41s
Epoch 1 - Score: 0.2171
INFO:__main__:Epoch 1 - Score: 0.2171
Epoch 1 - Save Best Score: 0.2171 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.2171 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8075(0.5858) 
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 40s) Loss: 0.6038(0.6038) Grad: nan  LR: 0.00001951  
Epoch: [2][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.5085(0.5423) Grad: 3.5463  LR: 0.00001910  
Epoch: [2][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.5255(0.5258) Grad: 4.2949  LR: 0.00001858  
Epoch: [2][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.4670(0.5216) Grad: 2.7547  LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.3208(0.3208) 


Epoch 2 - avg_train_loss: 0.5216  avg_val_loss: 0.5492  time: 41s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5216  avg_val_loss: 0.5492  time: 41s
Epoch 2 - Score: 0.2434
INFO:__main__:Epoch 2 - Score: 0.2434
Epoch 2 - Save Best Score: 0.2434 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.2434 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1842(0.5492) 
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 54s) Loss: 0.2876(0.2876) Grad: nan  LR: 0.00001809  
Epoch: [3][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.1120(0.2895) Grad: 5.0716  LR: 0.00001739  
Epoch: [3][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.1178(0.2704) Grad: 5.5336  LR: 0.00001658  
Epoch: [3][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.3903(0.2688) Grad: 10.1501  LR: 0.00001590  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6426(0.6426) 


Epoch 3 - avg_train_loss: 0.2688  avg_val_loss: 0.6511  time: 42s
INFO:__main__:Epoch 3 - avg_train_loss: 0.2688  avg_val_loss: 0.6511  time: 42s
Epoch 3 - Score: 0.5526
INFO:__main__:Epoch 3 - Score: 0.5526
Epoch 3 - Save Best Score: 0.5526 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.5526 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.9270(0.6511) 
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 35s) Loss: 0.1123(0.1123) Grad: nan  LR: 0.00001589  
Epoch: [4][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.0266(0.0496) Grad: 1.9993  LR: 0.00001495  
Epoch: [4][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0314(0.0417) Grad: 3.8976  LR: 0.00001394  
Epoch: [4][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0023(0.0387) Grad: 0.1388  LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4673(0.4673) 


Epoch 4 - avg_train_loss: 0.0387  avg_val_loss: 1.0973  time: 41s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0387  avg_val_loss: 1.0973  time: 41s
Epoch 4 - Score: 0.2895
INFO:__main__:Epoch 4 - Score: 0.2895


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 3.0592(1.0973) 
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 40s) Loss: 0.0717(0.0717) Grad: nan  LR: 0.00001311  
Epoch: [5][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.0041(0.0092) Grad: 0.3067  LR: 0.00001203  
Epoch: [5][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0023(0.0068) Grad: 0.1548  LR: 0.00001092  
Epoch: [5][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0009(0.0057) Grad: 0.0682  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4989(0.4989) 


Epoch 5 - avg_train_loss: 0.0057  avg_val_loss: 1.1713  time: 41s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0057  avg_val_loss: 1.1713  time: 41s
Epoch 5 - Score: 0.3487
INFO:__main__:Epoch 5 - Score: 0.3487


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 3.4769(1.1713) 
Epoch: [6][0/279] Elapsed 0m 0s (remain 1m 34s) Loss: 0.0005(0.0005) Grad: nan  LR: 0.00001003  
Epoch: [6][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.0013(0.0017) Grad: 0.0804  LR: 0.00000891  
Epoch: [6][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0016(0.0016) Grad: 0.3762  LR: 0.00000781  
Epoch: [6][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.0007(0.0015) Grad: 0.0357  LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6091(0.6091) 


Epoch 6 - avg_train_loss: 0.0015  avg_val_loss: 1.2223  time: 41s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0015  avg_val_loss: 1.2223  time: 41s
Epoch 6 - Score: 0.4211
INFO:__main__:Epoch 6 - Score: 0.4211


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 3.5530(1.2223) 
Epoch: [7][0/279] Elapsed 0m 0s (remain 1m 30s) Loss: 0.0005(0.0005) Grad: nan  LR: 0.00000695  
Epoch: [7][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.0005(0.0008) Grad: 0.0285  LR: 0.00000590  
Epoch: [7][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0011(0.0008) Grad: 0.1345  LR: 0.00000491  
Epoch: [7][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0006(0.0008) Grad: 0.0513  LR: 0.00000417  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5773(0.5773) 


Epoch 7 - avg_train_loss: 0.0008  avg_val_loss: 1.2688  time: 41s
INFO:__main__:Epoch 7 - avg_train_loss: 0.0008  avg_val_loss: 1.2688  time: 41s
Epoch 7 - Score: 0.4079
INFO:__main__:Epoch 7 - Score: 0.4079


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 3.7826(1.2688) 
Epoch: [8][0/279] Elapsed 0m 0s (remain 1m 30s) Loss: 0.0006(0.0006) Grad: nan  LR: 0.00000416  
Epoch: [8][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.0007(0.0006) Grad: 0.0561  LR: 0.00000329  
Epoch: [8][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0007(0.0007) Grad: 0.0485  LR: 0.00000250  
Epoch: [8][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0008(0.0006) Grad: 0.0657  LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.5778(0.5778) 


Epoch 8 - avg_train_loss: 0.0006  avg_val_loss: 1.2922  time: 41s
INFO:__main__:Epoch 8 - avg_train_loss: 0.0006  avg_val_loss: 1.2922  time: 41s
Epoch 8 - Score: 0.4013
INFO:__main__:Epoch 8 - Score: 0.4013


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 3.8708(1.2922) 
Epoch: [9][0/279] Elapsed 0m 0s (remain 1m 30s) Loss: 0.0004(0.0004) Grad: nan  LR: 0.00000195  
Epoch: [9][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.0004(0.0005) Grad: 0.0282  LR: 0.00000133  
Epoch: [9][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0004(0.0005) Grad: 0.0278  LR: 0.00000083  
Epoch: [9][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0002(0.0006) Grad: 0.0285  LR: 0.00000051  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5784(0.5784) 


Epoch 9 - avg_train_loss: 0.0006  avg_val_loss: 1.3017  time: 41s
INFO:__main__:Epoch 9 - avg_train_loss: 0.0006  avg_val_loss: 1.3017  time: 41s
Epoch 9 - Score: 0.4013
INFO:__main__:Epoch 9 - Score: 0.4013


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 3.9055(1.3017) 
Epoch: [10][0/279] Elapsed 0m 0s (remain 1m 33s) Loss: 0.0002(0.0002) Grad: nan  LR: 0.00000051  
Epoch: [10][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.0005(0.0005) Grad: 0.0380  LR: 0.00000022  
Epoch: [10][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0002(0.0006) Grad: 0.0138  LR: 0.00000005  
Epoch: [10][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0005(0.0005) Grad: 0.0371  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5786(0.5786) 


Epoch 10 - avg_train_loss: 0.0005  avg_val_loss: 1.3030  time: 41s
INFO:__main__:Epoch 10 - avg_train_loss: 0.0005  avg_val_loss: 1.3030  time: 41s
Epoch 10 - Score: 0.4013
INFO:__main__:Epoch 10 - Score: 0.4013


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 3.9102(1.3030) 


Score: 0.5526
INFO:__main__:Score: 0.5526
ACC BEST Score: 0.7022
INFO:__main__:ACC BEST Score: 0.7022
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.26.1",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main

Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 39s) Loss: 0.7892(0.7892) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.7270(0.6116) Grad: 5.5718  LR: 0.00001994  
Epoch: [1][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.6701(0.6066) Grad: 2.1004  LR: 0.00001975  
Epoch: [1][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.7264(0.6028) Grad: 4.7933  LR: 0.00001951  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.5881(0.5881) 


Epoch 1 - avg_train_loss: 0.6028  avg_val_loss: 0.6464  time: 41s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6028  avg_val_loss: 0.6464  time: 41s
Epoch 1 - Score: 0.2895
INFO:__main__:Epoch 1 - Score: 0.2895
Epoch 1 - Save Best Score: 0.2895 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.2895 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7552(0.6464) 
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 35s) Loss: 0.6107(0.6107) Grad: nan  LR: 0.00001951  
Epoch: [2][100/279] Elapsed 0m 14s (remain 0m 24s) Loss: 0.6598(0.5545) Grad: 2.2767  LR: 0.00001910  
Epoch: [2][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.6875(0.5520) Grad: 6.4094  LR: 0.00001858  
Epoch: [2][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.3962(0.5492) Grad: 2.8344  LR: 0.00001810  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.1543(0.1543) 


Epoch 2 - avg_train_loss: 0.5492  avg_val_loss: 0.6454  time: 41s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5492  avg_val_loss: 0.6454  time: 41s
Epoch 2 - Score: 0.0263
INFO:__main__:Epoch 2 - Score: 0.0263


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.6310(0.6454) 
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 38s) Loss: 0.4968(0.4968) Grad: nan  LR: 0.00001809  
Epoch: [3][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.2525(0.3317) Grad: 6.1642  LR: 0.00001739  
Epoch: [3][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.4414(0.3449) Grad: 13.3500  LR: 0.00001658  
Epoch: [3][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.3927(0.3386) Grad: 15.3925  LR: 0.00001590  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.6137(0.6137) 


Epoch 3 - avg_train_loss: 0.3386  avg_val_loss: 0.7992  time: 41s
INFO:__main__:Epoch 3 - avg_train_loss: 0.3386  avg_val_loss: 0.7992  time: 41s
Epoch 3 - Score: 0.4671
INFO:__main__:Epoch 3 - Score: 0.4671
Epoch 3 - Save Best Score: 0.4671 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.4671 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.3186(0.7992) 
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 34s) Loss: 0.1647(0.1647) Grad: nan  LR: 0.00001589  
Epoch: [4][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.0779(0.0863) Grad: 7.0064  LR: 0.00001495  
Epoch: [4][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0123(0.0772) Grad: 0.5643  LR: 0.00001394  
Epoch: [4][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.0340(0.0811) Grad: 3.8504  LR: 0.00001312  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.3326(0.3326) 


Epoch 4 - avg_train_loss: 0.0811  avg_val_loss: 1.1191  time: 42s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0811  avg_val_loss: 1.1191  time: 42s
Epoch 4 - Score: 0.2237
INFO:__main__:Epoch 4 - Score: 0.2237


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 3.3122(1.1191) 
Epoch: [5][0/279] Elapsed 0m 0s (remain 2m 3s) Loss: 0.0174(0.0174) Grad: nan  LR: 0.00001311  
Epoch: [5][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.0038(0.0263) Grad: 0.1838  LR: 0.00001203  
Epoch: [5][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0035(0.0209) Grad: 0.1670  LR: 0.00001092  
Epoch: [5][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0021(0.0201) Grad: 0.0810  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.4061(0.4061) 


Epoch 5 - avg_train_loss: 0.0201  avg_val_loss: 1.3126  time: 41s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0201  avg_val_loss: 1.3126  time: 41s
Epoch 5 - Score: 0.2763
INFO:__main__:Epoch 5 - Score: 0.2763


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 3.0642(1.3126) 
Epoch: [6][0/279] Elapsed 0m 0s (remain 1m 33s) Loss: 0.0067(0.0067) Grad: nan  LR: 0.00001003  
Epoch: [6][100/279] Elapsed 0m 13s (remain 0m 23s) Loss: 0.0029(0.0068) Grad: 0.1440  LR: 0.00000891  
Epoch: [6][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0016(0.0047) Grad: 0.0673  LR: 0.00000781  
Epoch: [6][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0066(0.0054) Grad: 1.1690  LR: 0.00000696  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.6179(0.6179) 


Epoch 6 - avg_train_loss: 0.0054  avg_val_loss: 1.4449  time: 41s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0054  avg_val_loss: 1.4449  time: 41s
Epoch 6 - Score: 0.3618
INFO:__main__:Epoch 6 - Score: 0.3618


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 2.9074(1.4449) 
Epoch: [7][0/279] Elapsed 0m 0s (remain 1m 32s) Loss: 0.0028(0.0028) Grad: nan  LR: 0.00000695  
Epoch: [7][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.0016(0.0019) Grad: 0.0712  LR: 0.00000590  
Epoch: [7][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0013(0.0022) Grad: 0.0648  LR: 0.00000491  
Epoch: [7][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0015(0.0029) Grad: 0.0810  LR: 0.00000417  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.5155(0.5155) 


Epoch 7 - avg_train_loss: 0.0029  avg_val_loss: 1.5043  time: 41s
INFO:__main__:Epoch 7 - avg_train_loss: 0.0029  avg_val_loss: 1.5043  time: 41s
Epoch 7 - Score: 0.3026
INFO:__main__:Epoch 7 - Score: 0.3026


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 3.4877(1.5043) 
Epoch: [8][0/279] Elapsed 0m 0s (remain 1m 30s) Loss: 0.0016(0.0016) Grad: nan  LR: 0.00000416  
Epoch: [8][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.0015(0.0016) Grad: 0.0841  LR: 0.00000329  
Epoch: [8][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0018(0.0025) Grad: 0.1084  LR: 0.00000250  
Epoch: [8][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0015(0.0021) Grad: 0.1108  LR: 0.00000195  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.5245(0.5245) 


Epoch 8 - avg_train_loss: 0.0021  avg_val_loss: 1.5398  time: 41s
INFO:__main__:Epoch 8 - avg_train_loss: 0.0021  avg_val_loss: 1.5398  time: 41s
Epoch 8 - Score: 0.2961
INFO:__main__:Epoch 8 - Score: 0.2961


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 3.5868(1.5398) 
Epoch: [9][0/279] Elapsed 0m 0s (remain 1m 29s) Loss: 0.0011(0.0011) Grad: nan  LR: 0.00000195  
Epoch: [9][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.0009(0.0031) Grad: 0.0381  LR: 0.00000133  
Epoch: [9][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0012(0.0022) Grad: 0.0773  LR: 0.00000083  
Epoch: [9][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0010(0.0019) Grad: 0.0468  LR: 0.00000051  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.5307(0.5307) 


Epoch 9 - avg_train_loss: 0.0019  avg_val_loss: 1.5534  time: 41s
INFO:__main__:Epoch 9 - avg_train_loss: 0.0019  avg_val_loss: 1.5534  time: 41s
Epoch 9 - Score: 0.3026
INFO:__main__:Epoch 9 - Score: 0.3026


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 3.6163(1.5534) 
Epoch: [10][0/279] Elapsed 0m 0s (remain 1m 33s) Loss: 0.0007(0.0007) Grad: nan  LR: 0.00000051  
Epoch: [10][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.0012(0.0012) Grad: 0.0589  LR: 0.00000022  
Epoch: [10][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0013(0.0013) Grad: 0.0980  LR: 0.00000005  
Epoch: [10][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.0009(0.0018) Grad: 0.0421  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.5310(0.5310) 


Epoch 10 - avg_train_loss: 0.0018  avg_val_loss: 1.5556  time: 41s
INFO:__main__:Epoch 10 - avg_train_loss: 0.0018  avg_val_loss: 1.5556  time: 41s
Epoch 10 - Score: 0.3026
INFO:__main__:Epoch 10 - Score: 0.3026


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 3.6230(1.5556) 


Score: 0.4671
INFO:__main__:Score: 0.4671
ACC BEST Score: 0.6459
INFO:__main__:ACC BEST Score: 0.6459
Score: 0.4944
INFO:__main__:Score: 0.4944
ACC BEST Score: 0.6807
INFO:__main__:ACC BEST Score: 0.6807


In [None]:
from google.colab import runtime
runtime.unassign()