In [1]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers
!pip install datasets
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!nvidia-smi

Mon Apr  3 14:07:52 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    48W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:

import os
import gc
import math
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import re
import html

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam, SGD, AdamW, RAdam
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,GroupKFold
from sklearn.metrics import log_loss,f1_score, recall_score, accuracy_score, precision_score

from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:

# ====================================================
# CFG
# ====================================================
class CFG:
    debug=False
    apex=True
    print_freq=100
    num_workers=4
    # model="microsoft/deberta-v3-base"
    # model='microsoft/deberta-base'
    # model='roberta-base'
    # model='roberta-large'
    # model='roberta-large-mnli'
    # model='xlnet-large-cased'
    # model='albert-xxlarge-v2'
    # model="microsoft/deberta-large"
    model="microsoft/deberta-v3-large"
    # model='microsoft/deberta-v2-xlarge'
    # model='funnel-transformer/large'
    # model='funnel-transformer/medium'
    # model='albert-base-v2'
    # model='albert-large-v2'
    # model='google/electra-large-discriminator'
    # model='google/electra-base-discriminator'
    # model="facebook/bart-large-mnli"
    # model="facebook/bart-large"
    # model="facebook/bart-base"
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=4
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=16
    fc_dropout=0.2
    target_size=1
    max_len=256
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    seed=42
    n_fold=10
    trn_fold=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    train=True
    nth_awp_start_epoch=1
    gradient_checkpointing = True
    freezing = True

if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0, 1]

In [6]:
DIR = "/content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測"
INPUT_DIR = os.path.join(DIR,"input")
OUTPUT_DIR = os.path.join(DIR,"output")
OUTPUT_EXP_DIR = DIR + '/output/EXP016/'
if not os.path.exists(OUTPUT_EXP_DIR):
    os.makedirs(OUTPUT_EXP_DIR)

In [7]:
def get_score(labels, outputs):
    thresh = 0.5
    y_pred = outputs
    y_true = labels
    f_score = f1_score(y_true, (y_pred>thresh).astype(int))
    r_score = recall_score(y_true, (y_pred>thresh).astype(int))
    p_score = precision_score(y_true, (y_pred>thresh).astype(int))
    print(f"f1 score : {f_score}")
    print(f"recall score : {r_score}")
    print(f"precision score : {p_score}")
    return accuracy_score(y_true, (y_pred>thresh).astype(int))

def get_acc_score(labels, outputs):
    y_pred = outputs
    y_true = labels
    best_score = 0
    best_thresh = 0.5
    for thresh in np.arange(0.1, 0.80, 0.01):
        thresh = np.round(thresh, 2)
        score = accuracy_score(y_true, (y_pred>thresh).astype(int))
        #print("Accuracy score at threshold {0} is {1}".format(thresh, score))
        if score > best_score:
          best_score = score
          best_thresh = thresh
    return accuracy_score(y_true, (y_pred>best_thresh).astype(int))


def get_logger(filename=OUTPUT_EXP_DIR+'train'):
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=CFG.seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=CFG.seed)

In [8]:
def freeze(module):
    """
    Freezes module's parameters.
    """
    
    for parameter in module.parameters():
        parameter.requires_grad = False
        
def get_freezed_parameters(module):
    """
    Returns names of freezed parameters of the given module.
    """
    
    freezed_parameters = []
    for name, parameter in module.named_parameters():
        if not parameter.requires_grad:
            freezed_parameters.append(name)
            
    return freezed_parameters

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """
    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930
    """
    
    embedding_types = ("word", "position", "token_type")
    for embedding_type in embedding_types:
        attr_name = f"{embedding_type}_embeddings"
        
        if hasattr(embeddings_path, attr_name): 
            bnb.optim.GlobalOptimManager.get_instance().register_module_override(
                getattr(embeddings_path, attr_name), 'weight', {'optim_bits': optim_bits}
            )

In [9]:
import pandas as pd
import numpy as np

train = pd.read_csv(os.path.join(INPUT_DIR,"train_data.csv"))
test = pd.read_csv(os.path.join(INPUT_DIR,"test_data.csv"))
sample_sub = pd.read_csv(os.path.join(INPUT_DIR,"submission.csv"))

print(train.shape)
display(train.head(3))

print(test.shape)
display(test.head(3))

print(sample_sub.shape)
display(sample_sub.head(3))

(4974, 6)


Unnamed: 0,id,title,year,abstract,keywords,y
0,1,Hierarchical Adversarially Learned Inference,2018,We propose a novel hierarchical generative mod...,"generative, hierarchical, unsupervised, semisu...",0
1,2,Learning to Compute Word Embeddings On the Fly,2018,Words in natural language follow a Zipfian dis...,"NLU, word embeddings, representation learning",0
2,3,Graph2Seq: Scalable Learning Dynamics for Graphs,2018,Neural networks are increasingly used as a gen...,,0


(6393, 5)


Unnamed: 0,id,title,year,abstract,keywords
0,1,StyleAlign: Analysis and Applications of Align...,2022,"In this paper, we perform an in-depth study of...","StyleGAN, transfer learning, fine tuning, mode..."
1,2,Embedding a random graph via GNN: mean-field i...,2021,We develop a theory for embedding a random gra...,"Graph neural network, graph embedding, multi-r..."
2,3,BBRefinement: an universal scheme to improve p...,2021,We present a conceptually simple yet powerful ...,"object detection, deep neural networks, refine..."


(6393, 2)


Unnamed: 0,id,y
0,1,0
1,2,0
2,3,0


In [10]:
train["texts"] = train["title"].str.lower() + "[SEP]" + train["abstract"].str.lower()  

In [11]:
skf = StratifiedKFold(n_splits=CFG.n_fold,shuffle=True,random_state=CFG.seed)
for fold, ( _, val_) in enumerate(skf.split(train, train.y)):
    train.loc[val_ , "kfold"] = int(fold)
    
train["kfold"] = train["kfold"].astype(int)

if CFG.debug:
    display(train.groupby('kfold').size())
    train = train.sample(n=500, random_state=0).reset_index(drop=True)
    display(train.groupby('kfold').size())

In [12]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_EXP_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
# ====================================================
# Define max_len
# ====================================================
lengths = []
tk0 = tqdm(train['texts'].fillna("").values, total=len(train))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
    lengths.append(length)
CFG.max_len = max(lengths) + 3 # cls
LOGGER.info(f"max_len: {CFG.max_len}")

100%|██████████| 4974/4974 [00:03<00:00, 1389.17it/s]
max_len: 536
INFO:__main__:max_len: 536


In [14]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer(text,
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           padding="max_length",
                           return_offsets_mapping=False,
                           truncation=True)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.inputs = df['texts'].values
        self.labels = df['y'].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.inputs[item])
        label = torch.tensor(self.labels[item], dtype=torch.half)
        return inputs, label

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

class ValidDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.inputs = df['texts'].values
        self.labels = df['y'].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.inputs[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

#collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [15]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

class MaxPooling(nn.Module):
    def __init__(self):
        super(MaxPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        embeddings = last_hidden_state.clone()
        embeddings[input_mask_expanded == 0] = -1e4
        max_embeddings, _ = torch.max(embeddings, dim=1)
        return max_embeddings
    

class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # Freezing
        if cfg.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            cfg.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())

        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)
        self._init_weights(self.fc)
        self.layer_norm1 = nn.LayerNorm(self.config.hidden_size)
        self.sig = nn.Sigmoid()
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        feature = self.layer_norm1(feature)
        output = self.fc(feature)
        #output = self.sig(output)
        return output

In [16]:

# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
        #print(y_preds.sigmoid().squeeze().view(1, -1))
        loss = criterion(y_preds.sigmoid().squeeze(), labels.squeeze())
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))

    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
        loss = criterion(y_preds.sigmoid().squeeze(), labels.squeeze())
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    predictions = np.concatenate(predictions)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

In [17]:

# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['kfold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['kfold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds['y'].values
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = ValidDataset(CFG, valid_folds)


    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size*2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_EXP_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.BCELoss()
    
    best_score = -1.

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        
        # scoring
        score = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')

        
        if best_score < score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_EXP_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")

    predictions = torch.load(OUTPUT_EXP_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds['pred'] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [18]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        labels = oof_df['y'].values
        preds = oof_df['pred'].values
        score = get_score(labels, preds)
        acc_score = get_acc_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}')
        LOGGER.info(f'ACC BEST Score: {acc_score:<.4f}')
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
            #break
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_EXP_DIR+'oof_df.pkl')

DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.27.4",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__:DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
 

Epoch: [1][0/279] Elapsed 0m 2s (remain 9m 34s) Loss: 0.6909(0.6909) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 38s (remain 1m 7s) Loss: 0.6479(0.6375) Grad: 4.0682  LR: 0.00001960  
Epoch: [1][200/279] Elapsed 1m 14s (remain 0m 28s) Loss: 0.4978(0.6333) Grad: 2.5521  LR: 0.00001845  
Epoch: [1][278/279] Elapsed 1m 43s (remain 0m 0s) Loss: 0.7134(0.6275) Grad: 4.9415  LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 11s) Loss: 0.2751(0.2751) 


Epoch 1 - avg_train_loss: 0.6275  avg_val_loss: 0.6253  time: 113s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6275  avg_val_loss: 0.6253  time: 113s
Epoch 1 - Score: 0.6948
INFO:__main__:Epoch 1 - Score: 0.6948
Epoch 1 - Save Best Score: 0.6948 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6948 Model


EVAL: [15/16] Elapsed 0m 10s (remain 0m 0s) Loss: 1.4223(0.6253) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
Epoch: [2][0/279] Elapsed 0m 0s (remain 2m 52s) Loss: 0.7041(0.7041) Grad: nan  LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 37s (remain 1m 6s) Loss: 0.4448(0.6147) Grad: 5.1957  LR: 0.00001483  
Epoch: [2][200/279] Elapsed 1m 14s (remain 0m 28s) Loss: 0.5654(0.6103) Grad: 1.2453  LR: 0.00001221  
Epoch: [2][278/279] Elapsed 1m 41s (remain 0m 0s) Loss: 0.6899(0.6182) Grad: 2.0086  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 11s) Loss: 0.4706(0.4706) 


Epoch 2 - avg_train_loss: 0.6182  avg_val_loss: 0.6250  time: 112s
INFO:__main__:Epoch 2 - avg_train_loss: 0.6182  avg_val_loss: 0.6250  time: 112s
Epoch 2 - Score: 0.6948
INFO:__main__:Epoch 2 - Score: 0.6948


EVAL: [15/16] Elapsed 0m 10s (remain 0m 0s) Loss: 0.9748(0.6250) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
Epoch: [3][0/279] Elapsed 0m 0s (remain 2m 59s) Loss: 0.5381(0.5381) Grad: nan  LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 37s (remain 1m 5s) Loss: 0.5972(0.6121) Grad: 3.7888  LR: 0.00000724  
Epoch: [3][200/279] Elapsed 1m 13s (remain 0m 28s) Loss: 0.6392(0.6028) Grad: 5.4380  LR: 0.00000469  
Epoch: [3][278/279] Elapsed 1m 40s (remain 0m 0s) Loss: 0.5337(0.6006) Grad: 6.6965  LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 11s) Loss: 0.1925(0.1925) 


Epoch 3 - avg_train_loss: 0.6006  avg_val_loss: 0.6336  time: 111s
INFO:__main__:Epoch 3 - avg_train_loss: 0.6006  avg_val_loss: 0.6336  time: 111s
Epoch 3 - Score: 0.6948
INFO:__main__:Epoch 3 - Score: 0.6948


EVAL: [15/16] Elapsed 0m 10s (remain 0m 0s) Loss: 1.4981(0.6336) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
Epoch: [4][0/279] Elapsed 0m 0s (remain 2m 25s) Loss: 0.4075(0.4075) Grad: nan  LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 36s (remain 1m 3s) Loss: 0.5225(0.5706) Grad: 3.5685  LR: 0.00000126  
Epoch: [4][200/279] Elapsed 1m 12s (remain 0m 28s) Loss: 0.5010(0.5465) Grad: 6.5095  LR: 0.00000026  
Epoch: [4][278/279] Elapsed 1m 41s (remain 0m 0s) Loss: 0.6533(0.5542) Grad: 3.9976  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 12s) Loss: 0.3359(0.3359) 


Epoch 4 - avg_train_loss: 0.5542  avg_val_loss: 0.5906  time: 111s
INFO:__main__:Epoch 4 - avg_train_loss: 0.5542  avg_val_loss: 0.5906  time: 111s
Epoch 4 - Score: 0.6807
INFO:__main__:Epoch 4 - Score: 0.6807


EVAL: [15/16] Elapsed 0m 10s (remain 0m 0s) Loss: 0.9817(0.5906) 
f1 score : 0.3614457831325301
recall score : 0.29605263157894735
precision score : 0.4639175257731959


Score: 0.6948
INFO:__main__:Score: 0.6948
ACC BEST Score: 0.6948
INFO:__main__:ACC BEST Score: 0.6948


f1 score : 0.0
recall score : 0.0
precision score : 0.0


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.27.4",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__:DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
 

Epoch: [1][0/279] Elapsed 0m 0s (remain 4m 33s) Loss: 0.7773(0.7773) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 36s (remain 1m 3s) Loss: 0.5566(0.6372) Grad: 1.4301  LR: 0.00001960  
Epoch: [1][200/279] Elapsed 1m 11s (remain 0m 27s) Loss: 0.4746(0.6214) Grad: 3.2017  LR: 0.00001845  
Epoch: [1][278/279] Elapsed 1m 40s (remain 0m 0s) Loss: 0.8223(0.6171) Grad: 11.4466  LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 13s) Loss: 0.2819(0.2819) 


Epoch 1 - avg_train_loss: 0.6171  avg_val_loss: 0.5886  time: 111s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6171  avg_val_loss: 0.5886  time: 111s
Epoch 1 - Score: 0.6928
INFO:__main__:Epoch 1 - Score: 0.6928
Epoch 1 - Save Best Score: 0.6928 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6928 Model


EVAL: [15/16] Elapsed 0m 10s (remain 0m 0s) Loss: 1.2740(0.5886) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
Epoch: [2][0/279] Elapsed 0m 0s (remain 2m 58s) Loss: 0.5439(0.5439) Grad: nan  LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 37s (remain 1m 5s) Loss: 0.3525(0.4954) Grad: 4.0780  LR: 0.00001483  
Epoch: [2][200/279] Elapsed 1m 13s (remain 0m 28s) Loss: 0.4160(0.4955) Grad: 6.4832  LR: 0.00001221  
Epoch: [2][278/279] Elapsed 1m 41s (remain 0m 0s) Loss: 0.4832(0.4909) Grad: 4.2352  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 13s) Loss: 0.5080(0.5080) 


Epoch 2 - avg_train_loss: 0.4909  avg_val_loss: 0.6103  time: 112s
INFO:__main__:Epoch 2 - avg_train_loss: 0.4909  avg_val_loss: 0.6103  time: 112s
Epoch 2 - Score: 0.6667
INFO:__main__:Epoch 2 - Score: 0.6667


EVAL: [15/16] Elapsed 0m 10s (remain 0m 0s) Loss: 0.8797(0.6103) 
f1 score : 0.5257142857142857
recall score : 0.6013071895424836
precision score : 0.467005076142132
Epoch: [3][0/279] Elapsed 0m 0s (remain 2m 42s) Loss: 0.2042(0.2042) Grad: nan  LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 36s (remain 1m 5s) Loss: 0.0313(0.1415) Grad: 2.0848  LR: 0.00000724  
Epoch: [3][200/279] Elapsed 1m 13s (remain 0m 28s) Loss: 0.0082(0.1114) Grad: 0.3532  LR: 0.00000469  
Epoch: [3][278/279] Elapsed 1m 41s (remain 0m 0s) Loss: 0.0732(0.0982) Grad: 10.7179  LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 13s) Loss: 0.6924(0.6924) 


Epoch 3 - avg_train_loss: 0.0982  avg_val_loss: 0.9031  time: 112s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0982  avg_val_loss: 0.9031  time: 112s
Epoch 3 - Score: 0.6747
INFO:__main__:Epoch 3 - Score: 0.6747


EVAL: [15/16] Elapsed 0m 10s (remain 0m 0s) Loss: 2.5001(0.9031) 
f1 score : 0.45637583892617445
recall score : 0.4444444444444444
precision score : 0.4689655172413793
Epoch: [4][0/279] Elapsed 0m 0s (remain 3m 41s) Loss: 0.0059(0.0059) Grad: nan  LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 36s (remain 1m 5s) Loss: 0.0028(0.0166) Grad: 0.0972  LR: 0.00000126  
Epoch: [4][200/279] Elapsed 1m 13s (remain 0m 28s) Loss: 0.0042(0.0117) Grad: 0.3198  LR: 0.00000026  
Epoch: [4][278/279] Elapsed 1m 41s (remain 0m 0s) Loss: 0.0038(0.0110) Grad: 0.2094  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 13s) Loss: 0.5414(0.5414) 


Epoch 4 - avg_train_loss: 0.0110  avg_val_loss: 0.9587  time: 112s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0110  avg_val_loss: 0.9587  time: 112s
Epoch 4 - Score: 0.7028
INFO:__main__:Epoch 4 - Score: 0.7028
Epoch 4 - Save Best Score: 0.7028 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7028 Model


EVAL: [15/16] Elapsed 0m 10s (remain 0m 0s) Loss: 3.1421(0.9587) 
f1 score : 0.4558823529411765
recall score : 0.40522875816993464
precision score : 0.5210084033613446


Score: 0.7028
INFO:__main__:Score: 0.7028
ACC BEST Score: 0.7349
INFO:__main__:ACC BEST Score: 0.7349


f1 score : 0.4558823529411765
recall score : 0.40522875816993464
precision score : 0.5210084033613446


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.27.4",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__:DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
 

Epoch: [1][0/279] Elapsed 0m 0s (remain 2m 55s) Loss: 0.9893(0.9893) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 36s (remain 1m 4s) Loss: 0.3767(0.6128) Grad: 5.3982  LR: 0.00001960  
Epoch: [1][200/279] Elapsed 1m 13s (remain 0m 28s) Loss: 0.6367(0.6179) Grad: 1.9986  LR: 0.00001845  
Epoch: [1][278/279] Elapsed 1m 41s (remain 0m 0s) Loss: 0.7031(0.6107) Grad: 1.9548  LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 13s) Loss: 0.3064(0.3064) 


Epoch 1 - avg_train_loss: 0.6107  avg_val_loss: 0.5569  time: 112s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6107  avg_val_loss: 0.5569  time: 112s
Epoch 1 - Score: 0.7048
INFO:__main__:Epoch 1 - Score: 0.7048
Epoch 1 - Save Best Score: 0.7048 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7048 Model


EVAL: [15/16] Elapsed 0m 10s (remain 0m 0s) Loss: 1.1122(0.5569) 
f1 score : 0.07547169811320754
recall score : 0.0392156862745098
precision score : 1.0
Epoch: [2][0/279] Elapsed 0m 0s (remain 3m 0s) Loss: 0.4155(0.4155) Grad: nan  LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 36s (remain 1m 5s) Loss: 0.4138(0.4745) Grad: 3.8345  LR: 0.00001483  
Epoch: [2][200/279] Elapsed 1m 13s (remain 0m 28s) Loss: 0.4087(0.4685) Grad: 7.8568  LR: 0.00001221  
Epoch: [2][278/279] Elapsed 1m 41s (remain 0m 0s) Loss: 0.3992(0.4454) Grad: 7.4687  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 13s) Loss: 0.2891(0.2891) 


Epoch 2 - avg_train_loss: 0.4454  avg_val_loss: 0.6200  time: 112s
INFO:__main__:Epoch 2 - avg_train_loss: 0.4454  avg_val_loss: 0.6200  time: 112s
Epoch 2 - Score: 0.6867
INFO:__main__:Epoch 2 - Score: 0.6867


EVAL: [15/16] Elapsed 0m 10s (remain 0m 0s) Loss: 1.3264(0.6200) 
f1 score : 0.3709677419354839
recall score : 0.3006535947712418
precision score : 0.4842105263157895
Epoch: [3][0/279] Elapsed 0m 0s (remain 2m 37s) Loss: 0.0817(0.0817) Grad: nan  LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 36s (remain 1m 4s) Loss: 0.0420(0.0725) Grad: 4.9382  LR: 0.00000724  
Epoch: [3][200/279] Elapsed 1m 12s (remain 0m 28s) Loss: 0.0416(0.0591) Grad: 5.3206  LR: 0.00000469  
Epoch: [3][278/279] Elapsed 1m 41s (remain 0m 0s) Loss: 0.0207(0.0566) Grad: 5.1337  LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 13s) Loss: 0.9006(0.9006) 


Epoch 3 - avg_train_loss: 0.0566  avg_val_loss: 1.0020  time: 112s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0566  avg_val_loss: 1.0020  time: 112s
Epoch 3 - Score: 0.6406
INFO:__main__:Epoch 3 - Score: 0.6406


EVAL: [15/16] Elapsed 0m 10s (remain 0m 0s) Loss: 1.3080(1.0020) 
f1 score : 0.5149051490514904
recall score : 0.6209150326797386
precision score : 0.4398148148148148
Epoch: [4][0/279] Elapsed 0m 0s (remain 2m 49s) Loss: 0.0125(0.0125) Grad: nan  LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 36s (remain 1m 5s) Loss: 0.0027(0.0091) Grad: 0.1424  LR: 0.00000126  
Epoch: [4][200/279] Elapsed 1m 11s (remain 0m 27s) Loss: 0.0025(0.0072) Grad: 0.2567  LR: 0.00000026  
Epoch: [4][278/279] Elapsed 1m 40s (remain 0m 0s) Loss: 0.0016(0.0071) Grad: 0.1067  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 13s) Loss: 0.4182(0.4182) 


Epoch 4 - avg_train_loss: 0.0071  avg_val_loss: 0.9759  time: 111s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0071  avg_val_loss: 0.9759  time: 111s
Epoch 4 - Score: 0.6807
INFO:__main__:Epoch 4 - Score: 0.6807


EVAL: [15/16] Elapsed 0m 10s (remain 0m 0s) Loss: 2.2863(0.9759) 
f1 score : 0.4
recall score : 0.3464052287581699
precision score : 0.4732142857142857


Score: 0.7048
INFO:__main__:Score: 0.7048
ACC BEST Score: 0.7269
INFO:__main__:ACC BEST Score: 0.7269


f1 score : 0.07547169811320754
recall score : 0.0392156862745098
precision score : 1.0


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.27.4",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__:DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
 

Epoch: [1][0/279] Elapsed 0m 0s (remain 2m 48s) Loss: 0.5278(0.5278) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 37s (remain 1m 6s) Loss: 0.6055(0.6188) Grad: 2.3614  LR: 0.00001960  
Epoch: [1][200/279] Elapsed 1m 13s (remain 0m 28s) Loss: 0.6455(0.6115) Grad: 2.2809  LR: 0.00001845  
Epoch: [1][278/279] Elapsed 1m 41s (remain 0m 0s) Loss: 0.4966(0.5995) Grad: 2.3155  LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 13s) Loss: 0.3449(0.3449) 


Epoch 1 - avg_train_loss: 0.5995  avg_val_loss: 0.5822  time: 112s
INFO:__main__:Epoch 1 - avg_train_loss: 0.5995  avg_val_loss: 0.5822  time: 112s
Epoch 1 - Score: 0.6968
INFO:__main__:Epoch 1 - Score: 0.6968
Epoch 1 - Save Best Score: 0.6968 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6968 Model


EVAL: [15/16] Elapsed 0m 9s (remain 0m 0s) Loss: 1.0449(0.5822) 
f1 score : 0.05031446540880503
recall score : 0.026143790849673203
precision score : 0.6666666666666666
Epoch: [2][0/279] Elapsed 0m 0s (remain 2m 48s) Loss: 0.6997(0.6997) Grad: nan  LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 36s (remain 1m 3s) Loss: 0.4822(0.4557) Grad: 7.3224  LR: 0.00001483  
Epoch: [2][200/279] Elapsed 1m 12s (remain 0m 28s) Loss: 0.6548(0.4346) Grad: 9.2723  LR: 0.00001221  
Epoch: [2][278/279] Elapsed 1m 41s (remain 0m 0s) Loss: 0.5337(0.4242) Grad: 8.6310  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 13s) Loss: 0.2148(0.2148) 


Epoch 2 - avg_train_loss: 0.4242  avg_val_loss: 0.6838  time: 112s
INFO:__main__:Epoch 2 - avg_train_loss: 0.4242  avg_val_loss: 0.6838  time: 112s
Epoch 2 - Score: 0.6908
INFO:__main__:Epoch 2 - Score: 0.6908


EVAL: [15/16] Elapsed 0m 9s (remain 0m 0s) Loss: 1.5622(0.6838) 
f1 score : 0.25242718446601947
recall score : 0.16993464052287582
precision score : 0.49056603773584906
Epoch: [3][0/279] Elapsed 0m 0s (remain 3m 12s) Loss: 0.1440(0.1440) Grad: nan  LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 37s (remain 1m 5s) Loss: 0.0059(0.0674) Grad: 0.2651  LR: 0.00000724  
Epoch: [3][200/279] Elapsed 1m 13s (remain 0m 28s) Loss: 0.0148(0.0631) Grad: 2.3957  LR: 0.00000469  
Epoch: [3][278/279] Elapsed 1m 41s (remain 0m 0s) Loss: 0.0239(0.0564) Grad: 2.7964  LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 13s) Loss: 0.2250(0.2250) 


Epoch 3 - avg_train_loss: 0.0564  avg_val_loss: 1.2526  time: 111s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0564  avg_val_loss: 1.2526  time: 111s
Epoch 3 - Score: 0.6988
INFO:__main__:Epoch 3 - Score: 0.6988
Epoch 3 - Save Best Score: 0.6988 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.6988 Model


EVAL: [15/16] Elapsed 0m 9s (remain 0m 0s) Loss: 3.7282(1.2526) 
f1 score : 0.21875
recall score : 0.13725490196078433
precision score : 0.5384615384615384
Epoch: [4][0/279] Elapsed 0m 0s (remain 2m 54s) Loss: 0.1509(0.1509) Grad: nan  LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 37s (remain 1m 6s) Loss: 0.0096(0.0195) Grad: 1.6679  LR: 0.00000126  
Epoch: [4][200/279] Elapsed 1m 13s (remain 0m 28s) Loss: 0.0021(0.0192) Grad: 0.1073  LR: 0.00000026  
Epoch: [4][278/279] Elapsed 1m 41s (remain 0m 0s) Loss: 0.0021(0.0153) Grad: 0.0917  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 13s) Loss: 0.7257(0.7257) 


Epoch 4 - avg_train_loss: 0.0153  avg_val_loss: 1.1353  time: 112s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0153  avg_val_loss: 1.1353  time: 112s
Epoch 4 - Score: 0.6727
INFO:__main__:Epoch 4 - Score: 0.6727


EVAL: [15/16] Elapsed 0m 9s (remain 0m 0s) Loss: 1.9032(1.1353) 
f1 score : 0.40727272727272723
recall score : 0.3660130718954248
precision score : 0.45901639344262296


Score: 0.6988
INFO:__main__:Score: 0.6988
ACC BEST Score: 0.7088
INFO:__main__:ACC BEST Score: 0.7088


f1 score : 0.21875
recall score : 0.13725490196078433
precision score : 0.5384615384615384


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.27.4",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__:DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
 

Epoch: [1][0/279] Elapsed 0m 0s (remain 3m 31s) Loss: 0.6128(0.6128) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 36s (remain 1m 4s) Loss: 0.4468(0.5900) Grad: 1.4276  LR: 0.00001960  
Epoch: [1][200/279] Elapsed 1m 13s (remain 0m 28s) Loss: 0.5420(0.6002) Grad: 2.5261  LR: 0.00001845  
Epoch: [1][278/279] Elapsed 1m 41s (remain 0m 0s) Loss: 0.6802(0.6028) Grad: 3.6673  LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 13s) Loss: 0.4253(0.4253) 


Epoch 1 - avg_train_loss: 0.6028  avg_val_loss: 0.5752  time: 112s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6028  avg_val_loss: 0.5752  time: 112s
Epoch 1 - Score: 0.7404
INFO:__main__:Epoch 1 - Score: 0.7404
Epoch 1 - Save Best Score: 0.7404 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7404 Model


EVAL: [15/16] Elapsed 0m 10s (remain 0m 0s) Loss: 0.8722(0.5752) 
f1 score : 0.39999999999999997
recall score : 0.28289473684210525
precision score : 0.6825396825396826
Epoch: [2][0/279] Elapsed 0m 0s (remain 2m 46s) Loss: 0.4570(0.4570) Grad: nan  LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 36s (remain 1m 5s) Loss: 0.4741(0.4821) Grad: 5.5444  LR: 0.00001483  
Epoch: [2][200/279] Elapsed 1m 13s (remain 0m 28s) Loss: 0.3770(0.4660) Grad: 6.4081  LR: 0.00001221  
Epoch: [2][278/279] Elapsed 1m 41s (remain 0m 0s) Loss: 0.4534(0.4591) Grad: 7.0612  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 13s) Loss: 0.2557(0.2557) 


Epoch 2 - avg_train_loss: 0.4591  avg_val_loss: 0.6120  time: 112s
INFO:__main__:Epoch 2 - avg_train_loss: 0.4591  avg_val_loss: 0.6120  time: 112s
Epoch 2 - Score: 0.7042
INFO:__main__:Epoch 2 - Score: 0.7042


EVAL: [15/16] Elapsed 0m 10s (remain 0m 0s) Loss: 1.2051(0.6120) 
f1 score : 0.40485829959514175
recall score : 0.32894736842105265
precision score : 0.5263157894736842
Epoch: [3][0/279] Elapsed 0m 0s (remain 2m 42s) Loss: 0.0843(0.0843) Grad: nan  LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 37s (remain 1m 6s) Loss: 0.0603(0.0697) Grad: 9.1201  LR: 0.00000724  
Epoch: [3][200/279] Elapsed 1m 13s (remain 0m 28s) Loss: 0.0336(0.0659) Grad: 5.2624  LR: 0.00000469  
Epoch: [3][278/279] Elapsed 1m 41s (remain 0m 0s) Loss: 0.0102(0.0607) Grad: 1.3981  LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 13s) Loss: 0.4569(0.4569) 


Epoch 3 - avg_train_loss: 0.0607  avg_val_loss: 1.0011  time: 112s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0607  avg_val_loss: 1.0011  time: 112s
Epoch 3 - Score: 0.6680
INFO:__main__:Epoch 3 - Score: 0.6680


EVAL: [15/16] Elapsed 0m 10s (remain 0m 0s) Loss: 1.6145(1.0011) 
f1 score : 0.4290657439446367
recall score : 0.40789473684210525
precision score : 0.45255474452554745
Epoch: [4][0/279] Elapsed 0m 0s (remain 3m 25s) Loss: 0.0038(0.0038) Grad: nan  LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 36s (remain 1m 4s) Loss: 0.0020(0.0157) Grad: 0.1551  LR: 0.00000126  
Epoch: [4][200/279] Elapsed 1m 12s (remain 0m 27s) Loss: 0.0023(0.0126) Grad: 0.1140  LR: 0.00000026  
Epoch: [4][278/279] Elapsed 1m 41s (remain 0m 0s) Loss: 0.0054(0.0112) Grad: 0.3792  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 13s) Loss: 0.4577(0.4577) 


Epoch 4 - avg_train_loss: 0.0112  avg_val_loss: 1.0448  time: 112s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0112  avg_val_loss: 1.0448  time: 112s
Epoch 4 - Score: 0.6841
INFO:__main__:Epoch 4 - Score: 0.6841


EVAL: [15/16] Elapsed 0m 10s (remain 0m 0s) Loss: 1.7525(1.0448) 
f1 score : 0.4332129963898917
recall score : 0.39473684210526316
precision score : 0.48


Score: 0.7404
INFO:__main__:Score: 0.7404
ACC BEST Score: 0.7404
INFO:__main__:ACC BEST Score: 0.7404


f1 score : 0.39999999999999997
recall score : 0.28289473684210525
precision score : 0.6825396825396826


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.27.4",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__:DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
 

Epoch: [1][0/279] Elapsed 0m 0s (remain 2m 56s) Loss: 0.6880(0.6880) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 36s (remain 1m 4s) Loss: 0.7007(0.6024) Grad: 7.4489  LR: 0.00001960  
Epoch: [1][200/279] Elapsed 1m 14s (remain 0m 28s) Loss: 0.6597(0.6003) Grad: 1.4510  LR: 0.00001845  
Epoch: [1][278/279] Elapsed 1m 41s (remain 0m 0s) Loss: 0.5308(0.5995) Grad: 1.6394  LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 13s) Loss: 0.4878(0.4878) 


Epoch 1 - avg_train_loss: 0.5995  avg_val_loss: 0.6000  time: 112s
INFO:__main__:Epoch 1 - avg_train_loss: 0.5995  avg_val_loss: 0.6000  time: 112s
Epoch 1 - Score: 0.6539
INFO:__main__:Epoch 1 - Score: 0.6539
Epoch 1 - Save Best Score: 0.6539 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6539 Model


EVAL: [15/16] Elapsed 0m 9s (remain 0m 0s) Loss: 0.8051(0.6000) 
f1 score : 0.2950819672131147
recall score : 0.23684210526315788
precision score : 0.391304347826087
Epoch: [2][0/279] Elapsed 0m 0s (remain 3m 20s) Loss: 0.5679(0.5679) Grad: nan  LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 36s (remain 1m 4s) Loss: 0.2177(0.4652) Grad: 5.3416  LR: 0.00001483  
Epoch: [2][200/279] Elapsed 1m 12s (remain 0m 28s) Loss: 0.5737(0.4560) Grad: 5.9210  LR: 0.00001221  
Epoch: [2][278/279] Elapsed 1m 41s (remain 0m 0s) Loss: 0.4792(0.4493) Grad: 10.8769  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 13s) Loss: 0.2333(0.2333) 


Epoch 2 - avg_train_loss: 0.4493  avg_val_loss: 0.6681  time: 112s
INFO:__main__:Epoch 2 - avg_train_loss: 0.4493  avg_val_loss: 0.6681  time: 112s
Epoch 2 - Score: 0.6821
INFO:__main__:Epoch 2 - Score: 0.6821
Epoch 2 - Save Best Score: 0.6821 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.6821 Model


EVAL: [15/16] Elapsed 0m 9s (remain 0m 0s) Loss: 1.3449(0.6681) 
f1 score : 0.22549019607843138
recall score : 0.1513157894736842
precision score : 0.4423076923076923
Epoch: [3][0/279] Elapsed 0m 0s (remain 2m 51s) Loss: 0.2023(0.2023) Grad: nan  LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 38s (remain 1m 6s) Loss: 0.0587(0.0818) Grad: 7.8259  LR: 0.00000724  
Epoch: [3][200/279] Elapsed 1m 13s (remain 0m 28s) Loss: 0.0148(0.0806) Grad: 1.3560  LR: 0.00000469  
Epoch: [3][278/279] Elapsed 1m 42s (remain 0m 0s) Loss: 0.0086(0.0712) Grad: 1.0477  LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 13s) Loss: 0.2853(0.2853) 


Epoch 3 - avg_train_loss: 0.0712  avg_val_loss: 1.1558  time: 112s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0712  avg_val_loss: 1.1558  time: 112s
Epoch 3 - Score: 0.6680
INFO:__main__:Epoch 3 - Score: 0.6680


EVAL: [15/16] Elapsed 0m 9s (remain 0m 0s) Loss: 2.1288(1.1558) 
f1 score : 0.28571428571428575
recall score : 0.21710526315789475
precision score : 0.4177215189873418
Epoch: [4][0/279] Elapsed 0m 0s (remain 2m 53s) Loss: 0.0203(0.0203) Grad: nan  LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 36s (remain 1m 4s) Loss: 0.0036(0.0149) Grad: 0.3111  LR: 0.00000126  
Epoch: [4][200/279] Elapsed 1m 13s (remain 0m 28s) Loss: 0.0036(0.0171) Grad: 0.1894  LR: 0.00000026  
Epoch: [4][278/279] Elapsed 1m 41s (remain 0m 0s) Loss: 0.0027(0.0149) Grad: 0.1490  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 13s) Loss: 0.4373(0.4373) 


Epoch 4 - avg_train_loss: 0.0149  avg_val_loss: 1.1913  time: 112s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0149  avg_val_loss: 1.1913  time: 112s
Epoch 4 - Score: 0.6620
INFO:__main__:Epoch 4 - Score: 0.6620


EVAL: [15/16] Elapsed 0m 9s (remain 0m 0s) Loss: 1.8540(1.1913) 
f1 score : 0.34375
recall score : 0.2894736842105263
precision score : 0.4230769230769231


Score: 0.6821
INFO:__main__:Score: 0.6821
ACC BEST Score: 0.7022
INFO:__main__:ACC BEST Score: 0.7022


f1 score : 0.22549019607843138
recall score : 0.1513157894736842
precision score : 0.4423076923076923


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.27.4",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__:DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
 

Epoch: [1][0/279] Elapsed 0m 0s (remain 2m 59s) Loss: 0.8350(0.8350) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 36s (remain 1m 3s) Loss: 0.8784(0.6187) Grad: 14.2425  LR: 0.00001960  
Epoch: [1][200/279] Elapsed 1m 12s (remain 0m 27s) Loss: 0.6499(0.6058) Grad: 3.4097  LR: 0.00001845  
Epoch: [1][278/279] Elapsed 1m 40s (remain 0m 0s) Loss: 0.5596(0.6019) Grad: 2.8832  LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 14s) Loss: 0.4954(0.4954) 


Epoch 1 - avg_train_loss: 0.6019  avg_val_loss: 0.5927  time: 112s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6019  avg_val_loss: 0.5927  time: 112s
Epoch 1 - Score: 0.7082
INFO:__main__:Epoch 1 - Score: 0.7082
Epoch 1 - Save Best Score: 0.7082 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7082 Model


EVAL: [15/16] Elapsed 0m 10s (remain 0m 0s) Loss: 0.7989(0.5927) 
f1 score : 0.24083769633507854
recall score : 0.1513157894736842
precision score : 0.5897435897435898
Epoch: [2][0/279] Elapsed 0m 0s (remain 2m 57s) Loss: 0.6118(0.6118) Grad: nan  LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 36s (remain 1m 4s) Loss: 0.4407(0.4803) Grad: 7.7376  LR: 0.00001483  
Epoch: [2][200/279] Elapsed 1m 12s (remain 0m 28s) Loss: 0.2776(0.4822) Grad: 4.1587  LR: 0.00001221  
Epoch: [2][278/279] Elapsed 1m 41s (remain 0m 0s) Loss: 0.3440(0.4674) Grad: 5.2637  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 14s) Loss: 0.3821(0.3821) 


Epoch 2 - avg_train_loss: 0.4674  avg_val_loss: 0.5981  time: 112s
INFO:__main__:Epoch 2 - avg_train_loss: 0.4674  avg_val_loss: 0.5981  time: 112s
Epoch 2 - Score: 0.7082
INFO:__main__:Epoch 2 - Score: 0.7082


EVAL: [15/16] Elapsed 0m 10s (remain 0m 0s) Loss: 1.1804(0.5981) 
f1 score : 0.45692883895131087
recall score : 0.40131578947368424
precision score : 0.5304347826086957
Epoch: [3][0/279] Elapsed 0m 0s (remain 3m 10s) Loss: 0.1266(0.1266) Grad: nan  LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 37s (remain 1m 5s) Loss: 0.0351(0.0956) Grad: 3.0554  LR: 0.00000724  
Epoch: [3][200/279] Elapsed 1m 13s (remain 0m 28s) Loss: 0.0172(0.0953) Grad: 2.8451  LR: 0.00000469  
Epoch: [3][278/279] Elapsed 1m 41s (remain 0m 0s) Loss: 0.0704(0.0933) Grad: 9.4863  LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 14s) Loss: 0.4347(0.4347) 


Epoch 3 - avg_train_loss: 0.0933  avg_val_loss: 0.8748  time: 112s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0933  avg_val_loss: 0.8748  time: 112s
Epoch 3 - Score: 0.7042
INFO:__main__:Epoch 3 - Score: 0.7042


EVAL: [15/16] Elapsed 0m 10s (remain 0m 0s) Loss: 2.6180(0.8748) 
f1 score : 0.4235294117647059
recall score : 0.35526315789473684
precision score : 0.5242718446601942
Epoch: [4][0/279] Elapsed 0m 0s (remain 2m 50s) Loss: 0.0084(0.0084) Grad: nan  LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 36s (remain 1m 4s) Loss: 0.0046(0.0171) Grad: 0.4409  LR: 0.00000126  
Epoch: [4][200/279] Elapsed 1m 13s (remain 0m 28s) Loss: 0.0659(0.0169) Grad: 7.9060  LR: 0.00000026  
Epoch: [4][278/279] Elapsed 1m 41s (remain 0m 0s) Loss: 0.0068(0.0165) Grad: 0.6866  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 14s) Loss: 0.5898(0.5898) 


Epoch 4 - avg_train_loss: 0.0165  avg_val_loss: 0.9361  time: 112s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0165  avg_val_loss: 0.9361  time: 112s
Epoch 4 - Score: 0.6942
INFO:__main__:Epoch 4 - Score: 0.6942


EVAL: [15/16] Elapsed 0m 10s (remain 0m 0s) Loss: 2.5801(0.9361) 
f1 score : 0.4242424242424242
recall score : 0.3684210526315789
precision score : 0.5


Score: 0.7082
INFO:__main__:Score: 0.7082
ACC BEST Score: 0.7082
INFO:__main__:ACC BEST Score: 0.7082


f1 score : 0.24083769633507854
recall score : 0.1513157894736842
precision score : 0.5897435897435898


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.27.4",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__:DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
 

Epoch: [1][0/279] Elapsed 0m 0s (remain 4m 14s) Loss: 0.8901(0.8901) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 37s (remain 1m 5s) Loss: 0.5244(0.6451) Grad: 1.9965  LR: 0.00001960  
Epoch: [1][200/279] Elapsed 1m 14s (remain 0m 28s) Loss: 0.6045(0.6279) Grad: 1.2944  LR: 0.00001845  
Epoch: [1][278/279] Elapsed 1m 41s (remain 0m 0s) Loss: 0.3057(0.6113) Grad: 1.8851  LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 13s) Loss: 0.1671(0.1671) 


Epoch 1 - avg_train_loss: 0.6113  avg_val_loss: 0.6319  time: 112s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6113  avg_val_loss: 0.6319  time: 112s
Epoch 1 - Score: 0.7143
INFO:__main__:Epoch 1 - Score: 0.7143
Epoch 1 - Save Best Score: 0.7143 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7143 Model


EVAL: [15/16] Elapsed 0m 9s (remain 0m 0s) Loss: 1.5316(0.6319) 
f1 score : 0.12345679012345677
recall score : 0.06578947368421052
precision score : 1.0
Epoch: [2][0/279] Elapsed 0m 0s (remain 3m 20s) Loss: 0.7246(0.7246) Grad: nan  LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 37s (remain 1m 5s) Loss: 0.6538(0.4687) Grad: 5.6003  LR: 0.00001483  
Epoch: [2][200/279] Elapsed 1m 13s (remain 0m 28s) Loss: 0.4626(0.4630) Grad: 4.7503  LR: 0.00001221  
Epoch: [2][278/279] Elapsed 1m 42s (remain 0m 0s) Loss: 0.3223(0.4636) Grad: 3.3526  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 13s) Loss: 0.2542(0.2542) 


Epoch 2 - avg_train_loss: 0.4636  avg_val_loss: 0.6170  time: 112s
INFO:__main__:Epoch 2 - avg_train_loss: 0.4636  avg_val_loss: 0.6170  time: 112s
Epoch 2 - Score: 0.7223
INFO:__main__:Epoch 2 - Score: 0.7223
Epoch 2 - Save Best Score: 0.7223 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7223 Model


EVAL: [15/16] Elapsed 0m 9s (remain 0m 0s) Loss: 1.2823(0.6170) 
f1 score : 0.3490566037735849
recall score : 0.24342105263157895
precision score : 0.6166666666666667
Epoch: [3][0/279] Elapsed 0m 0s (remain 3m 20s) Loss: 0.1792(0.1792) Grad: nan  LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 37s (remain 1m 5s) Loss: 0.0606(0.1018) Grad: 5.5455  LR: 0.00000724  
Epoch: [3][200/279] Elapsed 1m 13s (remain 0m 28s) Loss: 0.0319(0.0740) Grad: 4.7253  LR: 0.00000469  
Epoch: [3][278/279] Elapsed 1m 41s (remain 0m 0s) Loss: 0.0684(0.0656) Grad: 8.6515  LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 13s) Loss: 0.5532(0.5532) 


Epoch 3 - avg_train_loss: 0.0656  avg_val_loss: 0.9668  time: 112s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0656  avg_val_loss: 0.9668  time: 112s
Epoch 3 - Score: 0.6700
INFO:__main__:Epoch 3 - Score: 0.6700


EVAL: [15/16] Elapsed 0m 9s (remain 0m 0s) Loss: 1.6878(0.9668) 
f1 score : 0.43448275862068964
recall score : 0.4144736842105263
precision score : 0.45652173913043476
Epoch: [4][0/279] Elapsed 0m 0s (remain 3m 9s) Loss: 0.0021(0.0021) Grad: nan  LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 37s (remain 1m 6s) Loss: 0.0021(0.0044) Grad: 0.1439  LR: 0.00000126  
Epoch: [4][200/279] Elapsed 1m 13s (remain 0m 28s) Loss: 0.0043(0.0082) Grad: 0.6168  LR: 0.00000026  
Epoch: [4][278/279] Elapsed 1m 41s (remain 0m 0s) Loss: 0.0035(0.0090) Grad: 0.2799  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 13s) Loss: 0.4904(0.4904) 


Epoch 4 - avg_train_loss: 0.0090  avg_val_loss: 1.0418  time: 112s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0090  avg_val_loss: 1.0418  time: 112s
Epoch 4 - Score: 0.6841
INFO:__main__:Epoch 4 - Score: 0.6841


EVAL: [15/16] Elapsed 0m 9s (remain 0m 0s) Loss: 2.0014(1.0418) 
f1 score : 0.4163568773234201
recall score : 0.3684210526315789
precision score : 0.47863247863247865


Score: 0.7223
INFO:__main__:Score: 0.7223
ACC BEST Score: 0.7264
INFO:__main__:ACC BEST Score: 0.7264


f1 score : 0.3490566037735849
recall score : 0.24342105263157895
precision score : 0.6166666666666667


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.27.4",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__:DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
 

Epoch: [1][0/279] Elapsed 0m 0s (remain 2m 45s) Loss: 0.9121(0.9121) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 37s (remain 1m 5s) Loss: 0.4290(0.6338) Grad: 2.9176  LR: 0.00001960  
Epoch: [1][200/279] Elapsed 1m 13s (remain 0m 28s) Loss: 0.5327(0.6153) Grad: 7.5070  LR: 0.00001845  
Epoch: [1][278/279] Elapsed 1m 41s (remain 0m 0s) Loss: 0.5806(0.6103) Grad: 3.4169  LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 14s) Loss: 0.4022(0.4022) 


Epoch 1 - avg_train_loss: 0.6103  avg_val_loss: 0.5562  time: 112s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6103  avg_val_loss: 0.5562  time: 112s
Epoch 1 - Score: 0.7062
INFO:__main__:Epoch 1 - Score: 0.7062
Epoch 1 - Save Best Score: 0.7062 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7062 Model


EVAL: [15/16] Elapsed 0m 10s (remain 0m 0s) Loss: 1.0180(0.5562) 
f1 score : 0.20652173913043478
recall score : 0.125
precision score : 0.59375
Epoch: [2][0/279] Elapsed 0m 0s (remain 3m 14s) Loss: 0.5762(0.5762) Grad: nan  LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 37s (remain 1m 6s) Loss: 0.5737(0.4982) Grad: 13.2036  LR: 0.00001483  
Epoch: [2][200/279] Elapsed 1m 13s (remain 0m 28s) Loss: 0.5127(0.4608) Grad: 6.6390  LR: 0.00001221  
Epoch: [2][278/279] Elapsed 1m 42s (remain 0m 0s) Loss: 0.4763(0.4449) Grad: 6.4549  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 14s) Loss: 0.1707(0.1707) 


Epoch 2 - avg_train_loss: 0.4449  avg_val_loss: 0.6493  time: 112s
INFO:__main__:Epoch 2 - avg_train_loss: 0.4449  avg_val_loss: 0.6493  time: 112s
Epoch 2 - Score: 0.7163
INFO:__main__:Epoch 2 - Score: 0.7163
Epoch 2 - Save Best Score: 0.7163 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7163 Model


EVAL: [15/16] Elapsed 0m 10s (remain 0m 0s) Loss: 1.8215(0.6493) 
f1 score : 0.24598930481283424
recall score : 0.1513157894736842
precision score : 0.6571428571428571
Epoch: [3][0/279] Elapsed 0m 0s (remain 3m 22s) Loss: 0.0983(0.0983) Grad: nan  LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 37s (remain 1m 5s) Loss: 0.0167(0.0591) Grad: 1.3630  LR: 0.00000724  
Epoch: [3][200/279] Elapsed 1m 13s (remain 0m 28s) Loss: 0.0090(0.0493) Grad: 0.7205  LR: 0.00000469  
Epoch: [3][278/279] Elapsed 1m 41s (remain 0m 0s) Loss: 0.0026(0.0449) Grad: 0.1515  LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 14s) Loss: 0.6583(0.6583) 


Epoch 3 - avg_train_loss: 0.0449  avg_val_loss: 0.9257  time: 112s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0449  avg_val_loss: 0.9257  time: 112s
Epoch 3 - Score: 0.7002
INFO:__main__:Epoch 3 - Score: 0.7002


EVAL: [15/16] Elapsed 0m 10s (remain 0m 0s) Loss: 2.2348(0.9257) 
f1 score : 0.4983164983164983
recall score : 0.4868421052631579
precision score : 0.5103448275862069
Epoch: [4][0/279] Elapsed 0m 0s (remain 3m 6s) Loss: 0.0054(0.0054) Grad: nan  LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 36s (remain 1m 5s) Loss: 0.0024(0.0147) Grad: 0.1163  LR: 0.00000126  
Epoch: [4][200/279] Elapsed 1m 12s (remain 0m 28s) Loss: 0.0057(0.0131) Grad: 0.9154  LR: 0.00000026  
Epoch: [4][278/279] Elapsed 1m 40s (remain 0m 0s) Loss: 0.0020(0.0110) Grad: 0.0991  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 14s) Loss: 0.4507(0.4507) 


Epoch 4 - avg_train_loss: 0.0110  avg_val_loss: 0.9656  time: 111s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0110  avg_val_loss: 0.9656  time: 111s
Epoch 4 - Score: 0.7203
INFO:__main__:Epoch 4 - Score: 0.7203
Epoch 4 - Save Best Score: 0.7203 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7203 Model


EVAL: [15/16] Elapsed 0m 10s (remain 0m 0s) Loss: 2.9082(0.9656) 
f1 score : 0.47940074906367036
recall score : 0.42105263157894735
precision score : 0.5565217391304348


Score: 0.7203
INFO:__main__:Score: 0.7203
ACC BEST Score: 0.7344
INFO:__main__:ACC BEST Score: 0.7344


f1 score : 0.47940074906367036
recall score : 0.42105263157894735
precision score : 0.5565217391304348


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.27.4",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__:DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
 

Epoch: [1][0/279] Elapsed 0m 0s (remain 3m 22s) Loss: 0.7065(0.7065) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 37s (remain 1m 5s) Loss: 0.6284(0.6025) Grad: 2.1033  LR: 0.00001960  
Epoch: [1][200/279] Elapsed 1m 14s (remain 0m 28s) Loss: 0.6973(0.6055) Grad: 6.7416  LR: 0.00001845  
Epoch: [1][278/279] Elapsed 1m 42s (remain 0m 0s) Loss: 0.5103(0.6013) Grad: 3.9450  LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 12s) Loss: 0.2264(0.2264) 


Epoch 1 - avg_train_loss: 0.6013  avg_val_loss: 0.5922  time: 113s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6013  avg_val_loss: 0.5922  time: 113s
Epoch 1 - Score: 0.6982
INFO:__main__:Epoch 1 - Score: 0.6982
Epoch 1 - Save Best Score: 0.6982 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6982 Model


EVAL: [15/16] Elapsed 0m 9s (remain 0m 0s) Loss: 1.3634(0.5922) 
f1 score : 0.062499999999999986
recall score : 0.03289473684210526
precision score : 0.625
Epoch: [2][0/279] Elapsed 0m 0s (remain 2m 48s) Loss: 0.6362(0.6362) Grad: nan  LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 36s (remain 1m 3s) Loss: 0.3130(0.4698) Grad: 5.2688  LR: 0.00001483  
Epoch: [2][200/279] Elapsed 1m 12s (remain 0m 28s) Loss: 0.2788(0.4488) Grad: 8.6827  LR: 0.00001221  
Epoch: [2][278/279] Elapsed 1m 40s (remain 0m 0s) Loss: 0.4094(0.4314) Grad: 6.8290  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 13s) Loss: 0.3285(0.3285) 


Epoch 2 - avg_train_loss: 0.4314  avg_val_loss: 0.6216  time: 111s
INFO:__main__:Epoch 2 - avg_train_loss: 0.4314  avg_val_loss: 0.6216  time: 111s
Epoch 2 - Score: 0.6499
INFO:__main__:Epoch 2 - Score: 0.6499


EVAL: [15/16] Elapsed 0m 10s (remain 0m 0s) Loss: 1.0944(0.6216) 
f1 score : 0.3785714285714286
recall score : 0.34868421052631576
precision score : 0.4140625
Epoch: [3][0/279] Elapsed 0m 0s (remain 3m 15s) Loss: 0.1620(0.1620) Grad: nan  LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 37s (remain 1m 5s) Loss: 0.0055(0.0749) Grad: 0.1930  LR: 0.00000724  
Epoch: [3][200/279] Elapsed 1m 12s (remain 0m 28s) Loss: 0.0039(0.0572) Grad: 0.2030  LR: 0.00000469  
Epoch: [3][278/279] Elapsed 1m 41s (remain 0m 0s) Loss: 0.0092(0.0581) Grad: 1.5273  LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 12s) Loss: 0.1439(0.1439) 


Epoch 3 - avg_train_loss: 0.0581  avg_val_loss: 1.1658  time: 112s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0581  avg_val_loss: 1.1658  time: 112s
Epoch 3 - Score: 0.6740
INFO:__main__:Epoch 3 - Score: 0.6740


EVAL: [15/16] Elapsed 0m 9s (remain 0m 0s) Loss: 2.6064(1.1658) 
f1 score : 0.3360655737704918
recall score : 0.26973684210526316
precision score : 0.44565217391304346
Epoch: [4][0/279] Elapsed 0m 0s (remain 2m 59s) Loss: 0.0026(0.0026) Grad: nan  LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 37s (remain 1m 5s) Loss: 0.0017(0.0059) Grad: 0.0884  LR: 0.00000126  
Epoch: [4][200/279] Elapsed 1m 13s (remain 0m 28s) Loss: 0.1150(0.0098) Grad: 10.5590  LR: 0.00000026  
Epoch: [4][278/279] Elapsed 1m 41s (remain 0m 0s) Loss: 0.0050(0.0144) Grad: 0.5012  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 12s) Loss: 0.2053(0.2053) 


Epoch 4 - avg_train_loss: 0.0144  avg_val_loss: 1.1710  time: 112s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0144  avg_val_loss: 1.1710  time: 112s
Epoch 4 - Score: 0.6700
INFO:__main__:Epoch 4 - Score: 0.6700


EVAL: [15/16] Elapsed 0m 9s (remain 0m 0s) Loss: 2.3447(1.1710) 
f1 score : 0.36923076923076925
recall score : 0.3157894736842105
precision score : 0.4444444444444444


Score: 0.6982
INFO:__main__:Score: 0.6982
ACC BEST Score: 0.7103
INFO:__main__:ACC BEST Score: 0.7103
Score: 0.7073
INFO:__main__:Score: 0.7073
ACC BEST Score: 0.7077
INFO:__main__:ACC BEST Score: 0.7077


f1 score : 0.062499999999999986
recall score : 0.03289473684210526
precision score : 0.625
f1 score : 0.2806324110671937
recall score : 0.18647406434668418
precision score : 0.5668662674650699


In [None]:
from google.colab import runtime
runtime.unassign()