In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
!pip install transformers
!pip install datasets
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m55.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m103.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.4
Looking in indexes: https://pypi.org/simple, htt

In [3]:
!nvidia-smi

Sun Apr  2 15:48:54 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   30C    P0    46W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:

import os
import gc
import math
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import re
import html

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam, SGD, AdamW, RAdam
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,GroupKFold
from sklearn.metrics import log_loss,f1_score, recall_score, accuracy_score, precision_score

from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:

# ====================================================
# CFG
# ====================================================
class CFG:
    debug=False
    apex=True
    print_freq=100
    num_workers=4
    model="microsoft/deberta-v3-base"
    # model='microsoft/deberta-base'
    # model='roberta-base'
    # model='roberta-large'
    # model='roberta-large-mnli'
    # model='xlnet-large-cased'
    # model='albert-xxlarge-v2'
    # model="microsoft/deberta-large"
    # model="microsoft/deberta-v3-large"
    # model='microsoft/deberta-v2-xlarge'
    # model='funnel-transformer/large'
    # model='funnel-transformer/medium'
    # model='albert-base-v2'
    # model='albert-large-v2'
    # model='google/electra-large-discriminator'
    # model='google/electra-base-discriminator'
    # model="facebook/bart-large-mnli"
    # model="facebook/bart-large"
    # model="facebook/bart-base"
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=6
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=16
    fc_dropout=0.2
    target_size=1
    target_cols = 'y'
    max_len=256
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    seed=42
    n_fold=5
    trn_fold=[0, 1, 2, 3, 4]
    train=True
    nth_awp_start_epoch=1
    gradient_checkpointing = False
    freezing = False

if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0, 1]

In [6]:
DIR = "/content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測"
INPUT_DIR = os.path.join(DIR,"input")
OUTPUT_DIR = os.path.join(DIR,"output")
OUTPUT_EXP_DIR = DIR + '/output/EXP014/'
if not os.path.exists(OUTPUT_EXP_DIR):
    os.makedirs(OUTPUT_EXP_DIR)

In [7]:
def get_score(labels, outputs):
    thresh = 0.5
    y_pred = outputs
    y_true = labels
    f_score = f1_score(y_true, (y_pred>thresh).astype(int))
    r_score = recall_score(y_true, (y_pred>thresh).astype(int))
    p_score = precision_score(y_true, (y_pred>thresh).astype(int))
    print(f"f1 score : {f_score}")
    print(f"recall score : {r_score}")
    print(f"precision score : {p_score}")
    return accuracy_score(y_true, (y_pred>thresh).astype(int))

def get_acc_score(labels, outputs):
    y_pred = outputs
    y_true = labels
    best_score = 0
    best_thresh = 0.5
    for thresh in np.arange(0.1, 0.80, 0.01):
        thresh = np.round(thresh, 2)
        score = accuracy_score(y_true, (y_pred>thresh).astype(int))
        #print("Accuracy score at threshold {0} is {1}".format(thresh, score))
        if score > best_score:
          best_score = score
          best_thresh = thresh
    return accuracy_score(y_true, (y_pred>best_thresh).astype(int))


def get_logger(filename=OUTPUT_EXP_DIR+'train'):
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=CFG.seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=CFG.seed)

In [8]:
def freeze(module):
    """
    Freezes module's parameters.
    """
    
    for parameter in module.parameters():
        parameter.requires_grad = False
        
def get_freezed_parameters(module):
    """
    Returns names of freezed parameters of the given module.
    """
    
    freezed_parameters = []
    for name, parameter in module.named_parameters():
        if not parameter.requires_grad:
            freezed_parameters.append(name)
            
    return freezed_parameters

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """
    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930
    """
    
    embedding_types = ("word", "position", "token_type")
    for embedding_type in embedding_types:
        attr_name = f"{embedding_type}_embeddings"
        
        if hasattr(embeddings_path, attr_name): 
            bnb.optim.GlobalOptimManager.get_instance().register_module_override(
                getattr(embeddings_path, attr_name), 'weight', {'optim_bits': optim_bits}
            )

In [9]:
import pandas as pd
import numpy as np

train = pd.read_csv(os.path.join(INPUT_DIR,"train_data.csv"))
test = pd.read_csv(os.path.join(INPUT_DIR,"test_data.csv"))
sample_sub = pd.read_csv(os.path.join(INPUT_DIR,"submission.csv"))

print(train.shape)
display(train.head(3))

print(test.shape)
display(test.head(3))

print(sample_sub.shape)
display(sample_sub.head(3))

(4974, 6)


Unnamed: 0,id,title,year,abstract,keywords,y
0,1,Hierarchical Adversarially Learned Inference,2018,We propose a novel hierarchical generative mod...,"generative, hierarchical, unsupervised, semisu...",0
1,2,Learning to Compute Word Embeddings On the Fly,2018,Words in natural language follow a Zipfian dis...,"NLU, word embeddings, representation learning",0
2,3,Graph2Seq: Scalable Learning Dynamics for Graphs,2018,Neural networks are increasingly used as a gen...,,0


(6393, 5)


Unnamed: 0,id,title,year,abstract,keywords
0,1,StyleAlign: Analysis and Applications of Align...,2022,"In this paper, we perform an in-depth study of...","StyleGAN, transfer learning, fine tuning, mode..."
1,2,Embedding a random graph via GNN: mean-field i...,2021,We develop a theory for embedding a random gra...,"Graph neural network, graph embedding, multi-r..."
2,3,BBRefinement: an universal scheme to improve p...,2021,We present a conceptually simple yet powerful ...,"object detection, deep neural networks, refine..."


(6393, 2)


Unnamed: 0,id,y
0,1,0
1,2,0
2,3,0


In [10]:
train["texts"] = train["title"] + "[SEP]" + train["abstract"]

In [11]:
skf = StratifiedKFold(n_splits=CFG.n_fold,shuffle=True,random_state=CFG.seed)
for fold, ( _, val_) in enumerate(skf.split(train, train.y)):
    train.loc[val_ , "kfold"] = int(fold)
    
train["kfold"] = train["kfold"].astype(int)

if CFG.debug:
    display(train.groupby('kfold').size())
    train = train.sample(n=500, random_state=0).reset_index(drop=True)
    display(train.groupby('kfold').size())

In [12]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_EXP_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
# ====================================================
# Define max_len
# ====================================================
lengths = []
tk0 = tqdm(train['texts'].fillna("").values, total=len(train))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
    lengths.append(length)
CFG.max_len = max(lengths) + 3 # cls
LOGGER.info(f"max_len: {CFG.max_len}")

100%|██████████| 4974/4974 [00:03<00:00, 1336.39it/s]
max_len: 522
INFO:__main__:max_len: 522


In [14]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer.encode_plus(
        text, 
        return_tensors=None, 
        add_special_tokens=True, 
        max_length=CFG.max_len,
        pad_to_max_length=True,
        truncation=True
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.texts = df['texts'].values
        self.labels = df[cfg.target_cols].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.texts[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label
    

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

#collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [15]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

class MaxPooling(nn.Module):
    def __init__(self):
        super(MaxPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        embeddings = last_hidden_state.clone()
        embeddings[input_mask_expanded == 0] = -1e4
        max_embeddings, _ = torch.max(embeddings, dim=1)
        return max_embeddings
    

class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # Freezing
        if cfg.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            cfg.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())

        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)
        self._init_weights(self.fc)
        self.layer_norm1 = nn.LayerNorm(self.config.hidden_size)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        feature = self.layer_norm1(feature)
        output = self.fc(feature)
        return output

In [16]:

# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
        loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))

    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
        loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    predictions = np.concatenate(predictions)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

In [17]:

# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['kfold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['kfold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds['y'].values
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)


    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size*2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_EXP_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.BCEWithLogitsLoss(reduction="mean")
    
    best_score = -1.

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        
        # scoring
        score = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')

        
        if best_score < score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_EXP_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")

    predictions = torch.load(OUTPUT_EXP_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds['pred'] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [18]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        labels = oof_df['y'].values
        preds = oof_df['pred'].values
        score = get_score(labels, preds)
        acc_score = get_acc_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}')
        LOGGER.info(f'ACC BEST Score: {acc_score:<.4f}')
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
            #break
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_EXP_DIR+'oof_df.pkl')

DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.27.4",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__:DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "at

Downloading pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.bias', 'mask_predictions.dense.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/248] Elapsed 0m 4s (remain 17m 12s) Loss: 1.0488(1.0488) Grad: nan  LR: 0.00002000  
Epoch: [1][100/248] Elapsed 0m 15s (remain 0m 22s) Loss: 0.6767(0.6393) Grad: 3.1847  LR: 0.00001977  
Epoch: [1][200/248] Elapsed 0m 27s (remain 0m 6s) Loss: 0.5489(0.6296) Grad: 5.2055  LR: 0.00001912  
Epoch: [1][247/248] Elapsed 0m 33s (remain 0m 0s) Loss: 0.6207(0.6246) Grad: 2.3336  LR: 0.00001867  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 12s) Loss: 0.4701(0.4701) 


Epoch 1 - avg_train_loss: 0.6246  avg_val_loss: 0.6176  time: 40s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6246  avg_val_loss: 0.6176  time: 40s
Epoch 1 - Score: 0.6945
INFO:__main__:Epoch 1 - Score: 0.6945
Epoch 1 - Save Best Score: 0.6945 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6945 Model


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.9023(0.6176) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
Epoch: [2][0/248] Elapsed 0m 0s (remain 1m 19s) Loss: 0.5513(0.5513) Grad: nan  LR: 0.00001866  
Epoch: [2][100/248] Elapsed 0m 12s (remain 0m 17s) Loss: 0.6038(0.5968) Grad: 1.1525  LR: 0.00001742  
Epoch: [2][200/248] Elapsed 0m 23s (remain 0m 5s) Loss: 0.6025(0.5958) Grad: 1.0499  LR: 0.00001585  
Epoch: [2][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.6617(0.5934) Grad: 3.4961  LR: 0.00001502  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 11s) Loss: 0.4739(0.4739) 


Epoch 2 - avg_train_loss: 0.5934  avg_val_loss: 0.6075  time: 36s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5934  avg_val_loss: 0.6075  time: 36s
Epoch 2 - Score: 0.6945
INFO:__main__:Epoch 2 - Score: 0.6945


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.7667(0.6075) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
Epoch: [3][0/248] Elapsed 0m 0s (remain 1m 15s) Loss: 0.5495(0.5495) Grad: nan  LR: 0.00001501  
Epoch: [3][100/248] Elapsed 0m 11s (remain 0m 17s) Loss: 0.5912(0.5434) Grad: 3.9101  LR: 0.00001309  
Epoch: [3][200/248] Elapsed 0m 23s (remain 0m 5s) Loss: 0.7354(0.5326) Grad: 4.1980  LR: 0.00001103  
Epoch: [3][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.3796(0.5300) Grad: 4.1755  LR: 0.00001004  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 12s) Loss: 0.3079(0.3079) 


Epoch 3 - avg_train_loss: 0.5300  avg_val_loss: 0.5761  time: 36s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5300  avg_val_loss: 0.5761  time: 36s
Epoch 3 - Score: 0.7095
INFO:__main__:Epoch 3 - Score: 0.7095
Epoch 3 - Save Best Score: 0.7095 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7095 Model


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.7463(0.5761) 
f1 score : 0.3620309050772627
recall score : 0.26973684210526316
precision score : 0.5503355704697986
Epoch: [4][0/248] Elapsed 0m 0s (remain 1m 20s) Loss: 0.4289(0.4289) Grad: nan  LR: 0.00001002  
Epoch: [4][100/248] Elapsed 0m 12s (remain 0m 17s) Loss: 0.2358(0.3723) Grad: 3.6903  LR: 0.00000793  
Epoch: [4][200/248] Elapsed 0m 24s (remain 0m 5s) Loss: 0.3143(0.3547) Grad: 6.4130  LR: 0.00000593  
Epoch: [4][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.2112(0.3540) Grad: 7.5469  LR: 0.00000505  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 11s) Loss: 0.2752(0.2752) 


Epoch 4 - avg_train_loss: 0.3540  avg_val_loss: 0.6917  time: 36s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3540  avg_val_loss: 0.6917  time: 36s
Epoch 4 - Score: 0.7005
INFO:__main__:Epoch 4 - Score: 0.7005


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 1.3336(0.6917) 
f1 score : 0.39676113360323884
recall score : 0.3223684210526316
precision score : 0.5157894736842106
Epoch: [5][0/248] Elapsed 0m 0s (remain 1m 17s) Loss: 0.1409(0.1409) Grad: nan  LR: 0.00000503  
Epoch: [5][100/248] Elapsed 0m 12s (remain 0m 17s) Loss: 0.0399(0.1257) Grad: 3.9407  LR: 0.00000333  
Epoch: [5][200/248] Elapsed 0m 23s (remain 0m 5s) Loss: 0.0274(0.1204) Grad: 2.1428  LR: 0.00000192  
Epoch: [5][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.0400(0.1155) Grad: 3.3450  LR: 0.00000138  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 12s) Loss: 0.6527(0.6527) 


Epoch 5 - avg_train_loss: 0.1155  avg_val_loss: 1.0888  time: 36s
INFO:__main__:Epoch 5 - avg_train_loss: 0.1155  avg_val_loss: 1.0888  time: 36s
Epoch 5 - Score: 0.6523
INFO:__main__:Epoch 5 - Score: 0.6523


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 2.8590(1.0888) 
f1 score : 0.4525316455696203
recall score : 0.47039473684210525
precision score : 0.43597560975609756
Epoch: [6][0/248] Elapsed 0m 0s (remain 1m 14s) Loss: 0.0132(0.0132) Grad: nan  LR: 0.00000136  
Epoch: [6][100/248] Elapsed 0m 12s (remain 0m 17s) Loss: 0.0101(0.0439) Grad: 0.3299  LR: 0.00000050  
Epoch: [6][200/248] Elapsed 0m 23s (remain 0m 5s) Loss: 0.0140(0.0424) Grad: 0.4096  LR: 0.00000006  
Epoch: [6][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.0183(0.0411) Grad: 0.6955  LR: 0.00000000  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 11s) Loss: 0.5140(0.5140) 


Epoch 6 - avg_train_loss: 0.0411  avg_val_loss: 1.1222  time: 36s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0411  avg_val_loss: 1.1222  time: 36s
Epoch 6 - Score: 0.6653
INFO:__main__:Epoch 6 - Score: 0.6653


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 3.6922(1.1222) 
f1 score : 0.408525754884547
recall score : 0.3782894736842105
precision score : 0.444015444015444


Score: 0.7095
INFO:__main__:Score: 0.7095
ACC BEST Score: 0.7196
INFO:__main__:ACC BEST Score: 0.7196
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.27.4",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main

f1 score : 0.3620309050772627
recall score : 0.26973684210526316
precision score : 0.5503355704697986


Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.bias', 'mask_predictions.dense.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/248] Elapsed 0m 0s (remain 1m 28s) Loss: 0.6437(0.6437) Grad: nan  LR: 0.00002000  
Epoch: [1][100/248] Elapsed 0m 12s (remain 0m 17s) Loss: 0.6903(0.6265) Grad: 2.7052  LR: 0.00001977  
Epoch: [1][200/248] Elapsed 0m 23s (remain 0m 5s) Loss: 0.4235(0.6143) Grad: 3.1411  LR: 0.00001912  
Epoch: [1][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.6040(0.6150) Grad: 4.7076  LR: 0.00001867  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 12s) Loss: 0.4861(0.4861) 


Epoch 1 - avg_train_loss: 0.6150  avg_val_loss: 0.6135  time: 36s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6150  avg_val_loss: 0.6135  time: 36s
Epoch 1 - Score: 0.6935
INFO:__main__:Epoch 1 - Score: 0.6935
Epoch 1 - Save Best Score: 0.6935 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6935 Model


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.8273(0.6135) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
Epoch: [2][0/248] Elapsed 0m 0s (remain 1m 22s) Loss: 0.4795(0.4795) Grad: nan  LR: 0.00001866  
Epoch: [2][100/248] Elapsed 0m 11s (remain 0m 17s) Loss: 0.5344(0.5722) Grad: 2.6775  LR: 0.00001742  
Epoch: [2][200/248] Elapsed 0m 23s (remain 0m 5s) Loss: 0.6994(0.5459) Grad: 10.6357  LR: 0.00001585  
Epoch: [2][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.5851(0.5491) Grad: 5.4288  LR: 0.00001502  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 12s) Loss: 0.5142(0.5142) 


Epoch 2 - avg_train_loss: 0.5491  avg_val_loss: 0.6060  time: 36s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5491  avg_val_loss: 0.6060  time: 36s
Epoch 2 - Score: 0.6734
INFO:__main__:Epoch 2 - Score: 0.6734


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.6791(0.6060) 
f1 score : 0.5255474452554745
recall score : 0.5901639344262295
precision score : 0.47368421052631576
Epoch: [3][0/248] Elapsed 0m 0s (remain 1m 18s) Loss: 0.4547(0.4547) Grad: nan  LR: 0.00001501  
Epoch: [3][100/248] Elapsed 0m 11s (remain 0m 17s) Loss: 0.1211(0.2941) Grad: 6.5697  LR: 0.00001309  
Epoch: [3][200/248] Elapsed 0m 23s (remain 0m 5s) Loss: 0.2947(0.2606) Grad: 9.8802  LR: 0.00001103  
Epoch: [3][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.1600(0.2587) Grad: 8.1870  LR: 0.00001004  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 12s) Loss: 0.5282(0.5282) 


Epoch 3 - avg_train_loss: 0.2587  avg_val_loss: 0.7252  time: 36s
INFO:__main__:Epoch 3 - avg_train_loss: 0.2587  avg_val_loss: 0.7252  time: 36s
Epoch 3 - Score: 0.6663
INFO:__main__:Epoch 3 - Score: 0.6663


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 1.2061(0.7252) 
f1 score : 0.4779874213836478
recall score : 0.49836065573770494
precision score : 0.459214501510574
Epoch: [4][0/248] Elapsed 0m 0s (remain 1m 19s) Loss: 0.0445(0.0445) Grad: nan  LR: 0.00001002  
Epoch: [4][100/248] Elapsed 0m 12s (remain 0m 17s) Loss: 0.0226(0.0372) Grad: 2.6387  LR: 0.00000793  
Epoch: [4][200/248] Elapsed 0m 23s (remain 0m 5s) Loss: 0.0184(0.0292) Grad: 3.3244  LR: 0.00000593  
Epoch: [4][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.0083(0.0266) Grad: 0.3861  LR: 0.00000505  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 12s) Loss: 0.3494(0.3494) 


Epoch 4 - avg_train_loss: 0.0266  avg_val_loss: 1.0242  time: 36s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0266  avg_val_loss: 1.0242  time: 36s
Epoch 4 - Score: 0.6844
INFO:__main__:Epoch 4 - Score: 0.6844


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 3.3568(1.0242) 
f1 score : 0.38910505836575876
recall score : 0.32786885245901637
precision score : 0.4784688995215311
Epoch: [5][0/248] Elapsed 0m 0s (remain 1m 16s) Loss: 0.0063(0.0063) Grad: nan  LR: 0.00000503  
Epoch: [5][100/248] Elapsed 0m 12s (remain 0m 17s) Loss: 0.0050(0.0052) Grad: 0.3095  LR: 0.00000333  
Epoch: [5][200/248] Elapsed 0m 23s (remain 0m 5s) Loss: 0.0037(0.0047) Grad: 0.1680  LR: 0.00000192  
Epoch: [5][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.0044(0.0046) Grad: 0.2251  LR: 0.00000138  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 12s) Loss: 0.3208(0.3208) 


Epoch 5 - avg_train_loss: 0.0046  avg_val_loss: 1.1249  time: 36s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0046  avg_val_loss: 1.1249  time: 36s
Epoch 5 - Score: 0.6824
INFO:__main__:Epoch 5 - Score: 0.6824


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 4.0561(1.1249) 
f1 score : 0.373015873015873
recall score : 0.3081967213114754
precision score : 0.4723618090452261
Epoch: [6][0/248] Elapsed 0m 0s (remain 1m 19s) Loss: 0.0045(0.0045) Grad: nan  LR: 0.00000136  
Epoch: [6][100/248] Elapsed 0m 11s (remain 0m 17s) Loss: 0.0050(0.0036) Grad: 0.3436  LR: 0.00000050  
Epoch: [6][200/248] Elapsed 0m 23s (remain 0m 5s) Loss: 0.0031(0.0036) Grad: 0.1016  LR: 0.00000006  
Epoch: [6][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.0040(0.0036) Grad: 0.1481  LR: 0.00000000  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 12s) Loss: 0.3210(0.3210) 


Epoch 6 - avg_train_loss: 0.0036  avg_val_loss: 1.1365  time: 36s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0036  avg_val_loss: 1.1365  time: 36s
Epoch 6 - Score: 0.6824
INFO:__main__:Epoch 6 - Score: 0.6824


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 4.1192(1.1365) 
f1 score : 0.373015873015873
recall score : 0.3081967213114754
precision score : 0.4723618090452261


Score: 0.6935
INFO:__main__:Score: 0.6935
ACC BEST Score: 0.7055
INFO:__main__:ACC BEST Score: 0.7055
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.27.4",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main

f1 score : 0.0
recall score : 0.0
precision score : 0.0


Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.bias', 'mask_predictions.dense.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/248] Elapsed 0m 0s (remain 1m 26s) Loss: 0.5637(0.5637) Grad: nan  LR: 0.00002000  
Epoch: [1][100/248] Elapsed 0m 11s (remain 0m 17s) Loss: 0.5198(0.6187) Grad: 3.5967  LR: 0.00001977  
Epoch: [1][200/248] Elapsed 0m 23s (remain 0m 5s) Loss: 0.6723(0.6094) Grad: 4.6654  LR: 0.00001912  
Epoch: [1][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.5741(0.6119) Grad: 1.7543  LR: 0.00001867  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 13s) Loss: 0.4289(0.4289) 


Epoch 1 - avg_train_loss: 0.6119  avg_val_loss: 0.5961  time: 36s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6119  avg_val_loss: 0.5961  time: 36s
Epoch 1 - Score: 0.6975
INFO:__main__:Epoch 1 - Score: 0.6975
Epoch 1 - Save Best Score: 0.6975 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6975 Model


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.8530(0.5961) 
f1 score : 0.02588996763754045
recall score : 0.013114754098360656
precision score : 1.0
Epoch: [2][0/248] Elapsed 0m 0s (remain 1m 21s) Loss: 0.5148(0.5148) Grad: nan  LR: 0.00001866  
Epoch: [2][100/248] Elapsed 0m 12s (remain 0m 17s) Loss: 0.6701(0.5156) Grad: 10.3291  LR: 0.00001742  
Epoch: [2][200/248] Elapsed 0m 23s (remain 0m 5s) Loss: 0.6589(0.5274) Grad: 3.4113  LR: 0.00001585  
Epoch: [2][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.4548(0.5265) Grad: 3.4114  LR: 0.00001502  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 12s) Loss: 0.4165(0.4165) 


Epoch 2 - avg_train_loss: 0.5265  avg_val_loss: 0.5851  time: 36s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5265  avg_val_loss: 0.5851  time: 36s
Epoch 2 - Score: 0.6945
INFO:__main__:Epoch 2 - Score: 0.6945


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.8887(0.5851) 
f1 score : 0.3303964757709251
recall score : 0.2459016393442623
precision score : 0.5033557046979866
Epoch: [3][0/248] Elapsed 0m 0s (remain 1m 23s) Loss: 0.2947(0.2947) Grad: nan  LR: 0.00001501  
Epoch: [3][100/248] Elapsed 0m 12s (remain 0m 17s) Loss: 0.2044(0.3114) Grad: 6.1052  LR: 0.00001309  
Epoch: [3][200/248] Elapsed 0m 23s (remain 0m 5s) Loss: 0.3122(0.2824) Grad: 11.5005  LR: 0.00001103  
Epoch: [3][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.1305(0.2722) Grad: 11.4153  LR: 0.00001004  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 12s) Loss: 0.4312(0.4312) 


Epoch 3 - avg_train_loss: 0.2722  avg_val_loss: 0.8330  time: 36s
INFO:__main__:Epoch 3 - avg_train_loss: 0.2722  avg_val_loss: 0.8330  time: 36s
Epoch 3 - Score: 0.6724
INFO:__main__:Epoch 3 - Score: 0.6724


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 1.6769(0.8330) 
f1 score : 0.3319672131147541
recall score : 0.26557377049180325
precision score : 0.4426229508196721
Epoch: [4][0/248] Elapsed 0m 0s (remain 1m 24s) Loss: 0.3489(0.3489) Grad: nan  LR: 0.00001002  
Epoch: [4][100/248] Elapsed 0m 11s (remain 0m 17s) Loss: 0.0196(0.0455) Grad: 2.1008  LR: 0.00000793  
Epoch: [4][200/248] Elapsed 0m 23s (remain 0m 5s) Loss: 0.3292(0.0413) Grad: 4.7269  LR: 0.00000593  
Epoch: [4][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.0213(0.0372) Grad: 3.6759  LR: 0.00000505  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 12s) Loss: 0.6284(0.6284) 


Epoch 4 - avg_train_loss: 0.0372  avg_val_loss: 1.1216  time: 36s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0372  avg_val_loss: 1.1216  time: 36s
Epoch 4 - Score: 0.6774
INFO:__main__:Epoch 4 - Score: 0.6774


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 1.6266(1.1216) 
f1 score : 0.3885714285714285
recall score : 0.3344262295081967
precision score : 0.4636363636363636
Epoch: [5][0/248] Elapsed 0m 0s (remain 1m 17s) Loss: 0.0054(0.0054) Grad: nan  LR: 0.00000503  
Epoch: [5][100/248] Elapsed 0m 12s (remain 0m 17s) Loss: 0.0061(0.0094) Grad: 0.4266  LR: 0.00000333  
Epoch: [5][200/248] Elapsed 0m 23s (remain 0m 5s) Loss: 0.0049(0.0079) Grad: 0.3102  LR: 0.00000192  
Epoch: [5][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.0044(0.0084) Grad: 0.2463  LR: 0.00000138  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 13s) Loss: 0.7564(0.7564) 


Epoch 5 - avg_train_loss: 0.0084  avg_val_loss: 1.2143  time: 36s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0084  avg_val_loss: 1.2143  time: 36s
Epoch 5 - Score: 0.6643
INFO:__main__:Epoch 5 - Score: 0.6643


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 1.5855(1.2143) 
f1 score : 0.3927272727272727
recall score : 0.3540983606557377
precision score : 0.44081632653061226
Epoch: [6][0/248] Elapsed 0m 0s (remain 1m 18s) Loss: 0.0044(0.0044) Grad: nan  LR: 0.00000136  
Epoch: [6][100/248] Elapsed 0m 12s (remain 0m 17s) Loss: 0.0033(0.0063) Grad: 0.1807  LR: 0.00000050  
Epoch: [6][200/248] Elapsed 0m 23s (remain 0m 5s) Loss: 0.0035(0.0060) Grad: 0.0944  LR: 0.00000006  
Epoch: [6][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.0049(0.0057) Grad: 0.4310  LR: 0.00000000  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 12s) Loss: 0.7697(0.7697) 


Epoch 6 - avg_train_loss: 0.0057  avg_val_loss: 1.2234  time: 36s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0057  avg_val_loss: 1.2234  time: 36s
Epoch 6 - Score: 0.6633
INFO:__main__:Epoch 6 - Score: 0.6633


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 1.5705(1.2234) 
f1 score : 0.39421338155515373
recall score : 0.35737704918032787
precision score : 0.43951612903225806


Score: 0.6975
INFO:__main__:Score: 0.6975
ACC BEST Score: 0.6995
INFO:__main__:ACC BEST Score: 0.6995


f1 score : 0.02588996763754045
recall score : 0.013114754098360656
precision score : 1.0


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.27.4",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__:DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "at

Epoch: [1][0/248] Elapsed 0m 0s (remain 1m 26s) Loss: 0.7845(0.7845) Grad: nan  LR: 0.00002000  
Epoch: [1][100/248] Elapsed 0m 11s (remain 0m 17s) Loss: 0.8779(0.6359) Grad: 8.5131  LR: 0.00001977  
Epoch: [1][200/248] Elapsed 0m 23s (remain 0m 5s) Loss: 0.6193(0.6227) Grad: 0.6727  LR: 0.00001912  
Epoch: [1][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.4074(0.6171) Grad: 4.6525  LR: 0.00001867  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 13s) Loss: 0.3356(0.3356) 


Epoch 1 - avg_train_loss: 0.6171  avg_val_loss: 0.5921  time: 36s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6171  avg_val_loss: 0.5921  time: 36s
Epoch 1 - Score: 0.6935
INFO:__main__:Epoch 1 - Score: 0.6935
Epoch 1 - Save Best Score: 0.6935 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6935 Model


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 1.1065(0.5921) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
Epoch: [2][0/248] Elapsed 0m 0s (remain 1m 30s) Loss: 0.4953(0.4953) Grad: nan  LR: 0.00001866  
Epoch: [2][100/248] Elapsed 0m 12s (remain 0m 17s) Loss: 0.5818(0.5512) Grad: 6.8944  LR: 0.00001742  
Epoch: [2][200/248] Elapsed 0m 24s (remain 0m 5s) Loss: 0.4428(0.5521) Grad: 1.6330  LR: 0.00001585  
Epoch: [2][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.4949(0.5503) Grad: 2.8630  LR: 0.00001502  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 13s) Loss: 0.2760(0.2760) 


Epoch 2 - avg_train_loss: 0.5503  avg_val_loss: 0.5861  time: 36s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5503  avg_val_loss: 0.5861  time: 36s
Epoch 2 - Score: 0.7075
INFO:__main__:Epoch 2 - Score: 0.7075
Epoch 2 - Save Best Score: 0.7075 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7075 Model


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.8509(0.5861) 
f1 score : 0.24806201550387597
recall score : 0.15737704918032788
precision score : 0.5853658536585366
Epoch: [3][0/248] Elapsed 0m 0s (remain 1m 23s) Loss: 0.5085(0.5085) Grad: nan  LR: 0.00001501  
Epoch: [3][100/248] Elapsed 0m 12s (remain 0m 17s) Loss: 0.2198(0.3254) Grad: 7.8743  LR: 0.00001309  
Epoch: [3][200/248] Elapsed 0m 24s (remain 0m 5s) Loss: 0.2304(0.3160) Grad: 5.5701  LR: 0.00001103  
Epoch: [3][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.2220(0.3057) Grad: 6.8008  LR: 0.00001004  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 12s) Loss: 0.1702(0.1702) 


Epoch 3 - avg_train_loss: 0.3057  avg_val_loss: 0.7827  time: 36s
INFO:__main__:Epoch 3 - avg_train_loss: 0.3057  avg_val_loss: 0.7827  time: 36s
Epoch 3 - Score: 0.7075
INFO:__main__:Epoch 3 - Score: 0.7075


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 1.4858(0.7827) 
f1 score : 0.3576158940397351
recall score : 0.26557377049180325
precision score : 0.5472972972972973
Epoch: [4][0/248] Elapsed 0m 0s (remain 1m 22s) Loss: 0.0534(0.0534) Grad: nan  LR: 0.00001002  
Epoch: [4][100/248] Elapsed 0m 12s (remain 0m 17s) Loss: 0.0201(0.0571) Grad: 1.2040  LR: 0.00000793  
Epoch: [4][200/248] Elapsed 0m 23s (remain 0m 5s) Loss: 0.0055(0.0423) Grad: 0.1883  LR: 0.00000593  
Epoch: [4][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.0294(0.0401) Grad: 4.0628  LR: 0.00000505  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 13s) Loss: 0.1355(0.1355) 


Epoch 4 - avg_train_loss: 0.0401  avg_val_loss: 1.0644  time: 36s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0401  avg_val_loss: 1.0644  time: 36s
Epoch 4 - Score: 0.6995
INFO:__main__:Epoch 4 - Score: 0.6995


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 2.5162(1.0644) 
f1 score : 0.3757828810020877
recall score : 0.29508196721311475
precision score : 0.5172413793103449
Epoch: [5][0/248] Elapsed 0m 0s (remain 1m 20s) Loss: 0.0037(0.0037) Grad: nan  LR: 0.00000503  
Epoch: [5][100/248] Elapsed 0m 11s (remain 0m 17s) Loss: 0.0046(0.0069) Grad: 0.1999  LR: 0.00000333  
Epoch: [5][200/248] Elapsed 0m 23s (remain 0m 5s) Loss: 0.0067(0.0069) Grad: 0.2687  LR: 0.00000192  
Epoch: [5][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.0047(0.0065) Grad: 0.2253  LR: 0.00000138  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 12s) Loss: 0.2657(0.2657) 


Epoch 5 - avg_train_loss: 0.0065  avg_val_loss: 1.1371  time: 36s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0065  avg_val_loss: 1.1371  time: 36s
Epoch 5 - Score: 0.6844
INFO:__main__:Epoch 5 - Score: 0.6844


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 2.3438(1.1371) 
f1 score : 0.41417910447761197
recall score : 0.3639344262295082
precision score : 0.4805194805194805
Epoch: [6][0/248] Elapsed 0m 0s (remain 1m 17s) Loss: 0.0043(0.0043) Grad: nan  LR: 0.00000136  
Epoch: [6][100/248] Elapsed 0m 12s (remain 0m 17s) Loss: 0.0042(0.0041) Grad: 0.1866  LR: 0.00000050  
Epoch: [6][200/248] Elapsed 0m 23s (remain 0m 5s) Loss: 0.0086(0.0042) Grad: 0.6710  LR: 0.00000006  
Epoch: [6][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.0032(0.0041) Grad: 0.1164  LR: 0.00000000  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 13s) Loss: 0.2730(0.2730) 


Epoch 6 - avg_train_loss: 0.0041  avg_val_loss: 1.1494  time: 36s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0041  avg_val_loss: 1.1494  time: 36s
Epoch 6 - Score: 0.6834
INFO:__main__:Epoch 6 - Score: 0.6834


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 2.3574(1.1494) 
f1 score : 0.4134078212290503
recall score : 0.3639344262295082
precision score : 0.47844827586206895


Score: 0.7075
INFO:__main__:Score: 0.7075
ACC BEST Score: 0.7106
INFO:__main__:ACC BEST Score: 0.7106
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.27.4",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main

f1 score : 0.24806201550387597
recall score : 0.15737704918032788
precision score : 0.5853658536585366


Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.bias', 'mask_predictions.dense.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/248] Elapsed 0m 0s (remain 1m 26s) Loss: 0.9381(0.9381) Grad: nan  LR: 0.00002000  
Epoch: [1][100/248] Elapsed 0m 11s (remain 0m 17s) Loss: 0.5472(0.6456) Grad: 5.2954  LR: 0.00001977  
Epoch: [1][200/248] Elapsed 0m 23s (remain 0m 5s) Loss: 0.6459(0.6268) Grad: 3.1163  LR: 0.00001912  
Epoch: [1][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.4931(0.6237) Grad: 0.8296  LR: 0.00001867  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 12s) Loss: 0.1636(0.1636) 


Epoch 1 - avg_train_loss: 0.6237  avg_val_loss: 0.6660  time: 36s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6237  avg_val_loss: 0.6660  time: 36s
Epoch 1 - Score: 0.6942
INFO:__main__:Epoch 1 - Score: 0.6942
Epoch 1 - Save Best Score: 0.6942 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6942 Model


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 1.7054(0.6660) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
Epoch: [2][0/248] Elapsed 0m 0s (remain 1m 25s) Loss: 0.7546(0.7546) Grad: nan  LR: 0.00001866  
Epoch: [2][100/248] Elapsed 0m 12s (remain 0m 17s) Loss: 0.3957(0.5837) Grad: 8.1014  LR: 0.00001742  
Epoch: [2][200/248] Elapsed 0m 24s (remain 0m 5s) Loss: 0.5935(0.5796) Grad: 1.4834  LR: 0.00001585  
Epoch: [2][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.6189(0.5794) Grad: 1.8745  LR: 0.00001502  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 11s) Loss: 0.3348(0.3348) 


Epoch 2 - avg_train_loss: 0.5794  avg_val_loss: 0.5710  time: 36s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5794  avg_val_loss: 0.5710  time: 36s
Epoch 2 - Score: 0.7022
INFO:__main__:Epoch 2 - Score: 0.7022
Epoch 2 - Save Best Score: 0.7022 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7022 Model


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.8074(0.5710) 
f1 score : 0.34801762114537443
recall score : 0.2598684210526316
precision score : 0.5266666666666666
Epoch: [3][0/248] Elapsed 0m 0s (remain 1m 26s) Loss: 0.4624(0.4624) Grad: nan  LR: 0.00001501  
Epoch: [3][100/248] Elapsed 0m 12s (remain 0m 18s) Loss: 0.2892(0.5394) Grad: 4.6753  LR: 0.00001309  
Epoch: [3][200/248] Elapsed 0m 24s (remain 0m 5s) Loss: 0.6725(0.5295) Grad: 7.9773  LR: 0.00001103  
Epoch: [3][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.4945(0.5247) Grad: 2.3085  LR: 0.00001004  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 11s) Loss: 0.2609(0.2609) 


Epoch 3 - avg_train_loss: 0.5247  avg_val_loss: 0.5705  time: 36s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5247  avg_val_loss: 0.5705  time: 36s
Epoch 3 - Score: 0.7052
INFO:__main__:Epoch 3 - Score: 0.7052
Epoch 3 - Save Best Score: 0.7052 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7052 Model


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 1.1658(0.5705) 
f1 score : 0.16524216524216523
recall score : 0.09539473684210527
precision score : 0.6170212765957447
Epoch: [4][0/248] Elapsed 0m 0s (remain 1m 20s) Loss: 0.4744(0.4744) Grad: nan  LR: 0.00001002  
Epoch: [4][100/248] Elapsed 0m 11s (remain 0m 17s) Loss: 0.4467(0.3670) Grad: 7.2963  LR: 0.00000793  
Epoch: [4][200/248] Elapsed 0m 23s (remain 0m 5s) Loss: 0.3658(0.3464) Grad: 6.8206  LR: 0.00000593  
Epoch: [4][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.2360(0.3402) Grad: 5.9277  LR: 0.00000505  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 11s) Loss: 0.3711(0.3711) 


Epoch 4 - avg_train_loss: 0.3402  avg_val_loss: 0.6540  time: 36s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3402  avg_val_loss: 0.6540  time: 36s
Epoch 4 - Score: 0.6720
INFO:__main__:Epoch 4 - Score: 0.6720


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 1.0145(0.6540) 
f1 score : 0.4673202614379085
recall score : 0.47039473684210525
precision score : 0.4642857142857143
Epoch: [5][0/248] Elapsed 0m 0s (remain 1m 20s) Loss: 0.1916(0.1916) Grad: nan  LR: 0.00000503  
Epoch: [5][100/248] Elapsed 0m 12s (remain 0m 17s) Loss: 0.0674(0.1214) Grad: 7.0610  LR: 0.00000333  
Epoch: [5][200/248] Elapsed 0m 23s (remain 0m 5s) Loss: 0.0469(0.1082) Grad: 4.3727  LR: 0.00000192  
Epoch: [5][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.0981(0.1009) Grad: 7.6404  LR: 0.00000138  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 11s) Loss: 0.3188(0.3188) 


Epoch 5 - avg_train_loss: 0.1009  avg_val_loss: 0.8635  time: 36s
INFO:__main__:Epoch 5 - avg_train_loss: 0.1009  avg_val_loss: 0.8635  time: 36s
Epoch 5 - Score: 0.6891
INFO:__main__:Epoch 5 - Score: 0.6891


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 1.8169(0.8635) 
f1 score : 0.4452423698384201
recall score : 0.40789473684210525
precision score : 0.4901185770750988
Epoch: [6][0/248] Elapsed 0m 0s (remain 1m 19s) Loss: 0.0207(0.0207) Grad: nan  LR: 0.00000136  
Epoch: [6][100/248] Elapsed 0m 11s (remain 0m 17s) Loss: 0.0240(0.0354) Grad: 2.8209  LR: 0.00000050  
Epoch: [6][200/248] Elapsed 0m 23s (remain 0m 5s) Loss: 0.0138(0.0352) Grad: 0.4245  LR: 0.00000006  
Epoch: [6][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.0282(0.0343) Grad: 2.1296  LR: 0.00000000  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 11s) Loss: 0.3410(0.3410) 


Epoch 6 - avg_train_loss: 0.0343  avg_val_loss: 0.8976  time: 36s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0343  avg_val_loss: 0.8976  time: 36s
Epoch 6 - Score: 0.6891
INFO:__main__:Epoch 6 - Score: 0.6891


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 1.8827(0.8976) 
f1 score : 0.45502645502645506
recall score : 0.4243421052631579
precision score : 0.49049429657794674


Score: 0.7052
INFO:__main__:Score: 0.7052
ACC BEST Score: 0.7082
INFO:__main__:ACC BEST Score: 0.7082
Score: 0.7027
INFO:__main__:Score: 0.7027
ACC BEST Score: 0.7035
INFO:__main__:ACC BEST Score: 0.7035


f1 score : 0.16524216524216523
recall score : 0.09539473684210527
precision score : 0.6170212765957447
f1 score : 0.18060941828254848
recall score : 0.10702560735390676
precision score : 0.5780141843971631


In [None]:
from google.colab import runtime
runtime.unassign()