In [1]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers
!pip install datasets
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!nvidia-smi

Sun Apr  2 14:27:18 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0    47W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:

import os
import gc
import math
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import re
import html

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam, SGD, AdamW, RAdam
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,GroupKFold
from sklearn.metrics import log_loss,f1_score, recall_score, accuracy_score, precision_score

from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:

# ====================================================
# CFG
# ====================================================
class CFG:
    debug=False
    apex=True
    print_freq=100
    num_workers=4
    model="microsoft/deberta-v3-base"
    # model='microsoft/deberta-base'
    # model='roberta-base'
    # model='roberta-large'
    # model='roberta-large-mnli'
    # model='xlnet-large-cased'
    # model='albert-xxlarge-v2'
    # model="microsoft/deberta-large"
    # model="microsoft/deberta-v3-large"
    # model='microsoft/deberta-v2-xlarge'
    # model='funnel-transformer/large'
    # model='funnel-transformer/medium'
    # model='albert-base-v2'
    # model='albert-large-v2'
    # model='google/electra-large-discriminator'
    # model='google/electra-base-discriminator'
    # model="facebook/bart-large-mnli"
    # model="facebook/bart-large"
    # model="facebook/bart-base"
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=6
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=16
    fc_dropout=0.2
    target_size=1
    target_cols = 'y'
    max_len=256
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    seed=42
    n_fold=5
    trn_fold=[0, 1, 2, 3, 4]
    train=True
    nth_awp_start_epoch=1
    gradient_checkpointing = False
    freezing = False

if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0, 1]

In [6]:
DIR = "/content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測"
INPUT_DIR = os.path.join(DIR,"input")
OUTPUT_DIR = os.path.join(DIR,"output")
OUTPUT_EXP_DIR = DIR + '/output/EXP013/'
if not os.path.exists(OUTPUT_EXP_DIR):
    os.makedirs(OUTPUT_EXP_DIR)

In [7]:
def get_score(labels, outputs):
    thresh = 0.5
    y_pred = outputs
    y_true = labels
    f_score = f1_score(y_true, (y_pred>thresh).astype(int))
    r_score = recall_score(y_true, (y_pred>thresh).astype(int))
    p_score = precision_score(y_true, (y_pred>thresh).astype(int))
    print(f"f1 score : {f_score}")
    print(f"recall score : {r_score}")
    print(f"precision score : {p_score}")
    return accuracy_score(y_true, (y_pred>thresh).astype(int))

def get_acc_score(labels, outputs):
    y_pred = outputs
    y_true = labels
    best_score = 0
    best_thresh = 0.5
    for thresh in np.arange(0.1, 0.80, 0.01):
        thresh = np.round(thresh, 2)
        score = accuracy_score(y_true, (y_pred>thresh).astype(int))
        #print("Accuracy score at threshold {0} is {1}".format(thresh, score))
        if score > best_score:
          best_score = score
          best_thresh = thresh
    return accuracy_score(y_true, (y_pred>best_thresh).astype(int))


def get_logger(filename=OUTPUT_EXP_DIR+'train'):
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=CFG.seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=CFG.seed)

In [8]:
def freeze(module):
    """
    Freezes module's parameters.
    """
    
    for parameter in module.parameters():
        parameter.requires_grad = False
        
def get_freezed_parameters(module):
    """
    Returns names of freezed parameters of the given module.
    """
    
    freezed_parameters = []
    for name, parameter in module.named_parameters():
        if not parameter.requires_grad:
            freezed_parameters.append(name)
            
    return freezed_parameters

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """
    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930
    """
    
    embedding_types = ("word", "position", "token_type")
    for embedding_type in embedding_types:
        attr_name = f"{embedding_type}_embeddings"
        
        if hasattr(embeddings_path, attr_name): 
            bnb.optim.GlobalOptimManager.get_instance().register_module_override(
                getattr(embeddings_path, attr_name), 'weight', {'optim_bits': optim_bits}
            )

In [9]:
import pandas as pd
import numpy as np

train = pd.read_csv(os.path.join(INPUT_DIR,"train_data.csv"))
test = pd.read_csv(os.path.join(INPUT_DIR,"test_data.csv"))
sample_sub = pd.read_csv(os.path.join(INPUT_DIR,"submission.csv"))

print(train.shape)
display(train.head(3))

print(test.shape)
display(test.head(3))

print(sample_sub.shape)
display(sample_sub.head(3))

(4974, 6)


Unnamed: 0,id,title,year,abstract,keywords,y
0,1,Hierarchical Adversarially Learned Inference,2018,We propose a novel hierarchical generative mod...,"generative, hierarchical, unsupervised, semisu...",0
1,2,Learning to Compute Word Embeddings On the Fly,2018,Words in natural language follow a Zipfian dis...,"NLU, word embeddings, representation learning",0
2,3,Graph2Seq: Scalable Learning Dynamics for Graphs,2018,Neural networks are increasingly used as a gen...,,0


(6393, 5)


Unnamed: 0,id,title,year,abstract,keywords
0,1,StyleAlign: Analysis and Applications of Align...,2022,"In this paper, we perform an in-depth study of...","StyleGAN, transfer learning, fine tuning, mode..."
1,2,Embedding a random graph via GNN: mean-field i...,2021,We develop a theory for embedding a random gra...,"Graph neural network, graph embedding, multi-r..."
2,3,BBRefinement: an universal scheme to improve p...,2021,We present a conceptually simple yet powerful ...,"object detection, deep neural networks, refine..."


(6393, 2)


Unnamed: 0,id,y
0,1,0
1,2,0
2,3,0


In [10]:
train["keywords"] = train["keywords"].fillna("None keywords")
train["texts"] = train["keywords"] + "[SEP]" + train["title"] + "[SEP]" + train["abstract"]

In [11]:
skf = StratifiedKFold(n_splits=CFG.n_fold,shuffle=True,random_state=CFG.seed)
for fold, ( _, val_) in enumerate(skf.split(train, train.y)):
    train.loc[val_ , "kfold"] = int(fold)
    
train["kfold"] = train["kfold"].astype(int)

if CFG.debug:
    display(train.groupby('kfold').size())
    train = train.sample(n=500, random_state=0).reset_index(drop=True)
    display(train.groupby('kfold').size())

In [12]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_EXP_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
# ====================================================
# Define max_len
# ====================================================
lengths = []
tk0 = tqdm(train['texts'].fillna("").values, total=len(train))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
    lengths.append(length)
CFG.max_len = max(lengths) + 2 # cls
LOGGER.info(f"max_len: {CFG.max_len}")

100%|██████████| 4974/4974 [00:04<00:00, 1218.97it/s]
max_len: 537
INFO:__main__:max_len: 537


In [14]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer.encode_plus(
        text, 
        return_tensors=None, 
        add_special_tokens=True, 
        max_length=CFG.max_len,
        pad_to_max_length=True,
        truncation=True
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.texts = df['texts'].values
        self.labels = df[cfg.target_cols].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.texts[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label
    

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

#collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [15]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

class MaxPooling(nn.Module):
    def __init__(self):
        super(MaxPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        embeddings = last_hidden_state.clone()
        embeddings[input_mask_expanded == 0] = -1e4
        max_embeddings, _ = torch.max(embeddings, dim=1)
        return max_embeddings
    

class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # Freezing
        if cfg.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            cfg.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())

        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)
        self._init_weights(self.fc)
        self.layer_norm1 = nn.LayerNorm(self.config.hidden_size)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        feature = self.layer_norm1(feature)
        output = self.fc(feature)
        return output

In [16]:

# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
        loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))

    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
        loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    predictions = np.concatenate(predictions)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

In [17]:

# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['kfold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['kfold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds['y'].values
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)


    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size*2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_EXP_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.BCEWithLogitsLoss(reduction="mean")
    
    best_score = -1.

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        
        # scoring
        score = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')

        
        if best_score < score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_EXP_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")

    predictions = torch.load(OUTPUT_EXP_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds['pred'] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [18]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        labels = oof_df['y'].values
        preds = oof_df['pred'].values
        score = get_score(labels, preds)
        acc_score = get_acc_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}')
        LOGGER.info(f'ACC BEST Score: {acc_score:<.4f}')
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
            #break
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_EXP_DIR+'oof_df.pkl')

DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.27.4",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__:DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "at

Epoch: [1][0/248] Elapsed 0m 1s (remain 6m 52s) Loss: 1.0465(1.0465) Grad: nan  LR: 0.00002000  
Epoch: [1][100/248] Elapsed 0m 13s (remain 0m 20s) Loss: 0.6729(0.6391) Grad: 3.1944  LR: 0.00001977  
Epoch: [1][200/248] Elapsed 0m 26s (remain 0m 6s) Loss: 0.5376(0.6301) Grad: 4.7953  LR: 0.00001912  
Epoch: [1][247/248] Elapsed 0m 31s (remain 0m 0s) Loss: 0.5846(0.6229) Grad: 0.8036  LR: 0.00001867  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 13s) Loss: 0.3194(0.3194) 


Epoch 1 - avg_train_loss: 0.6229  avg_val_loss: 0.5901  time: 39s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6229  avg_val_loss: 0.5901  time: 39s
Epoch 1 - Score: 0.6945
INFO:__main__:Epoch 1 - Score: 0.6945
Epoch 1 - Save Best Score: 0.6945 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6945 Model


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.9424(0.5901) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
Epoch: [2][0/248] Elapsed 0m 0s (remain 1m 33s) Loss: 0.4734(0.4734) Grad: nan  LR: 0.00001866  
Epoch: [2][100/248] Elapsed 0m 12s (remain 0m 18s) Loss: 0.6098(0.6050) Grad: 1.8741  LR: 0.00001742  
Epoch: [2][200/248] Elapsed 0m 24s (remain 0m 5s) Loss: 0.6036(0.6053) Grad: 2.0104  LR: 0.00001585  
Epoch: [2][247/248] Elapsed 0m 30s (remain 0m 0s) Loss: 0.6867(0.6003) Grad: 5.1984  LR: 0.00001502  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 13s) Loss: 0.5474(0.5474) 


Epoch 2 - avg_train_loss: 0.6003  avg_val_loss: 0.6268  time: 38s
INFO:__main__:Epoch 2 - avg_train_loss: 0.6003  avg_val_loss: 0.6268  time: 38s
Epoch 2 - Score: 0.6362
INFO:__main__:Epoch 2 - Score: 0.6362


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.6247(0.6268) 
f1 score : 0.46607669616519176
recall score : 0.5197368421052632
precision score : 0.42245989304812837
Epoch: [3][0/248] Elapsed 0m 0s (remain 1m 28s) Loss: 0.5507(0.5507) Grad: nan  LR: 0.00001501  
Epoch: [3][100/248] Elapsed 0m 12s (remain 0m 18s) Loss: 0.5344(0.5454) Grad: 3.0671  LR: 0.00001309  
Epoch: [3][200/248] Elapsed 0m 24s (remain 0m 5s) Loss: 0.6648(0.5339) Grad: 3.1556  LR: 0.00001103  
Epoch: [3][247/248] Elapsed 0m 30s (remain 0m 0s) Loss: 0.4015(0.5318) Grad: 3.5810  LR: 0.00001004  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 13s) Loss: 0.3304(0.3304) 


Epoch 3 - avg_train_loss: 0.5318  avg_val_loss: 0.5621  time: 38s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5318  avg_val_loss: 0.5621  time: 38s
Epoch 3 - Score: 0.7226
INFO:__main__:Epoch 3 - Score: 0.7226
Epoch 3 - Save Best Score: 0.7226 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7226 Model


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.6732(0.5621) 
f1 score : 0.39999999999999997
recall score : 0.3026315789473684
precision score : 0.5897435897435898
Epoch: [4][0/248] Elapsed 0m 0s (remain 1m 29s) Loss: 0.4011(0.4011) Grad: nan  LR: 0.00001002  
Epoch: [4][100/248] Elapsed 0m 12s (remain 0m 18s) Loss: 0.1777(0.3548) Grad: 5.7742  LR: 0.00000793  
Epoch: [4][200/248] Elapsed 0m 25s (remain 0m 5s) Loss: 0.3131(0.3401) Grad: 8.5934  LR: 0.00000593  
Epoch: [4][247/248] Elapsed 0m 30s (remain 0m 0s) Loss: 0.2325(0.3424) Grad: 4.8625  LR: 0.00000505  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 13s) Loss: 0.4837(0.4837) 


Epoch 4 - avg_train_loss: 0.3424  avg_val_loss: 0.6650  time: 38s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3424  avg_val_loss: 0.6650  time: 38s
Epoch 4 - Score: 0.6643
INFO:__main__:Epoch 4 - Score: 0.6643


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.3631(0.6650) 
f1 score : 0.4577922077922078
recall score : 0.46381578947368424
precision score : 0.4519230769230769
Epoch: [5][0/248] Elapsed 0m 0s (remain 1m 27s) Loss: 0.1795(0.1795) Grad: nan  LR: 0.00000503  
Epoch: [5][100/248] Elapsed 0m 12s (remain 0m 18s) Loss: 0.2718(0.1364) Grad: 7.0401  LR: 0.00000333  
Epoch: [5][200/248] Elapsed 0m 24s (remain 0m 5s) Loss: 0.0269(0.1209) Grad: 1.6773  LR: 0.00000192  
Epoch: [5][247/248] Elapsed 0m 30s (remain 0m 0s) Loss: 0.0595(0.1149) Grad: 3.5125  LR: 0.00000138  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 13s) Loss: 0.5651(0.5651) 


Epoch 5 - avg_train_loss: 0.1149  avg_val_loss: 0.9269  time: 38s
INFO:__main__:Epoch 5 - avg_train_loss: 0.1149  avg_val_loss: 0.9269  time: 38s
Epoch 5 - Score: 0.6714
INFO:__main__:Epoch 5 - Score: 0.6714


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.6635(0.9269) 
f1 score : 0.44293015332197616
recall score : 0.4276315789473684
precision score : 0.45936395759717313
Epoch: [6][0/248] Elapsed 0m 0s (remain 1m 27s) Loss: 0.0209(0.0209) Grad: nan  LR: 0.00000136  
Epoch: [6][100/248] Elapsed 0m 12s (remain 0m 18s) Loss: 0.0152(0.0522) Grad: 0.6770  LR: 0.00000050  
Epoch: [6][200/248] Elapsed 0m 24s (remain 0m 5s) Loss: 0.2492(0.0556) Grad: 13.4256  LR: 0.00000006  
Epoch: [6][247/248] Elapsed 0m 30s (remain 0m 0s) Loss: 0.0305(0.0520) Grad: 1.4128  LR: 0.00000000  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 13s) Loss: 0.6151(0.6151) 


Epoch 6 - avg_train_loss: 0.0520  avg_val_loss: 0.9837  time: 38s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0520  avg_val_loss: 0.9837  time: 38s
Epoch 6 - Score: 0.6704
INFO:__main__:Epoch 6 - Score: 0.6704


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.9384(0.9837) 
f1 score : 0.4421768707482993
recall score : 0.4276315789473684
precision score : 0.45774647887323944


Score: 0.7226
INFO:__main__:Score: 0.7226
ACC BEST Score: 0.7246
INFO:__main__:ACC BEST Score: 0.7246
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.27.4",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main

f1 score : 0.39999999999999997
recall score : 0.3026315789473684
precision score : 0.5897435897435898


Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.weight', 'mask_predictions.classifier.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/248] Elapsed 0m 0s (remain 1m 39s) Loss: 0.6366(0.6366) Grad: nan  LR: 0.00002000  
Epoch: [1][100/248] Elapsed 0m 12s (remain 0m 18s) Loss: 0.6851(0.6240) Grad: 2.3603  LR: 0.00001977  
Epoch: [1][200/248] Elapsed 0m 25s (remain 0m 5s) Loss: 0.4300(0.6132) Grad: 3.3721  LR: 0.00001912  
Epoch: [1][247/248] Elapsed 0m 30s (remain 0m 0s) Loss: 0.6015(0.6140) Grad: 4.7235  LR: 0.00001867  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 13s) Loss: 0.4909(0.4909) 


Epoch 1 - avg_train_loss: 0.6140  avg_val_loss: 0.6140  time: 38s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6140  avg_val_loss: 0.6140  time: 38s
Epoch 1 - Score: 0.6945
INFO:__main__:Epoch 1 - Score: 0.6945
Epoch 1 - Save Best Score: 0.6945 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6945 Model


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.8212(0.6140) 
f1 score : 0.012987012987012988
recall score : 0.006557377049180328
precision score : 0.6666666666666666
Epoch: [2][0/248] Elapsed 0m 0s (remain 1m 38s) Loss: 0.4924(0.4924) Grad: nan  LR: 0.00001866  
Epoch: [2][100/248] Elapsed 0m 12s (remain 0m 18s) Loss: 0.5412(0.5642) Grad: 2.5065  LR: 0.00001742  
Epoch: [2][200/248] Elapsed 0m 25s (remain 0m 5s) Loss: 0.6396(0.5410) Grad: 9.0034  LR: 0.00001585  
Epoch: [2][247/248] Elapsed 0m 30s (remain 0m 0s) Loss: 0.6230(0.5451) Grad: 5.7002  LR: 0.00001502  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 13s) Loss: 0.5207(0.5207) 


Epoch 2 - avg_train_loss: 0.5451  avg_val_loss: 0.6009  time: 38s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5451  avg_val_loss: 0.6009  time: 38s
Epoch 2 - Score: 0.6744
INFO:__main__:Epoch 2 - Score: 0.6744


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.6926(0.6009) 
f1 score : 0.5149700598802395
recall score : 0.5639344262295082
precision score : 0.4738292011019284
Epoch: [3][0/248] Elapsed 0m 0s (remain 1m 28s) Loss: 0.4520(0.4520) Grad: nan  LR: 0.00001501  
Epoch: [3][100/248] Elapsed 0m 12s (remain 0m 18s) Loss: 0.1816(0.3255) Grad: 7.0599  LR: 0.00001309  
Epoch: [3][200/248] Elapsed 0m 24s (remain 0m 5s) Loss: 0.3753(0.2938) Grad: 12.4569  LR: 0.00001103  
Epoch: [3][247/248] Elapsed 0m 30s (remain 0m 0s) Loss: 0.3473(0.2899) Grad: 11.8736  LR: 0.00001004  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 13s) Loss: 0.4038(0.4038) 


Epoch 3 - avg_train_loss: 0.2899  avg_val_loss: 0.6774  time: 38s
INFO:__main__:Epoch 3 - avg_train_loss: 0.2899  avg_val_loss: 0.6774  time: 38s
Epoch 3 - Score: 0.6794
INFO:__main__:Epoch 3 - Score: 0.6794


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 1.0927(0.6774) 
f1 score : 0.4674457429048414
recall score : 0.45901639344262296
precision score : 0.47619047619047616
Epoch: [4][0/248] Elapsed 0m 0s (remain 1m 32s) Loss: 0.0630(0.0630) Grad: nan  LR: 0.00001002  
Epoch: [4][100/248] Elapsed 0m 12s (remain 0m 18s) Loss: 0.0203(0.0506) Grad: 1.0323  LR: 0.00000793  
Epoch: [4][200/248] Elapsed 0m 24s (remain 0m 5s) Loss: 0.1240(0.0425) Grad: 10.7522  LR: 0.00000593  
Epoch: [4][247/248] Elapsed 0m 30s (remain 0m 0s) Loss: 0.0259(0.0386) Grad: 3.3318  LR: 0.00000505  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 15s) Loss: 0.3606(0.3606) 


Epoch 4 - avg_train_loss: 0.0386  avg_val_loss: 0.9868  time: 37s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0386  avg_val_loss: 0.9868  time: 37s
Epoch 4 - Score: 0.6774
INFO:__main__:Epoch 4 - Score: 0.6774


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 2.6162(0.9868) 
f1 score : 0.3977485928705441
recall score : 0.3475409836065574
precision score : 0.4649122807017544
Epoch: [5][0/248] Elapsed 0m 0s (remain 1m 30s) Loss: 0.0087(0.0087) Grad: nan  LR: 0.00000503  
Epoch: [5][100/248] Elapsed 0m 12s (remain 0m 18s) Loss: 0.0048(0.0075) Grad: 0.1649  LR: 0.00000333  
Epoch: [5][200/248] Elapsed 0m 24s (remain 0m 5s) Loss: 0.0049(0.0065) Grad: 0.1605  LR: 0.00000192  
Epoch: [5][247/248] Elapsed 0m 30s (remain 0m 0s) Loss: 0.0053(0.0062) Grad: 0.2456  LR: 0.00000138  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 14s) Loss: 0.3004(0.3004) 


Epoch 5 - avg_train_loss: 0.0062  avg_val_loss: 1.1072  time: 38s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0062  avg_val_loss: 1.1072  time: 38s
Epoch 5 - Score: 0.6874
INFO:__main__:Epoch 5 - Score: 0.6874


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 3.4420(1.1072) 
f1 score : 0.3841584158415842
recall score : 0.3180327868852459
precision score : 0.485
Epoch: [6][0/248] Elapsed 0m 0s (remain 1m 33s) Loss: 0.0049(0.0049) Grad: nan  LR: 0.00000136  
Epoch: [6][100/248] Elapsed 0m 12s (remain 0m 18s) Loss: 0.0049(0.0046) Grad: 0.1706  LR: 0.00000050  
Epoch: [6][200/248] Elapsed 0m 25s (remain 0m 5s) Loss: 0.0034(0.0045) Grad: 0.0997  LR: 0.00000006  
Epoch: [6][247/248] Elapsed 0m 30s (remain 0m 0s) Loss: 0.0053(0.0045) Grad: 0.2200  LR: 0.00000000  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 13s) Loss: 0.3040(0.3040) 


Epoch 6 - avg_train_loss: 0.0045  avg_val_loss: 1.1193  time: 38s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0045  avg_val_loss: 1.1193  time: 38s
Epoch 6 - Score: 0.6874
INFO:__main__:Epoch 6 - Score: 0.6874


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 3.4952(1.1193) 
f1 score : 0.3865877712031558
recall score : 0.32131147540983607
precision score : 0.48514851485148514


Score: 0.6945
INFO:__main__:Score: 0.6945
ACC BEST Score: 0.7005
INFO:__main__:ACC BEST Score: 0.7005


f1 score : 0.012987012987012988
recall score : 0.006557377049180328
precision score : 0.6666666666666666


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.27.4",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__:DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "at

Epoch: [1][0/248] Elapsed 0m 0s (remain 1m 40s) Loss: 0.5555(0.5555) Grad: nan  LR: 0.00002000  
Epoch: [1][100/248] Elapsed 0m 12s (remain 0m 18s) Loss: 0.5254(0.6192) Grad: 3.7398  LR: 0.00001977  
Epoch: [1][200/248] Elapsed 0m 24s (remain 0m 5s) Loss: 0.6609(0.6101) Grad: 4.1707  LR: 0.00001912  
Epoch: [1][247/248] Elapsed 0m 30s (remain 0m 0s) Loss: 0.5818(0.6126) Grad: 0.9757  LR: 0.00001867  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 14s) Loss: 0.3809(0.3809) 


Epoch 1 - avg_train_loss: 0.6126  avg_val_loss: 0.5946  time: 38s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6126  avg_val_loss: 0.5946  time: 38s
Epoch 1 - Score: 0.6935
INFO:__main__:Epoch 1 - Score: 0.6935
Epoch 1 - Save Best Score: 0.6935 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6935 Model


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.9896(0.5946) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
Epoch: [2][0/248] Elapsed 0m 0s (remain 1m 34s) Loss: 0.5187(0.5187) Grad: nan  LR: 0.00001866  
Epoch: [2][100/248] Elapsed 0m 12s (remain 0m 18s) Loss: 0.7199(0.5226) Grad: 10.4643  LR: 0.00001742  
Epoch: [2][200/248] Elapsed 0m 25s (remain 0m 5s) Loss: 0.6606(0.5320) Grad: 4.1814  LR: 0.00001585  
Epoch: [2][247/248] Elapsed 0m 30s (remain 0m 0s) Loss: 0.5152(0.5309) Grad: 3.5718  LR: 0.00001502  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 14s) Loss: 0.3758(0.3758) 


Epoch 2 - avg_train_loss: 0.5309  avg_val_loss: 0.5802  time: 38s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5309  avg_val_loss: 0.5802  time: 38s
Epoch 2 - Score: 0.6995
INFO:__main__:Epoch 2 - Score: 0.6995
Epoch 2 - Save Best Score: 0.6995 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.6995 Model


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.8531(0.5802) 
f1 score : 0.26894865525672373
recall score : 0.18032786885245902
precision score : 0.5288461538461539
Epoch: [3][0/248] Elapsed 0m 0s (remain 1m 42s) Loss: 0.3595(0.3595) Grad: nan  LR: 0.00001501  
Epoch: [3][100/248] Elapsed 0m 12s (remain 0m 18s) Loss: 0.2356(0.3209) Grad: 9.5319  LR: 0.00001309  
Epoch: [3][200/248] Elapsed 0m 25s (remain 0m 5s) Loss: 0.3205(0.3016) Grad: 8.2650  LR: 0.00001103  
Epoch: [3][247/248] Elapsed 0m 31s (remain 0m 0s) Loss: 0.1405(0.2858) Grad: 10.9324  LR: 0.00001004  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 14s) Loss: 0.3283(0.3283) 


Epoch 3 - avg_train_loss: 0.2858  avg_val_loss: 0.8396  time: 38s
INFO:__main__:Epoch 3 - avg_train_loss: 0.2858  avg_val_loss: 0.8396  time: 38s
Epoch 3 - Score: 0.6824
INFO:__main__:Epoch 3 - Score: 0.6824


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 1.5370(0.8396) 
f1 score : 0.3333333333333333
recall score : 0.25901639344262295
precision score : 0.46745562130177515
Epoch: [4][0/248] Elapsed 0m 0s (remain 1m 32s) Loss: 0.2616(0.2616) Grad: nan  LR: 0.00001002  
Epoch: [4][100/248] Elapsed 0m 12s (remain 0m 18s) Loss: 0.0213(0.0576) Grad: 2.1286  LR: 0.00000793  
Epoch: [4][200/248] Elapsed 0m 24s (remain 0m 5s) Loss: 0.0870(0.0498) Grad: 9.5485  LR: 0.00000593  
Epoch: [4][247/248] Elapsed 0m 30s (remain 0m 0s) Loss: 0.0117(0.0450) Grad: 1.1766  LR: 0.00000505  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 14s) Loss: 0.4633(0.4633) 


Epoch 4 - avg_train_loss: 0.0450  avg_val_loss: 1.1422  time: 38s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0450  avg_val_loss: 1.1422  time: 38s
Epoch 4 - Score: 0.6834
INFO:__main__:Epoch 4 - Score: 0.6834


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 1.9937(1.1422) 
f1 score : 0.33121019108280253
recall score : 0.25573770491803277
precision score : 0.46987951807228917
Epoch: [5][0/248] Elapsed 0m 0s (remain 1m 28s) Loss: 0.0042(0.0042) Grad: nan  LR: 0.00000503  
Epoch: [5][100/248] Elapsed 0m 12s (remain 0m 18s) Loss: 0.0084(0.0093) Grad: 0.8251  LR: 0.00000333  
Epoch: [5][200/248] Elapsed 0m 24s (remain 0m 5s) Loss: 0.0046(0.0073) Grad: 0.1931  LR: 0.00000192  
Epoch: [5][247/248] Elapsed 0m 30s (remain 0m 0s) Loss: 0.0047(0.0085) Grad: 0.1566  LR: 0.00000138  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 14s) Loss: 0.7107(0.7107) 


Epoch 5 - avg_train_loss: 0.0085  avg_val_loss: 1.1993  time: 38s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0085  avg_val_loss: 1.1993  time: 38s
Epoch 5 - Score: 0.6734
INFO:__main__:Epoch 5 - Score: 0.6734


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 1.6256(1.1993) 
f1 score : 0.3902439024390244
recall score : 0.34098360655737703
precision score : 0.45614035087719296
Epoch: [6][0/248] Elapsed 0m 0s (remain 1m 28s) Loss: 0.0045(0.0045) Grad: nan  LR: 0.00000136  
Epoch: [6][100/248] Elapsed 0m 12s (remain 0m 18s) Loss: 0.0031(0.0063) Grad: 0.1307  LR: 0.00000050  
Epoch: [6][200/248] Elapsed 0m 24s (remain 0m 5s) Loss: 0.0043(0.0069) Grad: 0.2493  LR: 0.00000006  
Epoch: [6][247/248] Elapsed 0m 30s (remain 0m 0s) Loss: 0.0032(0.0064) Grad: 0.1164  LR: 0.00000000  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 14s) Loss: 0.7112(0.7112) 


Epoch 6 - avg_train_loss: 0.0064  avg_val_loss: 1.2066  time: 38s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0064  avg_val_loss: 1.2066  time: 38s
Epoch 6 - Score: 0.6724
INFO:__main__:Epoch 6 - Score: 0.6724


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 1.6283(1.2066) 
f1 score : 0.3895131086142322
recall score : 0.34098360655737703
precision score : 0.45414847161572053


Score: 0.6995
INFO:__main__:Score: 0.6995
ACC BEST Score: 0.7085
INFO:__main__:ACC BEST Score: 0.7085
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.27.4",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main

f1 score : 0.26894865525672373
recall score : 0.18032786885245902
precision score : 0.5288461538461539


Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.weight', 'mask_predictions.classifier.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/248] Elapsed 0m 0s (remain 1m 39s) Loss: 0.7851(0.7851) Grad: nan  LR: 0.00002000  
Epoch: [1][100/248] Elapsed 0m 12s (remain 0m 18s) Loss: 0.8146(0.6365) Grad: 6.7473  LR: 0.00001977  
Epoch: [1][200/248] Elapsed 0m 24s (remain 0m 5s) Loss: 0.6201(0.6196) Grad: 0.9979  LR: 0.00001912  
Epoch: [1][247/248] Elapsed 0m 30s (remain 0m 0s) Loss: 0.4235(0.6143) Grad: 5.0565  LR: 0.00001867  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 14s) Loss: 0.3505(0.3505) 


Epoch 1 - avg_train_loss: 0.6143  avg_val_loss: 0.5931  time: 38s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6143  avg_val_loss: 0.5931  time: 38s
Epoch 1 - Score: 0.6935
INFO:__main__:Epoch 1 - Score: 0.6935
Epoch 1 - Save Best Score: 0.6935 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6935 Model


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 1.0601(0.5931) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
Epoch: [2][0/248] Elapsed 0m 0s (remain 1m 43s) Loss: 0.5021(0.5021) Grad: nan  LR: 0.00001866  
Epoch: [2][100/248] Elapsed 0m 12s (remain 0m 18s) Loss: 0.5856(0.5415) Grad: 6.8744  LR: 0.00001742  
Epoch: [2][200/248] Elapsed 0m 24s (remain 0m 5s) Loss: 0.4488(0.5398) Grad: 1.7402  LR: 0.00001585  
Epoch: [2][247/248] Elapsed 0m 30s (remain 0m 0s) Loss: 0.5070(0.5372) Grad: 2.7021  LR: 0.00001502  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 14s) Loss: 0.3216(0.3216) 


Epoch 2 - avg_train_loss: 0.5372  avg_val_loss: 0.5802  time: 38s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5372  avg_val_loss: 0.5802  time: 38s
Epoch 2 - Score: 0.7005
INFO:__main__:Epoch 2 - Score: 0.7005
Epoch 2 - Save Best Score: 0.7005 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7005 Model


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.9563(0.5802) 
f1 score : 0.3257918552036199
recall score : 0.2360655737704918
precision score : 0.5255474452554745
Epoch: [3][0/248] Elapsed 0m 0s (remain 1m 32s) Loss: 0.4880(0.4880) Grad: nan  LR: 0.00001501  
Epoch: [3][100/248] Elapsed 0m 12s (remain 0m 18s) Loss: 0.2040(0.2872) Grad: 10.6926  LR: 0.00001309  
Epoch: [3][200/248] Elapsed 0m 25s (remain 0m 5s) Loss: 0.2419(0.2942) Grad: 5.1125  LR: 0.00001103  
Epoch: [3][247/248] Elapsed 0m 30s (remain 0m 0s) Loss: 0.2281(0.2827) Grad: 7.6388  LR: 0.00001004  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 14s) Loss: 0.3015(0.3015) 


Epoch 3 - avg_train_loss: 0.2827  avg_val_loss: 0.7916  time: 38s
INFO:__main__:Epoch 3 - avg_train_loss: 0.2827  avg_val_loss: 0.7916  time: 38s
Epoch 3 - Score: 0.6804
INFO:__main__:Epoch 3 - Score: 0.6804


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 1.3904(0.7916) 
f1 score : 0.37647058823529417
recall score : 0.31475409836065577
precision score : 0.4682926829268293
Epoch: [4][0/248] Elapsed 0m 0s (remain 1m 35s) Loss: 0.0354(0.0354) Grad: nan  LR: 0.00001002  
Epoch: [4][100/248] Elapsed 0m 12s (remain 0m 18s) Loss: 0.0156(0.0468) Grad: 1.2528  LR: 0.00000793  
Epoch: [4][200/248] Elapsed 0m 24s (remain 0m 5s) Loss: 0.0050(0.0381) Grad: 0.1614  LR: 0.00000593  
Epoch: [4][247/248] Elapsed 0m 30s (remain 0m 0s) Loss: 0.0126(0.0350) Grad: 1.5365  LR: 0.00000505  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 14s) Loss: 0.6444(0.6444) 


Epoch 4 - avg_train_loss: 0.0350  avg_val_loss: 1.0857  time: 37s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0350  avg_val_loss: 1.0857  time: 37s
Epoch 4 - Score: 0.6563
INFO:__main__:Epoch 4 - Score: 0.6563


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 1.4214(1.0857) 
f1 score : 0.4519230769230769
recall score : 0.46229508196721314
precision score : 0.44200626959247646
Epoch: [5][0/248] Elapsed 0m 0s (remain 1m 33s) Loss: 0.0049(0.0049) Grad: nan  LR: 0.00000503  
Epoch: [5][100/248] Elapsed 0m 12s (remain 0m 18s) Loss: 0.0066(0.0069) Grad: 0.3304  LR: 0.00000333  
Epoch: [5][200/248] Elapsed 0m 24s (remain 0m 5s) Loss: 0.0081(0.0058) Grad: 0.5772  LR: 0.00000192  
Epoch: [5][247/248] Elapsed 0m 30s (remain 0m 0s) Loss: 0.0064(0.0055) Grad: 0.6329  LR: 0.00000138  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 15s) Loss: 0.4319(0.4319) 


Epoch 5 - avg_train_loss: 0.0055  avg_val_loss: 1.1700  time: 38s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0055  avg_val_loss: 1.1700  time: 38s
Epoch 5 - Score: 0.6794
INFO:__main__:Epoch 5 - Score: 0.6794


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 2.0471(1.1700) 
f1 score : 0.39924670433145015
recall score : 0.3475409836065574
precision score : 0.4690265486725664
Epoch: [6][0/248] Elapsed 0m 0s (remain 1m 27s) Loss: 0.0043(0.0043) Grad: nan  LR: 0.00000136  
Epoch: [6][100/248] Elapsed 0m 12s (remain 0m 18s) Loss: 0.0038(0.0039) Grad: 0.1865  LR: 0.00000050  
Epoch: [6][200/248] Elapsed 0m 25s (remain 0m 5s) Loss: 0.0052(0.0038) Grad: 0.2833  LR: 0.00000006  
Epoch: [6][247/248] Elapsed 0m 30s (remain 0m 0s) Loss: 0.0042(0.0038) Grad: 0.2681  LR: 0.00000000  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 15s) Loss: 0.4209(0.4209) 


Epoch 6 - avg_train_loss: 0.0038  avg_val_loss: 1.1850  time: 38s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0038  avg_val_loss: 1.1850  time: 38s
Epoch 6 - Score: 0.6764
INFO:__main__:Epoch 6 - Score: 0.6764


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 2.1130(1.1850) 
f1 score : 0.38549618320610685
recall score : 0.33114754098360655
precision score : 0.4611872146118721


Score: 0.7005
INFO:__main__:Score: 0.7005
ACC BEST Score: 0.7085
INFO:__main__:ACC BEST Score: 0.7085
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.27.4",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main

f1 score : 0.3257918552036199
recall score : 0.2360655737704918
precision score : 0.5255474452554745


Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.weight', 'mask_predictions.classifier.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/248] Elapsed 0m 0s (remain 1m 41s) Loss: 0.9303(0.9303) Grad: nan  LR: 0.00002000  
Epoch: [1][100/248] Elapsed 0m 12s (remain 0m 18s) Loss: 0.5421(0.6471) Grad: 5.0783  LR: 0.00001977  
Epoch: [1][200/248] Elapsed 0m 24s (remain 0m 5s) Loss: 0.6348(0.6266) Grad: 2.0988  LR: 0.00001912  
Epoch: [1][247/248] Elapsed 0m 30s (remain 0m 0s) Loss: 0.4707(0.6244) Grad: 0.4110  LR: 0.00001867  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 13s) Loss: 0.1842(0.1842) 


Epoch 1 - avg_train_loss: 0.6244  avg_val_loss: 0.6513  time: 38s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6244  avg_val_loss: 0.6513  time: 38s
Epoch 1 - Score: 0.6942
INFO:__main__:Epoch 1 - Score: 0.6942
Epoch 1 - Save Best Score: 0.6942 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6942 Model


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 1.6780(0.6513) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
Epoch: [2][0/248] Elapsed 0m 0s (remain 1m 41s) Loss: 0.7293(0.7293) Grad: nan  LR: 0.00001866  
Epoch: [2][100/248] Elapsed 0m 12s (remain 0m 18s) Loss: 0.3467(0.6008) Grad: 6.5414  LR: 0.00001742  
Epoch: [2][200/248] Elapsed 0m 25s (remain 0m 5s) Loss: 0.5945(0.5871) Grad: 1.2251  LR: 0.00001585  
Epoch: [2][247/248] Elapsed 0m 31s (remain 0m 0s) Loss: 0.6129(0.5863) Grad: 2.0642  LR: 0.00001502  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 13s) Loss: 0.3393(0.3393) 


Epoch 2 - avg_train_loss: 0.5863  avg_val_loss: 0.5739  time: 38s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5863  avg_val_loss: 0.5739  time: 38s
Epoch 2 - Score: 0.7022
INFO:__main__:Epoch 2 - Score: 0.7022
Epoch 2 - Save Best Score: 0.7022 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7022 Model


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.9982(0.5739) 
f1 score : 0.4032258064516129
recall score : 0.32894736842105265
precision score : 0.5208333333333334
Epoch: [3][0/248] Elapsed 0m 0s (remain 1m 35s) Loss: 0.5598(0.5598) Grad: nan  LR: 0.00001501  
Epoch: [3][100/248] Elapsed 0m 13s (remain 0m 19s) Loss: 0.3573(0.5462) Grad: 5.8236  LR: 0.00001309  
Epoch: [3][200/248] Elapsed 0m 25s (remain 0m 5s) Loss: 0.6011(0.5378) Grad: 7.5054  LR: 0.00001103  
Epoch: [3][247/248] Elapsed 0m 31s (remain 0m 0s) Loss: 0.4675(0.5329) Grad: 1.9428  LR: 0.00001004  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 13s) Loss: 0.2804(0.2804) 


Epoch 3 - avg_train_loss: 0.5329  avg_val_loss: 0.5687  time: 38s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5329  avg_val_loss: 0.5687  time: 38s
Epoch 3 - Score: 0.7032
INFO:__main__:Epoch 3 - Score: 0.7032
Epoch 3 - Save Best Score: 0.7032 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7032 Model


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 1.0625(0.5687) 
f1 score : 0.2891566265060241
recall score : 0.19736842105263158
precision score : 0.5405405405405406
Epoch: [4][0/248] Elapsed 0m 0s (remain 1m 37s) Loss: 0.4771(0.4771) Grad: nan  LR: 0.00001002  
Epoch: [4][100/248] Elapsed 0m 12s (remain 0m 18s) Loss: 0.3933(0.4050) Grad: 6.0314  LR: 0.00000793  
Epoch: [4][200/248] Elapsed 0m 25s (remain 0m 5s) Loss: 0.3868(0.3993) Grad: 4.7751  LR: 0.00000593  
Epoch: [4][247/248] Elapsed 0m 31s (remain 0m 0s) Loss: 0.3406(0.3918) Grad: 10.5578  LR: 0.00000505  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 13s) Loss: 0.4318(0.4318) 


Epoch 4 - avg_train_loss: 0.3918  avg_val_loss: 0.6592  time: 38s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3918  avg_val_loss: 0.6592  time: 38s
Epoch 4 - Score: 0.6519
INFO:__main__:Epoch 4 - Score: 0.6519


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 1.0146(0.6592) 
f1 score : 0.4911764705882353
recall score : 0.5493421052631579
precision score : 0.4441489361702128
Epoch: [5][0/248] Elapsed 0m 0s (remain 1m 35s) Loss: 0.2746(0.2746) Grad: nan  LR: 0.00000503  
Epoch: [5][100/248] Elapsed 0m 12s (remain 0m 18s) Loss: 0.0816(0.1788) Grad: 5.2463  LR: 0.00000333  
Epoch: [5][200/248] Elapsed 0m 25s (remain 0m 5s) Loss: 0.0411(0.1673) Grad: 3.3875  LR: 0.00000192  
Epoch: [5][247/248] Elapsed 0m 30s (remain 0m 0s) Loss: 0.0901(0.1599) Grad: 7.1669  LR: 0.00000138  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 13s) Loss: 0.3747(0.3747) 


Epoch 5 - avg_train_loss: 0.1599  avg_val_loss: 0.7926  time: 38s
INFO:__main__:Epoch 5 - avg_train_loss: 0.1599  avg_val_loss: 0.7926  time: 38s
Epoch 5 - Score: 0.6761
INFO:__main__:Epoch 5 - Score: 0.6761


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 1.6066(0.7926) 
f1 score : 0.46688741721854304
recall score : 0.46381578947368424
precision score : 0.47
Epoch: [6][0/248] Elapsed 0m 0s (remain 1m 35s) Loss: 0.0477(0.0477) Grad: nan  LR: 0.00000136  
Epoch: [6][100/248] Elapsed 0m 12s (remain 0m 18s) Loss: 0.0313(0.0602) Grad: 1.6137  LR: 0.00000050  
Epoch: [6][200/248] Elapsed 0m 25s (remain 0m 5s) Loss: 0.0270(0.0582) Grad: 0.9071  LR: 0.00000006  
Epoch: [6][247/248] Elapsed 0m 30s (remain 0m 0s) Loss: 0.0669(0.0570) Grad: 5.5083  LR: 0.00000000  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 13s) Loss: 0.3055(0.3055) 


Epoch 6 - avg_train_loss: 0.0570  avg_val_loss: 0.8311  time: 38s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0570  avg_val_loss: 0.8311  time: 38s
Epoch 6 - Score: 0.6901
INFO:__main__:Epoch 6 - Score: 0.6901


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 2.0712(0.8311) 
f1 score : 0.4420289855072464
recall score : 0.40131578947368424
precision score : 0.49193548387096775


Score: 0.7032
INFO:__main__:Score: 0.7032
ACC BEST Score: 0.7062
INFO:__main__:ACC BEST Score: 0.7062
Score: 0.7041
INFO:__main__:Score: 0.7041
ACC BEST Score: 0.7057
INFO:__main__:ACC BEST Score: 0.7057


f1 score : 0.2891566265060241
recall score : 0.19736842105263158
precision score : 0.5405405405405406
f1 score : 0.2763028515240905
recall score : 0.1845042678923178
precision score : 0.5499021526418787


In [None]:
from google.colab import runtime
runtime.unassign()