In [None]:
# Mount Google Drive (Colab only) so the project directory under /content/drive is reachable.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Runtime dependencies. Versions are not pinned, so a fresh session may resolve newer
# releases than the logged run (which installed transformers 4.27.4).
!pip install transformers
!pip install datasets
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m55.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m105.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.4
Looking in indexes: https://pypi.org/simple, https://u

In [None]:
# Show which GPU (if any) is attached to this runtime.
!nvidia-smi

Mon Apr  3 12:20:28 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   31C    P0    44W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:

import os
import gc
import math
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import re
import html

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam, SGD, AdamW, RAdam
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,GroupKFold
from sklearn.metrics import log_loss,f1_score, recall_score, accuracy_score, precision_score

from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:

# ====================================================
# CFG
# ====================================================
class CFG:
    """Experiment configuration, read as class attributes throughout the notebook."""

    # --- run control -------------------------------------------------------
    debug = False                     # True shrinks the run (see the override below)
    train = True                      # gate for the cross-validation training cell
    seed = 42
    print_freq = 100                  # progress line every N batches in train_fn / valid_fn
    num_workers = 4                   # DataLoader worker processes

    # --- backbone ----------------------------------------------------------
    model = "microsoft/deberta-v3-base"
    # Alternatives kept from earlier experiments:
    #   'microsoft/deberta-base', 'roberta-base', 'roberta-large', 'roberta-large-mnli',
    #   'xlnet-large-cased', 'albert-xxlarge-v2', "microsoft/deberta-large",
    #   "microsoft/deberta-v3-large", 'microsoft/deberta-v2-xlarge',
    #   'funnel-transformer/large', 'funnel-transformer/medium', 'albert-base-v2',
    #   'albert-large-v2', 'google/electra-large-discriminator',
    #   'google/electra-base-discriminator', "facebook/bart-large-mnli",
    #   "facebook/bart-large", "facebook/bart-base"
    gradient_checkpointing = False
    freezing = False                  # freeze embeddings + first two encoder layers in CustomModel
    fc_dropout = 0.2                  # not read by the code visible in this notebook
    target_size = 1                   # output units of the classification head
    max_len = 256                     # placeholder; recomputed from the training texts later

    # --- optimisation ------------------------------------------------------
    apex = True                       # enables torch.cuda.amp autocast / GradScaler in train_fn
    epochs = 6
    batch_size = 16
    gradient_accumulation_steps = 1
    max_grad_norm = 1000
    encoder_lr = 2e-5
    decoder_lr = 2e-5
    min_lr = 1e-6                     # not read by the code visible in this notebook
    eps = 1e-6
    betas = (0.9, 0.999)
    weight_decay = 0.01
    nth_awp_start_epoch = 1           # not read by the code visible in this notebook

    # --- learning-rate schedule --------------------------------------------
    scheduler = 'cosine'              # ['linear', 'cosine']
    batch_scheduler = True            # step the scheduler after every optimizer update
    num_cycles = 0.5
    num_warmup_steps = 0

    # --- cross-validation --------------------------------------------------
    n_fold = 5
    trn_fold = [0, 1, 2, 3, 4]        # folds that are actually trained


# Debug mode: fewer epochs and only the first two folds.
if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0, 1]

In [None]:
# Project root on the mounted Drive, plus the input / output locations derived from it.
DIR = "/content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測"
INPUT_DIR = os.path.join(DIR,"input")
OUTPUT_DIR = os.path.join(DIR,"output")
# Trailing slash is required: later cells build file paths by plain string concatenation.
OUTPUT_EXP_DIR = DIR + '/output/EXP015/'
# exist_ok avoids the check-then-create race and makes re-running the cell a no-op.
os.makedirs(OUTPUT_EXP_DIR, exist_ok=True)

In [None]:
def get_score(labels, outputs):
    """Print F1, recall and precision at a fixed 0.5 cut-off and return the accuracy.

    Args:
        labels: ground-truth binary labels.
        outputs: predicted scores; must support ``>`` and ``.astype`` (e.g. a NumPy array).

    Returns:
        Accuracy of ``outputs > 0.5`` against ``labels``.
    """
    thresh = 0.5
    # Binarise once and reuse the result for every metric.
    hard_preds = (outputs > thresh).astype(int)

    f_score = f1_score(labels, hard_preds)
    r_score = recall_score(labels, hard_preds)
    p_score = precision_score(labels, hard_preds)

    print(f"f1 score : {f_score}")
    print(f"recall score : {r_score}")
    print(f"precision score : {p_score}")
    return accuracy_score(labels, hard_preds)

def get_acc_score(labels, outputs):
    """Return the best accuracy reachable by sweeping the decision threshold.

    Thresholds 0.10, 0.11, ..., 0.79 are tried and the first one with the highest
    accuracy is kept (0.5 if no threshold scores above zero). The threshold is tuned on
    the same data it is scored on, so the result is an optimistic estimate.

    Args:
        labels: ground-truth binary labels.
        outputs: predicted scores; must support ``>`` and ``.astype``.
    """
    best_score, best_thresh = 0, 0.5
    candidates = np.round(np.arange(0.1, 0.80, 0.01), 2)
    for thresh in candidates:
        score = accuracy_score(labels, (outputs > thresh).astype(int))
        if score > best_score:
            best_score, best_thresh = score, thresh
    return accuracy_score(labels, (outputs > best_thresh).astype(int))


def get_logger(filename=OUTPUT_EXP_DIR+'train'):
    """Return the notebook logger, writing plain messages to the console and a file.

    Args:
        filename: log file path without extension; ``.log`` is appended.

    Returns:
        The ``logging.Logger`` registered under ``__name__`` with exactly one stream
        handler and one file handler attached by this function.
    """
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)

    # getLogger returns the same object on every call, so re-running this cell used to
    # stack handlers (each message emitted several times) and leak open log files.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=CFG.seed):
    """Seed Python, NumPy and PyTorch (CPU and current CUDA device) with ``seed``.

    Also exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic algorithms.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(seed=CFG.seed)

In [None]:
def freeze(module):
    """Disable gradient tracking for every parameter of ``module``."""
    for weight in module.parameters():
        weight.requires_grad_(False)
        
def get_freezed_parameters(module):
    """Return the names of ``module``'s parameters that have ``requires_grad`` False."""
    return [
        param_name
        for param_name, weight in module.named_parameters()
        if not weight.requires_grad
    ]

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """Register an optimizer-precision override for the embedding weights.

    For each of ``word_embeddings``, ``position_embeddings`` and
    ``token_type_embeddings`` present on ``embeddings_path``, registers
    ``{'optim_bits': optim_bits}`` for its ``weight`` with ``bnb``'s global optimizer
    manager. See
    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930

    NOTE(review): ``bnb`` is not imported anywhere in the visible part of this notebook
    (presumably ``bitsandbytes``); calling this on a module that has any of these
    attributes raises NameError until that import is added.
    """
    not_present = object()
    for prefix in ("word", "position", "token_type"):
        embedding = getattr(embeddings_path, f"{prefix}_embeddings", not_present)
        if embedding is not_present:
            continue
        manager = bnb.optim.GlobalOptimManager.get_instance()
        manager.register_module_override(embedding, 'weight', {'optim_bits': optim_bits})

In [None]:
import pandas as pd
import numpy as np

# Read the competition tables from INPUT_DIR.
train = pd.read_csv(os.path.join(INPUT_DIR, "train_data.csv"))
test = pd.read_csv(os.path.join(INPUT_DIR, "test_data.csv"))
sample_sub = pd.read_csv(os.path.join(INPUT_DIR, "submission.csv"))

# Sanity check: shape and first three rows of each table, in load order.
for _table in (train, test, sample_sub):
    print(_table.shape)
    display(_table.head(3))

(4974, 6)


Unnamed: 0,id,title,year,abstract,keywords,y
0,1,Hierarchical Adversarially Learned Inference,2018,We propose a novel hierarchical generative mod...,"generative, hierarchical, unsupervised, semisu...",0
1,2,Learning to Compute Word Embeddings On the Fly,2018,Words in natural language follow a Zipfian dis...,"NLU, word embeddings, representation learning",0
2,3,Graph2Seq: Scalable Learning Dynamics for Graphs,2018,Neural networks are increasingly used as a gen...,,0


(6393, 5)


Unnamed: 0,id,title,year,abstract,keywords
0,1,StyleAlign: Analysis and Applications of Align...,2022,"In this paper, we perform an in-depth study of...","StyleGAN, transfer learning, fine tuning, mode..."
1,2,Embedding a random graph via GNN: mean-field i...,2021,We develop a theory for embedding a random gra...,"Graph neural network, graph embedding, multi-r..."
2,3,BBRefinement: an universal scheme to improve p...,2021,We present a conceptually simple yet powerful ...,"object detection, deep neural networks, refine..."


(6393, 2)


Unnamed: 0,id,y
0,1,0
1,2,0
2,3,0


In [None]:
# Model input is "<title>[SEP]<abstract>". Missing parts are replaced by empty strings:
# string + NaN is NaN in pandas, and the datasets pass this column to the tokenizer as-is.
train["texts"] = train["title"].fillna("") + "[SEP]" + train["abstract"].fillna("")

In [None]:
# Give every row the index of the stratified fold in which it is held out for validation.
skf = StratifiedKFold(n_splits=CFG.n_fold,shuffle=True,random_state=CFG.seed)
for fold, ( _, val_) in enumerate(skf.split(train, train.y)):
    train.loc[val_ , "kfold"] = int(fold)
    
# The column is filled by partial assignment above; cast it to an integer dtype once complete.
train["kfold"] = train["kfold"].astype(int)

if CFG.debug:
    # Debug mode: continue with a fixed 500-row sample; fold sizes are shown before and after.
    display(train.groupby('kfold').size())
    train = train.sample(n=500, random_state=0).reset_index(drop=True)
    display(train.groupby('kfold').size())

In [None]:
# ====================================================
# tokenizer
# ====================================================
# Load the tokenizer for CFG.model, save a copy next to the experiment outputs, and
# expose it on CFG so prepare_input can reach it through cfg.tokenizer.
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_EXP_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# ====================================================
# Define max_len
# ====================================================
# Tokenize every training text once to find the longest sequence in the data.
lengths = []
tk0 = tqdm(train['texts'].fillna("").values, total=len(train))
for text in tk0:
    # Special tokens are left out here and accounted for by the "+ 2" below.
    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
    lengths.append(length)
# Overrides the placeholder CFG.max_len set in the config class.
CFG.max_len = max(lengths) + 2 # room for two special tokens (presumably [CLS] and the final [SEP])
LOGGER.info(f"max_len: {CFG.max_len}")

100%|██████████| 4974/4974 [00:03<00:00, 1364.28it/s]
max_len: 521
INFO:__main__:max_len: 521


In [None]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    """Tokenize ``text`` to exactly ``cfg.max_len`` positions and tensorise the result.

    Args:
        cfg: object providing ``tokenizer`` (callable) and ``max_len``.
        text: the string to encode.

    Returns:
        The tokenizer's output mapping with every value replaced by a ``torch.long``
        tensor.
    """
    encoded = cfg.tokenizer(
        text,
        add_special_tokens=True,
        max_length=cfg.max_len,
        padding="max_length",
        return_offsets_mapping=False,
        truncation=True,
    )
    # Convert in place so the tokenizer's own container type is what gets returned.
    for field in list(encoded.keys()):
        encoded[field] = torch.tensor(encoded[field], dtype=torch.long)
    return encoded


class TrainDataset(Dataset):
    """Training samples: tokenized ``texts`` paired with a half-precision ``y`` label."""

    def __init__(self, cfg, df):
        # cfg is forwarded to prepare_input for every sample.
        self.cfg = cfg
        self.inputs = df['texts'].values
        self.labels = df['y'].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        encoded = prepare_input(self.cfg, self.inputs[item])
        # fp16 target, unlike ValidDataset which uses fp32.
        target = torch.tensor(self.labels[item], dtype=torch.half)
        return encoded, target

def collate(inputs):
    """Trim a padded batch to the length of its longest unpadded sequence.

    Every entry of ``inputs`` is sliced in place along dimension 1 to the largest
    per-row sum of ``attention_mask``; the same mapping object is returned.
    """
    keep = int(inputs["attention_mask"].sum(axis=1).max())
    for name in list(inputs.keys()):
        inputs[name] = inputs[name][:, :keep]
    return inputs

class ValidDataset(Dataset):
    """Validation samples: tokenized ``texts`` paired with a single-precision ``y`` label."""

    def __init__(self, cfg, df):
        # cfg is forwarded to prepare_input for every sample.
        self.cfg = cfg
        self.inputs = df['texts'].values
        self.labels = df['y'].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        encoded = prepare_input(self.cfg, self.inputs[item])
        # fp32 target, unlike TrainDataset which uses fp16.
        target = torch.tensor(self.labels[item], dtype=torch.float)
        return encoded, target

def collate(inputs):
    """Cut every tensor in ``inputs`` down to the batch's longest attended length.

    NOTE(review): this is the second definition of ``collate`` in this cell; being the
    later one, it is the definition in effect when the training functions run.
    """
    longest = int(inputs["attention_mask"].sum(axis=1).max())
    inputs.update({key: tensor[:, :longest] for key, tensor in inputs.items()})
    return inputs

#collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    """Average the hidden states over dimension 1, counting only attended positions."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask over the feature axis so masked positions contribute zero.
        weights = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        weighted_sum = (last_hidden_state * weights).sum(1)
        # Clamp the denominator so a fully masked row does not divide by zero.
        token_count = weights.sum(1).clamp(min=1e-9)
        return weighted_sum / token_count

class MaxPooling(nn.Module):
    """Element-wise maximum of the hidden states over dimension 1, ignoring masked positions."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        # Masked positions are set to -1e4 on a copy so they cannot win the max;
        # the input tensor itself is left untouched.
        is_masked = attention_mask.unsqueeze(-1) == 0
        candidates = last_hidden_state.masked_fill(is_masked, -1e4)
        return candidates.max(dim=1).values
    

class CustomModel(nn.Module):
    """Transformer backbone, masked mean pooling, LayerNorm and a linear head.

    ``forward`` returns the raw head output (no sigmoid applied inside the model).
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        """Build the backbone and the classification head.

        Args:
            cfg: configuration object; ``model``, ``gradient_checkpointing``,
                ``freezing`` and ``target_size`` are read.
            config_path: path of a backbone config stored with ``torch.save``. When
                ``None`` the config is fetched for ``cfg.model`` and every dropout
                probability is set to 0.
            pretrained: when True load the pretrained backbone weights; otherwise only
                the architecture is built from the config (weights are not loaded).
        """
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            # AutoModel cannot be instantiated directly; from_config is the supported
            # way to build the architecture without loading weights.
            self.model = AutoModel.from_config(self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # Freezing
        if cfg.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            # Lazy iterator over the backbone parameters that remain trainable.
            cfg.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())

        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)
        self._init_weights(self.fc)
        self.layer_norm1 = nn.LayerNorm(self.config.hidden_size)
        self.sig = nn.Sigmoid()

    def _init_weights(self, module):
        """Initialise ``module`` using the backbone config's ``initializer_range``."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Return the mean-pooled first backbone output for a batch of tokenizer inputs."""
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        """Return the head output for ``inputs``; the sigmoid is left to the caller."""
        feature = self.feature(inputs)
        feature = self.layer_norm1(feature)
        output = self.fc(feature)
        #output = self.sig(output)
        return output

In [None]:
class Focal_MultiLabel_Loss(nn.Module):
    """Focal loss on probabilities: ``mean((1 - exp(-bce)) ** gamma * bce)``.

    ``outputs`` must already be probabilities because the per-element term comes from
    ``nn.BCELoss``.
    """

    def __init__(self, gamma):
        super().__init__()
        self.gamma = gamma
        self.bceloss = nn.BCELoss(reduction='none')

    def forward(self, outputs, targets):
        per_element = self.bceloss(outputs, targets)
        # exp(-bce) recovers the probability assigned to the true class.
        prob_true = torch.exp(-per_element)
        modulation = (1 - prob_true) ** self.gamma
        return (modulation * per_element).mean()

In [None]:

# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Tracks the latest value and the weighted running mean of a metric."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all accumulated statistics."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` observed with weight ``n`` and refresh the running mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


def asMinutes(s):
    """Format a duration in seconds as ``'<minutes>m <seconds>s'`` (seconds truncated)."""
    minutes = math.floor(s / 60)
    leftover = s - minutes * 60
    return '%dm %ds' % (minutes, leftover)


def timeSince(since, percent):
    """Return ``'<elapsed> (remain <remaining>)'`` for a task started at ``since``.

    Args:
        since: start timestamp as returned by ``time.time()``.
        percent: fraction of the work completed so far (must be non-zero).
    """
    elapsed = time.time() - since
    # Linear extrapolation of the total duration from the completed fraction.
    projected_total = elapsed / (percent)
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(projected_total - elapsed))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    """Train ``model`` for one epoch and return the sample-weighted mean loss.

    Args:
        fold: current fold index (kept for interface compatibility; not used here).
        train_loader: yields ``(inputs, labels)`` batches.
        model: network returning logits for a batch of tokenizer inputs.
        criterion: loss taking ``(probabilities, targets)``.
        optimizer: optimizer updated every ``CFG.gradient_accumulation_steps`` batches.
        epoch: zero-based epoch index, used only in the progress line.
        scheduler: LR scheduler, stepped per optimizer update when ``CFG.batch_scheduler``.
        device: device the batch is moved to.
    """
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = time.time()
    # Norm of the most recent optimizer update; NaN until the first update happens.
    grad_norm = float('nan')
    # Start from clean gradients in case a previous epoch ended mid-accumulation.
    optimizer.zero_grad()
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
        # Evaluate the loss in fp32: this keeps the log/exp of the focal loss out of
        # half precision and removes the requirement that the label dtype matches the
        # autocast output dtype (which broke the CFG.apex=False path).
        loss = criterion(y_preds.float().sigmoid().squeeze(), labels.float().squeeze())
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            # unscale_ may be called only once per optimizer step, so unscaling and
            # clipping belong to the update, not to every accumulated micro-batch.
            scaler.unscale_(optimizer)
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            if CFG.batch_scheduler:
                scheduler.step()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_last_lr()[0]))

    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    """Evaluate ``model`` on ``valid_loader``.

    Args:
        valid_loader: yields ``(inputs, labels)`` batches in a fixed order.
        model: network returning logits for a batch of tokenizer inputs.
        criterion: loss taking ``(probabilities, targets)``.
        device: device the batch is moved to.

    Returns:
        ``(mean_loss, predictions)`` where ``mean_loss`` is the sample-weighted average
        loss and ``predictions`` is a flat NumPy array of sigmoid probabilities in
        loader order.
    """
    losses = AverageMeter()
    model.eval()
    preds = []
    start = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
            probs = y_preds.sigmoid()
            # No gradient accumulation happens during evaluation, so the loss is
            # reported unscaled regardless of CFG.gradient_accumulation_steps.
            loss = criterion(probs.squeeze(), labels.squeeze())
        losses.update(loss.item(), batch_size)
        preds.append(probs.to('cpu').numpy())
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    # Stack the per-batch arrays and flatten to one probability per output element.
    predictions = np.concatenate(preds).reshape(-1)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    """Return sigmoid probabilities for every batch of ``test_loader``.

    ``test_loader`` yields tokenizer inputs only (no labels). The per-batch outputs are
    concatenated along the first axis, keeping the model's output columns.
    """
    model.eval()
    model.to(device)
    batch_probs = []
    for inputs in tqdm(test_loader, total=len(test_loader)):
        inputs = collate(inputs)
        for key in list(inputs.keys()):
            inputs[key] = inputs[key].to(device)
        with torch.no_grad():
            logits = model(inputs)
        batch_probs.append(logits.sigmoid().to('cpu').numpy())
    return np.concatenate(batch_probs)

In [None]:
# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    """Train and validate one cross-validation fold.

    Args:
        folds: DataFrame providing the ``texts``, ``y`` and ``kfold`` columns.
        fold: fold index held out for validation; all other folds are trained on.

    Returns:
        The validation rows of ``fold`` with an added ``pred`` column holding the
        predictions of the epoch with the best validation score.
    """
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['kfold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['kfold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds['y'].values

    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = ValidDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size*2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    # Stored so the architecture can be rebuilt later through config_path.
    torch.save(model.config, OUTPUT_EXP_DIR+'config.pth')
    model.to(device)

    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        """Split parameters into decayed backbone, non-decayed backbone, and head groups."""
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            # Everything outside the backbone attribute ``model`` (the head layers).
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)

    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        """Build the warmup schedule selected by ``cfg.scheduler``."""
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler

    # The scheduler advances once per optimizer update, so count updates rather than
    # samples: the loader drops the last partial batch (drop_last=True) and an update
    # happens only every CFG.gradient_accumulation_steps batches.
    steps_per_epoch = len(train_loader) // CFG.gradient_accumulation_steps
    num_train_steps = steps_per_epoch * CFG.epochs
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = Focal_MultiLabel_Loss(gamma=2.5)

    best_score = -1.
    best_path = OUTPUT_EXP_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth"

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)

        # scoring
        score = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')

        # Keep only the checkpoint (weights + validation predictions) of the best epoch.
        if best_score < score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        best_path)

    # Report the predictions of the best epoch, not of the last one.
    predictions = torch.load(best_path,
                             map_location=torch.device('cpu'))['predictions']
    valid_folds['pred'] = predictions

    torch.cuda.empty_cache()
    gc.collect()

    return valid_folds

In [None]:
if __name__ == '__main__':

    def get_result(oof_df):
        """Log the fixed-threshold score and the best-threshold accuracy of ``oof_df``."""
        labels = oof_df['y'].values
        preds = oof_df['pred'].values
        score = get_score(labels, preds)
        acc_score = get_acc_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}')
        LOGGER.info(f'ACC BEST Score: {acc_score:<.4f}')

    if CFG.train:
        # Collect the per-fold frames and concatenate once; growing a frame with
        # pd.concat inside the loop starts from an empty DataFrame and re-copies
        # all previous rows on every iteration.
        oof_parts = []
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_parts.append(_oof_df)
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
        if oof_parts:
            oof_df = pd.concat(oof_parts).reset_index(drop=True)
            LOGGER.info(f"========== CV ==========")
            get_result(oof_df)
            oof_df.to_pickle(OUTPUT_EXP_DIR+'oof_df.pkl')
        else:
            # No fold of range(CFG.n_fold) is listed in CFG.trn_fold.
            oof_df = pd.DataFrame()
            LOGGER.info("No folds were trained; skipping CV scoring.")

DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.27.4",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__:DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "at

Downloading pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.bias', 'mask_predictions.classifier.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/248] Elapsed 0m 3s (remain 14m 41s) Loss: 0.3860(0.3860) Grad: nan  LR: 0.00002000  
Epoch: [1][100/248] Elapsed 0m 15s (remain 0m 21s) Loss: 0.1163(0.1251) Grad: 1.1510  LR: 0.00001977  
Epoch: [1][200/248] Elapsed 0m 26s (remain 0m 6s) Loss: 0.0995(0.1175) Grad: 2.6562  LR: 0.00001912  
Epoch: [1][247/248] Elapsed 0m 32s (remain 0m 0s) Loss: 0.1014(0.1150) Grad: 0.4395  LR: 0.00001867  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 12s) Loss: 0.0682(0.0682) 


Epoch 1 - avg_train_loss: 0.1150  avg_val_loss: 0.1046  time: 39s
INFO:__main__:Epoch 1 - avg_train_loss: 0.1150  avg_val_loss: 0.1046  time: 39s
Epoch 1 - Score: 0.6985
INFO:__main__:Epoch 1 - Score: 0.6985
Epoch 1 - Save Best Score: 0.6985 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6985 Model


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.1314(0.1046) 
f1 score : 0.14772727272727273
recall score : 0.08552631578947369
precision score : 0.5416666666666666
Epoch: [2][0/248] Elapsed 0m 0s (remain 1m 23s) Loss: 0.0923(0.0923) Grad: nan  LR: 0.00001866  
Epoch: [2][100/248] Elapsed 0m 12s (remain 0m 17s) Loss: 0.1031(0.1024) Grad: 0.3439  LR: 0.00001742  
Epoch: [2][200/248] Elapsed 0m 23s (remain 0m 5s) Loss: 0.0972(0.1034) Grad: 0.5321  LR: 0.00001585  
Epoch: [2][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.1301(0.1027) Grad: 2.5909  LR: 0.00001502  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 12s) Loss: 0.0917(0.0917) 


Epoch 2 - avg_train_loss: 0.1027  avg_val_loss: 0.1065  time: 36s
INFO:__main__:Epoch 2 - avg_train_loss: 0.1027  avg_val_loss: 0.1065  time: 36s
Epoch 2 - Score: 0.6784
INFO:__main__:Epoch 2 - Score: 0.6784


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.0928(0.1065) 
f1 score : 0.50920245398773
recall score : 0.5460526315789473
precision score : 0.47701149425287354
Epoch: [3][0/248] Elapsed 0m 0s (remain 1m 14s) Loss: 0.0736(0.0736) Grad: nan  LR: 0.00001501  
Epoch: [3][100/248] Elapsed 0m 11s (remain 0m 17s) Loss: 0.0764(0.0831) Grad: 0.9606  LR: 0.00001309  
Epoch: [3][200/248] Elapsed 0m 23s (remain 0m 5s) Loss: 0.0916(0.0784) Grad: 3.5395  LR: 0.00001103  
Epoch: [3][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.0493(0.0778) Grad: 0.7188  LR: 0.00001004  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 11s) Loss: 0.0383(0.0383) 


Epoch 3 - avg_train_loss: 0.0778  avg_val_loss: 0.1222  time: 36s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0778  avg_val_loss: 0.1222  time: 36s
Epoch 3 - Score: 0.7055
INFO:__main__:Epoch 3 - Score: 0.7055
Epoch 3 - Save Best Score: 0.7055 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7055 Model


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.0997(0.1222) 
f1 score : 0.3588621444201313
recall score : 0.26973684210526316
precision score : 0.5359477124183006
Epoch: [4][0/248] Elapsed 0m 0s (remain 1m 19s) Loss: 0.0341(0.0341) Grad: nan  LR: 0.00001002  
Epoch: [4][100/248] Elapsed 0m 12s (remain 0m 17s) Loss: 0.0397(0.0309) Grad: 2.8030  LR: 0.00000793  
Epoch: [4][200/248] Elapsed 0m 24s (remain 0m 5s) Loss: 0.0540(0.0270) Grad: 5.3572  LR: 0.00000593  
Epoch: [4][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.0079(0.0262) Grad: 0.9104  LR: 0.00000505  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 11s) Loss: 0.1584(0.1584) 


Epoch 4 - avg_train_loss: 0.0262  avg_val_loss: 0.2079  time: 36s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0262  avg_val_loss: 0.2079  time: 36s
Epoch 4 - Score: 0.6402
INFO:__main__:Epoch 4 - Score: 0.6402


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.0175(0.2079) 
f1 score : 0.5082417582417582
recall score : 0.6085526315789473
precision score : 0.4363207547169811
Epoch: [5][0/248] Elapsed 0m 0s (remain 1m 14s) Loss: 0.0216(0.0216) Grad: nan  LR: 0.00000503  
Epoch: [5][100/248] Elapsed 0m 11s (remain 0m 17s) Loss: 0.0127(0.0040) Grad: 2.1290  LR: 0.00000333  
Epoch: [5][200/248] Elapsed 0m 23s (remain 0m 5s) Loss: 0.0006(0.0029) Grad: 0.0746  LR: 0.00000192  
Epoch: [5][247/248] Elapsed 0m 28s (remain 0m 0s) Loss: 0.0025(0.0026) Grad: 0.3565  LR: 0.00000138  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 12s) Loss: 0.0887(0.0887) 


Epoch 5 - avg_train_loss: 0.0026  avg_val_loss: 0.2839  time: 36s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0026  avg_val_loss: 0.2839  time: 36s
Epoch 5 - Score: 0.6704
INFO:__main__:Epoch 5 - Score: 0.6704


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.0316(0.2839) 
f1 score : 0.43448275862068964
recall score : 0.4144736842105263
precision score : 0.45652173913043476
Epoch: [6][0/248] Elapsed 0m 0s (remain 1m 15s) Loss: 0.0006(0.0006) Grad: nan  LR: 0.00000136  
Epoch: [6][100/248] Elapsed 0m 12s (remain 0m 17s) Loss: 0.0004(0.0008) Grad: 0.0634  LR: 0.00000050  
Epoch: [6][200/248] Elapsed 0m 23s (remain 0m 5s) Loss: 0.0006(0.0007) Grad: 0.0583  LR: 0.00000006  
Epoch: [6][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.0012(0.0007) Grad: 0.1313  LR: 0.00000000  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 11s) Loss: 0.0579(0.0579) 


Epoch 6 - avg_train_loss: 0.0007  avg_val_loss: 0.3010  time: 36s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0007  avg_val_loss: 0.3010  time: 36s
Epoch 6 - Score: 0.6864
INFO:__main__:Epoch 6 - Score: 0.6864


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.0567(0.3010) 
f1 score : 0.4113207547169811
recall score : 0.35855263157894735
precision score : 0.4823008849557522


Score: 0.7055
INFO:__main__:Score: 0.7055
ACC BEST Score: 0.7075
INFO:__main__:ACC BEST Score: 0.7075
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.27.4",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main

f1 score : 0.3588621444201313
recall score : 0.26973684210526316
precision score : 0.5359477124183006


Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.bias', 'mask_predictions.classifier.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/248] Elapsed 0m 0s (remain 1m 25s) Loss: 0.1038(0.1038) Grad: nan  LR: 0.00002000  
Epoch: [1][100/248] Elapsed 0m 12s (remain 0m 17s) Loss: 0.1351(0.1145) Grad: 2.6208  LR: 0.00001977  
Epoch: [1][200/248] Elapsed 0m 23s (remain 0m 5s) Loss: 0.0914(0.1114) Grad: 2.8264  LR: 0.00001912  
Epoch: [1][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.1035(0.1109) Grad: 2.2712  LR: 0.00001867  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 12s) Loss: 0.0806(0.0806) 


Epoch 1 - avg_train_loss: 0.1109  avg_val_loss: 0.1052  time: 36s
INFO:__main__:Epoch 1 - avg_train_loss: 0.1109  avg_val_loss: 0.1052  time: 36s
Epoch 1 - Score: 0.6955
INFO:__main__:Epoch 1 - Score: 0.6955
Epoch 1 - Save Best Score: 0.6955 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6955 Model


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.1389(0.1052) 
f1 score : 0.2210796915167095
recall score : 0.14098360655737704
precision score : 0.5119047619047619
Epoch: [2][0/248] Elapsed 0m 0s (remain 1m 24s) Loss: 0.0743(0.0743) Grad: nan  LR: 0.00001866  
Epoch: [2][100/248] Elapsed 0m 12s (remain 0m 17s) Loss: 0.0955(0.1003) Grad: 1.4391  LR: 0.00001742  
Epoch: [2][200/248] Elapsed 0m 23s (remain 0m 5s) Loss: 0.1015(0.0964) Grad: 3.2533  LR: 0.00001585  
Epoch: [2][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.1152(0.0972) Grad: 3.4339  LR: 0.00001502  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 12s) Loss: 0.0924(0.0924) 


Epoch 2 - avg_train_loss: 0.0972  avg_val_loss: 0.1073  time: 36s
INFO:__main__:Epoch 2 - avg_train_loss: 0.0972  avg_val_loss: 0.1073  time: 36s
Epoch 2 - Score: 0.6754
INFO:__main__:Epoch 2 - Score: 0.6754


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.1161(0.1073) 
f1 score : 0.50231124807396
recall score : 0.5344262295081967
precision score : 0.4738372093023256
Epoch: [3][0/248] Elapsed 0m 0s (remain 1m 18s) Loss: 0.0786(0.0786) Grad: nan  LR: 0.00001501  
Epoch: [3][100/248] Elapsed 0m 11s (remain 0m 17s) Loss: 0.0229(0.0583) Grad: 0.9811  LR: 0.00001309  
Epoch: [3][200/248] Elapsed 0m 23s (remain 0m 5s) Loss: 0.0578(0.0533) Grad: 2.9290  LR: 0.00001103  
Epoch: [3][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.0395(0.0521) Grad: 1.6711  LR: 0.00001004  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 12s) Loss: 0.0931(0.0931) 


Epoch 3 - avg_train_loss: 0.0521  avg_val_loss: 0.1318  time: 36s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0521  avg_val_loss: 0.1318  time: 36s
Epoch 3 - Score: 0.6804
INFO:__main__:Epoch 3 - Score: 0.6804


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.1952(0.1318) 
f1 score : 0.4717607973421926
recall score : 0.46557377049180326
precision score : 0.4781144781144781
Epoch: [4][0/248] Elapsed 0m 0s (remain 1m 18s) Loss: 0.0133(0.0133) Grad: nan  LR: 0.00001002  
Epoch: [4][100/248] Elapsed 0m 12s (remain 0m 17s) Loss: 0.0011(0.0053) Grad: 0.1439  LR: 0.00000793  
Epoch: [4][200/248] Elapsed 0m 23s (remain 0m 5s) Loss: 0.0039(0.0046) Grad: 0.8528  LR: 0.00000593  
Epoch: [4][247/248] Elapsed 0m 28s (remain 0m 0s) Loss: 0.0012(0.0041) Grad: 0.2682  LR: 0.00000505  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 12s) Loss: 0.0548(0.0548) 


Epoch 4 - avg_train_loss: 0.0041  avg_val_loss: 0.3225  time: 36s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0041  avg_val_loss: 0.3225  time: 36s
Epoch 4 - Score: 0.6965
INFO:__main__:Epoch 4 - Score: 0.6965
Epoch 4 - Save Best Score: 0.6965 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.6965 Model


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 1.0415(0.3225) 
f1 score : 0.3258928571428571
recall score : 0.23934426229508196
precision score : 0.5104895104895105
Epoch: [5][0/248] Elapsed 0m 0s (remain 1m 25s) Loss: 0.0009(0.0009) Grad: nan  LR: 0.00000503  
Epoch: [5][100/248] Elapsed 0m 12s (remain 0m 17s) Loss: 0.0005(0.0004) Grad: 0.1116  LR: 0.00000333  
Epoch: [5][200/248] Elapsed 0m 24s (remain 0m 5s) Loss: 0.0002(0.0004) Grad: 0.0306  LR: 0.00000192  
Epoch: [5][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.0003(0.0004) Grad: 0.0417  LR: 0.00000138  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 12s) Loss: 0.0963(0.0963) 


Epoch 5 - avg_train_loss: 0.0004  avg_val_loss: 0.3183  time: 36s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0004  avg_val_loss: 0.3183  time: 36s
Epoch 5 - Score: 0.6905
INFO:__main__:Epoch 5 - Score: 0.6905


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.8763(0.3183) 
f1 score : 0.3688524590163934
recall score : 0.29508196721311475
precision score : 0.4918032786885246
Epoch: [6][0/248] Elapsed 0m 0s (remain 1m 20s) Loss: 0.0005(0.0005) Grad: nan  LR: 0.00000136  
Epoch: [6][100/248] Elapsed 0m 11s (remain 0m 17s) Loss: 0.0003(0.0002) Grad: 0.0449  LR: 0.00000050  
Epoch: [6][200/248] Elapsed 0m 23s (remain 0m 5s) Loss: 0.0002(0.0002) Grad: 0.0292  LR: 0.00000006  
Epoch: [6][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.0004(0.0002) Grad: 0.0925  LR: 0.00000000  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 12s) Loss: 0.0912(0.0912) 


Epoch 6 - avg_train_loss: 0.0002  avg_val_loss: 0.3264  time: 36s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0002  avg_val_loss: 0.3264  time: 36s
Epoch 6 - Score: 0.6925
INFO:__main__:Epoch 6 - Score: 0.6925


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.9334(0.3264) 
f1 score : 0.3677685950413223
recall score : 0.29180327868852457
precision score : 0.4972067039106145


Score: 0.6965
INFO:__main__:Score: 0.6965
ACC BEST Score: 0.7005
INFO:__main__:ACC BEST Score: 0.7005


f1 score : 0.3258928571428571
recall score : 0.23934426229508196
precision score : 0.5104895104895105


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.27.4",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__:DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "at

Epoch: [1][0/248] Elapsed 0m 0s (remain 1m 27s) Loss: 0.0825(0.0825) Grad: nan  LR: 0.00002000  
Epoch: [1][100/248] Elapsed 0m 11s (remain 0m 17s) Loss: 0.1008(0.1114) Grad: 2.2068  LR: 0.00001977  
Epoch: [1][200/248] Elapsed 0m 23s (remain 0m 5s) Loss: 0.1384(0.1098) Grad: 3.8395  LR: 0.00001912  
Epoch: [1][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.0966(0.1101) Grad: 0.6803  LR: 0.00001867  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 12s) Loss: 0.0752(0.0752) 


Epoch 1 - avg_train_loss: 0.1101  avg_val_loss: 0.1056  time: 36s
INFO:__main__:Epoch 1 - avg_train_loss: 0.1101  avg_val_loss: 0.1056  time: 36s
Epoch 1 - Score: 0.6985
INFO:__main__:Epoch 1 - Score: 0.6985
Epoch 1 - Save Best Score: 0.6985 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6985 Model


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.1505(0.1056) 
f1 score : 0.0625
recall score : 0.03278688524590164
precision score : 0.6666666666666666
Epoch: [2][0/248] Elapsed 0m 0s (remain 1m 22s) Loss: 0.0898(0.0898) Grad: nan  LR: 0.00001866  
Epoch: [2][100/248] Elapsed 0m 12s (remain 0m 17s) Loss: 0.1223(0.0946) Grad: 4.3890  LR: 0.00001742  
Epoch: [2][200/248] Elapsed 0m 23s (remain 0m 5s) Loss: 0.1194(0.0966) Grad: 1.5835  LR: 0.00001585  
Epoch: [2][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.0830(0.0966) Grad: 1.3258  LR: 0.00001502  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 12s) Loss: 0.0694(0.0694) 


Epoch 2 - avg_train_loss: 0.0966  avg_val_loss: 0.1045  time: 36s
INFO:__main__:Epoch 2 - avg_train_loss: 0.0966  avg_val_loss: 0.1045  time: 36s
Epoch 2 - Score: 0.7116
INFO:__main__:Epoch 2 - Score: 0.7116
Epoch 2 - Save Best Score: 0.7116 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7116 Model


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.1484(0.1045) 
f1 score : 0.25065274151436034
recall score : 0.15737704918032788
precision score : 0.6153846153846154
Epoch: [3][0/248] Elapsed 0m 0s (remain 1m 33s) Loss: 0.0764(0.0764) Grad: nan  LR: 0.00001501  
Epoch: [3][100/248] Elapsed 0m 12s (remain 0m 17s) Loss: 0.0612(0.0680) Grad: 1.2347  LR: 0.00001309  
Epoch: [3][200/248] Elapsed 0m 23s (remain 0m 5s) Loss: 0.0514(0.0628) Grad: 1.3192  LR: 0.00001103  
Epoch: [3][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.0343(0.0612) Grad: 1.8156  LR: 0.00001004  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 12s) Loss: 0.0811(0.0811) 


Epoch 3 - avg_train_loss: 0.0612  avg_val_loss: 0.1318  time: 36s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0612  avg_val_loss: 0.1318  time: 36s
Epoch 3 - Score: 0.6794
INFO:__main__:Epoch 3 - Score: 0.6794


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.1207(0.1318) 
f1 score : 0.39924670433145015
recall score : 0.3475409836065574
precision score : 0.4690265486725664
Epoch: [4][0/248] Elapsed 0m 0s (remain 1m 25s) Loss: 0.0353(0.0353) Grad: nan  LR: 0.00001002  
Epoch: [4][100/248] Elapsed 0m 12s (remain 0m 17s) Loss: 0.0006(0.0094) Grad: 0.0601  LR: 0.00000793  
Epoch: [4][200/248] Elapsed 0m 23s (remain 0m 5s) Loss: 0.0099(0.0089) Grad: 1.6324  LR: 0.00000593  
Epoch: [4][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.0017(0.0080) Grad: 0.2439  LR: 0.00000505  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 12s) Loss: 0.2169(0.2169) 


Epoch 4 - avg_train_loss: 0.0080  avg_val_loss: 0.3014  time: 36s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0080  avg_val_loss: 0.3014  time: 36s
Epoch 4 - Score: 0.6513
INFO:__main__:Epoch 4 - Score: 0.6513


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.0680(0.3014) 
f1 score : 0.42644628099173554
recall score : 0.42295081967213116
precision score : 0.43
Epoch: [5][0/248] Elapsed 0m 0s (remain 1m 15s) Loss: 0.0011(0.0011) Grad: nan  LR: 0.00000503  
Epoch: [5][100/248] Elapsed 0m 12s (remain 0m 17s) Loss: 0.0004(0.0012) Grad: 0.0869  LR: 0.00000333  
Epoch: [5][200/248] Elapsed 0m 23s (remain 0m 5s) Loss: 0.0006(0.0009) Grad: 0.1763  LR: 0.00000192  
Epoch: [5][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.0003(0.0008) Grad: 0.0506  LR: 0.00000138  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 13s) Loss: 0.1868(0.1868) 


Epoch 5 - avg_train_loss: 0.0008  avg_val_loss: 0.3603  time: 36s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0008  avg_val_loss: 0.3603  time: 36s
Epoch 5 - Score: 0.6693
INFO:__main__:Epoch 5 - Score: 0.6693


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.1087(0.3603) 
f1 score : 0.4156305506216696
recall score : 0.3836065573770492
precision score : 0.45348837209302323
Epoch: [6][0/248] Elapsed 0m 0s (remain 1m 17s) Loss: 0.0001(0.0001) Grad: nan  LR: 0.00000136  
Epoch: [6][100/248] Elapsed 0m 12s (remain 0m 17s) Loss: 0.0002(0.0003) Grad: 0.0261  LR: 0.00000050  
Epoch: [6][200/248] Elapsed 0m 23s (remain 0m 5s) Loss: 0.0002(0.0002) Grad: 0.0336  LR: 0.00000006  
Epoch: [6][247/248] Elapsed 0m 28s (remain 0m 0s) Loss: 0.0001(0.0002) Grad: 0.0227  LR: 0.00000000  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 12s) Loss: 0.1651(0.1651) 


Epoch 6 - avg_train_loss: 0.0002  avg_val_loss: 0.3723  time: 36s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0002  avg_val_loss: 0.3723  time: 36s
Epoch 6 - Score: 0.6693
INFO:__main__:Epoch 6 - Score: 0.6693


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.1515(0.3723) 
f1 score : 0.38273921200750466
recall score : 0.3344262295081967
precision score : 0.4473684210526316


Score: 0.7116
INFO:__main__:Score: 0.7116
ACC BEST Score: 0.7116
INFO:__main__:ACC BEST Score: 0.7116


f1 score : 0.25065274151436034
recall score : 0.15737704918032788
precision score : 0.6153846153846154


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.27.4",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__:DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "at

Epoch: [1][0/248] Elapsed 0m 0s (remain 1m 30s) Loss: 0.1825(0.1825) Grad: nan  LR: 0.00002000  
Epoch: [1][100/248] Elapsed 0m 11s (remain 0m 17s) Loss: 0.1462(0.1160) Grad: 3.8209  LR: 0.00001977  
Epoch: [1][200/248] Elapsed 0m 23s (remain 0m 5s) Loss: 0.1213(0.1119) Grad: 2.1736  LR: 0.00001912  
Epoch: [1][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.0662(0.1108) Grad: 1.7307  LR: 0.00001867  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 13s) Loss: 0.0489(0.0489) 


Epoch 1 - avg_train_loss: 0.1108  avg_val_loss: 0.1068  time: 36s
INFO:__main__:Epoch 1 - avg_train_loss: 0.1108  avg_val_loss: 0.1068  time: 36s
Epoch 1 - Score: 0.6935
INFO:__main__:Epoch 1 - Score: 0.6935
Epoch 1 - Save Best Score: 0.6935 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6935 Model


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.2004(0.1068) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
Epoch: [2][0/248] Elapsed 0m 0s (remain 1m 29s) Loss: 0.0848(0.0848) Grad: nan  LR: 0.00001866  
Epoch: [2][100/248] Elapsed 0m 12s (remain 0m 17s) Loss: 0.0944(0.0933) Grad: 0.8826  LR: 0.00001742  
Epoch: [2][200/248] Elapsed 0m 23s (remain 0m 5s) Loss: 0.0780(0.0952) Grad: 0.8504  LR: 0.00001585  
Epoch: [2][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.0914(0.0956) Grad: 1.0760  LR: 0.00001502  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 13s) Loss: 0.0501(0.0501) 


Epoch 2 - avg_train_loss: 0.0956  avg_val_loss: 0.1062  time: 36s
INFO:__main__:Epoch 2 - avg_train_loss: 0.0956  avg_val_loss: 0.1062  time: 36s
Epoch 2 - Score: 0.7065
INFO:__main__:Epoch 2 - Score: 0.7065
Epoch 2 - Save Best Score: 0.7065 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7065 Model


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.1374(0.1062) 
f1 score : 0.23157894736842108
recall score : 0.14426229508196722
precision score : 0.5866666666666667
Epoch: [3][0/248] Elapsed 0m 0s (remain 1m 24s) Loss: 0.0936(0.0936) Grad: nan  LR: 0.00001501  
Epoch: [3][100/248] Elapsed 0m 12s (remain 0m 17s) Loss: 0.0361(0.0572) Grad: 1.1194  LR: 0.00001309  
Epoch: [3][200/248] Elapsed 0m 23s (remain 0m 5s) Loss: 0.0446(0.0567) Grad: 1.3369  LR: 0.00001103  
Epoch: [3][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.0426(0.0547) Grad: 1.6284  LR: 0.00001004  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 13s) Loss: 0.0478(0.0478) 


Epoch 3 - avg_train_loss: 0.0547  avg_val_loss: 0.1698  time: 36s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0547  avg_val_loss: 0.1698  time: 36s
Epoch 3 - Score: 0.6915
INFO:__main__:Epoch 3 - Score: 0.6915


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.1527(0.1698) 
f1 score : 0.389662027833002
recall score : 0.32131147540983607
precision score : 0.494949494949495
Epoch: [4][0/248] Elapsed 0m 0s (remain 1m 21s) Loss: 0.0075(0.0075) Grad: nan  LR: 0.00001002  
Epoch: [4][100/248] Elapsed 0m 12s (remain 0m 17s) Loss: 0.0045(0.0082) Grad: 0.6745  LR: 0.00000793  
Epoch: [4][200/248] Elapsed 0m 23s (remain 0m 5s) Loss: 0.0133(0.0072) Grad: 2.0365  LR: 0.00000593  
Epoch: [4][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.0029(0.0087) Grad: 0.4144  LR: 0.00000505  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 13s) Loss: 0.0280(0.0280) 


Epoch 4 - avg_train_loss: 0.0087  avg_val_loss: 0.2696  time: 36s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0087  avg_val_loss: 0.2696  time: 36s
Epoch 4 - Score: 0.7005
INFO:__main__:Epoch 4 - Score: 0.7005


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.4408(0.2696) 
f1 score : 0.3521739130434782
recall score : 0.26557377049180325
precision score : 0.5225806451612903
Epoch: [5][0/248] Elapsed 0m 0s (remain 1m 19s) Loss: 0.0006(0.0006) Grad: nan  LR: 0.00000503  
Epoch: [5][100/248] Elapsed 0m 11s (remain 0m 17s) Loss: 0.0016(0.0015) Grad: 0.2640  LR: 0.00000333  
Epoch: [5][200/248] Elapsed 0m 23s (remain 0m 5s) Loss: 0.0016(0.0014) Grad: 0.2714  LR: 0.00000192  
Epoch: [5][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.0004(0.0013) Grad: 0.0672  LR: 0.00000138  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 13s) Loss: 0.0688(0.0688) 


Epoch 5 - avg_train_loss: 0.0013  avg_val_loss: 0.3161  time: 36s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0013  avg_val_loss: 0.3161  time: 36s
Epoch 5 - Score: 0.6804
INFO:__main__:Epoch 5 - Score: 0.6804


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.4926(0.3161) 
f1 score : 0.39543726235741444
recall score : 0.34098360655737703
precision score : 0.47058823529411764
Epoch: [6][0/248] Elapsed 0m 0s (remain 1m 17s) Loss: 0.0007(0.0007) Grad: nan  LR: 0.00000136  
Epoch: [6][100/248] Elapsed 0m 11s (remain 0m 17s) Loss: 0.0006(0.0003) Grad: 0.0561  LR: 0.00000050  
Epoch: [6][200/248] Elapsed 0m 23s (remain 0m 5s) Loss: 0.0007(0.0004) Grad: 0.1162  LR: 0.00000006  
Epoch: [6][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.0002(0.0004) Grad: 0.0233  LR: 0.00000000  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 13s) Loss: 0.0748(0.0748) 


Epoch 6 - avg_train_loss: 0.0004  avg_val_loss: 0.3181  time: 36s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0004  avg_val_loss: 0.3181  time: 36s
Epoch 6 - Score: 0.6734
INFO:__main__:Epoch 6 - Score: 0.6734


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.4676(0.3181) 
f1 score : 0.3970315398886827
recall score : 0.35081967213114756
precision score : 0.45726495726495725


Score: 0.7065
INFO:__main__:Score: 0.7065
ACC BEST Score: 0.7085
INFO:__main__:ACC BEST Score: 0.7085
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.27.4",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main

f1 score : 0.23157894736842108
recall score : 0.14426229508196722
precision score : 0.5866666666666667


Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.bias', 'mask_predictions.classifier.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/248] Elapsed 0m 0s (remain 1m 27s) Loss: 0.4897(0.4897) Grad: nan  LR: 0.00002000  
Epoch: [1][100/248] Elapsed 0m 11s (remain 0m 17s) Loss: 0.0985(0.1435) Grad: 2.5316  LR: 0.00001977  
Epoch: [1][200/248] Elapsed 0m 23s (remain 0m 5s) Loss: 0.1161(0.1268) Grad: 1.3207  LR: 0.00001912  
Epoch: [1][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.1044(0.1237) Grad: 1.9219  LR: 0.00001867  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 11s) Loss: 0.0251(0.0251) 


Epoch 1 - avg_train_loss: 0.1237  avg_val_loss: 0.1283  time: 36s
INFO:__main__:Epoch 1 - avg_train_loss: 0.1237  avg_val_loss: 0.1283  time: 36s
Epoch 1 - Score: 0.6942
INFO:__main__:Epoch 1 - Score: 0.6942
Epoch 1 - Save Best Score: 0.6942 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6942 Model


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.3172(0.1283) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
Epoch: [2][0/248] Elapsed 0m 0s (remain 1m 24s) Loss: 0.1400(0.1400) Grad: nan  LR: 0.00001866  
Epoch: [2][100/248] Elapsed 0m 12s (remain 0m 17s) Loss: 0.0584(0.1043) Grad: 2.5775  LR: 0.00001742  
Epoch: [2][200/248] Elapsed 0m 24s (remain 0m 5s) Loss: 0.1112(0.1042) Grad: 2.1927  LR: 0.00001585  
Epoch: [2][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.1123(0.1041) Grad: 1.4770  LR: 0.00001502  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 11s) Loss: 0.0676(0.0676) 


Epoch 2 - avg_train_loss: 0.1041  avg_val_loss: 0.1032  time: 36s
INFO:__main__:Epoch 2 - avg_train_loss: 0.1041  avg_val_loss: 0.1032  time: 36s
Epoch 2 - Score: 0.6982
INFO:__main__:Epoch 2 - Score: 0.6982
Epoch 2 - Save Best Score: 0.6982 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.6982 Model


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.1472(0.1032) 
f1 score : 0.2890995260663507
recall score : 0.20065789473684212
precision score : 0.5169491525423728
Epoch: [3][0/248] Elapsed 0m 0s (remain 1m 27s) Loss: 0.0916(0.0916) Grad: nan  LR: 0.00001501  
Epoch: [3][100/248] Elapsed 0m 12s (remain 0m 18s) Loss: 0.0576(0.0974) Grad: 1.5710  LR: 0.00001309  
Epoch: [3][200/248] Elapsed 0m 24s (remain 0m 5s) Loss: 0.1306(0.0964) Grad: 4.7437  LR: 0.00001103  
Epoch: [3][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.0856(0.0959) Grad: 1.5611  LR: 0.00001004  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 11s) Loss: 0.0919(0.0919) 


Epoch 3 - avg_train_loss: 0.0959  avg_val_loss: 0.1067  time: 36s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0959  avg_val_loss: 0.1067  time: 36s
Epoch 3 - Score: 0.6871
INFO:__main__:Epoch 3 - Score: 0.6871


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.1211(0.1067) 
f1 score : 0.4909983633387888
recall score : 0.4934210526315789
precision score : 0.48859934853420195
Epoch: [4][0/248] Elapsed 0m 0s (remain 1m 18s) Loss: 0.0792(0.0792) Grad: nan  LR: 0.00001002  
Epoch: [4][100/248] Elapsed 0m 11s (remain 0m 17s) Loss: 0.0781(0.0744) Grad: 1.9702  LR: 0.00000793  
Epoch: [4][200/248] Elapsed 0m 23s (remain 0m 5s) Loss: 0.0616(0.0701) Grad: 0.9299  LR: 0.00000593  
Epoch: [4][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.0539(0.0693) Grad: 2.0189  LR: 0.00000505  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 12s) Loss: 0.0901(0.0901) 


Epoch 4 - avg_train_loss: 0.0693  avg_val_loss: 0.1166  time: 36s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0693  avg_val_loss: 0.1166  time: 36s
Epoch 4 - Score: 0.6751
INFO:__main__:Epoch 4 - Score: 0.6751


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.1130(0.1166) 
f1 score : 0.5228951255539143
recall score : 0.5822368421052632
precision score : 0.4745308310991957
Epoch: [5][0/248] Elapsed 0m 0s (remain 1m 21s) Loss: 0.0455(0.0455) Grad: nan  LR: 0.00000503  
Epoch: [5][100/248] Elapsed 0m 12s (remain 0m 17s) Loss: 0.0170(0.0314) Grad: 1.1941  LR: 0.00000333  
Epoch: [5][200/248] Elapsed 0m 23s (remain 0m 5s) Loss: 0.0099(0.0302) Grad: 1.0749  LR: 0.00000192  
Epoch: [5][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.0320(0.0285) Grad: 1.9938  LR: 0.00000138  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 12s) Loss: 0.0762(0.0762) 


Epoch 5 - avg_train_loss: 0.0285  avg_val_loss: 0.1765  time: 36s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0285  avg_val_loss: 0.1765  time: 36s
Epoch 5 - Score: 0.6881
INFO:__main__:Epoch 5 - Score: 0.6881


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.2164(0.1765) 
f1 score : 0.46366782006920415
recall score : 0.4407894736842105
precision score : 0.48905109489051096
Epoch: [6][0/248] Elapsed 0m 0s (remain 1m 17s) Loss: 0.0097(0.0097) Grad: nan  LR: 0.00000136  
Epoch: [6][100/248] Elapsed 0m 11s (remain 0m 17s) Loss: 0.0032(0.0101) Grad: 0.4246  LR: 0.00000050  
Epoch: [6][200/248] Elapsed 0m 23s (remain 0m 5s) Loss: 0.0071(0.0096) Grad: 1.2598  LR: 0.00000006  
Epoch: [6][247/248] Elapsed 0m 29s (remain 0m 0s) Loss: 0.0098(0.0097) Grad: 0.6153  LR: 0.00000000  
EVAL: [0/32] Elapsed 0m 0s (remain 0m 12s) Loss: 0.0813(0.0813) 


Epoch 6 - avg_train_loss: 0.0097  avg_val_loss: 0.2230  time: 36s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0097  avg_val_loss: 0.2230  time: 36s
Epoch 6 - Score: 0.6952
INFO:__main__:Epoch 6 - Score: 0.6952


EVAL: [31/32] Elapsed 0m 6s (remain 0m 0s) Loss: 0.3583(0.2230) 
f1 score : 0.4357541899441341
recall score : 0.3848684210526316
precision score : 0.5021459227467812


Score: 0.6982
INFO:__main__:Score: 0.6982
ACC BEST Score: 0.7032
INFO:__main__:ACC BEST Score: 0.7032
Score: 0.7037
INFO:__main__:Score: 0.7037
ACC BEST Score: 0.7037
INFO:__main__:ACC BEST Score: 0.7037


f1 score : 0.2890995260663507
recall score : 0.20065789473684212
precision score : 0.5169491525423728
f1 score : 0.29473684210526313
recall score : 0.20223243598161522
precision score : 0.5432098765432098


In [None]:
# Final cell: ask Colab to unassign the runtime backing this notebook once all
# of the cells above have finished running.
# NOTE(review): presumably this is here so the GPU VM is not left allocated after
# training completes; anything that exists only on the VM's local disk is assumed
# to be lost once the runtime is unassigned — confirm that models/logs are written
# to the mounted Drive before relying on this.
from google.colab import runtime
runtime.unassign()