In [1]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers
!pip install datasets
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!nvidia-smi

Tue Apr  4 06:34:08 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    46W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:

import os
import gc
import math
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import re
import html

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam, SGD, AdamW, RAdam
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,GroupKFold
from sklearn.metrics import log_loss,f1_score, recall_score, accuracy_score, precision_score

from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
DIR = "/content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測"
INPUT_DIR = os.path.join(DIR,"input")
OUTPUT_DIR = os.path.join(DIR,"output")
CUSTOM_MODEL_DIR = os.path.join(OUTPUT_DIR,'clrp_deberta_v3_base')
OUTPUT_EXP_DIR = DIR + '/output/EXP017/'
if not os.path.exists(OUTPUT_EXP_DIR):
    os.makedirs(OUTPUT_EXP_DIR)

In [6]:

# ====================================================
# CFG
# ====================================================
class CFG:
    debug=False
    apex=True
    print_freq=100
    num_workers=4
    model_name="microsoft/deberta-v3-base"
    # model='microsoft/deberta-base'
    # model='roberta-base'
    # model='roberta-large'
    # model='roberta-large-mnli'
    # model='xlnet-large-cased'
    # model='albert-xxlarge-v2'
    # model="microsoft/deberta-large"
    # model="microsoft/deberta-v3-large"
    # model='microsoft/deberta-v2-xlarge'
    # model='funnel-transformer/large'
    # model='funnel-transformer/medium'
    # model='albert-base-v2'
    # model='albert-large-v2'
    # model='google/electra-large-discriminator'
    # model='google/electra-base-discriminator'
    # model="facebook/bart-large-mnli"
    # model="facebook/bart-large"
    # model="facebook/bart-base"
    model = CUSTOM_MODEL_DIR
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=4
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=16
    fc_dropout=0.2
    target_size=1
    max_len=256
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    seed=42
    n_fold=10
    trn_fold=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    train=True
    nth_awp_start_epoch=1
    gradient_checkpointing = True
    freezing = True

if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0, 1]

In [7]:
def get_score(labels, outputs):
    thresh = 0.5
    y_pred = outputs
    y_true = labels
    f_score = f1_score(y_true, (y_pred>thresh).astype(int))
    r_score = recall_score(y_true, (y_pred>thresh).astype(int))
    p_score = precision_score(y_true, (y_pred>thresh).astype(int))
    print(f"f1 score : {f_score}")
    print(f"recall score : {r_score}")
    print(f"precision score : {p_score}")
    return accuracy_score(y_true, (y_pred>thresh).astype(int))

def get_acc_score(labels, outputs):
    y_pred = outputs
    y_true = labels
    best_score = 0
    best_thresh = 0.5
    for thresh in np.arange(0.1, 0.80, 0.01):
        thresh = np.round(thresh, 2)
        score = accuracy_score(y_true, (y_pred>thresh).astype(int))
        #print("Accuracy score at threshold {0} is {1}".format(thresh, score))
        if score > best_score:
          best_score = score
          best_thresh = thresh
    return accuracy_score(y_true, (y_pred>best_thresh).astype(int))


def get_logger(filename=OUTPUT_EXP_DIR+'train'):
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=CFG.seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=CFG.seed)

In [8]:
def freeze(module):
    """
    Freezes module's parameters.
    """
    
    for parameter in module.parameters():
        parameter.requires_grad = False
        
def get_freezed_parameters(module):
    """
    Returns names of freezed parameters of the given module.
    """
    
    freezed_parameters = []
    for name, parameter in module.named_parameters():
        if not parameter.requires_grad:
            freezed_parameters.append(name)
            
    return freezed_parameters

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """
    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930
    """
    
    embedding_types = ("word", "position", "token_type")
    for embedding_type in embedding_types:
        attr_name = f"{embedding_type}_embeddings"
        
        if hasattr(embeddings_path, attr_name): 
            bnb.optim.GlobalOptimManager.get_instance().register_module_override(
                getattr(embeddings_path, attr_name), 'weight', {'optim_bits': optim_bits}
            )

In [9]:
import pandas as pd
import numpy as np

train = pd.read_csv(os.path.join(INPUT_DIR,"train_data.csv"))
test = pd.read_csv(os.path.join(INPUT_DIR,"test_data.csv"))
sample_sub = pd.read_csv(os.path.join(INPUT_DIR,"submission.csv"))

print(train.shape)
display(train.head(3))

print(test.shape)
display(test.head(3))

print(sample_sub.shape)
display(sample_sub.head(3))

(4974, 6)


Unnamed: 0,id,title,year,abstract,keywords,y
0,1,Hierarchical Adversarially Learned Inference,2018,We propose a novel hierarchical generative mod...,"generative, hierarchical, unsupervised, semisu...",0
1,2,Learning to Compute Word Embeddings On the Fly,2018,Words in natural language follow a Zipfian dis...,"NLU, word embeddings, representation learning",0
2,3,Graph2Seq: Scalable Learning Dynamics for Graphs,2018,Neural networks are increasingly used as a gen...,,0


(6393, 5)


Unnamed: 0,id,title,year,abstract,keywords
0,1,StyleAlign: Analysis and Applications of Align...,2022,"In this paper, we perform an in-depth study of...","StyleGAN, transfer learning, fine tuning, mode..."
1,2,Embedding a random graph via GNN: mean-field i...,2021,We develop a theory for embedding a random gra...,"Graph neural network, graph embedding, multi-r..."
2,3,BBRefinement: an universal scheme to improve p...,2021,We present a conceptually simple yet powerful ...,"object detection, deep neural networks, refine..."


(6393, 2)


Unnamed: 0,id,y
0,1,0
1,2,0
2,3,0


In [10]:
train["texts"] = train["title"] + "[SEP]" + train["abstract"] 

In [11]:
skf = StratifiedKFold(n_splits=CFG.n_fold,shuffle=True,random_state=CFG.seed)
for fold, ( _, val_) in enumerate(skf.split(train, train.y)):
    train.loc[val_ , "kfold"] = int(fold)
    
train["kfold"] = train["kfold"].astype(int)

if CFG.debug:
    display(train.groupby('kfold').size())
    train = train.sample(n=500, random_state=0).reset_index(drop=True)
    display(train.groupby('kfold').size())

In [12]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_EXP_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

In [13]:
# ====================================================
# Define max_len
# ====================================================
lengths = []
tk0 = tqdm(train['texts'].fillna("").values, total=len(train))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
    lengths.append(length)
CFG.max_len = max(lengths) + 3 # cls + sep + sep
LOGGER.info(f"max_len: {CFG.max_len}")

100%|██████████| 4974/4974 [00:03<00:00, 1335.01it/s]
max_len: 522
INFO:__main__:max_len: 522


In [14]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer(text,
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           padding="max_length",
                           return_offsets_mapping=False,
                           truncation=True)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.inputs = df['texts'].values
        self.labels = df['y'].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.inputs[item])
        label = torch.tensor(self.labels[item], dtype=torch.half)
        return inputs, label

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

class ValidDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.inputs = df['texts'].values
        self.labels = df['y'].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.inputs[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

#collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [15]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

class MaxPooling(nn.Module):
    def __init__(self):
        super(MaxPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        embeddings = last_hidden_state.clone()
        embeddings[input_mask_expanded == 0] = -1e4
        max_embeddings, _ = torch.max(embeddings, dim=1)
        return max_embeddings
    

class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # Freezing
        if cfg.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            cfg.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())

        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)
        self._init_weights(self.fc)
        self.layer_norm1 = nn.LayerNorm(self.config.hidden_size)
        self.sig = nn.Sigmoid()
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        feature = self.layer_norm1(feature)
        output = self.fc(feature)
        #output = self.sig(output)
        return output

In [16]:

# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
        #print(y_preds.sigmoid().squeeze().view(1, -1))
        loss = criterion(y_preds.sigmoid().squeeze(), labels.squeeze())
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))

    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
        loss = criterion(y_preds.sigmoid().squeeze(), labels.squeeze())
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    predictions = np.concatenate(predictions)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

In [17]:

# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['kfold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['kfold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds['y'].values
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = ValidDataset(CFG, valid_folds)


    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size*2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_EXP_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.BCELoss()
    
    best_score = -1.

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        
        # scoring
        score = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')

        
        if best_score < score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_EXP_DIR+f"{CFG.model_name.replace('/', '-')}_fold{fold}_best.pth")

    predictions = torch.load(OUTPUT_EXP_DIR+f"{CFG.model_name.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds['pred'] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [None]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        labels = oof_df['y'].values
        preds = oof_df['pred'].values
        score = get_score(labels, preds)
        acc_score = get_acc_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}')
        LOGGER.info(f'ACC BEST Score: {acc_score:<.4f}')
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
            #break
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_EXP_DIR+'oof_df.pkl')

DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "torch_dtype": "f

Epoch: [1][0/279] Elapsed 0m 1s (remain 7m 15s) Loss: 0.8086(0.8086) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 15s (remain 0m 27s) Loss: 0.6230(0.6316) Grad: 1.0710  LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 28s (remain 0m 11s) Loss: 0.5054(0.6244) Grad: 3.7745  LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 39s (remain 0m 0s) Loss: 0.3545(0.6176) Grad: 2.8952  LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2569(0.2569) 


Epoch 1 - avg_train_loss: 0.6176  avg_val_loss: 0.5863  time: 43s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6176  avg_val_loss: 0.5863  time: 43s
Epoch 1 - Score: 0.6948
INFO:__main__:Epoch 1 - Score: 0.6948
Epoch 1 - Save Best Score: 0.6948 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6948 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.2601(0.5863) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 40s) Loss: 0.3635(0.3635) Grad: nan  LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.5620(0.5674) Grad: 1.7819  LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 28s (remain 0m 10s) Loss: 0.4666(0.5783) Grad: 3.7517  LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.6460(0.5784) Grad: 1.7290  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3522(0.3522) 


Epoch 2 - avg_train_loss: 0.5784  avg_val_loss: 0.5602  time: 43s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5784  avg_val_loss: 0.5602  time: 43s
Epoch 2 - Score: 0.7129
INFO:__main__:Epoch 2 - Score: 0.7129
Epoch 2 - Save Best Score: 0.7129 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7129 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.9820(0.5602) 
f1 score : 0.2513089005235602
recall score : 0.15789473684210525
precision score : 0.6153846153846154
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 44s) Loss: 0.4736(0.4736) Grad: nan  LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.6606(0.5368) Grad: 7.1263  LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 28s (remain 0m 10s) Loss: 0.4353(0.5174) Grad: 1.9415  LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.5273(0.5190) Grad: 5.2540  LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4714(0.4714) 


Epoch 3 - avg_train_loss: 0.5190  avg_val_loss: 0.5728  time: 42s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5190  avg_val_loss: 0.5728  time: 42s
Epoch 3 - Score: 0.6847
INFO:__main__:Epoch 3 - Score: 0.6847


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8259(0.5728) 
f1 score : 0.46779661016949153
recall score : 0.45394736842105265
precision score : 0.4825174825174825
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 37s) Loss: 0.4680(0.4680) Grad: nan  LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.2954(0.4458) Grad: 2.6012  LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.2578(0.4271) Grad: 2.3150  LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.5679(0.4168) Grad: 8.4854  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3116(0.3116) 


Epoch 4 - avg_train_loss: 0.4168  avg_val_loss: 0.5887  time: 42s
INFO:__main__:Epoch 4 - avg_train_loss: 0.4168  avg_val_loss: 0.5887  time: 42s
Epoch 4 - Score: 0.7108
INFO:__main__:Epoch 4 - Score: 0.7108


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.2053(0.5887) 
f1 score : 0.4330708661417323
recall score : 0.3618421052631579
precision score : 0.5392156862745098


Score: 0.7129
INFO:__main__:Score: 0.7129
ACC BEST Score: 0.7229
INFO:__main__:ACC BEST Score: 0.7229
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.2513089005235602
recall score : 0.15789473684210525
precision score : 0.6153846153846154


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 45s) Loss: 0.6724(0.6724) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.6733(0.6100) Grad: 1.2002  LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 28s (remain 0m 10s) Loss: 0.5430(0.6115) Grad: 6.0595  LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.6304(0.6065) Grad: 2.1689  LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3992(0.3992) 


Epoch 1 - avg_train_loss: 0.6065  avg_val_loss: 0.5940  time: 42s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6065  avg_val_loss: 0.5940  time: 42s
Epoch 1 - Score: 0.6908
INFO:__main__:Epoch 1 - Score: 0.6908
Epoch 1 - Save Best Score: 0.6908 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6908 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.0566(0.5940) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 54s) Loss: 0.4634(0.4634) Grad: nan  LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 14s (remain 0m 24s) Loss: 0.6758(0.5627) Grad: 1.6602  LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 28s (remain 0m 10s) Loss: 0.5190(0.5651) Grad: 6.1754  LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.4368(0.5621) Grad: 4.4169  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4358(0.4358) 


Epoch 2 - avg_train_loss: 0.5621  avg_val_loss: 0.5704  time: 43s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5621  avg_val_loss: 0.5704  time: 43s
Epoch 2 - Score: 0.7108
INFO:__main__:Epoch 2 - Score: 0.7108
Epoch 2 - Save Best Score: 0.7108 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7108 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8932(0.5704) 
f1 score : 0.3949579831932773
recall score : 0.30718954248366015
precision score : 0.5529411764705883
Epoch: [3][0/279] Elapsed 0m 0s (remain 2m 2s) Loss: 0.6665(0.6665) Grad: nan  LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 14s (remain 0m 24s) Loss: 0.4836(0.4578) Grad: 2.5968  LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.6064(0.4523) Grad: 6.2906  LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.5317(0.4497) Grad: 6.3906  LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3327(0.3327) 


Epoch 3 - avg_train_loss: 0.4497  avg_val_loss: 0.5830  time: 42s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4497  avg_val_loss: 0.5830  time: 42s
Epoch 3 - Score: 0.7048
INFO:__main__:Epoch 3 - Score: 0.7048


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.2934(0.5830) 
f1 score : 0.35807860262008734
recall score : 0.2679738562091503
precision score : 0.5394736842105263
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 47s) Loss: 0.3042(0.3042) Grad: nan  LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.1460(0.2978) Grad: 5.7639  LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.2111(0.2849) Grad: 8.3358  LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.2291(0.2767) Grad: 4.3254  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4193(0.4193) 


Epoch 4 - avg_train_loss: 0.2767  avg_val_loss: 0.6390  time: 42s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2767  avg_val_loss: 0.6390  time: 42s
Epoch 4 - Score: 0.7149
INFO:__main__:Epoch 4 - Score: 0.7149
Epoch 4 - Save Best Score: 0.7149 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7149 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.3464(0.6390) 
f1 score : 0.45801526717557256
recall score : 0.39215686274509803
precision score : 0.5504587155963303


Score: 0.7149
INFO:__main__:Score: 0.7149
ACC BEST Score: 0.7189
INFO:__main__:ACC BEST Score: 0.7189


f1 score : 0.45801526717557256
recall score : 0.39215686274509803
precision score : 0.5504587155963303


DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "torch_dtype": "f

Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 44s) Loss: 0.6362(0.6362) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 14s (remain 0m 24s) Loss: 0.5552(0.6207) Grad: 1.4300  LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.6753(0.6168) Grad: 4.3558  LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.7432(0.6114) Grad: 5.1800  LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3016(0.3016) 


Epoch 1 - avg_train_loss: 0.6114  avg_val_loss: 0.5895  time: 42s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6114  avg_val_loss: 0.5895  time: 42s
Epoch 1 - Score: 0.6928
INFO:__main__:Epoch 1 - Score: 0.6928
Epoch 1 - Save Best Score: 0.6928 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6928 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.2674(0.5895) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 53s) Loss: 0.4897(0.4897) Grad: nan  LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 14s (remain 0m 24s) Loss: 0.6226(0.5548) Grad: 5.1736  LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 28s (remain 0m 10s) Loss: 0.6582(0.5490) Grad: 5.4610  LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 39s (remain 0m 0s) Loss: 0.5977(0.5576) Grad: 5.1184  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.4355(0.4355) 


Epoch 2 - avg_train_loss: 0.5576  avg_val_loss: 0.5685  time: 43s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5576  avg_val_loss: 0.5685  time: 43s
Epoch 2 - Score: 0.6948
INFO:__main__:Epoch 2 - Score: 0.6948
Epoch 2 - Save Best Score: 0.6948 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.6948 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8634(0.5685) 
f1 score : 0.39199999999999996
recall score : 0.3202614379084967
precision score : 0.5051546391752577
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 55s) Loss: 0.4060(0.4060) Grad: nan  LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.8159(0.4650) Grad: 7.1997  LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.5820(0.4665) Grad: 6.4499  LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.4119(0.4494) Grad: 3.2170  LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3951(0.3951) 


Epoch 3 - avg_train_loss: 0.4494  avg_val_loss: 0.5859  time: 42s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4494  avg_val_loss: 0.5859  time: 42s
Epoch 3 - Score: 0.6948
INFO:__main__:Epoch 3 - Score: 0.6948


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.0656(0.5859) 
f1 score : 0.46853146853146854
recall score : 0.43790849673202614
precision score : 0.5037593984962406
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 48s) Loss: 0.2325(0.2325) Grad: nan  LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.2742(0.2878) Grad: 11.8173  LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.2107(0.2825) Grad: 5.5075  LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.3210(0.2794) Grad: 4.4012  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4041(0.4041) 


Epoch 4 - avg_train_loss: 0.2794  avg_val_loss: 0.6532  time: 42s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2794  avg_val_loss: 0.6532  time: 42s
Epoch 4 - Score: 0.6727
INFO:__main__:Epoch 4 - Score: 0.6727


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.3468(0.6532) 
f1 score : 0.41577060931899645
recall score : 0.3790849673202614
precision score : 0.4603174603174603


Score: 0.6948
INFO:__main__:Score: 0.6948
ACC BEST Score: 0.7189
INFO:__main__:ACC BEST Score: 0.7189
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.39199999999999996
recall score : 0.3202614379084967
precision score : 0.5051546391752577


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 52s) Loss: 0.5879(0.5879) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.4106(0.6244) Grad: 6.6541  LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.6631(0.6187) Grad: 1.5327  LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.6113(0.6170) Grad: 1.6187  LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3016(0.3016) 


Epoch 1 - avg_train_loss: 0.6170  avg_val_loss: 0.6086  time: 42s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6170  avg_val_loss: 0.6086  time: 42s
Epoch 1 - Score: 0.6928
INFO:__main__:Epoch 1 - Score: 0.6928
Epoch 1 - Save Best Score: 0.6928 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6928 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.2909(0.6086) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
Epoch: [2][0/279] Elapsed 0m 0s (remain 2m 4s) Loss: 0.7881(0.7881) Grad: nan  LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 14s (remain 0m 24s) Loss: 0.5991(0.5971) Grad: 0.9358  LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 28s (remain 0m 10s) Loss: 0.6250(0.5848) Grad: 0.8182  LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 39s (remain 0m 0s) Loss: 0.4121(0.5769) Grad: 3.0071  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2410(0.2410) 


Epoch 2 - avg_train_loss: 0.5769  avg_val_loss: 0.5868  time: 43s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5769  avg_val_loss: 0.5868  time: 43s
Epoch 2 - Score: 0.6968
INFO:__main__:Epoch 2 - Score: 0.6968
Epoch 2 - Save Best Score: 0.6968 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.6968 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.2957(0.5868) 
f1 score : 0.10650887573964497
recall score : 0.058823529411764705
precision score : 0.5625
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 57s) Loss: 0.4634(0.4634) Grad: nan  LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 14s (remain 0m 24s) Loss: 0.6069(0.5284) Grad: 1.9595  LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 28s (remain 0m 10s) Loss: 0.5444(0.5247) Grad: 4.5993  LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.3286(0.5113) Grad: 2.1203  LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3096(0.3096) 


Epoch 3 - avg_train_loss: 0.5113  avg_val_loss: 0.5742  time: 42s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5113  avg_val_loss: 0.5742  time: 42s
Epoch 3 - Score: 0.7088
INFO:__main__:Epoch 3 - Score: 0.7088
Epoch 3 - Save Best Score: 0.7088 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7088 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.0705(0.5742) 
f1 score : 0.3983402489626556
recall score : 0.3137254901960784
precision score : 0.5454545454545454
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 58s) Loss: 0.3823(0.3823) Grad: nan  LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.3225(0.4147) Grad: 2.6115  LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 28s (remain 0m 10s) Loss: 0.2610(0.4158) Grad: 2.7563  LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.5684(0.4152) Grad: 3.8212  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2547(0.2547) 


Epoch 4 - avg_train_loss: 0.4152  avg_val_loss: 0.5902  time: 42s
INFO:__main__:Epoch 4 - avg_train_loss: 0.4152  avg_val_loss: 0.5902  time: 42s
Epoch 4 - Score: 0.7149
INFO:__main__:Epoch 4 - Score: 0.7149
Epoch 4 - Save Best Score: 0.7149 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7149 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.2558(0.5902) 
f1 score : 0.40336134453781514
recall score : 0.3137254901960784
precision score : 0.5647058823529412


Score: 0.7149
INFO:__main__:Score: 0.7149
ACC BEST Score: 0.7249
INFO:__main__:ACC BEST Score: 0.7249
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.40336134453781514
recall score : 0.3137254901960784
precision score : 0.5647058823529412


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 50s) Loss: 0.5498(0.5498) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 14s (remain 0m 24s) Loss: 0.5879(0.6318) Grad: 3.2070  LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.5957(0.6166) Grad: 1.2590  LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.5718(0.6116) Grad: 2.3499  LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2779(0.2779) 


Epoch 1 - avg_train_loss: 0.6116  avg_val_loss: 0.5812  time: 42s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6116  avg_val_loss: 0.5812  time: 42s
Epoch 1 - Score: 0.6982
INFO:__main__:Epoch 1 - Score: 0.6982
Epoch 1 - Save Best Score: 0.6982 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6982 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1919(0.5812) 
f1 score : 0.03846153846153846
recall score : 0.019736842105263157
precision score : 0.75
Epoch: [2][0/279] Elapsed 0m 0s (remain 2m 12s) Loss: 0.8994(0.8994) Grad: nan  LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 14s (remain 0m 24s) Loss: 0.6860(0.5798) Grad: 3.3558  LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.5454(0.5698) Grad: 1.3232  LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.7988(0.5686) Grad: 4.9928  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2286(0.2286) 


Epoch 2 - avg_train_loss: 0.5686  avg_val_loss: 0.5641  time: 43s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5686  avg_val_loss: 0.5641  time: 43s
Epoch 2 - Score: 0.7143
INFO:__main__:Epoch 2 - Score: 0.7143
Epoch 2 - Save Best Score: 0.7143 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7143 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1984(0.5641) 
f1 score : 0.17441860465116277
recall score : 0.09868421052631579
precision score : 0.75
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 54s) Loss: 0.6982(0.6982) Grad: nan  LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.4321(0.4821) Grad: 2.2458  LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 28s (remain 0m 10s) Loss: 0.4846(0.4876) Grad: 6.3854  LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.6636(0.4775) Grad: 13.2321  LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2776(0.2776) 


Epoch 3 - avg_train_loss: 0.4775  avg_val_loss: 0.5879  time: 42s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4775  avg_val_loss: 0.5879  time: 42s
Epoch 3 - Score: 0.7103
INFO:__main__:Epoch 3 - Score: 0.7103


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.9629(0.5879) 
f1 score : 0.4193548387096774
recall score : 0.34210526315789475
precision score : 0.5416666666666666
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 46s) Loss: 0.4534(0.4534) Grad: nan  LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.4304(0.3547) Grad: 5.3849  LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.4519(0.3366) Grad: 4.4519  LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.2111(0.3379) Grad: 3.5169  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3096(0.3096) 


Epoch 4 - avg_train_loss: 0.3379  avg_val_loss: 0.6437  time: 42s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3379  avg_val_loss: 0.6437  time: 42s
Epoch 4 - Score: 0.7002
INFO:__main__:Epoch 4 - Score: 0.7002


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.9840(0.6437) 
f1 score : 0.4377358490566038
recall score : 0.3815789473684211
precision score : 0.5132743362831859


Score: 0.7143
INFO:__main__:Score: 0.7143
ACC BEST Score: 0.7223
INFO:__main__:ACC BEST Score: 0.7223
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.17441860465116277
recall score : 0.09868421052631579
precision score : 0.75


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 52s) Loss: 1.1865(1.1865) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.6680(0.6650) Grad: 2.8744  LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 28s (remain 0m 10s) Loss: 0.5557(0.6369) Grad: 1.9063  LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.5718(0.6233) Grad: 0.7171  LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.3820(0.3820) 


Epoch 1 - avg_train_loss: 0.6233  avg_val_loss: 0.6034  time: 42s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6233  avg_val_loss: 0.6034  time: 42s
Epoch 1 - Score: 0.6942
INFO:__main__:Epoch 1 - Score: 0.6942
Epoch 1 - Save Best Score: 0.6942 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6942 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1091(0.6034) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 47s) Loss: 0.6880(0.6880) Grad: nan  LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.4915(0.5813) Grad: 0.9938  LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 28s (remain 0m 11s) Loss: 0.7837(0.5761) Grad: 6.7638  LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 39s (remain 0m 0s) Loss: 0.4390(0.5669) Grad: 5.1291  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4142(0.4142) 


Epoch 2 - avg_train_loss: 0.5669  avg_val_loss: 0.6038  time: 43s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5669  avg_val_loss: 0.6038  time: 43s
Epoch 2 - Score: 0.6600
INFO:__main__:Epoch 2 - Score: 0.6600


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.0572(0.6038) 
f1 score : 0.23529411764705882
recall score : 0.17105263157894737
precision score : 0.37681159420289856
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 46s) Loss: 0.4956(0.4956) Grad: nan  LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 14s (remain 0m 24s) Loss: 0.4749(0.4980) Grad: 1.8170  LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 28s (remain 0m 10s) Loss: 0.3120(0.4739) Grad: 10.2544  LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.7314(0.4714) Grad: 3.7289  LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3647(0.3647) 


Epoch 3 - avg_train_loss: 0.4714  avg_val_loss: 0.6483  time: 42s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4714  avg_val_loss: 0.6483  time: 42s
Epoch 3 - Score: 0.6740
INFO:__main__:Epoch 3 - Score: 0.6740


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.3473(0.6483) 
f1 score : 0.2831858407079646
recall score : 0.21052631578947367
precision score : 0.43243243243243246
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 42s) Loss: 0.3823(0.3823) Grad: nan  LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 14s (remain 0m 24s) Loss: 0.2297(0.3409) Grad: 3.4599  LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.2041(0.3313) Grad: 3.9132  LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.2620(0.3261) Grad: 5.8195  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4838(0.4838) 


Epoch 4 - avg_train_loss: 0.3261  avg_val_loss: 0.7308  time: 42s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3261  avg_val_loss: 0.7308  time: 42s
Epoch 4 - Score: 0.6620
INFO:__main__:Epoch 4 - Score: 0.6620


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.3237(0.7308) 
f1 score : 0.3777777777777778
recall score : 0.3355263157894737
precision score : 0.4322033898305085


Score: 0.6942
INFO:__main__:Score: 0.6942
ACC BEST Score: 0.6942
INFO:__main__:ACC BEST Score: 0.6942
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.0
recall score : 0.0
precision score : 0.0


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 51s) Loss: 1.0010(1.0010) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.7432(0.6476) Grad: 5.3742  LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 28s (remain 0m 10s) Loss: 0.6465(0.6250) Grad: 2.1510  LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.6201(0.6185) Grad: 3.5070  LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.3942(0.3942) 


Epoch 1 - avg_train_loss: 0.6185  avg_val_loss: 0.5809  time: 42s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6185  avg_val_loss: 0.5809  time: 42s
Epoch 1 - Score: 0.7022
INFO:__main__:Epoch 1 - Score: 0.7022
Epoch 1 - Save Best Score: 0.7022 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7022 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.9679(0.5809) 
f1 score : 0.10843373493975902
recall score : 0.05921052631578947
precision score : 0.6428571428571429
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 51s) Loss: 0.6045(0.6045) Grad: nan  LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 14s (remain 0m 24s) Loss: 0.7563(0.5940) Grad: 7.0468  LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 28s (remain 0m 10s) Loss: 0.4495(0.5744) Grad: 1.1647  LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.3323(0.5682) Grad: 3.9780  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.2796(0.2796) 


Epoch 2 - avg_train_loss: 0.5682  avg_val_loss: 0.5732  time: 43s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5682  avg_val_loss: 0.5732  time: 43s
Epoch 2 - Score: 0.7082
INFO:__main__:Epoch 2 - Score: 0.7082
Epoch 2 - Save Best Score: 0.7082 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7082 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.2115(0.5732) 
f1 score : 0.1420118343195266
recall score : 0.07894736842105263
precision score : 0.7058823529411765
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 57s) Loss: 0.7290(0.7290) Grad: nan  LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.4878(0.5150) Grad: 2.0024  LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.3867(0.4977) Grad: 6.5247  LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.3333(0.4890) Grad: 6.1864  LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.4124(0.4124) 


Epoch 3 - avg_train_loss: 0.4890  avg_val_loss: 0.5750  time: 42s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4890  avg_val_loss: 0.5750  time: 42s
Epoch 3 - Score: 0.7022
INFO:__main__:Epoch 3 - Score: 0.7022


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.9670(0.5750) 
f1 score : 0.4598540145985402
recall score : 0.4144736842105263
precision score : 0.5163934426229508
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 46s) Loss: 0.4155(0.4155) Grad: nan  LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.4741(0.3827) Grad: 8.9874  LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.2795(0.3674) Grad: 3.4743  LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.1776(0.3607) Grad: 5.0534  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.3590(0.3590) 


Epoch 4 - avg_train_loss: 0.3607  avg_val_loss: 0.6149  time: 42s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3607  avg_val_loss: 0.6149  time: 42s
Epoch 4 - Score: 0.7163
INFO:__main__:Epoch 4 - Score: 0.7163
Epoch 4 - Save Best Score: 0.7163 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7163 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.3861(0.6149) 
f1 score : 0.46387832699619774
recall score : 0.40131578947368424
precision score : 0.5495495495495496


Score: 0.7163
INFO:__main__:Score: 0.7163
ACC BEST Score: 0.7223
INFO:__main__:ACC BEST Score: 0.7223
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.46387832699619774
recall score : 0.40131578947368424
precision score : 0.5495495495495496


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 51s) Loss: 0.5557(0.5557) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.6978(0.6126) Grad: 4.4819  LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 28s (remain 0m 11s) Loss: 0.5635(0.6092) Grad: 0.8639  LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 39s (remain 0m 0s) Loss: 0.5093(0.6039) Grad: 1.1431  LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3290(0.3290) 


Epoch 1 - avg_train_loss: 0.6039  avg_val_loss: 0.5881  time: 43s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6039  avg_val_loss: 0.5881  time: 43s
Epoch 1 - Score: 0.7002
INFO:__main__:Epoch 1 - Score: 0.7002
Epoch 1 - Save Best Score: 0.7002 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7002 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1504(0.5881) 
f1 score : 0.03870967741935484
recall score : 0.019736842105263157
precision score : 1.0
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 53s) Loss: 0.6465(0.6465) Grad: nan  LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.6318(0.5719) Grad: 5.9233  LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 28s (remain 0m 10s) Loss: 0.4036(0.5656) Grad: 3.8739  LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.4905(0.5563) Grad: 1.5509  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4122(0.4122) 


Epoch 2 - avg_train_loss: 0.5563  avg_val_loss: 0.5706  time: 42s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5563  avg_val_loss: 0.5706  time: 42s
Epoch 2 - Score: 0.7344
INFO:__main__:Epoch 2 - Score: 0.7344
Epoch 2 - Save Best Score: 0.7344 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7344 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8712(0.5706) 
f1 score : 0.35294117647058826
recall score : 0.23684210526315788
precision score : 0.6923076923076923
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 47s) Loss: 0.6348(0.6348) Grad: nan  LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.4568(0.4474) Grad: 3.5948  LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 28s (remain 0m 10s) Loss: 0.3699(0.4386) Grad: 7.3801  LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.3455(0.4326) Grad: 4.8223  LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5478(0.5478) 


Epoch 3 - avg_train_loss: 0.4326  avg_val_loss: 0.6189  time: 43s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4326  avg_val_loss: 0.6189  time: 43s
Epoch 3 - Score: 0.6479
INFO:__main__:Epoch 3 - Score: 0.6479


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.6100(0.6189) 
f1 score : 0.4868035190615836
recall score : 0.5460526315789473
precision score : 0.43915343915343913
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 42s) Loss: 0.2625(0.2625) Grad: nan  LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 14s (remain 0m 24s) Loss: 0.1742(0.2710) Grad: 5.9622  LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.1569(0.2425) Grad: 6.4275  LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.2069(0.2478) Grad: 4.3685  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3825(0.3825) 


Epoch 4 - avg_train_loss: 0.2478  avg_val_loss: 0.6745  time: 42s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2478  avg_val_loss: 0.6745  time: 42s
Epoch 4 - Score: 0.6922
INFO:__main__:Epoch 4 - Score: 0.6922


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.0422(0.6745) 
f1 score : 0.43122676579925656
recall score : 0.3815789473684211
precision score : 0.49572649572649574


Score: 0.7344
INFO:__main__:Score: 0.7344
ACC BEST Score: 0.7364
INFO:__main__:ACC BEST Score: 0.7364
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.35294117647058826
recall score : 0.23684210526315788
precision score : 0.6923076923076923


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 49s) Loss: 0.6240(0.6240) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.7114(0.6283) Grad: 3.3369  LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 28s (remain 0m 10s) Loss: 0.5142(0.6093) Grad: 4.3221  LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.6265(0.6092) Grad: 3.2128  LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.3207(0.3207) 


Epoch 1 - avg_train_loss: 0.6092  avg_val_loss: 0.5858  time: 42s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6092  avg_val_loss: 0.5858  time: 42s
Epoch 1 - Score: 0.6962
INFO:__main__:Epoch 1 - Score: 0.6962
Epoch 1 - Save Best Score: 0.6962 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6962 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1376(0.5858) 
f1 score : 0.013071895424836602
recall score : 0.006578947368421052
precision score : 1.0
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 49s) Loss: 0.4375(0.4375) Grad: nan  LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.3984(0.5800) Grad: 3.6625  LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 28s (remain 0m 10s) Loss: 0.7759(0.5768) Grad: 8.4993  LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.6973(0.5753) Grad: 1.9510  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5309(0.5309) 


Epoch 2 - avg_train_loss: 0.5753  avg_val_loss: 0.5962  time: 43s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5753  avg_val_loss: 0.5962  time: 43s
Epoch 2 - Score: 0.6761
INFO:__main__:Epoch 2 - Score: 0.6761


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.6551(0.5962) 
f1 score : 0.4888888888888889
recall score : 0.506578947368421
precision score : 0.4723926380368098
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 47s) Loss: 0.4678(0.4678) Grad: nan  LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.4060(0.5380) Grad: 1.4714  LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 28s (remain 0m 10s) Loss: 0.4973(0.5298) Grad: 4.1474  LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 39s (remain 0m 0s) Loss: 0.5635(0.5181) Grad: 2.2734  LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.3702(0.3702) 


Epoch 3 - avg_train_loss: 0.5181  avg_val_loss: 0.5518  time: 43s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5181  avg_val_loss: 0.5518  time: 43s
Epoch 3 - Score: 0.7002
INFO:__main__:Epoch 3 - Score: 0.7002
Epoch 3 - Save Best Score: 0.7002 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7002 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8100(0.5518) 
f1 score : 0.40160642570281124
recall score : 0.32894736842105265
precision score : 0.5154639175257731
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 56s) Loss: 0.2476(0.2476) Grad: nan  LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.2898(0.4290) Grad: 4.8963  LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 28s (remain 0m 11s) Loss: 0.5562(0.4248) Grad: 7.8980  LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.3950(0.4242) Grad: 6.4167  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3923(0.3923) 


Epoch 4 - avg_train_loss: 0.4242  avg_val_loss: 0.5650  time: 43s
INFO:__main__:Epoch 4 - avg_train_loss: 0.4242  avg_val_loss: 0.5650  time: 43s
Epoch 4 - Score: 0.6922
INFO:__main__:Epoch 4 - Score: 0.6922


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7844(0.5650) 
f1 score : 0.4631578947368421
recall score : 0.4342105263157895
precision score : 0.49624060150375937


Score: 0.7002
INFO:__main__:Score: 0.7002
ACC BEST Score: 0.7203
INFO:__main__:ACC BEST Score: 0.7203
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.40160642570281124
recall score : 0.32894736842105265
precision score : 0.5154639175257731


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/279] Elapsed 0m 0s (remain 2m 3s) Loss: 0.6816(0.6816) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.7910(0.6394) Grad: 6.4642  LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.5898(0.6202) Grad: 4.3145  LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.5322(0.6127) Grad: 4.9400  LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3155(0.3155) 


Epoch 1 - avg_train_loss: 0.6127  avg_val_loss: 0.5939  time: 42s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6127  avg_val_loss: 0.5939  time: 42s
Epoch 1 - Score: 0.6942
INFO:__main__:Epoch 1 - Score: 0.6942
Epoch 1 - Save Best Score: 0.6942 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6942 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1924(0.5939) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
Epoch: [2][0/279] Elapsed 0m 0s (remain 2m 15s) Loss: 0.6875(0.6875) Grad: nan  LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.6455(0.5942) Grad: 4.4060  LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 28s (remain 0m 10s) Loss: 0.4644(0.5746) Grad: 1.7761  LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.4023(0.5692) Grad: 1.1812  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.1965(0.1965) 


Epoch 2 - avg_train_loss: 0.5692  avg_val_loss: 0.5996  time: 42s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5692  avg_val_loss: 0.5996  time: 42s
Epoch 2 - Score: 0.7022
INFO:__main__:Epoch 2 - Score: 0.7022
Epoch 2 - Save Best Score: 0.7022 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7022 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.4674(0.5996) 
f1 score : 0.08641975308641975
recall score : 0.046052631578947366
precision score : 0.7
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 53s) Loss: 0.4414(0.4414) Grad: nan  LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.6196(0.5128) Grad: 1.9952  LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.3013(0.5074) Grad: 4.2709  LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.4995(0.4988) Grad: 3.5271  LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2871(0.2871) 


Epoch 3 - avg_train_loss: 0.4988  avg_val_loss: 0.5950  time: 42s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4988  avg_val_loss: 0.5950  time: 42s
Epoch 3 - Score: 0.7082
INFO:__main__:Epoch 3 - Score: 0.7082
Epoch 3 - Save Best Score: 0.7082 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7082 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1815(0.5950) 
f1 score : 0.39330543933054396
recall score : 0.3092105263157895
precision score : 0.5402298850574713
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 52s) Loss: 0.3577(0.3577) Grad: nan  LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.5361(0.4141) Grad: 5.4332  LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 28s (remain 0m 11s) Loss: 0.3525(0.4031) Grad: 5.4429  LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.3196(0.3944) Grad: 2.6560  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2827(0.2827) 


Epoch 4 - avg_train_loss: 0.3944  avg_val_loss: 0.6313  time: 43s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3944  avg_val_loss: 0.6313  time: 43s
Epoch 4 - Score: 0.6962
INFO:__main__:Epoch 4 - Score: 0.6962


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.2377(0.6313) 
f1 score : 0.4258555133079847
recall score : 0.3684210526315789
precision score : 0.5045045045045045


Score: 0.7082
INFO:__main__:Score: 0.7082
ACC BEST Score: 0.7123
INFO:__main__:ACC BEST Score: 0.7123
Score: 0.7105
INFO:__main__:Score: 0.7105
ACC BEST Score: 0.7117
INFO:__main__:ACC BEST Score: 0.7117


f1 score : 0.39330543933054396
recall score : 0.3092105263157895
precision score : 0.5402298850574713
f1 score : 0.35135135135135137
recall score : 0.2560735390676297
precision score : 0.5595408895265424


In [None]:
from google.colab import runtime
runtime.unassign()