In [None]:
# Mount Google Drive at /content/drive; the project paths used below live under it.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
!pip install transformers
!pip install datasets
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m90.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m100.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.4
Looking in indexes: https://pypi.org/simple, htt

In [None]:
# Print the GPU attached to this runtime (model, memory, driver / CUDA version).
!nvidia-smi

Tue Apr  4 17:25:17 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   31C    P0    44W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:

import os
import gc
import math
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import re
import html

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam, SGD, AdamW, RAdam
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,GroupKFold
from sklearn.metrics import log_loss,f1_score, recall_score, accuracy_score, precision_score

from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
# Project layout on Google Drive: inputs, shared outputs, the checkpoint directory
# that CFG.model points at, and this experiment's own output folder.
DIR = "/content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測"
INPUT_DIR = os.path.join(DIR, "input")
OUTPUT_DIR = os.path.join(DIR, "output")
CUSTOM_MODEL_DIR = os.path.join(OUTPUT_DIR, 'clrp_deberta_v3_base_epoch20')
# Keep the trailing slash: later cells build file names as `OUTPUT_EXP_DIR + name`.
OUTPUT_EXP_DIR = DIR + '/output/EXP018/'
# exist_ok=True keeps the cell re-runnable without a separate existence check.
os.makedirs(OUTPUT_EXP_DIR, exist_ok=True)

In [None]:

# ====================================================
# CFG
# ====================================================
class CFG:
    """Experiment settings, read as plain class attributes by the rest of the notebook."""

    # ---- run control ----
    debug = False
    train = True
    seed = 42
    print_freq = 100    # steps between progress lines in train_fn / valid_fn
    num_workers = 4     # DataLoader worker processes

    # ---- backbone ----
    model_name = "microsoft/deberta-v3-base"    # used to name the per-fold checkpoint files
    # Alternative backbones kept for reference (set via `model=`):
    #   microsoft/deberta-base, roberta-base, roberta-large, roberta-large-mnli,
    #   xlnet-large-cased, albert-xxlarge-v2, microsoft/deberta-large,
    #   microsoft/deberta-v3-large, microsoft/deberta-v2-xlarge,
    #   funnel-transformer/large, funnel-transformer/medium, albert-base-v2,
    #   albert-large-v2, google/electra-large-discriminator,
    #   google/electra-base-discriminator, facebook/bart-large-mnli,
    #   facebook/bart-large, facebook/bart-base
    model = CUSTOM_MODEL_DIR                     # checkpoint directory passed to from_pretrained
    gradient_checkpointing = True
    freezing = True                              # freeze embeddings + first two encoder layers
    fc_dropout = 0.2
    target_size = 1                              # output width of the linear head
    max_len = 256                                # overwritten once token lengths are measured

    # ---- optimisation ----
    apex = True                                  # enables GradScaler + autocast in train_fn
    epochs = 6
    batch_size = 16
    gradient_accumulation_steps = 1
    max_grad_norm = 1000
    encoder_lr = 2e-5
    decoder_lr = 2e-5
    min_lr = 1e-6
    eps = 1e-6
    betas = (0.9, 0.999)
    weight_decay = 0.01
    nth_awp_start_epoch = 1

    # ---- learning-rate schedule ----
    scheduler = 'cosine'  # ['linear', 'cosine']
    batch_scheduler = True                       # step the scheduler after every optimizer step
    num_cycles = 0.5
    num_warmup_steps = 0

    # ---- cross-validation ----
    n_fold = 10
    trn_fold = list(range(10))                   # folds that are actually trained


# Debug runs are cut down to two epochs on the first two folds.
if CFG.debug:
    CFG.epochs, CFG.trn_fold = 2, [0, 1]

In [None]:
def get_score(labels, outputs):
    """Print F1, recall and precision at a 0.5 cut-off and return the accuracy.

    Args:
        labels: ground-truth binary labels.
        outputs: array of predicted probabilities (must support `>` and `.astype`).

    Returns:
        Accuracy of `outputs > 0.5` against `labels`.
    """
    hard_preds = (outputs > 0.5).astype(int)
    f_val = f1_score(labels, hard_preds)
    r_val = recall_score(labels, hard_preds)
    p_val = precision_score(labels, hard_preds)
    for caption, value in (("f1 score", f_val), ("recall score", r_val), ("precision score", p_val)):
        print(f"{caption} : {value}")
    return accuracy_score(labels, hard_preds)

def get_acc_score(labels, outputs):
    """Return the best accuracy reachable by sweeping the decision threshold.

    Thresholds 0.10, 0.11, ... (step 0.01, below 0.80) are tried in order; the first
    one with the highest accuracy wins. If no threshold scores above 0, the accuracy
    at 0.5 is returned.

    Args:
        labels: ground-truth binary labels.
        outputs: array of predicted probabilities.
    """
    def _accuracy_at(cutoff):
        return accuracy_score(labels, (outputs > cutoff).astype(int))

    top_acc, top_cutoff = 0, 0.5
    for raw_cutoff in np.arange(0.1, 0.80, 0.01):
        cutoff = np.round(raw_cutoff, 2)
        acc = _accuracy_at(cutoff)
        if acc > top_acc:
            top_acc, top_cutoff = acc, cutoff
    return _accuracy_at(top_cutoff)


def get_logger(filename=OUTPUT_EXP_DIR+'train'):
    """Return this module's logger, emitting bare messages to the console and to a file.

    Args:
        filename: path of the log file without extension; ".log" is appended.

    Returns:
        The `logging.Logger` named after this module, at INFO level, with exactly one
        stream handler and one file handler attached.
    """
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # getLogger returns the same object on every call, so re-running this cell would
    # otherwise stack a second pair of handlers and print every message twice.
    for stale in list(logger.handlers):
        logger.removeHandler(stale)
        stale.close()
    # Keep records away from the root logger, which would echo each message again
    # with an "INFO:__main__:" prefix.
    logger.propagate = False
    plain = Formatter("%(message)s")
    for handler in (StreamHandler(), FileHandler(filename=f"{filename}.log")):
        handler.setFormatter(plain)
        logger.addHandler(handler)
    return logger

LOGGER = get_logger()

def seed_everything(seed=CFG.seed):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) RNGs and request deterministic cuDNN.

    Args:
        seed: integer seed applied to every generator; also exported as PYTHONHASHSEED.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(seed=CFG.seed)

In [None]:
def freeze(module):
    """
    Turns off gradient tracking for every parameter of `module`.
    """
    for param in module.parameters():
        param.requires_grad_(False)
        
def get_freezed_parameters(module):
    """
    Lists the names of the parameters of `module` that do not require gradients,
    in `named_parameters()` order.
    """
    return [
        name
        for name, parameter in module.named_parameters()
        if not parameter.requires_grad
    ]

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """
    Registers a bitsandbytes optimizer override for the `weight` of each embedding
    found on `embeddings_path`.
    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930

    Args:
        embeddings_path: object inspected for the attributes `word_embeddings`,
            `position_embeddings` and `token_type_embeddings`; missing ones are skipped.
        optim_bits: value stored under 'optim_bits' in the override.

    Raises:
        ImportError: if an embedding attribute exists but bitsandbytes is not installed.
    """
    for embedding_type in ("word", "position", "token_type"):
        attr_name = f"{embedding_type}_embeddings"
        if not hasattr(embeddings_path, attr_name):
            continue
        # `bnb` was never imported at module level, so the call below used to fail
        # with NameError. Import it here, and only when it is actually needed, so the
        # notebook keeps working without bitsandbytes as long as this path is unused.
        import bitsandbytes as bnb
        bnb.optim.GlobalOptimManager.get_instance().register_module_override(
            getattr(embeddings_path, attr_name), 'weight', {'optim_bits': optim_bits}
        )

In [None]:
import pandas as pd
import numpy as np

# Read the three competition tables from INPUT_DIR: training data, test data and
# the sample submission.
train, test, sample_sub = (
    pd.read_csv(os.path.join(INPUT_DIR, csv_name))
    for csv_name in ("train_data.csv", "test_data.csv", "submission.csv")
)

# Preview each table: its shape followed by the first three rows.
for table in (train, test, sample_sub):
    print(table.shape)
    display(table.head(3))

(4974, 6)


Unnamed: 0,id,title,year,abstract,keywords,y
0,1,Hierarchical Adversarially Learned Inference,2018,We propose a novel hierarchical generative mod...,"generative, hierarchical, unsupervised, semisu...",0
1,2,Learning to Compute Word Embeddings On the Fly,2018,Words in natural language follow a Zipfian dis...,"NLU, word embeddings, representation learning",0
2,3,Graph2Seq: Scalable Learning Dynamics for Graphs,2018,Neural networks are increasingly used as a gen...,,0


(6393, 5)


Unnamed: 0,id,title,year,abstract,keywords
0,1,StyleAlign: Analysis and Applications of Align...,2022,"In this paper, we perform an in-depth study of...","StyleGAN, transfer learning, fine tuning, mode..."
1,2,Embedding a random graph via GNN: mean-field i...,2021,We develop a theory for embedding a random gra...,"Graph neural network, graph embedding, multi-r..."
2,3,BBRefinement: an universal scheme to improve p...,2021,We present a conceptually simple yet powerful ...,"object detection, deep neural networks, refine..."


(6393, 2)


Unnamed: 0,id,y
0,1,0
1,2,0
2,3,0


In [None]:
# Model input is "<title>[SEP]<abstract>". Missing titles/abstracts are treated as
# empty strings: string concatenation with NaN would make the whole text NaN, and the
# Dataset classes pass `texts` to the tokenizer without any further cleaning.
train["texts"] = train["title"].fillna("") + "[SEP]" + train["abstract"].fillna("")

In [None]:
# Give every training row the id of the stratified fold in which it is validation data.
skf = StratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)
fold_of_row = np.full(len(train), -1, dtype=int)
for fold_id, (_, valid_idx) in enumerate(skf.split(train, train["y"])):
    fold_of_row[valid_idx] = fold_id
train["kfold"] = fold_of_row

# Debug mode: show fold sizes, shrink to 500 sampled rows, show fold sizes again.
if CFG.debug:
    display(train.groupby('kfold').size())
    train = train.sample(n=500, random_state=0).reset_index(drop=True)
    display(train.groupby('kfold').size())

In [None]:
# ====================================================
# tokenizer
# ====================================================
# Load the tokenizer stored with the CFG.model checkpoint, expose it on CFG for the
# Dataset helpers, and save a copy inside this experiment's output folder.
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
CFG.tokenizer = tokenizer
tokenizer.save_pretrained(f"{OUTPUT_EXP_DIR}tokenizer/")

In [None]:
# ====================================================
# Define max_len
# ====================================================
# Token count of every training text (without special tokens); the longest one,
# plus room for the special tokens, becomes the padding/truncation length.
texts = train['texts'].fillna("").values
lengths = [
    len(tokenizer(text, add_special_tokens=False)['input_ids'])
    for text in tqdm(texts, total=len(train))
]
CFG.max_len = max(lengths) + 3 # cls + sep + sep
LOGGER.info(f"max_len: {CFG.max_len}")

100%|██████████| 4974/4974 [00:04<00:00, 1215.82it/s]
max_len: 522
INFO:__main__:max_len: 522


In [None]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    """Tokenize `text` and return the encoding with every field as a long tensor.

    Args:
        cfg: object providing `tokenizer` (callable) and `max_len`; the text is padded
            and truncated to `max_len` with special tokens added.
        text: the string to encode.

    Returns:
        The mapping produced by the tokenizer, with each value replaced in place by a
        `torch.long` tensor.
    """
    encoded = cfg.tokenizer(
        text,
        add_special_tokens=True,
        max_length=cfg.max_len,
        padding="max_length",
        return_offsets_mapping=False,
        truncation=True,
    )
    for field in list(encoded.keys()):
        encoded[field] = torch.tensor(encoded[field], dtype=torch.long)
    return encoded


class TrainDataset(Dataset):
    """Training dataset: tokenized `texts` paired with half-precision `y` labels."""

    def __init__(self, cfg, df):
        # cfg is forwarded to prepare_input; df must provide 'texts' and 'y' columns.
        self.cfg = cfg
        self.inputs, self.labels = df['texts'].values, df['y'].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        text, target = self.inputs[item], self.labels[item]
        encoded = prepare_input(self.cfg, text)
        return encoded, torch.tensor(target, dtype=torch.half)

def collate(inputs):
    """Trim every tensor of the batch to the longest unpadded sequence, in place.

    The length is the largest row sum of `inputs["attention_mask"]`; the same mapping
    object is returned. An identical `collate` is defined again later in this cell and
    rebinds the name.
    """
    longest = int(inputs["attention_mask"].sum(axis=1).max())
    for field in inputs.keys():
        inputs[field] = inputs[field][:, :longest]
    return inputs

class ValidDataset(Dataset):
    """Validation dataset: tokenized `texts` paired with float32 `y` labels."""

    def __init__(self, cfg, df):
        # cfg is forwarded to prepare_input; df must provide 'texts' and 'y' columns.
        self.cfg = cfg
        self.inputs = df['texts'].values
        self.labels = df['y'].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        encoded = prepare_input(self.cfg, self.inputs[item])
        target = torch.tensor(self.labels[item], dtype=torch.float)
        return encoded, target

def collate(inputs):
    """Cut the padding shared by the whole batch off every tensor, in place.

    Args:
        inputs: mapping of 2-D tensors that includes "attention_mask".

    Returns:
        The same mapping, each tensor sliced to the largest attention-mask row sum.
    """
    valid_lengths = inputs["attention_mask"].sum(axis=1)
    cut = int(valid_lengths.max())
    inputs.update({name: tensor[:, :cut] for name, tensor in inputs.items()})
    return inputs

#collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    """Averages token embeddings over the positions whose attention mask is non-zero."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask over the hidden dimension so masked positions contribute 0.
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_total = (last_hidden_state * mask).sum(dim=1)
        # Clamp the token count so a fully masked row divides by 1e-9 instead of 0.
        token_count = mask.sum(dim=1).clamp(min=1e-9)
        return masked_total / token_count

class MaxPooling(nn.Module):
    """Takes the per-feature maximum over tokens, with masked positions set to -1e4."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        is_padding = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float() == 0
        # masked_fill returns a new tensor, so the caller's hidden states stay untouched.
        filled = last_hidden_state.masked_fill(is_padding, -1e4)
        return filled.max(dim=1).values
    

class CustomModel(nn.Module):
    """Transformer backbone + mean pooling + LayerNorm + linear head producing logits.

    Args:
        cfg: configuration object; reads `model`, `gradient_checkpointing`, `freezing`
            and `target_size`.
        config_path: optional path to a config object saved with `torch.save`. When
            None, the config is built from `cfg.model` with the dropout fields zeroed.
        pretrained: load backbone weights from `cfg.model` when True; otherwise build
            the architecture from the config with freshly initialised weights.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            # Zero the dropout-related config fields.
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            # NOTE(review): torch.load unpickles the file; only use config files you
            # created yourself (train_loop writes one as config.pth).
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            # AutoModel cannot be instantiated directly; from_config builds the
            # architecture described by the config without loading weights.
            self.model = AutoModel.from_config(self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # Freezing
        if cfg.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            # One-shot iterator over the parameters that are still trainable.
            cfg.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())

        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)
        self._init_weights(self.fc)
        self.layer_norm1 = nn.LayerNorm(self.config.hidden_size)
        self.sig = nn.Sigmoid()  # not applied in forward(); callers apply the sigmoid

    def _init_weights(self, module):
        """Initialise `module` using the backbone's `initializer_range` as the normal std."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Run the backbone and mean-pool its first output using the attention mask."""
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        """Return raw logits (no sigmoid) for a batch of tokenized inputs."""
        feature = self.feature(inputs)
        feature = self.layer_norm1(feature)
        output = self.fc(feature)
        #output = self.sig(output)
        return output

In [None]:

# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Keeps the latest value together with a weighted running sum, count and mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero every statistic."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record `val` observed `n` times and refresh the running mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


def asMinutes(s):
    """Format a duration in seconds as '<minutes>m <seconds>s' (seconds truncated)."""
    minutes = math.floor(s / 60)
    leftover = s - minutes * 60
    return "{}m {}s".format(minutes, int(leftover))


def timeSince(since, percent):
    """Return '<elapsed> (remain <remaining>)' for work that started at `since`.

    Args:
        since: start time as returned by `time.time()`.
        percent: fraction of the work completed so far (must be non-zero); the
            remaining time is extrapolated linearly from it.
    """
    elapsed = time.time() - since
    remaining = elapsed / (percent) - elapsed
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(remaining))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    """Train `model` for one epoch and return the sample-weighted average loss.

    Args:
        fold: fold id (not used inside the function; kept for the call signature).
        train_loader: yields `(inputs, labels)` batches.
        model: maps the collated inputs to logits.
        criterion: loss applied to sigmoid probabilities and labels.
        optimizer: optimizer stepped every `CFG.gradient_accumulation_steps` batches.
        epoch: zero-based epoch index, used for logging only.
        scheduler: LR scheduler, stepped after each optimizer step when
            `CFG.batch_scheduler` is set.
        device: device the batch is moved to.

    Returns:
        Average of the per-batch losses (after the accumulation scaling), weighted by
        batch size.
    """
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = time.time()
    accum_steps = CFG.gradient_accumulation_steps
    grad_norm = float('nan')  # reported until the first optimizer step computes a norm
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
        # Under autocast the logits can be half precision; take the sigmoid and the
        # loss in float32 so saturated probabilities do not round to exactly 0 or 1.
        # reshape(-1) (instead of squeeze) also keeps a batch of one a 1-D tensor.
        loss = criterion(y_preds.float().sigmoid().reshape(-1), labels.float().reshape(-1))
        if accum_steps > 1:
            loss = loss / accum_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        if (step + 1) % accum_steps == 0:
            # Unscale and clip only right before the optimizer step: GradScaler allows
            # one unscale_ per optimizer between updates, and clipping partially
            # accumulated gradients would distort the accumulated sum.
            scaler.unscale_(optimizer)
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            if CFG.batch_scheduler:
                scheduler.step()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader),
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_last_lr()[0]))

    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    """Evaluate `model` on `valid_loader`.

    Args:
        valid_loader: yields `(inputs, labels)` batches.
        model: maps the collated inputs to logits.
        criterion: loss applied to sigmoid probabilities and labels.
        device: device the batch is moved to.

    Returns:
        `(average loss weighted by batch size, 1-D numpy array of probabilities)`.
    """
    losses = AverageMeter()
    model.eval()
    preds = []
    start = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            probs = model(inputs).float().sigmoid()
            # The validation loss is reported as-is: gradient accumulation only
            # rescales the training loss for backward and has no meaning here.
            loss = criterion(probs.reshape(-1), labels.float().reshape(-1))
        losses.update(loss.item(), batch_size)
        preds.append(probs.to('cpu').numpy())
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    # Stack the per-batch arrays and flatten them to one probability per entry.
    predictions = np.concatenate(preds).reshape(-1)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    """Predict probabilities for every batch of `test_loader`.

    Args:
        test_loader: yields tokenized input mappings (no labels).
        model: maps the collated inputs to logits; moved to `device` and set to eval mode.
        device: device used for inference.

    Returns:
        Numpy array with the per-batch sigmoid outputs concatenated along the first axis.
    """
    model.eval()
    model.to(device)
    batch_probs = []
    for inputs in tqdm(test_loader, total=len(test_loader)):
        inputs = collate(inputs)
        for field in inputs.keys():
            inputs[field] = inputs[field].to(device)
        with torch.no_grad():
            logits = model(inputs)
        batch_probs.append(logits.sigmoid().to('cpu').numpy())
    return np.concatenate(batch_probs)

In [None]:

# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    """Train and validate one cross-validation fold.

    Rows with `kfold == fold` form the validation set and all other rows the training
    set. After every epoch the model is scored on the validation set; whenever the
    score improves, the weights and validation predictions are saved to
    `<OUTPUT_EXP_DIR><model_name>_fold<fold>_best.pth`.

    Args:
        folds: DataFrame with 'texts', 'y' and 'kfold' columns.
        fold: id of the fold held out for validation.

    Returns:
        The validation rows with an added 'pred' column holding the predictions of the
        best-scoring epoch.
    """
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['kfold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['kfold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds['y'].values

    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = ValidDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size*2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_EXP_DIR+'config.pth')
    model.to(device)

    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        """Build three parameter groups: decayed backbone weights, non-decayed backbone
        biases/LayerNorm parameters, and everything outside the backbone (the head)."""
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr,
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    # The bare name `AdamW` is imported from both torch.optim and transformers in this
    # notebook, and the transformers one (deprecated) wins. Name the PyTorch optimizer
    # explicitly; every parameter group sets its own weight_decay.
    optimizer = optim.AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)

    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        """Create the linear or cosine warmup schedule selected by `cfg.scheduler`."""
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler

    # Count the optimizer steps that really happen: the train loader drops the last
    # partial batch, and one step is taken per `gradient_accumulation_steps` batches.
    steps_per_epoch = len(train_loader) // CFG.gradient_accumulation_steps
    num_train_steps = steps_per_epoch * CFG.epochs
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.BCELoss()

    best_score = -1.
    best_predictions = None
    best_path = OUTPUT_EXP_DIR+f"{CFG.model_name.replace('/', '-')}_fold{fold}_best.pth"

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)

        # scoring
        score = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')

        if best_score < score:
            best_score = score
            best_predictions = predictions
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        best_path)

    # The best epoch's predictions are kept in memory, so the checkpoint (which also
    # holds the full model weights) does not have to be read back just for them.
    valid_folds['pred'] = best_predictions

    # Drop the references first so empty_cache can actually release this fold's memory.
    del model, optimizer, scheduler
    torch.cuda.empty_cache()
    gc.collect()

    return valid_folds

In [None]:
if __name__ == '__main__':

    def get_result(oof_df):
        """Log the accuracy at threshold 0.5 and the best-threshold accuracy of `oof_df`.

        `oof_df` must provide the true labels in 'y' and the predictions in 'pred'.
        """
        labels = oof_df['y'].values
        preds = oof_df['pred'].values
        score = get_score(labels, preds)
        acc_score = get_acc_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}')
        LOGGER.info(f'ACC BEST Score: {acc_score:<.4f}')

    if CFG.train:
        # Collect the per-fold frames and concatenate once at the end instead of
        # growing a DataFrame (starting from an empty one) inside the loop.
        fold_frames = []
        for fold in range(CFG.n_fold):
            if fold not in CFG.trn_fold:
                continue
            _oof_df = train_loop(train, fold)
            fold_frames.append(_oof_df)
            LOGGER.info(f"========== fold: {fold} result ==========")
            get_result(_oof_df)
        oof_df = pd.concat(fold_frames).reset_index(drop=True) if fold_frames else pd.DataFrame()
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_EXP_DIR+'oof_df.pkl')

DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base_epoch20",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "torch_dt

Epoch: [1][0/279] Elapsed 0m 3s (remain 18m 5s) Loss: 0.9507(0.9507) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 17s (remain 0m 31s) Loss: 0.6133(0.6404) Grad: 0.6974  LR: 0.00001982  
Epoch: [1][200/279] Elapsed 0m 31s (remain 0m 12s) Loss: 0.5283(0.6267) Grad: 4.2012  LR: 0.00001930  
Epoch: [1][278/279] Elapsed 0m 42s (remain 0m 0s) Loss: 0.3582(0.6167) Grad: 3.4611  LR: 0.00001867  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2867(0.2867) 


Epoch 1 - avg_train_loss: 0.6167  avg_val_loss: 0.5757  time: 46s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6167  avg_val_loss: 0.5757  time: 46s
Epoch 1 - Score: 0.6948
INFO:__main__:Epoch 1 - Score: 0.6948
Epoch 1 - Save Best Score: 0.6948 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6948 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.2039(0.5757) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 37s) Loss: 0.3621(0.3621) Grad: nan  LR: 0.00001866  
Epoch: [2][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.5654(0.5614) Grad: 2.3573  LR: 0.00001757  
Epoch: [2][200/279] Elapsed 0m 28s (remain 0m 10s) Loss: 0.4316(0.5702) Grad: 1.5273  LR: 0.00001623  
Epoch: [2][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.6108(0.5709) Grad: 1.3568  LR: 0.00001502  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.3704(0.3704) 


Epoch 2 - avg_train_loss: 0.5709  avg_val_loss: 0.5596  time: 43s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5709  avg_val_loss: 0.5596  time: 43s
Epoch 2 - Score: 0.7068
INFO:__main__:Epoch 2 - Score: 0.7068
Epoch 2 - Save Best Score: 0.7068 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7068 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8930(0.5596) 
f1 score : 0.4251968503937008
recall score : 0.35526315789473684
precision score : 0.5294117647058824
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 36s) Loss: 0.4480(0.4480) Grad: nan  LR: 0.00001501  
Epoch: [3][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.9341(0.5126) Grad: 12.8159  LR: 0.00001331  
Epoch: [3][200/279] Elapsed 0m 28s (remain 0m 11s) Loss: 0.3779(0.4967) Grad: 3.9355  LR: 0.00001149  
Epoch: [3][278/279] Elapsed 0m 39s (remain 0m 0s) Loss: 0.7231(0.4926) Grad: 9.4514  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.4519(0.4519) 


Epoch 3 - avg_train_loss: 0.4926  avg_val_loss: 0.5892  time: 43s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4926  avg_val_loss: 0.5892  time: 43s
Epoch 3 - Score: 0.6365
INFO:__main__:Epoch 3 - Score: 0.6365


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7468(0.5892) 
f1 score : 0.4723032069970846
recall score : 0.5328947368421053
precision score : 0.42408376963350786
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 32s) Loss: 0.4749(0.4749) Grad: nan  LR: 0.00001002  
Epoch: [4][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.1315(0.3287) Grad: 5.3988  LR: 0.00000816  
Epoch: [4][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.1685(0.2969) Grad: 4.6207  LR: 0.00000636  
Epoch: [4][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.2028(0.2819) Grad: 14.9205  LR: 0.00000504  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.1507(0.1507) 


Epoch 4 - avg_train_loss: 0.2819  avg_val_loss: 0.7889  time: 42s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2819  avg_val_loss: 0.7889  time: 42s
Epoch 4 - Score: 0.6867
INFO:__main__:Epoch 4 - Score: 0.6867


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 2.1250(0.7889) 
f1 score : 0.36585365853658536
recall score : 0.29605263157894735
precision score : 0.4787234042553192
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 31s) Loss: 0.0469(0.0469) Grad: nan  LR: 0.00000503  
Epoch: [5][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.0210(0.0963) Grad: 0.9223  LR: 0.00000350  
Epoch: [5][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0567(0.0887) Grad: 5.7559  LR: 0.00000220  
Epoch: [5][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.0424(0.0846) Grad: 3.6667  LR: 0.00000137  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.1269(0.1269) 


Epoch 5 - avg_train_loss: 0.0846  avg_val_loss: 0.9844  time: 42s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0846  avg_val_loss: 0.9844  time: 42s
Epoch 5 - Score: 0.6807
INFO:__main__:Epoch 5 - Score: 0.6807


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 2.7160(0.9844) 
f1 score : 0.3908045977011494
recall score : 0.3355263157894737
precision score : 0.46788990825688076
Epoch: [6][0/279] Elapsed 0m 0s (remain 1m 39s) Loss: 0.0120(0.0120) Grad: nan  LR: 0.00000136  
Epoch: [6][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.0159(0.0478) Grad: 0.8157  LR: 0.00000057  
Epoch: [6][200/279] Elapsed 0m 28s (remain 0m 10s) Loss: 0.0175(0.0414) Grad: 1.5811  LR: 0.00000012  
Epoch: [6][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.2893(0.0380) Grad: 3.9612  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.2238(0.2238) 


Epoch 6 - avg_train_loss: 0.0380  avg_val_loss: 1.0080  time: 43s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0380  avg_val_loss: 1.0080  time: 43s
Epoch 6 - Score: 0.6667
INFO:__main__:Epoch 6 - Score: 0.6667


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 2.4127(1.0080) 
f1 score : 0.4028776978417266
recall score : 0.3684210526315789
precision score : 0.4444444444444444


Score: 0.7068
INFO:__main__:Score: 0.7068
ACC BEST Score: 0.7329
INFO:__main__:ACC BEST Score: 0.7329
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base_epoch20",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input"

f1 score : 0.4251968503937008
recall score : 0.35526315789473684
precision score : 0.5294117647058824


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base_epoch20 were not used when initializing DebertaV2Model: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 56s) Loss: 0.7271(0.7271) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.6699(0.6109) Grad: 3.2351  LR: 0.00001982  
Epoch: [1][200/279] Elapsed 0m 28s (remain 0m 11s) Loss: 0.7285(0.6081) Grad: 4.5172  LR: 0.00001930  
Epoch: [1][278/279] Elapsed 0m 39s (remain 0m 0s) Loss: 0.7979(0.6094) Grad: 7.5025  LR: 0.00001867  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2774(0.2774) 


Epoch 1 - avg_train_loss: 0.6094  avg_val_loss: 0.6003  time: 43s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6094  avg_val_loss: 0.6003  time: 43s
Epoch 1 - Score: 0.6928
INFO:__main__:Epoch 1 - Score: 0.6928
Epoch 1 - Save Best Score: 0.6928 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6928 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.3387(0.6003) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 46s) Loss: 0.6533(0.6533) Grad: nan  LR: 0.00001866  
Epoch: [2][100/279] Elapsed 0m 14s (remain 0m 24s) Loss: 0.6230(0.5696) Grad: 6.3221  LR: 0.00001757  
Epoch: [2][200/279] Elapsed 0m 28s (remain 0m 11s) Loss: 0.2930(0.5637) Grad: 1.0381  LR: 0.00001623  
Epoch: [2][278/279] Elapsed 0m 39s (remain 0m 0s) Loss: 0.6172(0.5640) Grad: 2.0008  LR: 0.00001502  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5672(0.5672) 


Epoch 2 - avg_train_loss: 0.5640  avg_val_loss: 0.6075  time: 43s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5640  avg_val_loss: 0.6075  time: 43s
Epoch 2 - Score: 0.6827
INFO:__main__:Epoch 2 - Score: 0.6827


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7167(0.6075) 
f1 score : 0.5240963855421685
recall score : 0.5686274509803921
precision score : 0.4860335195530726
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 40s) Loss: 0.5430(0.5430) Grad: nan  LR: 0.00001501  
Epoch: [3][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.4978(0.4950) Grad: 4.9232  LR: 0.00001331  
Epoch: [3][200/279] Elapsed 0m 28s (remain 0m 11s) Loss: 0.6777(0.4875) Grad: 9.6276  LR: 0.00001149  
Epoch: [3][278/279] Elapsed 0m 39s (remain 0m 0s) Loss: 0.4883(0.4817) Grad: 3.2884  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4596(0.4596) 


Epoch 3 - avg_train_loss: 0.4817  avg_val_loss: 0.5770  time: 43s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4817  avg_val_loss: 0.5770  time: 43s
Epoch 3 - Score: 0.6948
INFO:__main__:Epoch 3 - Score: 0.6948
Epoch 3 - Save Best Score: 0.6948 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.6948 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.0409(0.5770) 
f1 score : 0.4153846153846154
recall score : 0.35294117647058826
precision score : 0.5046728971962616
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 54s) Loss: 0.3474(0.3474) Grad: nan  LR: 0.00001002  
Epoch: [4][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.5488(0.2787) Grad: 4.9344  LR: 0.00000816  
Epoch: [4][200/279] Elapsed 0m 28s (remain 0m 11s) Loss: 0.0950(0.2572) Grad: 4.4832  LR: 0.00000636  
Epoch: [4][278/279] Elapsed 0m 39s (remain 0m 0s) Loss: 0.0948(0.2483) Grad: 5.1807  LR: 0.00000504  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6191(0.6191) 


Epoch 4 - avg_train_loss: 0.2483  avg_val_loss: 0.7770  time: 43s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2483  avg_val_loss: 0.7770  time: 43s
Epoch 4 - Score: 0.6908
INFO:__main__:Epoch 4 - Score: 0.6908


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.6358(0.7770) 
f1 score : 0.44999999999999996
recall score : 0.4117647058823529
precision score : 0.49606299212598426
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 48s) Loss: 0.0269(0.0269) Grad: nan  LR: 0.00000503  
Epoch: [5][100/279] Elapsed 0m 14s (remain 0m 24s) Loss: 0.0352(0.0585) Grad: 2.6820  LR: 0.00000350  
Epoch: [5][200/279] Elapsed 0m 28s (remain 0m 10s) Loss: 0.3965(0.0554) Grad: nan  LR: 0.00000220  
Epoch: [5][278/279] Elapsed 0m 39s (remain 0m 0s) Loss: 0.0083(0.0549) Grad: 0.4451  LR: 0.00000137  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6194(0.6194) 


Epoch 5 - avg_train_loss: 0.0549  avg_val_loss: 1.1016  time: 43s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0549  avg_val_loss: 1.1016  time: 43s
Epoch 5 - Score: 0.6787
INFO:__main__:Epoch 5 - Score: 0.6787


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 3.0129(1.1016) 
f1 score : 0.36000000000000004
recall score : 0.29411764705882354
precision score : 0.4639175257731959
Epoch: [6][0/279] Elapsed 0m 0s (remain 1m 43s) Loss: 0.0083(0.0083) Grad: nan  LR: 0.00000136  
Epoch: [6][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.0410(0.0215) Grad: 5.7669  LR: 0.00000057  
Epoch: [6][200/279] Elapsed 0m 28s (remain 0m 10s) Loss: 0.0157(0.0235) Grad: 0.8835  LR: 0.00000012  
Epoch: [6][278/279] Elapsed 0m 39s (remain 0m 0s) Loss: 0.0151(0.0231) Grad: 2.0400  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.7758(0.7758) 


Epoch 6 - avg_train_loss: 0.0231  avg_val_loss: 1.0992  time: 43s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0231  avg_val_loss: 1.0992  time: 43s
Epoch 6 - Score: 0.6707
INFO:__main__:Epoch 6 - Score: 0.6707


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 2.7170(1.0992) 
f1 score : 0.3970588235294118
recall score : 0.35294117647058826
precision score : 0.453781512605042


Score: 0.6948
INFO:__main__:Score: 0.6948
ACC BEST Score: 0.7048
INFO:__main__:ACC BEST Score: 0.7048
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base_epoch20",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input"

f1 score : 0.4153846153846154
recall score : 0.35294117647058826
precision score : 0.5046728971962616


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base_epoch20 were not used when initializing DebertaV2Model: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 44s) Loss: 0.5688(0.5688) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 14s (remain 0m 24s) Loss: 0.4968(0.6361) Grad: 2.4744  LR: 0.00001982  
Epoch: [1][200/279] Elapsed 0m 28s (remain 0m 10s) Loss: 0.6709(0.6208) Grad: 2.6923  LR: 0.00001930  
Epoch: [1][278/279] Elapsed 0m 39s (remain 0m 0s) Loss: 0.5645(0.6114) Grad: 2.7623  LR: 0.00001867  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4227(0.4227) 


Epoch 1 - avg_train_loss: 0.6114  avg_val_loss: 0.5912  time: 43s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6114  avg_val_loss: 0.5912  time: 43s
Epoch 1 - Score: 0.7048
INFO:__main__:Epoch 1 - Score: 0.7048
Epoch 1 - Save Best Score: 0.7048 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7048 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.9550(0.5912) 
f1 score : 0.08695652173913043
recall score : 0.0457516339869281
precision score : 0.875
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 49s) Loss: 0.5801(0.5801) Grad: nan  LR: 0.00001866  
Epoch: [2][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.7471(0.5637) Grad: 7.4730  LR: 0.00001757  
Epoch: [2][200/279] Elapsed 0m 28s (remain 0m 11s) Loss: 0.5850(0.5613) Grad: 1.8942  LR: 0.00001623  
Epoch: [2][278/279] Elapsed 0m 39s (remain 0m 0s) Loss: 0.6543(0.5589) Grad: 6.8306  LR: 0.00001502  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2787(0.2787) 


Epoch 2 - avg_train_loss: 0.5589  avg_val_loss: 0.5605  time: 43s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5589  avg_val_loss: 0.5605  time: 43s
Epoch 2 - Score: 0.7088
INFO:__main__:Epoch 2 - Score: 0.7088
Epoch 2 - Save Best Score: 0.7088 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7088 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1763(0.5605) 
f1 score : 0.16184971098265896
recall score : 0.0915032679738562
precision score : 0.7
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 42s) Loss: 0.5103(0.5103) Grad: nan  LR: 0.00001501  
Epoch: [3][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.4260(0.4671) Grad: 2.5400  LR: 0.00001331  
Epoch: [3][200/279] Elapsed 0m 28s (remain 0m 11s) Loss: 0.4636(0.4681) Grad: 4.1098  LR: 0.00001149  
Epoch: [3][278/279] Elapsed 0m 39s (remain 0m 0s) Loss: 0.3679(0.4675) Grad: 4.6803  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3356(0.3356) 


Epoch 3 - avg_train_loss: 0.4675  avg_val_loss: 0.5666  time: 43s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4675  avg_val_loss: 0.5666  time: 43s
Epoch 3 - Score: 0.6948
INFO:__main__:Epoch 3 - Score: 0.6948


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.0440(0.5666) 
f1 score : 0.4015748031496063
recall score : 0.3333333333333333
precision score : 0.504950495049505
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 46s) Loss: 0.2539(0.2539) Grad: nan  LR: 0.00001002  
Epoch: [4][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.3052(0.2843) Grad: 10.1639  LR: 0.00000816  
Epoch: [4][200/279] Elapsed 0m 28s (remain 0m 11s) Loss: 0.4514(0.2563) Grad: 21.5971  LR: 0.00000636  
Epoch: [4][278/279] Elapsed 0m 39s (remain 0m 0s) Loss: 0.1488(0.2463) Grad: 6.3186  LR: 0.00000504  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4476(0.4476) 


Epoch 4 - avg_train_loss: 0.2463  avg_val_loss: 0.7464  time: 43s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2463  avg_val_loss: 0.7464  time: 43s
Epoch 4 - Score: 0.6767
INFO:__main__:Epoch 4 - Score: 0.6767


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.6749(0.7464) 
f1 score : 0.4579124579124579
recall score : 0.4444444444444444
precision score : 0.4722222222222222
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 44s) Loss: 0.0748(0.0748) Grad: nan  LR: 0.00000503  
Epoch: [5][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.0452(0.0726) Grad: 3.3896  LR: 0.00000350  
Epoch: [5][200/279] Elapsed 0m 28s (remain 0m 11s) Loss: 0.0190(0.0648) Grad: 0.8088  LR: 0.00000220  
Epoch: [5][278/279] Elapsed 0m 39s (remain 0m 0s) Loss: 0.0116(0.0614) Grad: 0.6937  LR: 0.00000137  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4310(0.4310) 


Epoch 5 - avg_train_loss: 0.0614  avg_val_loss: 0.9894  time: 43s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0614  avg_val_loss: 0.9894  time: 43s
Epoch 5 - Score: 0.6908
INFO:__main__:Epoch 5 - Score: 0.6908


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 2.4111(0.9894) 
f1 score : 0.42105263157894735
recall score : 0.3660130718954248
precision score : 0.49557522123893805
Epoch: [6][0/279] Elapsed 0m 0s (remain 1m 44s) Loss: 0.0205(0.0205) Grad: nan  LR: 0.00000136  
Epoch: [6][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.0242(0.0266) Grad: 1.8896  LR: 0.00000057  
Epoch: [6][200/279] Elapsed 0m 28s (remain 0m 10s) Loss: 0.0074(0.0300) Grad: 0.2554  LR: 0.00000012  
Epoch: [6][278/279] Elapsed 0m 39s (remain 0m 0s) Loss: 0.0118(0.0276) Grad: 0.9607  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4692(0.4692) 


Epoch 6 - avg_train_loss: 0.0276  avg_val_loss: 1.0271  time: 43s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0276  avg_val_loss: 1.0271  time: 43s
Epoch 6 - Score: 0.6847
INFO:__main__:Epoch 6 - Score: 0.6847


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 2.4907(1.0271) 
f1 score : 0.4163568773234201
recall score : 0.3660130718954248
precision score : 0.4827586206896552


Score: 0.7088
INFO:__main__:Score: 0.7088
ACC BEST Score: 0.7149
INFO:__main__:ACC BEST Score: 0.7149
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base_epoch20",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input"

f1 score : 0.16184971098265896
recall score : 0.0915032679738562
precision score : 0.7


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base_epoch20 were not used when initializing DebertaV2Model: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 47s) Loss: 0.7373(0.7373) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.6021(0.6346) Grad: 0.7577  LR: 0.00001982  
Epoch: [1][200/279] Elapsed 0m 28s (remain 0m 11s) Loss: 0.3813(0.6151) Grad: 6.8206  LR: 0.00001930  
Epoch: [1][278/279] Elapsed 0m 39s (remain 0m 0s) Loss: 0.5977(0.6130) Grad: 0.6957  LR: 0.00001867  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3188(0.3188) 


Epoch 1 - avg_train_loss: 0.6130  avg_val_loss: 0.5940  time: 43s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6130  avg_val_loss: 0.5940  time: 43s
Epoch 1 - Score: 0.6928
INFO:__main__:Epoch 1 - Score: 0.6928
Epoch 1 - Save Best Score: 0.6928 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6928 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1650(0.5940) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 49s) Loss: 0.7734(0.7734) Grad: nan  LR: 0.00001866  
Epoch: [2][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.5630(0.5774) Grad: 1.9742  LR: 0.00001757  
Epoch: [2][200/279] Elapsed 0m 28s (remain 0m 11s) Loss: 0.7759(0.5623) Grad: 8.5457  LR: 0.00001623  
Epoch: [2][278/279] Elapsed 0m 39s (remain 0m 0s) Loss: 0.7261(0.5659) Grad: 3.2206  LR: 0.00001502  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3215(0.3215) 


Epoch 2 - avg_train_loss: 0.5659  avg_val_loss: 0.5739  time: 43s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5659  avg_val_loss: 0.5739  time: 43s
Epoch 2 - Score: 0.7028
INFO:__main__:Epoch 2 - Score: 0.7028
Epoch 2 - Save Best Score: 0.7028 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7028 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.9954(0.5739) 
f1 score : 0.1590909090909091
recall score : 0.0915032679738562
precision score : 0.6086956521739131
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 49s) Loss: 0.4536(0.4536) Grad: nan  LR: 0.00001501  
Epoch: [3][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.2189(0.5132) Grad: 1.9038  LR: 0.00001331  
Epoch: [3][200/279] Elapsed 0m 28s (remain 0m 11s) Loss: 0.3672(0.4940) Grad: 2.2101  LR: 0.00001149  
Epoch: [3][278/279] Elapsed 0m 39s (remain 0m 0s) Loss: 0.5454(0.4857) Grad: 3.0798  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2205(0.2205) 


Epoch 3 - avg_train_loss: 0.4857  avg_val_loss: 0.5927  time: 43s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4857  avg_val_loss: 0.5927  time: 43s
Epoch 3 - Score: 0.6988
INFO:__main__:Epoch 3 - Score: 0.6988


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1232(0.5927) 
f1 score : 0.3055555555555556
recall score : 0.21568627450980393
precision score : 0.5238095238095238
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 45s) Loss: 0.5371(0.5371) Grad: nan  LR: 0.00001002  
Epoch: [4][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.3169(0.3142) Grad: 6.6952  LR: 0.00000816  
Epoch: [4][200/279] Elapsed 0m 28s (remain 0m 11s) Loss: 0.1562(0.2717) Grad: 6.0951  LR: 0.00000636  
Epoch: [4][278/279] Elapsed 0m 39s (remain 0m 0s) Loss: 0.0715(0.2646) Grad: 2.0900  LR: 0.00000504  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.1584(0.1584) 


Epoch 4 - avg_train_loss: 0.2646  avg_val_loss: 0.7975  time: 43s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2646  avg_val_loss: 0.7975  time: 43s
Epoch 4 - Score: 0.6847
INFO:__main__:Epoch 4 - Score: 0.6847


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.7612(0.7975) 
f1 score : 0.3745019920318725
recall score : 0.30718954248366015
precision score : 0.47959183673469385
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 44s) Loss: 0.0450(0.0450) Grad: nan  LR: 0.00000503  
Epoch: [5][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.0298(0.0724) Grad: 1.2241  LR: 0.00000350  
Epoch: [5][200/279] Elapsed 0m 28s (remain 0m 10s) Loss: 0.0631(0.0664) Grad: 8.0399  LR: 0.00000220  
Epoch: [5][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.0877(0.0656) Grad: 9.0967  LR: 0.00000137  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2280(0.2280) 


Epoch 5 - avg_train_loss: 0.0656  avg_val_loss: 0.9586  time: 42s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0656  avg_val_loss: 0.9586  time: 42s
Epoch 5 - Score: 0.6888
INFO:__main__:Epoch 5 - Score: 0.6888


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.8185(0.9586) 
f1 score : 0.43223443223443225
recall score : 0.38562091503267976
precision score : 0.49166666666666664
Epoch: [6][0/279] Elapsed 0m 0s (remain 1m 39s) Loss: 0.0112(0.0112) Grad: nan  LR: 0.00000136  
Epoch: [6][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.0516(0.0308) Grad: 4.7011  LR: 0.00000057  
Epoch: [6][200/279] Elapsed 0m 28s (remain 0m 11s) Loss: 0.0298(0.0280) Grad: 3.1180  LR: 0.00000012  
Epoch: [6][278/279] Elapsed 0m 39s (remain 0m 0s) Loss: 0.0212(0.0297) Grad: 1.1131  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.1454(0.1454) 


Epoch 6 - avg_train_loss: 0.0297  avg_val_loss: 1.0257  time: 43s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0297  avg_val_loss: 1.0257  time: 43s
Epoch 6 - Score: 0.6807
INFO:__main__:Epoch 6 - Score: 0.6807


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 2.2642(1.0257) 
f1 score : 0.3764705882352941
recall score : 0.3137254901960784
precision score : 0.47058823529411764


Score: 0.7028
INFO:__main__:Score: 0.7028
ACC BEST Score: 0.7149
INFO:__main__:ACC BEST Score: 0.7149
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base_epoch20",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input"

f1 score : 0.1590909090909091
recall score : 0.0915032679738562
precision score : 0.6086956521739131


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base_epoch20 were not used when initializing DebertaV2Model: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 37s) Loss: 0.9282(0.9282) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 14s (remain 0m 24s) Loss: 0.6382(0.6366) Grad: 4.2988  LR: 0.00001982  
Epoch: [1][200/279] Elapsed 0m 28s (remain 0m 11s) Loss: 0.6733(0.6187) Grad: 5.0426  LR: 0.00001930  
Epoch: [1][278/279] Elapsed 0m 39s (remain 0m 0s) Loss: 0.5312(0.6094) Grad: 1.1133  LR: 0.00001867  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2949(0.2949) 


Epoch 1 - avg_train_loss: 0.6094  avg_val_loss: 0.5738  time: 43s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6094  avg_val_loss: 0.5738  time: 43s
Epoch 1 - Score: 0.7103
INFO:__main__:Epoch 1 - Score: 0.7103
Epoch 1 - Save Best Score: 0.7103 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7103 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1209(0.5738) 
f1 score : 0.14285714285714285
recall score : 0.07894736842105263
precision score : 0.75
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 39s) Loss: 0.5981(0.5981) Grad: nan  LR: 0.00001866  
Epoch: [2][100/279] Elapsed 0m 14s (remain 0m 24s) Loss: 0.5986(0.5798) Grad: 4.4261  LR: 0.00001757  
Epoch: [2][200/279] Elapsed 0m 28s (remain 0m 10s) Loss: 0.6606(0.5734) Grad: 4.0562  LR: 0.00001623  
Epoch: [2][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.5020(0.5687) Grad: 2.7521  LR: 0.00001502  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2152(0.2152) 


Epoch 2 - avg_train_loss: 0.5687  avg_val_loss: 0.5643  time: 43s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5687  avg_val_loss: 0.5643  time: 43s
Epoch 2 - Score: 0.7223
INFO:__main__:Epoch 2 - Score: 0.7223
Epoch 2 - Save Best Score: 0.7223 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7223 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1932(0.5643) 
f1 score : 0.26595744680851063
recall score : 0.16447368421052633
precision score : 0.6944444444444444
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 50s) Loss: 0.3948(0.3948) Grad: nan  LR: 0.00001501  
Epoch: [3][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.4341(0.4670) Grad: 3.1924  LR: 0.00001331  
Epoch: [3][200/279] Elapsed 0m 28s (remain 0m 11s) Loss: 0.6772(0.4814) Grad: 3.1380  LR: 0.00001149  
Epoch: [3][278/279] Elapsed 0m 39s (remain 0m 0s) Loss: 0.3708(0.4782) Grad: 2.1286  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2543(0.2543) 


Epoch 3 - avg_train_loss: 0.4782  avg_val_loss: 0.5889  time: 43s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4782  avg_val_loss: 0.5889  time: 43s
Epoch 3 - Score: 0.7163
INFO:__main__:Epoch 3 - Score: 0.7163


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.9928(0.5889) 
f1 score : 0.36199095022624433
recall score : 0.2631578947368421
precision score : 0.5797101449275363
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 48s) Loss: 0.5112(0.5112) Grad: nan  LR: 0.00001002  
Epoch: [4][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.5581(0.2884) Grad: 18.8010  LR: 0.00000816  
Epoch: [4][200/279] Elapsed 0m 28s (remain 0m 11s) Loss: 0.2903(0.2725) Grad: 9.4345  LR: 0.00000636  
Epoch: [4][278/279] Elapsed 0m 39s (remain 0m 0s) Loss: 0.5039(0.2666) Grad: 14.2799  LR: 0.00000504  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.1909(0.1909) 


Epoch 4 - avg_train_loss: 0.2666  avg_val_loss: 0.8432  time: 43s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2666  avg_val_loss: 0.8432  time: 43s
Epoch 4 - Score: 0.7163
INFO:__main__:Epoch 4 - Score: 0.7163


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 2.1647(0.8432) 
f1 score : 0.3561643835616438
recall score : 0.2565789473684211
precision score : 0.582089552238806
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 44s) Loss: 0.0980(0.0980) Grad: nan  LR: 0.00000503  
Epoch: [5][100/279] Elapsed 0m 14s (remain 0m 24s) Loss: 0.0323(0.0811) Grad: 3.2922  LR: 0.00000350  
Epoch: [5][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0740(0.0786) Grad: 9.7730  LR: 0.00000220  
Epoch: [5][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.0452(0.0773) Grad: 5.5614  LR: 0.00000137  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2941(0.2941) 


Epoch 5 - avg_train_loss: 0.0773  avg_val_loss: 1.0311  time: 43s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0773  avg_val_loss: 1.0311  time: 43s
Epoch 5 - Score: 0.7002
INFO:__main__:Epoch 5 - Score: 0.7002


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 2.5266(1.0311) 
f1 score : 0.39676113360323884
recall score : 0.3223684210526316
precision score : 0.5157894736842106
Epoch: [6][0/279] Elapsed 0m 0s (remain 1m 38s) Loss: 0.0818(0.0818) Grad: nan  LR: 0.00000136  
Epoch: [6][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.0119(0.0331) Grad: 0.4299  LR: 0.00000057  
Epoch: [6][200/279] Elapsed 0m 28s (remain 0m 11s) Loss: 0.0196(0.0302) Grad: 2.9189  LR: 0.00000012  
Epoch: [6][278/279] Elapsed 0m 39s (remain 0m 0s) Loss: 0.0156(0.0306) Grad: 1.6056  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3610(0.3610) 


Epoch 6 - avg_train_loss: 0.0306  avg_val_loss: 1.0610  time: 43s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0306  avg_val_loss: 1.0610  time: 43s
Epoch 6 - Score: 0.6821
INFO:__main__:Epoch 6 - Score: 0.6821


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 2.4486(1.0610) 
f1 score : 0.39230769230769225
recall score : 0.3355263157894737
precision score : 0.4722222222222222


Score: 0.7223
INFO:__main__:Score: 0.7223
ACC BEST Score: 0.7264
INFO:__main__:ACC BEST Score: 0.7264
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base_epoch20",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input"

f1 score : 0.26595744680851063
recall score : 0.16447368421052633
precision score : 0.6944444444444444


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base_epoch20 were not used when initializing DebertaV2Model: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 44s) Loss: 0.7451(0.7451) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 14s (remain 0m 24s) Loss: 0.6636(0.6317) Grad: 2.1775  LR: 0.00001982  
Epoch: [1][200/279] Elapsed 0m 28s (remain 0m 10s) Loss: 0.7354(0.6159) Grad: 6.6628  LR: 0.00001930  
Epoch: [1][278/279] Elapsed 0m 39s (remain 0m 0s) Loss: 0.7944(0.6066) Grad: nan  LR: 0.00001867  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4875(0.4875) 


Epoch 1 - avg_train_loss: 0.6066  avg_val_loss: 0.6149  time: 43s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6066  avg_val_loss: 0.6149  time: 43s
Epoch 1 - Score: 0.6197
INFO:__main__:Epoch 1 - Score: 0.6197
Epoch 1 - Save Best Score: 0.6197 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6197 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8098(0.6149) 
f1 score : 0.33215547703180215
recall score : 0.3092105263157895
precision score : 0.35877862595419846
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 41s) Loss: 0.5015(0.5015) Grad: nan  LR: 0.00001866  
Epoch: [2][100/279] Elapsed 0m 14s (remain 0m 24s) Loss: 0.4194(0.5915) Grad: 1.1123  LR: 0.00001757  
Epoch: [2][200/279] Elapsed 0m 28s (remain 0m 11s) Loss: 0.5229(0.5695) Grad: 1.0665  LR: 0.00001623  
Epoch: [2][278/279] Elapsed 0m 39s (remain 0m 0s) Loss: 0.3765(0.5667) Grad: 7.4742  LR: 0.00001502  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4700(0.4700) 


Epoch 2 - avg_train_loss: 0.5667  avg_val_loss: 0.6003  time: 43s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5667  avg_val_loss: 0.6003  time: 43s
Epoch 2 - Score: 0.6479
INFO:__main__:Epoch 2 - Score: 0.6479
Epoch 2 - Save Best Score: 0.6479 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.6479 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8610(0.6003) 
f1 score : 0.32950191570881227
recall score : 0.28289473684210525
precision score : 0.3944954128440367
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 46s) Loss: 0.5718(0.5718) Grad: nan  LR: 0.00001501  
Epoch: [3][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.4502(0.5036) Grad: 5.8757  LR: 0.00001331  
Epoch: [3][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.2681(0.4846) Grad: 1.8129  LR: 0.00001149  
Epoch: [3][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.3660(0.4825) Grad: 4.4719  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5168(0.5168) 


Epoch 3 - avg_train_loss: 0.4825  avg_val_loss: 0.6263  time: 42s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4825  avg_val_loss: 0.6263  time: 42s
Epoch 3 - Score: 0.6559
INFO:__main__:Epoch 3 - Score: 0.6559
Epoch 3 - Save Best Score: 0.6559 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.6559 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8045(0.6263) 
f1 score : 0.46394984326018807
recall score : 0.4868421052631579
precision score : 0.4431137724550898
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 44s) Loss: 0.5508(0.5508) Grad: nan  LR: 0.00001002  
Epoch: [4][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.3865(0.3472) Grad: 11.0321  LR: 0.00000816  
Epoch: [4][200/279] Elapsed 0m 28s (remain 0m 10s) Loss: 0.4648(0.2931) Grad: 15.5371  LR: 0.00000636  
Epoch: [4][278/279] Elapsed 0m 39s (remain 0m 0s) Loss: 0.1619(0.2746) Grad: 7.2874  LR: 0.00000504  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3284(0.3284) 


Epoch 4 - avg_train_loss: 0.2746  avg_val_loss: 0.8810  time: 42s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2746  avg_val_loss: 0.8810  time: 42s
Epoch 4 - Score: 0.6821
INFO:__main__:Epoch 4 - Score: 0.6821
Epoch 4 - Save Best Score: 0.6821 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.6821 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.6324(0.8810) 
f1 score : 0.368
recall score : 0.3026315789473684
precision score : 0.46938775510204084
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 47s) Loss: 0.0839(0.0839) Grad: nan  LR: 0.00000503  
Epoch: [5][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.0754(0.0799) Grad: 9.2969  LR: 0.00000350  
Epoch: [5][200/279] Elapsed 0m 28s (remain 0m 10s) Loss: 0.0550(0.0747) Grad: 9.0922  LR: 0.00000220  
Epoch: [5][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.0331(0.0675) Grad: 5.7286  LR: 0.00000137  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6454(0.6454) 


Epoch 5 - avg_train_loss: 0.0675  avg_val_loss: 1.0637  time: 42s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0675  avg_val_loss: 1.0637  time: 42s
Epoch 5 - Score: 0.6640
INFO:__main__:Epoch 5 - Score: 0.6640


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.4985(1.0637) 
f1 score : 0.4098939929328622
recall score : 0.3815789473684211
precision score : 0.44274809160305345
Epoch: [6][0/279] Elapsed 0m 0s (remain 1m 45s) Loss: 0.0111(0.0111) Grad: nan  LR: 0.00000136  
Epoch: [6][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.0111(0.0312) Grad: 0.6136  LR: 0.00000057  
Epoch: [6][200/279] Elapsed 0m 28s (remain 0m 10s) Loss: 0.0187(0.0237) Grad: 2.2407  LR: 0.00000012  
Epoch: [6][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.0227(0.0248) Grad: 1.6426  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6028(0.6028) 


Epoch 6 - avg_train_loss: 0.0248  avg_val_loss: 1.1035  time: 42s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0248  avg_val_loss: 1.1035  time: 42s
Epoch 6 - Score: 0.6700
INFO:__main__:Epoch 6 - Score: 0.6700


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.7234(1.1035) 
f1 score : 0.39259259259259255
recall score : 0.34868421052631576
precision score : 0.4491525423728814


Score: 0.6821
INFO:__main__:Score: 0.6821
ACC BEST Score: 0.7022
INFO:__main__:ACC BEST Score: 0.7022
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base_epoch20",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input"

f1 score : 0.368
recall score : 0.3026315789473684
precision score : 0.46938775510204084


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base_epoch20 were not used when initializing DebertaV2Model: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 49s) Loss: 0.6040(0.6040) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 14s (remain 0m 24s) Loss: 0.6313(0.6174) Grad: 4.9920  LR: 0.00001982  
Epoch: [1][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.5493(0.6125) Grad: 2.2630  LR: 0.00001930  
Epoch: [1][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.4929(0.6072) Grad: 1.3485  LR: 0.00001867  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2099(0.2099) 


Epoch 1 - avg_train_loss: 0.6072  avg_val_loss: 0.6087  time: 42s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6072  avg_val_loss: 0.6087  time: 42s
Epoch 1 - Score: 0.6942
INFO:__main__:Epoch 1 - Score: 0.6942
Epoch 1 - Save Best Score: 0.6942 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6942 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.4095(0.6087) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 46s) Loss: 0.4858(0.4858) Grad: nan  LR: 0.00001866  
Epoch: [2][100/279] Elapsed 0m 14s (remain 0m 24s) Loss: 0.4929(0.5654) Grad: 2.0488  LR: 0.00001757  
Epoch: [2][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.5356(0.5643) Grad: 4.6094  LR: 0.00001623  
Epoch: [2][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.6274(0.5621) Grad: 3.2179  LR: 0.00001502  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3795(0.3795) 


Epoch 2 - avg_train_loss: 0.5621  avg_val_loss: 0.5630  time: 42s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5621  avg_val_loss: 0.5630  time: 42s
Epoch 2 - Score: 0.7042
INFO:__main__:Epoch 2 - Score: 0.7042
Epoch 2 - Save Best Score: 0.7042 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7042 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.9451(0.5630) 
f1 score : 0.31627906976744186
recall score : 0.2236842105263158
precision score : 0.5396825396825397
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 53s) Loss: 0.5942(0.5942) Grad: nan  LR: 0.00001501  
Epoch: [3][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.4443(0.5001) Grad: 3.9316  LR: 0.00001331  
Epoch: [3][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.4619(0.5014) Grad: 3.3984  LR: 0.00001149  
Epoch: [3][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.4448(0.4980) Grad: 4.0902  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2488(0.2488) 


Epoch 3 - avg_train_loss: 0.4980  avg_val_loss: 0.5715  time: 42s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4980  avg_val_loss: 0.5715  time: 42s
Epoch 3 - Score: 0.7203
INFO:__main__:Epoch 3 - Score: 0.7203
Epoch 3 - Save Best Score: 0.7203 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7203 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.3006(0.5715) 
f1 score : 0.3474178403755869
recall score : 0.24342105263157895
precision score : 0.6065573770491803
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 46s) Loss: 0.5454(0.5454) Grad: nan  LR: 0.00001002  
Epoch: [4][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.2462(0.3393) Grad: 3.6739  LR: 0.00000816  
Epoch: [4][200/279] Elapsed 0m 28s (remain 0m 10s) Loss: 0.2030(0.3145) Grad: 4.1516  LR: 0.00000636  
Epoch: [4][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.2717(0.2978) Grad: 9.4424  LR: 0.00000504  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2859(0.2859) 


Epoch 4 - avg_train_loss: 0.2978  avg_val_loss: 0.6562  time: 43s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2978  avg_val_loss: 0.6562  time: 43s
Epoch 4 - Score: 0.7284
INFO:__main__:Epoch 4 - Score: 0.7284
Epoch 4 - Save Best Score: 0.7284 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7284 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.7468(0.6562) 
f1 score : 0.49814126394052044
recall score : 0.4407894736842105
precision score : 0.5726495726495726
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 45s) Loss: 0.0828(0.0828) Grad: nan  LR: 0.00000503  
Epoch: [5][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.0689(0.0859) Grad: 5.4611  LR: 0.00000350  
Epoch: [5][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.1646(0.0779) Grad: 9.9583  LR: 0.00000220  
Epoch: [5][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.0304(0.0747) Grad: 2.3912  LR: 0.00000137  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3573(0.3573) 


Epoch 5 - avg_train_loss: 0.0747  avg_val_loss: 0.8737  time: 42s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0747  avg_val_loss: 0.8737  time: 42s
Epoch 5 - Score: 0.7022
INFO:__main__:Epoch 5 - Score: 0.7022


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 2.2629(0.8737) 
f1 score : 0.4788732394366197
recall score : 0.4473684210526316
precision score : 0.5151515151515151
Epoch: [6][0/279] Elapsed 0m 0s (remain 1m 43s) Loss: 0.0183(0.0183) Grad: nan  LR: 0.00000136  
Epoch: [6][100/279] Elapsed 0m 14s (remain 0m 24s) Loss: 0.0231(0.0242) Grad: 2.9015  LR: 0.00000057  
Epoch: [6][200/279] Elapsed 0m 28s (remain 0m 10s) Loss: 0.0198(0.0261) Grad: 2.8442  LR: 0.00000012  
Epoch: [6][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.0287(0.0280) Grad: 3.2371  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2988(0.2988) 


Epoch 6 - avg_train_loss: 0.0280  avg_val_loss: 0.9104  time: 42s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0280  avg_val_loss: 0.9104  time: 42s
Epoch 6 - Score: 0.7062
INFO:__main__:Epoch 6 - Score: 0.7062


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 2.5055(0.9104) 
f1 score : 0.4671532846715329
recall score : 0.42105263157894735
precision score : 0.5245901639344263


Score: 0.7284
INFO:__main__:Score: 0.7284
ACC BEST Score: 0.7425
INFO:__main__:ACC BEST Score: 0.7425
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base_epoch20",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input"

f1 score : 0.49814126394052044
recall score : 0.4407894736842105
precision score : 0.5726495726495726


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base_epoch20 were not used when initializing DebertaV2Model: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 58s) Loss: 0.8184(0.8184) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 14s (remain 0m 24s) Loss: 0.6147(0.6193) Grad: 1.5299  LR: 0.00001982  
Epoch: [1][200/279] Elapsed 0m 28s (remain 0m 10s) Loss: 0.5415(0.6164) Grad: 1.2649  LR: 0.00001930  
Epoch: [1][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.6045(0.6089) Grad: 8.1198  LR: 0.00001867  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4628(0.4628) 


Epoch 1 - avg_train_loss: 0.6089  avg_val_loss: 0.6057  time: 42s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6089  avg_val_loss: 0.6057  time: 42s
Epoch 1 - Score: 0.7082
INFO:__main__:Epoch 1 - Score: 0.7082
Epoch 1 - Save Best Score: 0.7082 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7082 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8376(0.6057) 
f1 score : 0.3881856540084388
recall score : 0.3026315789473684
precision score : 0.5411764705882353
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 49s) Loss: 0.5923(0.5923) Grad: nan  LR: 0.00001866  
Epoch: [2][100/279] Elapsed 0m 14s (remain 0m 24s) Loss: 0.3428(0.5755) Grad: 6.2424  LR: 0.00001757  
Epoch: [2][200/279] Elapsed 0m 28s (remain 0m 10s) Loss: 0.6182(0.5719) Grad: 1.2732  LR: 0.00001623  
Epoch: [2][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.4438(0.5653) Grad: 4.0822  LR: 0.00001502  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3367(0.3367) 


Epoch 2 - avg_train_loss: 0.5653  avg_val_loss: 0.5733  time: 42s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5653  avg_val_loss: 0.5733  time: 42s
Epoch 2 - Score: 0.7223
INFO:__main__:Epoch 2 - Score: 0.7223
Epoch 2 - Save Best Score: 0.7223 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7223 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.0655(0.5733) 
f1 score : 0.28865979381443296
recall score : 0.18421052631578946
precision score : 0.6666666666666666
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 49s) Loss: 0.5186(0.5186) Grad: nan  LR: 0.00001501  
Epoch: [3][100/279] Elapsed 0m 14s (remain 0m 24s) Loss: 0.4714(0.4929) Grad: 7.4649  LR: 0.00001331  
Epoch: [3][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.4233(0.4976) Grad: 3.6994  LR: 0.00001149  
Epoch: [3][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.4324(0.4904) Grad: 3.8219  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3496(0.3496) 


Epoch 3 - avg_train_loss: 0.4904  avg_val_loss: 0.5736  time: 42s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4904  avg_val_loss: 0.5736  time: 42s
Epoch 3 - Score: 0.7042
INFO:__main__:Epoch 3 - Score: 0.7042


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.0971(0.5736) 
f1 score : 0.31627906976744186
recall score : 0.2236842105263158
precision score : 0.5396825396825397
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 41s) Loss: 0.3933(0.3933) Grad: nan  LR: 0.00001002  
Epoch: [4][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.3037(0.3464) Grad: 7.1785  LR: 0.00000816  
Epoch: [4][200/279] Elapsed 0m 28s (remain 0m 10s) Loss: 0.1495(0.3247) Grad: 2.7469  LR: 0.00000636  
Epoch: [4][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.0784(0.3033) Grad: 2.4958  LR: 0.00000504  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4264(0.4264) 


Epoch 4 - avg_train_loss: 0.3033  avg_val_loss: 0.7026  time: 42s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3033  avg_val_loss: 0.7026  time: 42s
Epoch 4 - Score: 0.6519
INFO:__main__:Epoch 4 - Score: 0.6519


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.0056(0.7026) 
f1 score : 0.4896755162241888
recall score : 0.5460526315789473
precision score : 0.44385026737967914
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 39s) Loss: 0.1135(0.1135) Grad: nan  LR: 0.00000503  
Epoch: [5][100/279] Elapsed 0m 14s (remain 0m 24s) Loss: 0.1194(0.1182) Grad: 6.8826  LR: 0.00000350  
Epoch: [5][200/279] Elapsed 0m 28s (remain 0m 10s) Loss: 0.0323(0.1078) Grad: 1.6602  LR: 0.00000220  
Epoch: [5][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.0275(0.0992) Grad: 1.3935  LR: 0.00000137  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2585(0.2585) 


Epoch 5 - avg_train_loss: 0.0992  avg_val_loss: 0.8885  time: 42s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0992  avg_val_loss: 0.8885  time: 42s
Epoch 5 - Score: 0.7143
INFO:__main__:Epoch 5 - Score: 0.7143


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 2.3909(0.8885) 
f1 score : 0.41322314049586784
recall score : 0.32894736842105265
precision score : 0.5555555555555556
Epoch: [6][0/279] Elapsed 0m 0s (remain 1m 48s) Loss: 0.0315(0.0315) Grad: nan  LR: 0.00000136  
Epoch: [6][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.1799(0.0379) Grad: 8.7021  LR: 0.00000057  
Epoch: [6][200/279] Elapsed 0m 28s (remain 0m 11s) Loss: 0.0365(0.0400) Grad: 4.9708  LR: 0.00000012  
Epoch: [6][278/279] Elapsed 0m 39s (remain 0m 0s) Loss: 0.0159(0.0397) Grad: 0.6234  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3471(0.3471) 


Epoch 6 - avg_train_loss: 0.0397  avg_val_loss: 0.9122  time: 43s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0397  avg_val_loss: 0.9122  time: 43s
Epoch 6 - Score: 0.6901
INFO:__main__:Epoch 6 - Score: 0.6901


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 2.1522(0.9122) 
f1 score : 0.42537313432835816
recall score : 0.375
precision score : 0.49137931034482757


Score: 0.7223
INFO:__main__:Score: 0.7223
ACC BEST Score: 0.7264
INFO:__main__:ACC BEST Score: 0.7264
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base_epoch20",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input"

f1 score : 0.28865979381443296
recall score : 0.18421052631578946
precision score : 0.6666666666666666


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base_epoch20 were not used when initializing DebertaV2Model: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 50s) Loss: 0.9233(0.9233) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.5723(0.6377) Grad: 3.4016  LR: 0.00001982  
Epoch: [1][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.6694(0.6239) Grad: 1.2605  LR: 0.00001930  
Epoch: [1][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.6055(0.6142) Grad: 1.7576  LR: 0.00001867  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4904(0.4904) 


Epoch 1 - avg_train_loss: 0.6142  avg_val_loss: 0.5748  time: 42s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6142  avg_val_loss: 0.5748  time: 42s
Epoch 1 - Score: 0.6982
INFO:__main__:Epoch 1 - Score: 0.6982
Epoch 1 - Save Best Score: 0.6982 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6982 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7556(0.5748) 
f1 score : 0.28571428571428575
recall score : 0.19736842105263158
precision score : 0.5172413793103449
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 42s) Loss: 0.6289(0.6289) Grad: nan  LR: 0.00001866  
Epoch: [2][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.6841(0.5676) Grad: 1.2266  LR: 0.00001757  
Epoch: [2][200/279] Elapsed 0m 28s (remain 0m 11s) Loss: 0.5576(0.5750) Grad: 1.8645  LR: 0.00001623  
Epoch: [2][278/279] Elapsed 0m 39s (remain 0m 0s) Loss: 0.5562(0.5651) Grad: 5.8674  LR: 0.00001502  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3305(0.3305) 


Epoch 2 - avg_train_loss: 0.5651  avg_val_loss: 0.5408  time: 43s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5651  avg_val_loss: 0.5408  time: 43s
Epoch 2 - Score: 0.7123
INFO:__main__:Epoch 2 - Score: 0.7123
Epoch 2 - Save Best Score: 0.7123 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7123 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.9523(0.5408) 
f1 score : 0.25906735751295334
recall score : 0.16447368421052633
precision score : 0.6097560975609756
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 44s) Loss: 0.2457(0.2457) Grad: nan  LR: 0.00001501  
Epoch: [3][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.4309(0.4632) Grad: 4.0990  LR: 0.00001331  
Epoch: [3][200/279] Elapsed 0m 28s (remain 0m 10s) Loss: 0.3315(0.4669) Grad: 2.0093  LR: 0.00001149  
Epoch: [3][278/279] Elapsed 0m 39s (remain 0m 0s) Loss: 0.2957(0.4648) Grad: 2.7101  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6919(0.6919) 


Epoch 3 - avg_train_loss: 0.4648  avg_val_loss: 0.6605  time: 43s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4648  avg_val_loss: 0.6605  time: 43s
Epoch 3 - Score: 0.5956
INFO:__main__:Epoch 3 - Score: 0.5956


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.4483(0.6605) 
f1 score : 0.524822695035461
recall score : 0.7302631578947368
precision score : 0.4095940959409594
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 43s) Loss: 0.3752(0.3752) Grad: nan  LR: 0.00001002  
Epoch: [4][100/279] Elapsed 0m 13s (remain 0m 24s) Loss: 0.1322(0.2670) Grad: 3.8875  LR: 0.00000816  
Epoch: [4][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.2346(0.2471) Grad: 6.7924  LR: 0.00000636  
Epoch: [4][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.1608(0.2390) Grad: 7.1123  LR: 0.00000504  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2195(0.2195) 


Epoch 4 - avg_train_loss: 0.2390  avg_val_loss: 0.8219  time: 42s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2390  avg_val_loss: 0.8219  time: 42s
Epoch 4 - Score: 0.6942
INFO:__main__:Epoch 4 - Score: 0.6942


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 2.1822(0.8219) 
f1 score : 0.32142857142857145
recall score : 0.23684210526315788
precision score : 0.5
Epoch: [5][0/279] Elapsed 0m 0s (remain 1m 42s) Loss: 0.1575(0.1575) Grad: nan  LR: 0.00000503  
Epoch: [5][100/279] Elapsed 0m 14s (remain 0m 24s) Loss: 0.0590(0.0826) Grad: 5.9005  LR: 0.00000350  
Epoch: [5][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0550(0.0738) Grad: 4.6540  LR: 0.00000220  
Epoch: [5][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.0126(0.0686) Grad: 0.7643  LR: 0.00000137  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4082(0.4082) 


Epoch 5 - avg_train_loss: 0.0686  avg_val_loss: 0.9847  time: 42s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0686  avg_val_loss: 0.9847  time: 42s
Epoch 5 - Score: 0.7062
INFO:__main__:Epoch 5 - Score: 0.7062


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 2.1659(0.9847) 
f1 score : 0.434108527131783
recall score : 0.3684210526315789
precision score : 0.5283018867924528
Epoch: [6][0/279] Elapsed 0m 0s (remain 1m 38s) Loss: 0.0453(0.0453) Grad: nan  LR: 0.00000136  
Epoch: [6][100/279] Elapsed 0m 14s (remain 0m 24s) Loss: 0.0098(0.0306) Grad: 0.5793  LR: 0.00000057  
Epoch: [6][200/279] Elapsed 0m 28s (remain 0m 10s) Loss: 0.0066(0.0286) Grad: 0.2719  LR: 0.00000012  
Epoch: [6][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.0090(0.0292) Grad: 0.3376  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4752(0.4752) 


Epoch 6 - avg_train_loss: 0.0292  avg_val_loss: 1.0114  time: 42s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0292  avg_val_loss: 1.0114  time: 42s
Epoch 6 - Score: 0.6982
INFO:__main__:Epoch 6 - Score: 0.6982


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 2.1470(1.0114) 
f1 score : 0.4444444444444444
recall score : 0.39473684210526316
precision score : 0.5084745762711864


Score: 0.7123
INFO:__main__:Score: 0.7123
ACC BEST Score: 0.7243
INFO:__main__:ACC BEST Score: 0.7243
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base_epoch20",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input"

f1 score : 0.25906735751295334
recall score : 0.16447368421052633
precision score : 0.6097560975609756


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base_epoch20 were not used when initializing DebertaV2Model: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 51s) Loss: 0.5898(0.5898) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 14s (remain 0m 24s) Loss: 0.5801(0.6231) Grad: 5.5287  LR: 0.00001982  
Epoch: [1][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.3567(0.6166) Grad: 5.7607  LR: 0.00001930  
Epoch: [1][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.5898(0.6074) Grad: 1.8118  LR: 0.00001867  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3017(0.3017) 


Epoch 1 - avg_train_loss: 0.6074  avg_val_loss: 0.5865  time: 42s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6074  avg_val_loss: 0.5865  time: 42s
Epoch 1 - Score: 0.6942
INFO:__main__:Epoch 1 - Score: 0.6942
Epoch 1 - Save Best Score: 0.6942 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6942 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1658(0.5865) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 39s) Loss: 0.6094(0.6094) Grad: nan  LR: 0.00001866  
Epoch: [2][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.7207(0.5716) Grad: 5.0919  LR: 0.00001757  
Epoch: [2][200/279] Elapsed 0m 28s (remain 0m 10s) Loss: 0.5156(0.5763) Grad: 0.7735  LR: 0.00001623  
Epoch: [2][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.5566(0.5736) Grad: 1.5911  LR: 0.00001502  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3134(0.3134) 


Epoch 2 - avg_train_loss: 0.5736  avg_val_loss: 0.5837  time: 42s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5736  avg_val_loss: 0.5837  time: 42s
Epoch 2 - Score: 0.6942
INFO:__main__:Epoch 2 - Score: 0.6942


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.0829(0.5837) 
f1 score : 0.16483516483516483
recall score : 0.09868421052631579
precision score : 0.5
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 39s) Loss: 0.3979(0.3979) Grad: nan  LR: 0.00001501  
Epoch: [3][100/279] Elapsed 0m 14s (remain 0m 24s) Loss: 0.3665(0.5252) Grad: 1.4486  LR: 0.00001331  
Epoch: [3][200/279] Elapsed 0m 28s (remain 0m 10s) Loss: 0.4517(0.5217) Grad: 3.8373  LR: 0.00001149  
Epoch: [3][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.5200(0.5186) Grad: 1.6950  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2828(0.2828) 


Epoch 3 - avg_train_loss: 0.5186  avg_val_loss: 0.5787  time: 42s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5186  avg_val_loss: 0.5787  time: 42s
Epoch 3 - Score: 0.7042
INFO:__main__:Epoch 3 - Score: 0.7042
Epoch 3 - Save Best Score: 0.7042 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7042 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1927(0.5787) 
f1 score : 0.30985915492957744
recall score : 0.21710526315789475
precision score : 0.5409836065573771
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 46s) Loss: 0.2632(0.2632) Grad: nan  LR: 0.00001002  
Epoch: [4][100/279] Elapsed 0m 14s (remain 0m 24s) Loss: 0.2261(0.4259) Grad: 1.8085  LR: 0.00000816  
Epoch: [4][200/279] Elapsed 0m 28s (remain 0m 10s) Loss: 0.1554(0.3944) Grad: 5.5266  LR: 0.00000636  
Epoch: [4][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.3481(0.3786) Grad: 4.3997  LR: 0.00000504  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.3307(0.3307) 


Epoch 4 - avg_train_loss: 0.3786  avg_val_loss: 0.6422  time: 42s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3786  avg_val_loss: 0.6422  time: 42s
Epoch 4 - Score: 0.6700
INFO:__main__:Epoch 4 - Score: 0.6700


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.3375(0.6422) 
f1 score : 0.37404580152671757
recall score : 0.3223684210526316
precision score : 0.44545454545454544
Epoch: [5][0/279] Elapsed 0m 0s (remain 2m 12s) Loss: 0.3032(0.3032) Grad: nan  LR: 0.00000503  
Epoch: [5][100/279] Elapsed 0m 14s (remain 0m 24s) Loss: 0.2803(0.1725) Grad: 12.9147  LR: 0.00000350  
Epoch: [5][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.0363(0.1655) Grad: 3.3385  LR: 0.00000220  
Epoch: [5][278/279] Elapsed 0m 38s (remain 0m 0s) Loss: 0.1888(0.1602) Grad: 8.2762  LR: 0.00000137  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3598(0.3598) 


Epoch 5 - avg_train_loss: 0.1602  avg_val_loss: 0.8454  time: 42s
INFO:__main__:Epoch 5 - avg_train_loss: 0.1602  avg_val_loss: 0.8454  time: 42s
Epoch 5 - Score: 0.6761
INFO:__main__:Epoch 5 - Score: 0.6761


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 2.0485(0.8454) 
f1 score : 0.3585657370517928
recall score : 0.29605263157894735
precision score : 0.45454545454545453
Epoch: [6][0/279] Elapsed 0m 0s (remain 1m 39s) Loss: 0.2104(0.2104) Grad: nan  LR: 0.00000136  
Epoch: [6][100/279] Elapsed 0m 14s (remain 0m 25s) Loss: 0.0556(0.0701) Grad: 4.6256  LR: 0.00000057  
Epoch: [6][200/279] Elapsed 0m 28s (remain 0m 10s) Loss: 0.0271(0.0704) Grad: 2.2136  LR: 0.00000012  
Epoch: [6][278/279] Elapsed 0m 39s (remain 0m 0s) Loss: 0.0287(0.0686) Grad: 1.4438  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4397(0.4397) 


Epoch 6 - avg_train_loss: 0.0686  avg_val_loss: 0.8925  time: 43s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0686  avg_val_loss: 0.8925  time: 43s
Epoch 6 - Score: 0.6801
INFO:__main__:Epoch 6 - Score: 0.6801


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 2.0624(0.8925) 
f1 score : 0.39999999999999997
recall score : 0.34868421052631576
precision score : 0.4690265486725664


Score: 0.7042
INFO:__main__:Score: 0.7042
ACC BEST Score: 0.7103
INFO:__main__:ACC BEST Score: 0.7103
Score: 0.7085
INFO:__main__:Score: 0.7085
ACC BEST Score: 0.7107
INFO:__main__:ACC BEST Score: 0.7107


In [None]:
# Colab-only cell: ask Colab to unassign (release) the runtime backing this notebook.
# NOTE(review): this presumably tears down the VM together with its local disk and
# kernel state — confirm every checkpoint/log you need was already written to Drive
# before this cell runs.
from google.colab import runtime
runtime.unassign()