In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
!pip install transformers
!pip install datasets
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m84.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m34.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m114.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1
Looking in indexes: https://pypi.org/simple, https:/

In [3]:
!nvidia-smi

Sat Apr 29 16:21:55 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0    46W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:

import os
import gc
import math
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import re
import html

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam, SGD, AdamW, RAdam
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,GroupKFold
from sklearn.metrics import log_loss,f1_score, recall_score, accuracy_score, precision_score

from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
DIR = "/content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測"
INPUT_DIR = os.path.join(DIR,"input")
OUTPUT_DIR = os.path.join(DIR,"output")
CUSTOM_MODEL_DIR = os.path.join(OUTPUT_DIR,'clrp_deberta_v3_base')
OUTPUT_EXP_DIR = DIR + '/output/EXP032/'
if not os.path.exists(OUTPUT_EXP_DIR):
    os.makedirs(OUTPUT_EXP_DIR)

In [6]:

# ====================================================
# CFG
# ====================================================
class CFG:
    debug=False
    apex=True
    print_freq=100
    num_workers=4
    model_name="microsoft/deberta-v3-base"
    # model='microsoft/deberta-base'
    # model='roberta-base'
    # model='roberta-large'
    # model='roberta-large-mnli'
    # model='xlnet-large-cased'
    # model='albert-xxlarge-v2'
    # model="microsoft/deberta-large"
    # model="microsoft/deberta-v3-large"
    # model='microsoft/deberta-v2-xlarge'
    # model='funnel-transformer/large'
    # model='funnel-transformer/medium'
    # model='albert-base-v2'
    # model='albert-large-v2'
    # model='google/electra-large-discriminator'
    # model='google/electra-base-discriminator'
    # model="facebook/bart-large-mnli"
    # model="facebook/bart-large"
    # model="facebook/bart-base"
    model = CUSTOM_MODEL_DIR
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=4
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=16
    fc_dropout=0.2
    target_size=1
    max_len=256
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    seed=42
    n_fold=10
    trn_fold=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    train=True
    nth_awp_start_epoch=1
    gradient_checkpointing = False
    freezing = False
    num_reinit_layers = 1
    is_reinit_layer = False
    fgm = True

if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0, 1]

In [7]:
def get_score(labels, outputs):
    thresh = 0.5
    y_pred = outputs
    y_true = labels
    f_score = f1_score(y_true, (y_pred>thresh).astype(int))
    r_score = recall_score(y_true, (y_pred>thresh).astype(int))
    p_score = precision_score(y_true, (y_pred>thresh).astype(int))
    print(f"f1 score : {f_score}")
    print(f"recall score : {r_score}")
    print(f"precision score : {p_score}")
    return accuracy_score(y_true, (y_pred>thresh).astype(int))

def get_acc_score(labels, outputs):
    y_pred = outputs
    y_true = labels
    best_score = 0
    best_thresh = 0.5
    for thresh in np.arange(0.1, 0.80, 0.01):
        thresh = np.round(thresh, 2)
        score = accuracy_score(y_true, (y_pred>thresh).astype(int))
        #print("Accuracy score at threshold {0} is {1}".format(thresh, score))
        if score > best_score:
          best_score = score
          best_thresh = thresh
    print(f"thresh : {best_thresh}")
    return accuracy_score(y_true, (y_pred>best_thresh).astype(int))


def get_logger(filename=OUTPUT_EXP_DIR+'train'):
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=CFG.seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=CFG.seed)

In [8]:
def freeze(module):
    """
    Freezes module's parameters.
    """
    
    for parameter in module.parameters():
        parameter.requires_grad = False
        
def get_freezed_parameters(module):
    """
    Returns names of freezed parameters of the given module.
    """
    
    freezed_parameters = []
    for name, parameter in module.named_parameters():
        if not parameter.requires_grad:
            freezed_parameters.append(name)
            
    return freezed_parameters

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """
    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930
    """
    
    embedding_types = ("word", "position", "token_type")
    for embedding_type in embedding_types:
        attr_name = f"{embedding_type}_embeddings"
        
        if hasattr(embeddings_path, attr_name): 
            bnb.optim.GlobalOptimManager.get_instance().register_module_override(
                getattr(embeddings_path, attr_name), 'weight', {'optim_bits': optim_bits}
            )

In [9]:
import pandas as pd
import numpy as np

train = pd.read_csv(os.path.join(INPUT_DIR,"train_data.csv"))
test = pd.read_csv(os.path.join(INPUT_DIR,"test_data.csv"))
sample_sub = pd.read_csv(os.path.join(INPUT_DIR,"submission.csv"))

print(train.shape)
display(train.head(3))

print(test.shape)
display(test.head(3))

print(sample_sub.shape)
display(sample_sub.head(3))

(4974, 6)


Unnamed: 0,id,title,year,abstract,keywords,y
0,1,Hierarchical Adversarially Learned Inference,2018,We propose a novel hierarchical generative mod...,"generative, hierarchical, unsupervised, semisu...",0
1,2,Learning to Compute Word Embeddings On the Fly,2018,Words in natural language follow a Zipfian dis...,"NLU, word embeddings, representation learning",0
2,3,Graph2Seq: Scalable Learning Dynamics for Graphs,2018,Neural networks are increasingly used as a gen...,,0


(6393, 5)


Unnamed: 0,id,title,year,abstract,keywords
0,1,StyleAlign: Analysis and Applications of Align...,2022,"In this paper, we perform an in-depth study of...","StyleGAN, transfer learning, fine tuning, mode..."
1,2,Embedding a random graph via GNN: mean-field i...,2021,We develop a theory for embedding a random gra...,"Graph neural network, graph embedding, multi-r..."
2,3,BBRefinement: an universal scheme to improve p...,2021,We present a conceptually simple yet powerful ...,"object detection, deep neural networks, refine..."


(6393, 2)


Unnamed: 0,id,y
0,1,0
1,2,0
2,3,0


In [10]:
train["texts"] = train["title"] + "[SEP]" + train["abstract"] 

In [11]:
skf = StratifiedKFold(n_splits=CFG.n_fold,shuffle=True,random_state=CFG.seed)
for fold, ( _, val_) in enumerate(skf.split(train, train.y)):
    train.loc[val_ , "kfold"] = int(fold)
    
train["kfold"] = train["kfold"].astype(int)

if CFG.debug:
    display(train.groupby('kfold').size())
    train = train.sample(n=500, random_state=0).reset_index(drop=True)
    display(train.groupby('kfold').size())

In [12]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_EXP_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

In [13]:
# ====================================================
# Define max_len
# ====================================================
lengths = []
tk0 = tqdm(train['texts'].fillna("").values, total=len(train))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
    lengths.append(length)
CFG.max_len = max(lengths) + 3 # cls + sep + sep
LOGGER.info(f"max_len: {CFG.max_len}")

100%|██████████| 4974/4974 [00:03<00:00, 1340.65it/s]
max_len: 522
INFO:__main__:max_len: 522


In [14]:
class FGM():
    def __init__(self, model, eps=1.):
        self.model = model
        self.eps = eps
        self.backup = {}

    def attack(self, emb_name='word_embeddings'):
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                self.backup[name] = param.data.clone()
                norm = torch.norm(param.grad)
                if norm != 0:
                    r_at = self.eps * param.grad / norm
                    param.data.add_(r_at)

    def restore(self, emb_name='word_embeddings'):
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                assert name in self.backup
                param.data = self.backup[name]
            self.backup = {}

In [15]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer(text,
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           padding="max_length",
                           return_offsets_mapping=False,
                           truncation=True)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.inputs = df['texts'].values
        self.labels = df['y'].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.inputs[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

#collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [16]:
def reinit_layers(model):

    #for layer in model.model.encoder.layer[-CFG.num_reinit_layers:]:
    for layer in model.encoder.layer[-CFG.num_reinit_layers:]:    #Custome model内(backbone)

            for module in layer.modules():

                if isinstance(module,nn.Linear):
                    module.weight.data.normal_(mean=0.0,std=model.config.initializer_range)
                    if module.bias is not None:
                            module.bias.data.zero_()
                elif isinstance(module, nn.Embedding):
                        module.weight.data.normal_(mean=0.0, std=model.config.initializer_range)
                        if module.padding_idx is not None:
                            module.weight.data[module.padding_idx].zero_()
                elif isinstance(module, nn.LayerNorm):
                        module.bias.data.zero_()
                        module.weight.data.fill_(1.0)
                        
    return model

In [17]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

class MaxPooling(nn.Module):
    def __init__(self):
        super(MaxPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        embeddings = last_hidden_state.clone()
        embeddings[input_mask_expanded == 0] = -1e4
        max_embeddings, _ = torch.max(embeddings, dim=1)
        return max_embeddings
    

class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        if CFG.is_reinit_layer:
            self.model = reinit_layers(self.model)
            print(f'Reinitializing Last {CFG.num_reinit_layers} Layers.')
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # Freezing
        if cfg.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            cfg.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())

        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)
        self._init_weights(self.fc)
        self.layer_norm1 = nn.LayerNorm(self.config.hidden_size)
        self._init_weights(self.fc)
        self.sig = nn.Sigmoid()
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        feature = self.layer_norm1(feature)
        output = self.fc(feature)
        #output = self.sig(output)
        return output

In [18]:
def calculate_loss(inputs, labels, model, criterion, is_valid=True, device="cpu"):    
    y_preds = model(inputs)
    loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
    return (loss, y_preds) if is_valid else loss

In [19]:

# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device, fgm):
    model.zero_grad()
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            loss = calculate_loss(inputs=inputs, labels=labels, model=model, criterion=criterion, is_valid=False, device=device)
        #print(y_preds.sigmoid().squeeze().view(1, -1))
        #loss = criterion(y_preds.sigmoid().squeeze(), labels.squeeze())
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        if scaler is not None:
            scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if CFG.fgm:
          fgm.attack() 
          adversarial_loss = calculate_loss(inputs=inputs, labels=labels, model=model, criterion=criterion, is_valid=False, device=device)
          scaler.scale(adversarial_loss).backward()
          fgm.restore()
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  #'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          #grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))

    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            loss, y_preds = calculate_loss(inputs=inputs, labels=labels, model=model, criterion=criterion, is_valid=True, device=device)
        #loss = criterion(y_preds.sigmoid().squeeze(), labels.squeeze())
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    predictions = np.concatenate(predictions)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

In [20]:

# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['kfold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['kfold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds['y'].values
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)


    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size*2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_EXP_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr=5e-6, decoder_lr=1e-4, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", 
                    "LayerNorm.weight"]
        group1=['layer.0.','layer.1.','layer.2.','layer.3.']
        group2=['layer.4.','layer.5.','layer.6.','layer.7.']    
        group3=['layer.8.','layer.9.','layer.10.','layer.11.']
        group_all=['layer.0.','layer.1.','layer.2.','layer.3.','layer.4.','layer.5.','layer.6.','layer.7.','layer.8.','layer.9.','layer.10.','layer.11.']
        optimizer_parameters = [
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],'weight_decay': weight_decay},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],'weight_decay': weight_decay, 'lr': encoder_lr/2.6},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],'weight_decay': weight_decay, 'lr': encoder_lr},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],'weight_decay': weight_decay, 'lr': encoder_lr*2.6},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],'weight_decay': 0.0},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],'weight_decay': 0.0, 'lr': encoder_lr/2.6},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],'weight_decay': 0.0, 'lr': encoder_lr},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],'weight_decay': 0.0, 'lr': encoder_lr*2.6},
        {'params': [p for n, p in model.named_parameters() if "model" not in n], 'lr':decoder_lr, "momentum" : 0.99},
    ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.BCEWithLogitsLoss()

    print('Enable FGM')
    fgm = FGM(model=model, eps=0.1)
    
    best_score = -1.

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device, fgm)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        
        # scoring
        score_05 = get_score(valid_labels, predictions)
        score = get_acc_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')

        
        if best_score < score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_EXP_DIR+f"{CFG.model_name.replace('/', '-')}_fold{fold}_best.pth")

    predictions = torch.load(OUTPUT_EXP_DIR+f"{CFG.model_name.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds['pred'] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [21]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        labels = oof_df['y'].values
        preds = oof_df['pred'].values
        score = get_score(labels, preds)
        acc_score = get_acc_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}')
        LOGGER.info(f'ACC BEST Score: {acc_score:<.4f}')
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
            #break
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_EXP_DIR+'oof_df.pkl')

DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "torch_dtype": "f

Enable FGM
Epoch: [1][0/279] Elapsed 0m 3s (remain 16m 24s) Loss: 0.4995(0.4995) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 43s (remain 1m 16s) Loss: 0.6084(0.6392) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 1m 24s (remain 0m 32s) Loss: 0.8123(0.6249) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 1m 55s (remain 0m 0s) Loss: 0.6103(0.6188) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5328(0.5328) 


Epoch 1 - avg_train_loss: 0.6188  avg_val_loss: 0.6136  time: 119s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6188  avg_val_loss: 0.6136  time: 119s
Epoch 1 - Score: 0.7149
INFO:__main__:Epoch 1 - Score: 0.7149
Epoch 1 - Save Best Score: 0.7149 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7149 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8265(0.6136) 
f1 score : 0.13253012048192772
recall score : 0.07236842105263158
precision score : 0.7857142857142857
thresh : 0.47
Epoch: [2][0/279] Elapsed 0m 0s (remain 2m 55s) Loss: 0.5324(0.5324) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 39s (remain 1m 10s) Loss: 0.6771(0.6133) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 1m 20s (remain 0m 31s) Loss: 0.4340(0.6102) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 1m 52s (remain 0m 0s) Loss: 0.6729(0.5995) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.3409(0.3409) 


Epoch 2 - avg_train_loss: 0.5995  avg_val_loss: 0.5653  time: 116s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5995  avg_val_loss: 0.5653  time: 116s
Epoch 2 - Score: 0.7149
INFO:__main__:Epoch 2 - Score: 0.7149


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.0830(0.5653) 
f1 score : 0.1437125748502994
recall score : 0.07894736842105263
precision score : 0.8
thresh : 0.49
Epoch: [3][0/279] Elapsed 0m 0s (remain 2m 38s) Loss: 0.4797(0.4797) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 41s (remain 1m 12s) Loss: 0.5345(0.5487) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 1m 21s (remain 0m 31s) Loss: 0.6281(0.5353) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 1m 52s (remain 0m 0s) Loss: 0.5999(0.5280) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2493(0.2493) 


Epoch 3 - avg_train_loss: 0.5280  avg_val_loss: 0.5729  time: 116s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5280  avg_val_loss: 0.5729  time: 116s
Epoch 3 - Score: 0.7169
INFO:__main__:Epoch 3 - Score: 0.7169
Epoch 3 - Save Best Score: 0.7169 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7169 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.2630(0.5729) 
f1 score : 0.27411167512690354
recall score : 0.17763157894736842
precision score : 0.6
thresh : 0.64
Epoch: [4][0/279] Elapsed 0m 0s (remain 2m 49s) Loss: 0.4049(0.4049) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 40s (remain 1m 12s) Loss: 0.4393(0.4177) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 1m 20s (remain 0m 31s) Loss: 0.4636(0.4009) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 1m 52s (remain 0m 0s) Loss: 0.2893(0.3991) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3290(0.3290) 


Epoch 4 - avg_train_loss: 0.3991  avg_val_loss: 0.6087  time: 116s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3991  avg_val_loss: 0.6087  time: 116s
Epoch 4 - Score: 0.7129
INFO:__main__:Epoch 4 - Score: 0.7129


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1418(0.6087) 
f1 score : 0.46048109965635736
recall score : 0.4407894736842105
precision score : 0.48201438848920863
thresh : 0.75


Score: 0.7129
INFO:__main__:Score: 0.7129
ACC BEST Score: 0.7169
INFO:__main__:ACC BEST Score: 0.7169
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.27411167512690354
recall score : 0.17763157894736842
precision score : 0.6
thresh : 0.64


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable FGM
Epoch: [1][0/279] Elapsed 0m 0s (remain 2m 56s) Loss: 0.6563(0.6563) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 39s (remain 1m 10s) Loss: 0.4604(0.6255) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 1m 20s (remain 0m 31s) Loss: 0.7907(0.6173) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 1m 52s (remain 0m 0s) Loss: 0.7595(0.6152) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4781(0.4781) 


Epoch 1 - avg_train_loss: 0.6152  avg_val_loss: 0.6188  time: 116s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6152  avg_val_loss: 0.6188  time: 116s
Epoch 1 - Score: 0.6968
INFO:__main__:Epoch 1 - Score: 0.6968
Epoch 1 - Save Best Score: 0.6968 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6968 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.9260(0.6188) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
thresh : 0.41
Epoch: [2][0/279] Elapsed 0m 0s (remain 3m 31s) Loss: 0.5159(0.5159) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 40s (remain 1m 11s) Loss: 0.4723(0.5981) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 1m 20s (remain 0m 31s) Loss: 0.6099(0.5919) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 1m 52s (remain 0m 0s) Loss: 0.6111(0.5863) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5067(0.5067) 


Epoch 2 - avg_train_loss: 0.5863  avg_val_loss: 0.5902  time: 116s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5863  avg_val_loss: 0.5902  time: 116s
Epoch 2 - Score: 0.7249
INFO:__main__:Epoch 2 - Score: 0.7249
Epoch 2 - Save Best Score: 0.7249 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7249 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7665(0.5902) 
f1 score : 0.4710144927536232
recall score : 0.42483660130718953
precision score : 0.5284552845528455
thresh : 0.53
Epoch: [3][0/279] Elapsed 0m 0s (remain 2m 52s) Loss: 0.5289(0.5289) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 40s (remain 1m 11s) Loss: 0.5206(0.5380) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 1m 21s (remain 0m 31s) Loss: 0.6449(0.5147) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 1m 52s (remain 0m 0s) Loss: 0.3825(0.5005) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2807(0.2807) 


Epoch 3 - avg_train_loss: 0.5005  avg_val_loss: 0.5823  time: 116s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5005  avg_val_loss: 0.5823  time: 116s
Epoch 3 - Score: 0.7269
INFO:__main__:Epoch 3 - Score: 0.7269
Epoch 3 - Save Best Score: 0.7269 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7269 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.3771(0.5823) 
f1 score : 0.3474178403755869
recall score : 0.24183006535947713
precision score : 0.6166666666666667
thresh : 0.45
Epoch: [4][0/279] Elapsed 0m 0s (remain 2m 46s) Loss: 0.2850(0.2850) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 40s (remain 1m 11s) Loss: 0.2938(0.3185) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 1m 20s (remain 0m 31s) Loss: 0.3420(0.3121) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 1m 52s (remain 0m 0s) Loss: 0.1834(0.3123) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4536(0.4536) 


Epoch 4 - avg_train_loss: 0.3123  avg_val_loss: 0.6431  time: 116s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3123  avg_val_loss: 0.6431  time: 116s
Epoch 4 - Score: 0.7249
INFO:__main__:Epoch 4 - Score: 0.7249


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.2794(0.6431) 
f1 score : 0.4701754385964912
recall score : 0.43790849673202614
precision score : 0.5075757575757576
thresh : 0.75


Score: 0.7209
INFO:__main__:Score: 0.7209
ACC BEST Score: 0.7269
INFO:__main__:ACC BEST Score: 0.7269
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.3474178403755869
recall score : 0.24183006535947713
precision score : 0.6166666666666667
thresh : 0.45


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable FGM
Epoch: [1][0/279] Elapsed 0m 0s (remain 3m 13s) Loss: 0.8851(0.8851) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 40s (remain 1m 11s) Loss: 0.5763(0.6318) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 1m 21s (remain 0m 31s) Loss: 0.6303(0.6260) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 1m 52s (remain 0m 0s) Loss: 0.5685(0.6207) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4277(0.4277) 


Epoch 1 - avg_train_loss: 0.6207  avg_val_loss: 0.6022  time: 117s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6207  avg_val_loss: 0.6022  time: 117s
Epoch 1 - Score: 0.7129
INFO:__main__:Epoch 1 - Score: 0.7129
Epoch 1 - Save Best Score: 0.7129 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7129 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.9523(0.6022) 
f1 score : 0.11976047904191617
recall score : 0.06535947712418301
precision score : 0.7142857142857143
thresh : 0.4
Epoch: [2][0/279] Elapsed 0m 0s (remain 3m 3s) Loss: 0.6369(0.6369) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 40s (remain 1m 11s) Loss: 0.4458(0.6227) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 1m 20s (remain 0m 31s) Loss: 0.6052(0.6031) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 1m 51s (remain 0m 0s) Loss: 0.6288(0.5988) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3119(0.3119) 


Epoch 2 - avg_train_loss: 0.5988  avg_val_loss: 0.5684  time: 115s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5988  avg_val_loss: 0.5684  time: 115s
Epoch 2 - Score: 0.7269
INFO:__main__:Epoch 2 - Score: 0.7269
Epoch 2 - Save Best Score: 0.7269 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7269 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.2360(0.5684) 
f1 score : 0.1301775147928994
recall score : 0.0718954248366013
precision score : 0.6875
thresh : 0.39
Epoch: [3][0/279] Elapsed 0m 0s (remain 2m 56s) Loss: 0.4100(0.4100) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 40s (remain 1m 12s) Loss: 0.4887(0.5464) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 1m 20s (remain 0m 31s) Loss: 0.5474(0.5343) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 1m 52s (remain 0m 0s) Loss: 0.5907(0.5248) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5879(0.5879) 


Epoch 3 - avg_train_loss: 0.5248  avg_val_loss: 0.5930  time: 116s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5248  avg_val_loss: 0.5930  time: 116s
Epoch 3 - Score: 0.7129
INFO:__main__:Epoch 3 - Score: 0.7129


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7856(0.5930) 
f1 score : 0.5476190476190476
recall score : 0.6013071895424836
precision score : 0.5027322404371585
thresh : 0.64
Epoch: [4][0/279] Elapsed 0m 0s (remain 2m 45s) Loss: 0.6129(0.6129) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 40s (remain 1m 12s) Loss: 0.6319(0.4330) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 1m 20s (remain 0m 31s) Loss: 0.4280(0.4093) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 1m 51s (remain 0m 0s) Loss: 0.4396(0.4066) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4328(0.4328) 


Epoch 4 - avg_train_loss: 0.4066  avg_val_loss: 0.5933  time: 116s
INFO:__main__:Epoch 4 - avg_train_loss: 0.4066  avg_val_loss: 0.5933  time: 116s
Epoch 4 - Score: 0.7189
INFO:__main__:Epoch 4 - Score: 0.7189


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.2524(0.5933) 
f1 score : 0.4765342960288809
recall score : 0.43137254901960786
precision score : 0.532258064516129
thresh : 0.71


Score: 0.7048
INFO:__main__:Score: 0.7048
ACC BEST Score: 0.7269
INFO:__main__:ACC BEST Score: 0.7269
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.1301775147928994
recall score : 0.0718954248366013
precision score : 0.6875
thresh : 0.39


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable FGM
Epoch: [1][0/279] Elapsed 0m 0s (remain 2m 57s) Loss: 0.5847(0.5847) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 41s (remain 1m 12s) Loss: 0.5180(0.6330) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 1m 20s (remain 0m 31s) Loss: 0.6813(0.6163) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 1m 52s (remain 0m 0s) Loss: 0.5497(0.6204) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3805(0.3805) 


Epoch 1 - avg_train_loss: 0.6204  avg_val_loss: 0.6093  time: 116s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6204  avg_val_loss: 0.6093  time: 116s
Epoch 1 - Score: 0.7008
INFO:__main__:Epoch 1 - Score: 0.7008
Epoch 1 - Save Best Score: 0.7008 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7008 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.0936(0.6093) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
thresh : 0.39
Epoch: [2][0/279] Elapsed 0m 0s (remain 2m 50s) Loss: 0.6032(0.6032) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 40s (remain 1m 11s) Loss: 0.8408(0.6021) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 1m 20s (remain 0m 31s) Loss: 0.2202(0.5927) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 1m 51s (remain 0m 0s) Loss: 0.5087(0.5916) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4660(0.4660) 


Epoch 2 - avg_train_loss: 0.5916  avg_val_loss: 0.6057  time: 115s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5916  avg_val_loss: 0.6057  time: 115s
Epoch 2 - Score: 0.7088
INFO:__main__:Epoch 2 - Score: 0.7088
Epoch 2 - Save Best Score: 0.7088 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7088 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8245(0.6057) 
f1 score : 0.15028901734104047
recall score : 0.08496732026143791
precision score : 0.65
thresh : 0.47
Epoch: [3][0/279] Elapsed 0m 0s (remain 2m 58s) Loss: 0.6438(0.6438) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 41s (remain 1m 12s) Loss: 0.8038(0.5527) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 1m 21s (remain 0m 31s) Loss: 0.5609(0.5400) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 1m 52s (remain 0m 0s) Loss: 0.4409(0.5308) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2504(0.2504) 


Epoch 3 - avg_train_loss: 0.5308  avg_val_loss: 0.5825  time: 116s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5308  avg_val_loss: 0.5825  time: 116s
Epoch 3 - Score: 0.7129
INFO:__main__:Epoch 3 - Score: 0.7129
Epoch 3 - Save Best Score: 0.7129 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7129 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1623(0.5825) 
f1 score : 0.31627906976744186
recall score : 0.2222222222222222
precision score : 0.5483870967741935
thresh : 0.47
Epoch: [4][0/279] Elapsed 0m 0s (remain 2m 39s) Loss: 0.4570(0.4570) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 40s (remain 1m 11s) Loss: 0.4367(0.4212) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 1m 20s (remain 0m 31s) Loss: 0.2388(0.4151) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 1m 52s (remain 0m 0s) Loss: 0.4225(0.4147) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2682(0.2682) 


Epoch 4 - avg_train_loss: 0.4147  avg_val_loss: 0.6380  time: 116s
INFO:__main__:Epoch 4 - avg_train_loss: 0.4147  avg_val_loss: 0.6380  time: 116s
Epoch 4 - Score: 0.7149
INFO:__main__:Epoch 4 - Score: 0.7149
Epoch 4 - Save Best Score: 0.7149 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7149 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1988(0.6380) 
f1 score : 0.449438202247191
recall score : 0.39215686274509803
precision score : 0.5263157894736842
thresh : 0.62


Score: 0.7048
INFO:__main__:Score: 0.7048
ACC BEST Score: 0.7149
INFO:__main__:ACC BEST Score: 0.7149
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.449438202247191
recall score : 0.39215686274509803
precision score : 0.5263157894736842
thresh : 0.62


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable FGM
Epoch: [1][0/279] Elapsed 0m 0s (remain 3m 8s) Loss: 0.7157(0.7157) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 40s (remain 1m 11s) Loss: 0.6234(0.6402) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 1m 21s (remain 0m 31s) Loss: 0.6676(0.6301) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 1m 52s (remain 0m 0s) Loss: 0.6083(0.6244) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3370(0.3370) 


Epoch 1 - avg_train_loss: 0.6244  avg_val_loss: 0.5989  time: 117s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6244  avg_val_loss: 0.5989  time: 117s
Epoch 1 - Score: 0.7243
INFO:__main__:Epoch 1 - Score: 0.7243
Epoch 1 - Save Best Score: 0.7243 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7243 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1776(0.5989) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
thresh : 0.32
Epoch: [2][0/279] Elapsed 0m 0s (remain 2m 50s) Loss: 0.7070(0.7070) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 41s (remain 1m 12s) Loss: 0.7083(0.6004) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 1m 21s (remain 0m 31s) Loss: 0.4185(0.5900) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 1m 52s (remain 0m 0s) Loss: 0.5936(0.5867) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2748(0.2748) 


Epoch 2 - avg_train_loss: 0.5867  avg_val_loss: 0.5691  time: 116s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5867  avg_val_loss: 0.5691  time: 116s
Epoch 2 - Score: 0.7324
INFO:__main__:Epoch 2 - Score: 0.7324
Epoch 2 - Save Best Score: 0.7324 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7324 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1666(0.5691) 
f1 score : 0.16568047337278105
recall score : 0.09210526315789473
precision score : 0.8235294117647058
thresh : 0.35
Epoch: [3][0/279] Elapsed 0m 0s (remain 2m 59s) Loss: 0.4605(0.4605) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 40s (remain 1m 12s) Loss: 0.5138(0.5372) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 1m 21s (remain 0m 31s) Loss: 0.4969(0.5136) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 1m 52s (remain 0m 0s) Loss: 0.5758(0.5047) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3445(0.3445) 


Epoch 3 - avg_train_loss: 0.5047  avg_val_loss: 0.5837  time: 116s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5047  avg_val_loss: 0.5837  time: 116s
Epoch 3 - Score: 0.7243
INFO:__main__:Epoch 3 - Score: 0.7243


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8372(0.5837) 
f1 score : 0.45
recall score : 0.4144736842105263
precision score : 0.4921875
thresh : 0.68
Epoch: [4][0/279] Elapsed 0m 0s (remain 2m 51s) Loss: 0.4009(0.4009) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 40s (remain 1m 11s) Loss: 0.1870(0.3919) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 1m 21s (remain 0m 31s) Loss: 0.3662(0.3902) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 1m 52s (remain 0m 0s) Loss: 0.4175(0.3915) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3437(0.3437) 


Epoch 4 - avg_train_loss: 0.3915  avg_val_loss: 0.6403  time: 116s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3915  avg_val_loss: 0.6403  time: 116s
Epoch 4 - Score: 0.7223
INFO:__main__:Epoch 4 - Score: 0.7223


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.0726(0.6403) 
f1 score : 0.42490842490842495
recall score : 0.3815789473684211
precision score : 0.4793388429752066
thresh : 0.78


Score: 0.7163
INFO:__main__:Score: 0.7163
ACC BEST Score: 0.7324
INFO:__main__:ACC BEST Score: 0.7324
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.16568047337278105
recall score : 0.09210526315789473
precision score : 0.8235294117647058
thresh : 0.35


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable FGM
Epoch: [1][0/279] Elapsed 0m 0s (remain 3m 15s) Loss: 0.6589(0.6589) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 40s (remain 1m 12s) Loss: 0.5503(0.6415) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 1m 21s (remain 0m 31s) Loss: 0.6878(0.6325) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 1m 52s (remain 0m 0s) Loss: 0.5795(0.6266) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4323(0.4323) 


Epoch 1 - avg_train_loss: 0.6266  avg_val_loss: 0.6141  time: 116s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6266  avg_val_loss: 0.6141  time: 116s
Epoch 1 - Score: 0.6982
INFO:__main__:Epoch 1 - Score: 0.6982
Epoch 1 - Save Best Score: 0.6982 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6982 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.0504(0.6141) 
f1 score : 0.07317073170731707
recall score : 0.039473684210526314
precision score : 0.5
thresh : 0.62
Epoch: [2][0/279] Elapsed 0m 0s (remain 2m 56s) Loss: 0.6101(0.6101) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 40s (remain 1m 11s) Loss: 0.7717(0.6045) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 1m 21s (remain 0m 31s) Loss: 0.5362(0.5917) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 1m 52s (remain 0m 0s) Loss: 0.5716(0.5890) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4032(0.4032) 


Epoch 2 - avg_train_loss: 0.5890  avg_val_loss: 0.5976  time: 116s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5890  avg_val_loss: 0.5976  time: 116s
Epoch 2 - Score: 0.6982
INFO:__main__:Epoch 2 - Score: 0.6982


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1183(0.5976) 
f1 score : 0.07361963190184048
recall score : 0.039473684210526314
precision score : 0.5454545454545454
thresh : 0.51
Epoch: [3][0/279] Elapsed 0m 0s (remain 2m 57s) Loss: 0.4831(0.4831) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 40s (remain 1m 11s) Loss: 0.5582(0.5573) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 1m 21s (remain 0m 31s) Loss: 0.4764(0.5433) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 1m 52s (remain 0m 0s) Loss: 0.5673(0.5360) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2795(0.2795) 


Epoch 3 - avg_train_loss: 0.5360  avg_val_loss: 0.6286  time: 116s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5360  avg_val_loss: 0.6286  time: 116s
Epoch 3 - Score: 0.7042
INFO:__main__:Epoch 3 - Score: 0.7042
Epoch 3 - Save Best Score: 0.7042 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7042 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.5275(0.6286) 
f1 score : 0.05
recall score : 0.02631578947368421
precision score : 0.5
thresh : 0.43
Epoch: [4][0/279] Elapsed 0m 0s (remain 3m 6s) Loss: 0.3867(0.3867) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 41s (remain 1m 12s) Loss: 0.3741(0.4341) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 1m 22s (remain 0m 31s) Loss: 0.3725(0.4191) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 1m 53s (remain 0m 0s) Loss: 0.4542(0.4123) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4740(0.4740) 


Epoch 4 - avg_train_loss: 0.4123  avg_val_loss: 0.6771  time: 117s
INFO:__main__:Epoch 4 - avg_train_loss: 0.4123  avg_val_loss: 0.6771  time: 117s
Epoch 4 - Score: 0.6982
INFO:__main__:Epoch 4 - Score: 0.6982


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.2937(0.6771) 
f1 score : 0.3346007604562738
recall score : 0.2894736842105263
precision score : 0.3963963963963964
thresh : 0.78


Score: 0.6942
INFO:__main__:Score: 0.6942
ACC BEST Score: 0.7042
INFO:__main__:ACC BEST Score: 0.7042
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.05
recall score : 0.02631578947368421
precision score : 0.5
thresh : 0.43


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable FGM
Epoch: [1][0/279] Elapsed 0m 0s (remain 3m 10s) Loss: 0.6215(0.6215) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 41s (remain 1m 12s) Loss: 0.6074(0.6257) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 1m 21s (remain 0m 31s) Loss: 0.8475(0.6310) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 1m 52s (remain 0m 0s) Loss: 0.4100(0.6204) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2722(0.2722) 


Epoch 1 - avg_train_loss: 0.6204  avg_val_loss: 0.6103  time: 116s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6204  avg_val_loss: 0.6103  time: 116s
Epoch 1 - Score: 0.7082
INFO:__main__:Epoch 1 - Score: 0.7082
Epoch 1 - Save Best Score: 0.7082 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7082 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.3442(0.6103) 
f1 score : 0.08536585365853658
recall score : 0.046052631578947366
precision score : 0.5833333333333334
thresh : 0.26
Epoch: [2][0/279] Elapsed 0m 0s (remain 2m 59s) Loss: 0.5262(0.5262) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 40s (remain 1m 11s) Loss: 0.6199(0.6071) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 1m 20s (remain 0m 31s) Loss: 0.5418(0.5995) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 1m 52s (remain 0m 0s) Loss: 0.5396(0.5917) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4201(0.4201) 


Epoch 2 - avg_train_loss: 0.5917  avg_val_loss: 0.5830  time: 116s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5917  avg_val_loss: 0.5830  time: 116s
Epoch 2 - Score: 0.7082
INFO:__main__:Epoch 2 - Score: 0.7082


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8991(0.5830) 
f1 score : 0.27860696517412936
recall score : 0.18421052631578946
precision score : 0.5714285714285714
thresh : 0.5
Epoch: [3][0/279] Elapsed 0m 0s (remain 2m 39s) Loss: 0.6622(0.6622) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 40s (remain 1m 11s) Loss: 0.6553(0.5601) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 1m 20s (remain 0m 31s) Loss: 0.5735(0.5547) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 1m 51s (remain 0m 0s) Loss: 0.4567(0.5448) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2666(0.2666) 


Epoch 3 - avg_train_loss: 0.5448  avg_val_loss: 0.5749  time: 115s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5448  avg_val_loss: 0.5749  time: 115s
Epoch 3 - Score: 0.7143
INFO:__main__:Epoch 3 - Score: 0.7143
Epoch 3 - Save Best Score: 0.7143 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7143 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.2677(0.5749) 
f1 score : 0.2185792349726776
recall score : 0.13157894736842105
precision score : 0.6451612903225806
thresh : 0.48
Epoch: [4][0/279] Elapsed 0m 0s (remain 3m 2s) Loss: 0.5008(0.5008) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 40s (remain 1m 12s) Loss: 0.3691(0.4259) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 1m 21s (remain 0m 31s) Loss: 0.6069(0.4242) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 1m 52s (remain 0m 0s) Loss: 0.2566(0.4251) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3030(0.3030) 


Epoch 4 - avg_train_loss: 0.4251  avg_val_loss: 0.6048  time: 116s
INFO:__main__:Epoch 4 - avg_train_loss: 0.4251  avg_val_loss: 0.6048  time: 116s
Epoch 4 - Score: 0.7163
INFO:__main__:Epoch 4 - Score: 0.7163
Epoch 4 - Save Best Score: 0.7163 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7163 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.2690(0.6048) 
f1 score : 0.40784313725490196
recall score : 0.34210526315789475
precision score : 0.5048543689320388
thresh : 0.73


Score: 0.6962
INFO:__main__:Score: 0.6962
ACC BEST Score: 0.7163
INFO:__main__:ACC BEST Score: 0.7163
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.40784313725490196
recall score : 0.34210526315789475
precision score : 0.5048543689320388
thresh : 0.73


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable FGM
Epoch: [1][0/279] Elapsed 0m 0s (remain 3m 14s) Loss: 0.6674(0.6674) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 41s (remain 1m 13s) Loss: 0.7697(0.6285) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 1m 21s (remain 0m 31s) Loss: 0.6576(0.6212) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 1m 52s (remain 0m 0s) Loss: 0.5865(0.6202) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4775(0.4775) 


Epoch 1 - avg_train_loss: 0.6202  avg_val_loss: 0.6188  time: 116s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6202  avg_val_loss: 0.6188  time: 116s
Epoch 1 - Score: 0.7203
INFO:__main__:Epoch 1 - Score: 0.7203
Epoch 1 - Save Best Score: 0.7203 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7203 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.9321(0.6188) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
thresh : 0.41
Epoch: [2][0/279] Elapsed 0m 0s (remain 2m 56s) Loss: 0.5624(0.5624) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 41s (remain 1m 12s) Loss: 0.6064(0.6065) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 1m 21s (remain 0m 31s) Loss: 1.0602(0.5973) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 1m 52s (remain 0m 0s) Loss: 0.5642(0.5959) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4908(0.4908) 


Epoch 2 - avg_train_loss: 0.5959  avg_val_loss: 0.6027  time: 116s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5959  avg_val_loss: 0.6027  time: 116s
Epoch 2 - Score: 0.7243
INFO:__main__:Epoch 2 - Score: 0.7243
Epoch 2 - Save Best Score: 0.7243 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7243 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8343(0.6027) 
f1 score : 0.20540540540540542
recall score : 0.125
precision score : 0.5757575757575758
thresh : 0.48
Epoch: [3][0/279] Elapsed 0m 0s (remain 3m 8s) Loss: 0.7110(0.7110) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 40s (remain 1m 10s) Loss: 0.4584(0.5630) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 1m 21s (remain 0m 31s) Loss: 0.4268(0.5434) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 1m 52s (remain 0m 0s) Loss: 0.4282(0.5437) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2580(0.2580) 


Epoch 3 - avg_train_loss: 0.5437  avg_val_loss: 0.5823  time: 116s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5437  avg_val_loss: 0.5823  time: 116s
Epoch 3 - Score: 0.7243
INFO:__main__:Epoch 3 - Score: 0.7243


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.2458(0.5823) 
f1 score : 0.13333333333333336
recall score : 0.07236842105263158
precision score : 0.8461538461538461
thresh : 0.4
Epoch: [4][0/279] Elapsed 0m 0s (remain 2m 59s) Loss: 0.5517(0.5517) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 41s (remain 1m 12s) Loss: 0.3725(0.4609) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 1m 20s (remain 0m 31s) Loss: 0.3199(0.4316) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 1m 53s (remain 0m 0s) Loss: 0.2015(0.4268) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3948(0.3948) 


Epoch 4 - avg_train_loss: 0.4268  avg_val_loss: 0.6165  time: 117s
INFO:__main__:Epoch 4 - avg_train_loss: 0.4268  avg_val_loss: 0.6165  time: 117s
Epoch 4 - Score: 0.7223
INFO:__main__:Epoch 4 - Score: 0.7223


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.9017(0.6165) 
f1 score : 0.42446043165467623
recall score : 0.3881578947368421
precision score : 0.46825396825396826
thresh : 0.59


Score: 0.7042
INFO:__main__:Score: 0.7042
ACC BEST Score: 0.7243
INFO:__main__:ACC BEST Score: 0.7243
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.20540540540540542
recall score : 0.125
precision score : 0.5757575757575758
thresh : 0.48


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable FGM
Epoch: [1][0/279] Elapsed 0m 0s (remain 3m 10s) Loss: 0.7012(0.7012) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 41s (remain 1m 12s) Loss: 0.7236(0.6371) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 1m 21s (remain 0m 31s) Loss: 0.7450(0.6275) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 1m 52s (remain 0m 0s) Loss: 0.5614(0.6201) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4212(0.4212) 


Epoch 1 - avg_train_loss: 0.6201  avg_val_loss: 0.6066  time: 116s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6201  avg_val_loss: 0.6066  time: 116s
Epoch 1 - Score: 0.7062
INFO:__main__:Epoch 1 - Score: 0.7062
Epoch 1 - Save Best Score: 0.7062 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7062 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.9341(0.6066) 
f1 score : 0.07272727272727271
recall score : 0.039473684210526314
precision score : 0.46153846153846156
thresh : 0.36
Epoch: [2][0/279] Elapsed 0m 0s (remain 2m 50s) Loss: 0.4941(0.4941) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 41s (remain 1m 13s) Loss: 0.5745(0.5937) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 1m 22s (remain 0m 31s) Loss: 0.6191(0.6006) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 1m 53s (remain 0m 0s) Loss: 0.7732(0.5990) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3680(0.3680) 


Epoch 2 - avg_train_loss: 0.5990  avg_val_loss: 0.5736  time: 117s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5990  avg_val_loss: 0.5736  time: 117s
Epoch 2 - Score: 0.7103
INFO:__main__:Epoch 2 - Score: 0.7103
Epoch 2 - Save Best Score: 0.7103 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7103 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.9766(0.5736) 
f1 score : 0.06289308176100628
recall score : 0.03289473684210526
precision score : 0.7142857142857143
thresh : 0.44
Epoch: [3][0/279] Elapsed 0m 0s (remain 3m 7s) Loss: 0.6651(0.6651) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 40s (remain 1m 12s) Loss: 0.6045(0.5396) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 1m 20s (remain 0m 31s) Loss: 0.6498(0.5337) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 1m 52s (remain 0m 0s) Loss: 0.4943(0.5234) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4476(0.4476) 


Epoch 3 - avg_train_loss: 0.5234  avg_val_loss: 0.5654  time: 116s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5234  avg_val_loss: 0.5654  time: 116s
Epoch 3 - Score: 0.7243
INFO:__main__:Epoch 3 - Score: 0.7243
Epoch 3 - Save Best Score: 0.7243 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7243 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7063(0.5654) 
f1 score : 0.4615384615384615
recall score : 0.4342105263157895
precision score : 0.4925373134328358
thresh : 0.59
Epoch: [4][0/279] Elapsed 0m 0s (remain 2m 47s) Loss: 0.5786(0.5786) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 41s (remain 1m 12s) Loss: 0.6228(0.4065) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 1m 22s (remain 0m 31s) Loss: 0.6417(0.3960) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 1m 53s (remain 0m 0s) Loss: 0.5134(0.3856) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3661(0.3661) 


Epoch 4 - avg_train_loss: 0.3856  avg_val_loss: 0.5932  time: 117s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3856  avg_val_loss: 0.5932  time: 117s
Epoch 4 - Score: 0.7203
INFO:__main__:Epoch 4 - Score: 0.7203


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8905(0.5932) 
f1 score : 0.4592592592592593
recall score : 0.40789473684210525
precision score : 0.5254237288135594
thresh : 0.71


Score: 0.6901
INFO:__main__:Score: 0.6901
ACC BEST Score: 0.7243
INFO:__main__:ACC BEST Score: 0.7243
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.4615384615384615
recall score : 0.4342105263157895
precision score : 0.4925373134328358
thresh : 0.59


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable FGM
Epoch: [1][0/279] Elapsed 0m 0s (remain 3m 2s) Loss: 0.7214(0.7214) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 40s (remain 1m 11s) Loss: 0.4855(0.6055) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 1m 22s (remain 0m 31s) Loss: 0.6589(0.6130) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 1m 53s (remain 0m 0s) Loss: 0.5705(0.6138) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3487(0.3487) 


Epoch 1 - avg_train_loss: 0.6138  avg_val_loss: 0.5865  time: 117s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6138  avg_val_loss: 0.5865  time: 117s
Epoch 1 - Score: 0.7062
INFO:__main__:Epoch 1 - Score: 0.7062
Epoch 1 - Save Best Score: 0.7062 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7062 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.0929(0.5865) 
f1 score : 0.09815950920245399
recall score : 0.05263157894736842
precision score : 0.7272727272727273
thresh : 0.45
Epoch: [2][0/279] Elapsed 0m 0s (remain 3m 8s) Loss: 0.4863(0.4863) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 40s (remain 1m 10s) Loss: 0.6516(0.5940) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 1m 20s (remain 0m 31s) Loss: 0.5143(0.5830) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 1m 52s (remain 0m 0s) Loss: 0.5623(0.5742) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4283(0.4283) 


Epoch 2 - avg_train_loss: 0.5742  avg_val_loss: 0.5917  time: 116s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5742  avg_val_loss: 0.5917  time: 116s
Epoch 2 - Score: 0.7082
INFO:__main__:Epoch 2 - Score: 0.7082
Epoch 2 - Save Best Score: 0.7082 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7082 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8294(0.5917) 
f1 score : 0.43661971830985913
recall score : 0.40789473684210525
precision score : 0.4696969696969697
thresh : 0.59
Epoch: [3][0/279] Elapsed 0m 0s (remain 2m 50s) Loss: 0.4296(0.4296) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 41s (remain 1m 12s) Loss: 0.5572(0.5222) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 1m 21s (remain 0m 31s) Loss: 0.3089(0.5028) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 1m 53s (remain 0m 0s) Loss: 0.2948(0.4918) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2375(0.2375) 


Epoch 3 - avg_train_loss: 0.4918  avg_val_loss: 0.6072  time: 117s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4918  avg_val_loss: 0.6072  time: 117s
Epoch 3 - Score: 0.7143
INFO:__main__:Epoch 3 - Score: 0.7143
Epoch 3 - Save Best Score: 0.7143 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7143 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.2083(0.6072) 
f1 score : 0.34632034632034636
recall score : 0.2631578947368421
precision score : 0.5063291139240507
thresh : 0.67
Epoch: [4][0/279] Elapsed 0m 0s (remain 3m 17s) Loss: 0.3210(0.3210) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 40s (remain 1m 11s) Loss: 0.1627(0.3227) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 1m 20s (remain 0m 31s) Loss: 0.6335(0.3076) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 1m 52s (remain 0m 0s) Loss: 0.3501(0.3048) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2586(0.2586) 


Epoch 4 - avg_train_loss: 0.3048  avg_val_loss: 0.7271  time: 116s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3048  avg_val_loss: 0.7271  time: 116s
Epoch 4 - Score: 0.7103
INFO:__main__:Epoch 4 - Score: 0.7103


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.5915(0.7271) 
f1 score : 0.3777777777777778
recall score : 0.3355263157894737
precision score : 0.4322033898305085
thresh : 0.78


Score: 0.6962
INFO:__main__:Score: 0.6962
ACC BEST Score: 0.7143
INFO:__main__:ACC BEST Score: 0.7143
Score: 0.7041
INFO:__main__:Score: 0.7041
ACC BEST Score: 0.7091
INFO:__main__:ACC BEST Score: 0.7091


f1 score : 0.34632034632034636
recall score : 0.2631578947368421
precision score : 0.5063291139240507
thresh : 0.67
f1 score : 0.3095684803001876
recall score : 0.21667760998030203
precision score : 0.541871921182266
thresh : 0.61


In [None]:
from google.colab import runtime
runtime.unassign()