In [1]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers
!pip install datasets
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!nvidia-smi

Sat Apr 29 13:22:33 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   43C    P0    44W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:

import os
import gc
import math
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import re
import html

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam, SGD, AdamW, RAdam
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,GroupKFold
from sklearn.metrics import log_loss,f1_score, recall_score, accuracy_score, precision_score

from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
DIR = "/content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測"
INPUT_DIR = os.path.join(DIR,"input")
OUTPUT_DIR = os.path.join(DIR,"output")
CUSTOM_MODEL_DIR = os.path.join(OUTPUT_DIR,'clrp_deberta_v3_base')
OUTPUT_EXP_DIR = DIR + '/output/EXP031/'
if not os.path.exists(OUTPUT_EXP_DIR):
    os.makedirs(OUTPUT_EXP_DIR)

In [6]:

# ====================================================
# CFG
# ====================================================
class CFG:
    debug=False
    apex=True
    print_freq=100
    num_workers=4
    model_name="microsoft/deberta-v3-base"
    # model='microsoft/deberta-base'
    # model='roberta-base'
    # model='roberta-large'
    # model='roberta-large-mnli'
    # model='xlnet-large-cased'
    # model='albert-xxlarge-v2'
    # model="microsoft/deberta-large"
    # model="microsoft/deberta-v3-large"
    # model='microsoft/deberta-v2-xlarge'
    # model='funnel-transformer/large'
    # model='funnel-transformer/medium'
    # model='albert-base-v2'
    # model='albert-large-v2'
    # model='google/electra-large-discriminator'
    # model='google/electra-base-discriminator'
    # model="facebook/bart-large-mnli"
    # model="facebook/bart-large"
    # model="facebook/bart-base"
    model = CUSTOM_MODEL_DIR
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=4
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=16
    fc_dropout=0.2
    target_size=1
    max_len=256
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    seed=42
    n_fold=10
    trn_fold=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    train=True
    nth_awp_start_epoch=1
    gradient_checkpointing = False
    freezing = False
    num_reinit_layers = 1
    is_reinit_layer = False
    fgm = True

if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0, 1]

In [7]:
def get_score(labels, outputs):
    thresh = 0.5
    y_pred = outputs
    y_true = labels
    f_score = f1_score(y_true, (y_pred>thresh).astype(int))
    r_score = recall_score(y_true, (y_pred>thresh).astype(int))
    p_score = precision_score(y_true, (y_pred>thresh).astype(int))
    print(f"f1 score : {f_score}")
    print(f"recall score : {r_score}")
    print(f"precision score : {p_score}")
    return accuracy_score(y_true, (y_pred>thresh).astype(int))

def get_acc_score(labels, outputs):
    y_pred = outputs
    y_true = labels
    best_score = 0
    best_thresh = 0.5
    for thresh in np.arange(0.1, 0.80, 0.01):
        thresh = np.round(thresh, 2)
        score = accuracy_score(y_true, (y_pred>thresh).astype(int))
        #print("Accuracy score at threshold {0} is {1}".format(thresh, score))
        if score > best_score:
          best_score = score
          best_thresh = thresh
    print(f"thresh : {best_thresh}")
    return accuracy_score(y_true, (y_pred>best_thresh).astype(int))


def get_logger(filename=OUTPUT_EXP_DIR+'train'):
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=CFG.seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=CFG.seed)

In [8]:
def freeze(module):
    """
    Freezes module's parameters.
    """
    
    for parameter in module.parameters():
        parameter.requires_grad = False
        
def get_freezed_parameters(module):
    """
    Returns names of freezed parameters of the given module.
    """
    
    freezed_parameters = []
    for name, parameter in module.named_parameters():
        if not parameter.requires_grad:
            freezed_parameters.append(name)
            
    return freezed_parameters

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """
    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930
    """
    
    embedding_types = ("word", "position", "token_type")
    for embedding_type in embedding_types:
        attr_name = f"{embedding_type}_embeddings"
        
        if hasattr(embeddings_path, attr_name): 
            bnb.optim.GlobalOptimManager.get_instance().register_module_override(
                getattr(embeddings_path, attr_name), 'weight', {'optim_bits': optim_bits}
            )

In [9]:
import pandas as pd
import numpy as np

train = pd.read_csv(os.path.join(INPUT_DIR,"train_data.csv"))
test = pd.read_csv(os.path.join(INPUT_DIR,"test_data.csv"))
sample_sub = pd.read_csv(os.path.join(INPUT_DIR,"submission.csv"))

print(train.shape)
display(train.head(3))

print(test.shape)
display(test.head(3))

print(sample_sub.shape)
display(sample_sub.head(3))

(4974, 6)


Unnamed: 0,id,title,year,abstract,keywords,y
0,1,Hierarchical Adversarially Learned Inference,2018,We propose a novel hierarchical generative mod...,"generative, hierarchical, unsupervised, semisu...",0
1,2,Learning to Compute Word Embeddings On the Fly,2018,Words in natural language follow a Zipfian dis...,"NLU, word embeddings, representation learning",0
2,3,Graph2Seq: Scalable Learning Dynamics for Graphs,2018,Neural networks are increasingly used as a gen...,,0


(6393, 5)


Unnamed: 0,id,title,year,abstract,keywords
0,1,StyleAlign: Analysis and Applications of Align...,2022,"In this paper, we perform an in-depth study of...","StyleGAN, transfer learning, fine tuning, mode..."
1,2,Embedding a random graph via GNN: mean-field i...,2021,We develop a theory for embedding a random gra...,"Graph neural network, graph embedding, multi-r..."
2,3,BBRefinement: an universal scheme to improve p...,2021,We present a conceptually simple yet powerful ...,"object detection, deep neural networks, refine..."


(6393, 2)


Unnamed: 0,id,y
0,1,0
1,2,0
2,3,0


In [10]:
train["texts"] = train["title"] + "[SEP]" + train["abstract"] 

In [11]:
skf = StratifiedKFold(n_splits=CFG.n_fold,shuffle=True,random_state=CFG.seed)
for fold, ( _, val_) in enumerate(skf.split(train, train.y)):
    train.loc[val_ , "kfold"] = int(fold)
    
train["kfold"] = train["kfold"].astype(int)

if CFG.debug:
    display(train.groupby('kfold').size())
    train = train.sample(n=500, random_state=0).reset_index(drop=True)
    display(train.groupby('kfold').size())

In [12]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_EXP_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

In [13]:
# ====================================================
# Define max_len
# ====================================================
lengths = []
tk0 = tqdm(train['texts'].fillna("").values, total=len(train))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
    lengths.append(length)
CFG.max_len = max(lengths) + 3 # cls + sep + sep
LOGGER.info(f"max_len: {CFG.max_len}")

100%|██████████| 4974/4974 [00:03<00:00, 1285.26it/s]
max_len: 522
INFO:__main__:max_len: 522


In [14]:
class FGM():
    def __init__(self, model, eps=1.):
        self.model = model
        self.eps = eps
        self.backup = {}

    def attack(self, emb_name='word_embeddings'):
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                self.backup[name] = param.data.clone()
                norm = torch.norm(param.grad)
                if norm != 0:
                    r_at = self.eps * param.grad / norm
                    param.data.add_(r_at)

    def restore(self, emb_name='word_embeddings'):
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                assert name in self.backup
                param.data = self.backup[name]
            self.backup = {}

In [15]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer(text,
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           padding="max_length",
                           return_offsets_mapping=False,
                           truncation=True)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.inputs = df['texts'].values
        self.labels = df['y'].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.inputs[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

#collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [16]:
def reinit_layers(model):

    #for layer in model.model.encoder.layer[-CFG.num_reinit_layers:]:
    for layer in model.encoder.layer[-CFG.num_reinit_layers:]:    #Custome model内(backbone)

            for module in layer.modules():

                if isinstance(module,nn.Linear):
                    module.weight.data.normal_(mean=0.0,std=model.config.initializer_range)
                    if module.bias is not None:
                            module.bias.data.zero_()
                elif isinstance(module, nn.Embedding):
                        module.weight.data.normal_(mean=0.0, std=model.config.initializer_range)
                        if module.padding_idx is not None:
                            module.weight.data[module.padding_idx].zero_()
                elif isinstance(module, nn.LayerNorm):
                        module.bias.data.zero_()
                        module.weight.data.fill_(1.0)
                        
    return model

In [17]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

class MaxPooling(nn.Module):
    def __init__(self):
        super(MaxPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        embeddings = last_hidden_state.clone()
        embeddings[input_mask_expanded == 0] = -1e4
        max_embeddings, _ = torch.max(embeddings, dim=1)
        return max_embeddings
    

class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        if CFG.is_reinit_layer:
            self.model = reinit_layers(self.model)
            print(f'Reinitializing Last {CFG.num_reinit_layers} Layers.')
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # Freezing
        if cfg.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            cfg.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())

        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)
        self._init_weights(self.fc)
        self.layer_norm1 = nn.LayerNorm(self.config.hidden_size)
        self._init_weights(self.fc)
        self.sig = nn.Sigmoid()
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        feature = self.layer_norm1(feature)
        output = self.fc(feature)
        #output = self.sig(output)
        return output

In [18]:
def calculate_loss(inputs, labels, model, criterion, is_valid=True, device="cpu"):    
    y_preds = model(inputs)
    loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
    return (loss, y_preds) if is_valid else loss

In [19]:

# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device, fgm):
    model.zero_grad()
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            loss = calculate_loss(inputs=inputs, labels=labels, model=model, criterion=criterion, is_valid=False, device=device)
        #print(y_preds.sigmoid().squeeze().view(1, -1))
        #loss = criterion(y_preds.sigmoid().squeeze(), labels.squeeze())
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        if scaler is not None:
            scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if CFG.fgm:
          fgm.attack() 
          adversarial_loss = calculate_loss(inputs=inputs, labels=labels, model=model, criterion=criterion, is_valid=False, device=device)
          scaler.scale(adversarial_loss).backward()
          fgm.restore()
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  #'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          #grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))

    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            loss, y_preds = calculate_loss(inputs=inputs, labels=labels, model=model, criterion=criterion, is_valid=True, device=device)
        #loss = criterion(y_preds.sigmoid().squeeze(), labels.squeeze())
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    predictions = np.concatenate(predictions)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

In [20]:

# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['kfold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['kfold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds['y'].values
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)


    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size*2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_EXP_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr=5e-6, decoder_lr=1e-4, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", 
                    "LayerNorm.weight"]
        group1=['layer.0.','layer.1.','layer.2.','layer.3.']
        group2=['layer.4.','layer.5.','layer.6.','layer.7.']    
        group3=['layer.8.','layer.9.','layer.10.','layer.11.']
        group_all=['layer.0.','layer.1.','layer.2.','layer.3.','layer.4.','layer.5.','layer.6.','layer.7.','layer.8.','layer.9.','layer.10.','layer.11.']
        optimizer_parameters = [
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],'weight_decay': weight_decay},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],'weight_decay': weight_decay, 'lr': encoder_lr/2.6},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],'weight_decay': weight_decay, 'lr': encoder_lr},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],'weight_decay': weight_decay, 'lr': encoder_lr*2.6},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],'weight_decay': 0.0},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],'weight_decay': 0.0, 'lr': encoder_lr/2.6},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],'weight_decay': 0.0, 'lr': encoder_lr},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],'weight_decay': 0.0, 'lr': encoder_lr*2.6},
        {'params': [p for n, p in model.named_parameters() if "model" not in n], 'lr':decoder_lr, "momentum" : 0.99},
    ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.MSELoss()

    print('Enable FGM')
    fgm = FGM(model=model, eps=0.1)
    
    best_score = -1.

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device, fgm)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        
        # scoring
        score_05 = get_score(valid_labels, predictions)
        score = get_acc_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')

        
        if best_score < score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_EXP_DIR+f"{CFG.model_name.replace('/', '-')}_fold{fold}_best.pth")

    predictions = torch.load(OUTPUT_EXP_DIR+f"{CFG.model_name.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds['pred'] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [None]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        labels = oof_df['y'].values
        preds = oof_df['pred'].values
        score = get_score(labels, preds)
        acc_score = get_acc_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}')
        LOGGER.info(f'ACC BEST Score: {acc_score:<.4f}')
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
            #break
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_EXP_DIR+'oof_df.pkl')

DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "torch_dtype": "f

Enable FGM
Epoch: [1][0/279] Elapsed 0m 1s (remain 9m 14s) Loss: 1.5115(1.5115) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 42s (remain 1m 14s) Loss: 0.2039(0.3985) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 1m 22s (remain 0m 32s) Loss: 0.2741(0.3059) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 1m 54s (remain 0m 0s) Loss: 0.2079(0.2784) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.1571(0.1571) 


Epoch 1 - avg_train_loss: 0.2784  avg_val_loss: 0.2092  time: 118s
INFO:__main__:Epoch 1 - avg_train_loss: 0.2784  avg_val_loss: 0.2092  time: 118s
Epoch 1 - Score: 0.7129
INFO:__main__:Epoch 1 - Score: 0.7129
Epoch 1 - Save Best Score: 0.7129 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7129 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.3225(0.2092) 
f1 score : 0.46769230769230774
recall score : 1.0
precision score : 0.30522088353413657
thresh : 0.63
Epoch: [2][0/279] Elapsed 0m 0s (remain 3m 15s) Loss: 0.1868(0.1868) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 40s (remain 1m 10s) Loss: 0.1747(0.2109) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 1m 21s (remain 0m 31s) Loss: 0.1714(0.2103) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 1m 53s (remain 0m 0s) Loss: 0.2263(0.2060) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0685(0.0685) 


Epoch 2 - avg_train_loss: 0.2060  avg_val_loss: 0.1912  time: 117s
INFO:__main__:Epoch 2 - avg_train_loss: 0.2060  avg_val_loss: 0.1912  time: 117s
Epoch 2 - Score: 0.7149
INFO:__main__:Epoch 2 - Score: 0.7149
Epoch 2 - Save Best Score: 0.7149 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7149 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.4592(0.1912) 
f1 score : 0.4713178294573644
recall score : 1.0
precision score : 0.30831643002028397
thresh : 0.62
Epoch: [3][0/279] Elapsed 0m 0s (remain 3m 0s) Loss: 0.1313(0.1313) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 41s (remain 1m 13s) Loss: 0.1831(0.1864) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 1m 21s (remain 0m 31s) Loss: 0.2005(0.1781) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 1m 52s (remain 0m 0s) Loss: 0.1870(0.1747) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0993(0.0993) 


Epoch 3 - avg_train_loss: 0.1747  avg_val_loss: 0.1900  time: 116s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1747  avg_val_loss: 0.1900  time: 116s
Epoch 3 - Score: 0.7229
INFO:__main__:Epoch 3 - Score: 0.7229
Epoch 3 - Save Best Score: 0.7229 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7229 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.3960(0.1900) 
f1 score : 0.4802527646129542
recall score : 1.0
precision score : 0.316008316008316
thresh : 0.65
Epoch: [4][0/279] Elapsed 0m 0s (remain 3m 4s) Loss: 0.0941(0.0941) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 41s (remain 1m 12s) Loss: 0.1314(0.1356) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 1m 21s (remain 0m 31s) Loss: 0.1341(0.1283) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 1m 53s (remain 0m 0s) Loss: 0.0786(0.1255) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.1231(0.1231) 


Epoch 4 - avg_train_loss: 0.1255  avg_val_loss: 0.2029  time: 117s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1255  avg_val_loss: 0.2029  time: 117s
Epoch 4 - Score: 0.7189
INFO:__main__:Epoch 4 - Score: 0.7189


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.3691(0.2029) 
f1 score : 0.4959216965742252
recall score : 1.0
precision score : 0.3297180043383948
thresh : 0.68


Score: 0.3394
INFO:__main__:Score: 0.3394
ACC BEST Score: 0.7229
INFO:__main__:ACC BEST Score: 0.7229
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.4802527646129542
recall score : 1.0
precision score : 0.316008316008316
thresh : 0.65


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable FGM
Epoch: [1][0/279] Elapsed 0m 0s (remain 3m 4s) Loss: 0.3090(0.3090) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 40s (remain 1m 10s) Loss: 0.1382(0.3285) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 1m 20s (remain 0m 31s) Loss: 0.2688(0.2713) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 1m 53s (remain 0m 0s) Loss: 0.2700(0.2530) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.2027(0.2027) 


Epoch 1 - avg_train_loss: 0.2530  avg_val_loss: 0.2223  time: 117s
INFO:__main__:Epoch 1 - avg_train_loss: 0.2530  avg_val_loss: 0.2223  time: 117s
Epoch 1 - Score: 0.7068
INFO:__main__:Epoch 1 - Score: 0.7068
Epoch 1 - Save Best Score: 0.7068 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7068 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.2475(0.2223) 
f1 score : 0.4700460829493087
recall score : 1.0
precision score : 0.3072289156626506
thresh : 0.64
Epoch: [2][0/279] Elapsed 0m 0s (remain 3m 36s) Loss: 0.1913(0.1913) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 40s (remain 1m 12s) Loss: 0.1544(0.2159) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 1m 21s (remain 0m 31s) Loss: 0.2229(0.2101) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 1m 53s (remain 0m 0s) Loss: 0.2117(0.2069) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.1607(0.1607) 


Epoch 2 - avg_train_loss: 0.2069  avg_val_loss: 0.2032  time: 117s
INFO:__main__:Epoch 2 - avg_train_loss: 0.2069  avg_val_loss: 0.2032  time: 117s
Epoch 2 - Score: 0.7329
INFO:__main__:Epoch 2 - Score: 0.7329
Epoch 2 - Save Best Score: 0.7329 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7329 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.2938(0.2032) 
f1 score : 0.4700460829493087
recall score : 1.0
precision score : 0.3072289156626506
thresh : 0.62
Epoch: [3][0/279] Elapsed 0m 0s (remain 3m 14s) Loss: 0.1593(0.1593) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 40s (remain 1m 12s) Loss: 0.1826(0.1887) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 1m 21s (remain 0m 31s) Loss: 0.2010(0.1805) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 1m 53s (remain 0m 0s) Loss: 0.1342(0.1753) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0887(0.0887) 


Epoch 3 - avg_train_loss: 0.1753  avg_val_loss: 0.1891  time: 117s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1753  avg_val_loss: 0.1891  time: 117s
Epoch 3 - Score: 0.7229
INFO:__main__:Epoch 3 - Score: 0.7229


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.4310(0.1891) 
f1 score : 0.48407643312101906
recall score : 0.9934640522875817
precision score : 0.32
thresh : 0.6
Epoch: [4][0/279] Elapsed 0m 0s (remain 2m 52s) Loss: 0.0919(0.0919) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 41s (remain 1m 12s) Loss: 0.0769(0.1173) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 1m 20s (remain 0m 31s) Loss: 0.1073(0.1117) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 1m 53s (remain 0m 0s) Loss: 0.0675(0.1108) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.1415(0.1415) 


Epoch 4 - avg_train_loss: 0.1108  avg_val_loss: 0.2001  time: 117s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1108  avg_val_loss: 0.2001  time: 117s
Epoch 4 - Score: 0.7169
INFO:__main__:Epoch 4 - Score: 0.7169


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.3839(0.2001) 
f1 score : 0.49261083743842365
recall score : 0.9803921568627451
precision score : 0.32894736842105265
thresh : 0.63


Score: 0.3072
INFO:__main__:Score: 0.3072
ACC BEST Score: 0.7329
INFO:__main__:ACC BEST Score: 0.7329
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.4700460829493087
recall score : 1.0
precision score : 0.3072289156626506
thresh : 0.62


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable FGM
Epoch: [1][0/279] Elapsed 0m 0s (remain 3m 21s) Loss: 0.3267(0.3267) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 40s (remain 1m 12s) Loss: 0.1957(0.2680) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 1m 22s (remain 0m 31s) Loss: 0.2108(0.2433) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 1m 53s (remain 0m 0s) Loss: 0.2219(0.2347) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.1847(0.1847) 


Epoch 1 - avg_train_loss: 0.2347  avg_val_loss: 0.2173  time: 117s
INFO:__main__:Epoch 1 - avg_train_loss: 0.2347  avg_val_loss: 0.2173  time: 117s
Epoch 1 - Score: 0.7108
INFO:__main__:Epoch 1 - Score: 0.7108
Epoch 1 - Save Best Score: 0.7108 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7108 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.2938(0.2173) 
f1 score : 0.4700460829493087
recall score : 1.0
precision score : 0.3072289156626506
thresh : 0.63
Epoch: [2][0/279] Elapsed 0m 0s (remain 3m 1s) Loss: 0.2108(0.2108) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 41s (remain 1m 12s) Loss: 0.1364(0.2127) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 1m 20s (remain 0m 31s) Loss: 0.2225(0.2046) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 1m 52s (remain 0m 0s) Loss: 0.2350(0.2049) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.1739(0.1739) 


Epoch 2 - avg_train_loss: 0.2049  avg_val_loss: 0.2095  time: 117s
INFO:__main__:Epoch 2 - avg_train_loss: 0.2049  avg_val_loss: 0.2095  time: 117s
Epoch 2 - Score: 0.7129
INFO:__main__:Epoch 2 - Score: 0.7129
Epoch 2 - Save Best Score: 0.7129 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7129 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.2991(0.2095) 
f1 score : 0.4700460829493087
recall score : 1.0
precision score : 0.3072289156626506
thresh : 0.69
Epoch: [3][0/279] Elapsed 0m 0s (remain 2m 59s) Loss: 0.1646(0.1646) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 41s (remain 1m 12s) Loss: 0.1559(0.1808) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 1m 21s (remain 0m 31s) Loss: 0.1692(0.1754) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 1m 53s (remain 0m 0s) Loss: 0.2142(0.1711) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.1838(0.1838) 


Epoch 3 - avg_train_loss: 0.1711  avg_val_loss: 0.2007  time: 117s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1711  avg_val_loss: 0.2007  time: 117s
Epoch 3 - Score: 0.7108
INFO:__main__:Epoch 3 - Score: 0.7108


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.3083(0.2007) 
f1 score : 0.4722222222222222
recall score : 1.0
precision score : 0.3090909090909091
thresh : 0.64
Epoch: [4][0/279] Elapsed 0m 0s (remain 2m 57s) Loss: 0.1734(0.1734) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 41s (remain 1m 12s) Loss: 0.1417(0.1146) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 1m 21s (remain 0m 31s) Loss: 0.0941(0.1075) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 1m 52s (remain 0m 0s) Loss: 0.1322(0.1060) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.1464(0.1464) 


Epoch 4 - avg_train_loss: 0.1060  avg_val_loss: 0.2011  time: 116s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1060  avg_val_loss: 0.2011  time: 116s
Epoch 4 - Score: 0.7028
INFO:__main__:Epoch 4 - Score: 0.7028


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.4020(0.2011) 
f1 score : 0.4967532467532467
recall score : 1.0
precision score : 0.3304535637149028
thresh : 0.7


Score: 0.3072
INFO:__main__:Score: 0.3072
ACC BEST Score: 0.7129
INFO:__main__:ACC BEST Score: 0.7129
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.4700460829493087
recall score : 1.0
precision score : 0.3072289156626506
thresh : 0.69


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable FGM
Epoch: [1][0/279] Elapsed 0m 0s (remain 3m 12s) Loss: 0.4406(0.4406) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 41s (remain 1m 12s) Loss: 0.1379(0.3604) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 1m 21s (remain 0m 31s) Loss: 0.2969(0.2853) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 1m 53s (remain 0m 0s) Loss: 0.1710(0.2696) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.1023(0.1023) 


Epoch 1 - avg_train_loss: 0.2696  avg_val_loss: 0.2045  time: 117s
INFO:__main__:Epoch 1 - avg_train_loss: 0.2696  avg_val_loss: 0.2045  time: 117s
Epoch 1 - Score: 0.7008
INFO:__main__:Epoch 1 - Score: 0.7008
Epoch 1 - Save Best Score: 0.7008 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7008 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.4093(0.2045) 
f1 score : 0.4700460829493087
recall score : 1.0
precision score : 0.3072289156626506
thresh : 0.62
Epoch: [2][0/279] Elapsed 0m 0s (remain 3m 9s) Loss: 0.1910(0.1910) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 40s (remain 1m 11s) Loss: 0.3227(0.2244) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 1m 21s (remain 0m 31s) Loss: 0.0671(0.2105) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 1m 52s (remain 0m 0s) Loss: 0.1643(0.2068) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.1272(0.1272) 


Epoch 2 - avg_train_loss: 0.2068  avg_val_loss: 0.2041  time: 116s
INFO:__main__:Epoch 2 - avg_train_loss: 0.2068  avg_val_loss: 0.2041  time: 116s
Epoch 2 - Score: 0.7048
INFO:__main__:Epoch 2 - Score: 0.7048
Epoch 2 - Save Best Score: 0.7048 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7048 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.3480(0.2041) 
f1 score : 0.4700460829493087
recall score : 1.0
precision score : 0.3072289156626506
thresh : 0.63
Epoch: [3][0/279] Elapsed 0m 0s (remain 3m 9s) Loss: 0.2250(0.2250) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 41s (remain 1m 12s) Loss: 0.2681(0.1972) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 1m 22s (remain 0m 31s) Loss: 0.2013(0.1862) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 1m 53s (remain 0m 0s) Loss: 0.1656(0.1826) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0735(0.0735) 


Epoch 3 - avg_train_loss: 0.1826  avg_val_loss: 0.1948  time: 117s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1826  avg_val_loss: 0.1948  time: 117s
Epoch 3 - Score: 0.7068
INFO:__main__:Epoch 3 - Score: 0.7068
Epoch 3 - Save Best Score: 0.7068 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7068 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.4324(0.1948) 
f1 score : 0.47874015748031495
recall score : 0.9934640522875817
precision score : 0.3153526970954357
thresh : 0.62
Epoch: [4][0/279] Elapsed 0m 0s (remain 2m 54s) Loss: 0.1606(0.1606) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 40s (remain 1m 11s) Loss: 0.1437(0.1476) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 1m 21s (remain 0m 31s) Loss: 0.0995(0.1510) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 1m 53s (remain 0m 0s) Loss: 0.1967(0.1508) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0853(0.0853) 


Epoch 4 - avg_train_loss: 0.1508  avg_val_loss: 0.1956  time: 117s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1508  avg_val_loss: 0.1956  time: 117s
Epoch 4 - Score: 0.7088
INFO:__main__:Epoch 4 - Score: 0.7088
Epoch 4 - Save Best Score: 0.7088 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7088 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.4051(0.1956) 
f1 score : 0.48154093097913325
recall score : 0.9803921568627451
precision score : 0.3191489361702128
thresh : 0.62


Score: 0.3514
INFO:__main__:Score: 0.3514
ACC BEST Score: 0.7088
INFO:__main__:ACC BEST Score: 0.7088
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.48154093097913325
recall score : 0.9803921568627451
precision score : 0.3191489361702128
thresh : 0.62


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable FGM
Epoch: [1][0/279] Elapsed 0m 0s (remain 3m 29s) Loss: 0.2109(0.2109) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 40s (remain 1m 12s) Loss: 0.2169(0.3063) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 1m 22s (remain 0m 31s) Loss: 0.2380(0.2618) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 1m 53s (remain 0m 0s) Loss: 0.1971(0.2465) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0999(0.0999) 


Epoch 1 - avg_train_loss: 0.2465  avg_val_loss: 0.1949  time: 118s
INFO:__main__:Epoch 1 - avg_train_loss: 0.2465  avg_val_loss: 0.1949  time: 118s
Epoch 1 - Score: 0.7284
INFO:__main__:Epoch 1 - Score: 0.7284
Epoch 1 - Save Best Score: 0.7284 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7284 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.4087(0.1949) 
f1 score : 0.46841294298921415
recall score : 1.0
precision score : 0.3058350100603622
thresh : 0.61
Epoch: [2][0/279] Elapsed 0m 0s (remain 2m 59s) Loss: 0.2181(0.2181) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 41s (remain 1m 13s) Loss: 0.2573(0.2069) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 1m 22s (remain 0m 31s) Loss: 0.1441(0.2023) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 1m 52s (remain 0m 0s) Loss: 0.1972(0.1998) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0833(0.0833) 


Epoch 2 - avg_train_loss: 0.1998  avg_val_loss: 0.1912  time: 117s
INFO:__main__:Epoch 2 - avg_train_loss: 0.1998  avg_val_loss: 0.1912  time: 117s
Epoch 2 - Score: 0.7264
INFO:__main__:Epoch 2 - Score: 0.7264


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.3672(0.1912) 
f1 score : 0.4713178294573644
recall score : 1.0
precision score : 0.30831643002028397
thresh : 0.61
Epoch: [3][0/279] Elapsed 0m 0s (remain 2m 52s) Loss: 0.1459(0.1459) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 41s (remain 1m 12s) Loss: 0.1535(0.1733) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 1m 21s (remain 0m 31s) Loss: 0.1614(0.1664) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 1m 53s (remain 0m 0s) Loss: 0.1512(0.1648) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.1054(0.1054) 


Epoch 3 - avg_train_loss: 0.1648  avg_val_loss: 0.1949  time: 117s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1648  avg_val_loss: 0.1949  time: 117s
Epoch 3 - Score: 0.7243
INFO:__main__:Epoch 3 - Score: 0.7243


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.2838(0.1949) 
f1 score : 0.47874015748031495
recall score : 1.0
precision score : 0.3146997929606625
thresh : 0.65
Epoch: [4][0/279] Elapsed 0m 0s (remain 3m 7s) Loss: 0.1242(0.1242) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 40s (remain 1m 12s) Loss: 0.0590(0.1106) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 1m 21s (remain 0m 31s) Loss: 0.1046(0.1075) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 1m 53s (remain 0m 0s) Loss: 0.1462(0.1074) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.1254(0.1254) 


Epoch 4 - avg_train_loss: 0.1074  avg_val_loss: 0.2133  time: 117s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1074  avg_val_loss: 0.2133  time: 117s
Epoch 4 - Score: 0.7203
INFO:__main__:Epoch 4 - Score: 0.7203


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.2947(0.2133) 
f1 score : 0.4836601307189542
recall score : 0.9736842105263158
precision score : 0.3217391304347826
thresh : 0.66


Score: 0.3058
INFO:__main__:Score: 0.3058
ACC BEST Score: 0.7284
INFO:__main__:ACC BEST Score: 0.7284
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.46841294298921415
recall score : 1.0
precision score : 0.3058350100603622
thresh : 0.61


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable FGM
Epoch: [1][0/279] Elapsed 0m 0s (remain 3m 21s) Loss: 0.7897(0.7897) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 41s (remain 1m 12s) Loss: 0.1589(0.2820) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 1m 21s (remain 0m 31s) Loss: 0.2774(0.2488) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 1m 53s (remain 0m 0s) Loss: 0.2152(0.2392) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.1770(0.1770) 


Epoch 1 - avg_train_loss: 0.2392  avg_val_loss: 0.2198  time: 117s
INFO:__main__:Epoch 1 - avg_train_loss: 0.2392  avg_val_loss: 0.2198  time: 117s
Epoch 1 - Score: 0.6942
INFO:__main__:Epoch 1 - Score: 0.6942
Epoch 1 - Save Best Score: 0.6942 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6942 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.3327(0.2198) 
f1 score : 0.46841294298921415
recall score : 1.0
precision score : 0.3058350100603622
thresh : 0.66
Epoch: [2][0/279] Elapsed 0m 0s (remain 3m 10s) Loss: 0.2102(0.2102) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 41s (remain 1m 12s) Loss: 0.3071(0.2159) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 1m 22s (remain 0m 31s) Loss: 0.1670(0.2065) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 1m 53s (remain 0m 0s) Loss: 0.1994(0.2028) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.1216(0.1216) 


Epoch 2 - avg_train_loss: 0.2028  avg_val_loss: 0.2045  time: 117s
INFO:__main__:Epoch 2 - avg_train_loss: 0.2028  avg_val_loss: 0.2045  time: 117s
Epoch 2 - Score: 0.6962
INFO:__main__:Epoch 2 - Score: 0.6962
Epoch 2 - Save Best Score: 0.6962 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.6962 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.4272(0.2045) 
f1 score : 0.46913580246913583
recall score : 1.0
precision score : 0.3064516129032258
thresh : 0.64
Epoch: [3][0/279] Elapsed 0m 0s (remain 3m 20s) Loss: 0.1518(0.1518) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 41s (remain 1m 12s) Loss: 0.1721(0.1790) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 1m 22s (remain 0m 31s) Loss: 0.1361(0.1747) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 1m 53s (remain 0m 0s) Loss: 0.1688(0.1714) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0829(0.0829) 


Epoch 3 - avg_train_loss: 0.1714  avg_val_loss: 0.2063  time: 117s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1714  avg_val_loss: 0.2063  time: 117s
Epoch 3 - Score: 0.7022
INFO:__main__:Epoch 3 - Score: 0.7022
Epoch 3 - Save Best Score: 0.7022 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7022 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.5459(0.2063) 
f1 score : 0.4779874213836478
recall score : 1.0
precision score : 0.3140495867768595
thresh : 0.62
Epoch: [4][0/279] Elapsed 0m 0s (remain 3m 24s) Loss: 0.1073(0.1073) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 41s (remain 1m 12s) Loss: 0.0891(0.1283) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 1m 22s (remain 0m 31s) Loss: 0.1320(0.1246) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 1m 53s (remain 0m 0s) Loss: 0.1468(0.1214) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.1289(0.1289) 


Epoch 4 - avg_train_loss: 0.1214  avg_val_loss: 0.2129  time: 117s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1214  avg_val_loss: 0.2129  time: 117s
Epoch 4 - Score: 0.6982
INFO:__main__:Epoch 4 - Score: 0.6982


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.4600(0.2129) 
f1 score : 0.4859504132231405
recall score : 0.9671052631578947
precision score : 0.32450331125827814
thresh : 0.68


Score: 0.3320
INFO:__main__:Score: 0.3320
ACC BEST Score: 0.7022
INFO:__main__:ACC BEST Score: 0.7022
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.4779874213836478
recall score : 1.0
precision score : 0.3140495867768595
thresh : 0.62


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable FGM
Epoch: [1][0/279] Elapsed 0m 0s (remain 3m 23s) Loss: 0.4585(0.4585) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 41s (remain 1m 13s) Loss: 0.2138(0.3421) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 1m 21s (remain 0m 31s) Loss: 0.3174(0.2829) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 1m 53s (remain 0m 0s) Loss: 0.1476(0.2613) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0988(0.0988) 


Epoch 1 - avg_train_loss: 0.2613  avg_val_loss: 0.2036  time: 117s
INFO:__main__:Epoch 1 - avg_train_loss: 0.2613  avg_val_loss: 0.2036  time: 117s
Epoch 1 - Score: 0.7022
INFO:__main__:Epoch 1 - Score: 0.7022
Epoch 1 - Save Best Score: 0.7022 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7022 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.4312(0.2036) 
f1 score : 0.46841294298921415
recall score : 1.0
precision score : 0.3058350100603622
thresh : 0.61
Epoch: [2][0/279] Elapsed 0m 0s (remain 3m 12s) Loss: 0.1717(0.1717) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 41s (remain 1m 12s) Loss: 0.2095(0.2157) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 1m 21s (remain 0m 31s) Loss: 0.1903(0.2089) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 1m 52s (remain 0m 0s) Loss: 0.1613(0.2051) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0910(0.0910) 


Epoch 2 - avg_train_loss: 0.2051  avg_val_loss: 0.1957  time: 117s
INFO:__main__:Epoch 2 - avg_train_loss: 0.2051  avg_val_loss: 0.1957  time: 117s
Epoch 2 - Score: 0.7123
INFO:__main__:Epoch 2 - Score: 0.7123
Epoch 2 - Save Best Score: 0.7123 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7123 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.3986(0.1957) 
f1 score : 0.46913580246913583
recall score : 1.0
precision score : 0.3064516129032258
thresh : 0.6
Epoch: [3][0/279] Elapsed 0m 0s (remain 2m 58s) Loss: 0.2258(0.2258) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 40s (remain 1m 12s) Loss: 0.2371(0.1860) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 1m 21s (remain 0m 31s) Loss: 0.3032(0.1871) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 1m 52s (remain 0m 0s) Loss: 0.1538(0.1846) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0664(0.0664) 


Epoch 3 - avg_train_loss: 0.1846  avg_val_loss: 0.1957  time: 116s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1846  avg_val_loss: 0.1957  time: 116s
Epoch 3 - Score: 0.7062
INFO:__main__:Epoch 3 - Score: 0.7062


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.4713(0.1957) 
f1 score : 0.4713178294573644
recall score : 1.0
precision score : 0.30831643002028397
thresh : 0.6
Epoch: [4][0/279] Elapsed 0m 0s (remain 3m 21s) Loss: 0.1604(0.1604) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 41s (remain 1m 12s) Loss: 0.1580(0.1494) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 1m 21s (remain 0m 31s) Loss: 0.1705(0.1465) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 1m 53s (remain 0m 0s) Loss: 0.1029(0.1466) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.1040(0.1040) 


Epoch 4 - avg_train_loss: 0.1466  avg_val_loss: 0.1974  time: 117s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1466  avg_val_loss: 0.1974  time: 117s
Epoch 4 - Score: 0.7143
INFO:__main__:Epoch 4 - Score: 0.7143
Epoch 4 - Save Best Score: 0.7143 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7143 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.4157(0.1974) 
f1 score : 0.47770700636942676
recall score : 0.9868421052631579
precision score : 0.31512605042016806
thresh : 0.67


Score: 0.3400
INFO:__main__:Score: 0.3400
ACC BEST Score: 0.7143
INFO:__main__:ACC BEST Score: 0.7143
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.47770700636942676
recall score : 0.9868421052631579
precision score : 0.31512605042016806
thresh : 0.67


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable FGM
Epoch: [1][0/279] Elapsed 0m 0s (remain 3m 17s) Loss: 0.8525(0.8525) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 41s (remain 1m 13s) Loss: 0.3202(0.3156) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 1m 22s (remain 0m 31s) Loss: 0.2386(0.2631) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 1m 53s (remain 0m 0s) Loss: 0.2852(0.2470) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.3503(0.3503) 


Epoch 1 - avg_train_loss: 0.2470  avg_val_loss: 0.2848  time: 117s
INFO:__main__:Epoch 1 - avg_train_loss: 0.2470  avg_val_loss: 0.2848  time: 117s
Epoch 1 - Score: 0.7143
INFO:__main__:Epoch 1 - Score: 0.7143
Epoch 1 - Save Best Score: 0.7143 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7143 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.1260(0.2848) 
f1 score : 0.46841294298921415
recall score : 1.0
precision score : 0.3058350100603622
thresh : 0.67
Epoch: [2][0/279] Elapsed 0m 0s (remain 3m 8s) Loss: 0.3101(0.3101) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 41s (remain 1m 13s) Loss: 0.2241(0.2206) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 1m 22s (remain 0m 31s) Loss: 0.3783(0.2096) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 1m 53s (remain 0m 0s) Loss: 0.2084(0.2077) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.1657(0.1657) 


Epoch 2 - avg_train_loss: 0.2077  avg_val_loss: 0.2075  time: 117s
INFO:__main__:Epoch 2 - avg_train_loss: 0.2077  avg_val_loss: 0.2075  time: 117s
Epoch 2 - Score: 0.7243
INFO:__main__:Epoch 2 - Score: 0.7243
Epoch 2 - Save Best Score: 0.7243 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7243 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.2960(0.2075) 
f1 score : 0.46841294298921415
recall score : 1.0
precision score : 0.3058350100603622
thresh : 0.63
Epoch: [3][0/279] Elapsed 0m 0s (remain 3m 27s) Loss: 0.2349(0.2349) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 40s (remain 1m 11s) Loss: 0.1302(0.1946) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 1m 21s (remain 0m 31s) Loss: 0.1398(0.1811) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 1m 53s (remain 0m 0s) Loss: 0.1415(0.1778) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0780(0.0780) 


Epoch 3 - avg_train_loss: 0.1778  avg_val_loss: 0.1961  time: 117s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1778  avg_val_loss: 0.1961  time: 117s
Epoch 3 - Score: 0.7203
INFO:__main__:Epoch 3 - Score: 0.7203


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.4593(0.1961) 
f1 score : 0.47756410256410253
recall score : 0.9802631578947368
precision score : 0.3156779661016949
thresh : 0.63
Epoch: [4][0/279] Elapsed 0m 0s (remain 3m 13s) Loss: 0.1811(0.1811) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 41s (remain 1m 12s) Loss: 0.0800(0.1390) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 1m 21s (remain 0m 31s) Loss: 0.0881(0.1273) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 1m 54s (remain 0m 0s) Loss: 0.0594(0.1247) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.1305(0.1305) 


Epoch 4 - avg_train_loss: 0.1247  avg_val_loss: 0.2029  time: 118s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1247  avg_val_loss: 0.2029  time: 118s
Epoch 4 - Score: 0.7123
INFO:__main__:Epoch 4 - Score: 0.7123


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.3637(0.2029) 
f1 score : 0.48604269293924457
recall score : 0.9736842105263158
precision score : 0.3238512035010941
thresh : 0.63


Score: 0.3058
INFO:__main__:Score: 0.3058
ACC BEST Score: 0.7243
INFO:__main__:ACC BEST Score: 0.7243
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.46841294298921415
recall score : 1.0
precision score : 0.3058350100603622
thresh : 0.63


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable FGM
Epoch: [1][0/279] Elapsed 0m 0s (remain 3m 32s) Loss: 0.1295(0.1295) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 41s (remain 1m 13s) Loss: 0.2389(0.3173) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 1m 22s (remain 0m 31s) Loss: 0.3154(0.2699) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 1m 53s (remain 0m 0s) Loss: 0.1681(0.2520) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0736(0.0736) 


Epoch 1 - avg_train_loss: 0.2520  avg_val_loss: 0.1996  time: 117s
INFO:__main__:Epoch 1 - avg_train_loss: 0.2520  avg_val_loss: 0.1996  time: 117s
Epoch 1 - Score: 0.7062
INFO:__main__:Epoch 1 - Score: 0.7062
Epoch 1 - Save Best Score: 0.7062 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7062 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.4485(0.1996) 
f1 score : 0.46841294298921415
recall score : 1.0
precision score : 0.3058350100603622
thresh : 0.58
Epoch: [2][0/279] Elapsed 0m 0s (remain 3m 2s) Loss: 0.1278(0.1278) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 42s (remain 1m 14s) Loss: 0.1985(0.2085) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 1m 22s (remain 0m 32s) Loss: 0.1995(0.2069) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 1m 54s (remain 0m 0s) Loss: 0.2706(0.2061) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.1128(0.1128) 


Epoch 2 - avg_train_loss: 0.2061  avg_val_loss: 0.1953  time: 118s
INFO:__main__:Epoch 2 - avg_train_loss: 0.2061  avg_val_loss: 0.1953  time: 118s
Epoch 2 - Score: 0.7103
INFO:__main__:Epoch 2 - Score: 0.7103
Epoch 2 - Save Best Score: 0.7103 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7103 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.3569(0.1953) 
f1 score : 0.46841294298921415
recall score : 1.0
precision score : 0.3058350100603622
thresh : 0.6
Epoch: [3][0/279] Elapsed 0m 0s (remain 3m 24s) Loss: 0.2115(0.2115) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 41s (remain 1m 12s) Loss: 0.2030(0.1814) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 1m 21s (remain 0m 31s) Loss: 0.1992(0.1768) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 1m 53s (remain 0m 0s) Loss: 0.1685(0.1725) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.1230(0.1230) 


Epoch 3 - avg_train_loss: 0.1725  avg_val_loss: 0.1856  time: 117s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1725  avg_val_loss: 0.1856  time: 117s
Epoch 3 - Score: 0.7203
INFO:__main__:Epoch 3 - Score: 0.7203
Epoch 3 - Save Best Score: 0.7203 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7203 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.3079(0.1856) 
f1 score : 0.4720496894409938
recall score : 1.0
precision score : 0.3089430894308943
thresh : 0.63
Epoch: [4][0/279] Elapsed 0m 0s (remain 3m 1s) Loss: 0.1651(0.1651) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 41s (remain 1m 12s) Loss: 0.2270(0.1304) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 1m 22s (remain 0m 32s) Loss: 0.1967(0.1249) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 1m 53s (remain 0m 0s) Loss: 0.1478(0.1209) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.1186(0.1186) 


Epoch 4 - avg_train_loss: 0.1209  avg_val_loss: 0.1873  time: 117s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1209  avg_val_loss: 0.1873  time: 117s
Epoch 4 - Score: 0.7183
INFO:__main__:Epoch 4 - Score: 0.7183


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.3072(0.1873) 
f1 score : 0.48852459016393446
recall score : 0.9802631578947368
precision score : 0.32532751091703055
thresh : 0.63


Score: 0.3159
INFO:__main__:Score: 0.3159
ACC BEST Score: 0.7203
INFO:__main__:ACC BEST Score: 0.7203
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.4720496894409938
recall score : 1.0
precision score : 0.3089430894308943
thresh : 0.63


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable FGM
Epoch: [1][0/279] Elapsed 0m 0s (remain 3m 14s) Loss: 0.3434(0.3434) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 40s (remain 1m 11s) Loss: 0.1582(0.3845) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 1m 22s (remain 0m 32s) Loss: 0.2244(0.3012) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 1m 53s (remain 0m 0s) Loss: 0.2102(0.2767) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.1433(0.1433) 


Epoch 1 - avg_train_loss: 0.2767  avg_val_loss: 0.2084  time: 117s
INFO:__main__:Epoch 1 - avg_train_loss: 0.2767  avg_val_loss: 0.2084  time: 117s
Epoch 1 - Score: 0.7042
INFO:__main__:Epoch 1 - Score: 0.7042
Epoch 1 - Save Best Score: 0.7042 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7042 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.3195(0.2084) 
f1 score : 0.46841294298921415
recall score : 1.0
precision score : 0.3058350100603622
thresh : 0.62
Epoch: [2][0/279] Elapsed 0m 0s (remain 3m 26s) Loss: 0.1794(0.1794) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 40s (remain 1m 11s) Loss: 0.3161(0.2145) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 1m 21s (remain 0m 31s) Loss: 0.1511(0.2087) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 1m 53s (remain 0m 0s) Loss: 0.2037(0.2039) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.1379(0.1379) 


Epoch 2 - avg_train_loss: 0.2039  avg_val_loss: 0.2052  time: 117s
INFO:__main__:Epoch 2 - avg_train_loss: 0.2039  avg_val_loss: 0.2052  time: 117s
Epoch 2 - Score: 0.7042
INFO:__main__:Epoch 2 - Score: 0.7042


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.3105(0.2052) 
f1 score : 0.46841294298921415
recall score : 1.0
precision score : 0.3058350100603622
thresh : 0.65
Epoch: [3][0/279] Elapsed 0m 0s (remain 2m 54s) Loss: 0.1417(0.1417) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 41s (remain 1m 13s) Loss: 0.1952(0.1958) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 1m 22s (remain 0m 31s) Loss: 0.1555(0.1862) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 1m 54s (remain 0m 0s) Loss: 0.1160(0.1809) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0787(0.0787) 


Epoch 3 - avg_train_loss: 0.1809  avg_val_loss: 0.1970  time: 118s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1809  avg_val_loss: 0.1970  time: 118s
Epoch 3 - Score: 0.7143
INFO:__main__:Epoch 3 - Score: 0.7143
Epoch 3 - Save Best Score: 0.7143 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7143 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.3834(0.1970) 
f1 score : 0.47634069400630913
recall score : 0.993421052631579
precision score : 0.3132780082987552
thresh : 0.63
Epoch: [4][0/279] Elapsed 0m 0s (remain 3m 35s) Loss: 0.1334(0.1334) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 40s (remain 1m 11s) Loss: 0.1003(0.1515) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 1m 20s (remain 0m 31s) Loss: 0.2070(0.1464) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 1m 52s (remain 0m 0s) Loss: 0.1600(0.1435) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0764(0.0764) 


Epoch 4 - avg_train_loss: 0.1435  avg_val_loss: 0.2032  time: 117s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1435  avg_val_loss: 0.2032  time: 117s
Epoch 4 - Score: 0.7143
INFO:__main__:Epoch 4 - Score: 0.7143


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.3926(0.2032) 
f1 score : 0.4869281045751634
recall score : 0.9802631578947368
precision score : 0.3239130434782609
thresh : 0.65




In [None]:
from google.colab import runtime
runtime.unassign()