In [1]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers
!pip install datasets
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!nvidia-smi

Sat Apr 29 20:56:05 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P0    46W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:

import os
import gc
import math
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import re
import html

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam, SGD, AdamW, RAdam
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,GroupKFold
from sklearn.metrics import log_loss,f1_score, recall_score, accuracy_score, precision_score

from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
DIR = "/content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測"
INPUT_DIR = os.path.join(DIR,"input")
OUTPUT_DIR = os.path.join(DIR,"output")
CUSTOM_MODEL_DIR = os.path.join(OUTPUT_DIR,'clrp_deberta_v3_base')
OUTPUT_EXP_DIR = DIR + '/output/EXP034/'
if not os.path.exists(OUTPUT_EXP_DIR):
    os.makedirs(OUTPUT_EXP_DIR)

In [6]:

# ====================================================
# CFG
# ====================================================
class CFG:
    debug=False
    apex=True
    print_freq=100
    num_workers=4
    model_name="microsoft/deberta-v3-base"
    # model='microsoft/deberta-base'
    # model='roberta-base'
    # model='roberta-large'
    # model='roberta-large-mnli'
    # model='xlnet-large-cased'
    # model='albert-xxlarge-v2'
    # model="microsoft/deberta-large"
    # model="microsoft/deberta-v3-large"
    # model='microsoft/deberta-v2-xlarge'
    # model='funnel-transformer/large'
    # model='funnel-transformer/medium'
    # model='albert-base-v2'
    # model='albert-large-v2'
    # model='google/electra-large-discriminator'
    # model='google/electra-base-discriminator'
    # model="facebook/bart-large-mnli"
    # model="facebook/bart-large"
    # model="facebook/bart-base"
    model = CUSTOM_MODEL_DIR
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=4
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=16
    fc_dropout=0.2
    target_size=1
    max_len=256
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    seed=42
    n_fold=10
    trn_fold=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    train=True
    nth_awp_start_epoch=1
    gradient_checkpointing = False
    freezing = False
    num_reinit_layers = 1
    is_reinit_layer = False
    fgm = False
    awp_start=1

if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0, 1]

In [7]:
def get_score(labels, outputs):
    thresh = 0.5
    y_pred = outputs
    y_true = labels
    f_score = f1_score(y_true, (y_pred>thresh).astype(int))
    r_score = recall_score(y_true, (y_pred>thresh).astype(int))
    p_score = precision_score(y_true, (y_pred>thresh).astype(int))
    print(f"f1 score : {f_score}")
    print(f"recall score : {r_score}")
    print(f"precision score : {p_score}")
    return accuracy_score(y_true, (y_pred>thresh).astype(int))

def get_acc_score(labels, outputs):
    y_pred = outputs
    y_true = labels
    best_score = 0
    best_thresh = 0.5
    for thresh in np.arange(0.1, 0.80, 0.01):
        thresh = np.round(thresh, 2)
        score = accuracy_score(y_true, (y_pred>thresh).astype(int))
        #print("Accuracy score at threshold {0} is {1}".format(thresh, score))
        if score > best_score:
          best_score = score
          best_thresh = thresh
    print(f"thresh : {best_thresh}")
    return accuracy_score(y_true, (y_pred>best_thresh).astype(int))


def get_logger(filename=OUTPUT_EXP_DIR+'train'):
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=CFG.seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=CFG.seed)

In [8]:
def freeze(module):
    """
    Freezes module's parameters.
    """
    
    for parameter in module.parameters():
        parameter.requires_grad = False
        
def get_freezed_parameters(module):
    """
    Returns names of freezed parameters of the given module.
    """
    
    freezed_parameters = []
    for name, parameter in module.named_parameters():
        if not parameter.requires_grad:
            freezed_parameters.append(name)
            
    return freezed_parameters

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """
    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930
    """
    
    embedding_types = ("word", "position", "token_type")
    for embedding_type in embedding_types:
        attr_name = f"{embedding_type}_embeddings"
        
        if hasattr(embeddings_path, attr_name): 
            bnb.optim.GlobalOptimManager.get_instance().register_module_override(
                getattr(embeddings_path, attr_name), 'weight', {'optim_bits': optim_bits}
            )

In [9]:
import pandas as pd
import numpy as np

train = pd.read_csv(os.path.join(INPUT_DIR,"train_data.csv"))
test = pd.read_csv(os.path.join(INPUT_DIR,"test_data.csv"))
sample_sub = pd.read_csv(os.path.join(INPUT_DIR,"submission.csv"))

print(train.shape)
display(train.head(3))

print(test.shape)
display(test.head(3))

print(sample_sub.shape)
display(sample_sub.head(3))

(4974, 6)


Unnamed: 0,id,title,year,abstract,keywords,y
0,1,Hierarchical Adversarially Learned Inference,2018,We propose a novel hierarchical generative mod...,"generative, hierarchical, unsupervised, semisu...",0
1,2,Learning to Compute Word Embeddings On the Fly,2018,Words in natural language follow a Zipfian dis...,"NLU, word embeddings, representation learning",0
2,3,Graph2Seq: Scalable Learning Dynamics for Graphs,2018,Neural networks are increasingly used as a gen...,,0


(6393, 5)


Unnamed: 0,id,title,year,abstract,keywords
0,1,StyleAlign: Analysis and Applications of Align...,2022,"In this paper, we perform an in-depth study of...","StyleGAN, transfer learning, fine tuning, mode..."
1,2,Embedding a random graph via GNN: mean-field i...,2021,We develop a theory for embedding a random gra...,"Graph neural network, graph embedding, multi-r..."
2,3,BBRefinement: an universal scheme to improve p...,2021,We present a conceptually simple yet powerful ...,"object detection, deep neural networks, refine..."


(6393, 2)


Unnamed: 0,id,y
0,1,0
1,2,0
2,3,0


In [10]:
train["texts"] = train["title"] + "[SEP]" + train["abstract"] 

In [11]:
skf = StratifiedKFold(n_splits=CFG.n_fold,shuffle=True,random_state=CFG.seed)
for fold, ( _, val_) in enumerate(skf.split(train, train.y)):
    train.loc[val_ , "kfold"] = int(fold)
    
train["kfold"] = train["kfold"].astype(int)

if CFG.debug:
    display(train.groupby('kfold').size())
    train = train.sample(n=500, random_state=0).reset_index(drop=True)
    display(train.groupby('kfold').size())

In [12]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_EXP_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

In [13]:
# ====================================================
# Define max_len
# ====================================================
lengths = []
tk0 = tqdm(train['texts'].fillna("").values, total=len(train))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
    lengths.append(length)
CFG.max_len = max(lengths) + 3 # cls + sep + sep
LOGGER.info(f"max_len: {CFG.max_len}")

100%|██████████| 4974/4974 [00:03<00:00, 1307.09it/s]
max_len: 522
INFO:__main__:max_len: 522


In [14]:
class AWP:
    def __init__(self, model, optimizer, *, adv_param='weight',
                 adv_lr=0.001, adv_eps=0.001):
        self.model = model
        self.optimizer = optimizer
        self.adv_param = adv_param
        self.adv_lr = adv_lr
        self.adv_eps = adv_eps
        self.backup = {}

    def perturb(self, inputs, y, criterion):
        """
        Perturb model parameters for AWP gradient
        Call before loss and loss.backward()
        """
        self._save()  # save model parameters
        self._attack_step()  # perturb weights

    def _attack_step(self):
        e = 1e-6
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                grad = self.optimizer.state[param]['exp_avg']
                norm_grad = torch.norm(grad)
                norm_data = torch.norm(param.detach())

                if norm_grad != 0 and not torch.isnan(norm_grad):
                    # Set lower and upper limit in change
                    limit_eps = self.adv_eps * param.detach().abs()
                    param_min = param.data - limit_eps
                    param_max = param.data + limit_eps

                    # Perturb along gradient
                    # w += (adv_lr * |w| / |grad|) * grad
                    param.data.add_(grad, alpha=(self.adv_lr * (norm_data + e) / (norm_grad + e)))

                    # Apply the limit to the change
                    param.data.clamp_(param_min, param_max)

    def _save(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                if name not in self.backup:
                    self.backup[name] = param.clone().detach()
                else:
                    self.backup[name].copy_(param.data)

    def restore(self):
        """
        Restore model parameter to correct position; AWP do not perturbe weights, it perturb gradients
        Call after loss.backward(), before optimizer.step()
        """
        for name, param in self.model.named_parameters():
            if name in self.backup:
                param.data.copy_(self.backup[name])

In [15]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer(text,
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           padding="max_length",
                           return_offsets_mapping=False,
                           truncation=True)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.inputs = df['texts'].values
        self.labels = df['y'].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.inputs[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

#collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [16]:
def reinit_layers(model):

    #for layer in model.model.encoder.layer[-CFG.num_reinit_layers:]:
    for layer in model.encoder.layer[-CFG.num_reinit_layers:]:    #Custome model内(backbone)

            for module in layer.modules():

                if isinstance(module,nn.Linear):
                    module.weight.data.normal_(mean=0.0,std=model.config.initializer_range)
                    if module.bias is not None:
                            module.bias.data.zero_()
                elif isinstance(module, nn.Embedding):
                        module.weight.data.normal_(mean=0.0, std=model.config.initializer_range)
                        if module.padding_idx is not None:
                            module.weight.data[module.padding_idx].zero_()
                elif isinstance(module, nn.LayerNorm):
                        module.bias.data.zero_()
                        module.weight.data.fill_(1.0)
                        
    return model

In [17]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

class MaxPooling(nn.Module):
    def __init__(self):
        super(MaxPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        embeddings = last_hidden_state.clone()
        embeddings[input_mask_expanded == 0] = -1e4
        max_embeddings, _ = torch.max(embeddings, dim=1)
        return max_embeddings
    

class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        if CFG.is_reinit_layer:
            self.model = reinit_layers(self.model)
            print(f'Reinitializing Last {CFG.num_reinit_layers} Layers.')
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # Freezing
        if cfg.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            cfg.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())

        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)
        self._init_weights(self.fc)
        self.layer_norm1 = nn.LayerNorm(self.config.hidden_size)
        self._init_weights(self.fc)
        self.sig = nn.Sigmoid()
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        feature = self.layer_norm1(feature)
        output = self.fc(feature)
        #output = self.sig(output)
        return output

In [18]:
def calculate_loss(inputs, labels, model, criterion, is_valid=True, device="cpu"):    
    y_preds = model(inputs)
    loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
    return (loss, y_preds) if is_valid else loss

In [19]:

# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device, awp):
    model.zero_grad()
    model.train()
    awp_start = CFG.awp_start
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        if epoch >= awp_start:
            awp.perturb(inputs, labels, criterion)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            loss = calculate_loss(inputs=inputs, labels=labels, model=model, criterion=criterion, is_valid=False, device=device)
        #print(y_preds.sigmoid().squeeze().view(1, -1))
        #loss = criterion(y_preds.sigmoid().squeeze(), labels.squeeze())
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        if scaler is not None:
            scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        awp.restore()
        if CFG.fgm:
          fgm.attack() 
          adversarial_loss = calculate_loss(inputs=inputs, labels=labels, model=model, criterion=criterion, is_valid=False, device=device)
          scaler.scale(adversarial_loss).backward()
          fgm.restore()
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  #'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          #grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))

    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            loss, y_preds = calculate_loss(inputs=inputs, labels=labels, model=model, criterion=criterion, is_valid=True, device=device)
        #loss = criterion(y_preds.sigmoid().squeeze(), labels.squeeze())
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    predictions = np.concatenate(predictions)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

In [20]:

# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['kfold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['kfold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds['y'].values
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)


    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size*2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_EXP_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr=5e-6, decoder_lr=1e-4, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", 
                    "LayerNorm.weight"]
        group1=['layer.0.','layer.1.','layer.2.','layer.3.']
        group2=['layer.4.','layer.5.','layer.6.','layer.7.']    
        group3=['layer.8.','layer.9.','layer.10.','layer.11.']
        group_all=['layer.0.','layer.1.','layer.2.','layer.3.','layer.4.','layer.5.','layer.6.','layer.7.','layer.8.','layer.9.','layer.10.','layer.11.']
        optimizer_parameters = [
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],'weight_decay': weight_decay},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],'weight_decay': weight_decay, 'lr': encoder_lr/2.6},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],'weight_decay': weight_decay, 'lr': encoder_lr},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],'weight_decay': weight_decay, 'lr': encoder_lr*2.6},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],'weight_decay': 0.0},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],'weight_decay': 0.0, 'lr': encoder_lr/2.6},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],'weight_decay': 0.0, 'lr': encoder_lr},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],'weight_decay': 0.0, 'lr': encoder_lr*2.6},
        {'params': [p for n, p in model.named_parameters() if "model" not in n], 'lr':decoder_lr, "momentum" : 0.99},
    ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.BCEWithLogitsLoss()

    print('Enable AWP')
    awp = AWP(model, optimizer, adv_lr=0.001, adv_eps=0.001)
    #print('Enable FGM')
    #fgm = FGM(model=model, eps=0.1)
    
    best_score = -1.

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device, awp)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        
        # scoring
        score_05 = get_score(valid_labels, predictions)
        score = get_acc_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')

        
        if best_score < score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_EXP_DIR+f"{CFG.model_name.replace('/', '-')}_fold{fold}_best.pth")

    predictions = torch.load(OUTPUT_EXP_DIR+f"{CFG.model_name.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds['pred'] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [21]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        labels = oof_df['y'].values
        preds = oof_df['pred'].values
        score = get_score(labels, preds)
        acc_score = get_acc_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}')
        LOGGER.info(f'ACC BEST Score: {acc_score:<.4f}')
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
            #break
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_EXP_DIR+'oof_df.pkl')

DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "torch_dtype": "f

Enable AWP
Epoch: [1][0/279] Elapsed 0m 1s (remain 7m 23s) Loss: 0.4995(0.4995) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 13s (remain 0m 23s) Loss: 0.5988(0.6362) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 24s (remain 0m 9s) Loss: 0.8319(0.6216) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 33s (remain 0m 0s) Loss: 0.6001(0.6164) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4611(0.4611) 


Epoch 1 - avg_train_loss: 0.6164  avg_val_loss: 0.6096  time: 37s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6164  avg_val_loss: 0.6096  time: 37s
Epoch 1 - Score: 0.7008
INFO:__main__:Epoch 1 - Score: 0.7008
Epoch 1 - Save Best Score: 0.7008 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7008 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.9392(0.6096) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
thresh : 0.41
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 47s) Loss: 0.5455(0.5455) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.5062(0.5815) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.4609(0.5848) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.6279(0.5766) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2873(0.2873) 


Epoch 2 - avg_train_loss: 0.5766  avg_val_loss: 0.5626  time: 37s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5766  avg_val_loss: 0.5626  time: 37s
Epoch 2 - Score: 0.7149
INFO:__main__:Epoch 2 - Score: 0.7149
Epoch 2 - Save Best Score: 0.7149 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7149 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.0796(0.5626) 
f1 score : 0.16279069767441862
recall score : 0.09210526315789473
precision score : 0.7
thresh : 0.52
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 41s) Loss: 0.4272(0.4272) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.5506(0.5232) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.5645(0.5155) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.6121(0.5077) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2837(0.2837) 


Epoch 3 - avg_train_loss: 0.5077  avg_val_loss: 0.5987  time: 36s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5077  avg_val_loss: 0.5987  time: 36s
Epoch 3 - Score: 0.7249
INFO:__main__:Epoch 3 - Score: 0.7249
Epoch 3 - Save Best Score: 0.7249 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7249 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.2320(0.5987) 
f1 score : 0.3004291845493562
recall score : 0.23026315789473684
precision score : 0.43209876543209874
thresh : 0.71
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 44s) Loss: 0.3022(0.3022) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.3962(0.3973) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.3660(0.3841) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.2727(0.3795) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3984(0.3984) 


Epoch 4 - avg_train_loss: 0.3795  avg_val_loss: 0.6512  time: 36s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3795  avg_val_loss: 0.6512  time: 36s
Epoch 4 - Score: 0.7048
INFO:__main__:Epoch 4 - Score: 0.7048


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.0642(0.6512) 
f1 score : 0.4098939929328622
recall score : 0.3815789473684211
precision score : 0.44274809160305345
thresh : 0.76


Score: 0.6727
INFO:__main__:Score: 0.6727
ACC BEST Score: 0.7249
INFO:__main__:ACC BEST Score: 0.7249
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.3004291845493562
recall score : 0.23026315789473684
precision score : 0.43209876543209874
thresh : 0.71


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 50s) Loss: 0.6563(0.6563) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.4671(0.6223) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 23s (remain 0m 8s) Loss: 0.7913(0.6173) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.7783(0.6127) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.4781(0.4781) 


Epoch 1 - avg_train_loss: 0.6127  avg_val_loss: 0.6026  time: 36s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6127  avg_val_loss: 0.6026  time: 36s
Epoch 1 - Score: 0.7108
INFO:__main__:Epoch 1 - Score: 0.7108
Epoch 1 - Save Best Score: 0.7108 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7108 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8926(0.6026) 
f1 score : 0.13953488372093023
recall score : 0.0784313725490196
precision score : 0.631578947368421
thresh : 0.53
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 50s) Loss: 0.4942(0.4942) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 11s (remain 0m 21s) Loss: 0.4406(0.5765) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.5588(0.5715) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.5885(0.5695) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.4516(0.4516) 


Epoch 2 - avg_train_loss: 0.5695  avg_val_loss: 0.5746  time: 37s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5695  avg_val_loss: 0.5746  time: 37s
Epoch 2 - Score: 0.7390
INFO:__main__:Epoch 2 - Score: 0.7390
Epoch 2 - Save Best Score: 0.7390 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7390 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8743(0.5746) 
f1 score : 0.46037735849056605
recall score : 0.39869281045751637
precision score : 0.5446428571428571
thresh : 0.53
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 56s) Loss: 0.4738(0.4738) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 11s (remain 0m 21s) Loss: 0.4805(0.5161) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.6429(0.4989) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.3850(0.4850) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3067(0.3067) 


Epoch 3 - avg_train_loss: 0.4850  avg_val_loss: 0.5797  time: 37s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4850  avg_val_loss: 0.5797  time: 37s
Epoch 3 - Score: 0.7269
INFO:__main__:Epoch 3 - Score: 0.7269


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.4014(0.5797) 
f1 score : 0.4067796610169491
recall score : 0.3137254901960784
precision score : 0.5783132530120482
thresh : 0.61
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 44s) Loss: 0.2809(0.2809) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.2285(0.3051) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.3499(0.2921) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.1535(0.2951) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5082(0.5082) 


Epoch 4 - avg_train_loss: 0.2951  avg_val_loss: 0.6928  time: 36s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2951  avg_val_loss: 0.6928  time: 36s
Epoch 4 - Score: 0.7289
INFO:__main__:Epoch 4 - Score: 0.7289


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.4722(0.6928) 
f1 score : 0.4778156996587031
recall score : 0.45751633986928103
precision score : 0.5
thresh : 0.72


Score: 0.7129
INFO:__main__:Score: 0.7129
ACC BEST Score: 0.7390
INFO:__main__:ACC BEST Score: 0.7390
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.46037735849056605
recall score : 0.39869281045751637
precision score : 0.5446428571428571
thresh : 0.53


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 2m 1s) Loss: 0.8851(0.8851) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.5796(0.6327) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.6310(0.6268) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.5572(0.6210) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.5024(0.5024) 


Epoch 1 - avg_train_loss: 0.6210  avg_val_loss: 0.6145  time: 36s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6210  avg_val_loss: 0.6145  time: 36s
Epoch 1 - Score: 0.7108
INFO:__main__:Epoch 1 - Score: 0.7108
Epoch 1 - Save Best Score: 0.7108 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7108 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8088(0.6145) 
f1 score : 0.23529411764705882
recall score : 0.1568627450980392
precision score : 0.47058823529411764
thresh : 0.76
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 46s) Loss: 0.6134(0.6134) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.3932(0.5981) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.6004(0.5862) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.6681(0.5858) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.4674(0.4674) 


Epoch 2 - avg_train_loss: 0.5858  avg_val_loss: 0.5880  time: 37s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5858  avg_val_loss: 0.5880  time: 37s
Epoch 2 - Score: 0.7129
INFO:__main__:Epoch 2 - Score: 0.7129
Epoch 2 - Save Best Score: 0.7129 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7129 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.9008(0.5880) 
f1 score : 0.38247011952191234
recall score : 0.3137254901960784
precision score : 0.4897959183673469
thresh : 0.58
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 43s) Loss: 0.4870(0.4870) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.5066(0.5458) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.5257(0.5294) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 33s (remain 0m 0s) Loss: 0.5723(0.5188) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5903(0.5903) 


Epoch 3 - avg_train_loss: 0.5188  avg_val_loss: 0.6141  time: 37s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5188  avg_val_loss: 0.6141  time: 37s
Epoch 3 - Score: 0.7169
INFO:__main__:Epoch 3 - Score: 0.7169
Epoch 3 - Save Best Score: 0.7169 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7169 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8401(0.6141) 
f1 score : 0.5303867403314917
recall score : 0.6274509803921569
precision score : 0.45933014354066987
thresh : 0.69
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 44s) Loss: 0.6172(0.6172) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.7651(0.4322) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.3776(0.4088) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.4703(0.4073) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.4098(0.4098) 


Epoch 4 - avg_train_loss: 0.4073  avg_val_loss: 0.6168  time: 36s
INFO:__main__:Epoch 4 - avg_train_loss: 0.4073  avg_val_loss: 0.6168  time: 36s
Epoch 4 - Score: 0.7229
INFO:__main__:Epoch 4 - Score: 0.7229
Epoch 4 - Save Best Score: 0.7229 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7229 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.4131(0.6168) 
f1 score : 0.4468864468864469
recall score : 0.39869281045751637
precision score : 0.5083333333333333
thresh : 0.71


Score: 0.6968
INFO:__main__:Score: 0.6968
ACC BEST Score: 0.7229
INFO:__main__:ACC BEST Score: 0.7229
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.4468864468864469
recall score : 0.39869281045751637
precision score : 0.5083333333333333
thresh : 0.71


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 53s) Loss: 0.5847(0.5847) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.5108(0.6275) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.6840(0.6131) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.5048(0.6168) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3422(0.3422) 


Epoch 1 - avg_train_loss: 0.6168  avg_val_loss: 0.5996  time: 36s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6168  avg_val_loss: 0.5996  time: 36s
Epoch 1 - Score: 0.7028
INFO:__main__:Epoch 1 - Score: 0.7028
Epoch 1 - Save Best Score: 0.7028 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7028 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1183(0.5996) 
f1 score : 0.08588957055214724
recall score : 0.0457516339869281
precision score : 0.7
thresh : 0.74
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 52s) Loss: 0.5816(0.5816) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 11s (remain 0m 21s) Loss: 0.8023(0.5937) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.2214(0.5820) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.4928(0.5808) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4411(0.4411) 


Epoch 2 - avg_train_loss: 0.5808  avg_val_loss: 0.6000  time: 36s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5808  avg_val_loss: 0.6000  time: 36s
Epoch 2 - Score: 0.7068
INFO:__main__:Epoch 2 - Score: 0.7068
Epoch 2 - Save Best Score: 0.7068 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7068 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8406(0.6000) 
f1 score : 0.20320855614973263
recall score : 0.12418300653594772
precision score : 0.5588235294117647
thresh : 0.52
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 52s) Loss: 0.6452(0.6452) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.8948(0.5422) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.5554(0.5323) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.4434(0.5286) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2594(0.2594) 


Epoch 3 - avg_train_loss: 0.5286  avg_val_loss: 0.5815  time: 36s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5286  avg_val_loss: 0.5815  time: 36s
Epoch 3 - Score: 0.7149
INFO:__main__:Epoch 3 - Score: 0.7149
Epoch 3 - Save Best Score: 0.7149 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7149 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1951(0.5815) 
f1 score : 0.25252525252525254
recall score : 0.16339869281045752
precision score : 0.5555555555555556
thresh : 0.48
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 49s) Loss: 0.4862(0.4862) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.4840(0.4471) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.3389(0.4501) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.5391(0.4524) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2869(0.2869) 


Epoch 4 - avg_train_loss: 0.4524  avg_val_loss: 0.6066  time: 36s
INFO:__main__:Epoch 4 - avg_train_loss: 0.4524  avg_val_loss: 0.6066  time: 36s
Epoch 4 - Score: 0.7149
INFO:__main__:Epoch 4 - Score: 0.7149


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1904(0.6066) 
f1 score : 0.44274809160305345
recall score : 0.3790849673202614
precision score : 0.5321100917431193
thresh : 0.58


Score: 0.7028
INFO:__main__:Score: 0.7028
ACC BEST Score: 0.7149
INFO:__main__:ACC BEST Score: 0.7149
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.25252525252525254
recall score : 0.16339869281045752
precision score : 0.5555555555555556
thresh : 0.48


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 54s) Loss: 0.7157(0.7157) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.6231(0.6395) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 23s (remain 0m 8s) Loss: 0.6648(0.6289) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.6066(0.6218) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3319(0.3319) 


Epoch 1 - avg_train_loss: 0.6218  avg_val_loss: 0.5869  time: 36s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6218  avg_val_loss: 0.5869  time: 36s
Epoch 1 - Score: 0.7123
INFO:__main__:Epoch 1 - Score: 0.7123
Epoch 1 - Save Best Score: 0.7123 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7123 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1467(0.5869) 
f1 score : 0.025806451612903226
recall score : 0.013157894736842105
precision score : 0.6666666666666666
thresh : 0.35
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 46s) Loss: 0.6665(0.6665) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.6514(0.5836) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.3496(0.5770) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.5831(0.5764) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2604(0.2604) 


Epoch 2 - avg_train_loss: 0.5764  avg_val_loss: 0.5659  time: 36s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5764  avg_val_loss: 0.5659  time: 36s
Epoch 2 - Score: 0.7284
INFO:__main__:Epoch 2 - Score: 0.7284
Epoch 2 - Save Best Score: 0.7284 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7284 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1494(0.5659) 
f1 score : 0.16568047337278105
recall score : 0.09210526315789473
precision score : 0.8235294117647058
thresh : 0.39
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 50s) Loss: 0.4543(0.4543) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.5214(0.5374) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.5619(0.5172) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.5327(0.5103) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3615(0.3615) 


Epoch 3 - avg_train_loss: 0.5103  avg_val_loss: 0.5963  time: 37s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5103  avg_val_loss: 0.5963  time: 37s
Epoch 3 - Score: 0.7183
INFO:__main__:Epoch 3 - Score: 0.7183


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8189(0.5963) 
f1 score : 0.4565217391304348
recall score : 0.4144736842105263
precision score : 0.5080645161290323
thresh : 0.78
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 48s) Loss: 0.3427(0.3427) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.1752(0.3728) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.3714(0.3651) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.4786(0.3678) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3692(0.3692) 


Epoch 4 - avg_train_loss: 0.3678  avg_val_loss: 0.6908  time: 36s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3678  avg_val_loss: 0.6908  time: 36s
Epoch 4 - Score: 0.7203
INFO:__main__:Epoch 4 - Score: 0.7203


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.0130(0.6908) 
f1 score : 0.4357142857142857
recall score : 0.40131578947368424
precision score : 0.4765625
thresh : 0.71


Score: 0.7163
INFO:__main__:Score: 0.7163
ACC BEST Score: 0.7284
INFO:__main__:ACC BEST Score: 0.7284
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.16568047337278105
recall score : 0.09210526315789473
precision score : 0.8235294117647058
thresh : 0.39


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 59s) Loss: 0.6589(0.6589) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.5493(0.6390) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 23s (remain 0m 8s) Loss: 0.6823(0.6320) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.5734(0.6271) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4175(0.4175) 


Epoch 1 - avg_train_loss: 0.6271  avg_val_loss: 0.6120  time: 36s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6271  avg_val_loss: 0.6120  time: 36s
Epoch 1 - Score: 0.6982
INFO:__main__:Epoch 1 - Score: 0.6982
Epoch 1 - Save Best Score: 0.6982 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6982 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.0855(0.6120) 
f1 score : 0.07317073170731707
recall score : 0.039473684210526314
precision score : 0.5
thresh : 0.61
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 51s) Loss: 0.6089(0.6089) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 11s (remain 0m 21s) Loss: 0.7433(0.5956) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.5091(0.5879) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.5797(0.5852) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3831(0.3831) 


Epoch 2 - avg_train_loss: 0.5852  avg_val_loss: 0.5973  time: 36s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5852  avg_val_loss: 0.5973  time: 36s
Epoch 2 - Score: 0.7002
INFO:__main__:Epoch 2 - Score: 0.7002
Epoch 2 - Save Best Score: 0.7002 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7002 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1115(0.5973) 
f1 score : 0.08484848484848485
recall score : 0.046052631578947366
precision score : 0.5384615384615384
thresh : 0.46
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 55s) Loss: 0.4277(0.4277) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.5215(0.5414) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 24s (remain 0m 9s) Loss: 0.4691(0.5302) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 33s (remain 0m 0s) Loss: 0.5497(0.5239) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2693(0.2693) 


Epoch 3 - avg_train_loss: 0.5239  avg_val_loss: 0.6183  time: 37s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5239  avg_val_loss: 0.6183  time: 37s
Epoch 3 - Score: 0.7022
INFO:__main__:Epoch 3 - Score: 0.7022
Epoch 3 - Save Best Score: 0.7022 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7022 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.4743(0.6183) 
f1 score : 0.08588957055214723
recall score : 0.046052631578947366
precision score : 0.6363636363636364
thresh : 0.43
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 50s) Loss: 0.3943(0.3943) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.4135(0.4495) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.4066(0.4412) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 33s (remain 0m 0s) Loss: 0.4167(0.4343) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.4006(0.4006) 


Epoch 4 - avg_train_loss: 0.4343  avg_val_loss: 0.6501  time: 37s
INFO:__main__:Epoch 4 - avg_train_loss: 0.4343  avg_val_loss: 0.6501  time: 37s
Epoch 4 - Score: 0.7062
INFO:__main__:Epoch 4 - Score: 0.7062
Epoch 4 - Save Best Score: 0.7062 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7062 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.2994(0.6501) 
f1 score : 0.3657587548638132
recall score : 0.3092105263157895
precision score : 0.44761904761904764
thresh : 0.67


Score: 0.6720
INFO:__main__:Score: 0.6720
ACC BEST Score: 0.7062
INFO:__main__:ACC BEST Score: 0.7062
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.3657587548638132
recall score : 0.3092105263157895
precision score : 0.44761904761904764
thresh : 0.67


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 51s) Loss: 0.6215(0.6215) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.5682(0.6234) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.8310(0.6280) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.4188(0.6176) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.3077(0.3077) 


Epoch 1 - avg_train_loss: 0.6176  avg_val_loss: 0.5960  time: 37s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6176  avg_val_loss: 0.5960  time: 37s
Epoch 1 - Score: 0.7103
INFO:__main__:Epoch 1 - Score: 0.7103
Epoch 1 - Save Best Score: 0.7103 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7103 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.2152(0.5960) 
f1 score : 0.08536585365853658
recall score : 0.046052631578947366
precision score : 0.5833333333333334
thresh : 0.32
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 51s) Loss: 0.5015(0.5015) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 11s (remain 0m 21s) Loss: 0.6241(0.5860) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.5596(0.5834) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.5576(0.5770) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.4219(0.4219) 


Epoch 2 - avg_train_loss: 0.5770  avg_val_loss: 0.5787  time: 36s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5770  avg_val_loss: 0.5787  time: 36s
Epoch 2 - Score: 0.7082
INFO:__main__:Epoch 2 - Score: 0.7082


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.9042(0.5787) 
f1 score : 0.23280423280423282
recall score : 0.14473684210526316
precision score : 0.5945945945945946
thresh : 0.5
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 43s) Loss: 0.5819(0.5819) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 11s (remain 0m 21s) Loss: 0.6874(0.5252) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.7266(0.5235) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.4287(0.5149) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.3665(0.3665) 


Epoch 3 - avg_train_loss: 0.5149  avg_val_loss: 0.5631  time: 36s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5149  avg_val_loss: 0.5631  time: 36s
Epoch 3 - Score: 0.7243
INFO:__main__:Epoch 3 - Score: 0.7243
Epoch 3 - Save Best Score: 0.7243 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7243 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.9954(0.5631) 
f1 score : 0.40663900414937754
recall score : 0.3223684210526316
precision score : 0.550561797752809
thresh : 0.68
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 50s) Loss: 0.4087(0.4087) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.3140(0.3859) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.5272(0.3765) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.2706(0.3739) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.3253(0.3253) 


Epoch 4 - avg_train_loss: 0.3739  avg_val_loss: 0.6177  time: 36s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3739  avg_val_loss: 0.6177  time: 36s
Epoch 4 - Score: 0.7123
INFO:__main__:Epoch 4 - Score: 0.7123


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1999(0.6177) 
f1 score : 0.4649446494464945
recall score : 0.4144736842105263
precision score : 0.5294117647058824
thresh : 0.78


Score: 0.7123
INFO:__main__:Score: 0.7123
ACC BEST Score: 0.7243
INFO:__main__:ACC BEST Score: 0.7243
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.40663900414937754
recall score : 0.3223684210526316
precision score : 0.550561797752809
thresh : 0.68


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 2m 6s) Loss: 0.6674(0.6674) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.7605(0.6265) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.6221(0.6181) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.5945(0.6138) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5100(0.5100) 


Epoch 1 - avg_train_loss: 0.6138  avg_val_loss: 0.6229  time: 36s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6138  avg_val_loss: 0.6229  time: 36s
Epoch 1 - Score: 0.7143
INFO:__main__:Epoch 1 - Score: 0.7143
Epoch 1 - Save Best Score: 0.7143 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7143 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8288(0.6229) 
f1 score : 0.025806451612903226
recall score : 0.013157894736842105
precision score : 0.6666666666666666
thresh : 0.47
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 48s) Loss: 0.5868(0.5868) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.5440(0.5955) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 1.0378(0.5819) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.5461(0.5840) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.4802(0.4802) 


Epoch 2 - avg_train_loss: 0.5840  avg_val_loss: 0.5988  time: 36s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5840  avg_val_loss: 0.5988  time: 36s
Epoch 2 - Score: 0.7203
INFO:__main__:Epoch 2 - Score: 0.7203
Epoch 2 - Save Best Score: 0.7203 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7203 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7822(0.5988) 
f1 score : 0.4230769230769231
recall score : 0.3618421052631579
precision score : 0.5092592592592593
thresh : 0.54
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 52s) Loss: 0.6333(0.6333) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.3991(0.5160) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.4450(0.5042) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.4196(0.5033) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2930(0.2930) 


Epoch 3 - avg_train_loss: 0.5033  avg_val_loss: 0.5818  time: 37s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5033  avg_val_loss: 0.5818  time: 37s
Epoch 3 - Score: 0.7203
INFO:__main__:Epoch 3 - Score: 0.7203


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.0939(0.5818) 
f1 score : 0.34545454545454546
recall score : 0.25
precision score : 0.5588235294117647
thresh : 0.52
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 51s) Loss: 0.4875(0.4875) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.2611(0.3917) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.2955(0.3593) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 33s (remain 0m 0s) Loss: 0.1570(0.3521) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3962(0.3962) 


Epoch 4 - avg_train_loss: 0.3521  avg_val_loss: 0.6802  time: 37s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3521  avg_val_loss: 0.6802  time: 37s
Epoch 4 - Score: 0.7103
INFO:__main__:Epoch 4 - Score: 0.7103


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.9406(0.6802) 
f1 score : 0.4784053156146179
recall score : 0.47368421052631576
precision score : 0.48322147651006714
thresh : 0.61


Score: 0.6982
INFO:__main__:Score: 0.6982
ACC BEST Score: 0.7203
INFO:__main__:ACC BEST Score: 0.7203
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.4230769230769231
recall score : 0.3618421052631579
precision score : 0.5092592592592593
thresh : 0.54


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 59s) Loss: 0.7012(0.7012) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 11s (remain 0m 21s) Loss: 0.7393(0.6333) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.7502(0.6259) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.5604(0.6186) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.4442(0.4442) 


Epoch 1 - avg_train_loss: 0.6186  avg_val_loss: 0.6090  time: 36s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6186  avg_val_loss: 0.6090  time: 36s
Epoch 1 - Score: 0.7163
INFO:__main__:Epoch 1 - Score: 0.7163
Epoch 1 - Save Best Score: 0.7163 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7163 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8815(0.6090) 
f1 score : 0.07272727272727271
recall score : 0.039473684210526314
precision score : 0.46153846153846156
thresh : 0.38
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 44s) Loss: 0.5075(0.5075) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.5724(0.5815) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 24s (remain 0m 9s) Loss: 0.5764(0.5867) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 33s (remain 0m 0s) Loss: 0.7717(0.5835) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.3515(0.3515) 


Epoch 2 - avg_train_loss: 0.5835  avg_val_loss: 0.5519  time: 37s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5835  avg_val_loss: 0.5519  time: 37s
Epoch 2 - Score: 0.7163
INFO:__main__:Epoch 2 - Score: 0.7163


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.9205(0.5519) 
f1 score : 0.18888888888888888
recall score : 0.1118421052631579
precision score : 0.6071428571428571
thresh : 0.44
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 44s) Loss: 0.6201(0.6201) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.6397(0.5221) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.6684(0.5250) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.5119(0.5175) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.4404(0.4404) 


Epoch 3 - avg_train_loss: 0.5175  avg_val_loss: 0.5643  time: 36s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5175  avg_val_loss: 0.5643  time: 36s
Epoch 3 - Score: 0.7203
INFO:__main__:Epoch 3 - Score: 0.7203
Epoch 3 - Save Best Score: 0.7203 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7203 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7104(0.5643) 
f1 score : 0.4768211920529801
recall score : 0.47368421052631576
precision score : 0.48
thresh : 0.62
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 49s) Loss: 0.5395(0.5395) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.6780(0.4237) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 24s (remain 0m 9s) Loss: 0.5708(0.4137) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 33s (remain 0m 0s) Loss: 0.5417(0.4048) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.3738(0.3738) 


Epoch 4 - avg_train_loss: 0.4048  avg_val_loss: 0.5876  time: 37s
INFO:__main__:Epoch 4 - avg_train_loss: 0.4048  avg_val_loss: 0.5876  time: 37s
Epoch 4 - Score: 0.7143
INFO:__main__:Epoch 4 - Score: 0.7143


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8712(0.5876) 
f1 score : 0.4377358490566038
recall score : 0.3815789473684211
precision score : 0.5132743362831859
thresh : 0.44


Score: 0.6821
INFO:__main__:Score: 0.6821
ACC BEST Score: 0.7203
INFO:__main__:ACC BEST Score: 0.7203
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.4768211920529801
recall score : 0.47368421052631576
precision score : 0.48
thresh : 0.62


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 56s) Loss: 0.7214(0.7214) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.4816(0.6086) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.6686(0.6135) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.5661(0.6144) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2945(0.2945) 


Epoch 1 - avg_train_loss: 0.6144  avg_val_loss: 0.5878  time: 36s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6144  avg_val_loss: 0.5878  time: 36s
Epoch 1 - Score: 0.7062
INFO:__main__:Epoch 1 - Score: 0.7062
Epoch 1 - Save Best Score: 0.7062 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7062 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.2233(0.5878) 
f1 score : 0.09815950920245399
recall score : 0.05263157894736842
precision score : 0.7272727272727273
thresh : 0.62
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 52s) Loss: 0.4873(0.4873) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.6721(0.5986) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.5227(0.5905) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.5343(0.5793) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5422(0.5422) 


Epoch 2 - avg_train_loss: 0.5793  avg_val_loss: 0.6341  time: 36s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5793  avg_val_loss: 0.6341  time: 36s
Epoch 2 - Score: 0.7183
INFO:__main__:Epoch 2 - Score: 0.7183
Epoch 2 - Save Best Score: 0.7183 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7183 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.6640(0.6341) 
f1 score : 0.5053191489361702
recall score : 0.625
precision score : 0.42410714285714285
thresh : 0.73
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 44s) Loss: 0.4693(0.4693) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.6329(0.5167) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.3615(0.5043) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 33s (remain 0m 0s) Loss: 0.3496(0.4935) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2433(0.2433) 


Epoch 3 - avg_train_loss: 0.4935  avg_val_loss: 0.5967  time: 37s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4935  avg_val_loss: 0.5967  time: 37s
Epoch 3 - Score: 0.7103
INFO:__main__:Epoch 3 - Score: 0.7103


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1652(0.5967) 
f1 score : 0.3514644351464436
recall score : 0.27631578947368424
precision score : 0.4827586206896552
thresh : 0.57
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 53s) Loss: 0.3067(0.3067) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.0861(0.3378) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.7036(0.3210) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.2382(0.3143) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2237(0.2237) 


Epoch 4 - avg_train_loss: 0.3143  avg_val_loss: 0.7047  time: 36s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3143  avg_val_loss: 0.7047  time: 36s
Epoch 4 - Score: 0.7022
INFO:__main__:Epoch 4 - Score: 0.7022


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.4447(0.7047) 
f1 score : 0.4106463878326996
recall score : 0.35526315789473684
precision score : 0.4864864864864865
thresh : 0.74


Score: 0.6258
INFO:__main__:Score: 0.6258
ACC BEST Score: 0.7183
INFO:__main__:ACC BEST Score: 0.7183
Score: 0.6892
INFO:__main__:Score: 0.6892
ACC BEST Score: 0.7101
INFO:__main__:ACC BEST Score: 0.7101


f1 score : 0.5053191489361702
recall score : 0.625
precision score : 0.42410714285714285
thresh : 0.73
f1 score : 0.39937839937839936
recall score : 0.33749179251477346
precision score : 0.489058039961941
thresh : 0.68


In [None]:
from google.colab import runtime
runtime.unassign()