In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
!pip install transformers
!pip install datasets
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m92.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m97.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.1
Looking in indexes: https://pypi.org/simple, https://us

In [3]:
!nvidia-smi

Sat Apr 22 17:17:39 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   43C    P0    47W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:

import os
import gc
import math
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import re
import html

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam, SGD, AdamW, RAdam
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,GroupKFold
from sklearn.metrics import log_loss,f1_score, recall_score, accuracy_score, precision_score

from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
DIR = "/content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測"
INPUT_DIR = os.path.join(DIR,"input")
OUTPUT_DIR = os.path.join(DIR,"output")
CUSTOM_MODEL_DIR = os.path.join(OUTPUT_DIR,'clrp_deberta_v3_base')
OUTPUT_EXP_DIR = DIR + '/output/EXP018/'
if not os.path.exists(OUTPUT_EXP_DIR):
    os.makedirs(OUTPUT_EXP_DIR)

In [6]:

# ====================================================
# CFG
# ====================================================
class CFG:
    debug=False
    apex=True
    print_freq=100
    num_workers=4
    model_name="microsoft/deberta-v3-base"
    # model='microsoft/deberta-base'
    # model='roberta-base'
    # model='roberta-large'
    # model='roberta-large-mnli'
    # model='xlnet-large-cased'
    # model='albert-xxlarge-v2'
    # model="microsoft/deberta-large"
    # model="microsoft/deberta-v3-large"
    # model='microsoft/deberta-v2-xlarge'
    # model='funnel-transformer/large'
    # model='funnel-transformer/medium'
    # model='albert-base-v2'
    # model='albert-large-v2'
    # model='google/electra-large-discriminator'
    # model='google/electra-base-discriminator'
    # model="facebook/bart-large-mnli"
    # model="facebook/bart-large"
    # model="facebook/bart-base"
    model = CUSTOM_MODEL_DIR
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=4
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=16
    fc_dropout=0.2
    target_size=1
    max_len=256
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    seed=42
    n_fold=10
    trn_fold=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    train=True
    nth_awp_start_epoch=1
    gradient_checkpointing = False
    freezing = False
    num_reinit_layers = 1
    is_reinit_layer = True

if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0, 1]

In [7]:
def get_score(labels, outputs):
    thresh = 0.5
    y_pred = outputs
    y_true = labels
    f_score = f1_score(y_true, (y_pred>thresh).astype(int))
    r_score = recall_score(y_true, (y_pred>thresh).astype(int))
    p_score = precision_score(y_true, (y_pred>thresh).astype(int))
    print(f"f1 score : {f_score}")
    print(f"recall score : {r_score}")
    print(f"precision score : {p_score}")
    return accuracy_score(y_true, (y_pred>thresh).astype(int))

def get_acc_score(labels, outputs):
    y_pred = outputs
    y_true = labels
    best_score = 0
    best_thresh = 0.5
    for thresh in np.arange(0.1, 0.80, 0.01):
        thresh = np.round(thresh, 2)
        score = accuracy_score(y_true, (y_pred>thresh).astype(int))
        #print("Accuracy score at threshold {0} is {1}".format(thresh, score))
        if score > best_score:
          best_score = score
          best_thresh = thresh
    print(f"thresh : {best_thresh}")
    return accuracy_score(y_true, (y_pred>best_thresh).astype(int))


def get_logger(filename=OUTPUT_EXP_DIR+'train'):
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=CFG.seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=CFG.seed)

In [8]:
def freeze(module):
    """
    Freezes module's parameters.
    """
    
    for parameter in module.parameters():
        parameter.requires_grad = False
        
def get_freezed_parameters(module):
    """
    Returns names of freezed parameters of the given module.
    """
    
    freezed_parameters = []
    for name, parameter in module.named_parameters():
        if not parameter.requires_grad:
            freezed_parameters.append(name)
            
    return freezed_parameters

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """
    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930
    """
    
    embedding_types = ("word", "position", "token_type")
    for embedding_type in embedding_types:
        attr_name = f"{embedding_type}_embeddings"
        
        if hasattr(embeddings_path, attr_name): 
            bnb.optim.GlobalOptimManager.get_instance().register_module_override(
                getattr(embeddings_path, attr_name), 'weight', {'optim_bits': optim_bits}
            )

In [9]:
import pandas as pd
import numpy as np

train = pd.read_csv(os.path.join(INPUT_DIR,"train_data.csv"))
test = pd.read_csv(os.path.join(INPUT_DIR,"test_data.csv"))
sample_sub = pd.read_csv(os.path.join(INPUT_DIR,"submission.csv"))

print(train.shape)
display(train.head(3))

print(test.shape)
display(test.head(3))

print(sample_sub.shape)
display(sample_sub.head(3))

(4974, 6)


Unnamed: 0,id,title,year,abstract,keywords,y
0,1,Hierarchical Adversarially Learned Inference,2018,We propose a novel hierarchical generative mod...,"generative, hierarchical, unsupervised, semisu...",0
1,2,Learning to Compute Word Embeddings On the Fly,2018,Words in natural language follow a Zipfian dis...,"NLU, word embeddings, representation learning",0
2,3,Graph2Seq: Scalable Learning Dynamics for Graphs,2018,Neural networks are increasingly used as a gen...,,0


(6393, 5)


Unnamed: 0,id,title,year,abstract,keywords
0,1,StyleAlign: Analysis and Applications of Align...,2022,"In this paper, we perform an in-depth study of...","StyleGAN, transfer learning, fine tuning, mode..."
1,2,Embedding a random graph via GNN: mean-field i...,2021,We develop a theory for embedding a random gra...,"Graph neural network, graph embedding, multi-r..."
2,3,BBRefinement: an universal scheme to improve p...,2021,We present a conceptually simple yet powerful ...,"object detection, deep neural networks, refine..."


(6393, 2)


Unnamed: 0,id,y
0,1,0
1,2,0
2,3,0


In [10]:
train["texts"] = train["title"] + "[SEP]" + train["abstract"] 

In [11]:
skf = StratifiedKFold(n_splits=CFG.n_fold,shuffle=True,random_state=CFG.seed)
for fold, ( _, val_) in enumerate(skf.split(train, train.y)):
    train.loc[val_ , "kfold"] = int(fold)
    
train["kfold"] = train["kfold"].astype(int)

if CFG.debug:
    display(train.groupby('kfold').size())
    train = train.sample(n=500, random_state=0).reset_index(drop=True)
    display(train.groupby('kfold').size())

In [12]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_EXP_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

In [13]:
# ====================================================
# Define max_len
# ====================================================
lengths = []
tk0 = tqdm(train['texts'].fillna("").values, total=len(train))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
    lengths.append(length)
CFG.max_len = max(lengths) + 3 # cls + sep + sep
LOGGER.info(f"max_len: {CFG.max_len}")

100%|██████████| 4974/4974 [00:03<00:00, 1330.10it/s]
max_len: 522
INFO:__main__:max_len: 522


In [14]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer(text,
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           padding="max_length",
                           return_offsets_mapping=False,
                           truncation=True)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.inputs = df['texts'].values
        self.labels = df['y'].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.inputs[item])
        label = torch.tensor(self.labels[item], dtype=torch.half)
        return inputs, label

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

class ValidDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.inputs = df['texts'].values
        self.labels = df['y'].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.inputs[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

#collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [15]:
def reinit_layers(model):

    #for layer in model.model.encoder.layer[-CFG.num_reinit_layers:]:
    for layer in model.encoder.layer[-CFG.num_reinit_layers:]:    #Custome model内(backbone)

            for module in layer.modules():

                if isinstance(module,nn.Linear):
                    module.weight.data.normal_(mean=0.0,std=model.config.initializer_range)
                    if module.bias is not None:
                            module.bias.data.zero_()
                elif isinstance(module, nn.Embedding):
                        module.weight.data.normal_(mean=0.0, std=model.config.initializer_range)
                        if module.padding_idx is not None:
                            module.weight.data[module.padding_idx].zero_()
                elif isinstance(module, nn.LayerNorm):
                        module.bias.data.zero_()
                        module.weight.data.fill_(1.0)
                        
    return model

In [16]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

class MaxPooling(nn.Module):
    def __init__(self):
        super(MaxPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        embeddings = last_hidden_state.clone()
        embeddings[input_mask_expanded == 0] = -1e4
        max_embeddings, _ = torch.max(embeddings, dim=1)
        return max_embeddings
    

class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        if CFG.is_reinit_layer:
            self.model = reinit_layers(self.model)
            print(f'Reinitializing Last {CFG.num_reinit_layers} Layers.')
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # Freezing
        if cfg.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            cfg.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())

        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)
        self._init_weights(self.fc)
        self.layer_norm1 = nn.LayerNorm(self.config.hidden_size)
        self._init_weights(self.fc)
        self.sig = nn.Sigmoid()
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        feature = self.layer_norm1(feature)
        output = self.fc(feature)
        #output = self.sig(output)
        return output

In [17]:

# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
        #print(y_preds.sigmoid().squeeze().view(1, -1))
        loss = criterion(y_preds.sigmoid().squeeze(), labels.squeeze())
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))

    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
        loss = criterion(y_preds.sigmoid().squeeze(), labels.squeeze())
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    predictions = np.concatenate(predictions)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

In [18]:

# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['kfold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['kfold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds['y'].values
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = ValidDataset(CFG, valid_folds)


    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size*2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_EXP_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr=5e-6, decoder_lr=1e-4, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", 
                    "LayerNorm.weight"]
        group1=['layer.0.','layer.1.','layer.2.','layer.3.']
        group2=['layer.4.','layer.5.','layer.6.','layer.7.']    
        group3=['layer.8.','layer.9.','layer.10.','layer.11.']
        group_all=['layer.0.','layer.1.','layer.2.','layer.3.','layer.4.','layer.5.','layer.6.','layer.7.','layer.8.','layer.9.','layer.10.','layer.11.']
        optimizer_parameters = [
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],'weight_decay': weight_decay},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],'weight_decay': weight_decay, 'lr': encoder_lr/2.6},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],'weight_decay': weight_decay, 'lr': encoder_lr},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],'weight_decay': weight_decay, 'lr': encoder_lr*2.6},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],'weight_decay': 0.0},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],'weight_decay': 0.0, 'lr': encoder_lr/2.6},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],'weight_decay': 0.0, 'lr': encoder_lr},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],'weight_decay': 0.0, 'lr': encoder_lr*2.6},
        {'params': [p for n, p in model.named_parameters() if "model" not in n], 'lr':decoder_lr, "momentum" : 0.99},
    ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.BCELoss()
    
    best_score = -1.

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        
        # scoring
        score_05 = get_score(valid_labels, predictions)
        score = get_acc_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')

        
        if best_score < score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_EXP_DIR+f"{CFG.model_name.replace('/', '-')}_fold{fold}_best.pth")

    predictions = torch.load(OUTPUT_EXP_DIR+f"{CFG.model_name.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds['pred'] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [19]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        labels = oof_df['y'].values
        preds = oof_df['pred'].values
        score = get_score(labels, preds)
        acc_score = get_acc_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}')
        LOGGER.info(f'ACC BEST Score: {acc_score:<.4f}')
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
            #break
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_EXP_DIR+'oof_df.pkl')

DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "torch_dtype": "f

Reinitializing Last 1 Layers.
Epoch: [1][0/279] Elapsed 0m 3s (remain 16m 14s) Loss: 0.8579(0.8579) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 15s (remain 0m 26s) Loss: 0.6050(0.6751) Grad: 1.2253  LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 26s (remain 0m 10s) Loss: 0.7310(0.6477) Grad: 4.6794  LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 35s (remain 0m 0s) Loss: 0.5742(0.6389) Grad: 3.8866  LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4164(0.4164) 


Epoch 1 - avg_train_loss: 0.6389  avg_val_loss: 0.5863  time: 39s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6389  avg_val_loss: 0.5863  time: 39s
Epoch 1 - Score: 0.7149
INFO:__main__:Epoch 1 - Score: 0.7149
Epoch 1 - Save Best Score: 0.7149 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7149 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.9430(0.5863) 
f1 score : 0.13333333333333336
recall score : 0.07236842105263158
precision score : 0.8461538461538461
thresh : 0.48
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 30s) Loss: 0.5161(0.5161) Grad: nan  LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.4434(0.6053) Grad: 2.6112  LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.5986(0.5947) Grad: nan  LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.4629(0.5885) Grad: 4.1691  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3289(0.3289) 


Epoch 2 - avg_train_loss: 0.5885  avg_val_loss: 0.5773  time: 36s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5885  avg_val_loss: 0.5773  time: 36s
Epoch 2 - Score: 0.7068
INFO:__main__:Epoch 2 - Score: 0.7068


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.0259(0.5773) 
f1 score : 0.06369426751592357
recall score : 0.03289473684210526
precision score : 1.0
thresh : 0.48
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 31s) Loss: 0.5044(0.5044) Grad: nan  LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.2837(0.5319) Grad: 4.2700  LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 23s (remain 0m 8s) Loss: 0.6846(0.5257) Grad: 7.9812  LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.4092(0.5271) Grad: 2.2792  LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.2812(0.2812) 


Epoch 3 - avg_train_loss: 0.5271  avg_val_loss: 0.5738  time: 36s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5271  avg_val_loss: 0.5738  time: 36s
Epoch 3 - Score: 0.7209
INFO:__main__:Epoch 3 - Score: 0.7209
Epoch 3 - Save Best Score: 0.7209 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7209 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1062(0.5738) 
f1 score : 0.3090909090909091
recall score : 0.2236842105263158
precision score : 0.5
thresh : 0.54
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 35s) Loss: 0.3882(0.3882) Grad: nan  LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.3928(0.4404) Grad: 11.4376  LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.4517(0.4477) Grad: 3.2030  LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.2632(0.4444) Grad: 2.3157  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3231(0.3231) 


Epoch 4 - avg_train_loss: 0.4444  avg_val_loss: 0.5996  time: 36s
INFO:__main__:Epoch 4 - avg_train_loss: 0.4444  avg_val_loss: 0.5996  time: 36s
Epoch 4 - Score: 0.7129
INFO:__main__:Epoch 4 - Score: 0.7129


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.0710(0.5996) 
f1 score : 0.37354085603112835
recall score : 0.3157894736842105
precision score : 0.45714285714285713
thresh : 0.76


Score: 0.6948
INFO:__main__:Score: 0.6948
ACC BEST Score: 0.7209
INFO:__main__:ACC BEST Score: 0.7209
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.3090909090909091
recall score : 0.2236842105263158
precision score : 0.5
thresh : 0.54


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Reinitializing Last 1 Layers.
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 46s) Loss: 0.7471(0.7471) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.5459(0.6443) Grad: 1.8629  LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 23s (remain 0m 8s) Loss: 0.6343(0.6304) Grad: 2.5503  LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.4390(0.6262) Grad: 5.1195  LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3707(0.3707) 


Epoch 1 - avg_train_loss: 0.6262  avg_val_loss: 0.5934  time: 36s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6262  avg_val_loss: 0.5934  time: 36s
Epoch 1 - Score: 0.7129
INFO:__main__:Epoch 1 - Score: 0.7129
Epoch 1 - Save Best Score: 0.7129 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7129 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1091(0.5934) 
f1 score : 0.08695652173913043
recall score : 0.0457516339869281
precision score : 0.875
thresh : 0.39
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 43s) Loss: 0.6792(0.6792) Grad: nan  LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.4954(0.6029) Grad: 1.2717  LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 24s (remain 0m 9s) Loss: 0.5176(0.5862) Grad: 4.5748  LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.5840(0.5803) Grad: 3.1572  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3370(0.3370) 


Epoch 2 - avg_train_loss: 0.5803  avg_val_loss: 0.5690  time: 37s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5803  avg_val_loss: 0.5690  time: 37s
Epoch 2 - Score: 0.7229
INFO:__main__:Epoch 2 - Score: 0.7229
Epoch 2 - Save Best Score: 0.7229 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7229 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1372(0.5690) 
f1 score : 0.17777777777777778
recall score : 0.10457516339869281
precision score : 0.5925925925925926
thresh : 0.41
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 37s) Loss: 0.5254(0.5254) Grad: nan  LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.7495(0.5184) Grad: 13.2526  LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.5083(0.5110) Grad: 2.1044  LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.4470(0.5125) Grad: 2.1606  LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2960(0.2960) 


Epoch 3 - avg_train_loss: 0.5125  avg_val_loss: 0.5611  time: 36s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5125  avg_val_loss: 0.5611  time: 36s
Epoch 3 - Score: 0.7349
INFO:__main__:Epoch 3 - Score: 0.7349
Epoch 3 - Save Best Score: 0.7349 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7349 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1968(0.5611) 
f1 score : 0.2814070351758794
recall score : 0.1830065359477124
precision score : 0.6086956521739131
thresh : 0.43
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 46s) Loss: 0.3801(0.3801) Grad: nan  LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.2211(0.4210) Grad: 2.4814  LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.3440(0.4114) Grad: 2.9194  LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.3557(0.4081) Grad: 3.1290  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3761(0.3761) 


Epoch 4 - avg_train_loss: 0.4081  avg_val_loss: 0.6001  time: 36s
INFO:__main__:Epoch 4 - avg_train_loss: 0.4081  avg_val_loss: 0.6001  time: 36s
Epoch 4 - Score: 0.7349
INFO:__main__:Epoch 4 - Score: 0.7349


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1713(0.6001) 
f1 score : 0.48175182481751827
recall score : 0.43137254901960786
precision score : 0.5454545454545454
thresh : 0.64


Score: 0.7129
INFO:__main__:Score: 0.7129
ACC BEST Score: 0.7349
INFO:__main__:ACC BEST Score: 0.7349
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.2814070351758794
recall score : 0.1830065359477124
precision score : 0.6086956521739131
thresh : 0.43


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Reinitializing Last 1 Layers.
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 46s) Loss: 0.6865(0.6865) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.3130(0.6525) Grad: 3.8773  LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 23s (remain 0m 8s) Loss: 0.7075(0.6370) Grad: 1.6833  LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.6147(0.6324) Grad: 4.1199  LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4431(0.4431) 


Epoch 1 - avg_train_loss: 0.6324  avg_val_loss: 0.5899  time: 36s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6324  avg_val_loss: 0.5899  time: 36s
Epoch 1 - Score: 0.7169
INFO:__main__:Epoch 1 - Score: 0.7169
Epoch 1 - Save Best Score: 0.7169 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7169 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.9171(0.5899) 
f1 score : 0.13095238095238093
recall score : 0.0718954248366013
precision score : 0.7333333333333333
thresh : 0.46
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 42s) Loss: 0.5752(0.5752) Grad: nan  LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.5542(0.5705) Grad: 2.0768  LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.4407(0.5773) Grad: 9.2542  LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.4883(0.5769) Grad: 3.5570  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3410(0.3410) 


Epoch 2 - avg_train_loss: 0.5769  avg_val_loss: 0.5616  time: 37s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5769  avg_val_loss: 0.5616  time: 37s
Epoch 2 - Score: 0.7129
INFO:__main__:Epoch 2 - Score: 0.7129


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1112(0.5616) 
f1 score : 0.27230046948356806
recall score : 0.1895424836601307
precision score : 0.48333333333333334
thresh : 0.58
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 34s) Loss: 0.4395(0.4395) Grad: nan  LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.6006(0.5109) Grad: 1.7736  LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.3835(0.5101) Grad: 2.7832  LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.5591(0.5044) Grad: 4.4556  LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2589(0.2589) 


Epoch 3 - avg_train_loss: 0.5044  avg_val_loss: 0.5910  time: 36s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5044  avg_val_loss: 0.5910  time: 36s
Epoch 3 - Score: 0.7129
INFO:__main__:Epoch 3 - Score: 0.7129


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.4551(0.5910) 
f1 score : 0.3222748815165877
recall score : 0.2222222222222222
precision score : 0.5862068965517241
thresh : 0.5
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 32s) Loss: 0.4512(0.4512) Grad: nan  LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.2369(0.3989) Grad: 2.9726  LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.4851(0.4032) Grad: 6.2311  LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.3147(0.3943) Grad: 7.1805  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3322(0.3322) 


Epoch 4 - avg_train_loss: 0.3943  avg_val_loss: 0.6235  time: 36s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3943  avg_val_loss: 0.6235  time: 36s
Epoch 4 - Score: 0.7088
INFO:__main__:Epoch 4 - Score: 0.7088


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.3640(0.6235) 
f1 score : 0.4338235294117647
recall score : 0.38562091503267976
precision score : 0.4957983193277311
thresh : 0.7


Score: 0.7068
INFO:__main__:Score: 0.7068
ACC BEST Score: 0.7169
INFO:__main__:ACC BEST Score: 0.7169
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.13095238095238093
recall score : 0.0718954248366013
precision score : 0.7333333333333333
thresh : 0.46


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Reinitializing Last 1 Layers.
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 44s) Loss: 0.8359(0.8359) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.6455(0.6674) Grad: 2.3260  LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.8784(0.6439) Grad: 8.3473  LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.6348(0.6342) Grad: 2.7500  LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5454(0.5454) 


Epoch 1 - avg_train_loss: 0.6342  avg_val_loss: 0.6367  time: 36s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6342  avg_val_loss: 0.6367  time: 36s
Epoch 1 - Score: 0.7008
INFO:__main__:Epoch 1 - Score: 0.7008
Epoch 1 - Save Best Score: 0.7008 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7008 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7731(0.6367) 
f1 score : 0.24884792626728108
recall score : 0.17647058823529413
precision score : 0.421875
thresh : 0.58
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 38s) Loss: 0.6436(0.6436) Grad: nan  LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.6616(0.5702) Grad: 5.1242  LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.7290(0.5735) Grad: 5.9234  LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.6274(0.5802) Grad: 2.5032  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3095(0.3095) 


Epoch 2 - avg_train_loss: 0.5802  avg_val_loss: 0.5776  time: 36s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5802  avg_val_loss: 0.5776  time: 36s
Epoch 2 - Score: 0.7008
INFO:__main__:Epoch 2 - Score: 0.7008


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.0824(0.5776) 
f1 score : 0.08484848484848485
recall score : 0.0457516339869281
precision score : 0.5833333333333334
thresh : 0.52
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 31s) Loss: 0.5215(0.5215) Grad: nan  LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.5557(0.5307) Grad: 2.1907  LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.5518(0.5144) Grad: 3.0200  LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.6865(0.5121) Grad: 2.6950  LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2975(0.2975) 


Epoch 3 - avg_train_loss: 0.5121  avg_val_loss: 0.5936  time: 36s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5121  avg_val_loss: 0.5936  time: 36s
Epoch 3 - Score: 0.7108
INFO:__main__:Epoch 3 - Score: 0.7108
Epoch 3 - Save Best Score: 0.7108 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7108 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1119(0.5936) 
f1 score : 0.3900414937759336
recall score : 0.30718954248366015
precision score : 0.5340909090909091
thresh : 0.49
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 45s) Loss: 0.4402(0.4402) Grad: nan  LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.3674(0.4210) Grad: 3.5980  LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.4944(0.4095) Grad: 5.2928  LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.7720(0.4180) Grad: 5.0051  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2893(0.2893) 


Epoch 4 - avg_train_loss: 0.4180  avg_val_loss: 0.6431  time: 36s
INFO:__main__:Epoch 4 - avg_train_loss: 0.4180  avg_val_loss: 0.6431  time: 36s
Epoch 4 - Score: 0.7269
INFO:__main__:Epoch 4 - Score: 0.7269
Epoch 4 - Save Best Score: 0.7269 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7269 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.2681(0.6431) 
f1 score : 0.4518518518518519
recall score : 0.39869281045751637
precision score : 0.5213675213675214
thresh : 0.63


Score: 0.7028
INFO:__main__:Score: 0.7028
ACC BEST Score: 0.7269
INFO:__main__:ACC BEST Score: 0.7269
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.4518518518518519
recall score : 0.39869281045751637
precision score : 0.5213675213675214
thresh : 0.63


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Reinitializing Last 1 Layers.
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 51s) Loss: 0.6162(0.6162) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.6133(0.6940) Grad: 1.9053  LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.6943(0.6486) Grad: 2.4571  LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.5879(0.6382) Grad: 3.6620  LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3450(0.3450) 


Epoch 1 - avg_train_loss: 0.6382  avg_val_loss: 0.5871  time: 37s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6382  avg_val_loss: 0.5871  time: 37s
Epoch 1 - Score: 0.7143
INFO:__main__:Epoch 1 - Score: 0.7143
Epoch 1 - Save Best Score: 0.7143 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7143 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.0376(0.5871) 
f1 score : 0.025641025641025637
recall score : 0.013157894736842105
precision score : 0.5
thresh : 0.44
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 39s) Loss: 0.4565(0.4565) Grad: nan  LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.5596(0.5893) Grad: 1.4264  LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.4839(0.5837) Grad: 2.6691  LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.5811(0.5792) Grad: 2.5805  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2889(0.2889) 


Epoch 2 - avg_train_loss: 0.5792  avg_val_loss: 0.5621  time: 36s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5792  avg_val_loss: 0.5621  time: 36s
Epoch 2 - Score: 0.7223
INFO:__main__:Epoch 2 - Score: 0.7223
Epoch 2 - Save Best Score: 0.7223 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7223 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.9122(0.5621) 
f1 score : 0.25806451612903225
recall score : 0.15789473684210525
precision score : 0.7058823529411765
thresh : 0.5
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 43s) Loss: 0.3274(0.3274) Grad: nan  LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.7012(0.5420) Grad: 9.7596  LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.6138(0.5364) Grad: 6.0031  LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.4370(0.5237) Grad: 3.9747  LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2300(0.2300) 


Epoch 3 - avg_train_loss: 0.5237  avg_val_loss: 0.6029  time: 36s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5237  avg_val_loss: 0.6029  time: 36s
Epoch 3 - Score: 0.7103
INFO:__main__:Epoch 3 - Score: 0.7103


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8671(0.6029) 
f1 score : 0.4031007751937984
recall score : 0.34210526315789475
precision score : 0.49056603773584906
thresh : 0.7
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 31s) Loss: 0.2009(0.2009) Grad: nan  LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.4426(0.4491) Grad: 5.6758  LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.4180(0.4437) Grad: 7.0393  LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.2397(0.4335) Grad: 3.9526  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2223(0.2223) 


Epoch 4 - avg_train_loss: 0.4335  avg_val_loss: 0.6361  time: 36s
INFO:__main__:Epoch 4 - avg_train_loss: 0.4335  avg_val_loss: 0.6361  time: 36s
Epoch 4 - Score: 0.7123
INFO:__main__:Epoch 4 - Score: 0.7123


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8923(0.6361) 
f1 score : 0.39846743295019166
recall score : 0.34210526315789475
precision score : 0.47706422018348627
thresh : 0.73


Score: 0.7223
INFO:__main__:Score: 0.7223
ACC BEST Score: 0.7223
INFO:__main__:ACC BEST Score: 0.7223
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.25806451612903225
recall score : 0.15789473684210525
precision score : 0.7058823529411765
thresh : 0.5


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Reinitializing Last 1 Layers.
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 50s) Loss: 0.9355(0.9355) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 11s (remain 0m 21s) Loss: 0.6006(0.6512) Grad: 0.7931  LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.6211(0.6349) Grad: 0.5946  LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.4153(0.6257) Grad: 3.5799  LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3505(0.3505) 


Epoch 1 - avg_train_loss: 0.6257  avg_val_loss: 0.6051  time: 36s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6257  avg_val_loss: 0.6051  time: 36s
Epoch 1 - Score: 0.6982
INFO:__main__:Epoch 1 - Score: 0.6982
Epoch 1 - Save Best Score: 0.6982 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6982 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.2037(0.6051) 
f1 score : 0.061728395061728385
recall score : 0.03289473684210526
precision score : 0.5
thresh : 0.4
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 40s) Loss: 0.6597(0.6597) Grad: nan  LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.7803(0.5813) Grad: 8.7795  LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.3555(0.5675) Grad: 0.9919  LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.6396(0.5645) Grad: 3.8232  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4704(0.4704) 


Epoch 2 - avg_train_loss: 0.5645  avg_val_loss: 0.5947  time: 36s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5645  avg_val_loss: 0.5947  time: 36s
Epoch 2 - Score: 0.6982
INFO:__main__:Epoch 2 - Score: 0.6982


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.9712(0.5947) 
f1 score : 0.17894736842105263
recall score : 0.1118421052631579
precision score : 0.4473684210526316
thresh : 0.54
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 39s) Loss: 0.5991(0.5991) Grad: nan  LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.3411(0.5092) Grad: 7.8123  LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.3501(0.4931) Grad: 3.0225  LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.4041(0.4917) Grad: 2.8841  LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5421(0.5421) 


Epoch 3 - avg_train_loss: 0.4917  avg_val_loss: 0.6367  time: 36s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4917  avg_val_loss: 0.6367  time: 36s
Epoch 3 - Score: 0.7103
INFO:__main__:Epoch 3 - Score: 0.7103
Epoch 3 - Save Best Score: 0.7103 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7103 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.9511(0.6367) 
f1 score : 0.4300341296928327
recall score : 0.4144736842105263
precision score : 0.44680851063829785
thresh : 0.76
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 35s) Loss: 0.1842(0.1842) Grad: nan  LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.2615(0.3695) Grad: 3.1879  LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.3792(0.3643) Grad: 4.6052  LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.4683(0.3580) Grad: 6.3761  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4457(0.4457) 


Epoch 4 - avg_train_loss: 0.3580  avg_val_loss: 0.6994  time: 36s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3580  avg_val_loss: 0.6994  time: 36s
Epoch 4 - Score: 0.7082
INFO:__main__:Epoch 4 - Score: 0.7082


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.3940(0.6994) 
f1 score : 0.3880597014925373
recall score : 0.34210526315789475
precision score : 0.4482758620689655
thresh : 0.77


Score: 0.6640
INFO:__main__:Score: 0.6640
ACC BEST Score: 0.7103
INFO:__main__:ACC BEST Score: 0.7103
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.4300341296928327
recall score : 0.4144736842105263
precision score : 0.44680851063829785
thresh : 0.76


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Reinitializing Last 1 Layers.
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 45s) Loss: 1.0371(1.0371) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.6802(0.6703) Grad: 3.4712  LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.5137(0.6412) Grad: 1.4258  LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.5601(0.6265) Grad: 2.1573  LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4557(0.4557) 


Epoch 1 - avg_train_loss: 0.6265  avg_val_loss: 0.5890  time: 36s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6265  avg_val_loss: 0.5890  time: 36s
Epoch 1 - Score: 0.7042
INFO:__main__:Epoch 1 - Score: 0.7042
Epoch 1 - Save Best Score: 0.7042 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7042 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8144(0.5890) 
f1 score : 0.3090909090909091
recall score : 0.2236842105263158
precision score : 0.5
thresh : 0.46
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 47s) Loss: 0.5723(0.5723) Grad: nan  LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 11s (remain 0m 21s) Loss: 0.6475(0.5892) Grad: 0.8705  LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.5923(0.5805) Grad: 2.2367  LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.4766(0.5750) Grad: 2.3766  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4030(0.4030) 


Epoch 2 - avg_train_loss: 0.5750  avg_val_loss: 0.5666  time: 37s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5750  avg_val_loss: 0.5666  time: 37s
Epoch 2 - Score: 0.7123
INFO:__main__:Epoch 2 - Score: 0.7123
Epoch 2 - Save Best Score: 0.7123 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7123 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8949(0.5666) 
f1 score : 0.42105263157894735
recall score : 0.34210526315789475
precision score : 0.5473684210526316
thresh : 0.49
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 43s) Loss: 0.6543(0.6543) Grad: nan  LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.4875(0.4958) Grad: 9.5295  LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.5342(0.4916) Grad: 5.0464  LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.3796(0.4885) Grad: 2.4511  LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.1860(0.1860) 


Epoch 3 - avg_train_loss: 0.4885  avg_val_loss: 0.6003  time: 36s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4885  avg_val_loss: 0.6003  time: 36s
Epoch 3 - Score: 0.7143
INFO:__main__:Epoch 3 - Score: 0.7143
Epoch 3 - Save Best Score: 0.7143 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7143 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.5001(0.6003) 
f1 score : 0.20765027322404372
recall score : 0.125
precision score : 0.6129032258064516
thresh : 0.29
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 41s) Loss: 0.3994(0.3994) Grad: nan  LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 11s (remain 0m 21s) Loss: 0.4556(0.3993) Grad: 7.6156  LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.5708(0.3789) Grad: 4.4353  LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.2422(0.3677) Grad: 6.1707  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3265(0.3265) 


Epoch 4 - avg_train_loss: 0.3677  avg_val_loss: 0.6149  time: 36s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3677  avg_val_loss: 0.6149  time: 36s
Epoch 4 - Score: 0.7203
INFO:__main__:Epoch 4 - Score: 0.7203
Epoch 4 - Save Best Score: 0.7203 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7203 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.3085(0.6149) 
f1 score : 0.4501845018450184
recall score : 0.40131578947368424
precision score : 0.5126050420168067
thresh : 0.79


Score: 0.7002
INFO:__main__:Score: 0.7002
ACC BEST Score: 0.7203
INFO:__main__:ACC BEST Score: 0.7203
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.4501845018450184
recall score : 0.40131578947368424
precision score : 0.5126050420168067
thresh : 0.79


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Reinitializing Last 1 Layers.
Epoch: [1][0/279] Elapsed 0m 0s (remain 4m 34s) Loss: 1.0537(1.0537) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 12s (remain 0m 22s) Loss: 0.5649(0.6697) Grad: 2.1752  LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 24s (remain 0m 9s) Loss: 0.5386(0.6483) Grad: 0.7302  LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 33s (remain 0m 0s) Loss: 0.7129(0.6368) Grad: 3.8873  LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.7947(0.7947) 


Epoch 1 - avg_train_loss: 0.6368  avg_val_loss: 0.7273  time: 37s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6368  avg_val_loss: 0.7273  time: 37s
Epoch 1 - Score: 0.7203
INFO:__main__:Epoch 1 - Score: 0.7203
Epoch 1 - Save Best Score: 0.7203 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7203 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.5258(0.7273) 
f1 score : 0.4911660777385159
recall score : 0.9144736842105263
precision score : 0.3357487922705314
thresh : 0.64
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 40s) Loss: 0.6367(0.6367) Grad: nan  LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.7495(0.6008) Grad: 6.2488  LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 24s (remain 0m 9s) Loss: 0.5459(0.5935) Grad: 2.4777  LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.5742(0.5856) Grad: 2.6818  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3784(0.3784) 


Epoch 2 - avg_train_loss: 0.5856  avg_val_loss: 0.5835  time: 36s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5856  avg_val_loss: 0.5835  time: 36s
Epoch 2 - Score: 0.7243
INFO:__main__:Epoch 2 - Score: 0.7243
Epoch 2 - Save Best Score: 0.7243 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7243 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.0412(0.5835) 
f1 score : 0.2604166666666667
recall score : 0.16447368421052633
precision score : 0.625
thresh : 0.54
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 40s) Loss: 0.5029(0.5029) Grad: nan  LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.4258(0.5158) Grad: 2.1257  LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.5010(0.5192) Grad: 3.6131  LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.4429(0.5164) Grad: 7.4250  LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4813(0.4813) 


Epoch 3 - avg_train_loss: 0.5164  avg_val_loss: 0.6027  time: 36s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5164  avg_val_loss: 0.6027  time: 36s
Epoch 3 - Score: 0.7284
INFO:__main__:Epoch 3 - Score: 0.7284
Epoch 3 - Save Best Score: 0.7284 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7284 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7824(0.6027) 
f1 score : 0.46779661016949153
recall score : 0.45394736842105265
precision score : 0.4825174825174825
thresh : 0.67
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 35s) Loss: 0.5298(0.5298) Grad: nan  LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.2520(0.4366) Grad: 3.8844  LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.3467(0.4216) Grad: 2.5142  LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.4707(0.4242) Grad: 5.2692  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4079(0.4079) 


Epoch 4 - avg_train_loss: 0.4242  avg_val_loss: 0.6321  time: 36s
INFO:__main__:Epoch 4 - avg_train_loss: 0.4242  avg_val_loss: 0.6321  time: 36s
Epoch 4 - Score: 0.7243
INFO:__main__:Epoch 4 - Score: 0.7243


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.0867(0.6321) 
f1 score : 0.4521072796934865
recall score : 0.3881578947368421
precision score : 0.5412844036697247
thresh : 0.71


Score: 0.6841
INFO:__main__:Score: 0.6841
ACC BEST Score: 0.7284
INFO:__main__:ACC BEST Score: 0.7284
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.46779661016949153
recall score : 0.45394736842105265
precision score : 0.4825174825174825
thresh : 0.67


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Reinitializing Last 1 Layers.
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 57s) Loss: 0.8638(0.8638) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 11s (remain 0m 21s) Loss: 0.5576(0.6883) Grad: 0.4639  LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.5952(0.6544) Grad: 0.4819  LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.6562(0.6408) Grad: 3.0818  LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3342(0.3342) 


Epoch 1 - avg_train_loss: 0.6408  avg_val_loss: 0.5806  time: 36s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6408  avg_val_loss: 0.5806  time: 36s
Epoch 1 - Score: 0.7062
INFO:__main__:Epoch 1 - Score: 0.7062
Epoch 1 - Save Best Score: 0.7062 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7062 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.0675(0.5806) 
f1 score : 0.03821656050955414
recall score : 0.019736842105263157
precision score : 0.6
thresh : 0.38
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 49s) Loss: 0.5029(0.5029) Grad: nan  LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.6660(0.5933) Grad: 5.3057  LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.5249(0.5944) Grad: 0.5530  LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.4856(0.5867) Grad: 0.7973  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2541(0.2541) 


Epoch 2 - avg_train_loss: 0.5867  avg_val_loss: 0.5860  time: 36s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5867  avg_val_loss: 0.5860  time: 36s
Epoch 2 - Score: 0.7103
INFO:__main__:Epoch 2 - Score: 0.7103
Epoch 2 - Save Best Score: 0.7103 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7103 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.2750(0.5860) 
f1 score : 0.025806451612903226
recall score : 0.013157894736842105
precision score : 0.6666666666666666
thresh : 0.32
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 33s) Loss: 0.3594(0.3594) Grad: nan  LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.5117(0.5160) Grad: 1.9507  LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.3723(0.5256) Grad: 7.9839  LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.5234(0.5184) Grad: 3.3163  LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4369(0.4369) 


Epoch 3 - avg_train_loss: 0.5184  avg_val_loss: 0.5481  time: 36s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5184  avg_val_loss: 0.5481  time: 36s
Epoch 3 - Score: 0.7123
INFO:__main__:Epoch 3 - Score: 0.7123
Epoch 3 - Save Best Score: 0.7123 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7123 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7376(0.5481) 
f1 score : 0.4859154929577465
recall score : 0.45394736842105265
precision score : 0.5227272727272727
thresh : 0.56
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 48s) Loss: 0.5298(0.5298) Grad: nan  LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.3223(0.4473) Grad: 6.6504  LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.2146(0.4246) Grad: 4.0560  LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.4536(0.4265) Grad: 7.7207  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4018(0.4018) 


Epoch 4 - avg_train_loss: 0.4265  avg_val_loss: 0.5740  time: 36s
INFO:__main__:Epoch 4 - avg_train_loss: 0.4265  avg_val_loss: 0.5740  time: 36s
Epoch 4 - Score: 0.7163
INFO:__main__:Epoch 4 - Score: 0.7163
Epoch 4 - Save Best Score: 0.7163 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7163 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8603(0.5740) 
f1 score : 0.4565217391304348
recall score : 0.4144736842105263
precision score : 0.5080645161290323
thresh : 0.66


Score: 0.6982
INFO:__main__:Score: 0.6982
ACC BEST Score: 0.7163
INFO:__main__:ACC BEST Score: 0.7163
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.4565217391304348
recall score : 0.4144736842105263
precision score : 0.5080645161290323
thresh : 0.66


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Reinitializing Last 1 Layers.
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 45s) Loss: 0.6943(0.6943) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 11s (remain 0m 21s) Loss: 0.7515(0.6703) Grad: 7.2912  LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.4683(0.6444) Grad: 2.5545  LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.6484(0.6308) Grad: 6.8636  LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4257(0.4257) 


Epoch 1 - avg_train_loss: 0.6308  avg_val_loss: 0.5999  time: 36s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6308  avg_val_loss: 0.5999  time: 36s
Epoch 1 - Score: 0.7042
INFO:__main__:Epoch 1 - Score: 0.7042
Epoch 1 - Save Best Score: 0.7042 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7042 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8932(0.5999) 
f1 score : 0.25742574257425743
recall score : 0.17105263157894737
precision score : 0.52
thresh : 0.6
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 47s) Loss: 0.6040(0.6040) Grad: nan  LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.4900(0.5754) Grad: 1.7241  LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.4648(0.5762) Grad: 6.6774  LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.5098(0.5694) Grad: 1.5183  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4345(0.4345) 


Epoch 2 - avg_train_loss: 0.5694  avg_val_loss: 0.5990  time: 36s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5694  avg_val_loss: 0.5990  time: 36s
Epoch 2 - Score: 0.7062
INFO:__main__:Epoch 2 - Score: 0.7062
Epoch 2 - Save Best Score: 0.7062 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7062 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8371(0.5990) 
f1 score : 0.46048109965635736
recall score : 0.4407894736842105
precision score : 0.48201438848920863
thresh : 0.66
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 34s) Loss: 0.5151(0.5151) Grad: nan  LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 11s (remain 0m 21s) Loss: 0.7441(0.5048) Grad: 3.5164  LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.3748(0.5087) Grad: 2.3018  LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.5220(0.5030) Grad: 5.1254  LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2674(0.2674) 


Epoch 3 - avg_train_loss: 0.5030  avg_val_loss: 0.5955  time: 36s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5030  avg_val_loss: 0.5955  time: 36s
Epoch 3 - Score: 0.7123
INFO:__main__:Epoch 3 - Score: 0.7123
Epoch 3 - Save Best Score: 0.7123 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7123 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.2546(0.5955) 
f1 score : 0.34234234234234234
recall score : 0.25
precision score : 0.5428571428571428
thresh : 0.52
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 39s) Loss: 0.4116(0.4116) Grad: nan  LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.5347(0.3978) Grad: 9.1600  LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.6260(0.3813) Grad: 18.1236  LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.3411(0.3762) Grad: 4.3954  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3197(0.3197) 


Epoch 4 - avg_train_loss: 0.3762  avg_val_loss: 0.6657  time: 36s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3762  avg_val_loss: 0.6657  time: 36s
Epoch 4 - Score: 0.7163
INFO:__main__:Epoch 4 - Score: 0.7163
Epoch 4 - Save Best Score: 0.7163 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7163 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.2575(0.6657) 
f1 score : 0.4250871080139373
recall score : 0.40131578947368424
precision score : 0.45185185185185184
thresh : 0.74


Score: 0.6680
INFO:__main__:Score: 0.6680
ACC BEST Score: 0.7163
INFO:__main__:ACC BEST Score: 0.7163
Score: 0.6954
INFO:__main__:Score: 0.6954
ACC BEST Score: 0.7095
INFO:__main__:ACC BEST Score: 0.7095


f1 score : 0.4250871080139373
recall score : 0.40131578947368424
precision score : 0.45185185185185184
thresh : 0.74
f1 score : 0.38539553752535494
recall score : 0.3118844386080105
precision score : 0.5042462845010616
thresh : 0.68


In [None]:
from google.colab import runtime
runtime.unassign()