In [1]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers
!pip install datasets
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!nvidia-smi

Sat Apr 29 11:59:11 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   30C    P0    44W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:

import os
import gc
import math
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import re
import html

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam, SGD, AdamW, RAdam, Optimizer
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,GroupKFold
from sklearn.metrics import log_loss,f1_score, recall_score, accuracy_score, precision_score

from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
DIR = "/content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測"
INPUT_DIR = os.path.join(DIR,"input")
OUTPUT_DIR = os.path.join(DIR,"output")
CUSTOM_MODEL_DIR = os.path.join(OUTPUT_DIR,'clrp_deberta_v3_base')
OUTPUT_EXP_DIR = DIR + '/output/EXP030/'
if not os.path.exists(OUTPUT_EXP_DIR):
    os.makedirs(OUTPUT_EXP_DIR)

In [6]:

# ====================================================
# CFG
# ====================================================
class CFG:
    debug=False
    apex=True
    print_freq=100
    num_workers=4
    model_name="microsoft/deberta-v3-base"
    # model='microsoft/deberta-base'
    # model='roberta-base'
    # model='roberta-large'
    # model='roberta-large-mnli'
    # model='xlnet-large-cased'
    # model='albert-xxlarge-v2'
    # model="microsoft/deberta-large"
    # model="microsoft/deberta-v3-large"
    # model='microsoft/deberta-v2-xlarge'
    # model='funnel-transformer/large'
    # model='funnel-transformer/medium'
    # model='albert-base-v2'
    # model='albert-large-v2'
    # model='google/electra-large-discriminator'
    # model='google/electra-base-discriminator'
    # model="facebook/bart-large-mnli"
    # model="facebook/bart-large"
    # model="facebook/bart-base"
    model = CUSTOM_MODEL_DIR
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=4
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=16
    fc_dropout=0.2
    target_size=1
    max_len=256
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    seed=42
    n_fold=10
    trn_fold=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    train=True
    nth_awp_start_epoch=1
    gradient_checkpointing = False
    freezing = False
    num_reinit_layers = 1
    is_reinit_layer = False
    use_prior_wd = True

if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0, 1]

In [7]:
def get_score(labels, outputs):
    thresh = 0.5
    y_pred = outputs
    y_true = labels
    f_score = f1_score(y_true, (y_pred>thresh).astype(int))
    r_score = recall_score(y_true, (y_pred>thresh).astype(int))
    p_score = precision_score(y_true, (y_pred>thresh).astype(int))
    print(f"f1 score : {f_score}")
    print(f"recall score : {r_score}")
    print(f"precision score : {p_score}")
    return accuracy_score(y_true, (y_pred>thresh).astype(int))

def get_acc_score(labels, outputs):
    y_pred = outputs
    y_true = labels
    best_score = 0
    best_thresh = 0.5
    for thresh in np.arange(0.1, 0.80, 0.01):
        thresh = np.round(thresh, 2)
        score = accuracy_score(y_true, (y_pred>thresh).astype(int))
        #print("Accuracy score at threshold {0} is {1}".format(thresh, score))
        if score > best_score:
          best_score = score
          best_thresh = thresh
    print(f"thresh : {best_thresh}")
    return accuracy_score(y_true, (y_pred>best_thresh).astype(int))


def get_logger(filename=OUTPUT_EXP_DIR+'train'):
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=CFG.seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=CFG.seed)

In [8]:
def freeze(module):
    """
    Freezes module's parameters.
    """
    
    for parameter in module.parameters():
        parameter.requires_grad = False
        
def get_freezed_parameters(module):
    """
    Returns names of freezed parameters of the given module.
    """
    
    freezed_parameters = []
    for name, parameter in module.named_parameters():
        if not parameter.requires_grad:
            freezed_parameters.append(name)
            
    return freezed_parameters

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """
    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930
    """
    
    embedding_types = ("word", "position", "token_type")
    for embedding_type in embedding_types:
        attr_name = f"{embedding_type}_embeddings"
        
        if hasattr(embeddings_path, attr_name): 
            bnb.optim.GlobalOptimManager.get_instance().register_module_override(
                getattr(embeddings_path, attr_name), 'weight', {'optim_bits': optim_bits}
            )

In [9]:
import pandas as pd
import numpy as np

train = pd.read_csv(os.path.join(INPUT_DIR,"train_data.csv"))
test = pd.read_csv(os.path.join(INPUT_DIR,"test_data.csv"))
sample_sub = pd.read_csv(os.path.join(INPUT_DIR,"submission.csv"))

print(train.shape)
display(train.head(3))

print(test.shape)
display(test.head(3))

print(sample_sub.shape)
display(sample_sub.head(3))

(4974, 6)


Unnamed: 0,id,title,year,abstract,keywords,y
0,1,Hierarchical Adversarially Learned Inference,2018,We propose a novel hierarchical generative mod...,"generative, hierarchical, unsupervised, semisu...",0
1,2,Learning to Compute Word Embeddings On the Fly,2018,Words in natural language follow a Zipfian dis...,"NLU, word embeddings, representation learning",0
2,3,Graph2Seq: Scalable Learning Dynamics for Graphs,2018,Neural networks are increasingly used as a gen...,,0


(6393, 5)


Unnamed: 0,id,title,year,abstract,keywords
0,1,StyleAlign: Analysis and Applications of Align...,2022,"In this paper, we perform an in-depth study of...","StyleGAN, transfer learning, fine tuning, mode..."
1,2,Embedding a random graph via GNN: mean-field i...,2021,We develop a theory for embedding a random gra...,"Graph neural network, graph embedding, multi-r..."
2,3,BBRefinement: an universal scheme to improve p...,2021,We present a conceptually simple yet powerful ...,"object detection, deep neural networks, refine..."


(6393, 2)


Unnamed: 0,id,y
0,1,0
1,2,0
2,3,0


In [10]:
train["texts"] = train["title"] + "[SEP]" + train["abstract"] 

In [11]:
skf = StratifiedKFold(n_splits=CFG.n_fold,shuffle=True,random_state=CFG.seed)
for fold, ( _, val_) in enumerate(skf.split(train, train.y)):
    train.loc[val_ , "kfold"] = int(fold)
    
train["kfold"] = train["kfold"].astype(int)

if CFG.debug:
    display(train.groupby('kfold').size())
    train = train.sample(n=500, random_state=0).reset_index(drop=True)
    display(train.groupby('kfold').size())

In [12]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_EXP_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

In [13]:
# ====================================================
# Define max_len
# ====================================================
lengths = []
tk0 = tqdm(train['texts'].fillna("").values, total=len(train))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
    lengths.append(length)
CFG.max_len = max(lengths) + 3 # cls + sep + sep
LOGGER.info(f"max_len: {CFG.max_len}")

100%|██████████| 4974/4974 [00:03<00:00, 1304.29it/s]
max_len: 522
INFO:__main__:max_len: 522


In [14]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer(text,
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           padding="max_length",
                           return_offsets_mapping=False,
                           truncation=True)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.inputs = df['texts'].values
        self.labels = df['y'].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.inputs[item])
        label = torch.tensor(self.labels[item], dtype=torch.half)
        return inputs, label

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

class ValidDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.inputs = df['texts'].values
        self.labels = df['y'].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.inputs[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

#collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [15]:
def reinit_layers(model):

    #for layer in model.model.encoder.layer[-CFG.num_reinit_layers:]:
    for layer in model.encoder.layer[-CFG.num_reinit_layers:]:    #Custome model内(backbone)

            for module in layer.modules():

                if isinstance(module,nn.Linear):
                    module.weight.data.normal_(mean=0.0,std=model.config.initializer_range)
                    if module.bias is not None:
                            module.bias.data.zero_()
                elif isinstance(module, nn.Embedding):
                        module.weight.data.normal_(mean=0.0, std=model.config.initializer_range)
                        if module.padding_idx is not None:
                            module.weight.data[module.padding_idx].zero_()
                elif isinstance(module, nn.LayerNorm):
                        module.bias.data.zero_()
                        module.weight.data.fill_(1.0)
                        
    return model

In [16]:
class PriorWD(Optimizer):
    def __init__(self, optim, use_prior_wd=False, exclude_last_group=True):
        super(PriorWD, self).__init__(optim.param_groups, optim.defaults)
        self.param_groups = optim.param_groups
        self.optim = optim
        self.use_prior_wd = use_prior_wd
        self.exclude_last_group = exclude_last_group
        self.weight_decay_by_group = []
        for i, group in enumerate(self.param_groups):
            self.weight_decay_by_group.append(group["weight_decay"])
            group["weight_decay"] = 0

        self.prior_params = {}
        for i, group in enumerate(self.param_groups):
            for p in group["params"]:
                self.prior_params[id(p)] = p.detach().clone()

    def step(self, closure=None):
        if self.use_prior_wd:
            for i, group in enumerate(self.param_groups):
                for p in group["params"]:
                    if self.exclude_last_group and i == len(self.param_groups):
                        p.data.add_(-group["lr"] * self.weight_decay_by_group[i], p.data)
                    else:
                        p.data.add_(
                            -group["lr"] * self.weight_decay_by_group[i], p.data - self.prior_params[id(p)],
                        )
        loss = self.optim.step(closure)

        return loss

    def compute_distance_to_prior(self, param):
        assert id(param) in self.prior_params, "parameter not in PriorWD optimizer"
        return (param.data - self.prior_params[id(param)]).pow(2).sum().sqrt()

In [17]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

class MaxPooling(nn.Module):
    def __init__(self):
        super(MaxPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        embeddings = last_hidden_state.clone()
        embeddings[input_mask_expanded == 0] = -1e4
        max_embeddings, _ = torch.max(embeddings, dim=1)
        return max_embeddings
    

class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        if CFG.is_reinit_layer:
            self.model = reinit_layers(self.model)
            print(f'Reinitializing Last {CFG.num_reinit_layers} Layers.')
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # Freezing
        if cfg.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            cfg.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())

        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)
        self._init_weights(self.fc)
        self.layer_norm1 = nn.LayerNorm(self.config.hidden_size)
        self._init_weights(self.fc)
        self.sig = nn.Sigmoid()
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        feature = self.layer_norm1(feature)
        output = self.fc(feature)
        #output = self.sig(output)
        return output

In [18]:

# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
        #print(y_preds.sigmoid().squeeze().view(1, -1))
        loss = criterion(y_preds.sigmoid().squeeze(), labels.squeeze())
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))

    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
        loss = criterion(y_preds.sigmoid().squeeze(), labels.squeeze())
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    predictions = np.concatenate(predictions)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

In [19]:

# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['kfold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['kfold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds['y'].values
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = ValidDataset(CFG, valid_folds)


    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size*2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_EXP_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr=5e-6, decoder_lr=1e-4, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", 
                    "LayerNorm.weight"]
        group1=['layer.0.','layer.1.','layer.2.','layer.3.']
        group2=['layer.4.','layer.5.','layer.6.','layer.7.']    
        group3=['layer.8.','layer.9.','layer.10.','layer.11.']
        group_all=['layer.0.','layer.1.','layer.2.','layer.3.','layer.4.','layer.5.','layer.6.','layer.7.','layer.8.','layer.9.','layer.10.','layer.11.']
        optimizer_parameters = [
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],'weight_decay': weight_decay},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],'weight_decay': weight_decay, 'lr': encoder_lr/2.6},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],'weight_decay': weight_decay, 'lr': encoder_lr},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],'weight_decay': weight_decay, 'lr': encoder_lr*2.6},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],'weight_decay': 0.0},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],'weight_decay': 0.0, 'lr': encoder_lr/2.6},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],'weight_decay': 0.0, 'lr': encoder_lr},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],'weight_decay': 0.0, 'lr': encoder_lr*2.6},
        {'params': [p for n, p in model.named_parameters() if "model" not in n], 'lr':decoder_lr, "momentum" : 0.99},
    ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    optimizer = PriorWD(optimizer, use_prior_wd=CFG.use_prior_wd)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.BCELoss()
    
    best_score = -1.

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        
        # scoring
        score_05 = get_score(valid_labels, predictions)
        score = get_acc_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')

        
        if best_score < score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_EXP_DIR+f"{CFG.model_name.replace('/', '-')}_fold{fold}_best.pth")

    predictions = torch.load(OUTPUT_EXP_DIR+f"{CFG.model_name.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds['pred'] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [20]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        labels = oof_df['y'].values
        preds = oof_df['pred'].values
        score = get_score(labels, preds)
        acc_score = get_acc_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}')
        LOGGER.info(f'ACC BEST Score: {acc_score:<.4f}')
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
            #break
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_EXP_DIR+'oof_df.pkl')

DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "torch_dtype": "f

Epoch: [1][0/279] Elapsed 0m 4s (remain 21m 26s) Loss: 0.4993(0.4993) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 16s (remain 0m 28s) Loss: 0.6030(0.6477) Grad: 0.9954  LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 28s (remain 0m 10s) Loss: 0.8062(0.6289) Grad: 6.3689  LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 37s (remain 0m 0s) Loss: 0.6021(0.6217) Grad: 1.6951  LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4838(0.4838) 


Epoch 1 - avg_train_loss: 0.6217  avg_val_loss: 0.6111  time: 41s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6217  avg_val_loss: 0.6111  time: 41s
Epoch 1 - Score: 0.7209
INFO:__main__:Epoch 1 - Score: 0.7209
Epoch 1 - Save Best Score: 0.7209 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7209 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8946(0.6111) 
f1 score : 0.025974025974025976
recall score : 0.013157894736842105
precision score : 1.0
thresh : 0.44
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 32s) Loss: 0.5483(0.5483) Grad: nan  LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.5547(0.5823) Grad: 2.7585  LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.4192(0.5852) Grad: 7.5006  LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.6494(0.5786) Grad: 2.8066  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.3085(0.3085) 


Epoch 2 - avg_train_loss: 0.5786  avg_val_loss: 0.5596  time: 36s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5786  avg_val_loss: 0.5596  time: 36s
Epoch 2 - Score: 0.7189
INFO:__main__:Epoch 2 - Score: 0.7189


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.0159(0.5596) 
f1 score : 0.1942857142857143
recall score : 0.1118421052631579
precision score : 0.7391304347826086
thresh : 0.51
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 25s) Loss: 0.4199(0.4199) Grad: nan  LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.5391(0.5283) Grad: 2.4914  LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.5586(0.5201) Grad: 2.0430  LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.5898(0.5141) Grad: 5.9118  LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2717(0.2717) 


Epoch 3 - avg_train_loss: 0.5141  avg_val_loss: 0.5853  time: 36s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5141  avg_val_loss: 0.5853  time: 36s
Epoch 3 - Score: 0.7209
INFO:__main__:Epoch 3 - Score: 0.7209


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.2453(0.5853) 
f1 score : 0.29107981220657275
recall score : 0.20394736842105263
precision score : 0.5081967213114754
thresh : 0.64
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 32s) Loss: 0.3120(0.3120) Grad: nan  LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.4065(0.4307) Grad: 3.1774  LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.3904(0.4160) Grad: 4.0457  LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 33s (remain 0m 0s) Loss: 0.3511(0.4119) Grad: 7.7270  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3708(0.3708) 


Epoch 4 - avg_train_loss: 0.4119  avg_val_loss: 0.6167  time: 37s
INFO:__main__:Epoch 4 - avg_train_loss: 0.4119  avg_val_loss: 0.6167  time: 37s
Epoch 4 - Score: 0.7169
INFO:__main__:Epoch 4 - Score: 0.7169


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.0310(0.6167) 
f1 score : 0.40701754385964906
recall score : 0.3815789473684211
precision score : 0.43609022556390975
thresh : 0.79


Score: 0.6988
INFO:__main__:Score: 0.6988
ACC BEST Score: 0.7209
INFO:__main__:ACC BEST Score: 0.7209
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.025974025974025976
recall score : 0.013157894736842105
precision score : 1.0
thresh : 0.44


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 44s) Loss: 0.6562(0.6562) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.4670(0.6222) Grad: 5.3996  LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.7686(0.6190) Grad: 2.7283  LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 33s (remain 0m 0s) Loss: 0.7822(0.6140) Grad: 6.0674  LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4522(0.4522) 


Epoch 1 - avg_train_loss: 0.6140  avg_val_loss: 0.5950  time: 37s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6140  avg_val_loss: 0.5950  time: 37s
Epoch 1 - Score: 0.7129
INFO:__main__:Epoch 1 - Score: 0.7129
Epoch 1 - Save Best Score: 0.7129 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7129 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.9270(0.5950) 
f1 score : 0.1325301204819277
recall score : 0.0718954248366013
precision score : 0.8461538461538461
thresh : 0.49
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 44s) Loss: 0.4856(0.4856) Grad: nan  LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.4480(0.5790) Grad: 1.5786  LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.5557(0.5743) Grad: 4.8513  LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 33s (remain 0m 0s) Loss: 0.5801(0.5719) Grad: 1.0457  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4472(0.4472) 


Epoch 2 - avg_train_loss: 0.5719  avg_val_loss: 0.5727  time: 37s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5719  avg_val_loss: 0.5727  time: 37s
Epoch 2 - Score: 0.7289
INFO:__main__:Epoch 2 - Score: 0.7289
Epoch 2 - Save Best Score: 0.7289 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7289 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8880(0.5727) 
f1 score : 0.46875
recall score : 0.39215686274509803
precision score : 0.5825242718446602
thresh : 0.51
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 41s) Loss: 0.4709(0.4709) Grad: nan  LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.5146(0.5272) Grad: 2.4085  LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 24s (remain 0m 9s) Loss: 0.6255(0.5116) Grad: 5.8085  LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 33s (remain 0m 0s) Loss: 0.3755(0.4989) Grad: 2.0633  LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3001(0.3001) 


Epoch 3 - avg_train_loss: 0.4989  avg_val_loss: 0.5752  time: 37s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4989  avg_val_loss: 0.5752  time: 37s
Epoch 3 - Score: 0.7309
INFO:__main__:Epoch 3 - Score: 0.7309
Epoch 3 - Save Best Score: 0.7309 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7309 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.4070(0.5752) 
f1 score : 0.39999999999999997
recall score : 0.3006535947712418
precision score : 0.5974025974025974
thresh : 0.51
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 36s) Loss: 0.3096(0.3096) Grad: nan  LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.3093(0.3726) Grad: 10.2851  LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.3579(0.3579) Grad: 7.3387  LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.2092(0.3569) Grad: 3.9300  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4668(0.4668) 


Epoch 4 - avg_train_loss: 0.3569  avg_val_loss: 0.6112  time: 37s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3569  avg_val_loss: 0.6112  time: 37s
Epoch 4 - Score: 0.7309
INFO:__main__:Epoch 4 - Score: 0.7309


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.2743(0.6112) 
f1 score : 0.4929577464788733
recall score : 0.45751633986928103
precision score : 0.5343511450381679
thresh : 0.73


Score: 0.7229
INFO:__main__:Score: 0.7229
ACC BEST Score: 0.7309
INFO:__main__:ACC BEST Score: 0.7309
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.39999999999999997
recall score : 0.3006535947712418
precision score : 0.5974025974025974
thresh : 0.51


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 48s) Loss: 0.8853(0.8853) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.5737(0.6381) Grad: 1.9434  LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 24s (remain 0m 9s) Loss: 0.6294(0.6294) Grad: 2.2742  LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 33s (remain 0m 0s) Loss: 0.5547(0.6228) Grad: 3.2348  LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4624(0.4624) 


Epoch 1 - avg_train_loss: 0.6228  avg_val_loss: 0.6033  time: 37s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6228  avg_val_loss: 0.6033  time: 37s
Epoch 1 - Score: 0.7108
INFO:__main__:Epoch 1 - Score: 0.7108
Epoch 1 - Save Best Score: 0.7108 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7108 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8609(0.6033) 
f1 score : 0.19565217391304346
recall score : 0.11764705882352941
precision score : 0.5806451612903226
thresh : 0.75
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 40s) Loss: 0.6079(0.6079) Grad: nan  LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.3967(0.5951) Grad: 1.0009  LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.5845(0.5817) Grad: 1.8890  LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 33s (remain 0m 0s) Loss: 0.6445(0.5833) Grad: 0.7899  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3800(0.3800) 


Epoch 2 - avg_train_loss: 0.5833  avg_val_loss: 0.5714  time: 37s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5833  avg_val_loss: 0.5714  time: 37s
Epoch 2 - Score: 0.7189
INFO:__main__:Epoch 2 - Score: 0.7189
Epoch 2 - Save Best Score: 0.7189 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7189 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.9943(0.5714) 
f1 score : 0.1411764705882353
recall score : 0.0784313725490196
precision score : 0.7058823529411765
thresh : 0.46
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 40s) Loss: 0.4639(0.4639) Grad: nan  LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.5127(0.5467) Grad: 1.5490  LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 24s (remain 0m 9s) Loss: 0.5552(0.5321) Grad: 1.7997  LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 33s (remain 0m 0s) Loss: 0.5742(0.5208) Grad: 1.7538  LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5672(0.5672) 


Epoch 3 - avg_train_loss: 0.5208  avg_val_loss: 0.6106  time: 37s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5208  avg_val_loss: 0.6106  time: 37s
Epoch 3 - Score: 0.7129
INFO:__main__:Epoch 3 - Score: 0.7129


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7807(0.6106) 
f1 score : 0.5217391304347826
recall score : 0.6274509803921569
precision score : 0.44651162790697674
thresh : 0.66
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 36s) Loss: 0.6323(0.6323) Grad: nan  LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.7363(0.4371) Grad: 6.5263  LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.3708(0.4157) Grad: 3.6676  LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 33s (remain 0m 0s) Loss: 0.4629(0.4143) Grad: 2.9249  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3704(0.3704) 


Epoch 4 - avg_train_loss: 0.4143  avg_val_loss: 0.6091  time: 37s
INFO:__main__:Epoch 4 - avg_train_loss: 0.4143  avg_val_loss: 0.6091  time: 37s
Epoch 4 - Score: 0.7269
INFO:__main__:Epoch 4 - Score: 0.7269
Epoch 4 - Save Best Score: 0.7269 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7269 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.3545(0.6091) 
f1 score : 0.4822695035460993
recall score : 0.4444444444444444
precision score : 0.5271317829457365
thresh : 0.68


Score: 0.7068
INFO:__main__:Score: 0.7068
ACC BEST Score: 0.7269
INFO:__main__:ACC BEST Score: 0.7269
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.4822695035460993
recall score : 0.4444444444444444
precision score : 0.5271317829457365
thresh : 0.68


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 44s) Loss: 0.5845(0.5845) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 12s (remain 0m 22s) Loss: 0.5244(0.6353) Grad: 4.3152  LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 24s (remain 0m 9s) Loss: 0.6782(0.6181) Grad: 2.7812  LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 33s (remain 0m 0s) Loss: 0.5029(0.6212) Grad: 2.4356  LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3447(0.3447) 


Epoch 1 - avg_train_loss: 0.6212  avg_val_loss: 0.6032  time: 37s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6212  avg_val_loss: 0.6032  time: 37s
Epoch 1 - Score: 0.7028
INFO:__main__:Epoch 1 - Score: 0.7028
Epoch 1 - Save Best Score: 0.7028 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7028 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1520(0.6032) 
f1 score : 0.0625
recall score : 0.032679738562091505
precision score : 0.7142857142857143
thresh : 0.4
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 34s) Loss: 0.5986(0.5986) Grad: nan  LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 11s (remain 0m 21s) Loss: 0.8950(0.5971) Grad: 6.4464  LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 24s (remain 0m 9s) Loss: 0.2181(0.5826) Grad: 5.7323  LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 33s (remain 0m 0s) Loss: 0.4888(0.5815) Grad: 5.9732  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4451(0.4451) 


Epoch 2 - avg_train_loss: 0.5815  avg_val_loss: 0.5970  time: 37s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5815  avg_val_loss: 0.5970  time: 37s
Epoch 2 - Score: 0.7048
INFO:__main__:Epoch 2 - Score: 0.7048
Epoch 2 - Save Best Score: 0.7048 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7048 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8517(0.5970) 
f1 score : 0.23711340206185566
recall score : 0.1503267973856209
precision score : 0.5609756097560976
thresh : 0.78
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 47s) Loss: 0.6338(0.6338) Grad: nan  LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.8462(0.5408) Grad: 9.3101  LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 24s (remain 0m 9s) Loss: 0.5352(0.5293) Grad: 4.8988  LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 33s (remain 0m 0s) Loss: 0.4473(0.5235) Grad: 2.8936  LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2670(0.2670) 


Epoch 3 - avg_train_loss: 0.5235  avg_val_loss: 0.5792  time: 37s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5235  avg_val_loss: 0.5792  time: 37s
Epoch 3 - Score: 0.7169
INFO:__main__:Epoch 3 - Score: 0.7169
Epoch 3 - Save Best Score: 0.7169 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7169 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1349(0.5792) 
f1 score : 0.3486238532110092
recall score : 0.24836601307189543
precision score : 0.5846153846153846
thresh : 0.43
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 36s) Loss: 0.4507(0.4507) Grad: nan  LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.4282(0.4271) Grad: 2.7807  LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.3022(0.4260) Grad: 3.5764  LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 33s (remain 0m 0s) Loss: 0.4512(0.4271) Grad: 2.5676  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2750(0.2750) 


Epoch 4 - avg_train_loss: 0.4271  avg_val_loss: 0.6186  time: 37s
INFO:__main__:Epoch 4 - avg_train_loss: 0.4271  avg_val_loss: 0.6186  time: 37s
Epoch 4 - Score: 0.7189
INFO:__main__:Epoch 4 - Score: 0.7189
Epoch 4 - Save Best Score: 0.7189 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7189 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1624(0.6186) 
f1 score : 0.4592592592592593
recall score : 0.40522875816993464
precision score : 0.5299145299145299
thresh : 0.56


Score: 0.7068
INFO:__main__:Score: 0.7068
ACC BEST Score: 0.7189
INFO:__main__:ACC BEST Score: 0.7189
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.4592592592592593
recall score : 0.40522875816993464
precision score : 0.5299145299145299
thresh : 0.56


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 52s) Loss: 0.7158(0.7158) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.6230(0.6447) Grad: 0.8736  LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 24s (remain 0m 9s) Loss: 0.6650(0.6317) Grad: 1.4249  LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 33s (remain 0m 0s) Loss: 0.6060(0.6240) Grad: 0.6869  LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3520(0.3520) 


Epoch 1 - avg_train_loss: 0.6240  avg_val_loss: 0.5944  time: 37s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6240  avg_val_loss: 0.5944  time: 37s
Epoch 1 - Score: 0.7082
INFO:__main__:Epoch 1 - Score: 0.7082
Epoch 1 - Save Best Score: 0.7082 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7082 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1054(0.5944) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
thresh : 0.37
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 37s) Loss: 0.6768(0.6768) Grad: nan  LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.6514(0.5814) Grad: 5.5518  LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 24s (remain 0m 9s) Loss: 0.3625(0.5787) Grad: 1.0378  LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.5820(0.5779) Grad: 0.9497  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2590(0.2590) 


Epoch 2 - avg_train_loss: 0.5779  avg_val_loss: 0.5662  time: 37s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5779  avg_val_loss: 0.5662  time: 37s
Epoch 2 - Score: 0.7243
INFO:__main__:Epoch 2 - Score: 0.7243
Epoch 2 - Save Best Score: 0.7243 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7243 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1870(0.5662) 
f1 score : 0.16470588235294117
recall score : 0.09210526315789473
precision score : 0.7777777777777778
thresh : 0.43
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 36s) Loss: 0.4502(0.4502) Grad: nan  LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.5229(0.5446) Grad: 3.1656  LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 24s (remain 0m 9s) Loss: 0.5054(0.5246) Grad: 2.6629  LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 33s (remain 0m 0s) Loss: 0.5044(0.5178) Grad: 9.1739  LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3572(0.3572) 


Epoch 3 - avg_train_loss: 0.5178  avg_val_loss: 0.5898  time: 37s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5178  avg_val_loss: 0.5898  time: 37s
Epoch 3 - Score: 0.7223
INFO:__main__:Epoch 3 - Score: 0.7223


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.9003(0.5898) 
f1 score : 0.43636363636363634
recall score : 0.39473684210526316
precision score : 0.4878048780487805
thresh : 0.75
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 35s) Loss: 0.3960(0.3960) Grad: nan  LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.2244(0.3939) Grad: 3.0177  LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.4092(0.3922) Grad: 5.6332  LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.4534(0.3960) Grad: 4.2305  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3601(0.3601) 


Epoch 4 - avg_train_loss: 0.3960  avg_val_loss: 0.6740  time: 37s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3960  avg_val_loss: 0.6740  time: 37s
Epoch 4 - Score: 0.7183
INFO:__main__:Epoch 4 - Score: 0.7183


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1138(0.6740) 
f1 score : 0.4202898550724638
recall score : 0.3815789473684211
precision score : 0.46774193548387094
thresh : 0.72


Score: 0.7143
INFO:__main__:Score: 0.7143
ACC BEST Score: 0.7243
INFO:__main__:ACC BEST Score: 0.7243
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.16470588235294117
recall score : 0.09210526315789473
precision score : 0.7777777777777778
thresh : 0.43


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 48s) Loss: 0.6592(0.6592) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.5483(0.6498) Grad: 7.5322  LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.6812(0.6368) Grad: 1.9116  LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 33s (remain 0m 0s) Loss: 0.5732(0.6302) Grad: 2.0282  LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4206(0.4206) 


Epoch 1 - avg_train_loss: 0.6302  avg_val_loss: 0.6116  time: 37s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6302  avg_val_loss: 0.6116  time: 37s
Epoch 1 - Score: 0.6962
INFO:__main__:Epoch 1 - Score: 0.6962
Epoch 1 - Save Best Score: 0.6962 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6962 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.0807(0.6116) 
f1 score : 0.07317073170731707
recall score : 0.039473684210526314
precision score : 0.5
thresh : 0.54
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 36s) Loss: 0.6079(0.6079) Grad: nan  LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.7983(0.5967) Grad: 7.2391  LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 24s (remain 0m 9s) Loss: 0.4939(0.5888) Grad: 4.1497  LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 33s (remain 0m 0s) Loss: 0.5840(0.5862) Grad: 1.3974  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3892(0.3892) 


Epoch 2 - avg_train_loss: 0.5862  avg_val_loss: 0.5966  time: 37s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5862  avg_val_loss: 0.5966  time: 37s
Epoch 2 - Score: 0.6982
INFO:__main__:Epoch 2 - Score: 0.6982
Epoch 2 - Save Best Score: 0.6982 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.6982 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.0973(0.5966) 
f1 score : 0.07361963190184048
recall score : 0.039473684210526314
precision score : 0.5454545454545454
thresh : 0.47
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 41s) Loss: 0.4365(0.4365) Grad: nan  LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.5186(0.5417) Grad: 1.3569  LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 24s (remain 0m 9s) Loss: 0.4902(0.5296) Grad: 5.9482  LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 33s (remain 0m 0s) Loss: 0.5547(0.5227) Grad: 1.7559  LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2689(0.2689) 


Epoch 3 - avg_train_loss: 0.5227  avg_val_loss: 0.6279  time: 37s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5227  avg_val_loss: 0.6279  time: 37s
Epoch 3 - Score: 0.7062
INFO:__main__:Epoch 3 - Score: 0.7062
Epoch 3 - Save Best Score: 0.7062 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7062 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.4868(0.6279) 
f1 score : 0.09756097560975609
recall score : 0.05263157894736842
precision score : 0.6666666666666666
thresh : 0.45
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 41s) Loss: 0.3867(0.3867) Grad: nan  LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.4841(0.4471) Grad: 3.1890  LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 24s (remain 0m 9s) Loss: 0.3848(0.4366) Grad: 3.4802  LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 33s (remain 0m 0s) Loss: 0.3940(0.4300) Grad: 4.4934  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3960(0.3960) 


Epoch 4 - avg_train_loss: 0.4300  avg_val_loss: 0.6663  time: 37s
INFO:__main__:Epoch 4 - avg_train_loss: 0.4300  avg_val_loss: 0.6663  time: 37s
Epoch 4 - Score: 0.7062
INFO:__main__:Epoch 4 - Score: 0.7062


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.2779(0.6663) 
f1 score : 0.3412698412698413
recall score : 0.28289473684210525
precision score : 0.43
thresh : 0.65


Score: 0.7022
INFO:__main__:Score: 0.7022
ACC BEST Score: 0.7062
INFO:__main__:ACC BEST Score: 0.7062
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.09756097560975609
recall score : 0.05263157894736842
precision score : 0.6666666666666666
thresh : 0.45


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 47s) Loss: 0.6216(0.6216) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.5684(0.6234) Grad: 3.1559  LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 24s (remain 0m 9s) Loss: 0.8311(0.6281) Grad: 7.3370  LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 33s (remain 0m 0s) Loss: 0.4189(0.6176) Grad: 4.0185  LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3077(0.3077) 


Epoch 1 - avg_train_loss: 0.6176  avg_val_loss: 0.5960  time: 37s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6176  avg_val_loss: 0.5960  time: 37s
Epoch 1 - Score: 0.7103
INFO:__main__:Epoch 1 - Score: 0.7103
Epoch 1 - Save Best Score: 0.7103 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7103 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.2155(0.5960) 
f1 score : 0.08536585365853658
recall score : 0.046052631578947366
precision score : 0.5833333333333334
thresh : 0.32
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 39s) Loss: 0.5020(0.5020) Grad: nan  LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.5820(0.5779) Grad: 2.6550  LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 24s (remain 0m 9s) Loss: 0.5474(0.5808) Grad: 2.3310  LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 33s (remain 0m 0s) Loss: 0.5415(0.5748) Grad: 7.2130  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3991(0.3991) 


Epoch 2 - avg_train_loss: 0.5748  avg_val_loss: 0.5766  time: 37s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5748  avg_val_loss: 0.5766  time: 37s
Epoch 2 - Score: 0.7103
INFO:__main__:Epoch 2 - Score: 0.7103


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.9236(0.5766) 
f1 score : 0.17777777777777778
recall score : 0.10526315789473684
precision score : 0.5714285714285714
thresh : 0.48
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 33s) Loss: 0.5859(0.5859) Grad: nan  LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 11s (remain 0m 21s) Loss: 0.6919(0.5304) Grad: 7.6889  LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.5708(0.5218) Grad: 13.8899  LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.4280(0.5110) Grad: 1.9074  LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3738(0.3738) 


Epoch 3 - avg_train_loss: 0.5110  avg_val_loss: 0.5666  time: 36s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5110  avg_val_loss: 0.5666  time: 36s
Epoch 3 - Score: 0.7223
INFO:__main__:Epoch 3 - Score: 0.7223
Epoch 3 - Save Best Score: 0.7223 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7223 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.9925(0.5666) 
f1 score : 0.40800000000000003
recall score : 0.3355263157894737
precision score : 0.5204081632653061
thresh : 0.67
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 37s) Loss: 0.4009(0.4009) Grad: nan  LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.2881(0.3780) Grad: 7.3117  LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 24s (remain 0m 9s) Loss: 0.5083(0.3707) Grad: 5.2501  LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 33s (remain 0m 0s) Loss: 0.2520(0.3668) Grad: 5.1725  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3453(0.3453) 


Epoch 4 - avg_train_loss: 0.3668  avg_val_loss: 0.6244  time: 37s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3668  avg_val_loss: 0.6244  time: 37s
Epoch 4 - Score: 0.7123
INFO:__main__:Epoch 4 - Score: 0.7123


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1859(0.6244) 
f1 score : 0.46808510638297873
recall score : 0.4342105263157895
precision score : 0.5076923076923077
thresh : 0.68


Score: 0.7022
INFO:__main__:Score: 0.7022
ACC BEST Score: 0.7223
INFO:__main__:ACC BEST Score: 0.7223
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.40800000000000003
recall score : 0.3355263157894737
precision score : 0.5204081632653061
thresh : 0.67


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 41s) Loss: 0.6675(0.6675) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.7202(0.6281) Grad: 4.2368  LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.6226(0.6193) Grad: 2.1607  LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 33s (remain 0m 0s) Loss: 0.5879(0.6178) Grad: 3.8506  LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4929(0.4929) 


Epoch 1 - avg_train_loss: 0.6178  avg_val_loss: 0.6207  time: 37s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6178  avg_val_loss: 0.6207  time: 37s
Epoch 1 - Score: 0.7143
INFO:__main__:Epoch 1 - Score: 0.7143
Epoch 1 - Save Best Score: 0.7143 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7143 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8880(0.6207) 
f1 score : 0.14035087719298245
recall score : 0.07894736842105263
precision score : 0.631578947368421
thresh : 0.47
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 38s) Loss: 0.5762(0.5762) Grad: nan  LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.5698(0.5934) Grad: 3.5955  LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 24s (remain 0m 9s) Loss: 1.0684(0.5842) Grad: 9.3593  LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 33s (remain 0m 0s) Loss: 0.5098(0.5861) Grad: 7.9733  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4561(0.4561) 


Epoch 2 - avg_train_loss: 0.5861  avg_val_loss: 0.5929  time: 37s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5861  avg_val_loss: 0.5929  time: 37s
Epoch 2 - Score: 0.7243
INFO:__main__:Epoch 2 - Score: 0.7243
Epoch 2 - Save Best Score: 0.7243 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7243 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8380(0.5929) 
f1 score : 0.2751322751322752
recall score : 0.17105263157894737
precision score : 0.7027027027027027
thresh : 0.49
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 42s) Loss: 0.6772(0.6772) Grad: nan  LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.4702(0.5327) Grad: 4.8854  LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.4602(0.5187) Grad: 1.6916  LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 33s (remain 0m 0s) Loss: 0.4219(0.5167) Grad: 1.7687  LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2954(0.2954) 


Epoch 3 - avg_train_loss: 0.5167  avg_val_loss: 0.5785  time: 37s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5167  avg_val_loss: 0.5785  time: 37s
Epoch 3 - Score: 0.7304
INFO:__main__:Epoch 3 - Score: 0.7304
Epoch 3 - Save Best Score: 0.7304 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7304 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1535(0.5785) 
f1 score : 0.32692307692307687
recall score : 0.2236842105263158
precision score : 0.6071428571428571
thresh : 0.56
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 44s) Loss: 0.5156(0.5156) Grad: nan  LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 12s (remain 0m 22s) Loss: 0.2673(0.4371) Grad: 2.4977  LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 24s (remain 0m 9s) Loss: 0.2539(0.4085) Grad: 3.6586  LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 33s (remain 0m 0s) Loss: 0.2128(0.4019) Grad: 3.4807  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4096(0.4096) 


Epoch 4 - avg_train_loss: 0.4019  avg_val_loss: 0.6313  time: 37s
INFO:__main__:Epoch 4 - avg_train_loss: 0.4019  avg_val_loss: 0.6313  time: 37s
Epoch 4 - Score: 0.7123
INFO:__main__:Epoch 4 - Score: 0.7123


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.9656(0.6313) 
f1 score : 0.4404332129963899
recall score : 0.40131578947368424
precision score : 0.488
thresh : 0.61


Score: 0.7183
INFO:__main__:Score: 0.7183
ACC BEST Score: 0.7304
INFO:__main__:ACC BEST Score: 0.7304
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.32692307692307687
recall score : 0.2236842105263158
precision score : 0.6071428571428571
thresh : 0.56


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 48s) Loss: 0.7012(0.7012) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.7373(0.6354) Grad: 5.4722  LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 24s (remain 0m 9s) Loss: 0.7427(0.6276) Grad: 4.4021  LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 33s (remain 0m 0s) Loss: 0.5498(0.6198) Grad: 3.2287  LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4134(0.4134) 


Epoch 1 - avg_train_loss: 0.6198  avg_val_loss: 0.5995  time: 37s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6198  avg_val_loss: 0.5995  time: 37s
Epoch 1 - Score: 0.7082
INFO:__main__:Epoch 1 - Score: 0.7082
Epoch 1 - Save Best Score: 0.7082 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7082 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.9271(0.5995) 
f1 score : 0.07272727272727271
recall score : 0.039473684210526314
precision score : 0.46153846153846156
thresh : 0.37
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 39s) Loss: 0.4878(0.4878) Grad: nan  LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 12s (remain 0m 22s) Loss: 0.5923(0.5841) Grad: 0.8068  LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 24s (remain 0m 9s) Loss: 0.5977(0.5886) Grad: 1.9427  LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 33s (remain 0m 0s) Loss: 0.7632(0.5858) Grad: 5.2357  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3576(0.3576) 


Epoch 2 - avg_train_loss: 0.5858  avg_val_loss: 0.5654  time: 37s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5858  avg_val_loss: 0.5654  time: 37s
Epoch 2 - Score: 0.7103
INFO:__main__:Epoch 2 - Score: 0.7103
Epoch 2 - Save Best Score: 0.7103 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7103 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.9319(0.5654) 
f1 score : 0.08536585365853658
recall score : 0.046052631578947366
precision score : 0.5833333333333334
thresh : 0.39
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 42s) Loss: 0.6079(0.6079) Grad: nan  LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 12s (remain 0m 22s) Loss: 0.6763(0.5252) Grad: 4.8568  LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 24s (remain 0m 9s) Loss: 0.6836(0.5268) Grad: 5.7792  LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 33s (remain 0m 0s) Loss: 0.5029(0.5190) Grad: 5.4031  LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4449(0.4449) 


Epoch 3 - avg_train_loss: 0.5190  avg_val_loss: 0.5714  time: 37s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5190  avg_val_loss: 0.5714  time: 37s
Epoch 3 - Score: 0.7163
INFO:__main__:Epoch 3 - Score: 0.7163
Epoch 3 - Save Best Score: 0.7163 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7163 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7113(0.5714) 
f1 score : 0.46735395189003437
recall score : 0.4473684210526316
precision score : 0.4892086330935252
thresh : 0.58
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 39s) Loss: 0.5742(0.5742) Grad: nan  LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.7124(0.4269) Grad: 4.1960  LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 24s (remain 0m 9s) Loss: 0.5464(0.4222) Grad: 3.8627  LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 33s (remain 0m 0s) Loss: 0.5142(0.4139) Grad: 4.3165  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3924(0.3924) 


Epoch 4 - avg_train_loss: 0.4139  avg_val_loss: 0.5920  time: 37s
INFO:__main__:Epoch 4 - avg_train_loss: 0.4139  avg_val_loss: 0.5920  time: 37s
Epoch 4 - Score: 0.7143
INFO:__main__:Epoch 4 - Score: 0.7143


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8168(0.5920) 
f1 score : 0.4428044280442805
recall score : 0.39473684210526316
precision score : 0.5042016806722689
thresh : 0.6


Score: 0.6881
INFO:__main__:Score: 0.6881
ACC BEST Score: 0.7163
INFO:__main__:ACC BEST Score: 0.7163
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.46735395189003437
recall score : 0.4473684210526316
precision score : 0.4892086330935252
thresh : 0.58


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 45s) Loss: 0.7212(0.7212) Grad: nan  LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 11s (remain 0m 21s) Loss: 0.5088(0.6193) Grad: 2.7311  LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 24s (remain 0m 9s) Loss: 0.6660(0.6218) Grad: 2.0529  LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 33s (remain 0m 0s) Loss: 0.5840(0.6212) Grad: 0.7604  LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3183(0.3183) 


Epoch 1 - avg_train_loss: 0.6212  avg_val_loss: 0.5923  time: 37s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6212  avg_val_loss: 0.5923  time: 37s
Epoch 1 - Score: 0.7042
INFO:__main__:Epoch 1 - Score: 0.7042
Epoch 1 - Save Best Score: 0.7042 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7042 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1552(0.5923) 
f1 score : 0.09815950920245399
recall score : 0.05263157894736842
precision score : 0.7272727272727273
thresh : 0.43
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 42s) Loss: 0.5049(0.5049) Grad: nan  LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.6680(0.6029) Grad: 3.3290  LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 24s (remain 0m 9s) Loss: 0.5171(0.5974) Grad: 3.4072  LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 33s (remain 0m 0s) Loss: 0.5127(0.5875) Grad: 2.5610  LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4342(0.4342) 


Epoch 2 - avg_train_loss: 0.5875  avg_val_loss: 0.6007  time: 37s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5875  avg_val_loss: 0.6007  time: 37s
Epoch 2 - Score: 0.7103
INFO:__main__:Epoch 2 - Score: 0.7103
Epoch 2 - Save Best Score: 0.7103 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7103 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8244(0.6007) 
f1 score : 0.4242424242424242
recall score : 0.3684210526315789
precision score : 0.5
thresh : 0.55
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 38s) Loss: 0.4993(0.4993) Grad: nan  LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.5591(0.5435) Grad: 2.0360  LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 24s (remain 0m 9s) Loss: 0.4441(0.5384) Grad: 2.2597  LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 33s (remain 0m 0s) Loss: 0.4067(0.5334) Grad: 6.6451  LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2997(0.2997) 


Epoch 3 - avg_train_loss: 0.5334  avg_val_loss: 0.5782  time: 37s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5334  avg_val_loss: 0.5782  time: 37s
Epoch 3 - Score: 0.7163
INFO:__main__:Epoch 3 - Score: 0.7163
Epoch 3 - Save Best Score: 0.7163 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7163 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.0571(0.5782) 
f1 score : 0.3559322033898305
recall score : 0.27631578947368424
precision score : 0.5
thresh : 0.54
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 44s) Loss: 0.4050(0.4050) Grad: nan  LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.1837(0.4638) Grad: 1.9708  LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.7251(0.4546) Grad: 5.1335  LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 33s (remain 0m 0s) Loss: 0.4607(0.4482) Grad: 5.4269  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2697(0.2697) 


Epoch 4 - avg_train_loss: 0.4482  avg_val_loss: 0.6077  time: 37s
INFO:__main__:Epoch 4 - avg_train_loss: 0.4482  avg_val_loss: 0.6077  time: 37s
Epoch 4 - Score: 0.7203
INFO:__main__:Epoch 4 - Score: 0.7203
Epoch 4 - Save Best Score: 0.7203 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7203 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.2497(0.6077) 
f1 score : 0.38866396761133604
recall score : 0.3157894736842105
precision score : 0.5052631578947369
thresh : 0.6


Score: 0.6962
INFO:__main__:Score: 0.6962
ACC BEST Score: 0.7203
INFO:__main__:ACC BEST Score: 0.7203
Score: 0.7057
INFO:__main__:Score: 0.7057
ACC BEST Score: 0.7115
INFO:__main__:ACC BEST Score: 0.7115


f1 score : 0.38866396761133604
recall score : 0.3157894736842105
precision score : 0.5052631578947369
thresh : 0.6
f1 score : 0.353927625772286
recall score : 0.26329612606697306
precision score : 0.5397039030955586
thresh : 0.59


In [21]:
from google.colab import runtime
runtime.unassign()