In [None]:
# Mount Google Drive at /content/drive so later cells can read inputs and write
# outputs there. `google.colab` is only importable inside a Colab runtime.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Install the Hugging Face libraries and sentencepiece into the Colab runtime.
# NOTE(review): versions are not pinned; the recorded output below shows
# transformers 4.28.1 being resolved at the time of this run.
!pip install transformers
!pip install datasets
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m99.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m114.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1
Looking in indexes: https://pypi.org/simple, https:/

In [None]:
# Show which GPU (if any) is attached to this runtime.
!nvidia-smi

Sat Apr 29 22:02:04 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0    49W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:

import os
import gc
import math
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import re
import html

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam, SGD, AdamW, RAdam
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,GroupKFold
from sklearn.metrics import log_loss,f1_score, recall_score, accuracy_score, precision_score

from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Project locations on the mounted Google Drive.
DIR = "/content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測"
INPUT_DIR = os.path.join(DIR,"input")
OUTPUT_DIR = os.path.join(DIR,"output")
# Local checkpoint directory that CFG.model points at.
CUSTOM_MODEL_DIR = os.path.join(OUTPUT_DIR,'clrp_deberta_v3_base')
# Both experiment directories keep their trailing slash because later cells build
# file names by plain string concatenation (e.g. OUTPUT_EXP_DIR + 'train').
OUTPUT_MODEL_DIR = DIR + '/output/EXP035/'
# OUTPUT_MODEL_DIR already ends with '/', so do not add another one here.
OUTPUT_EXP_DIR = OUTPUT_MODEL_DIR + 'ABSTRACT/'
# exist_ok=True keeps the cell re-runnable without a separate existence check.
os.makedirs(OUTPUT_MODEL_DIR, exist_ok=True)
os.makedirs(OUTPUT_EXP_DIR, exist_ok=True)

In [None]:

# ====================================================
# CFG
# ====================================================
class CFG:
    """Experiment configuration, read as class attributes throughout the notebook."""

    # --- run control ---------------------------------------------------
    debug = False
    apex = True            # enables GradScaler / autocast in train_fn
    print_freq = 100       # progress line every N steps
    num_workers = 4

    # --- backbone ------------------------------------------------------
    model_name = "microsoft/deberta-v3-base"
    # Other backbones that were tried at some point (kept for reference):
    #   microsoft/deberta-base, microsoft/deberta-large, microsoft/deberta-v3-large,
    #   microsoft/deberta-v2-xlarge, roberta-base, roberta-large, roberta-large-mnli,
    #   xlnet-large-cased, albert-base-v2, albert-large-v2, albert-xxlarge-v2,
    #   funnel-transformer/medium, funnel-transformer/large,
    #   google/electra-base-discriminator, google/electra-large-discriminator,
    #   facebook/bart-base, facebook/bart-large, facebook/bart-large-mnli
    model = CUSTOM_MODEL_DIR   # tokenizer, config and weights are loaded from here

    # --- learning-rate schedule ----------------------------------------
    scheduler = 'cosine'   # ['linear', 'cosine']
    batch_scheduler = True
    num_cycles = 0.5
    num_warmup_steps = 0

    # --- optimisation --------------------------------------------------
    epochs = 4
    encoder_lr = 2e-5
    decoder_lr = 2e-5
    min_lr = 1e-6
    eps = 1e-6
    betas = (0.9, 0.999)
    batch_size = 16
    fc_dropout = 0.2
    target_size = 1
    max_len = 256          # overwritten by the "Define max_len" cell
    weight_decay = 0.01
    gradient_accumulation_steps = 1
    max_grad_norm = 1000

    # --- reproducibility / cross-validation ----------------------------
    seed = 42
    n_fold = 10
    trn_fold = list(range(10))
    train = True

    # --- regularisation tricks -----------------------------------------
    nth_awp_start_epoch = 1
    gradient_checkpointing = False
    freezing = False
    num_reinit_layers = 1
    is_reinit_layer = False
    fgm = False
    awp_start = 1          # first epoch index at which train_fn calls awp.perturb


# Debug runs use fewer epochs and only the first two folds.
if CFG.debug:
    CFG.epochs, CFG.trn_fold = 2, [0, 1]

In [None]:
def get_score(labels, outputs):
    """Print F1 / recall / precision at a fixed 0.5 cut-off and return the accuracy.

    `outputs` must support `> 0.5` and `.astype(int)` (e.g. a NumPy array of
    probabilities); `labels` holds the matching ground-truth classes.
    """
    cutoff = 0.5
    hard_preds = (outputs > cutoff).astype(int)
    # Compute every metric before printing anything, as the original did.
    reported = [
        (title, metric(labels, hard_preds))
        for title, metric in (
            ("f1", f1_score),
            ("recall", recall_score),
            ("precision", precision_score),
        )
    ]
    for title, value in reported:
        print(f"{title} score : {value}")
    return accuracy_score(labels, hard_preds)

def get_acc_score(labels, outputs):
    """Return the best accuracy over decision thresholds 0.10, 0.11, ... (< 0.80).

    The first threshold that reaches the highest accuracy wins and is printed.
    If no threshold scores above 0, the default 0.5 is used.
    """
    best_score, best_thresh = 0, 0.5
    for candidate in np.round(np.arange(0.1, 0.80, 0.01), 2):
        candidate_score = accuracy_score(labels, (outputs > candidate).astype(int))
        # Strict '>' keeps the earliest threshold on ties.
        if candidate_score > best_score:
            best_score, best_thresh = candidate_score, candidate
    print(f"thresh : {best_thresh}")
    return accuracy_score(labels, (outputs > best_thresh).astype(int))


def get_logger(filename=OUTPUT_EXP_DIR+'train'):
    """Return the notebook logger, writing bare messages to the console and to a file.

    Args:
        filename: Path of the log file without extension; ".log" is appended.

    Returns:
        The `logging.Logger` registered under `__name__`, at INFO level, with
        exactly one stream handler and one file handler attached.
    """
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # getLogger returns the same object on every call, so re-running this cell
    # would stack another pair of handlers and emit every message several times.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    # The recorded cell output shows each message twice ("max_len: 512" and
    # "INFO:__main__:max_len: 512"): the second copy comes from the record also
    # reaching the root logger. Our own handlers are sufficient.
    logger.propagate = False
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=CFG.seed):
    """Seed Python's `random`, NumPy and PyTorch (CPU + current CUDA device).

    Also exports PYTHONHASHSEED and switches cuDNN to deterministic mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # The individual generators are independent, so the seeding order is irrelevant.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)
    torch.backends.cudnn.deterministic = True


seed_everything(seed=CFG.seed)

In [None]:
def freeze(module):
    """Turn off gradient tracking for every parameter of `module` (in place)."""
    for param in module.parameters():
        param.requires_grad_(False)
        
def get_freezed_parameters(module):
    """List the names (in `named_parameters()` order) of parameters with `requires_grad=False`."""
    return [
        param_name
        for param_name, param in module.named_parameters()
        if not param.requires_grad
    ]

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """Register an `optim_bits` override for the embedding weights found on `embeddings_path`.

    Looks for `word_embeddings`, `position_embeddings` and `token_type_embeddings`
    attributes and, for each one present, registers a 'weight' override with
    bitsandbytes' GlobalOptimManager.

    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930

    NOTE(review): `bnb` is not imported anywhere in the visible part of this
    notebook; it must be in scope before this is called on a real embeddings module.
    """
    for attr_name in ("word_embeddings", "position_embeddings", "token_type_embeddings"):
        if not hasattr(embeddings_path, attr_name):
            continue
        embedding_module = getattr(embeddings_path, attr_name)
        manager = bnb.optim.GlobalOptimManager.get_instance()
        manager.register_module_override(embedding_module, 'weight', {'optim_bits': optim_bits})

In [None]:
import pandas as pd
import numpy as np

# Read the competition tables from INPUT_DIR.
train = pd.read_csv(os.path.join(INPUT_DIR, "train_data.csv"))
test = pd.read_csv(os.path.join(INPUT_DIR, "test_data.csv"))
sample_sub = pd.read_csv(os.path.join(INPUT_DIR, "submission.csv"))

# For each table: shape first, then a preview of the first three rows.
for frame in (train, test, sample_sub):
    print(frame.shape)
    display(frame.head(3))

(4974, 6)


Unnamed: 0,id,title,year,abstract,keywords,y
0,1,Hierarchical Adversarially Learned Inference,2018,We propose a novel hierarchical generative mod...,"generative, hierarchical, unsupervised, semisu...",0
1,2,Learning to Compute Word Embeddings On the Fly,2018,Words in natural language follow a Zipfian dis...,"NLU, word embeddings, representation learning",0
2,3,Graph2Seq: Scalable Learning Dynamics for Graphs,2018,Neural networks are increasingly used as a gen...,,0


(6393, 5)


Unnamed: 0,id,title,year,abstract,keywords
0,1,StyleAlign: Analysis and Applications of Align...,2022,"In this paper, we perform an in-depth study of...","StyleGAN, transfer learning, fine tuning, mode..."
1,2,Embedding a random graph via GNN: mean-field i...,2021,We develop a theory for embedding a random gra...,"Graph neural network, graph embedding, multi-r..."
2,3,BBRefinement: an universal scheme to improve p...,2021,We present a conceptually simple yet powerful ...,"object detection, deep neural networks, refine..."


(6393, 2)


Unnamed: 0,id,y
0,1,0
1,2,0
2,3,0


In [None]:
# The model input text is the paper abstract, copied unchanged into "texts".
# NOTE(review): missing abstracts are not filled here; the max_len cell applies
# fillna("") only to the values it iterates over, not to this column.
train["texts"] = train["abstract"]

In [None]:
# Assign every training row to one of CFG.n_fold folds, stratified on the target y.
skf = StratifiedKFold(n_splits=CFG.n_fold,shuffle=True,random_state=CFG.seed)
for fold, ( _, val_) in enumerate(skf.split(train, train.y)):
    # val_ are the row positions of this fold's validation split. They are used as
    # .loc labels, which assumes train still carries the default 0..n-1 index
    # from read_csv -- TODO confirm nothing re-indexes train before this cell.
    train.loc[val_ , "kfold"] = int(fold)
    
# Make sure the fold id column ends up with an integer dtype.
train["kfold"] = train["kfold"].astype(int)

if CFG.debug:
    # Debug mode: show fold sizes, down-sample to 500 rows, then show them again.
    display(train.groupby('kfold').size())
    train = train.sample(n=500, random_state=0).reset_index(drop=True)
    display(train.groupby('kfold').size())

In [None]:
# ====================================================
# tokenizer
# ====================================================
# Load the tokenizer from CFG.model (the local CUSTOM_MODEL_DIR checkpoint), save a
# copy under the experiment directory, and attach it to CFG so that
# prepare_input() can reach it through its `cfg` argument.
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_EXP_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

In [None]:
# ====================================================
# Define max_len
# ====================================================
# Tokenize every training text without special tokens and record its length.
lengths = []
tk0 = tqdm(train['texts'].fillna("").values, total=len(train))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
    lengths.append(length)
# Overwrites the CFG.max_len class default with the longest text plus a margin of 3
# for special tokens (the recorded run logged 512).
# NOTE(review): prepare_input encodes a single text, which presumably adds only
# two special tokens rather than three -- confirm; the extra slot is harmless padding.
CFG.max_len = max(lengths) + 3 # cls + sep + sep
LOGGER.info(f"max_len: {CFG.max_len}")

100%|██████████| 4974/4974 [00:03<00:00, 1432.39it/s]
max_len: 512
INFO:__main__:max_len: 512


In [None]:
class AWP:
    """Adversarial Weight Perturbation driven by the optimizer's running gradient mean.

    Usage per training step (as done in train_fn):
        awp.perturb(...)   # before the forward pass: back up and nudge the weights
        loss.backward()    # gradients are computed at the perturbed weights
        awp.restore()      # before optimizer.step(): put the original weights back

    A parameter is perturbed when it is trainable, its name contains `adv_param`,
    and the optimizer already holds an ``exp_avg`` state entry for it (i.e. an
    Adam-style optimizer has stepped on it at least once).

    The previous gate was ``param.grad is not None``. `perturb` runs right after
    ``optimizer.zero_grad()``, which (with ``set_to_none=True``) leaves every grad
    as ``None``, so nothing was ever perturbed; and when grads did exist before the
    first optimizer step, looking up ``exp_avg`` raised ``KeyError``. Gating on the
    optimizer state itself avoids both problems.
    """

    def __init__(self, model, optimizer, *, adv_param='weight',
                 adv_lr=0.001, adv_eps=0.001):
        """
        Args:
            model: Module whose parameters are perturbed.
            optimizer: Optimizer training `model`; its per-parameter ``exp_avg``
                state gives the perturbation direction.
            adv_param: Substring a parameter name must contain to be perturbed.
            adv_lr: Step size, relative to the ratio |w| / |exp_avg|.
            adv_eps: Maximum relative change allowed per weight element.
        """
        self.model = model
        self.optimizer = optimizer
        self.adv_param = adv_param
        self.adv_lr = adv_lr
        self.adv_eps = adv_eps
        self.backup = {}

    def perturb(self, inputs, y, criterion):
        """
        Back up the selected weights and move them along the adversarial direction.
        Call before computing the loss and calling loss.backward().
        `inputs`, `y` and `criterion` are accepted for interface compatibility
        and are not used.
        """
        self._save()  # save model parameters
        self._attack_step()  # perturb weights

    def _direction(self, name, param):
        """Return the optimizer's ``exp_avg`` for an attackable parameter, else None."""
        if not param.requires_grad or self.adv_param not in name:
            return None
        # .get() avoids creating an empty entry in the optimizer's state mapping.
        state = self.optimizer.state.get(param)
        if not state:
            return None
        return state.get('exp_avg')

    def _attack_step(self):
        """Apply w += adv_lr * (|w| / |g|) * g, clamped to w * (1 +/- adv_eps)."""
        e = 1e-6
        for name, param in self.model.named_parameters():
            grad = self._direction(name, param)
            if grad is None:
                continue
            norm_grad = torch.norm(grad)
            norm_data = torch.norm(param.detach())

            if norm_grad != 0 and not torch.isnan(norm_grad):
                # Set lower and upper limit in change
                limit_eps = self.adv_eps * param.detach().abs()
                param_min = param.data - limit_eps
                param_max = param.data + limit_eps

                # Perturb along gradient
                # w += (adv_lr * |w| / |grad|) * grad
                param.data.add_(grad, alpha=(self.adv_lr * (norm_data + e) / (norm_grad + e)))

                # Apply the limit to the change
                param.data.clamp_(param_min, param_max)

    def _save(self):
        """Copy the current value of every attackable parameter into `self.backup`."""
        for name, param in self.model.named_parameters():
            if self._direction(name, param) is None:
                continue
            if name not in self.backup:
                self.backup[name] = param.clone().detach()
            else:
                # Reuse the existing buffer instead of allocating a new one each step.
                self.backup[name].copy_(param.data)

    def restore(self):
        """
        Put the backed-up weights back, so that only the gradients (computed at the
        perturbed weights) differ from a normal step.
        Call after loss.backward() and before optimizer.step().
        """
        for name, param in self.model.named_parameters():
            if name in self.backup:
                param.data.copy_(self.backup[name])

In [None]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    """Tokenize `text` with `cfg.tokenizer`, padded/truncated to `cfg.max_len`.

    Returns the tokenizer's own output mapping with every value replaced by a
    `torch.long` tensor.
    """
    encoded = cfg.tokenizer(
        text,
        add_special_tokens=True,
        max_length=cfg.max_len,
        padding="max_length",
        return_offsets_mapping=False,
        truncation=True,
    )
    # Snapshot the keys so values can be replaced while walking the mapping.
    for field in list(encoded.keys()):
        encoded[field] = torch.tensor(encoded[field], dtype=torch.long)
    return encoded


class TrainDataset(Dataset):
    """Dataset yielding `(tokenized_inputs, label)` pairs from a dataframe.

    Args:
        cfg: Configuration object handed to `prepare_input` (provides the
            tokenizer and `max_len`).
        df: DataFrame with a "texts" column (model input) and a "y" column (target).
    """

    def __init__(self, cfg, df):
        self.cfg = cfg
        # Missing texts become empty strings, matching the fillna("") used when
        # max_len is measured; a NaN would otherwise be passed to the tokenizer.
        self.inputs = df['texts'].fillna("").values
        self.labels = df['y'].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        """Return the tokenized text and its label as a float tensor."""
        inputs = prepare_input(self.cfg, self.inputs[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label

def collate(inputs):
    """Trim a padded batch to the length of its longest unpadded sequence.

    `inputs` is a mapping of batched tensors that includes "attention_mask".
    Every tensor is sliced along its second dimension to the largest per-row
    mask sum. The mapping is updated in place and also returned.

    (This function used to be defined twice in a row with identical bodies;
    a single definition is kept.)
    """
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k in inputs:
        inputs[k] = inputs[k][:, :mask_len]
    return inputs

#collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
def reinit_layers(model, num_layers=None):
    """Re-initialise the last `num_layers` encoder layers of a backbone in place.

    Args:
        model: Backbone exposing `encoder.layer` (sequence of layers) and
            `config.initializer_range`. This is the backbone itself, i.e. what
            CustomModel stores as `self.model`.
        num_layers: How many trailing layers to reset. Defaults to
            `CFG.num_reinit_layers` when omitted.

    Returns:
        The same `model` object.

    Linear and Embedding weights are redrawn from N(0, initializer_range);
    biases are zeroed; LayerNorm is reset to weight=1, bias=0.
    """
    if num_layers is None:
        num_layers = CFG.num_reinit_layers
    # A slice of [-0:] would select EVERY layer, so treat "0 layers" explicitly.
    if num_layers <= 0:
        return model

    std = model.config.initializer_range
    for layer in model.encoder.layer[-num_layers:]:
        for module in layer.modules():
            if isinstance(module, nn.Linear):
                module.weight.data.normal_(mean=0.0, std=std)
                if module.bias is not None:
                    module.bias.data.zero_()
            elif isinstance(module, nn.Embedding):
                module.weight.data.normal_(mean=0.0, std=std)
                if module.padding_idx is not None:
                    module.weight.data[module.padding_idx].zero_()
            elif isinstance(module, nn.LayerNorm):
                # Both are None when the layer was built with elementwise_affine=False.
                if module.bias is not None:
                    module.bias.data.zero_()
                if module.weight is not None:
                    module.weight.data.fill_(1.0)

    return model

In [None]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    """Average the token embeddings of each sequence, ignoring masked-out positions."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask over the hidden dimension so it can weight each token.
        weights = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        token_sum = (last_hidden_state * weights).sum(1)
        # Clamp so a fully masked row divides by a tiny number instead of zero.
        token_count = weights.sum(1).clamp(min=1e-9)
        return token_sum / token_count

class MaxPooling(nn.Module):
    """Element-wise maximum over the tokens of each sequence, with masked positions set to -1e4."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        keep = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        # masked_fill returns a new tensor, so the caller's hidden states stay intact.
        filled = last_hidden_state.masked_fill(keep == 0, -1e4)
        return filled.max(dim=1).values
    

class CustomModel(nn.Module):
    """Transformer backbone + masked mean pooling + LayerNorm + linear head.

    Args:
        cfg: Configuration object; `cfg.model`, `cfg.gradient_checkpointing`,
            `cfg.freezing` and `cfg.target_size` are read here.
        config_path: Optional path to a config previously stored with
            `torch.save`. When None, the config is loaded from `cfg.model` and
            all dropout probabilities are set to 0.
        pretrained: When True, load backbone weights from `cfg.model`; otherwise
            build the architecture from the config only (weights are expected
            to be loaded afterwards).

    `forward` returns raw logits; callers apply the sigmoid themselves.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            # Disable every dropout in the backbone.
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            # NOTE(review): torch.load unpickles the file; only use config files
            # written by this project.
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            # AutoModel cannot be instantiated directly; from_config builds the
            # architecture described by the config without loading weights.
            self.model = AutoModel.from_config(self.config)
        # NOTE(review): the two re-init settings are read from the global CFG,
        # not from the `cfg` argument.
        if CFG.is_reinit_layer:
            self.model = reinit_layers(self.model)
            print(f'Reinitializing Last {CFG.num_reinit_layers} Layers.')
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # Freezing
        if cfg.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            # filter() is lazy: this iterator can be consumed only once.
            cfg.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())

        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)
        self._init_weights(self.fc)
        self.layer_norm1 = nn.LayerNorm(self.config.hidden_size)
        # Previously self.fc was initialised a second time here, which redrew the
        # head weights and left the LayerNorm out; initialise the LayerNorm instead.
        self._init_weights(self.layer_norm1)
        self.sig = nn.Sigmoid()

    def _init_weights(self, module):
        """Initialise one module the way the backbone config prescribes."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Run the backbone and mean-pool its first output over unmasked tokens."""
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        """Return logits with `cfg.target_size` values per sample (no sigmoid applied)."""
        feature = self.feature(inputs)
        feature = self.layer_norm1(feature)
        output = self.fc(feature)
        #output = self.sig(output)
        return output

In [None]:
def calculate_loss(inputs, labels, model, criterion, is_valid=True, device="cpu"):
    """Run `model` on `inputs` and score it against `labels` with `criterion`.

    Predictions and labels are both reshaped to a single column before the loss
    is computed. Returns `(loss, predictions)` when `is_valid` is true, otherwise
    only the loss. `device` is accepted but not used.
    """
    predictions = model(inputs)
    loss_value = criterion(predictions.view(-1, 1), labels.view(-1, 1))
    if is_valid:
        return loss_value, predictions
    return loss_value

In [None]:

# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Tracks the latest value and the weighted running mean of a series."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the latest value, the running mean, the weighted sum and the count."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record `val` as observed `n` times and refresh the running mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


def asMinutes(s):
    """Format a duration in seconds as '<minutes>m <seconds>s' (seconds truncated)."""
    whole_minutes = math.floor(s / 60)
    leftover = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover)


def timeSince(since, percent):
    """Describe elapsed time since `since` and the remaining time estimate.

    `percent` is the completed fraction of the work (must be non-zero); the total
    duration is extrapolated linearly from it.
    """
    elapsed = time.time() - since
    remaining = elapsed / percent - elapsed
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(remaining))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device, awp):
    """Train `model` for one epoch and return the sample-weighted mean loss.

    Args:
        fold: Fold identifier (not used inside this function).
        train_loader: Yields `(inputs, labels)` batches; `inputs` is a mapping
            of padded tensors containing "attention_mask".
        model, criterion, optimizer, scheduler: The usual training objects.
            The scheduler is stepped per optimizer step when CFG.batch_scheduler.
        epoch: Zero-based epoch index; AWP perturbation starts at CFG.awp_start.
        device: Device the batch is moved to.
        awp: AWP helper; `perturb` is called before the forward pass and
            `restore` before the optimizer step.

    Gradients are unscaled and clipped only on the step that actually updates
    the weights. Doing so on every micro-batch makes `unscale_` raise on its
    second call when CFG.gradient_accumulation_steps > 1, and would add scaled
    gradients on top of already-unscaled ones.
    """
    model.zero_grad()
    model.train()
    awp_start = CFG.awp_start
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = time.time()
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        if epoch >= awp_start:
            # Forward/backward below then run at the perturbed weights.
            awp.perturb(inputs, labels, criterion)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            loss = calculate_loss(inputs=inputs, labels=labels, model=model, criterion=criterion, is_valid=False, device=device)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        # No-op until awp.perturb has stored a backup.
        awp.restore()
        if CFG.fgm:
            # NOTE(review): `fgm` is not defined in the visible part of the notebook;
            # it must exist as a global before CFG.fgm is switched on.
            fgm.attack()
            adversarial_loss = calculate_loss(inputs=inputs, labels=labels, model=model, criterion=criterion, is_valid=False, device=device)
            scaler.scale(adversarial_loss).backward()
            fgm.restore()
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            # All accumulated gradients are still scaled here: unscale once,
            # clip the true gradients, then step.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            if CFG.batch_scheduler:
                scheduler.step()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader),
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          # get_last_lr() is the supported way to read the current LR.
                          lr=scheduler.get_last_lr()[0]))

    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    """Evaluate `model` on `valid_loader`.

    Returns `(mean_loss, predictions)`, where `predictions` are the sigmoid
    outputs of all batches joined into one flat NumPy array.
    """
    model.eval()
    losses = AverageMeter()
    batch_probs = []
    started = time.time()
    n_batches = len(valid_loader)
    progress_template = ('EVAL: [{0}/{1}] '
                         'Elapsed {remain:s} '
                         'Loss: {loss.val:.4f}({loss.avg:.4f}) ')
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for key in inputs:
            inputs[key] = inputs[key].to(device)
        labels = labels.to(device)
        with torch.no_grad():
            loss, y_preds = calculate_loss(inputs=inputs, labels=labels, model=model,
                                           criterion=criterion, is_valid=True, device=device)
        # NOTE(review): the validation loss is scaled by the accumulation factor
        # too, exactly as before; it only matters when that factor is > 1.
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), labels.size(0))
        batch_probs.append(y_preds.sigmoid().to('cpu').numpy())
        if step % CFG.print_freq == 0 or step == (n_batches - 1):
            print(progress_template.format(step, n_batches,
                                           loss=losses,
                                           remain=timeSince(started, float(step+1)/n_batches)))
    # First join the batches, then join the rows to flatten the result.
    stacked = np.concatenate(batch_probs)
    return losses.avg, np.concatenate(stacked)


def inference_fn(test_loader, model, device):
    """Predict on `test_loader` and return the sigmoid outputs of all batches, joined along axis 0.

    The model is switched to eval mode and moved to `device` first.
    """
    model.eval()
    model.to(device)
    collected = []
    for inputs in tqdm(test_loader, total=len(test_loader)):
        inputs = collate(inputs)
        for key in inputs:
            inputs[key] = inputs[key].to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        collected.append(y_preds.sigmoid().to('cpu').numpy())
    return np.concatenate(collected)

In [None]:

# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    """Train and validate the model on one cross-validation fold.

    Rows of ``folds`` whose ``kfold`` column equals ``fold`` form the
    validation split; all remaining rows form the training split. After each
    epoch the model is validated, and whenever the value returned by
    ``get_acc_score`` improves, the model state dict and the validation
    predictions are saved to ``<model_name>_fold<fold>_best.pth`` under
    ``OUTPUT_EXP_DIR``.

    Args:
        folds: DataFrame with at least the ``kfold`` and ``y`` columns, plus
            whatever ``TrainDataset`` reads from it.
        fold: Fold id used as the validation split.

    Returns:
        The validation rows of ``folds`` (index reset) with an added ``pred``
        column holding the predictions of the best-scoring epoch.

    Raises:
        ValueError: If ``CFG.scheduler`` is neither ``'linear'`` nor
            ``'cosine'``.
    """

    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['kfold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['kfold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds['y'].values

    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    # No gradients are kept during validation, so a doubled batch size is used.
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size*2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_EXP_DIR+'config.pth')
    model.to(device)

    def get_optimizer_params(model, encoder_lr=5e-6, decoder_lr=1e-4, weight_decay=0.0):
        """Build optimizer parameter groups with layer-wise learning rates.

        Backbone parameters (``model.model``) are split by name into three
        tiers of four layers with learning rates ``encoder_lr/2.6``,
        ``encoder_lr`` and ``encoder_lr*2.6``; backbone parameters outside
        those layers get no explicit ``lr`` and use the optimizer default.
        Names containing a ``no_decay`` pattern get ``weight_decay`` 0.0.
        Parameters whose name does not contain ``"model"`` form the last
        group and use ``decoder_lr``.

        Group order is: the four decay groups, the four no-decay groups,
        then the head group.
        """
        no_decay = ["bias", "LayerNorm.bias",
                    "LayerNorm.weight"]
        group1 = ['layer.0.', 'layer.1.', 'layer.2.', 'layer.3.']
        group2 = ['layer.4.', 'layer.5.', 'layer.6.', 'layer.7.']
        group3 = ['layer.8.', 'layer.9.', 'layer.10.', 'layer.11.']
        group_all = group1 + group2 + group3
        # (layer-name patterns, learning rate); ``None`` patterns select the
        # parameters outside every listed layer, ``None`` lr omits the key.
        tiers = [
            (None, None),
            (group1, encoder_lr/2.6),
            (group2, encoder_lr),
            (group3, encoder_lr*2.6),
        ]
        backbone_params = list(model.model.named_parameters())

        def has_any(name, patterns):
            """Return True when ``name`` contains any of ``patterns``."""
            return any(pattern in name for pattern in patterns)

        def select(apply_decay, layer_patterns):
            """Pick backbone params by decay eligibility and layer tier."""
            picked = []
            for name, param in backbone_params:
                # Decay groups exclude no_decay names; no-decay groups keep only them.
                if has_any(name, no_decay) == apply_decay:
                    continue
                if layer_patterns is None:
                    in_tier = not has_any(name, group_all)
                else:
                    in_tier = has_any(name, layer_patterns)
                if in_tier:
                    picked.append(param)
            return picked

        optimizer_parameters = []
        for apply_decay, group_weight_decay in ((True, weight_decay), (False, 0.0)):
            for layer_patterns, group_lr in tiers:
                param_group = {'params': select(apply_decay, layer_patterns),
                               'weight_decay': group_weight_decay}
                if group_lr is not None:
                    param_group['lr'] = group_lr
                optimizer_parameters.append(param_group)
        # Head parameters: own learning rate; ``weight_decay`` is not set, so
        # the optimizer default applies to this group.
        optimizer_parameters.append(
            {'params': [p for n, p in model.named_parameters() if "model" not in n], 'lr': decoder_lr})
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr,
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)

    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        """Return the warmup scheduler selected by ``cfg.scheduler``."""
        if cfg.scheduler == 'linear':
            return get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        if cfg.scheduler == 'cosine':
            return get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        raise ValueError(
            f"Unsupported CFG.scheduler {cfg.scheduler!r}; expected 'linear' or 'cosine'")

    # NOTE(review): this count ignores drop_last and gradient accumulation;
    # confirm it matches how often train_fn steps the scheduler.
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.BCEWithLogitsLoss()

    print('Enable AWP')
    awp = AWP(model, optimizer, adv_lr=0.001, adv_eps=0.001)

    best_score = -1.
    best_predictions = None
    best_model_path = OUTPUT_EXP_DIR+f"{CFG.model_name.replace('/', '-')}_fold{fold}_best.pth"

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device, awp)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)

        # scoring
        # The get_score result is not used for model selection; the call is
        # kept because get_score is defined elsewhere and may report metrics.
        get_score(valid_labels, predictions)
        score = get_acc_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')

        if best_score < score:
            best_score = score
            best_predictions = predictions
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                       best_model_path)

    if best_predictions is None:
        # No checkpoint was written during this call (e.g. CFG.epochs == 0):
        # fall back to the predictions stored in the checkpoint on disk.
        best_predictions = torch.load(best_model_path,
                                      map_location=torch.device('cpu'))['predictions']
    valid_folds['pred'] = best_predictions

    torch.cuda.empty_cache()
    gc.collect()

    return valid_folds

In [None]:
if __name__ == '__main__':

    def get_result(oof_df):
        """Log the scores of an out-of-fold prediction frame.

        Labels are read from the ``y`` column and predictions from the
        ``pred`` column; the values returned by ``get_score`` and
        ``get_acc_score`` are logged through ``LOGGER``.
        """
        labels = oof_df['y'].values
        preds = oof_df['pred'].values
        score = get_score(labels, preds)
        acc_score = get_acc_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}')
        LOGGER.info(f'ACC BEST Score: {acc_score:<.4f}')

    if CFG.train:
        # Collect the per-fold frames and concatenate once at the end rather
        # than growing a DataFrame inside the loop.
        fold_frames = []
        for fold in range(CFG.n_fold):
            if fold not in CFG.trn_fold:
                continue
            _oof_df = train_loop(train, fold)
            fold_frames.append(_oof_df)
            LOGGER.info(f"========== fold: {fold} result ==========")
            get_result(_oof_df)
        # An empty frame is kept when no fold was trained, as before.
        oof_df = pd.concat(fold_frames) if fold_frames else pd.DataFrame()
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info("========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_EXP_DIR+'oof_abstract_df.pkl')

DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "torch_dtype": "f

Enable AWP
Epoch: [1][0/279] Elapsed 0m 5s (remain 25m 42s) Loss: 0.4983(0.4983) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 16s (remain 0m 29s) Loss: 0.6029(0.6403) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 27s (remain 0m 10s) Loss: 0.8349(0.6235) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 36s (remain 0m 0s) Loss: 0.6061(0.6179) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.4980(0.4980) 


Epoch 1 - avg_train_loss: 0.6179  avg_val_loss: 0.6189  time: 40s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6179  avg_val_loss: 0.6189  time: 40s
Epoch 1 - Score: 0.7028
INFO:__main__:Epoch 1 - Score: 0.7028
Epoch 1 - Save Best Score: 0.7028 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7028 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8852(0.6189) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
thresh : 0.44
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 28s) Loss: 0.5688(0.5688) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 11s (remain 0m 19s) Loss: 0.5211(0.5890) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 22s (remain 0m 8s) Loss: 0.4352(0.5876) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.6160(0.5796) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.2885(0.2885) 


Epoch 2 - avg_train_loss: 0.5796  avg_val_loss: 0.5631  time: 35s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5796  avg_val_loss: 0.5631  time: 35s
Epoch 2 - Score: 0.7169
INFO:__main__:Epoch 2 - Score: 0.7169
Epoch 2 - Save Best Score: 0.7169 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7169 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.0984(0.5631) 
f1 score : 0.15476190476190477
recall score : 0.08552631578947369
precision score : 0.8125
thresh : 0.46
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 33s) Loss: 0.4318(0.4318) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.5321(0.5286) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 22s (remain 0m 8s) Loss: 0.5790(0.5177) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.6014(0.5108) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.2701(0.2701) 


Epoch 3 - avg_train_loss: 0.5108  avg_val_loss: 0.5970  time: 35s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5108  avg_val_loss: 0.5970  time: 35s
Epoch 3 - Score: 0.7209
INFO:__main__:Epoch 3 - Score: 0.7209
Epoch 3 - Save Best Score: 0.7209 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7209 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.2541(0.5970) 
f1 score : 0.3231441048034935
recall score : 0.24342105263157895
precision score : 0.4805194805194805
thresh : 0.67
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 34s) Loss: 0.2654(0.2654) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.3602(0.4034) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 22s (remain 0m 8s) Loss: 0.3723(0.3933) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.3010(0.3895) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.3891(0.3891) 


Epoch 4 - avg_train_loss: 0.3895  avg_val_loss: 0.6520  time: 35s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3895  avg_val_loss: 0.6520  time: 35s
Epoch 4 - Score: 0.7048
INFO:__main__:Epoch 4 - Score: 0.7048


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.0778(0.6520) 
f1 score : 0.3958333333333333
recall score : 0.375
precision score : 0.41911764705882354
thresh : 0.78


Score: 0.6888
INFO:__main__:Score: 0.6888
ACC BEST Score: 0.7209
INFO:__main__:ACC BEST Score: 0.7209
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.3231441048034935
recall score : 0.24342105263157895
precision score : 0.4805194805194805
thresh : 0.67


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 43s) Loss: 0.6340(0.6340) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 11s (remain 0m 19s) Loss: 0.4689(0.6237) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 22s (remain 0m 8s) Loss: 0.7991(0.6186) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 30s (remain 0m 0s) Loss: 0.7619(0.6139) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.4950(0.4950) 


Epoch 1 - avg_train_loss: 0.6139  avg_val_loss: 0.6058  time: 35s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6139  avg_val_loss: 0.6058  time: 35s
Epoch 1 - Score: 0.7108
INFO:__main__:Epoch 1 - Score: 0.7108
Epoch 1 - Save Best Score: 0.7108 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7108 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8498(0.6058) 
f1 score : 0.1411764705882353
recall score : 0.0784313725490196
precision score : 0.7058823529411765
thresh : 0.49
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 43s) Loss: 0.5065(0.5065) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.4498(0.5754) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 23s (remain 0m 8s) Loss: 0.5897(0.5730) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.6070(0.5721) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4479(0.4479) 


Epoch 2 - avg_train_loss: 0.5721  avg_val_loss: 0.5812  time: 35s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5721  avg_val_loss: 0.5812  time: 35s
Epoch 2 - Score: 0.7289
INFO:__main__:Epoch 2 - Score: 0.7289
Epoch 2 - Save Best Score: 0.7289 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7289 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8868(0.5812) 
f1 score : 0.46212121212121215
recall score : 0.39869281045751637
precision score : 0.5495495495495496
thresh : 0.54
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 43s) Loss: 0.4466(0.4466) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.4842(0.5303) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 22s (remain 0m 8s) Loss: 0.6654(0.5076) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.3747(0.4932) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.2578(0.2578) 


Epoch 3 - avg_train_loss: 0.4932  avg_val_loss: 0.5914  time: 35s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4932  avg_val_loss: 0.5914  time: 35s
Epoch 3 - Score: 0.7209
INFO:__main__:Epoch 3 - Score: 0.7209


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.4641(0.5914) 
f1 score : 0.376068376068376
recall score : 0.2875816993464052
precision score : 0.5432098765432098
thresh : 0.59
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 34s) Loss: 0.2877(0.2877) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.2042(0.3114) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 22s (remain 0m 8s) Loss: 0.2246(0.2976) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.1291(0.3020) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3970(0.3970) 


Epoch 4 - avg_train_loss: 0.3020  avg_val_loss: 0.7042  time: 35s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3020  avg_val_loss: 0.7042  time: 35s
Epoch 4 - Score: 0.7149
INFO:__main__:Epoch 4 - Score: 0.7149


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.6084(0.7042) 
f1 score : 0.445993031358885
recall score : 0.41830065359477125
precision score : 0.47761194029850745
thresh : 0.79


Score: 0.7149
INFO:__main__:Score: 0.7149
ACC BEST Score: 0.7289
INFO:__main__:ACC BEST Score: 0.7289
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.46212121212121215
recall score : 0.39869281045751637
precision score : 0.5495495495495496
thresh : 0.54


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 46s) Loss: 0.8620(0.8620) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.5849(0.6313) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 22s (remain 0m 8s) Loss: 0.6301(0.6261) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.6045(0.6209) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5013(0.5013) 


Epoch 1 - avg_train_loss: 0.6209  avg_val_loss: 0.6152  time: 35s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6209  avg_val_loss: 0.6152  time: 35s
Epoch 1 - Score: 0.7149
INFO:__main__:Epoch 1 - Score: 0.7149
Epoch 1 - Save Best Score: 0.7149 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7149 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8606(0.6152) 
f1 score : 0.15384615384615385
recall score : 0.08496732026143791
precision score : 0.8125
thresh : 0.51
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 37s) Loss: 0.6129(0.6129) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.3994(0.5993) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 22s (remain 0m 8s) Loss: 0.5979(0.5858) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.6624(0.5838) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4878(0.4878) 


Epoch 2 - avg_train_loss: 0.5838  avg_val_loss: 0.5943  time: 35s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5838  avg_val_loss: 0.5943  time: 35s
Epoch 2 - Score: 0.7189
INFO:__main__:Epoch 2 - Score: 0.7189
Epoch 2 - Save Best Score: 0.7189 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7189 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8606(0.5943) 
f1 score : 0.4536082474226804
recall score : 0.43137254901960786
precision score : 0.4782608695652174
thresh : 0.59
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 41s) Loss: 0.5237(0.5237) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.4958(0.5421) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 23s (remain 0m 9s) Loss: 0.5010(0.5254) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 32s (remain 0m 0s) Loss: 0.5294(0.5119) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.6010(0.6010) 


Epoch 3 - avg_train_loss: 0.5119  avg_val_loss: 0.6356  time: 36s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5119  avg_val_loss: 0.6356  time: 36s
Epoch 3 - Score: 0.7169
INFO:__main__:Epoch 3 - Score: 0.7169


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8402(0.6356) 
f1 score : 0.5226666666666666
recall score : 0.6405228758169934
precision score : 0.44144144144144143
thresh : 0.73
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 29s) Loss: 0.6018(0.6018) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.7915(0.3911) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 22s (remain 0m 8s) Loss: 0.3182(0.3610) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.4946(0.3567) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4949(0.4949) 


Epoch 4 - avg_train_loss: 0.3567  avg_val_loss: 0.6918  time: 35s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3567  avg_val_loss: 0.6918  time: 35s
Epoch 4 - Score: 0.7149
INFO:__main__:Epoch 4 - Score: 0.7149


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.5138(0.6918) 
f1 score : 0.43205574912891986
recall score : 0.40522875816993464
precision score : 0.4626865671641791
thresh : 0.79


Score: 0.6807
INFO:__main__:Score: 0.6807
ACC BEST Score: 0.7189
INFO:__main__:ACC BEST Score: 0.7189
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.4536082474226804
recall score : 0.43137254901960786
precision score : 0.4782608695652174
thresh : 0.59


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 42s) Loss: 0.5698(0.5698) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.4989(0.6261) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 22s (remain 0m 8s) Loss: 0.6829(0.6124) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.4742(0.6166) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3393(0.3393) 


Epoch 1 - avg_train_loss: 0.6166  avg_val_loss: 0.6014  time: 34s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6166  avg_val_loss: 0.6014  time: 34s
Epoch 1 - Score: 0.7048
INFO:__main__:Epoch 1 - Score: 0.7048
Epoch 1 - Save Best Score: 0.7048 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7048 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1444(0.6014) 
f1 score : 0.08641975308641976
recall score : 0.0457516339869281
precision score : 0.7777777777777778
thresh : 0.62
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 34s) Loss: 0.5989(0.5989) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 11s (remain 0m 19s) Loss: 0.7835(0.5972) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 22s (remain 0m 8s) Loss: 0.2200(0.5818) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.4873(0.5805) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4334(0.4334) 


Epoch 2 - avg_train_loss: 0.5805  avg_val_loss: 0.5989  time: 35s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5805  avg_val_loss: 0.5989  time: 35s
Epoch 2 - Score: 0.7048
INFO:__main__:Epoch 2 - Score: 0.7048


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8711(0.5989) 
f1 score : 0.19895287958115185
recall score : 0.12418300653594772
precision score : 0.5
thresh : 0.52
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 38s) Loss: 0.6430(0.6430) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.9136(0.5439) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 22s (remain 0m 8s) Loss: 0.5363(0.5322) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.4440(0.5276) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2722(0.2722) 


Epoch 3 - avg_train_loss: 0.5276  avg_val_loss: 0.5924  time: 35s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5276  avg_val_loss: 0.5924  time: 35s
Epoch 3 - Score: 0.7108
INFO:__main__:Epoch 3 - Score: 0.7108
Epoch 3 - Save Best Score: 0.7108 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7108 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.2099(0.5924) 
f1 score : 0.25961538461538464
recall score : 0.17647058823529413
precision score : 0.4909090909090909
thresh : 0.45
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 36s) Loss: 0.4524(0.4524) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.4205(0.4379) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 22s (remain 0m 8s) Loss: 0.3204(0.4415) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.5159(0.4433) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3067(0.3067) 


Epoch 4 - avg_train_loss: 0.4433  avg_val_loss: 0.6249  time: 35s
INFO:__main__:Epoch 4 - avg_train_loss: 0.4433  avg_val_loss: 0.6249  time: 35s
Epoch 4 - Score: 0.7149
INFO:__main__:Epoch 4 - Score: 0.7149
Epoch 4 - Save Best Score: 0.7149 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7149 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.2171(0.6249) 
f1 score : 0.4226415094339622
recall score : 0.3660130718954248
precision score : 0.5
thresh : 0.59


Score: 0.6928
INFO:__main__:Score: 0.6928
ACC BEST Score: 0.7149
INFO:__main__:ACC BEST Score: 0.7149
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.4226415094339622
recall score : 0.3660130718954248
precision score : 0.5
thresh : 0.59


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 48s) Loss: 0.7290(0.7290) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.6228(0.6406) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 22s (remain 0m 8s) Loss: 0.6650(0.6293) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.6126(0.6226) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3261(0.3261) 


Epoch 1 - avg_train_loss: 0.6226  avg_val_loss: 0.5915  time: 35s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6226  avg_val_loss: 0.5915  time: 35s
Epoch 1 - Score: 0.7103
INFO:__main__:Epoch 1 - Score: 0.7103
Epoch 1 - Save Best Score: 0.7103 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7103 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1702(0.5915) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
thresh : 0.36
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 31s) Loss: 0.6763(0.6763) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.6292(0.5820) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 22s (remain 0m 8s) Loss: 0.3704(0.5774) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.5751(0.5760) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2584(0.2584) 


Epoch 2 - avg_train_loss: 0.5760  avg_val_loss: 0.5655  time: 35s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5760  avg_val_loss: 0.5655  time: 35s
Epoch 2 - Score: 0.7304
INFO:__main__:Epoch 2 - Score: 0.7304
Epoch 2 - Save Best Score: 0.7304 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7304 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1474(0.5655) 
f1 score : 0.2736842105263158
recall score : 0.17105263157894737
precision score : 0.6842105263157895
thresh : 0.45
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 33s) Loss: 0.4622(0.4622) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.5140(0.5353) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 23s (remain 0m 8s) Loss: 0.5952(0.5113) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.6154(0.5027) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4255(0.4255) 


Epoch 3 - avg_train_loss: 0.5027  avg_val_loss: 0.6079  time: 35s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5027  avg_val_loss: 0.6079  time: 35s
Epoch 3 - Score: 0.7324
INFO:__main__:Epoch 3 - Score: 0.7324
Epoch 3 - Save Best Score: 0.7324 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7324 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7849(0.6079) 
f1 score : 0.47457627118644063
recall score : 0.4605263157894737
precision score : 0.48951048951048953
thresh : 0.79
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 42s) Loss: 0.3084(0.3084) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.1006(0.3328) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 22s (remain 0m 8s) Loss: 0.4108(0.3278) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.3399(0.3309) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4553(0.4553) 


Epoch 4 - avg_train_loss: 0.3309  avg_val_loss: 0.7216  time: 35s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3309  avg_val_loss: 0.7216  time: 35s
Epoch 4 - Score: 0.7143
INFO:__main__:Epoch 4 - Score: 0.7143


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.0402(0.7216) 
f1 score : 0.42068965517241386
recall score : 0.40131578947368424
precision score : 0.4420289855072464
thresh : 0.7


Score: 0.6881
INFO:__main__:Score: 0.6881
ACC BEST Score: 0.7324
INFO:__main__:ACC BEST Score: 0.7324
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.47457627118644063
recall score : 0.4605263157894737
precision score : 0.48951048951048953
thresh : 0.79


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 48s) Loss: 0.6556(0.6556) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.5537(0.6390) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 22s (remain 0m 8s) Loss: 0.6914(0.6323) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.5832(0.6261) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4652(0.4652) 


Epoch 1 - avg_train_loss: 0.6261  avg_val_loss: 0.6160  time: 35s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6261  avg_val_loss: 0.6160  time: 35s
Epoch 1 - Score: 0.6982
INFO:__main__:Epoch 1 - Score: 0.6982
Epoch 1 - Save Best Score: 0.6982 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6982 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.9750(0.6160) 
f1 score : 0.07317073170731707
recall score : 0.039473684210526314
precision score : 0.5
thresh : 0.43
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 37s) Loss: 0.6028(0.6028) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.8028(0.5969) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 23s (remain 0m 8s) Loss: 0.4861(0.5907) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.6025(0.5884) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3792(0.3792) 


Epoch 2 - avg_train_loss: 0.5884  avg_val_loss: 0.6006  time: 35s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5884  avg_val_loss: 0.6006  time: 35s
Epoch 2 - Score: 0.6942
INFO:__main__:Epoch 2 - Score: 0.6942


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.0953(0.6006) 
f1 score : 0.061349693251533735
recall score : 0.03289473684210526
precision score : 0.45454545454545453
thresh : 0.51
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 36s) Loss: 0.4447(0.4447) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 11s (remain 0m 19s) Loss: 0.5308(0.5468) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 22s (remain 0m 8s) Loss: 0.4964(0.5357) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.5650(0.5298) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2890(0.2890) 


Epoch 3 - avg_train_loss: 0.5298  avg_val_loss: 0.6236  time: 34s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5298  avg_val_loss: 0.6236  time: 34s
Epoch 3 - Score: 0.7042
INFO:__main__:Epoch 3 - Score: 0.7042
Epoch 3 - Save Best Score: 0.7042 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7042 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.3931(0.6236) 
f1 score : 0.14035087719298245
recall score : 0.07894736842105263
precision score : 0.631578947368421
thresh : 0.49
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 43s) Loss: 0.4108(0.4108) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.4588(0.4643) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 22s (remain 0m 8s) Loss: 0.4455(0.4558) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.4198(0.4472) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3784(0.3784) 


Epoch 4 - avg_train_loss: 0.4472  avg_val_loss: 0.6637  time: 35s
INFO:__main__:Epoch 4 - avg_train_loss: 0.4472  avg_val_loss: 0.6637  time: 35s
Epoch 4 - Score: 0.7002
INFO:__main__:Epoch 4 - Score: 0.7002


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.2176(0.6637) 
f1 score : 0.32786885245901637
recall score : 0.2631578947368421
precision score : 0.43478260869565216
thresh : 0.72


Score: 0.7042
INFO:__main__:Score: 0.7042
ACC BEST Score: 0.7042
INFO:__main__:ACC BEST Score: 0.7042
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.14035087719298245
recall score : 0.07894736842105263
precision score : 0.631578947368421
thresh : 0.49


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 43s) Loss: 0.6271(0.6271) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.5653(0.6239) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 22s (remain 0m 8s) Loss: 0.8219(0.6271) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.4116(0.6163) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3003(0.3003) 


Epoch 1 - avg_train_loss: 0.6163  avg_val_loss: 0.5936  time: 35s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6163  avg_val_loss: 0.5936  time: 35s
Epoch 1 - Score: 0.7123
INFO:__main__:Epoch 1 - Score: 0.7123
Epoch 1 - Save Best Score: 0.7123 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7123 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.2087(0.5936) 
f1 score : 0.08536585365853658
recall score : 0.046052631578947366
precision score : 0.5833333333333334
thresh : 0.33
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 36s) Loss: 0.4943(0.4943) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.6315(0.5910) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 23s (remain 0m 8s) Loss: 0.5385(0.5867) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.5455(0.5786) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4373(0.4373) 


Epoch 2 - avg_train_loss: 0.5786  avg_val_loss: 0.5820  time: 35s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5786  avg_val_loss: 0.5820  time: 35s
Epoch 2 - Score: 0.7062
INFO:__main__:Epoch 2 - Score: 0.7062


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8773(0.5820) 
f1 score : 0.2222222222222222
recall score : 0.13815789473684212
precision score : 0.5675675675675675
thresh : 0.79
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 28s) Loss: 0.5816(0.5816) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 11s (remain 0m 19s) Loss: 0.6976(0.5280) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 22s (remain 0m 8s) Loss: 0.6198(0.5288) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 30s (remain 0m 0s) Loss: 0.4270(0.5227) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3348(0.3348) 


Epoch 3 - avg_train_loss: 0.5227  avg_val_loss: 0.5609  time: 34s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5227  avg_val_loss: 0.5609  time: 34s
Epoch 3 - Score: 0.7183
INFO:__main__:Epoch 3 - Score: 0.7183
Epoch 3 - Save Best Score: 0.7183 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7183 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.0466(0.5609) 
f1 score : 0.32075471698113206
recall score : 0.2236842105263158
precision score : 0.5666666666666667
thresh : 0.47
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 47s) Loss: 0.3570(0.3570) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.3163(0.4044) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 22s (remain 0m 8s) Loss: 0.5321(0.3956) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.3176(0.3957) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3906(0.3906) 


Epoch 4 - avg_train_loss: 0.3957  avg_val_loss: 0.6024  time: 35s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3957  avg_val_loss: 0.6024  time: 35s
Epoch 4 - Score: 0.7223
INFO:__main__:Epoch 4 - Score: 0.7223
Epoch 4 - Save Best Score: 0.7223 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7223 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.0979(0.6024) 
f1 score : 0.4274809160305343
recall score : 0.3684210526315789
precision score : 0.509090909090909
thresh : 0.76


Score: 0.6982
INFO:__main__:Score: 0.6982
ACC BEST Score: 0.7223
INFO:__main__:ACC BEST Score: 0.7223
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.4274809160305343
recall score : 0.3684210526315789
precision score : 0.509090909090909
thresh : 0.76


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 50s) Loss: 0.6660(0.6660) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.7759(0.6261) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 23s (remain 0m 8s) Loss: 0.6281(0.6171) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.5920(0.6124) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.5047(0.5047) 


Epoch 1 - avg_train_loss: 0.6124  avg_val_loss: 0.6185  time: 35s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6124  avg_val_loss: 0.6185  time: 35s
Epoch 1 - Score: 0.7163
INFO:__main__:Epoch 1 - Score: 0.7163
Epoch 1 - Save Best Score: 0.7163 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7163 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8182(0.6185) 
f1 score : 0.21390374331550804
recall score : 0.13157894736842105
precision score : 0.5714285714285714
thresh : 0.52
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 31s) Loss: 0.5791(0.5791) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.5396(0.5934) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 22s (remain 0m 8s) Loss: 0.9510(0.5815) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.5200(0.5833) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4591(0.4591) 


Epoch 2 - avg_train_loss: 0.5833  avg_val_loss: 0.6005  time: 35s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5833  avg_val_loss: 0.6005  time: 35s
Epoch 2 - Score: 0.7264
INFO:__main__:Epoch 2 - Score: 0.7264
Epoch 2 - Save Best Score: 0.7264 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7264 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7888(0.6005) 
f1 score : 0.42857142857142855
recall score : 0.375
precision score : 0.5
thresh : 0.55
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 40s) Loss: 0.6885(0.6885) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.4357(0.5238) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 22s (remain 0m 8s) Loss: 0.4984(0.5118) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.3986(0.5101) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2963(0.2963) 


Epoch 3 - avg_train_loss: 0.5101  avg_val_loss: 0.5817  time: 35s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5101  avg_val_loss: 0.5817  time: 35s
Epoch 3 - Score: 0.7223
INFO:__main__:Epoch 3 - Score: 0.7223


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1210(0.5817) 
f1 score : 0.3317972350230415
recall score : 0.23684210526315788
precision score : 0.5538461538461539
thresh : 0.6
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 38s) Loss: 0.4792(0.4792) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.3295(0.4120) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 22s (remain 0m 8s) Loss: 0.2948(0.3817) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.1577(0.3734) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4008(0.4008) 


Epoch 4 - avg_train_loss: 0.3734  avg_val_loss: 0.6653  time: 35s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3734  avg_val_loss: 0.6653  time: 35s
Epoch 4 - Score: 0.7103
INFO:__main__:Epoch 4 - Score: 0.7103


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.0887(0.6653) 
f1 score : 0.4507042253521127
recall score : 0.42105263157894735
precision score : 0.48484848484848486
thresh : 0.56


Score: 0.6942
INFO:__main__:Score: 0.6942
ACC BEST Score: 0.7264
INFO:__main__:ACC BEST Score: 0.7264
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.42857142857142855
recall score : 0.375
precision score : 0.5
thresh : 0.55


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 44s) Loss: 0.7150(0.7150) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.7427(0.6340) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 22s (remain 0m 8s) Loss: 0.7523(0.6260) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 30s (remain 0m 0s) Loss: 0.5729(0.6191) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4512(0.4512) 


Epoch 1 - avg_train_loss: 0.6191  avg_val_loss: 0.6107  time: 34s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6191  avg_val_loss: 0.6107  time: 34s
Epoch 1 - Score: 0.7042
INFO:__main__:Epoch 1 - Score: 0.7042
Epoch 1 - Save Best Score: 0.7042 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7042 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8883(0.6107) 
f1 score : 0.07272727272727271
recall score : 0.039473684210526314
precision score : 0.46153846153846156
thresh : 0.39
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 37s) Loss: 0.5148(0.5148) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.5818(0.5845) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 23s (remain 0m 8s) Loss: 0.5921(0.5877) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.7991(0.5844) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3440(0.3440) 


Epoch 2 - avg_train_loss: 0.5844  avg_val_loss: 0.5538  time: 35s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5844  avg_val_loss: 0.5538  time: 35s
Epoch 2 - Score: 0.7163
INFO:__main__:Epoch 2 - Score: 0.7163
Epoch 2 - Save Best Score: 0.7163 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7163 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.9469(0.5538) 
f1 score : 0.15116279069767444
recall score : 0.08552631578947369
precision score : 0.65
thresh : 0.43
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 44s) Loss: 0.5882(0.5882) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.6459(0.5263) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 22s (remain 0m 8s) Loss: 0.6970(0.5287) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.5236(0.5224) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.4379(0.4379) 


Epoch 3 - avg_train_loss: 0.5224  avg_val_loss: 0.5624  time: 35s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5224  avg_val_loss: 0.5624  time: 35s
Epoch 3 - Score: 0.7143
INFO:__main__:Epoch 3 - Score: 0.7143


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7055(0.5624) 
f1 score : 0.4761904761904762
recall score : 0.4605263157894737
precision score : 0.49295774647887325
thresh : 0.61
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 28s) Loss: 0.5315(0.5315) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.6910(0.4375) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 22s (remain 0m 8s) Loss: 0.5941(0.4216) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.5437(0.4103) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.3597(0.3597) 


Epoch 4 - avg_train_loss: 0.4103  avg_val_loss: 0.5790  time: 35s
INFO:__main__:Epoch 4 - avg_train_loss: 0.4103  avg_val_loss: 0.5790  time: 35s
Epoch 4 - Score: 0.7183
INFO:__main__:Epoch 4 - Score: 0.7183
Epoch 4 - Save Best Score: 0.7183 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7183 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.8763(0.5790) 
f1 score : 0.46840148698884754
recall score : 0.4144736842105263
precision score : 0.5384615384615384
thresh : 0.46


Score: 0.7123
INFO:__main__:Score: 0.7123
ACC BEST Score: 0.7183
INFO:__main__:ACC BEST Score: 0.7183
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.46840148698884754
recall score : 0.4144736842105263
precision score : 0.5384615384615384
thresh : 0.46


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 43s) Loss: 0.7213(0.7213) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.4822(0.6089) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 22s (remain 0m 8s) Loss: 0.6450(0.6136) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.5935(0.6145) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.3029(0.3029) 


Epoch 1 - avg_train_loss: 0.6145  avg_val_loss: 0.5930  time: 35s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6145  avg_val_loss: 0.5930  time: 35s
Epoch 1 - Score: 0.7062
INFO:__main__:Epoch 1 - Score: 0.7062
Epoch 1 - Save Best Score: 0.7062 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7062 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.2204(0.5930) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
thresh : 0.35
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 40s) Loss: 0.5218(0.5218) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.6754(0.5979) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 22s (remain 0m 8s) Loss: 0.5228(0.5923) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.5193(0.5818) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.4845(0.4845) 


Epoch 2 - avg_train_loss: 0.5818  avg_val_loss: 0.6153  time: 35s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5818  avg_val_loss: 0.6153  time: 35s
Epoch 2 - Score: 0.7082
INFO:__main__:Epoch 2 - Score: 0.7082
Epoch 2 - Save Best Score: 0.7082 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7082 Model


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7364(0.6153) 
f1 score : 0.486646884272997
recall score : 0.5394736842105263
precision score : 0.44324324324324327
thresh : 0.67
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 31s) Loss: 0.4719(0.4719) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.6784(0.5277) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 22s (remain 0m 8s) Loss: 0.3408(0.5159) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.3315(0.5074) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 5s) Loss: 0.2761(0.2761) 


Epoch 3 - avg_train_loss: 0.5074  avg_val_loss: 0.5922  time: 35s
INFO:__main__:Epoch 3 - avg_train_loss: 0.5074  avg_val_loss: 0.5922  time: 35s
Epoch 3 - Score: 0.7062
INFO:__main__:Epoch 3 - Score: 0.7062


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.1507(0.5922) 
f1 score : 0.3852459016393443
recall score : 0.3092105263157895
precision score : 0.5108695652173914
thresh : 0.68
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 39s) Loss: 0.3034(0.3034) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 11s (remain 0m 20s) Loss: 0.1224(0.3678) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 22s (remain 0m 8s) Loss: 0.6848(0.3506) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 31s (remain 0m 0s) Loss: 0.3504(0.3447) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 6s) Loss: 0.2586(0.2586) 


Epoch 4 - avg_train_loss: 0.3447  avg_val_loss: 0.6953  time: 35s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3447  avg_val_loss: 0.6953  time: 35s
Epoch 4 - Score: 0.7022
INFO:__main__:Epoch 4 - Score: 0.7022


EVAL: [15/16] Elapsed 0m 3s (remain 0m 0s) Loss: 1.4520(0.6953) 
f1 score : 0.3764705882352941
recall score : 0.3157894736842105
precision score : 0.46601941747572817
thresh : 0.76




In [None]:
# Ask Colab to unassign (release) this notebook's runtime now that the
# training cell above has finished, so the GPU-backed VM is not left
# allocated while idle.
# NOTE(review): presumably anything not already written to the mounted Drive
# is lost once the runtime is released — confirm all outputs are saved first.
from google.colab import runtime
runtime.unassign()