In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
!pip install transformers
!pip install datasets
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m99.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m110.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1
Looking in indexes: https://pypi.org/simple, h

In [3]:
!nvidia-smi

Sat Apr 29 21:56:00 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P0    44W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:

import os
import gc
import math
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import re
import html

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam, SGD, AdamW, RAdam
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,GroupKFold
from sklearn.metrics import log_loss,f1_score, recall_score, accuracy_score, precision_score

from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
DIR = "/content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測"
INPUT_DIR = os.path.join(DIR,"input")
OUTPUT_DIR = os.path.join(DIR,"output")
CUSTOM_MODEL_DIR = os.path.join(OUTPUT_DIR,'clrp_deberta_v3_base')
OUTPUT_EXP_DIR = DIR + '/output/EXP035/'
if not os.path.exists(OUTPUT_EXP_DIR):
    os.makedirs(OUTPUT_EXP_DIR)

In [6]:

# ====================================================
# CFG
# ====================================================
class CFG:
    debug=False
    apex=True
    print_freq=100
    num_workers=4
    model_name="microsoft/deberta-v3-base"
    # model='microsoft/deberta-base'
    # model='roberta-base'
    # model='roberta-large'
    # model='roberta-large-mnli'
    # model='xlnet-large-cased'
    # model='albert-xxlarge-v2'
    # model="microsoft/deberta-large"
    # model="microsoft/deberta-v3-large"
    # model='microsoft/deberta-v2-xlarge'
    # model='funnel-transformer/large'
    # model='funnel-transformer/medium'
    # model='albert-base-v2'
    # model='albert-large-v2'
    # model='google/electra-large-discriminator'
    # model='google/electra-base-discriminator'
    # model="facebook/bart-large-mnli"
    # model="facebook/bart-large"
    # model="facebook/bart-base"
    model = CUSTOM_MODEL_DIR
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=4
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=16
    fc_dropout=0.2
    target_size=1
    max_len=256
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    seed=42
    n_fold=10
    trn_fold=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    train=True
    nth_awp_start_epoch=1
    gradient_checkpointing = False
    freezing = False
    num_reinit_layers = 1
    is_reinit_layer = False
    fgm = False
    awp_start=1

if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0, 1]

In [7]:
def get_score(labels, outputs):
    thresh = 0.5
    y_pred = outputs
    y_true = labels
    f_score = f1_score(y_true, (y_pred>thresh).astype(int))
    r_score = recall_score(y_true, (y_pred>thresh).astype(int))
    p_score = precision_score(y_true, (y_pred>thresh).astype(int))
    print(f"f1 score : {f_score}")
    print(f"recall score : {r_score}")
    print(f"precision score : {p_score}")
    return accuracy_score(y_true, (y_pred>thresh).astype(int))

def get_acc_score(labels, outputs):
    y_pred = outputs
    y_true = labels
    best_score = 0
    best_thresh = 0.5
    for thresh in np.arange(0.1, 0.80, 0.01):
        thresh = np.round(thresh, 2)
        score = accuracy_score(y_true, (y_pred>thresh).astype(int))
        #print("Accuracy score at threshold {0} is {1}".format(thresh, score))
        if score > best_score:
          best_score = score
          best_thresh = thresh
    print(f"thresh : {best_thresh}")
    return accuracy_score(y_true, (y_pred>best_thresh).astype(int))


def get_logger(filename=OUTPUT_EXP_DIR+'train'):
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=CFG.seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=CFG.seed)

In [8]:
def freeze(module):
    """
    Freezes module's parameters.
    """
    
    for parameter in module.parameters():
        parameter.requires_grad = False
        
def get_freezed_parameters(module):
    """
    Returns names of freezed parameters of the given module.
    """
    
    freezed_parameters = []
    for name, parameter in module.named_parameters():
        if not parameter.requires_grad:
            freezed_parameters.append(name)
            
    return freezed_parameters

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """
    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930
    """
    
    embedding_types = ("word", "position", "token_type")
    for embedding_type in embedding_types:
        attr_name = f"{embedding_type}_embeddings"
        
        if hasattr(embeddings_path, attr_name): 
            bnb.optim.GlobalOptimManager.get_instance().register_module_override(
                getattr(embeddings_path, attr_name), 'weight', {'optim_bits': optim_bits}
            )

In [9]:
import pandas as pd
import numpy as np

train = pd.read_csv(os.path.join(INPUT_DIR,"train_data.csv"))
test = pd.read_csv(os.path.join(INPUT_DIR,"test_data.csv"))
sample_sub = pd.read_csv(os.path.join(INPUT_DIR,"submission.csv"))

print(train.shape)
display(train.head(3))

print(test.shape)
display(test.head(3))

print(sample_sub.shape)
display(sample_sub.head(3))

(4974, 6)


Unnamed: 0,id,title,year,abstract,keywords,y
0,1,Hierarchical Adversarially Learned Inference,2018,We propose a novel hierarchical generative mod...,"generative, hierarchical, unsupervised, semisu...",0
1,2,Learning to Compute Word Embeddings On the Fly,2018,Words in natural language follow a Zipfian dis...,"NLU, word embeddings, representation learning",0
2,3,Graph2Seq: Scalable Learning Dynamics for Graphs,2018,Neural networks are increasingly used as a gen...,,0


(6393, 5)


Unnamed: 0,id,title,year,abstract,keywords
0,1,StyleAlign: Analysis and Applications of Align...,2022,"In this paper, we perform an in-depth study of...","StyleGAN, transfer learning, fine tuning, mode..."
1,2,Embedding a random graph via GNN: mean-field i...,2021,We develop a theory for embedding a random gra...,"Graph neural network, graph embedding, multi-r..."
2,3,BBRefinement: an universal scheme to improve p...,2021,We present a conceptually simple yet powerful ...,"object detection, deep neural networks, refine..."


(6393, 2)


Unnamed: 0,id,y
0,1,0
1,2,0
2,3,0


In [10]:
train["texts"] = train["title"]

In [11]:
skf = StratifiedKFold(n_splits=CFG.n_fold,shuffle=True,random_state=CFG.seed)
for fold, ( _, val_) in enumerate(skf.split(train, train.y)):
    train.loc[val_ , "kfold"] = int(fold)
    
train["kfold"] = train["kfold"].astype(int)

if CFG.debug:
    display(train.groupby('kfold').size())
    train = train.sample(n=500, random_state=0).reset_index(drop=True)
    display(train.groupby('kfold').size())

In [12]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_EXP_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

In [13]:
# ====================================================
# Define max_len
# ====================================================
lengths = []
tk0 = tqdm(train['texts'].fillna("").values, total=len(train))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
    lengths.append(length)
CFG.max_len = max(lengths) + 3 # cls + sep + sep
LOGGER.info(f"max_len: {CFG.max_len}")

100%|██████████| 4974/4974 [00:00<00:00, 10023.93it/s]
max_len: 43
INFO:__main__:max_len: 43


In [14]:
class AWP:
    def __init__(self, model, optimizer, *, adv_param='weight',
                 adv_lr=0.001, adv_eps=0.001):
        self.model = model
        self.optimizer = optimizer
        self.adv_param = adv_param
        self.adv_lr = adv_lr
        self.adv_eps = adv_eps
        self.backup = {}

    def perturb(self, inputs, y, criterion):
        """
        Perturb model parameters for AWP gradient
        Call before loss and loss.backward()
        """
        self._save()  # save model parameters
        self._attack_step()  # perturb weights

    def _attack_step(self):
        e = 1e-6
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                grad = self.optimizer.state[param]['exp_avg']
                norm_grad = torch.norm(grad)
                norm_data = torch.norm(param.detach())

                if norm_grad != 0 and not torch.isnan(norm_grad):
                    # Set lower and upper limit in change
                    limit_eps = self.adv_eps * param.detach().abs()
                    param_min = param.data - limit_eps
                    param_max = param.data + limit_eps

                    # Perturb along gradient
                    # w += (adv_lr * |w| / |grad|) * grad
                    param.data.add_(grad, alpha=(self.adv_lr * (norm_data + e) / (norm_grad + e)))

                    # Apply the limit to the change
                    param.data.clamp_(param_min, param_max)

    def _save(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                if name not in self.backup:
                    self.backup[name] = param.clone().detach()
                else:
                    self.backup[name].copy_(param.data)

    def restore(self):
        """
        Restore model parameter to correct position; AWP do not perturbe weights, it perturb gradients
        Call after loss.backward(), before optimizer.step()
        """
        for name, param in self.model.named_parameters():
            if name in self.backup:
                param.data.copy_(self.backup[name])

In [15]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer(text,
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           padding="max_length",
                           return_offsets_mapping=False,
                           truncation=True)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.inputs = df['texts'].values
        self.labels = df['y'].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.inputs[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

#collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [16]:
def reinit_layers(model):

    #for layer in model.model.encoder.layer[-CFG.num_reinit_layers:]:
    for layer in model.encoder.layer[-CFG.num_reinit_layers:]:    #Custome model内(backbone)

            for module in layer.modules():

                if isinstance(module,nn.Linear):
                    module.weight.data.normal_(mean=0.0,std=model.config.initializer_range)
                    if module.bias is not None:
                            module.bias.data.zero_()
                elif isinstance(module, nn.Embedding):
                        module.weight.data.normal_(mean=0.0, std=model.config.initializer_range)
                        if module.padding_idx is not None:
                            module.weight.data[module.padding_idx].zero_()
                elif isinstance(module, nn.LayerNorm):
                        module.bias.data.zero_()
                        module.weight.data.fill_(1.0)
                        
    return model

In [17]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

class MaxPooling(nn.Module):
    def __init__(self):
        super(MaxPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        embeddings = last_hidden_state.clone()
        embeddings[input_mask_expanded == 0] = -1e4
        max_embeddings, _ = torch.max(embeddings, dim=1)
        return max_embeddings
    

class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        if CFG.is_reinit_layer:
            self.model = reinit_layers(self.model)
            print(f'Reinitializing Last {CFG.num_reinit_layers} Layers.')
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # Freezing
        if cfg.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            cfg.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())

        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)
        self._init_weights(self.fc)
        self.layer_norm1 = nn.LayerNorm(self.config.hidden_size)
        self._init_weights(self.fc)
        self.sig = nn.Sigmoid()
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        feature = self.layer_norm1(feature)
        output = self.fc(feature)
        #output = self.sig(output)
        return output

In [18]:
def calculate_loss(inputs, labels, model, criterion, is_valid=True, device="cpu"):    
    y_preds = model(inputs)
    loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
    return (loss, y_preds) if is_valid else loss

In [19]:

# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device, awp):
    model.zero_grad()
    model.train()
    awp_start = CFG.awp_start
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        if epoch >= awp_start:
            awp.perturb(inputs, labels, criterion)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            loss = calculate_loss(inputs=inputs, labels=labels, model=model, criterion=criterion, is_valid=False, device=device)
        #print(y_preds.sigmoid().squeeze().view(1, -1))
        #loss = criterion(y_preds.sigmoid().squeeze(), labels.squeeze())
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        if scaler is not None:
            scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        awp.restore()
        if CFG.fgm:
          fgm.attack() 
          adversarial_loss = calculate_loss(inputs=inputs, labels=labels, model=model, criterion=criterion, is_valid=False, device=device)
          scaler.scale(adversarial_loss).backward()
          fgm.restore()
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  #'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          #grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))

    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            loss, y_preds = calculate_loss(inputs=inputs, labels=labels, model=model, criterion=criterion, is_valid=True, device=device)
        #loss = criterion(y_preds.sigmoid().squeeze(), labels.squeeze())
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    predictions = np.concatenate(predictions)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

In [20]:

# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['kfold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['kfold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds['y'].values
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)


    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size*2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_EXP_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr=5e-6, decoder_lr=1e-4, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", 
                    "LayerNorm.weight"]
        group1=['layer.0.','layer.1.','layer.2.','layer.3.']
        group2=['layer.4.','layer.5.','layer.6.','layer.7.']    
        group3=['layer.8.','layer.9.','layer.10.','layer.11.']
        group_all=['layer.0.','layer.1.','layer.2.','layer.3.','layer.4.','layer.5.','layer.6.','layer.7.','layer.8.','layer.9.','layer.10.','layer.11.']
        optimizer_parameters = [
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],'weight_decay': weight_decay},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],'weight_decay': weight_decay, 'lr': encoder_lr/2.6},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],'weight_decay': weight_decay, 'lr': encoder_lr},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],'weight_decay': weight_decay, 'lr': encoder_lr*2.6},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],'weight_decay': 0.0},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],'weight_decay': 0.0, 'lr': encoder_lr/2.6},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],'weight_decay': 0.0, 'lr': encoder_lr},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],'weight_decay': 0.0, 'lr': encoder_lr*2.6},
        {'params': [p for n, p in model.named_parameters() if "model" not in n], 'lr':decoder_lr, "momentum" : 0.99},
    ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.BCEWithLogitsLoss()

    print('Enable AWP')
    awp = AWP(model, optimizer, adv_lr=0.001, adv_eps=0.001)
    #print('Enable FGM')
    #fgm = FGM(model=model, eps=0.1)
    
    best_score = -1.

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device, awp)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        
        # scoring
        score_05 = get_score(valid_labels, predictions)
        score = get_acc_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')

        
        if best_score < score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_EXP_DIR+f"{CFG.model_name.replace('/', '-')}_fold{fold}_best.pth")

    predictions = torch.load(OUTPUT_EXP_DIR+f"{CFG.model_name.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds['pred'] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [21]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        labels = oof_df['y'].values
        preds = oof_df['pred'].values
        score = get_score(labels, preds)
        acc_score = get_acc_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}')
        LOGGER.info(f'ACC BEST Score: {acc_score:<.4f}')
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
            #break
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_EXP_DIR+'oof_title_df.pkl')

DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "torch_dtype": "f

Enable AWP
Epoch: [1][0/279] Elapsed 0m 3s (remain 16m 36s) Loss: 0.5390(0.5390) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 12s (remain 0m 21s) Loss: 0.5989(0.6462) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 20s (remain 0m 7s) Loss: 0.8178(0.6320) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 26s (remain 0m 0s) Loss: 0.5961(0.6258) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 3s) Loss: 0.4377(0.4377) 


Epoch 1 - avg_train_loss: 0.6258  avg_val_loss: 0.6138  time: 28s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6258  avg_val_loss: 0.6138  time: 28s
Epoch 1 - Score: 0.6948
INFO:__main__:Epoch 1 - Score: 0.6948
Epoch 1 - Save Best Score: 0.6948 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6948 Model


EVAL: [15/16] Elapsed 0m 0s (remain 0m 0s) Loss: 1.0256(0.6138) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
thresh : 0.43
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 14s) Loss: 0.5314(0.5314) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.5635(0.5930) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.5598(0.5950) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 24s (remain 0m 0s) Loss: 0.5896(0.5916) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 3s) Loss: 0.3210(0.3210) 


Epoch 2 - avg_train_loss: 0.5916  avg_val_loss: 0.6155  time: 25s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5916  avg_val_loss: 0.6155  time: 25s
Epoch 2 - Score: 0.7008
INFO:__main__:Epoch 2 - Score: 0.7008
Epoch 2 - Save Best Score: 0.7008 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7008 Model


EVAL: [15/16] Elapsed 0m 0s (remain 0m 0s) Loss: 1.3669(0.6155) 
f1 score : 0.012820512820512818
recall score : 0.006578947368421052
precision score : 0.25
thresh : 0.4
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 22s) Loss: 0.4507(0.4507) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.4217(0.4947) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.4734(0.4690) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 24s (remain 0m 0s) Loss: 0.5111(0.4578) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 3s) Loss: 0.3434(0.3434) 


Epoch 3 - avg_train_loss: 0.4578  avg_val_loss: 0.6896  time: 25s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4578  avg_val_loss: 0.6896  time: 25s
Epoch 3 - Score: 0.7008
INFO:__main__:Epoch 3 - Score: 0.7008


EVAL: [15/16] Elapsed 0m 0s (remain 0m 0s) Loss: 1.6893(0.6896) 
f1 score : 0.20304568527918782
recall score : 0.13157894736842105
precision score : 0.4444444444444444
thresh : 0.61
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 20s) Loss: 0.2467(0.2467) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.0645(0.2273) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.3674(0.2175) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0646(0.2117) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 3s) Loss: 0.6063(0.6063) 


Epoch 4 - avg_train_loss: 0.2117  avg_val_loss: 0.7961  time: 24s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2117  avg_val_loss: 0.7961  time: 24s
Epoch 4 - Score: 0.6888
INFO:__main__:Epoch 4 - Score: 0.6888


EVAL: [15/16] Elapsed 0m 0s (remain 0m 0s) Loss: 1.6723(0.7961) 
f1 score : 0.3929824561403508
recall score : 0.3684210526315789
precision score : 0.42105263157894735
thresh : 0.77


Score: 0.6908
INFO:__main__:Score: 0.6908
ACC BEST Score: 0.7008
INFO:__main__:ACC BEST Score: 0.7008
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.012820512820512818
recall score : 0.006578947368421052
precision score : 0.25
thresh : 0.4


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 26s) Loss: 0.7657(0.7657) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.4636(0.6209) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.7762(0.6204) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.7866(0.6185) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 3s) Loss: 0.4909(0.4909) 


Epoch 1 - avg_train_loss: 0.6185  avg_val_loss: 0.6195  time: 24s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6185  avg_val_loss: 0.6195  time: 24s
Epoch 1 - Score: 0.6948
INFO:__main__:Epoch 1 - Score: 0.6948
Epoch 1 - Save Best Score: 0.6948 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6948 Model


EVAL: [15/16] Elapsed 0m 0s (remain 0m 0s) Loss: 0.9501(0.6195) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
thresh : 0.48
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 20s) Loss: 0.4902(0.4902) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.4912(0.5779) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.5626(0.5789) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 24s (remain 0m 0s) Loss: 0.5221(0.5771) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 3s) Loss: 0.4395(0.4395) 


Epoch 2 - avg_train_loss: 0.5771  avg_val_loss: 0.6297  time: 25s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5771  avg_val_loss: 0.6297  time: 25s
Epoch 2 - Score: 0.6968
INFO:__main__:Epoch 2 - Score: 0.6968
Epoch 2 - Save Best Score: 0.6968 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.6968 Model


EVAL: [15/16] Elapsed 0m 0s (remain 0m 0s) Loss: 1.0139(0.6297) 
f1 score : 0.20202020202020204
recall score : 0.13071895424836602
precision score : 0.4444444444444444
thresh : 0.56
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 29s) Loss: 0.5242(0.5242) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.3965(0.4459) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.5834(0.4137) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 24s (remain 0m 0s) Loss: 0.3083(0.3945) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 3s) Loss: 0.2845(0.2845) 


Epoch 3 - avg_train_loss: 0.3945  avg_val_loss: 0.7701  time: 25s
INFO:__main__:Epoch 3 - avg_train_loss: 0.3945  avg_val_loss: 0.7701  time: 25s
Epoch 3 - Score: 0.6888
INFO:__main__:Epoch 3 - Score: 0.6888


EVAL: [15/16] Elapsed 0m 0s (remain 0m 0s) Loss: 1.8918(0.7701) 
f1 score : 0.19138755980861244
recall score : 0.13071895424836602
precision score : 0.35714285714285715
thresh : 0.77
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 26s) Loss: 0.1520(0.1520) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.0583(0.1459) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.0385(0.1361) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.1583(0.1313) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 3s) Loss: 0.4089(0.4089) 


Epoch 4 - avg_train_loss: 0.1313  avg_val_loss: 0.9075  time: 24s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1313  avg_val_loss: 0.9075  time: 24s
Epoch 4 - Score: 0.6707
INFO:__main__:Epoch 4 - Score: 0.6707


EVAL: [15/16] Elapsed 0m 0s (remain 0m 0s) Loss: 2.0687(0.9075) 
f1 score : 0.31297709923664124
recall score : 0.2679738562091503
precision score : 0.3761467889908257
thresh : 0.71


Score: 0.6827
INFO:__main__:Score: 0.6827
ACC BEST Score: 0.6968
INFO:__main__:ACC BEST Score: 0.6968
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.20202020202020204
recall score : 0.13071895424836602
precision score : 0.4444444444444444
thresh : 0.56


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 25s) Loss: 0.8847(0.8847) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.5687(0.6284) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.6556(0.6251) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.6671(0.6217) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 3s) Loss: 0.4555(0.4555) 


Epoch 1 - avg_train_loss: 0.6217  avg_val_loss: 0.6047  time: 25s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6217  avg_val_loss: 0.6047  time: 25s
Epoch 1 - Score: 0.6988
INFO:__main__:Epoch 1 - Score: 0.6988
Epoch 1 - Save Best Score: 0.6988 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6988 Model


EVAL: [15/16] Elapsed 0m 0s (remain 0m 0s) Loss: 1.0103(0.6047) 
f1 score : 0.02564102564102564
recall score : 0.013071895424836602
precision score : 0.6666666666666666
thresh : 0.46
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 25s) Loss: 0.6293(0.6293) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.4325(0.6017) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.5910(0.5941) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 24s (remain 0m 0s) Loss: 0.5864(0.5985) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 3s) Loss: 0.3318(0.3318) 


Epoch 2 - avg_train_loss: 0.5985  avg_val_loss: 0.5971  time: 25s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5985  avg_val_loss: 0.5971  time: 25s
Epoch 2 - Score: 0.7028
INFO:__main__:Epoch 2 - Score: 0.7028
Epoch 2 - Save Best Score: 0.7028 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7028 Model


EVAL: [15/16] Elapsed 0m 0s (remain 0m 0s) Loss: 1.2871(0.5971) 
f1 score : 0.0625
recall score : 0.032679738562091505
precision score : 0.7142857142857143
thresh : 0.45
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 26s) Loss: 0.4037(0.4037) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.3834(0.5006) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.5067(0.4937) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 24s (remain 0m 0s) Loss: 0.3726(0.4746) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.7039(0.7039) 


Epoch 3 - avg_train_loss: 0.4746  avg_val_loss: 0.7452  time: 25s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4746  avg_val_loss: 0.7452  time: 25s
Epoch 3 - Score: 0.6586
INFO:__main__:Epoch 3 - Score: 0.6586


EVAL: [15/16] Elapsed 0m 0s (remain 0m 0s) Loss: 0.9547(0.7452) 
f1 score : 0.4450261780104712
recall score : 0.5555555555555556
precision score : 0.37117903930131
thresh : 0.77
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 23s) Loss: 0.3900(0.3900) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.2780(0.2382) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.2785(0.2240) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.2365(0.2165) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 3s) Loss: 0.4412(0.4412) 


Epoch 4 - avg_train_loss: 0.2165  avg_val_loss: 0.8449  time: 24s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2165  avg_val_loss: 0.8449  time: 24s
Epoch 4 - Score: 0.6647
INFO:__main__:Epoch 4 - Score: 0.6647


EVAL: [15/16] Elapsed 0m 0s (remain 0m 0s) Loss: 1.9816(0.8449) 
f1 score : 0.3188405797101449
recall score : 0.2875816993464052
precision score : 0.35772357723577236
thresh : 0.75


Score: 0.6988
INFO:__main__:Score: 0.6988
ACC BEST Score: 0.7028
INFO:__main__:ACC BEST Score: 0.7028
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.0625
recall score : 0.032679738562091505
precision score : 0.7142857142857143
thresh : 0.45


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 26s) Loss: 0.8511(0.8511) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.5403(0.6421) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.7070(0.6226) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.5771(0.6256) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 3s) Loss: 0.3941(0.3941) 


Epoch 1 - avg_train_loss: 0.6256  avg_val_loss: 0.6126  time: 24s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6256  avg_val_loss: 0.6126  time: 24s
Epoch 1 - Score: 0.6928
INFO:__main__:Epoch 1 - Score: 0.6928
Epoch 1 - Save Best Score: 0.6928 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6928 Model


EVAL: [15/16] Elapsed 0m 0s (remain 0m 0s) Loss: 1.0969(0.6126) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
thresh : 0.42
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 25s) Loss: 0.6062(0.6062) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.7053(0.5959) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.2130(0.5893) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 24s (remain 0m 0s) Loss: 0.5393(0.5900) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 3s) Loss: 0.5090(0.5090) 


Epoch 2 - avg_train_loss: 0.5900  avg_val_loss: 0.6338  time: 25s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5900  avg_val_loss: 0.6338  time: 25s
Epoch 2 - Score: 0.6968
INFO:__main__:Epoch 2 - Score: 0.6968
Epoch 2 - Save Best Score: 0.6968 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.6968 Model


EVAL: [15/16] Elapsed 0m 0s (remain 0m 0s) Loss: 0.9040(0.6338) 
f1 score : 0.27385892116182575
recall score : 0.21568627450980393
precision score : 0.375
thresh : 0.73
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 25s) Loss: 0.5669(0.5669) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.6018(0.4956) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.4495(0.4736) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.3619(0.4557) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 3s) Loss: 0.3072(0.3072) 


Epoch 3 - avg_train_loss: 0.4557  avg_val_loss: 0.7015  time: 25s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4557  avg_val_loss: 0.7015  time: 25s
Epoch 3 - Score: 0.6948
INFO:__main__:Epoch 3 - Score: 0.6948


EVAL: [15/16] Elapsed 0m 0s (remain 0m 0s) Loss: 1.5995(0.7015) 
f1 score : 0.27230046948356806
recall score : 0.1895424836601307
precision score : 0.48333333333333334
thresh : 0.67
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 26s) Loss: 0.2704(0.2704) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.4184(0.2097) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.1145(0.1907) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.2027(0.1905) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 3s) Loss: 0.3803(0.3803) 


Epoch 4 - avg_train_loss: 0.1905  avg_val_loss: 0.8521  time: 24s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1905  avg_val_loss: 0.8521  time: 24s
Epoch 4 - Score: 0.7008
INFO:__main__:Epoch 4 - Score: 0.7008
Epoch 4 - Save Best Score: 0.7008 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7008 Model


EVAL: [15/16] Elapsed 0m 0s (remain 0m 0s) Loss: 2.0462(0.8521) 
f1 score : 0.28806584362139914
recall score : 0.22875816993464052
precision score : 0.3888888888888889
thresh : 0.77


Score: 0.6526
INFO:__main__:Score: 0.6526
ACC BEST Score: 0.7008
INFO:__main__:ACC BEST Score: 0.7008
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.28806584362139914
recall score : 0.22875816993464052
precision score : 0.3888888888888889
thresh : 0.77


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 27s) Loss: 0.8637(0.8637) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.6290(0.6402) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.6278(0.6281) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 24s (remain 0m 0s) Loss: 0.6225(0.6235) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 3s) Loss: 0.3893(0.3893) 


Epoch 1 - avg_train_loss: 0.6235  avg_val_loss: 0.6136  time: 25s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6235  avg_val_loss: 0.6136  time: 25s
Epoch 1 - Score: 0.6942
INFO:__main__:Epoch 1 - Score: 0.6942
Epoch 1 - Save Best Score: 0.6942 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6942 Model


EVAL: [15/16] Elapsed 0m 0s (remain 0m 0s) Loss: 1.0801(0.6136) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
thresh : 0.46
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 24s) Loss: 0.6803(0.6803) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.7204(0.5958) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.4054(0.5943) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 24s (remain 0m 0s) Loss: 0.6079(0.5954) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 3s) Loss: 0.3480(0.3480) 


Epoch 2 - avg_train_loss: 0.5954  avg_val_loss: 0.6138  time: 25s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5954  avg_val_loss: 0.6138  time: 25s
Epoch 2 - Score: 0.6962
INFO:__main__:Epoch 2 - Score: 0.6962
Epoch 2 - Save Best Score: 0.6962 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.6962 Model


EVAL: [15/16] Elapsed 0m 0s (remain 0m 0s) Loss: 1.1383(0.6138) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
thresh : 0.41
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 29s) Loss: 0.4579(0.4579) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.6301(0.5137) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.2471(0.4965) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 24s (remain 0m 0s) Loss: 0.2406(0.4873) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 4s) Loss: 0.2515(0.2515) 


Epoch 3 - avg_train_loss: 0.4873  avg_val_loss: 0.7238  time: 25s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4873  avg_val_loss: 0.7238  time: 25s
Epoch 3 - Score: 0.6801
INFO:__main__:Epoch 3 - Score: 0.6801


EVAL: [15/16] Elapsed 0m 0s (remain 0m 0s) Loss: 1.2781(0.7238) 
f1 score : 0.2868217054263566
recall score : 0.24342105263157895
precision score : 0.3490566037735849
thresh : 0.79
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 32s) Loss: 0.2828(0.2828) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.1818(0.2411) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.2444(0.2332) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.2122(0.2281) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 3s) Loss: 0.1789(0.1789) 


Epoch 4 - avg_train_loss: 0.2281  avg_val_loss: 0.8793  time: 24s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2281  avg_val_loss: 0.8793  time: 24s
Epoch 4 - Score: 0.6740
INFO:__main__:Epoch 4 - Score: 0.6740


EVAL: [15/16] Elapsed 0m 0s (remain 0m 0s) Loss: 1.8667(0.8793) 
f1 score : 0.32061068702290074
recall score : 0.27631578947368424
precision score : 0.38181818181818183
thresh : 0.77


Score: 0.6942
INFO:__main__:Score: 0.6942
ACC BEST Score: 0.6962
INFO:__main__:ACC BEST Score: 0.6962
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.0
recall score : 0.0
precision score : 0.0
thresh : 0.41


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 27s) Loss: 0.7717(0.7717) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.5469(0.6207) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.6589(0.6248) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.6106(0.6217) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 3s) Loss: 0.4907(0.4907) 


Epoch 1 - avg_train_loss: 0.6217  avg_val_loss: 0.6227  time: 24s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6217  avg_val_loss: 0.6227  time: 24s
Epoch 1 - Score: 0.6982
INFO:__main__:Epoch 1 - Score: 0.6982
Epoch 1 - Save Best Score: 0.6982 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6982 Model


EVAL: [15/16] Elapsed 0m 0s (remain 0m 0s) Loss: 0.9733(0.6227) 
f1 score : 0.025806451612903226
recall score : 0.013157894736842105
precision score : 0.6666666666666666
thresh : 0.51
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 28s) Loss: 0.6494(0.6494) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.7974(0.6053) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.4452(0.5961) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 24s (remain 0m 0s) Loss: 0.5716(0.5889) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 3s) Loss: 0.4895(0.4895) 


Epoch 2 - avg_train_loss: 0.5889  avg_val_loss: 0.6215  time: 25s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5889  avg_val_loss: 0.6215  time: 25s
Epoch 2 - Score: 0.7022
INFO:__main__:Epoch 2 - Score: 0.7022
Epoch 2 - Save Best Score: 0.7022 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7022 Model


EVAL: [15/16] Elapsed 0m 0s (remain 0m 0s) Loss: 1.1192(0.6215) 
f1 score : 0.2149532710280374
recall score : 0.1513157894736842
precision score : 0.3709677419354839
thresh : 0.65
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 29s) Loss: 0.4363(0.4363) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.4885(0.4708) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.4031(0.4495) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 24s (remain 0m 0s) Loss: 0.3902(0.4292) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 3s) Loss: 0.4565(0.4565) 


Epoch 3 - avg_train_loss: 0.4292  avg_val_loss: 0.7083  time: 25s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4292  avg_val_loss: 0.7083  time: 25s
Epoch 3 - Score: 0.7002
INFO:__main__:Epoch 3 - Score: 0.7002


EVAL: [15/16] Elapsed 0m 0s (remain 0m 0s) Loss: 1.5458(0.7083) 
f1 score : 0.26724137931034486
recall score : 0.20394736842105263
precision score : 0.3875
thresh : 0.76
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 30s) Loss: 0.1569(0.1569) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.2420(0.1658) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.0785(0.1539) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.1407(0.1479) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 3s) Loss: 0.5277(0.5277) 


Epoch 4 - avg_train_loss: 0.1479  avg_val_loss: 0.9116  time: 25s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1479  avg_val_loss: 0.9116  time: 25s
Epoch 4 - Score: 0.6801
INFO:__main__:Epoch 4 - Score: 0.6801


EVAL: [15/16] Elapsed 0m 0s (remain 0m 0s) Loss: 2.0059(0.9116) 
f1 score : 0.32
recall score : 0.2631578947368421
precision score : 0.40816326530612246
thresh : 0.77


Score: 0.6620
INFO:__main__:Score: 0.6620
ACC BEST Score: 0.7022
INFO:__main__:ACC BEST Score: 0.7022
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.2149532710280374
recall score : 0.1513157894736842
precision score : 0.3709677419354839
thresh : 0.65


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 26s) Loss: 0.6537(0.6537) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.6723(0.6385) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.8344(0.6371) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.4333(0.6263) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 3s) Loss: 0.2793(0.2793) 


Epoch 1 - avg_train_loss: 0.6263  avg_val_loss: 0.6215  time: 24s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6263  avg_val_loss: 0.6215  time: 24s
Epoch 1 - Score: 0.6962
INFO:__main__:Epoch 1 - Score: 0.6962
Epoch 1 - Save Best Score: 0.6962 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6962 Model


EVAL: [15/16] Elapsed 0m 0s (remain 0m 0s) Loss: 1.4890(0.6215) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
thresh : 0.31
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 26s) Loss: 0.5260(0.5260) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.6548(0.6010) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.6152(0.5977) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 24s (remain 0m 0s) Loss: 0.5828(0.5949) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 3s) Loss: 0.4346(0.4346) 


Epoch 2 - avg_train_loss: 0.5949  avg_val_loss: 0.6086  time: 25s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5949  avg_val_loss: 0.6086  time: 25s
Epoch 2 - Score: 0.6962
INFO:__main__:Epoch 2 - Score: 0.6962


EVAL: [15/16] Elapsed 0m 0s (remain 0m 0s) Loss: 0.9991(0.6086) 
f1 score : 0.16753926701570682
recall score : 0.10526315789473684
precision score : 0.41025641025641024
thresh : 0.55
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 25s) Loss: 0.5957(0.5957) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.6379(0.5094) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.4908(0.4937) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.5626(0.4736) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 3s) Loss: 0.3304(0.3304) 


Epoch 3 - avg_train_loss: 0.4736  avg_val_loss: 0.6773  time: 24s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4736  avg_val_loss: 0.6773  time: 24s
Epoch 3 - Score: 0.6901
INFO:__main__:Epoch 3 - Score: 0.6901


EVAL: [15/16] Elapsed 0m 0s (remain 0m 0s) Loss: 1.4879(0.6773) 
f1 score : 0.2773109243697479
recall score : 0.21710526315789475
precision score : 0.38372093023255816
thresh : 0.65
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 27s) Loss: 0.5513(0.5513) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.2473(0.2417) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.2745(0.2325) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.1663(0.2250) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 3s) Loss: 0.3313(0.3313) 


Epoch 4 - avg_train_loss: 0.2250  avg_val_loss: 0.7783  time: 24s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2250  avg_val_loss: 0.7783  time: 24s
Epoch 4 - Score: 0.6861
INFO:__main__:Epoch 4 - Score: 0.6861


EVAL: [15/16] Elapsed 0m 0s (remain 0m 0s) Loss: 1.7680(0.7783) 
f1 score : 0.2992125984251969
recall score : 0.25
precision score : 0.37254901960784315
thresh : 0.79


Score: 0.6942
INFO:__main__:Score: 0.6942
ACC BEST Score: 0.6962
INFO:__main__:ACC BEST Score: 0.6962
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.0
recall score : 0.0
precision score : 0.0
thresh : 0.31


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 30s) Loss: 0.6577(0.6577) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.7305(0.6293) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.6977(0.6219) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.5637(0.6212) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 3s) Loss: 0.3947(0.3947) 


Epoch 1 - avg_train_loss: 0.6212  avg_val_loss: 0.6095  time: 24s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6212  avg_val_loss: 0.6095  time: 24s
Epoch 1 - Score: 0.6962
INFO:__main__:Epoch 1 - Score: 0.6962
Epoch 1 - Save Best Score: 0.6962 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6962 Model


EVAL: [15/16] Elapsed 0m 0s (remain 0m 0s) Loss: 1.1255(0.6095) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
thresh : 0.41
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 20s) Loss: 0.5013(0.5013) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.6277(0.5958) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.8316(0.5922) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 24s (remain 0m 0s) Loss: 0.5441(0.5956) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 3s) Loss: 0.4545(0.4545) 


Epoch 2 - avg_train_loss: 0.5956  avg_val_loss: 0.6148  time: 25s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5956  avg_val_loss: 0.6148  time: 25s
Epoch 2 - Score: 0.7002
INFO:__main__:Epoch 2 - Score: 0.7002
Epoch 2 - Save Best Score: 0.7002 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7002 Model


EVAL: [15/16] Elapsed 0m 0s (remain 0m 0s) Loss: 1.0099(0.6148) 
f1 score : 0.12571428571428572
recall score : 0.07236842105263158
precision score : 0.4782608695652174
thresh : 0.55
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 25s) Loss: 0.6095(0.6095) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 9s (remain 0m 15s) Loss: 0.4033(0.5283) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 18s (remain 0m 7s) Loss: 0.5181(0.5015) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 24s (remain 0m 0s) Loss: 0.4196(0.4919) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 3s) Loss: 0.3197(0.3197) 


Epoch 3 - avg_train_loss: 0.4919  avg_val_loss: 0.6868  time: 26s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4919  avg_val_loss: 0.6868  time: 26s
Epoch 3 - Score: 0.7022
INFO:__main__:Epoch 3 - Score: 0.7022
Epoch 3 - Save Best Score: 0.7022 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7022 Model


EVAL: [15/16] Elapsed 0m 0s (remain 0m 0s) Loss: 1.6356(0.6868) 
f1 score : 0.2648401826484018
recall score : 0.19078947368421054
precision score : 0.43283582089552236
thresh : 0.68
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 29s) Loss: 0.2869(0.2869) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.1289(0.2692) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.0959(0.2495) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 24s (remain 0m 0s) Loss: 0.1966(0.2453) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 3s) Loss: 0.4457(0.4457) 


Epoch 4 - avg_train_loss: 0.2453  avg_val_loss: 0.8837  time: 25s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2453  avg_val_loss: 0.8837  time: 25s
Epoch 4 - Score: 0.6781
INFO:__main__:Epoch 4 - Score: 0.6781


EVAL: [15/16] Elapsed 0m 0s (remain 0m 0s) Loss: 1.8309(0.8837) 
f1 score : 0.30935251798561153
recall score : 0.28289473684210525
precision score : 0.3412698412698413
thresh : 0.79


Score: 0.6761
INFO:__main__:Score: 0.6761
ACC BEST Score: 0.7022
INFO:__main__:ACC BEST Score: 0.7022
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.2648401826484018
recall score : 0.19078947368421054
precision score : 0.43283582089552236
thresh : 0.68


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 26s) Loss: 0.5983(0.5983) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.7209(0.6399) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.7577(0.6315) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.5539(0.6246) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 3s) Loss: 0.4599(0.4599) 


Epoch 1 - avg_train_loss: 0.6246  avg_val_loss: 0.6177  time: 25s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6246  avg_val_loss: 0.6177  time: 25s
Epoch 1 - Score: 0.6982
INFO:__main__:Epoch 1 - Score: 0.6982
Epoch 1 - Save Best Score: 0.6982 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6982 Model


EVAL: [15/16] Elapsed 0m 0s (remain 0m 0s) Loss: 0.9443(0.6177) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
thresh : 0.4
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 18s) Loss: 0.5271(0.5271) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.5478(0.5902) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.6828(0.5957) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 24s (remain 0m 0s) Loss: 0.7204(0.5970) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 3s) Loss: 0.3834(0.3834) 


Epoch 2 - avg_train_loss: 0.5970  avg_val_loss: 0.6021  time: 25s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5970  avg_val_loss: 0.6021  time: 25s
Epoch 2 - Score: 0.7062
INFO:__main__:Epoch 2 - Score: 0.7062
Epoch 2 - Save Best Score: 0.7062 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7062 Model


EVAL: [15/16] Elapsed 0m 0s (remain 0m 0s) Loss: 1.0195(0.6021) 
f1 score : 0.06211180124223602
recall score : 0.03289473684210526
precision score : 0.5555555555555556
thresh : 0.46
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 19s) Loss: 0.6978(0.6978) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.6662(0.4822) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.5365(0.4859) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 24s (remain 0m 0s) Loss: 0.3886(0.4736) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 3s) Loss: 0.4144(0.4144) 


Epoch 3 - avg_train_loss: 0.4736  avg_val_loss: 0.6779  time: 25s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4736  avg_val_loss: 0.6779  time: 25s
Epoch 3 - Score: 0.6982
INFO:__main__:Epoch 3 - Score: 0.6982


EVAL: [15/16] Elapsed 0m 0s (remain 0m 0s) Loss: 1.0190(0.6779) 
f1 score : 0.34456928838951306
recall score : 0.3026315789473684
precision score : 0.4
thresh : 0.6
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 22s) Loss: 0.4556(0.4556) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.2271(0.2133) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.2447(0.2161) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.0783(0.2105) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 3s) Loss: 0.4884(0.4884) 


Epoch 4 - avg_train_loss: 0.2105  avg_val_loss: 0.8809  time: 24s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2105  avg_val_loss: 0.8809  time: 24s
Epoch 4 - Score: 0.6922
INFO:__main__:Epoch 4 - Score: 0.6922


EVAL: [15/16] Elapsed 0m 0s (remain 0m 0s) Loss: 1.3177(0.8809) 
f1 score : 0.33793103448275863
recall score : 0.3223684210526316
precision score : 0.35507246376811596
thresh : 0.76


Score: 0.6962
INFO:__main__:Score: 0.6962
ACC BEST Score: 0.7062
INFO:__main__:ACC BEST Score: 0.7062
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,

f1 score : 0.06211180124223602
recall score : 0.03289473684210526
precision score : 0.5555555555555556
thresh : 0.46


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base were not used when initializing DebertaV2Model: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enable AWP
Epoch: [1][0/279] Elapsed 0m 0s (remain 1m 27s) Loss: 0.7079(0.7079) LR: 0.00002000  
Epoch: [1][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.4938(0.6133) LR: 0.00001960  
Epoch: [1][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.6955(0.6199) LR: 0.00001845  
Epoch: [1][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.6219(0.6204) LR: 0.00001709  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 3s) Loss: 0.3582(0.3582) 


Epoch 1 - avg_train_loss: 0.6204  avg_val_loss: 0.6059  time: 24s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6204  avg_val_loss: 0.6059  time: 24s
Epoch 1 - Score: 0.6982
INFO:__main__:Epoch 1 - Score: 0.6982
Epoch 1 - Save Best Score: 0.6982 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6982 Model


EVAL: [15/16] Elapsed 0m 0s (remain 0m 0s) Loss: 1.0788(0.6059) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
thresh : 0.43
Epoch: [2][0/279] Elapsed 0m 0s (remain 1m 26s) Loss: 0.4471(0.4471) LR: 0.00001707  
Epoch: [2][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.7124(0.6089) LR: 0.00001483  
Epoch: [2][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.5864(0.5996) LR: 0.00001221  
Epoch: [2][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.5384(0.5919) LR: 0.00001004  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 3s) Loss: 0.3942(0.3942) 


Epoch 2 - avg_train_loss: 0.5919  avg_val_loss: 0.6134  time: 25s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5919  avg_val_loss: 0.6134  time: 25s
Epoch 2 - Score: 0.6962
INFO:__main__:Epoch 2 - Score: 0.6962


EVAL: [15/16] Elapsed 0m 0s (remain 0m 0s) Loss: 0.8568(0.6134) 
f1 score : 0.28169014084507044
recall score : 0.19736842105263158
precision score : 0.4918032786885246
thresh : 0.51
Epoch: [3][0/279] Elapsed 0m 0s (remain 1m 26s) Loss: 0.4322(0.4322) LR: 0.00001001  
Epoch: [3][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.3626(0.4816) LR: 0.00000724  
Epoch: [3][200/279] Elapsed 0m 16s (remain 0m 6s) Loss: 0.4860(0.4755) LR: 0.00000469  
Epoch: [3][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.3186(0.4654) LR: 0.00000297  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 3s) Loss: 0.2299(0.2299) 


Epoch 3 - avg_train_loss: 0.4654  avg_val_loss: 0.6732  time: 24s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4654  avg_val_loss: 0.6732  time: 24s
Epoch 3 - Score: 0.6942
INFO:__main__:Epoch 3 - Score: 0.6942


EVAL: [15/16] Elapsed 0m 0s (remain 0m 0s) Loss: 1.1844(0.6732) 
f1 score : 0.28571428571428575
recall score : 0.21710526315789475
precision score : 0.4177215189873418
thresh : 0.76
Epoch: [4][0/279] Elapsed 0m 0s (remain 1m 25s) Loss: 0.2191(0.2191) LR: 0.00000295  
Epoch: [4][100/279] Elapsed 0m 8s (remain 0m 15s) Loss: 0.0599(0.2123) LR: 0.00000126  
Epoch: [4][200/279] Elapsed 0m 17s (remain 0m 6s) Loss: 0.1922(0.2058) LR: 0.00000026  
Epoch: [4][278/279] Elapsed 0m 23s (remain 0m 0s) Loss: 0.2552(0.2018) LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 3s) Loss: 0.2418(0.2418) 


Epoch 4 - avg_train_loss: 0.2018  avg_val_loss: 0.8267  time: 25s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2018  avg_val_loss: 0.8267  time: 25s
Epoch 4 - Score: 0.6740
INFO:__main__:Epoch 4 - Score: 0.6740


EVAL: [15/16] Elapsed 0m 0s (remain 0m 0s) Loss: 1.3647(0.8267) 
f1 score : 0.38518518518518513
recall score : 0.34210526315789475
precision score : 0.4406779661016949
thresh : 0.79


Score: 0.6942
INFO:__main__:Score: 0.6942
ACC BEST Score: 0.6982
INFO:__main__:ACC BEST Score: 0.6982
Score: 0.6842
INFO:__main__:Score: 0.6842
ACC BEST Score: 0.6954
INFO:__main__:ACC BEST Score: 0.6954


f1 score : 0.0
recall score : 0.0
precision score : 0.0
thresh : 0.43
f1 score : 0.1306032097399004
recall score : 0.07747866053841103
precision score : 0.4154929577464789
thresh : 0.76


In [None]:
from google.colab import runtime
runtime.unassign()