In [1]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers
!pip install datasets
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!nvidia-smi

Fri Apr 21 10:47:15 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P0    49W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:

import os
import gc
import math
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import re
import html

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam, SGD, AdamW, RAdam
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,GroupKFold
from sklearn.metrics import log_loss,f1_score, recall_score, accuracy_score, precision_score

from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup, get_polynomial_decay_schedule_with_warmup

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
DIR = "/content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測"
INPUT_DIR = os.path.join(DIR,"input")
OUTPUT_DIR = os.path.join(DIR,"output")
CUSTOM_MODEL_DIR = os.path.join(OUTPUT_DIR,'clrp_deberta_v3_base_epoch20')
OUTPUT_EXP_DIR = DIR + '/output/EXP021/'
if not os.path.exists(OUTPUT_EXP_DIR):
    os.makedirs(OUTPUT_EXP_DIR)

In [6]:

# ====================================================
# CFG
# ====================================================
class CFG:
    debug=False
    apex=True
    print_freq=100
    num_workers=4
    model_name="microsoft/deberta-v3-base"
    # model="microsoft/deberta-v3-base"
    # model='microsoft/deberta-base'
    # model='roberta-base'
    # model='roberta-large'
    # model='roberta-large-mnli'
    # model='xlnet-large-cased'
    # model='albert-xxlarge-v2'
    # model="microsoft/deberta-large"
    # model="microsoft/deberta-v3-large"
    # model='microsoft/deberta-v2-xlarge'
    # model='funnel-transformer/large'
    # model='funnel-transformer/medium'
    # model='albert-base-v2'
    # model='albert-large-v2'
    # model='google/electra-large-discriminator'
    # model='google/electra-base-discriminator'
    # model="facebook/bart-large-mnli"
    # model="facebook/bart-large"
    # model="facebook/bart-base"
    model = CUSTOM_MODEL_DIR
    scheduler='polynomial' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=6
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=16
    fc_dropout=0.2
    target_size=1
    max_len=256
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    seed=42
    n_fold=3
    trn_fold=[0, 1, 2]
    train=True
    nth_awp_start_epoch=1
    gradient_checkpointing = False
    freezing = False
    num_reinit_layers = 1
    is_reinit_layer = True

if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0, 1]

In [7]:
def get_score(labels, outputs):
    thresh = 0.5
    y_pred = outputs
    y_true = labels
    f_score = f1_score(y_true, (y_pred>thresh).astype(int))
    r_score = recall_score(y_true, (y_pred>thresh).astype(int))
    p_score = precision_score(y_true, (y_pred>thresh).astype(int))
    print(f"f1 score : {f_score}")
    print(f"recall score : {r_score}")
    print(f"precision score : {p_score}")
    return accuracy_score(y_true, (y_pred>thresh).astype(int))

def get_acc_score(labels, outputs):
    y_pred = outputs
    y_true = labels
    best_score = 0
    best_thresh = 0.5
    for thresh in np.arange(0.1, 0.80, 0.01):
        thresh = np.round(thresh, 2)
        score = accuracy_score(y_true, (y_pred>thresh).astype(int))
        #print("Accuracy score at threshold {0} is {1}".format(thresh, score))
        if score > best_score:
          best_score = score
          best_thresh = thresh
    return accuracy_score(y_true, (y_pred>best_thresh).astype(int))


def get_logger(filename=OUTPUT_EXP_DIR+'train'):
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=CFG.seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=CFG.seed)

In [8]:
def freeze(module):
    """
    Freezes module's parameters.
    """
    
    for parameter in module.parameters():
        parameter.requires_grad = False
        
def get_freezed_parameters(module):
    """
    Returns names of freezed parameters of the given module.
    """
    
    freezed_parameters = []
    for name, parameter in module.named_parameters():
        if not parameter.requires_grad:
            freezed_parameters.append(name)
            
    return freezed_parameters

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """
    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930
    """
    
    embedding_types = ("word", "position", "token_type")
    for embedding_type in embedding_types:
        attr_name = f"{embedding_type}_embeddings"
        
        if hasattr(embeddings_path, attr_name): 
            bnb.optim.GlobalOptimManager.get_instance().register_module_override(
                getattr(embeddings_path, attr_name), 'weight', {'optim_bits': optim_bits}
            )

In [9]:
import pandas as pd
import numpy as np

train = pd.read_csv(os.path.join(INPUT_DIR,"train_data.csv"))
test = pd.read_csv(os.path.join(INPUT_DIR,"test_data.csv"))
sample_sub = pd.read_csv(os.path.join(INPUT_DIR,"submission.csv"))

print(train.shape)
display(train.head(3))

print(test.shape)
display(test.head(3))

print(sample_sub.shape)
display(sample_sub.head(3))

(4974, 6)


Unnamed: 0,id,title,year,abstract,keywords,y
0,1,Hierarchical Adversarially Learned Inference,2018,We propose a novel hierarchical generative mod...,"generative, hierarchical, unsupervised, semisu...",0
1,2,Learning to Compute Word Embeddings On the Fly,2018,Words in natural language follow a Zipfian dis...,"NLU, word embeddings, representation learning",0
2,3,Graph2Seq: Scalable Learning Dynamics for Graphs,2018,Neural networks are increasingly used as a gen...,,0


(6393, 5)


Unnamed: 0,id,title,year,abstract,keywords
0,1,StyleAlign: Analysis and Applications of Align...,2022,"In this paper, we perform an in-depth study of...","StyleGAN, transfer learning, fine tuning, mode..."
1,2,Embedding a random graph via GNN: mean-field i...,2021,We develop a theory for embedding a random gra...,"Graph neural network, graph embedding, multi-r..."
2,3,BBRefinement: an universal scheme to improve p...,2021,We present a conceptually simple yet powerful ...,"object detection, deep neural networks, refine..."


(6393, 2)


Unnamed: 0,id,y
0,1,0
1,2,0
2,3,0


In [10]:
train["texts"] = train["title"] + "[SEP]" + train["abstract"]  

In [11]:
skf = StratifiedGroupKFold(n_splits=CFG.n_fold,shuffle=True,random_state=CFG.seed)
for fold, ( _, val_) in enumerate(skf.split(train, train.y,groups=train["year"])):
    train.loc[val_ , "kfold"] = int(fold)
    
train["kfold"] = train["kfold"].astype(int)
display(train.groupby('kfold').size())

if CFG.debug:
    #display(train.groupby('kfold').size())
    train = train.sample(n=500, random_state=0).reset_index(drop=True)
    display(train.groupby('kfold').size())

kfold
0    2586
1    1566
2     822
dtype: int64

In [12]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_EXP_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

In [13]:
# ====================================================
# Define max_len
# ====================================================
lengths = []
tk0 = tqdm(train['texts'].fillna("").values, total=len(train))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
    lengths.append(length)
CFG.max_len = max(lengths) + 3 # cls
LOGGER.info(f"max_len: {CFG.max_len}")

100%|██████████| 4974/4974 [00:03<00:00, 1298.10it/s]
max_len: 522
INFO:__main__:max_len: 522


In [14]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer(text,
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           padding="max_length",
                           return_offsets_mapping=False,
                           truncation=True)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.inputs = df['texts'].values
        self.labels = df['y'].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.inputs[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

#collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [15]:
def reinit_layers(model):

    #for layer in model.model.encoder.layer[-CFG.num_reinit_layers:]:
    for layer in model.encoder.layer[-CFG.num_reinit_layers:]:    #Custome model内(backbone)

            for module in layer.modules():

                if isinstance(module,nn.Linear):
                    module.weight.data.normal_(mean=0.0,std=model.config.initializer_range)
                    if module.bias is not None:
                            module.bias.data.zero_()
                elif isinstance(module, nn.Embedding):
                        module.weight.data.normal_(mean=0.0, std=model.config.initializer_range)
                        if module.padding_idx is not None:
                            module.weight.data[module.padding_idx].zero_()
                elif isinstance(module, nn.LayerNorm):
                        module.bias.data.zero_()
                        module.weight.data.fill_(1.0)
                        
    return model

In [16]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

class MaxPooling(nn.Module):
    def __init__(self):
        super(MaxPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        embeddings = last_hidden_state.clone()
        embeddings[input_mask_expanded == 0] = -1e4
        max_embeddings, _ = torch.max(embeddings, dim=1)
        return max_embeddings
    

class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        if CFG.is_reinit_layer:
            self.model = reinit_layers(self.model)
            print(f'Reinitializing Last {CFG.num_reinit_layers} Layers.')
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # Freezing
        if cfg.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            cfg.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())
        
        self.high_dropout = nn.Dropout(p=0.5)

        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)
        self._init_weights(self.fc)
        self.layer_norm1 = nn.LayerNorm(self.config.hidden_size)
        self.sig = nn.Sigmoid()
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        feature = self.layer_norm1(feature)
        return feature, outputs

    def forward(self, inputs=None, labels=None):
        feature, outputs = self.feature(inputs)
        logits = torch.mean(
            torch.stack(
                [self.fc(self.high_dropout(feature)) for _ in range(5)],
                dim=0,
            ),
            dim=0,
        )
        # calculate loss
        loss = None
        if labels is not None:
            loss_fn = Focal_MultiLabel_Loss(gamma=2.0)
            loss = loss_fn(logits, labels)
        
        output = (logits,) + outputs[2:]
        return ((loss,) + output) if loss is not None else output

In [17]:
class Focal_MultiLabel_Loss(nn.Module):
    def __init__(self, gamma):
      super(Focal_MultiLabel_Loss, self).__init__()
      self.gamma = gamma
      self.bceloss = nn.BCEWithLogitsLoss()

    def forward(self, outputs, targets):
      bce = self.bceloss(outputs.view(-1, 1), targets.view(-1, 1))
      bce_exp = torch.exp(-bce)
      focal_loss = (1-bce_exp)**self.gamma * bce
      return focal_loss.mean()

In [18]:
def calculate_loss(inputs, labels, model, criterion, is_valid=True, device="cpu"):    
    outputs = model(inputs,labels)
    loss, logits = outputs[:2]
    return (loss, logits) if is_valid else loss

In [19]:

# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            loss = calculate_loss(inputs=inputs, labels=labels, model=model, criterion=criterion, is_valid=False, device=device)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        if scaler is not None:
            scaler.unscale_(optimizer)
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)

        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  #'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          #grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))

    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            loss, y_preds = calculate_loss(inputs=inputs, labels=labels, model=model, criterion=criterion, is_valid=True, device=device)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    predictions = np.concatenate(predictions)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

In [20]:

# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['kfold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['kfold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds['y'].values
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)


    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size*2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_EXP_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr=5e-6, decoder_lr=1e-4, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", 
                    "LayerNorm.weight"]
        group1=['layer.0.','layer.1.','layer.2.','layer.3.']
        group2=['layer.4.','layer.5.','layer.6.','layer.7.']    
        group3=['layer.8.','layer.9.','layer.10.','layer.11.']
        group_all=['layer.0.','layer.1.','layer.2.','layer.3.','layer.4.','layer.5.','layer.6.','layer.7.','layer.8.','layer.9.','layer.10.','layer.11.']
        optimizer_parameters = [
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],'weight_decay': weight_decay},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],'weight_decay': weight_decay, 'lr': encoder_lr/2.6},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],'weight_decay': weight_decay, 'lr': encoder_lr},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],'weight_decay': weight_decay, 'lr': encoder_lr*2.6},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],'weight_decay': 0.0},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],'weight_decay': 0.0, 'lr': encoder_lr/2.6},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],'weight_decay': 0.0, 'lr': encoder_lr},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],'weight_decay': 0.0, 'lr': encoder_lr*2.6},
        {'params': [p for n, p in model.named_parameters() if "model" not in n], 'lr':decoder_lr, "momentum" : 0.99},
    ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        elif cfg.scheduler == 'polynomial':
            warmup_steps = int(len(train_folds) / CFG.batch_size * 0.1)
            scheduler = get_polynomial_decay_schedule_with_warmup(
                optimizer, warmup_steps, num_train_steps, lr_end=7e-7, power=3.0)
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = Focal_MultiLabel_Loss(gamma=2.0)
    
    best_score = -1.

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        
        # scoring
        score = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')

        
        if best_score < score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_EXP_DIR+f"{CFG.model_name.replace('/', '-')}_fold{fold}_best.pth")

    predictions = torch.load(OUTPUT_EXP_DIR+f"{CFG.model_name.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds['pred'] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [None]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        labels = oof_df['y'].values
        preds = oof_df['pred'].values
        score = get_score(labels, preds)
        acc_score = get_acc_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}')
        LOGGER.info(f'ACC BEST Score: {acc_score:<.4f}')
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
            #break
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_EXP_DIR+'oof_df.pkl')

DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base_epoch20",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "torch_dt

Reinitializing Last 1 Layers.
Epoch: [1][0/149] Elapsed 0m 1s (remain 3m 56s) Loss: 0.1424(0.1424) LR: 0.00000143  
Epoch: [1][100/149] Elapsed 0m 13s (remain 0m 6s) Loss: 0.1970(0.1510) LR: 0.00001483  
Epoch: [1][148/149] Elapsed 0m 18s (remain 0m 0s) Loss: 0.1242(0.1527) LR: 0.00001242  
EVAL: [0/81] Elapsed 0m 0s (remain 0m 35s) Loss: 0.2845(0.2845) 


Epoch 1 - avg_train_loss: 0.1527  avg_val_loss: 0.2523  time: 36s
INFO:__main__:Epoch 1 - avg_train_loss: 0.1527  avg_val_loss: 0.2523  time: 36s
Epoch 1 - Score: 0.3616
INFO:__main__:Epoch 1 - Score: 0.3616
Epoch 1 - Save Best Score: 0.3616 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.3616 Model


EVAL: [80/81] Elapsed 0m 16s (remain 0m 0s) Loss: 0.0517(0.2523) 
f1 score : 0.4420412301453194
recall score : 0.9547445255474453
precision score : 0.287598944591029
Epoch: [2][0/149] Elapsed 0m 0s (remain 0m 58s) Loss: 0.1758(0.1758) LR: 0.00001237  
Epoch: [2][100/149] Elapsed 0m 11s (remain 0m 5s) Loss: 0.1167(0.1355) LR: 0.00000827  
Epoch: [2][148/149] Elapsed 0m 17s (remain 0m 0s) Loss: 0.1741(0.1321) LR: 0.00000671  
EVAL: [0/81] Elapsed 0m 0s (remain 0m 37s) Loss: 0.0835(0.0835) 


Epoch 2 - avg_train_loss: 0.1321  avg_val_loss: 0.1340  time: 34s
INFO:__main__:Epoch 2 - avg_train_loss: 0.1321  avg_val_loss: 0.1340  time: 34s
Epoch 2 - Score: 0.6589
INFO:__main__:Epoch 2 - Score: 0.6589
Epoch 2 - Save Best Score: 0.6589 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.6589 Model


EVAL: [80/81] Elapsed 0m 16s (remain 0m 0s) Loss: 0.1408(0.1340) 
f1 score : 0.4403553299492386
recall score : 0.5065693430656935
precision score : 0.3894500561167228
Epoch: [3][0/149] Elapsed 0m 0s (remain 1m 0s) Loss: 0.0815(0.0815) LR: 0.00000668  
Epoch: [3][100/149] Elapsed 0m 11s (remain 0m 5s) Loss: 0.1015(0.1132) LR: 0.00000414  
Epoch: [3][148/149] Elapsed 0m 17s (remain 0m 0s) Loss: 0.0767(0.1059) LR: 0.00000324  
EVAL: [0/81] Elapsed 0m 0s (remain 0m 36s) Loss: 0.0926(0.0926) 


Epoch 3 - avg_train_loss: 0.1059  avg_val_loss: 0.1439  time: 34s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1059  avg_val_loss: 0.1439  time: 34s
Epoch 3 - Score: 0.6210
INFO:__main__:Epoch 3 - Score: 0.6210


EVAL: [80/81] Elapsed 0m 16s (remain 0m 0s) Loss: 0.0923(0.1439) 
f1 score : 0.4621295279912184
recall score : 0.6145985401459854
precision score : 0.3702726473175022
Epoch: [4][0/149] Elapsed 0m 0s (remain 0m 53s) Loss: 0.0746(0.0746) LR: 0.00000322  
Epoch: [4][100/149] Elapsed 0m 11s (remain 0m 5s) Loss: 0.1331(0.0851) LR: 0.00000188  
Epoch: [4][148/149] Elapsed 0m 17s (remain 0m 0s) Loss: 0.0758(0.0807) LR: 0.00000145  
EVAL: [0/81] Elapsed 0m 0s (remain 0m 34s) Loss: 0.0499(0.0499) 


Epoch 4 - avg_train_loss: 0.0807  avg_val_loss: 0.1421  time: 34s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0807  avg_val_loss: 0.1421  time: 34s
Epoch 4 - Score: 0.6729
INFO:__main__:Epoch 4 - Score: 0.6729
Epoch 4 - Save Best Score: 0.6729 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.6729 Model


EVAL: [80/81] Elapsed 0m 16s (remain 0m 0s) Loss: 0.1480(0.1421) 
f1 score : 0.4419525065963061
recall score : 0.48905109489051096
precision score : 0.40312876052948254
Epoch: [5][0/149] Elapsed 0m 0s (remain 0m 58s) Loss: 0.0393(0.0393) LR: 0.00000145  
Epoch: [5][100/149] Elapsed 0m 11s (remain 0m 5s) Loss: 0.0258(0.0649) LR: 0.00000092  
Epoch: [5][148/149] Elapsed 0m 17s (remain 0m 0s) Loss: 0.1050(0.0623) LR: 0.00000080  
EVAL: [0/81] Elapsed 0m 0s (remain 0m 37s) Loss: 0.0325(0.0325) 


Epoch 5 - avg_train_loss: 0.0623  avg_val_loss: 0.1567  time: 34s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0623  avg_val_loss: 0.1567  time: 34s
Epoch 5 - Score: 0.6922
INFO:__main__:Epoch 5 - Score: 0.6922
Epoch 5 - Save Best Score: 0.6922 Model
INFO:__main__:Epoch 5 - Save Best Score: 0.6922 Model


EVAL: [80/81] Elapsed 0m 16s (remain 0m 0s) Loss: 0.2130(0.1567) 
f1 score : 0.4265129682997118
recall score : 0.4321167883211679
precision score : 0.42105263157894735
Epoch: [6][0/149] Elapsed 0m 0s (remain 0m 55s) Loss: 0.0151(0.0151) LR: 0.00000079  
Epoch: [6][100/149] Elapsed 0m 12s (remain 0m 5s) Loss: 0.0406(0.0555) LR: 0.00000070  
Epoch: [6][148/149] Elapsed 0m 17s (remain 0m 0s) Loss: 0.0544(0.0534) LR: 0.00000070  
EVAL: [0/81] Elapsed 0m 0s (remain 0m 35s) Loss: 0.0356(0.0356) 


Epoch 6 - avg_train_loss: 0.0534  avg_val_loss: 0.1622  time: 35s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0534  avg_val_loss: 0.1622  time: 35s
Epoch 6 - Score: 0.6860
INFO:__main__:Epoch 6 - Score: 0.6860


EVAL: [80/81] Elapsed 0m 16s (remain 0m 0s) Loss: 0.2028(0.1622) 
f1 score : 0.43922651933701656
recall score : 0.4642335766423358
precision score : 0.4167758846657929


Score: 0.6922
INFO:__main__:Score: 0.6922
ACC BEST Score: 0.7370
INFO:__main__:ACC BEST Score: 0.7370
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base_epoch20",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input"

f1 score : 0.4265129682997118
recall score : 0.4321167883211679
precision score : 0.42105263157894735


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base_epoch20 were not used when initializing DebertaV2Model: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Reinitializing Last 1 Layers.
Epoch: [1][0/213] Elapsed 0m 0s (remain 1m 30s) Loss: 0.3129(0.3129) LR: 0.00000095  
Epoch: [1][100/213] Elapsed 0m 12s (remain 0m 13s) Loss: 0.2232(0.1663) LR: 0.00001654  
Epoch: [1][200/213] Elapsed 0m 23s (remain 0m 1s) Loss: 0.1170(0.1483) LR: 0.00001284  
Epoch: [1][212/213] Elapsed 0m 24s (remain 0m 0s) Loss: 0.0778(0.1471) LR: 0.00001244  
EVAL: [0/49] Elapsed 0m 0s (remain 0m 21s) Loss: 0.0440(0.0440) 


Epoch 1 - avg_train_loss: 0.1471  avg_val_loss: 0.1759  time: 35s
INFO:__main__:Epoch 1 - avg_train_loss: 0.1471  avg_val_loss: 0.1759  time: 35s
Epoch 1 - Score: 0.6794
INFO:__main__:Epoch 1 - Score: 0.6794
Epoch 1 - Save Best Score: 0.6794 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6794 Model


EVAL: [48/49] Elapsed 0m 10s (remain 0m 0s) Loss: 0.4622(0.1759) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
Epoch: [2][0/213] Elapsed 0m 0s (remain 1m 31s) Loss: 0.1692(0.1692) LR: 0.00001241  
Epoch: [2][100/213] Elapsed 0m 12s (remain 0m 13s) Loss: 0.1140(0.1197) LR: 0.00000941  
Epoch: [2][200/213] Elapsed 0m 23s (remain 0m 1s) Loss: 0.0575(0.1221) LR: 0.00000697  
Epoch: [2][212/213] Elapsed 0m 25s (remain 0m 0s) Loss: 0.0936(0.1227) LR: 0.00000671  
EVAL: [0/49] Elapsed 0m 0s (remain 0m 21s) Loss: 0.0743(0.0743) 


Epoch 2 - avg_train_loss: 0.1227  avg_val_loss: 0.1425  time: 36s
INFO:__main__:Epoch 2 - avg_train_loss: 0.1227  avg_val_loss: 0.1425  time: 36s
Epoch 2 - Score: 0.6845
INFO:__main__:Epoch 2 - Score: 0.6845
Epoch 2 - Save Best Score: 0.6845 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.6845 Model


EVAL: [48/49] Elapsed 0m 10s (remain 0m 0s) Loss: 0.2942(0.1425) 
f1 score : 0.2962962962962963
recall score : 0.20717131474103587
precision score : 0.52
Epoch: [3][0/213] Elapsed 0m 0s (remain 1m 31s) Loss: 0.1345(0.1345) LR: 0.00000669  
Epoch: [3][100/213] Elapsed 0m 11s (remain 0m 13s) Loss: 0.0777(0.1071) LR: 0.00000482  
Epoch: [3][200/213] Elapsed 0m 23s (remain 0m 1s) Loss: 0.1456(0.1074) LR: 0.00000338  
Epoch: [3][212/213] Elapsed 0m 25s (remain 0m 0s) Loss: 0.0329(0.1069) LR: 0.00000324  
EVAL: [0/49] Elapsed 0m 0s (remain 0m 20s) Loss: 0.0338(0.0338) 


Epoch 3 - avg_train_loss: 0.1069  avg_val_loss: 0.1782  time: 36s
INFO:__main__:Epoch 3 - avg_train_loss: 0.1069  avg_val_loss: 0.1782  time: 36s
Epoch 3 - Score: 0.6897
INFO:__main__:Epoch 3 - Score: 0.6897
Epoch 3 - Save Best Score: 0.6897 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.6897 Model


EVAL: [48/49] Elapsed 0m 10s (remain 0m 0s) Loss: 0.4451(0.1782) 
f1 score : 0.2161290322580645
recall score : 0.13346613545816732
precision score : 0.5677966101694916
Epoch: [4][0/213] Elapsed 0m 0s (remain 1m 32s) Loss: 0.0622(0.0622) LR: 0.00000322  
Epoch: [4][100/213] Elapsed 0m 11s (remain 0m 13s) Loss: 0.0980(0.0964) LR: 0.00000221  
Epoch: [4][200/213] Elapsed 0m 23s (remain 0m 1s) Loss: 0.0629(0.0891) LR: 0.00000152  
Epoch: [4][212/213] Elapsed 0m 25s (remain 0m 0s) Loss: 0.0935(0.0897) LR: 0.00000145  
EVAL: [0/49] Elapsed 0m 0s (remain 0m 20s) Loss: 0.0327(0.0327) 


Epoch 4 - avg_train_loss: 0.0897  avg_val_loss: 0.1780  time: 36s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0897  avg_val_loss: 0.1780  time: 36s
Epoch 4 - Score: 0.6935
INFO:__main__:Epoch 4 - Score: 0.6935
Epoch 4 - Save Best Score: 0.6935 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.6935 Model


EVAL: [48/49] Elapsed 0m 10s (remain 0m 0s) Loss: 0.4185(0.1780) 
f1 score : 0.3388429752066116
recall score : 0.2450199203187251
precision score : 0.5491071428571429
Epoch: [5][0/213] Elapsed 0m 0s (remain 1m 28s) Loss: 0.0736(0.0736) LR: 0.00000145  
Epoch: [5][100/213] Elapsed 0m 12s (remain 0m 13s) Loss: 0.0876(0.0741) LR: 0.00000103  
Epoch: [5][200/213] Elapsed 0m 24s (remain 0m 1s) Loss: 0.0365(0.0787) LR: 0.00000081  
Epoch: [5][212/213] Elapsed 0m 25s (remain 0m 0s) Loss: 0.1045(0.0785) LR: 0.00000079  
EVAL: [0/49] Elapsed 0m 0s (remain 0m 21s) Loss: 0.0258(0.0258) 


Epoch 5 - avg_train_loss: 0.0785  avg_val_loss: 0.1870  time: 36s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0785  avg_val_loss: 0.1870  time: 36s
Epoch 5 - Score: 0.6986
INFO:__main__:Epoch 5 - Score: 0.6986
Epoch 5 - Save Best Score: 0.6986 Model
INFO:__main__:Epoch 5 - Save Best Score: 0.6986 Model


EVAL: [48/49] Elapsed 0m 10s (remain 0m 0s) Loss: 0.4400(0.1870) 
f1 score : 0.3407821229050279
recall score : 0.24302788844621515
precision score : 0.5700934579439252
Epoch: [6][0/213] Elapsed 0m 0s (remain 1m 33s) Loss: 0.0922(0.0922) LR: 0.00000079  
Epoch: [6][100/213] Elapsed 0m 12s (remain 0m 13s) Loss: 0.1754(0.0769) LR: 0.00000071  
Epoch: [6][200/213] Elapsed 0m 23s (remain 0m 1s) Loss: 0.0522(0.0730) LR: 0.00000070  
Epoch: [6][212/213] Elapsed 0m 25s (remain 0m 0s) Loss: 0.0216(0.0723) LR: 0.00000070  
EVAL: [0/49] Elapsed 0m 0s (remain 0m 20s) Loss: 0.0374(0.0374) 


Epoch 6 - avg_train_loss: 0.0723  avg_val_loss: 0.1648  time: 36s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0723  avg_val_loss: 0.1648  time: 36s
Epoch 6 - Score: 0.6928
INFO:__main__:Epoch 6 - Score: 0.6928


EVAL: [48/49] Elapsed 0m 10s (remain 0m 0s) Loss: 0.3341(0.1648) 
f1 score : 0.41696969696969693
recall score : 0.3426294820717131
precision score : 0.5325077399380805


Score: 0.6986
INFO:__main__:Score: 0.6986
ACC BEST Score: 0.7037
INFO:__main__:ACC BEST Score: 0.7037
DebertaV2Config {
  "_name_or_path": "/content/drive/MyDrive/Competitions/probspace/\u7814\u7a76\u8ad6\u6587\u306e\u56fd\u969b\u5b66\u4f1a\u63a1\u629e\u4e88\u6e2c/output/clrp_deberta_v3_base_epoch20",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input"

f1 score : 0.3407821229050279
recall score : 0.24302788844621515
precision score : 0.5700934579439252


Some weights of the model checkpoint at /content/drive/MyDrive/Competitions/probspace/研究論文の国際学会採択予測/output/clrp_deberta_v3_base_epoch20 were not used when initializing DebertaV2Model: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Reinitializing Last 1 Layers.
Epoch: [1][0/259] Elapsed 0m 0s (remain 1m 49s) Loss: 0.1852(0.1852) LR: 0.00000080  
Epoch: [1][100/259] Elapsed 0m 11s (remain 0m 18s) Loss: 0.2582(0.1493) LR: 0.00001727  
Epoch: [1][200/259] Elapsed 0m 23s (remain 0m 6s) Loss: 0.1460(0.1402) LR: 0.00001408  
Epoch: [1][258/259] Elapsed 0m 30s (remain 0m 0s) Loss: 0.1004(0.1342) LR: 0.00001244  
EVAL: [0/26] Elapsed 0m 0s (remain 0m 12s) Loss: 0.0227(0.0227) 


Epoch 1 - avg_train_loss: 0.1342  avg_val_loss: 0.2659  time: 36s
INFO:__main__:Epoch 1 - avg_train_loss: 0.1342  avg_val_loss: 0.2659  time: 36s
Epoch 1 - Score: 0.5912
INFO:__main__:Epoch 1 - Score: 0.5912
Epoch 1 - Save Best Score: 0.5912 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.5912 Model


EVAL: [25/26] Elapsed 0m 5s (remain 0m 0s) Loss: 0.5740(0.2659) 
f1 score : 0.0
recall score : 0.0
precision score : 0.0
Epoch: [2][0/259] Elapsed 0m 0s (remain 1m 44s) Loss: 0.1456(0.1456) LR: 0.00001241  
Epoch: [2][100/259] Elapsed 0m 12s (remain 0m 18s) Loss: 0.1863(0.1115) LR: 0.00000991  
Epoch: [2][200/259] Elapsed 0m 23s (remain 0m 6s) Loss: 0.0826(0.1131) LR: 0.00000779  
Epoch: [2][258/259] Elapsed 0m 30s (remain 0m 0s) Loss: 0.0818(0.1132) LR: 0.00000672  
EVAL: [0/26] Elapsed 0m 0s (remain 0m 12s) Loss: 0.0136(0.0136) 


Epoch 2 - avg_train_loss: 0.1132  avg_val_loss: 0.3133  time: 36s
INFO:__main__:Epoch 2 - avg_train_loss: 0.1132  avg_val_loss: 0.3133  time: 36s
Epoch 2 - Score: 0.6022
INFO:__main__:Epoch 2 - Score: 0.6022
Epoch 2 - Save Best Score: 0.6022 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.6022 Model


EVAL: [25/26] Elapsed 0m 5s (remain 0m 0s) Loss: 0.6663(0.3133) 
f1 score : 0.06303724928366763
recall score : 0.03273809523809524
precision score : 0.8461538461538461
Epoch: [3][0/259] Elapsed 0m 0s (remain 1m 53s) Loss: 0.1927(0.1927) LR: 0.00000670  
Epoch: [3][100/259] Elapsed 0m 12s (remain 0m 19s) Loss: 0.0320(0.1013) LR: 0.00000513  
Epoch: [3][200/259] Elapsed 0m 24s (remain 0m 6s) Loss: 0.0596(0.0989) LR: 0.00000386  
Epoch: [3][258/259] Elapsed 0m 30s (remain 0m 0s) Loss: 0.1226(0.0970) LR: 0.00000325  
EVAL: [0/26] Elapsed 0m 0s (remain 0m 12s) Loss: 0.0374(0.0374) 


Epoch 3 - avg_train_loss: 0.0970  avg_val_loss: 0.2122  time: 36s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0970  avg_val_loss: 0.2122  time: 36s
Epoch 3 - Score: 0.6192
INFO:__main__:Epoch 3 - Score: 0.6192
Epoch 3 - Save Best Score: 0.6192 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.6192 Model


EVAL: [25/26] Elapsed 0m 5s (remain 0m 0s) Loss: 0.3602(0.2122) 
f1 score : 0.21945137157107233
recall score : 0.13095238095238096
precision score : 0.676923076923077
Epoch: [4][0/259] Elapsed 0m 0s (remain 1m 55s) Loss: 0.0282(0.0282) LR: 0.00000324  
Epoch: [4][100/259] Elapsed 0m 11s (remain 0m 18s) Loss: 0.0354(0.0718) LR: 0.00000238  
Epoch: [4][200/259] Elapsed 0m 23s (remain 0m 6s) Loss: 0.1131(0.0756) LR: 0.00000174  
Epoch: [4][258/259] Elapsed 0m 30s (remain 0m 0s) Loss: 0.0366(0.0755) LR: 0.00000146  
EVAL: [0/26] Elapsed 0m 0s (remain 0m 12s) Loss: 0.0463(0.0463) 


Epoch 4 - avg_train_loss: 0.0755  avg_val_loss: 0.2198  time: 36s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0755  avg_val_loss: 0.2198  time: 36s
Epoch 4 - Score: 0.6375
INFO:__main__:Epoch 4 - Score: 0.6375
Epoch 4 - Save Best Score: 0.6375 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.6375 Model


EVAL: [25/26] Elapsed 0m 5s (remain 0m 0s) Loss: 0.3120(0.2198) 
f1 score : 0.37916666666666665
recall score : 0.2708333333333333
precision score : 0.6319444444444444
Epoch: [5][0/259] Elapsed 0m 0s (remain 1m 45s) Loss: 0.0608(0.0608) LR: 0.00000145  
Epoch: [5][100/259] Elapsed 0m 11s (remain 0m 18s) Loss: 0.0505(0.0608) LR: 0.00000110  
Epoch: [5][200/259] Elapsed 0m 23s (remain 0m 6s) Loss: 0.0580(0.0578) LR: 0.00000088  
Epoch: [5][258/259] Elapsed 0m 30s (remain 0m 0s) Loss: 0.0398(0.0583) LR: 0.00000080  
EVAL: [0/26] Elapsed 0m 0s (remain 0m 12s) Loss: 0.0223(0.0223) 


Epoch 5 - avg_train_loss: 0.0583  avg_val_loss: 0.3696  time: 36s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0583  avg_val_loss: 0.3696  time: 36s
Epoch 5 - Score: 0.6107
INFO:__main__:Epoch 5 - Score: 0.6107


EVAL: [25/26] Elapsed 0m 5s (remain 0m 0s) Loss: 0.5380(0.3696) 
f1 score : 0.23444976076555024
recall score : 0.14583333333333334
precision score : 0.5975609756097561
Epoch: [6][0/259] Elapsed 0m 0s (remain 1m 36s) Loss: 0.2142(0.2142) LR: 0.00000080  
Epoch: [6][100/259] Elapsed 0m 11s (remain 0m 18s) Loss: 0.0352(0.0509) LR: 0.00000072  
Epoch: [6][200/259] Elapsed 0m 23s (remain 0m 6s) Loss: 0.0333(0.0507) LR: 0.00000070  
Epoch: [6][258/259] Elapsed 0m 30s (remain 0m 0s) Loss: 0.0137(0.0489) LR: 0.00000070  
EVAL: [0/26] Elapsed 0m 0s (remain 0m 12s) Loss: 0.0383(0.0383) 


Epoch 6 - avg_train_loss: 0.0489  avg_val_loss: 0.3371  time: 36s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0489  avg_val_loss: 0.3371  time: 36s
Epoch 6 - Score: 0.6399
INFO:__main__:Epoch 6 - Score: 0.6399
Epoch 6 - Save Best Score: 0.6399 Model
INFO:__main__:Epoch 6 - Save Best Score: 0.6399 Model


EVAL: [25/26] Elapsed 0m 5s (remain 0m 0s) Loss: 0.4014(0.3371) 
f1 score : 0.3909465020576131
recall score : 0.28273809523809523
precision score : 0.6333333333333333




In [None]:
from google.colab import runtime
runtime.unassign()