In [1]:
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import datetime
import math
import string
import pickle
import random
import joblib
import itertools
from distutils.util import strtobool
import warnings
import glob
warnings.filterwarnings('ignore')

import scipy as sp
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset
from torch.utils.checkpoint import checkpoint

import transformers
import tokenizers
print(f'transformers.__version__: {transformers.__version__}')
print(f'tokenizers.__version__: {tokenizers.__version__}')
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
os.environ['TOKENIZERS_PARALLELISM']='true'

transformers.__version__: 4.20.1
tokenizers.__version__: 0.12.1


# Config

In [2]:
class CFG:
    TO_KAGGLE = True
    debug = False
    file_name = "018"
    model = 'microsoft/deberta-v3-base'
    score_path = "gs://feedback3/output/scores/scores3.csv"
    #models_path = 'FB3-models'
    epochs = 10
    patience = 3
    competition = 'FB3'
    train = True
    save_all_models = False
    offline = False
    apex = True
    print_freq = 20
    num_workers = 4
    loss_func = 'SmoothL1' # 'SmoothL1', 'RMSE'
    gradient_checkpointing = True
    scheduler = 'cosine'
    batch_scheduler = True
    num_cycles = 0.5
    num_warmup_steps = 0
    encoder_lr = 2e-5
    decoder_lr = 2e-5
    min_lr = 1e-6
    #Layer-Wise Learning Rate Decay
    llrd = True
    layerwise_lr = 5e-5
    layerwise_lr_decay = 0.9
    layerwise_weight_decay = 0.01
    layerwise_adam_epsilon = 1e-6
    layerwise_use_bertadam = False
    #pooling
    pooling = 'attention' # mean, max, min, attention, weightedlayer
    layer_start = 4
    #init_weight
    init_weight = 'normal' # normal, xavier_uniform, xavier_normal, kaiming_uniform, kaiming_normal, orthogonal
    #re-init
    reinit = True
    reinit_n = 1
    #adversarial
    fgm = True
#     awp = False
    adv_lr = 1
    adv_eps = 0.2
    unscale = False
    eps = 1e-6
    betas = (0.9, 0.999)
    max_len = 512
    weight_decay = 0.01
    gradient_accumulation_steps = 1
    max_grad_norm = 1000
    target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    seed = 42
    cv_seed = 42
    n_fold = 10
    trn_fold = list(range(n_fold))
    batch_size = 8
    n_targets = 6
    gpu_id = 0
    device = f'cuda:{gpu_id}'
    train_file = '/home/jupyter/feedback-prize-english-language-learning/train.csv'
    test_file = '/home/jupyter/feedback-prize-english-language-learning/test.csv'
    submission_file = '/home/jupyter/feedback-prize-english-language-learning/sample_submission.csv'

In [3]:
# #Unique model name
# if len(CFG.model.split("/")) == 2:
#     CFG.identifier = f'{CFG.str_now}-{CFG.model.split("/")[1]}'
# else:
#     CFG.identifier = f'{CFG.str_now}-{CFG.model}'
    
# print(CFG.identifier)

# Read train and split with MultilabelStratifiedKFold

In [4]:
import os
import datetime
import pickle

# ====================================================
# datetime
# ====================================================
t_delta = datetime.timedelta(hours=9)
JST = datetime.timezone(t_delta, 'JST')
now = datetime.datetime.now(JST)
date = now.strftime('%Y%m%d')
date2 = now.strftime('%Y%m%d%H%M')


# ====================================================
# file_path
# ====================================================
if "/" in CFG.model:
    model_name = CFG.model.split("/")[1]
else:
    model_name = CFG.model

path ="/home/jupyter/feedback-prize-english-language-learning/"
if CFG.debug:
    OUTPUT_DIR = f'/home/jupyter/output/ex/DEBUG/{model_name}/{CFG.file_name}/{date2}/'
else:
    OUTPUT_DIR = f'/home/jupyter/output/ex/{model_name}/{CFG.file_name}/{date2}/'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

In [5]:
if CFG.train:
    CFG.df_train = pd.read_csv(CFG.train_file)
    CFG.OUTPUT_DIR = OUTPUT_DIR
    CFG.log_filename = CFG.OUTPUT_DIR + 'train'
    if CFG.offline:
        #TO DO
        pass
    else:
        os.system('pip install iterative-stratification==0.1.7')
    #CV
    from iterstrat.ml_stratifiers import MultilabelStratifiedKFold    
    oof = pd.read_pickle(f"/home/jupyter/output/oof_df/oof_df_CFG1.pkl")[["text_id","fold"]]
    CFG.df_train = pd.merge(CFG.df_train,oof,how="left",on="text_id")
    CFG.df_train['fold'] = CFG.df_train['fold'].astype(int)
else:
    #TO DO
    pass

if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0]
    if CFG.train:
        CFG.df_train = CFG.df_train.sample(n = 100, random_state = CFG.seed).reset_index(drop=True)
        
os.makedirs(CFG.OUTPUT_DIR, exist_ok=True)    
print(CFG.OUTPUT_DIR)





/home/jupyter/output/ex/deberta-v3-base/018/202211261205/


# Helper function

In [6]:
def MCRMSE(y_trues, y_preds):
    scores = []
    idxes = y_trues.shape[1]
    for i in range(idxes):
        y_true = y_trues[:, i]
        y_pred = y_preds[:, i]
        score = mean_squared_error(y_true, y_pred, squared = False)
        scores.append(score)
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores

def get_score(y_trues, y_preds):
    mcrmse_score, scores = MCRMSE(y_trues, y_preds)
    return mcrmse_score, scores

def get_logger(filename = CFG.log_filename):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter('%(message)s'))
    handler2 = FileHandler(filename = f'{filename}.log')
    handler2.setFormatter(Formatter('%(message)s'))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

def prepare_input(cfg, text):
    inputs = cfg.tokenizer.encode_plus(
        text,
        return_tensors = None,
        add_special_tokens = True,
        max_length = cfg.max_len,
        pad_to_max_length = True,
        truncation = True
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype = torch.long)
    return inputs    

def collate(inputs):
    mask_len = int(inputs['attention_mask'].sum(axis = 1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:, :mask_len]
    return inputs

class AverageMeter(object):
    def __init__(self):
        self.reset()
        
    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0
        
    def update(self, val, n = 1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return f'{int(m)}m {int(s)}s'

def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return f'{str(asMinutes(s))} (remain {str(asMinutes(rs))})'

def seed_everything(seed = 42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    
class RMSELoss(nn.Module):
    def __init__(self, reduction = 'mean', eps = 1e-9):
        super().__init__()
        self.mse = nn.MSELoss(reduction = 'none')
        self.reduction = reduction
        self.eps = eps
        
    def forward(self, y_pred, y_true):
        loss = torch.sqrt(self.mse(y_pred, y_true) + self.eps)
        if self.reduction == 'none':
            loss = loss
        elif self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()
        return loss
    
seed_everything(CFG.seed)

# Pooling

* Attention pooling (https://www.kaggle.com/competitions/feedback-prize-english-language-learning/discussion/361678)
* WeightedLayerPooling (https://www.kaggle.com/code/rhtsingh/on-stability-of-few-sample-transformer-fine-tuning?scriptVersionId=67176591&cellId=19)

In [7]:
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min = 1e-9)
        mean_embeddings = sum_embeddings/sum_mask
        return mean_embeddings

class MaxPooling(nn.Module):
    def __init__(self):
        super(MaxPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        embeddings = last_hidden_state.clone()
        embeddings[input_mask_expanded == 0] = -1e4
        max_embeddings, _ = torch.max(embeddings, dim = 1)
        return max_embeddings
    
class MinPooling(nn.Module):
    def __init__(self):
        super(MinPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        embeddings = last_hidden_state.clone()
        embeddings[input_mask_expanded == 0] = 1e-4
        min_embeddings, _ = torch.min(embeddings, dim = 1)
        return min_embeddings

#Attention pooling
class AttentionPooling(nn.Module):
    def __init__(self, in_dim):
        super().__init__()
        self.attention = nn.Sequential(
        nn.Linear(in_dim, in_dim),
        nn.LayerNorm(in_dim),
        nn.GELU(),
        nn.Linear(in_dim, 1),
        )

    def forward(self, last_hidden_state, attention_mask):
        w = self.attention(last_hidden_state).float()
        w[attention_mask==0]=float('-inf')
        w = torch.softmax(w,1)
        attention_embeddings = torch.sum(w * last_hidden_state, dim=1)
        return attention_embeddings

#There may be a bug in my implementation because it does not work well.
class WeightedLayerPooling(nn.Module):
    def __init__(self, num_hidden_layers, layer_start: int = 4, layer_weights = None):
        super(WeightedLayerPooling, self).__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = num_hidden_layers
        self.layer_weights = layer_weights if layer_weights is not None \
            else nn.Parameter(
                torch.tensor([1] * (num_hidden_layers+1 - layer_start), dtype=torch.float)
            )

    def forward(self, ft_all_layers):
        all_layer_embedding = torch.stack(ft_all_layers)
        all_layer_embedding = all_layer_embedding[self.layer_start:, :, :, :]

        weight_factor = self.layer_weights.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1).expand(all_layer_embedding.size())
        weighted_average = (weight_factor*all_layer_embedding).sum(dim=0) / self.layer_weights.sum()

        return weighted_average

# Fast Gradient Method (FGM)
Reference :

https://www.kaggle.com/c/tweet-sentiment-extraction/discussion/143764

In [8]:
class FGM():
    def __init__(self, model):
        self.model = model
        self.backup = {}

    def attack(self, epsilon = 1., emb_name = 'word_embeddings'):
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                self.backup[name] = param.data.clone()
                norm = torch.norm(param.grad)
                if norm != 0:
                    r_at = epsilon * param.grad / norm
                    param.data.add_(r_at)

    def restore(self, emb_name = 'word_embeddings'):
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                assert name in self.backup
                param.data = self.backup[name]
            self.backup = {}


# Train function
* FGM
* Unscale optimizer

In [9]:
def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    losses = AverageMeter()
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled = CFG.apex)
    start = end = time.time()
    global_step = 0
    if CFG.fgm:
        fgm = FGM(model)
#     if CFG.awp:
#         awp = AWP(model,
#                   optimizer, 
#                   adv_lr = CFG.adv_lr, 
#                   adv_eps = CFG.adv_eps, 
#                   scaler = scaler)
    for step, (inputs, labels) in enumerate(train_loader):
        attention_mask = inputs['attention_mask'].to(device)
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled = CFG.apex):
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        if CFG.unscale:
            scaler.unscale_(optimizer)
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        
        #Fast Gradient Method (FGM)
        if CFG.fgm:
            fgm.attack()
            with torch.cuda.amp.autocast(enabled = CFG.apex):
                y_preds = model(inputs)
                loss_adv = criterion(y_preds, labels)
                loss_adv.backward()
            fgm.restore()
            
        #Adversarial Weight Perturbation (AWP)
#         if CFG.awp:
#             loss_awp = awp.attack_backward(inputs, labels, attention_mask, step + 1)
#             loss_awp.backward()
#             awp._restore()
        
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader) - 1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f} '
                  'LR: {lr:.8f} '
                  .format(epoch + 1, step, len(train_loader), remain = timeSince(start, float(step + 1)/len(train_loader)),
                          loss = losses,
                          grad_norm = grad_norm,
                          lr = scheduler.get_lr()[0]
                         )
                 )
    return losses.avg

# Valid function

In [10]:
def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader) - 1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss = losses,
                          remain = timeSince(start, float(step + 1) / len(valid_loader))
                         )
                 )
    return losses.avg, np.concatenate(preds)

# Logger

In [11]:
LOGGER = get_logger()
LOGGER.info(f'OUTPUT_DIR: {CFG.OUTPUT_DIR}')

OUTPUT_DIR: /home/jupyter/output/ex/deberta-v3-base/018/202211261205/


# Tokenizer

In [12]:
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.model)
#CFG.tokenizer.save_pretrained(CFG.OUTPUT_DIR + 'tokenizer')

#max_len
lengths = []
tk0 = tqdm(CFG.df_train['full_text'].fillna('').values, total = len(CFG.df_train))
for text in tk0:
    length = len(CFG.tokenizer(text, add_special_tokens = False)['input_ids'])
    lengths.append(length)
CFG.max_len = max(lengths) + 2
LOGGER.info(f'max_len: {CFG.max_len}')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/3911 [00:00<?, ?it/s]

max_len: 1428


# Dataset

In [13]:
class FB3TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.texts = df['full_text'].values
        self.labels = df[cfg.target_cols].values
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.texts[item])
        label = torch.tensor(self.labels[item], dtype = torch.float)
        return inputs, label

# Model

* Initializing module (normal, xavier_uniform, xavier_normal, kaiming_uniform, kaiming_normal, orthogonal) 
* Freeze lower layer when you use large model (v2-xlarge, funnnel, etc.)

In [14]:
class FB3Model(nn.Module):
    def __init__(self, CFG, config_path = None, pretrained = False):
        super().__init__()
        self.CFG = CFG
        if config_path is None:
            self.config = AutoConfig.from_pretrained(CFG.model, ouput_hidden_states = True)
            #self.config.save_pretrained(CFG.OUTPUT_DIR + 'config')
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
        else:
            self.config = torch.load(config_path)
            
        LOGGER.info(self.config)
        
        if pretrained:
            self.model = AutoModel.from_pretrained(CFG.model, config=self.config)
            #self.model.save_pretrained(CFG.OUTPUT_DIR + 'model')
        else:
            self.model = AutoModel(self.config)
            
        if self.CFG.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()
            
        if CFG.pooling == 'mean':
            self.pool = MeanPooling()
        elif CFG.pooling == 'max':
            self.pool = MaxPooling()
        elif CFG.pooling == 'min':
            self.pool = MinPooling()
        elif CFG.pooling == 'attention':
            self.pool = AttentionPooling(self.config.hidden_size)
        elif CFG.pooling == 'weightedlayer':
            self.pool = WeightedLayerPooling(self.config.num_hidden_layers, layer_start = CFG.layer_start, layer_weights = None)        
        
        self.fc = nn.Linear(self.config.hidden_size, self.CFG.n_targets)
        self._init_weights(self.fc)
        
        if 'deberta-v2-xxlarge' in CFG.model:
            self.model.embeddings.requires_grad_(False)
            self.model.encoder.layer[:24].requires_grad_(False)
        if 'deberta-v2-xlarge' in CFG.model:
            self.model.embeddings.requires_grad_(False)
            self.model.encoder.layer[:12].requires_grad_(False)
        if 'funnel-transformer-xlarge' in CFG.model:
            self.model.embeddings.requires_grad_(False)
            self.model.encoder.blocks[:1].requires_grad_(False)
        if 'funnel-transformer-large' in CFG.model:
            self.model.embeddings.requires_grad_(False)
            self.model.encoder.blocks[:1].requires_grad_(False)
        if 'deberta-large' in CFG.model:
            self.model.embeddings.requires_grad_(False)
            self.model.encoder.layer[:16].requires_grad_(False)
        if 'deberta-xlarge' in CFG.model:
            self.model.embeddings.requires_grad_(False)
            self.model.encoder.layer[:36].requires_grad_(False)
            
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            if CFG.init_weight == 'normal':
                module.weight.data.normal_(mean = 0.0, std = self.config.initializer_range)
            elif CFG.init_weight == 'xavier_uniform':
                module.weight.data = nn.init.xavier_uniform_(module.weight.data)
            elif CFG.init_weight == 'xavier_normal':
                module.weight.data = nn.init.xavier_normal_(module.weight.data)
            elif CFG.init_weight == 'kaiming_uniform':
                module.weight.data = nn.init.kaiming_uniform_(module.weight.data)
            elif CFG.init_weight == 'kaiming_normal':
                module.weight.data = nn.init.kaiming_normal_(module.weight.data)
            elif CFG.init_weight == 'orthogonal':
                module.weight.data = nn.init.orthogonal_(module.weight.data)
                
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            if CFG.init_weight == 'normal':
                module.weight.data.normal_(mean = 0.0, std = self.config.initializer_range)
            elif CFG.init_weight == 'xavier_uniform':
                module.weight.data = nn.init.xavier_uniform_(module.weight.data)
            elif CFG.init_weight == 'xavier_normal':
                module.weight.data = nn.init.xavier_normal_(module.weight.data)
            elif CFG.init_weight == 'kaiming_uniform':
                module.weight.data = nn.init.kaiming_uniform_(module.weight.data)
            elif CFG.init_weight == 'kaiming_normal':
                module.weight.data = nn.init.kaiming_normal_(module.weight.data)
            elif CFG.init_weight == 'orthogonal':
                module.weight.data = nn.init.orthogonal_(module.weight.data)
                
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
    
    def feature(self, inputs):
        outputs = self.model(**inputs)
        if CFG.pooling != 'weightedlayer':
            last_hidden_states = outputs[0]
            feature = self.pool(last_hidden_states, inputs['attention_mask'])
        else:
            all_layer_embeddings = outputs[1]
            feature = self.pool(all_layer_embeddings)
            
        return feature
    
    def forward(self, inputs):
        feature = self.feature(inputs)
        outout = self.fc(feature)
        return outout

# Train
* Re-initializing upper layer (normal, xavier_uniform, xavier_normal, kaiming_uniform, kaiming_normal, orthogonal) 
* Layer-Wise Learning Rate Dacay (https://www.kaggle.com/code/rhtsingh/on-stability-of-few-sample-transformer-fine-tuning?scriptVersionId=67176591&cellId=29)
* Loss function, SmoothL1 or RMSE

In [15]:
def re_initializing_layer(model, config, layer_num):
    for module in model.model.encoder.layer[-layer_num:].modules():
        if isinstance(module, nn.Linear):

            if CFG.init_weight == 'normal':
                module.weight.data.normal_(mean=0.0, std=config.initializer_range)
            elif CFG.init_weight == 'xavier_uniform':
                module.weight.data = nn.init.xavier_uniform_(module.weight.data)
            elif CFG.init_weight == 'xavier_normal':
                module.weight.data = nn.init.xavier_normal_(module.weight.data)
            elif CFG.init_weight == 'kaiming_uniform':
                module.weight.data = nn.init.kaiming_uniform_(module.weight.data)
            elif CFG.init_weight == 'kaiming_normal':
                module.weight.data = nn.init.kaiming_normal_(module.weight.data)
            elif CFG.init_weight == 'orthogonal':
                module.weight.data = nn.init.orthogonal_(module.weight.data) 
                
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            if CFG.init_weight == 'normal':
                module.weight.data.normal_(mean=0.0, std=config.initializer_range)
            elif CFG.init_weight == 'xavier_uniform':
                module.weight.data = nn.init.xavier_uniform_(module.weight.data)
            elif CFG.init_weight == 'xavier_normal':
                module.weight.data = nn.init.xavier_normal_(module.weight.data)
            elif CFG.init_weight == 'kaiming_uniform':
                module.weight.data = nn.init.kaiming_uniform_(module.weight.data)
            elif CFG.init_weight == 'kaiming_normal':
                module.weight.data = nn.init.kaiming_normal_(module.weight.data)
            elif CFG.init_weight == 'orthogonal':
                module.weight.data = nn.init.orthogonal_(module.weight.data)
                
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
    return model   

def train_loop(folds, fold):
    LOGGER.info(f"========== fold: {fold} training ==========")
    
    train_folds = folds[folds['fold'] != fold].reset_index(drop = True)
    valid_folds = folds[folds['fold'] == fold].reset_index(drop = True)
    valid_labels = valid_folds[CFG.target_cols].values
    
    train_dataset = FB3TrainDataset(CFG, train_folds)
    valid_dataset = FB3TrainDataset(CFG, valid_folds)
    
    train_loader = DataLoader(train_dataset,
                              batch_size = CFG.batch_size,
                              shuffle = True, 
                              num_workers = CFG.num_workers,
                              pin_memory = True, 
                              drop_last = True
                             )
    valid_loader = DataLoader(valid_dataset,
                              batch_size = CFG.batch_size * 2,
                              shuffle=False,
                              num_workers=CFG.num_workers,
                              pin_memory=True, 
                              drop_last=False)

    model = FB3Model(CFG, config_path = None, pretrained = True)
    if CFG.reinit:
        model = re_initializing_layer(model, model.config, CFG.reinit_n)
        
    #os.makedirs(CFG.OUTPUT_DIR + 'config/', exist_ok = True)
    #torch.save(model.config, CFG.OUTPUT_DIR + 'config/config.pth')
    model.to(CFG.device)
    
    def get_optimizer_params(model,
                             encoder_lr,
                             decoder_lr,
                             weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr,
             'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr,
             'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr,
             'weight_decay': 0.0}
        ]
        return optimizer_parameters
    
    #llrd
    def get_optimizer_grouped_parameters(model, 
                                         layerwise_lr,
                                         layerwise_weight_decay,
                                         layerwise_lr_decay):
        
        no_decay = ["bias", "LayerNorm.weight"]
        # initialize lr for task specific layer
        optimizer_grouped_parameters = [{"params": [p for n, p in model.named_parameters() if "model" not in n],
                                         "weight_decay": 0.0,
                                         "lr": layerwise_lr,
                                        },]
        # initialize lrs for every layer
        layers = [model.model.embeddings] + list(model.model.encoder.layer)
        layers.reverse()
        lr = layerwise_lr
        for layer in layers:
            optimizer_grouped_parameters += [{"params": [p for n, p in layer.named_parameters() if not any(nd in n for nd in no_decay)],
                                              "weight_decay": layerwise_weight_decay,
                                              "lr": lr,
                                             },
                                             {"params": [p for n, p in layer.named_parameters() if any(nd in n for nd in no_decay)],
                                              "weight_decay": 0.0,
                                              "lr": lr,
                                             },]
            lr *= layerwise_lr_decay
        return optimizer_grouped_parameters
    
    if CFG.llrd:
        from transformers import AdamW
        grouped_optimizer_params = get_optimizer_grouped_parameters(model, 
                                                                    CFG.layerwise_lr, 
                                                                    CFG.layerwise_weight_decay, 
                                                                    CFG.layerwise_lr_decay)
        optimizer = AdamW(grouped_optimizer_params,
                          lr = CFG.layerwise_lr,
                          eps = CFG.layerwise_adam_epsilon,
                          correct_bias = not CFG.layerwise_use_bertadam)
    else:
        from torch.optim import AdamW
        optimizer_parameters = get_optimizer_params(model,
                                                    encoder_lr=CFG.encoder_lr, 
                                                    decoder_lr=CFG.decoder_lr,
                                                    weight_decay=CFG.weight_decay)
        optimizer = AdamW(optimizer_parameters, 
                          lr=CFG.encoder_lr,
                          eps=CFG.eps,
                          betas=CFG.betas)
    
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, 
                num_warmup_steps = cfg.num_warmup_steps, 
                num_training_steps = num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, 
                num_warmup_steps = cfg.num_warmup_steps, 
                num_training_steps = num_train_steps,
                num_cycles = cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)
    
    if CFG.loss_func == 'SmoothL1':
        criterion = nn.SmoothL1Loss(reduction='mean')
    elif CFG.loss_func == 'RMSE':
        criterion = RMSELoss(reduction='mean')
    
    best_score = np.inf
    best_train_loss = np.inf
    best_val_loss = np.inf
    
    epoch_list = []
    epoch_avg_loss_list = []
    epoch_avg_val_loss_list = []
    epoch_score_list = []
    epoch_scores_list = []
    patience = CFG.patience
    for epoch in range(CFG.epochs):
        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, CFG.device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, CFG.device)
        
        # scoring
        score, scores = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time
        
        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}  Scores: {scores}')
        
        epoch_list.append(epoch+1)
        epoch_avg_loss_list.append(avg_loss)
        epoch_avg_val_loss_list.append(avg_val_loss)
        epoch_score_list.append(score)
        epoch_scores_list.append(scores)
        
        if best_score > score:
            patience = CFG.patience
            best_score = score
            best_train_loss = avg_loss
            best_val_loss = avg_val_loss
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        CFG.OUTPUT_DIR + f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")
        else:
            patience -= 1
            if patience<=0:
                break
            
        if CFG.save_all_models:
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        CFG.OUTPUT_DIR + f"{CFG.model.replace('/', '-')}_fold{fold}_epoch{epoch + 1}.pth")

    predictions = torch.load(CFG.OUTPUT_DIR + f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location = torch.device('cpu'))['predictions']
    valid_folds[[f"pred_{c}" for c in CFG.target_cols]] = predictions
    
    df_epoch = pd.DataFrame({'epoch' : epoch_list,
                             'MCRMSE' : epoch_score_list,
                             'train_loss' : epoch_avg_loss_list, 
                             'val_loss' : epoch_avg_val_loss_list})
    df_scores = pd.DataFrame(epoch_scores_list)
    df_scores.columns = CFG.target_cols
    
    
    torch.cuda.empty_cache()
    gc.collect()
    
    return best_train_loss, best_val_loss, valid_folds, pd.concat([df_epoch, df_scores], axis = 1)

# Run !

In [16]:
def get_result(oof_df, fold, best_train_loss, best_val_loss):
    labels = oof_df[CFG.target_cols].values
    preds = oof_df[[f"pred_{c}" for c in CFG.target_cols]].values
    score, scores = get_score(labels, preds)
    LOGGER.info(f'Score: {score:<.4f}  Scores: {scores}')
    _output_log = pd.DataFrame([f"{model_name}-{CFG.file_name}", CFG.model, CFG.cv_seed, CFG.seed, fold, 'best', score, best_train_loss, best_val_loss] + scores).T
    _output_log.columns = ['file', 'model', 'cv_seed', 'seed', 'fold', 'epoch', 'MCRMSE', 'train_loss', 'val_loss'] + CFG.target_cols
    return _output_log

In [None]:
if CFG.train:
    output_log = pd.DataFrame()
    oof_df = pd.DataFrame()
    train_loss_list = []
    val_loss_list = []
    for fold in range(CFG.n_fold):
        if fold in CFG.trn_fold:
            best_train_loss, best_val_loss, _oof_df, df_epoch_scores = train_loop(CFG.df_train, fold)
            train_loss_list.append(best_train_loss)
            val_loss_list.append(best_val_loss)
            oof_df = pd.concat([oof_df, _oof_df])
            oof_df.to_pickle(CFG.OUTPUT_DIR+f'oof_df_fold{fold}.pkl', protocol = 4)
            LOGGER.info(f"========== fold: {fold} result ==========")

            df_epoch_scores['file'] = f"{model_name}-{CFG.file_name}"
            df_epoch_scores['model'] = CFG.model
            df_epoch_scores['cv_seed'] = CFG.cv_seed
            df_epoch_scores['seed'] = CFG.seed
            df_epoch_scores['fold'] = fold
            df_epoch_scores = df_epoch_scores[['file', 'model', 'cv_seed', 'seed', 'fold', 'epoch', 'MCRMSE', 'train_loss', 'val_loss'] + CFG.target_cols]

            _output_log = get_result(_oof_df, fold, best_train_loss, best_val_loss)
            output_log = pd.concat([output_log, df_epoch_scores, _output_log])

    oof_df = oof_df.reset_index(drop=True)
    LOGGER.info(f"========== CV ==========")
    _output_log = get_result(oof_df, 'OOF', np.mean(train_loss_list), np.mean(val_loss_list))
    output_log = pd.concat([output_log, _output_log])
    output_log.to_csv(CFG.OUTPUT_DIR+f"{model_name}-{CFG.file_name}.csv", index=False)
    oof_df.to_pickle(CFG.OUTPUT_DIR+'oof_df.pkl', protocol = 4)

DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.20.1",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.LayerNorm.bi

Epoch: [1][0/440] Elapsed 0m 4s (remain 34m 0s) Loss: 2.2695(2.2695) Grad: inf LR: 0.00005000 
Epoch: [1][20/440] Elapsed 1m 19s (remain 26m 30s) Loss: 0.2270(1.0698) Grad: 138387.9219 LR: 0.00005000 
Epoch: [1][40/440] Elapsed 2m 38s (remain 25m 45s) Loss: 0.2772(0.6534) Grad: 132777.1094 LR: 0.00004999 
Epoch: [1][60/440] Elapsed 3m 48s (remain 23m 40s) Loss: 0.1343(0.4993) Grad: 84122.3438 LR: 0.00004998 
Epoch: [1][80/440] Elapsed 4m 51s (remain 21m 31s) Loss: 0.1921(0.4116) Grad: 87729.5469 LR: 0.00004996 
Epoch: [1][100/440] Elapsed 5m 54s (remain 19m 48s) Loss: 0.1949(0.3570) Grad: 179243.4688 LR: 0.00004994 
Epoch: [1][120/440] Elapsed 7m 3s (remain 18m 36s) Loss: 0.0897(0.3185) Grad: 110793.8281 LR: 0.00004991 
Epoch: [1][140/440] Elapsed 8m 16s (remain 17m 33s) Loss: 0.1277(0.2924) Grad: 145824.0000 LR: 0.00004987 
Epoch: [1][160/440] Elapsed 9m 16s (remain 16m 4s) Loss: 0.1263(0.2734) Grad: 103702.2891 LR: 0.00004984 
Epoch: [1][180/440] Elapsed 10m 29s (remain 15m 1s) Loss:

Epoch 1 - avg_train_loss: 0.1792  avg_val_loss: 0.1062  time: 1569s
Epoch 1 - Score: 0.4615  Scores: [0.497933357925829, 0.4580529392347574, 0.4377707520117906, 0.43194534715514077, 0.486689103840643, 0.45686170006645876]
Epoch 1 - Save Best Score: 0.4615 Model


EVAL: [24/25] Elapsed 0m 56s (remain 0m 0s) Loss: 0.1272(0.1062) 
Epoch: [2][0/440] Elapsed 0m 2s (remain 20m 3s) Loss: 0.0749(0.0749) Grad: 91526.0547 LR: 0.00004877 
Epoch: [2][20/440] Elapsed 1m 5s (remain 21m 41s) Loss: 0.1776(0.1016) Grad: 275864.6250 LR: 0.00004866 
Epoch: [2][40/440] Elapsed 2m 13s (remain 21m 38s) Loss: 0.0973(0.0994) Grad: 141658.9219 LR: 0.00004854 
Epoch: [2][60/440] Elapsed 3m 14s (remain 20m 5s) Loss: 0.0982(0.1003) Grad: 169433.0000 LR: 0.00004842 
Epoch: [2][80/440] Elapsed 4m 29s (remain 19m 56s) Loss: 0.1439(0.1017) Grad: 264769.3125 LR: 0.00004829 
Epoch: [2][100/440] Elapsed 5m 42s (remain 19m 10s) Loss: 0.0863(0.1002) Grad: 161598.4375 LR: 0.00004816 
Epoch: [2][120/440] Elapsed 7m 6s (remain 18m 45s) Loss: 0.1533(0.1042) Grad: 380828.1875 LR: 0.00004802 
Epoch: [2][140/440] Elapsed 8m 9s (remain 17m 18s) Loss: 0.0914(0.1035) Grad: 74954.7891 LR: 0.00004788 
Epoch: [2][160/440] Elapsed 9m 32s (remain 16m 32s) Loss: 0.1432(0.1052) Grad: 271548.8438 L

Epoch 2 - avg_train_loss: 0.1104  avg_val_loss: 0.1057  time: 1588s
Epoch 2 - Score: 0.4606  Scores: [0.5020590029320753, 0.44002857564349224, 0.46289341631132136, 0.43330339210009633, 0.4776042726685252, 0.4475342264953566]
Epoch 2 - Save Best Score: 0.4606 Model


EVAL: [24/25] Elapsed 0m 56s (remain 0m 0s) Loss: 0.1265(0.1057) 
Epoch: [3][0/440] Elapsed 0m 2s (remain 15m 43s) Loss: 0.0851(0.0851) Grad: 105872.6875 LR: 0.00004521 
Epoch: [3][20/440] Elapsed 1m 13s (remain 24m 27s) Loss: 0.0936(0.1040) Grad: 290185.3438 LR: 0.00004500 
Epoch: [3][40/440] Elapsed 2m 20s (remain 22m 43s) Loss: 0.0948(0.1000) Grad: 201676.8125 LR: 0.00004479 
Epoch: [3][60/440] Elapsed 3m 28s (remain 21m 34s) Loss: 0.0842(0.0979) Grad: 142457.3906 LR: 0.00004457 
Epoch: [3][80/440] Elapsed 4m 35s (remain 20m 23s) Loss: 0.0971(0.0985) Grad: 233256.2812 LR: 0.00004434 
Epoch: [3][100/440] Elapsed 5m 36s (remain 18m 48s) Loss: 0.1229(0.1046) Grad: 129880.8594 LR: 0.00004411 
Epoch: [3][120/440] Elapsed 6m 43s (remain 17m 42s) Loss: 0.1342(0.1046) Grad: 198779.5625 LR: 0.00004388 
Epoch: [3][140/440] Elapsed 8m 2s (remain 17m 4s) Loss: 0.0859(0.1047) Grad: 142926.6250 LR: 0.00004365 
Epoch: [3][160/440] Elapsed 8m 54s (remain 15m 26s) Loss: 0.1315(0.1042) Grad: 242556.1

Epoch 3 - avg_train_loss: 0.1017  avg_val_loss: 0.1081  time: 1592s
Epoch 3 - Score: 0.4650  Scores: [0.5065377419168912, 0.45017174883345557, 0.4404302190888805, 0.42220903956865646, 0.5305050826681599, 0.4401421915393284]


EVAL: [24/25] Elapsed 0m 56s (remain 0m 0s) Loss: 0.1309(0.1081) 
Epoch: [4][0/440] Elapsed 0m 4s (remain 36m 15s) Loss: 0.0859(0.0859) Grad: 140206.9531 LR: 0.00003968 
Epoch: [4][20/440] Elapsed 1m 14s (remain 24m 51s) Loss: 0.1080(0.0934) Grad: 196579.3594 LR: 0.00003939 
Epoch: [4][40/440] Elapsed 2m 34s (remain 25m 4s) Loss: 0.1012(0.0941) Grad: 166399.8750 LR: 0.00003910 
Epoch: [4][60/440] Elapsed 3m 34s (remain 22m 12s) Loss: 0.1678(0.0963) Grad: 477854.2188 LR: 0.00003880 
Epoch: [4][80/440] Elapsed 4m 35s (remain 20m 19s) Loss: 0.0968(0.0945) Grad: 198452.7969 LR: 0.00003850 
Epoch: [4][100/440] Elapsed 5m 45s (remain 19m 20s) Loss: 0.0918(0.0957) Grad: 205710.4062 LR: 0.00003820 
Epoch: [4][120/440] Elapsed 6m 55s (remain 18m 15s) Loss: 0.1139(0.0949) Grad: 119073.1016 LR: 0.00003789 
Epoch: [4][140/440] Elapsed 7m 58s (remain 16m 54s) Loss: 0.0706(0.0946) Grad: 182033.6250 LR: 0.00003759 
Epoch: [4][160/440] Elapsed 8m 59s (remain 15m 34s) Loss: 0.0740(0.0936) Grad: 94611.6

Epoch 4 - avg_train_loss: 0.0947  avg_val_loss: 0.1045  time: 1586s
Epoch 4 - Score: 0.4579  Scores: [0.49635157382223305, 0.44389586284287236, 0.43954999322014077, 0.4368815032432967, 0.4836569392193099, 0.44682011138864586]
Epoch 4 - Save Best Score: 0.4579 Model


EVAL: [24/25] Elapsed 0m 56s (remain 0m 0s) Loss: 0.1271(0.1045) 
Epoch: [5][0/440] Elapsed 0m 5s (remain 42m 44s) Loss: 0.0620(0.0620) Grad: 103446.1250 LR: 0.00003271 
Epoch: [5][20/440] Elapsed 1m 22s (remain 27m 23s) Loss: 0.0730(0.0834) Grad: 327441.7188 LR: 0.00003237 
Epoch: [5][40/440] Elapsed 2m 26s (remain 23m 46s) Loss: 0.0683(0.0810) Grad: 97659.4531 LR: 0.00003203 
Epoch: [5][60/440] Elapsed 3m 31s (remain 21m 56s) Loss: 0.0884(0.0815) Grad: 106604.0938 LR: 0.00003168 
Epoch: [5][80/440] Elapsed 4m 39s (remain 20m 39s) Loss: 0.0753(0.0816) Grad: 154218.5312 LR: 0.00003134 
Epoch: [5][100/440] Elapsed 5m 57s (remain 20m 0s) Loss: 0.0841(0.0824) Grad: 238409.0625 LR: 0.00003099 
Epoch: [5][120/440] Elapsed 7m 4s (remain 18m 40s) Loss: 0.0836(0.0821) Grad: 177140.2500 LR: 0.00003065 
Epoch: [5][140/440] Elapsed 8m 14s (remain 17m 28s) Loss: 0.0879(0.0824) Grad: 176347.3281 LR: 0.00003030 
Epoch: [5][160/440] Elapsed 9m 24s (remain 16m 17s) Loss: 0.0462(0.0816) Grad: 126248.05

Epoch 5 - avg_train_loss: 0.0823  avg_val_loss: 0.1131  time: 1594s
Epoch 5 - Score: 0.4760  Scores: [0.5254026594333728, 0.46081549119107956, 0.4272209124245381, 0.44386539034972533, 0.5357141478016345, 0.4629112701700276]


EVAL: [24/25] Elapsed 0m 55s (remain 0m 0s) Loss: 0.1322(0.1131) 
Epoch: [6][0/440] Elapsed 0m 2s (remain 15m 56s) Loss: 0.0784(0.0784) Grad: 158865.6094 LR: 0.00002498 
Epoch: [6][20/440] Elapsed 1m 18s (remain 26m 2s) Loss: 0.0501(0.0759) Grad: 52537.1211 LR: 0.00002463 
Epoch: [6][40/440] Elapsed 2m 20s (remain 22m 48s) Loss: 0.0497(0.0719) Grad: 78847.4219 LR: 0.00002427 
Epoch: [6][60/440] Elapsed 3m 20s (remain 20m 48s) Loss: 0.0757(0.0699) Grad: 111522.1094 LR: 0.00002391 
Epoch: [6][80/440] Elapsed 4m 27s (remain 19m 46s) Loss: 0.0997(0.0713) Grad: 444331.3125 LR: 0.00002355 
Epoch: [6][100/440] Elapsed 5m 33s (remain 18m 38s) Loss: 0.0797(0.0722) Grad: 156342.5469 LR: 0.00002320 
Epoch: [6][120/440] Elapsed 6m 43s (remain 17m 43s) Loss: 0.0431(0.0711) Grad: 147014.2031 LR: 0.00002284 
Epoch: [6][140/440] Elapsed 8m 7s (remain 17m 12s) Loss: 0.0647(0.0704) Grad: 140927.5156 LR: 0.00002249 
Epoch: [6][160/440] Elapsed 9m 16s (remain 16m 4s) Loss: 0.0681(0.0703) Grad: 194212.8594

Epoch 6 - avg_train_loss: 0.0696  avg_val_loss: 0.1216  time: 1603s
Epoch 6 - Score: 0.4937  Scores: [0.5620672456574626, 0.4786949803824272, 0.426770851813223, 0.4741790272816547, 0.5310489776935474, 0.4893168826446271]


EVAL: [24/25] Elapsed 0m 56s (remain 0m 0s) Loss: 0.1409(0.1216) 
Epoch: [7][0/440] Elapsed 0m 4s (remain 32m 8s) Loss: 0.0427(0.0427) Grad: 116048.3594 LR: 0.00001726 
Epoch: [7][20/440] Elapsed 1m 11s (remain 23m 55s) Loss: 0.0503(0.0594) Grad: 90980.4688 LR: 0.00001692 
Epoch: [7][40/440] Elapsed 2m 42s (remain 26m 25s) Loss: 0.0841(0.0594) Grad: 93334.7578 LR: 0.00001658 
Epoch: [7][60/440] Elapsed 3m 38s (remain 22m 36s) Loss: 0.0625(0.0594) Grad: 155003.2969 LR: 0.00001625 
Epoch: [7][80/440] Elapsed 4m 40s (remain 20m 42s) Loss: 0.0555(0.0602) Grad: 146547.9062 LR: 0.00001591 
Epoch: [7][100/440] Elapsed 5m 57s (remain 20m 0s) Loss: 0.0544(0.0590) Grad: 89495.2656 LR: 0.00001558 
Epoch: [7][120/440] Elapsed 7m 5s (remain 18m 40s) Loss: 0.0488(0.0591) Grad: 105176.9141 LR: 0.00001525 
Epoch: [7][140/440] Elapsed 8m 4s (remain 17m 7s) Loss: 0.0799(0.0587) Grad: 125500.1562 LR: 0.00001492 
Epoch: [7][160/440] Elapsed 9m 15s (remain 16m 3s) Loss: 0.0748(0.0592) Grad: 108187.9219 LR:

Epoch 7 - avg_train_loss: 0.0588  avg_val_loss: 0.1115  time: 1595s
Epoch 7 - Score: 0.4727  Scores: [0.520489621451889, 0.4564917353293964, 0.42745501665200136, 0.4554431343182676, 0.5141957908735811, 0.462262860241757]


EVAL: [24/25] Elapsed 0m 56s (remain 0m 0s) Loss: 0.1349(0.1115) 


Score: 0.4579  Scores: [0.49635157382223305, 0.44389586284287236, 0.43954999322014077, 0.4368815032432967, 0.4836569392193099, 0.44682011138864586]
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.20.1",
  "type_vocab_size": 0,
  "vocab_size": 128100


Epoch: [1][0/440] Elapsed 0m 3s (remain 23m 25s) Loss: 2.4357(2.4357) Grad: inf LR: 0.00005000 
Epoch: [1][20/440] Elapsed 1m 8s (remain 22m 56s) Loss: 0.2112(1.3990) Grad: 107680.7656 LR: 0.00005000 
Epoch: [1][40/440] Elapsed 2m 18s (remain 22m 28s) Loss: 0.2247(0.8422) Grad: 73817.8203 LR: 0.00004999 
Epoch: [1][60/440] Elapsed 3m 36s (remain 22m 25s) Loss: 0.1771(0.6293) Grad: 96643.8281 LR: 0.00004998 
Epoch: [1][80/440] Elapsed 4m 47s (remain 21m 12s) Loss: 0.1262(0.5099) Grad: 109814.9297 LR: 0.00004996 
Epoch: [1][100/440] Elapsed 6m 7s (remain 20m 32s) Loss: 0.1151(0.4355) Grad: 61245.5508 LR: 0.00004994 
Epoch: [1][120/440] Elapsed 7m 12s (remain 18m 59s) Loss: 0.1929(0.3862) Grad: 119653.7500 LR: 0.00004991 
Epoch: [1][140/440] Elapsed 8m 15s (remain 17m 30s) Loss: 0.1310(0.3501) Grad: 114557.4844 LR: 0.00004987 
Epoch: [1][160/440] Elapsed 9m 20s (remain 16m 10s) Loss: 0.1491(0.3240) Grad: 173079.8125 LR: 0.00004984 
Epoch: [1][180/440] Elapsed 10m 35s (remain 15m 8s) Loss:

Epoch 1 - avg_train_loss: 0.1951  avg_val_loss: 0.1190  time: 1600s
Epoch 1 - Score: 0.4889  Scores: [0.5517771184942732, 0.47401060279942286, 0.4299918826940122, 0.44616610660127315, 0.5486668916329905, 0.48299980170192264]
Epoch 1 - Save Best Score: 0.4889 Model


EVAL: [24/25] Elapsed 0m 51s (remain 0m 0s) Loss: 0.0956(0.1190) 
Epoch: [2][0/440] Elapsed 0m 3s (remain 22m 7s) Loss: 0.0935(0.0935) Grad: 230293.7656 LR: 0.00004877 
Epoch: [2][20/440] Elapsed 1m 5s (remain 21m 53s) Loss: 0.0929(0.1101) Grad: 107325.9219 LR: 0.00004866 
Epoch: [2][40/440] Elapsed 2m 21s (remain 22m 55s) Loss: 0.0769(0.1053) Grad: 100848.7422 LR: 0.00004854 
Epoch: [2][60/440] Elapsed 3m 27s (remain 21m 31s) Loss: 0.1012(0.1024) Grad: 151975.7812 LR: 0.00004842 
Epoch: [2][80/440] Elapsed 4m 47s (remain 21m 15s) Loss: 0.0932(0.1043) Grad: 181441.8750 LR: 0.00004829 
Epoch: [2][100/440] Elapsed 5m 49s (remain 19m 32s) Loss: 0.0906(0.1030) Grad: 117205.2266 LR: 0.00004816 
Epoch: [2][120/440] Elapsed 6m 56s (remain 18m 18s) Loss: 0.1155(0.1033) Grad: 226675.9844 LR: 0.00004802 
Epoch: [2][140/440] Elapsed 8m 3s (remain 17m 4s) Loss: 0.1311(0.1054) Grad: 210332.2656 LR: 0.00004788 
Epoch: [2][160/440] Elapsed 9m 13s (remain 15m 59s) Loss: 0.1216(0.1048) Grad: 186858.359

Epoch 2 - avg_train_loss: 0.1072  avg_val_loss: 0.1258  time: 1602s
Epoch 2 - Score: 0.5034  Scores: [0.5147232865918544, 0.4913297159422431, 0.4580524196097426, 0.4784918609070734, 0.5725356097299256, 0.5055356041902628]


EVAL: [24/25] Elapsed 0m 51s (remain 0m 0s) Loss: 0.1029(0.1258) 
Epoch: [3][0/440] Elapsed 0m 3s (remain 29m 15s) Loss: 0.0902(0.0902) Grad: 200197.7031 LR: 0.00004521 
Epoch: [3][20/440] Elapsed 1m 26s (remain 28m 48s) Loss: 0.1287(0.1018) Grad: 198174.3281 LR: 0.00004500 
Epoch: [3][40/440] Elapsed 2m 35s (remain 25m 12s) Loss: 0.1001(0.1007) Grad: 270166.0938 LR: 0.00004479 
Epoch: [3][60/440] Elapsed 3m 42s (remain 23m 2s) Loss: 0.0589(0.0978) Grad: 121871.4531 LR: 0.00004457 
Epoch: [3][80/440] Elapsed 4m 52s (remain 21m 37s) Loss: 0.0768(0.0967) Grad: 241294.7656 LR: 0.00004434 
Epoch: [3][100/440] Elapsed 6m 8s (remain 20m 36s) Loss: 0.0579(0.0963) Grad: 151912.2812 LR: 0.00004411 
Epoch: [3][120/440] Elapsed 7m 10s (remain 18m 54s) Loss: 0.0974(0.0954) Grad: 79820.9219 LR: 0.00004388 
Epoch: [3][140/440] Elapsed 8m 30s (remain 18m 2s) Loss: 0.1350(0.0952) Grad: 143705.2812 LR: 0.00004365 
Epoch: [3][160/440] Elapsed 9m 55s (remain 17m 11s) Loss: 0.0837(0.0954) Grad: 92739.2969

Epoch 3 - avg_train_loss: 0.0970  avg_val_loss: 0.1170  time: 1609s
Epoch 3 - Score: 0.4851  Scores: [0.5004429235179227, 0.4660150503626987, 0.43343136315188274, 0.47472323922538334, 0.508561447695547, 0.5272851797019898]
Epoch 3 - Save Best Score: 0.4851 Model


EVAL: [24/25] Elapsed 0m 51s (remain 0m 0s) Loss: 0.0787(0.1170) 
Epoch: [4][0/440] Elapsed 0m 4s (remain 34m 29s) Loss: 0.0826(0.0826) Grad: 112453.2891 LR: 0.00003968 
Epoch: [4][20/440] Elapsed 1m 23s (remain 27m 53s) Loss: 0.0843(0.0884) Grad: 352638.7188 LR: 0.00003939 
Epoch: [4][40/440] Elapsed 2m 25s (remain 23m 37s) Loss: 0.0740(0.0894) Grad: 151669.7344 LR: 0.00003910 
Epoch: [4][60/440] Elapsed 3m 44s (remain 23m 17s) Loss: 0.0639(0.0867) Grad: 118498.6562 LR: 0.00003880 
Epoch: [4][80/440] Elapsed 5m 10s (remain 22m 58s) Loss: 0.0628(0.0846) Grad: 254405.7031 LR: 0.00003850 
Epoch: [4][100/440] Elapsed 6m 45s (remain 22m 40s) Loss: 0.0732(0.0839) Grad: 137016.9375 LR: 0.00003820 
Epoch: [4][120/440] Elapsed 7m 45s (remain 20m 27s) Loss: 0.0895(0.0845) Grad: 109383.9609 LR: 0.00003789 
Epoch: [4][140/440] Elapsed 9m 6s (remain 19m 19s) Loss: 0.0883(0.0859) Grad: 122874.7891 LR: 0.00003759 
Epoch: [4][160/440] Elapsed 10m 9s (remain 17m 35s) Loss: 0.0908(0.0868) Grad: 175935.

Epoch 4 - avg_train_loss: 0.0852  avg_val_loss: 0.1087  time: 1596s
Epoch 4 - Score: 0.4678  Scores: [0.5123521361764922, 0.4521896912123344, 0.4378181192704535, 0.4545032789643308, 0.47458204257259506, 0.47509480488548944]
Epoch 4 - Save Best Score: 0.4678 Model


EVAL: [24/25] Elapsed 0m 51s (remain 0m 0s) Loss: 0.0867(0.1087) 
Epoch: [5][0/440] Elapsed 0m 7s (remain 52m 28s) Loss: 0.0839(0.0839) Grad: 106244.8203 LR: 0.00003271 
Epoch: [5][20/440] Elapsed 1m 15s (remain 25m 15s) Loss: 0.0794(0.0709) Grad: 202476.9688 LR: 0.00003237 
Epoch: [5][40/440] Elapsed 2m 30s (remain 24m 20s) Loss: 0.0741(0.0728) Grad: 136727.7188 LR: 0.00003203 
Epoch: [5][60/440] Elapsed 3m 31s (remain 21m 56s) Loss: 0.0889(0.0706) Grad: 108038.3984 LR: 0.00003168 
Epoch: [5][80/440] Elapsed 4m 50s (remain 21m 26s) Loss: 0.0755(0.0700) Grad: 104546.0625 LR: 0.00003134 
Epoch: [5][100/440] Elapsed 6m 7s (remain 20m 33s) Loss: 0.1032(0.0696) Grad: 216607.2656 LR: 0.00003099 
Epoch: [5][120/440] Elapsed 7m 14s (remain 19m 6s) Loss: 0.0770(0.0705) Grad: 108935.2656 LR: 0.00003065 
Epoch: [5][140/440] Elapsed 8m 20s (remain 17m 41s) Loss: 0.0459(0.0701) Grad: 88407.4297 LR: 0.00003030 
Epoch: [5][160/440] Elapsed 9m 34s (remain 16m 34s) Loss: 0.0961(0.0708) Grad: 183286.18

Epoch 5 - avg_train_loss: 0.0714  avg_val_loss: 0.1177  time: 1602s
Epoch 5 - Score: 0.4859  Scores: [0.5421805940010731, 0.4662993066328711, 0.4443467849653943, 0.46357267361828625, 0.5040775019338716, 0.49469871017607375]


EVAL: [24/25] Elapsed 0m 51s (remain 0m 0s) Loss: 0.0883(0.1177) 
Epoch: [6][0/440] Elapsed 0m 6s (remain 47m 39s) Loss: 0.0917(0.0917) Grad: 442386.8750 LR: 0.00002498 
Epoch: [6][20/440] Elapsed 1m 23s (remain 27m 41s) Loss: 0.0657(0.0660) Grad: 111982.5234 LR: 0.00002463 
Epoch: [6][40/440] Elapsed 2m 32s (remain 24m 40s) Loss: 0.0707(0.0617) Grad: 254861.9688 LR: 0.00002427 
Epoch: [6][60/440] Elapsed 3m 43s (remain 23m 5s) Loss: 0.0443(0.0608) Grad: 49484.4570 LR: 0.00002391 
Epoch: [6][80/440] Elapsed 4m 43s (remain 20m 55s) Loss: 0.0462(0.0609) Grad: 175829.0000 LR: 0.00002355 
Epoch: [6][100/440] Elapsed 5m 54s (remain 19m 51s) Loss: 0.0906(0.0635) Grad: 198658.3438 LR: 0.00002320 
Epoch: [6][120/440] Elapsed 7m 5s (remain 18m 42s) Loss: 0.0566(0.0633) Grad: 218525.0625 LR: 0.00002284 
Epoch: [6][140/440] Elapsed 8m 8s (remain 17m 16s) Loss: 0.0522(0.0625) Grad: 111764.6094 LR: 0.00002249 
Epoch: [6][160/440] Elapsed 9m 13s (remain 15m 59s) Loss: 0.0521(0.0620) Grad: 84690.7031

Epoch 6 - avg_train_loss: 0.0594  avg_val_loss: 0.1186  time: 1606s
Epoch 6 - Score: 0.4885  Scores: [0.5421939734369357, 0.4740417187153399, 0.4369675934949415, 0.4694529235553944, 0.5070030535855489, 0.5011038841183127]


EVAL: [24/25] Elapsed 0m 51s (remain 0m 0s) Loss: 0.1166(0.1186) 
Epoch: [7][0/440] Elapsed 0m 2s (remain 21m 49s) Loss: 0.0618(0.0618) Grad: 130974.2578 LR: 0.00001726 
Epoch: [7][20/440] Elapsed 1m 16s (remain 25m 23s) Loss: 0.0400(0.0492) Grad: 79161.5312 LR: 0.00001692 
Epoch: [7][40/440] Elapsed 2m 17s (remain 22m 18s) Loss: 0.0485(0.0492) Grad: 72518.5859 LR: 0.00001658 
Epoch: [7][60/440] Elapsed 3m 39s (remain 22m 42s) Loss: 0.0532(0.0471) Grad: 112403.6172 LR: 0.00001625 
Epoch: [7][80/440] Elapsed 4m 54s (remain 21m 46s) Loss: 0.0399(0.0474) Grad: 103071.8125 LR: 0.00001591 
Epoch: [7][100/440] Elapsed 5m 57s (remain 19m 58s) Loss: 0.0544(0.0475) Grad: 306233.4375 LR: 0.00001558 
Epoch: [7][120/440] Elapsed 7m 6s (remain 18m 45s) Loss: 0.0487(0.0480) Grad: 113251.4062 LR: 0.00001525 
Epoch: [7][140/440] Elapsed 8m 10s (remain 17m 21s) Loss: 0.0428(0.0479) Grad: 124592.5859 LR: 0.00001492 
Epoch: [7][160/440] Elapsed 9m 16s (remain 16m 4s) Loss: 0.0497(0.0483) Grad: 95337.2344

Epoch 7 - avg_train_loss: 0.0479  avg_val_loss: 0.1176  time: 1602s
Epoch 7 - Score: 0.4871  Scores: [0.5226423394910589, 0.4688142976883629, 0.44874927164275896, 0.47791629297927957, 0.5158205230652946, 0.48876149642319483]


EVAL: [24/25] Elapsed 0m 51s (remain 0m 0s) Loss: 0.1091(0.1176) 


Score: 0.4678  Scores: [0.5123521361764922, 0.4521896912123344, 0.4378181192704535, 0.4545032789643308, 0.47458204257259506, 0.47509480488548944]
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.20.1",
  "type_vocab_size": 0,
  "vocab_size": 128100
}


Epoch: [1][0/440] Elapsed 0m 7s (remain 54m 40s) Loss: 2.4303(2.4303) Grad: inf LR: 0.00005000 
Epoch: [1][20/440] Elapsed 1m 21s (remain 27m 12s) Loss: 0.3809(1.1915) Grad: 229728.6562 LR: 0.00005000 
Epoch: [1][40/440] Elapsed 2m 28s (remain 24m 9s) Loss: 0.3517(0.7347) Grad: 170127.8750 LR: 0.00004999 
Epoch: [1][60/440] Elapsed 3m 51s (remain 23m 56s) Loss: 0.2488(0.5512) Grad: 159868.9219 LR: 0.00004998 
Epoch: [1][80/440] Elapsed 4m 52s (remain 21m 34s) Loss: 0.1270(0.4504) Grad: 97422.8438 LR: 0.00004996 
Epoch: [1][100/440] Elapsed 6m 19s (remain 21m 15s) Loss: 0.1498(0.3870) Grad: 116837.7891 LR: 0.00004994 
Epoch: [1][120/440] Elapsed 7m 35s (remain 20m 1s) Loss: 0.1959(0.3448) Grad: 215351.0312 LR: 0.00004991 
Epoch: [1][140/440] Elapsed 8m 49s (remain 18m 42s) Loss: 0.1490(0.3143) Grad: 143927.0938 LR: 0.00004987 
Epoch: [1][160/440] Elapsed 9m 57s (remain 17m 15s) Loss: 0.1135(0.2932) Grad: 66896.3125 LR: 0.00004984 
Epoch: [1][180/440] Elapsed 11m 4s (remain 15m 51s) Loss

In [5]:
CFG.OUTPUT_DIR = "/home/jupyter/output/ex/deberta-v3-base/018/202211261205/"

if "/" in CFG.model:
    model_name = CFG.model.split("/")[1]
else:
    model_name = CFG.model

In [6]:
if CFG.TO_KAGGLE:
    UPLOAD_DIR = CFG.OUTPUT_DIR
    EX_NO = f"{model_name}{CFG.file_name}" # 実験番号などを入れる、folderのpathにする
    USERID = 'your_id'


    def dataset_upload():
        import json
        from kaggle.api.kaggle_api_extended import KaggleApi

        id = f'{USERID}/{EX_NO}'

        dataset_metadata = {}
        dataset_metadata['id'] = id
        dataset_metadata['licenses'] = [{'name': 'CC0-1.0'}]
        dataset_metadata['title'] = f'{EX_NO}'

        with open(UPLOAD_DIR +'dataset-metadata.json', 'w') as f:
            json.dump(dataset_metadata, f, indent=4)

        api = KaggleApi()
        api.authenticate()

        # データセットがない場合
        if f'{USERID}/{EX_NO}' not in [str(d) for d in api.dataset_list(user=USERID, search=f'"{EX_NO}"')]:
            api.dataset_create_new(folder=UPLOAD_DIR,
                                   convert_to_csv=False,
                                   dir_mode='skip')
            
            
            #  #フォルダーを削除
            # if f'{USERID}/{EX_NO}' not in [str(d) for d in api.dataset_list(user=USERID, search=f'"{EX_NO}"')]:
            #     remove_files = glob.glob(OUTPUT_DIR+"*")
            #     remove_files.remove(OUTPUT_DIR+"oof_df.pkl")
            #     for file in remove_files:
            #         os.remove(file)
            #     print("folder upload")
            #                 #apiコマンドを書き込む
            #     f = open(f'{model_name}_api_command.txt', 'a')
            #     api_command = f"!kaggle datasets download -d hiroki8383/{EX_NO}\n"
            #     f.write(api_command)
            #     f.close()
            # else:
            #     print("folder not upload")
            
            
        # データセットがある場合→更新されない場合がある（後で原因追及)
        else:
            print("this folder exsits")
            # api.dataset_create_version(folder=UPLOAD_DIR,
            #                            version_notes='update',
            #                            convert_to_csv=False,
            #                            delete_old_versions=False,
            #                            dir_mode='zip')
    dataset_upload()

Starting upload for file microsoft-deberta-v3-base_fold6_best.pth


100% 704M/704M [00:19<00:00, 38.0MB/s] 


Upload successful: microsoft-deberta-v3-base_fold6_best.pth (704MB)
Starting upload for file oof_df_fold3.pkl


100% 3.69M/3.69M [00:02<00:00, 1.44MB/s]


Upload successful: oof_df_fold3.pkl (4MB)
Starting upload for file oof_df_fold6.pkl


100% 6.41M/6.41M [00:02<00:00, 2.85MB/s]


Upload successful: oof_df_fold6.pkl (6MB)
Starting upload for file oof_df_fold1.pkl


100% 1.86M/1.86M [00:02<00:00, 852kB/s]


Upload successful: oof_df_fold1.pkl (2MB)
Skipping folder: .ipynb_checkpoints; use '--dir-mode' to upload folders
Starting upload for file oof_df_fold4.pkl


100% 4.59M/4.59M [00:02<00:00, 1.87MB/s]


Upload successful: oof_df_fold4.pkl (5MB)
Starting upload for file microsoft-deberta-v3-base_fold7_best.pth


100% 704M/704M [00:21<00:00, 34.9MB/s] 


Upload successful: microsoft-deberta-v3-base_fold7_best.pth (704MB)
Starting upload for file microsoft-deberta-v3-base_fold5_best.pth


100% 704M/704M [00:20<00:00, 36.1MB/s] 


Upload successful: microsoft-deberta-v3-base_fold5_best.pth (704MB)
Starting upload for file microsoft-deberta-v3-base_fold0_best.pth


100% 704M/704M [00:16<00:00, 44.0MB/s] 


Upload successful: microsoft-deberta-v3-base_fold0_best.pth (704MB)
Starting upload for file microsoft-deberta-v3-base_fold4_best.pth


100% 704M/704M [00:20<00:00, 36.2MB/s] 


Upload successful: microsoft-deberta-v3-base_fold4_best.pth (704MB)
Starting upload for file oof_df_fold0.pkl


100% 952k/952k [00:01<00:00, 522kB/s] 


Upload successful: oof_df_fold0.pkl (952KB)
Starting upload for file oof_df_fold5.pkl


100% 5.49M/5.49M [00:02<00:00, 2.03MB/s]


Upload successful: oof_df_fold5.pkl (5MB)
Starting upload for file microsoft-deberta-v3-base_fold1_best.pth


100% 704M/704M [00:20<00:00, 35.8MB/s] 


Upload successful: microsoft-deberta-v3-base_fold1_best.pth (704MB)
Starting upload for file oof_df_fold7.pkl


100% 7.27M/7.27M [00:02<00:00, 2.97MB/s]


Upload successful: oof_df_fold7.pkl (7MB)
Starting upload for file train.log


100% 22.4k/22.4k [00:02<00:00, 10.7kB/s]


Upload successful: train.log (22KB)
Starting upload for file microsoft-deberta-v3-base_fold3_best.pth


100% 704M/704M [00:21<00:00, 34.1MB/s] 


Upload successful: microsoft-deberta-v3-base_fold3_best.pth (704MB)
Starting upload for file oof_df_fold2.pkl


100% 2.76M/2.76M [00:02<00:00, 1.34MB/s]


Upload successful: oof_df_fold2.pkl (3MB)
Starting upload for file microsoft-deberta-v3-base_fold2_best.pth


100% 704M/704M [00:17<00:00, 42.4MB/s] 


Upload successful: microsoft-deberta-v3-base_fold2_best.pth (704MB)
