#Data Augmentation

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
import os
import gc
import math
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import re

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,GroupKFold
from sklearn.metrics import log_loss,f1_score

from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
INPUT_DIR = '/content/drive/MyDrive/Competitions/Signate/Student Cup 2022/input/'
OUTPUT_DIR = '/content/drive/MyDrive/Competitions/Signate/Student Cup 2022/output/'
OUTPUT_SUB_DIR = os.path.join(OUTPUT_DIR,'Submission')
OUTPUT_MODEL_DIR = os.path.join(OUTPUT_DIR,'Model/DeBERTa-base[ver6]/')

In [6]:
class CFG:
    wandb = False
    apex = True
    model = 'microsoft/deberta-v3-base'
    seed = 42
    n_splits = 5
    max_len = 1024
    dropout = 0.1
    target_size=4
    n_accumulate=1
    print_freq = 50
    min_lr=1e-6
    scheduler = 'cosine'
    batch_size = 8
    num_workers = 2
    lr = 3e-5
    weigth_decay = 0.01
    epochs = 10
    n_fold = 5
    trn_fold = [0, 1, 2, 3, 4]
    train = True 
    num_warmup_steps = 0
    num_cycles=0.5
    debug = False
    debug_ver2 = False
    gradient_checkpointing = True
    freezing = True

In [7]:
# Loss Func
def criterion(outputs, labels):
    return nn.CrossEntropyLoss()(outputs, labels)

def softmax(z):
    assert len(z.shape) == 2
    s = np.max(z, axis=1)
    s = s[:, np.newaxis] # necessary step to do broadcasting
    e_x = np.exp(z - s)
    div = np.sum(e_x, axis=1)
    div = div[:, np.newaxis] # dito
    return e_x / div
"""
def get_score(y_true, y_pred):
    y_pred = softmax(y_pred)
    score = log_loss(y_true, y_pred)
    return round(score, 5)
"""
def get_score(outputs, labels):
    outputs = F.softmax(torch.tensor(outputs)).numpy()
    return f1_score(np.argmax(outputs,axis=1),labels ,average='macro')

def get_logger(filename=OUTPUT_DIR+'train'):
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=CFG.seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)

def prepare_input(cfg, text, text_2=None):
    inputs = cfg.tokenizer(text, text_2,
                           padding="max_length",
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           truncation=True)

    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
        
    return inputs

In [8]:
def freeze(module):
    """
    Freezes module's parameters.
    """
    
    for parameter in module.parameters():
        parameter.requires_grad = False
        
def get_freezed_parameters(module):
    """
    Returns names of freezed parameters of the given module.
    """
    
    freezed_parameters = []
    for name, parameter in module.named_parameters():
        if not parameter.requires_grad:
            freezed_parameters.append(name)
            
    return freezed_parameters

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """
    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930
    """
    
    embedding_types = ("word", "position", "token_type")
    for embedding_type in embedding_types:
        attr_name = f"{embedding_type}_embeddings"
        
        if hasattr(embeddings_path, attr_name): 
            bnb.optim.GlobalOptimManager.get_instance().register_module_override(
                getattr(embeddings_path, attr_name), 'weight', {'optim_bits': optim_bits}
            )

In [9]:
train = pd.read_csv(os.path.join(INPUT_DIR, 'train.csv'))
test = pd.read_csv(os.path.join(INPUT_DIR, 'test.csv'))
submission_df = pd.read_csv(os.path.join(INPUT_DIR, 'submit_sample.csv'))
aug = pd.read_csv(os.path.join(OUTPUT_DIR,"train_aug_fr.csv"))

display(train.head())
print(train.shape)
display(test.head())
print(test.shape)
display(aug.head())
print(aug.shape)

aug_df = aug[["id","description_aug_fr","jobflag"]]
aug_df = aug_df.rename(columns = {"description_aug_fr": "description"})

print(train.jobflag.value_counts())

Unnamed: 0,id,description,jobflag
0,0,<li>Develop cutting-edge web applications that...,3
1,1,"<li> Designs and develops high quality, scalab...",3
2,2,<li>Functions as a point person for Network St...,4
3,3,"<li> Work on the technical design, development...",3
4,4,<li>Quantify the resources required for a task...,4


(1516, 3)


Unnamed: 0,id,description
0,1516,<li>Building decision-making models and propos...
1,1517,<li>Educate homeowners on the benefits of sola...
2,1518,"<li><span>Design, develop, document, and imple..."
3,1519,<li>Apply advanced technical expertise and ski...
4,1520,<li>Project manage and deliver against our roa...


(1517, 2)


Unnamed: 0,id,description,jobflag,description_aug_fr
0,28,Build network within the Capgemini group and ...,4,Establish a network within the Capgemini Group...
1,1213,Act as the primary contact for\xa0incoming sup...,3,Take a proactive approach and a high level of ...
2,220,Develop internal reporting/visualization tool...,4,Develop internal communication and visualizati...
3,245,Foster Google’s culture and principles within ...,2,We are committed to offering opportunities for...
4,498,"Design, development, and deployment of mission...",3,"Design, development and deployment of mission ..."


(379, 4)
4    505
1    468
3    455
2     88
Name: jobflag, dtype: int64


In [10]:
def remove_tag(x):
    p = re.compile(r"<[^>]*?>")
    return p.sub('',x)

def cleaning(texts):
    clean_texts = []
    for text in texts:
        # htmlタグを削除
        text = remove_tag(text)
        #アルファベット以外をスペースに置き換え
        #clean_punc = re.sub(r'[^a-zA-Z]', ' ', text)
        clean_texts.append(text)
    return clean_texts



from text_unidecode import unidecode
from typing import Dict, List, Tuple
import codecs

def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    return error.object[error.start : error.end].encode("utf-8"), error.end


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    return error.object[error.start : error.end].decode("cp1252"), error.end

# Register the encoding and decoding error handlers for `utf-8` and `cp1252`.
codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)

def resolve_encodings_and_normalize(text: str) -> str:
    """Resolve the encoding problems and normalize the abnormal characters."""
    text = (
        text.encode("raw_unicode_escape")
        .decode("utf-8", errors="replace_decoding_with_cp1252")
        .encode("cp1252", errors="replace_encoding_with_utf8")
        .decode("utf-8", errors="replace_decoding_with_cp1252")
    )
    text = unidecode(text)
    return text

train = pd.concat([train,aug_df],axis=0)
train['description'] = cleaning(train['description'])
test['description'] = cleaning(test['description'])
train['inputs'] = train['description'].apply(lambda x : resolve_encodings_and_normalize(x))
test['inputs'] = test['description'].apply(lambda x : resolve_encodings_and_normalize(x))
train.drop_duplicates(subset=['inputs'],keep='first',inplace=True)
train = train.reset_index(drop=True)
train = train.rename(columns = {"jobflag": "label"})
train["label"] = train["label"] - 1
print(train.label.value_counts())
print(train.shape)
train

3    618
2    580
0    568
1    110
Name: label, dtype: int64
(1876, 4)


Unnamed: 0,id,description,label,inputs
0,0,Develop cutting-edge web applications that per...,2,Develop cutting-edge web applications that per...
1,1,"Designs and develops high quality, scalable a...",2,"Designs and develops high quality, scalable a..."
2,2,Functions as a point person for Network Strate...,3,Functions as a point person for Network Strate...
3,3,"Work on the technical design, development, re...",2,"Work on the technical design, development, re..."
4,4,Quantify the resources required for a task/pro...,3,Quantify the resources required for a task/pro...
...,...,...,...,...
1871,820,Provide input to D&amp training and upgrade Te...,0,Provide input to D&amp training and upgrade Te...
1872,1262,Create dashboards to automate processes and re...,0,Create dashboards to automate processes and re...
1873,824,"Work on technical projects that evaluate, test...",3,"Work on technical projects that evaluate, test..."
1874,545,Develop financial models using benchmarking an...,0,Develop financial models using benchmarking an...


In [11]:
skf = StratifiedKFold(n_splits=CFG.n_splits,shuffle=True,random_state=CFG.seed)
for fold, ( _, val_) in enumerate(skf.split(train, train.label)):
    train.loc[val_ , "kfold"] = int(fold)
    
train["kfold"] = train["kfold"].astype(int)

In [12]:
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_MODEL_DIR+'tokenizer/')
CFG.tokenizer = tokenizer
SEP = tokenizer.sep_token

Downloading tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
class Dataset(Dataset):
    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.max_len = CFG.max_len
        self.text = df['inputs'].values
        self.tokenizer = CFG.tokenizer
        self.targets = df['label'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        text = self.text[index]
        inputs = tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length = self.max_len
        )
        samples = {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
            'target': self.targets[index]
        }

        if 'token_type_ids' in inputs:
            samples['token_type_ids'] = inputs['token_type_ids']
            
        return samples

In [14]:
# Dynamic Padding (Collate)
#collate_fn = DataCollatorWithPadding(tokenizer=CFG.tokenizer)
class Collate:
    def __init__(self, tokenizer, isTrain=True):
        self.tokenizer = tokenizer
        self.isTrain = isTrain
        # self.args = args

    def __call__(self, batch):
        output = dict()
        output["input_ids"] = [sample["input_ids"] for sample in batch]
        output["attention_mask"] = [sample["attention_mask"] for sample in batch]
        if self.isTrain:
            output["target"] = [sample["target"] for sample in batch]

        # calculate max token length of this batch
        batch_max = max([len(ids) for ids in output["input_ids"]])

        # add padding
        if self.tokenizer.padding_side == "right":
            output["input_ids"] = [s + (batch_max - len(s)) * [self.tokenizer.pad_token_id] for s in output["input_ids"]]
            output["attention_mask"] = [s + (batch_max - len(s)) * [0] for s in output["attention_mask"]]
        else:
            output["input_ids"] = [(batch_max - len(s)) * [self.tokenizer.pad_token_id] + s for s in output["input_ids"]]
            output["attention_mask"] = [(batch_max - len(s)) * [0] + s for s in output["attention_mask"]]

        # convert to tensors
        output["input_ids"] = torch.tensor(output["input_ids"], dtype=torch.long)
        output["attention_mask"] = torch.tensor(output["attention_mask"], dtype=torch.long)
        if self.isTrain:
            output["target"] = torch.tensor(output["target"], dtype=torch.long)

        return output
    
collate_fn = Collate(CFG.tokenizer, isTrain=True)

In [15]:
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9) #
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

In [16]:
# ====================================================
# Model
# ====================================================
class CustomModel(nn.Module):
    def __init__(self, model_name):
        super(CustomModel, self).__init__()
        # Header (fast or normal)
        self.model = AutoModel.from_pretrained(model_name)
        
        # Gradient_checkpointing
        if CFG.gradient_checkpointing:
            (self.model).gradient_checkpointing_enable()
        
        # Freezing
        if CFG.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            CFG.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())
        
        self.config = AutoConfig.from_pretrained(model_name)
        self.drop = nn.Dropout(p=CFG.dropout)
        self.pooler = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, CFG.target_size)
        
    def forward(self, ids, mask):        
        out = self.model(input_ids=ids, 
                         attention_mask=mask,
                         output_hidden_states=False)
        out = self.pooler(out.last_hidden_state, mask)
        out = self.drop(out)
        outputs = self.fc(out)
        return outputs

In [17]:
def asMinutes(s):
    m = math.floor(s/60)
    s -= m * 60
    return "%dm %ds" % (m, s)

def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return "%s (remain %s)" % (asMinutes(s), asMinutes(rs))

def get_scheduler(cfg, optimizer, num_train_steps):
    if cfg.scheduler == 'linear':
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
        )
    elif cfg.scheduler == 'cosine':
        scheduler = get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
        )
    return scheduler

def train_one_epoch(model, optimizer, scheduler, dataloader, device, epoch):
    model.train()

    dataset_size = 0
    running_loss = 0

    start = end = time.time()

    for step, data in enumerate(dataloader):
        ids = data['input_ids'].to(device, dtype=torch.long)
        mask = data['attention_mask'].to(device, dtype=torch.long)
        targets = data['target'].to(device, dtype=torch.long)

        batch_size = ids.size(0)
        
        outputs = model(ids, mask)
        loss = criterion(outputs, targets)

        #accumulate
        loss = loss / CFG.n_accumulate 
        loss.backward()
        if (step +1) % CFG.n_accumulate == 0:
            optimizer.step()

            optimizer.zero_grad()
            if scheduler is not None:
                scheduler.step()
        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size

        end = time.time()
        
        if step % CFG.print_freq == 0 or step == (len(dataloader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  .format(epoch+1, step, len(dataloader), 
                          remain=timeSince(start, float(step+1)/len(dataloader))))

    gc.collect()

    return epoch_loss


@torch.no_grad()
def valid_one_epoch(model, dataloader, device, epoch):
    model.eval()

    dataset_size = 0
    running_loss = 0

    start = end = time.time()
    pred = []

    for step, data in enumerate(dataloader):
        ids = data['input_ids'].to(device, dtype=torch.long)
        mask = data['attention_mask'].to(device, dtype=torch.long)
        targets = data['target'].to(device, dtype=torch.long)

        batch_size = ids.size(0)
        outputs = model(ids, mask)
        loss = criterion(outputs, targets)
        pred.append(outputs.to('cpu').numpy())

        running_loss += (loss.item()* batch_size)
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size

        end = time.time()

        if step % CFG.print_freq == 0 or step == (len(dataloader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  .format(step, len(dataloader),
                          remain=timeSince(start, float(step+1)/len(dataloader))))
            
    pred = np.concatenate(pred)
            
    return epoch_loss, pred

In [18]:
def train_loop(fold):
    #wandb.watch(model, log_freq=100)

    LOGGER.info(f'-------------fold:{fold} training-------------')

    train_data = train[train.kfold != fold].reset_index(drop=True)
    valid_data = train[train.kfold == fold].reset_index(drop=True)
    valid_labels = valid_data.label.values

    trainDataset = Dataset(train_data, CFG.tokenizer, CFG.max_len)
    validDataset = Dataset(valid_data, CFG.tokenizer, CFG.max_len)

    train_loader = DataLoader(trainDataset,
                              batch_size = CFG.batch_size,
                              shuffle=True,
                              collate_fn = collate_fn,
                              num_workers = CFG.num_workers,
                              pin_memory = True,
                              drop_last=True)
    
    valid_loader = DataLoader(validDataset,
                              batch_size = CFG.batch_size*2,
                              shuffle=False,
                              collate_fn = collate_fn,
                              num_workers = CFG.num_workers,
                              pin_memory = True,
                              drop_last=False)
    
    model = CustomModel(CFG.model)
    torch.save(model.config, OUTPUT_MODEL_DIR+'config.pth')
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=CFG.lr, weight_decay=CFG.weigth_decay)
    num_train_steps = int(len(train_data) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # loop
    best_score = 0

    for epoch in range(CFG.epochs):
        start_time = time.time()

        train_epoch_loss = train_one_epoch(model, optimizer, scheduler, train_loader, device, epoch)
        valid_epoch_loss, pred = valid_one_epoch(model, valid_loader, device, epoch)

        score = get_score(pred, valid_labels)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {train_epoch_loss:.4f}  avg_val_loss: {valid_epoch_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')
        if CFG.wandb:
            wandb.log({f"[fold{fold}] epoch": epoch+1, 
                       f"[fold{fold}] avg_train_loss": train_epoch_loss, 
                       f"[fold{fold}] avg_val_loss": valid_epoch_loss,
                       f"[fold{fold}] score": score})
            
        if score > best_score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': pred},
                        OUTPUT_MODEL_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")
            
    predictions = torch.load(OUTPUT_MODEL_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_data['Consultant'] = predictions[:, 0]
    valid_data['Data scientist'] = predictions[:, 1]
    valid_data['Machine learning engineer'] = predictions[:, 2]
    valid_data['Software engineer'] = predictions[:, 3]
    
    
    temp = valid_data[['Consultant','Data scientist','Machine learning engineer','Software engineer']].values.tolist()
    print(get_score(temp, valid_data['label'].values))

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_data

In [19]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        labels = oof_df['label'].values
        preds = oof_df[['Consultant','Data scientist','Machine learning engineer','Software engineer']].values.tolist()
        score = get_score(preds, labels)
        LOGGER.info(f'Score: {score:<.4f}')
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_MODEL_DIR+'oof_df.pkl')
        oof_df.to_csv(OUTPUT_MODEL_DIR+f'oof_df.csv', index=False)

-------------fold:0 training-------------
INFO:__main__:-------------fold:0 training-------------


Downloading pytorch_model.bin:   0%|          | 0.00/354M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/187] Elapsed 0m 3s (remain 10m 14s) 
Epoch: [1][50/187] Elapsed 0m 18s (remain 0m 50s) 
Epoch: [1][100/187] Elapsed 0m 35s (remain 0m 29s) 
Epoch: [1][150/187] Elapsed 0m 50s (remain 0m 12s) 
Epoch: [1][186/187] Elapsed 1m 1s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 7s) 


Epoch 1 - avg_train_loss: 0.9883  avg_val_loss: 0.7954  time: 67s
INFO:__main__:Epoch 1 - avg_train_loss: 0.9883  avg_val_loss: 0.7954  time: 67s
Epoch 1 - Score: 0.5321
INFO:__main__:Epoch 1 - Score: 0.5321
Epoch 1 - Save Best Score: 0.5321 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.5321 Model


EVAL: [23/24] Elapsed 0m 5s (remain 0m 0s) 
Epoch: [2][0/187] Elapsed 0m 0s (remain 1m 19s) 
Epoch: [2][50/187] Elapsed 0m 16s (remain 0m 43s) 
Epoch: [2][100/187] Elapsed 0m 32s (remain 0m 27s) 
Epoch: [2][150/187] Elapsed 0m 49s (remain 0m 11s) 
Epoch: [2][186/187] Elapsed 0m 59s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 7s) 


Epoch 2 - avg_train_loss: 0.6247  avg_val_loss: 0.5873  time: 65s
INFO:__main__:Epoch 2 - avg_train_loss: 0.6247  avg_val_loss: 0.5873  time: 65s
Epoch 2 - Score: 0.7067
INFO:__main__:Epoch 2 - Score: 0.7067
Epoch 2 - Save Best Score: 0.7067 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7067 Model


EVAL: [23/24] Elapsed 0m 5s (remain 0m 0s) 
Epoch: [3][0/187] Elapsed 0m 0s (remain 1m 4s) 
Epoch: [3][50/187] Elapsed 0m 15s (remain 0m 41s) 
Epoch: [3][100/187] Elapsed 0m 30s (remain 0m 25s) 
Epoch: [3][150/187] Elapsed 0m 48s (remain 0m 11s) 
Epoch: [3][186/187] Elapsed 0m 59s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 7s) 


Epoch 3 - avg_train_loss: 0.4602  avg_val_loss: 0.6478  time: 65s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4602  avg_val_loss: 0.6478  time: 65s
Epoch 3 - Score: 0.6570
INFO:__main__:Epoch 3 - Score: 0.6570


EVAL: [23/24] Elapsed 0m 5s (remain 0m 0s) 
Epoch: [4][0/187] Elapsed 0m 0s (remain 0m 54s) 
Epoch: [4][50/187] Elapsed 0m 16s (remain 0m 42s) 
Epoch: [4][100/187] Elapsed 0m 32s (remain 0m 27s) 
Epoch: [4][150/187] Elapsed 0m 48s (remain 0m 11s) 
Epoch: [4][186/187] Elapsed 0m 59s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 7s) 


Epoch 4 - avg_train_loss: 0.3113  avg_val_loss: 0.6781  time: 65s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3113  avg_val_loss: 0.6781  time: 65s
Epoch 4 - Score: 0.6986
INFO:__main__:Epoch 4 - Score: 0.6986


EVAL: [23/24] Elapsed 0m 5s (remain 0m 0s) 
Epoch: [5][0/187] Elapsed 0m 0s (remain 1m 5s) 
Epoch: [5][50/187] Elapsed 0m 15s (remain 0m 41s) 
Epoch: [5][100/187] Elapsed 0m 31s (remain 0m 26s) 
Epoch: [5][150/187] Elapsed 0m 46s (remain 0m 11s) 
Epoch: [5][186/187] Elapsed 0m 59s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 8s) 


Epoch 5 - avg_train_loss: 0.1849  avg_val_loss: 0.6684  time: 65s
INFO:__main__:Epoch 5 - avg_train_loss: 0.1849  avg_val_loss: 0.6684  time: 65s
Epoch 5 - Score: 0.7532
INFO:__main__:Epoch 5 - Score: 0.7532
Epoch 5 - Save Best Score: 0.7532 Model
INFO:__main__:Epoch 5 - Save Best Score: 0.7532 Model


EVAL: [23/24] Elapsed 0m 5s (remain 0m 0s) 
Epoch: [6][0/187] Elapsed 0m 0s (remain 1m 11s) 
Epoch: [6][50/187] Elapsed 0m 15s (remain 0m 42s) 
Epoch: [6][100/187] Elapsed 0m 32s (remain 0m 27s) 
Epoch: [6][150/187] Elapsed 0m 47s (remain 0m 11s) 
Epoch: [6][186/187] Elapsed 0m 59s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 8s) 


Epoch 6 - avg_train_loss: 0.0929  avg_val_loss: 0.7613  time: 65s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0929  avg_val_loss: 0.7613  time: 65s
Epoch 6 - Score: 0.7724
INFO:__main__:Epoch 6 - Score: 0.7724
Epoch 6 - Save Best Score: 0.7724 Model
INFO:__main__:Epoch 6 - Save Best Score: 0.7724 Model


EVAL: [23/24] Elapsed 0m 5s (remain 0m 0s) 
Epoch: [7][0/187] Elapsed 0m 0s (remain 1m 14s) 
Epoch: [7][50/187] Elapsed 0m 14s (remain 0m 37s) 
Epoch: [7][100/187] Elapsed 0m 31s (remain 0m 26s) 
Epoch: [7][150/187] Elapsed 0m 48s (remain 0m 11s) 
Epoch: [7][186/187] Elapsed 0m 59s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 8s) 


Epoch 7 - avg_train_loss: 0.0544  avg_val_loss: 0.7698  time: 65s
INFO:__main__:Epoch 7 - avg_train_loss: 0.0544  avg_val_loss: 0.7698  time: 65s
Epoch 7 - Score: 0.7479
INFO:__main__:Epoch 7 - Score: 0.7479


EVAL: [23/24] Elapsed 0m 5s (remain 0m 0s) 
Epoch: [8][0/187] Elapsed 0m 0s (remain 1m 3s) 
Epoch: [8][50/187] Elapsed 0m 14s (remain 0m 38s) 
Epoch: [8][100/187] Elapsed 0m 30s (remain 0m 26s) 
Epoch: [8][150/187] Elapsed 0m 46s (remain 0m 11s) 
Epoch: [8][186/187] Elapsed 0m 57s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 8s) 


Epoch 8 - avg_train_loss: 0.0325  avg_val_loss: 0.8022  time: 63s
INFO:__main__:Epoch 8 - avg_train_loss: 0.0325  avg_val_loss: 0.8022  time: 63s
Epoch 8 - Score: 0.7507
INFO:__main__:Epoch 8 - Score: 0.7507


EVAL: [23/24] Elapsed 0m 5s (remain 0m 0s) 
Epoch: [9][0/187] Elapsed 0m 0s (remain 1m 21s) 
Epoch: [9][50/187] Elapsed 0m 16s (remain 0m 44s) 
Epoch: [9][100/187] Elapsed 0m 32s (remain 0m 27s) 
Epoch: [9][150/187] Elapsed 0m 47s (remain 0m 11s) 
Epoch: [9][186/187] Elapsed 0m 59s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 8s) 


Epoch 9 - avg_train_loss: 0.0276  avg_val_loss: 0.8215  time: 65s
INFO:__main__:Epoch 9 - avg_train_loss: 0.0276  avg_val_loss: 0.8215  time: 65s
Epoch 9 - Score: 0.7426
INFO:__main__:Epoch 9 - Score: 0.7426


EVAL: [23/24] Elapsed 0m 5s (remain 0m 0s) 
Epoch: [10][0/187] Elapsed 0m 0s (remain 1m 14s) 
Epoch: [10][50/187] Elapsed 0m 16s (remain 0m 45s) 
Epoch: [10][100/187] Elapsed 0m 32s (remain 0m 28s) 
Epoch: [10][150/187] Elapsed 0m 49s (remain 0m 11s) 
Epoch: [10][186/187] Elapsed 0m 59s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 8s) 


Epoch 10 - avg_train_loss: 0.0257  avg_val_loss: 0.8236  time: 65s
INFO:__main__:Epoch 10 - avg_train_loss: 0.0257  avg_val_loss: 0.8236  time: 65s
Epoch 10 - Score: 0.7467
INFO:__main__:Epoch 10 - Score: 0.7467


EVAL: [23/24] Elapsed 0m 5s (remain 0m 0s) 


Score: 0.7724
INFO:__main__:Score: 0.7724
-------------fold:1 training-------------
INFO:__main__:-------------fold:1 training-------------


0.7724305289466322


Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/187] Elapsed 0m 0s (remain 1m 19s) 
Epoch: [1][50/187] Elapsed 0m 16s (remain 0m 44s) 
Epoch: [1][100/187] Elapsed 0m 33s (remain 0m 28s) 
Epoch: [1][150/187] Elapsed 0m 50s (remain 0m 12s) 
Epoch: [1][186/187] Elapsed 1m 2s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 7s) 


Epoch 1 - avg_train_loss: 0.9382  avg_val_loss: 0.6718  time: 67s
INFO:__main__:Epoch 1 - avg_train_loss: 0.9382  avg_val_loss: 0.6718  time: 67s
Epoch 1 - Score: 0.5866
INFO:__main__:Epoch 1 - Score: 0.5866
Epoch 1 - Save Best Score: 0.5866 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.5866 Model


EVAL: [23/24] Elapsed 0m 3s (remain 0m 0s) 
Epoch: [2][0/187] Elapsed 0m 0s (remain 1m 0s) 
Epoch: [2][50/187] Elapsed 0m 17s (remain 0m 46s) 
Epoch: [2][100/187] Elapsed 0m 32s (remain 0m 27s) 
Epoch: [2][150/187] Elapsed 0m 50s (remain 0m 11s) 
Epoch: [2][186/187] Elapsed 1m 2s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 7s) 


Epoch 2 - avg_train_loss: 0.5919  avg_val_loss: 0.5619  time: 67s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5919  avg_val_loss: 0.5619  time: 67s
Epoch 2 - Score: 0.6852
INFO:__main__:Epoch 2 - Score: 0.6852
Epoch 2 - Save Best Score: 0.6852 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.6852 Model


EVAL: [23/24] Elapsed 0m 3s (remain 0m 0s) 
Epoch: [3][0/187] Elapsed 0m 0s (remain 1m 30s) 
Epoch: [3][50/187] Elapsed 0m 18s (remain 0m 49s) 
Epoch: [3][100/187] Elapsed 0m 34s (remain 0m 29s) 
Epoch: [3][150/187] Elapsed 0m 51s (remain 0m 12s) 
Epoch: [3][186/187] Elapsed 1m 2s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 7s) 


Epoch 3 - avg_train_loss: 0.4516  avg_val_loss: 0.5145  time: 67s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4516  avg_val_loss: 0.5145  time: 67s
Epoch 3 - Score: 0.7541
INFO:__main__:Epoch 3 - Score: 0.7541
Epoch 3 - Save Best Score: 0.7541 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7541 Model


EVAL: [23/24] Elapsed 0m 3s (remain 0m 0s) 
Epoch: [4][0/187] Elapsed 0m 0s (remain 1m 10s) 
Epoch: [4][50/187] Elapsed 0m 18s (remain 0m 49s) 
Epoch: [4][100/187] Elapsed 0m 34s (remain 0m 28s) 
Epoch: [4][150/187] Elapsed 0m 50s (remain 0m 12s) 
Epoch: [4][186/187] Elapsed 1m 2s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 7s) 


Epoch 4 - avg_train_loss: 0.2751  avg_val_loss: 0.4746  time: 66s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2751  avg_val_loss: 0.4746  time: 66s
Epoch 4 - Score: 0.7805
INFO:__main__:Epoch 4 - Score: 0.7805
Epoch 4 - Save Best Score: 0.7805 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7805 Model


EVAL: [23/24] Elapsed 0m 3s (remain 0m 0s) 
Epoch: [5][0/187] Elapsed 0m 0s (remain 1m 4s) 
Epoch: [5][50/187] Elapsed 0m 18s (remain 0m 50s) 
Epoch: [5][100/187] Elapsed 0m 35s (remain 0m 30s) 
Epoch: [5][150/187] Elapsed 0m 50s (remain 0m 12s) 
Epoch: [5][186/187] Elapsed 1m 3s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 7s) 


Epoch 5 - avg_train_loss: 0.1774  avg_val_loss: 0.5240  time: 67s
INFO:__main__:Epoch 5 - avg_train_loss: 0.1774  avg_val_loss: 0.5240  time: 67s
Epoch 5 - Score: 0.7732
INFO:__main__:Epoch 5 - Score: 0.7732


EVAL: [23/24] Elapsed 0m 3s (remain 0m 0s) 
Epoch: [6][0/187] Elapsed 0m 0s (remain 1m 38s) 
Epoch: [6][50/187] Elapsed 0m 17s (remain 0m 45s) 
Epoch: [6][100/187] Elapsed 0m 34s (remain 0m 29s) 
Epoch: [6][150/187] Elapsed 0m 50s (remain 0m 12s) 
Epoch: [6][186/187] Elapsed 1m 3s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 7s) 


Epoch 6 - avg_train_loss: 0.1070  avg_val_loss: 0.5183  time: 67s
INFO:__main__:Epoch 6 - avg_train_loss: 0.1070  avg_val_loss: 0.5183  time: 67s
Epoch 6 - Score: 0.7965
INFO:__main__:Epoch 6 - Score: 0.7965
Epoch 6 - Save Best Score: 0.7965 Model
INFO:__main__:Epoch 6 - Save Best Score: 0.7965 Model


EVAL: [23/24] Elapsed 0m 3s (remain 0m 0s) 
Epoch: [7][0/187] Elapsed 0m 0s (remain 1m 6s) 
Epoch: [7][50/187] Elapsed 0m 16s (remain 0m 44s) 
Epoch: [7][100/187] Elapsed 0m 33s (remain 0m 28s) 
Epoch: [7][150/187] Elapsed 0m 50s (remain 0m 12s) 
Epoch: [7][186/187] Elapsed 1m 3s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 7s) 


Epoch 7 - avg_train_loss: 0.0420  avg_val_loss: 0.5840  time: 67s
INFO:__main__:Epoch 7 - avg_train_loss: 0.0420  avg_val_loss: 0.5840  time: 67s
Epoch 7 - Score: 0.8308
INFO:__main__:Epoch 7 - Score: 0.8308
Epoch 7 - Save Best Score: 0.8308 Model
INFO:__main__:Epoch 7 - Save Best Score: 0.8308 Model


EVAL: [23/24] Elapsed 0m 3s (remain 0m 0s) 
Epoch: [8][0/187] Elapsed 0m 0s (remain 1m 4s) 
Epoch: [8][50/187] Elapsed 0m 17s (remain 0m 47s) 
Epoch: [8][100/187] Elapsed 0m 36s (remain 0m 30s) 
Epoch: [8][150/187] Elapsed 0m 52s (remain 0m 12s) 
Epoch: [8][186/187] Elapsed 1m 3s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 7s) 


Epoch 8 - avg_train_loss: 0.0201  avg_val_loss: 0.5976  time: 67s
INFO:__main__:Epoch 8 - avg_train_loss: 0.0201  avg_val_loss: 0.5976  time: 67s
Epoch 8 - Score: 0.8237
INFO:__main__:Epoch 8 - Score: 0.8237


EVAL: [23/24] Elapsed 0m 3s (remain 0m 0s) 
Epoch: [9][0/187] Elapsed 0m 0s (remain 0m 55s) 
Epoch: [9][50/187] Elapsed 0m 16s (remain 0m 43s) 
Epoch: [9][100/187] Elapsed 0m 33s (remain 0m 28s) 
Epoch: [9][150/187] Elapsed 0m 51s (remain 0m 12s) 
Epoch: [9][186/187] Elapsed 1m 3s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 7s) 


Epoch 9 - avg_train_loss: 0.0128  avg_val_loss: 0.6182  time: 67s
INFO:__main__:Epoch 9 - avg_train_loss: 0.0128  avg_val_loss: 0.6182  time: 67s
Epoch 9 - Score: 0.8236
INFO:__main__:Epoch 9 - Score: 0.8236


EVAL: [23/24] Elapsed 0m 3s (remain 0m 0s) 
Epoch: [10][0/187] Elapsed 0m 0s (remain 1m 33s) 
Epoch: [10][50/187] Elapsed 0m 17s (remain 0m 46s) 
Epoch: [10][100/187] Elapsed 0m 34s (remain 0m 29s) 
Epoch: [10][150/187] Elapsed 0m 50s (remain 0m 12s) 
Epoch: [10][186/187] Elapsed 1m 2s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 7s) 


Epoch 10 - avg_train_loss: 0.0099  avg_val_loss: 0.6236  time: 67s
INFO:__main__:Epoch 10 - avg_train_loss: 0.0099  avg_val_loss: 0.6236  time: 67s
Epoch 10 - Score: 0.8236
INFO:__main__:Epoch 10 - Score: 0.8236


EVAL: [23/24] Elapsed 0m 3s (remain 0m 0s) 


Score: 0.8308
INFO:__main__:Score: 0.8308
-------------fold:2 training-------------
INFO:__main__:-------------fold:2 training-------------


0.8308385133773161


Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/187] Elapsed 0m 0s (remain 1m 28s) 
Epoch: [1][50/187] Elapsed 0m 18s (remain 0m 49s) 
Epoch: [1][100/187] Elapsed 0m 34s (remain 0m 29s) 
Epoch: [1][150/187] Elapsed 0m 48s (remain 0m 11s) 
Epoch: [1][186/187] Elapsed 0m 59s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 8s) 


Epoch 1 - avg_train_loss: 0.9213  avg_val_loss: 0.6662  time: 65s
INFO:__main__:Epoch 1 - avg_train_loss: 0.9213  avg_val_loss: 0.6662  time: 65s
Epoch 1 - Score: 0.5991
INFO:__main__:Epoch 1 - Score: 0.5991
Epoch 1 - Save Best Score: 0.5991 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.5991 Model


EVAL: [23/24] Elapsed 0m 5s (remain 0m 0s) 
Epoch: [2][0/187] Elapsed 0m 0s (remain 1m 53s) 
Epoch: [2][50/187] Elapsed 0m 15s (remain 0m 41s) 
Epoch: [2][100/187] Elapsed 0m 32s (remain 0m 27s) 
Epoch: [2][150/187] Elapsed 0m 49s (remain 0m 11s) 
Epoch: [2][186/187] Elapsed 1m 0s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 8s) 


Epoch 2 - avg_train_loss: 0.5866  avg_val_loss: 0.5849  time: 66s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5866  avg_val_loss: 0.5849  time: 66s
Epoch 2 - Score: 0.7622
INFO:__main__:Epoch 2 - Score: 0.7622
Epoch 2 - Save Best Score: 0.7622 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7622 Model


EVAL: [23/24] Elapsed 0m 5s (remain 0m 0s) 
Epoch: [3][0/187] Elapsed 0m 0s (remain 1m 22s) 
Epoch: [3][50/187] Elapsed 0m 17s (remain 0m 47s) 
Epoch: [3][100/187] Elapsed 0m 32s (remain 0m 27s) 
Epoch: [3][150/187] Elapsed 0m 47s (remain 0m 11s) 
Epoch: [3][186/187] Elapsed 0m 59s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 8s) 


Epoch 3 - avg_train_loss: 0.3802  avg_val_loss: 0.5329  time: 65s
INFO:__main__:Epoch 3 - avg_train_loss: 0.3802  avg_val_loss: 0.5329  time: 65s
Epoch 3 - Score: 0.7554
INFO:__main__:Epoch 3 - Score: 0.7554


EVAL: [23/24] Elapsed 0m 5s (remain 0m 0s) 
Epoch: [4][0/187] Elapsed 0m 0s (remain 1m 24s) 
Epoch: [4][50/187] Elapsed 0m 16s (remain 0m 45s) 
Epoch: [4][100/187] Elapsed 0m 31s (remain 0m 27s) 
Epoch: [4][150/187] Elapsed 0m 48s (remain 0m 11s) 
Epoch: [4][186/187] Elapsed 0m 59s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 8s) 


Epoch 4 - avg_train_loss: 0.2534  avg_val_loss: 0.6247  time: 65s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2534  avg_val_loss: 0.6247  time: 65s
Epoch 4 - Score: 0.7228
INFO:__main__:Epoch 4 - Score: 0.7228


EVAL: [23/24] Elapsed 0m 5s (remain 0m 0s) 
Epoch: [5][0/187] Elapsed 0m 0s (remain 1m 0s) 
Epoch: [5][50/187] Elapsed 0m 16s (remain 0m 43s) 
Epoch: [5][100/187] Elapsed 0m 31s (remain 0m 26s) 
Epoch: [5][150/187] Elapsed 0m 46s (remain 0m 11s) 
Epoch: [5][186/187] Elapsed 1m 0s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 8s) 


Epoch 5 - avg_train_loss: 0.1270  avg_val_loss: 0.7384  time: 65s
INFO:__main__:Epoch 5 - avg_train_loss: 0.1270  avg_val_loss: 0.7384  time: 65s
Epoch 5 - Score: 0.7866
INFO:__main__:Epoch 5 - Score: 0.7866
Epoch 5 - Save Best Score: 0.7866 Model
INFO:__main__:Epoch 5 - Save Best Score: 0.7866 Model


EVAL: [23/24] Elapsed 0m 5s (remain 0m 0s) 
Epoch: [6][0/187] Elapsed 0m 0s (remain 1m 13s) 
Epoch: [6][50/187] Elapsed 0m 16s (remain 0m 43s) 
Epoch: [6][100/187] Elapsed 0m 32s (remain 0m 28s) 
Epoch: [6][150/187] Elapsed 0m 46s (remain 0m 11s) 
Epoch: [6][186/187] Elapsed 0m 59s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 8s) 


Epoch 6 - avg_train_loss: 0.0589  avg_val_loss: 0.6117  time: 65s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0589  avg_val_loss: 0.6117  time: 65s
Epoch 6 - Score: 0.7967
INFO:__main__:Epoch 6 - Score: 0.7967
Epoch 6 - Save Best Score: 0.7967 Model
INFO:__main__:Epoch 6 - Save Best Score: 0.7967 Model


EVAL: [23/24] Elapsed 0m 5s (remain 0m 0s) 
Epoch: [7][0/187] Elapsed 0m 0s (remain 1m 20s) 
Epoch: [7][50/187] Elapsed 0m 16s (remain 0m 42s) 
Epoch: [7][100/187] Elapsed 0m 31s (remain 0m 26s) 
Epoch: [7][150/187] Elapsed 0m 47s (remain 0m 11s) 
Epoch: [7][186/187] Elapsed 0m 59s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 8s) 


Epoch 7 - avg_train_loss: 0.0305  avg_val_loss: 0.6335  time: 65s
INFO:__main__:Epoch 7 - avg_train_loss: 0.0305  avg_val_loss: 0.6335  time: 65s
Epoch 7 - Score: 0.8273
INFO:__main__:Epoch 7 - Score: 0.8273
Epoch 7 - Save Best Score: 0.8273 Model
INFO:__main__:Epoch 7 - Save Best Score: 0.8273 Model


EVAL: [23/24] Elapsed 0m 5s (remain 0m 0s) 
Epoch: [8][0/187] Elapsed 0m 0s (remain 1m 12s) 
Epoch: [8][50/187] Elapsed 0m 15s (remain 0m 42s) 
Epoch: [8][100/187] Elapsed 0m 32s (remain 0m 27s) 
Epoch: [8][150/187] Elapsed 0m 49s (remain 0m 11s) 
Epoch: [8][186/187] Elapsed 0m 59s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 8s) 


Epoch 8 - avg_train_loss: 0.0140  avg_val_loss: 0.6478  time: 65s
INFO:__main__:Epoch 8 - avg_train_loss: 0.0140  avg_val_loss: 0.6478  time: 65s
Epoch 8 - Score: 0.8317
INFO:__main__:Epoch 8 - Score: 0.8317
Epoch 8 - Save Best Score: 0.8317 Model
INFO:__main__:Epoch 8 - Save Best Score: 0.8317 Model


EVAL: [23/24] Elapsed 0m 5s (remain 0m 0s) 
Epoch: [9][0/187] Elapsed 0m 0s (remain 1m 6s) 
Epoch: [9][50/187] Elapsed 0m 17s (remain 0m 47s) 
Epoch: [9][100/187] Elapsed 0m 32s (remain 0m 27s) 
Epoch: [9][150/187] Elapsed 0m 48s (remain 0m 11s) 
Epoch: [9][186/187] Elapsed 0m 58s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 8s) 


Epoch 9 - avg_train_loss: 0.0096  avg_val_loss: 0.6524  time: 64s
INFO:__main__:Epoch 9 - avg_train_loss: 0.0096  avg_val_loss: 0.6524  time: 64s
Epoch 9 - Score: 0.8353
INFO:__main__:Epoch 9 - Score: 0.8353
Epoch 9 - Save Best Score: 0.8353 Model
INFO:__main__:Epoch 9 - Save Best Score: 0.8353 Model


EVAL: [23/24] Elapsed 0m 5s (remain 0m 0s) 
Epoch: [10][0/187] Elapsed 0m 0s (remain 0m 57s) 
Epoch: [10][50/187] Elapsed 0m 16s (remain 0m 44s) 
Epoch: [10][100/187] Elapsed 0m 31s (remain 0m 26s) 
Epoch: [10][150/187] Elapsed 0m 47s (remain 0m 11s) 
Epoch: [10][186/187] Elapsed 0m 58s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 8s) 


Epoch 10 - avg_train_loss: 0.0091  avg_val_loss: 0.6548  time: 64s
INFO:__main__:Epoch 10 - avg_train_loss: 0.0091  avg_val_loss: 0.6548  time: 64s
Epoch 10 - Score: 0.8376
INFO:__main__:Epoch 10 - Score: 0.8376
Epoch 10 - Save Best Score: 0.8376 Model
INFO:__main__:Epoch 10 - Save Best Score: 0.8376 Model


EVAL: [23/24] Elapsed 0m 5s (remain 0m 0s) 


Score: 0.8376
INFO:__main__:Score: 0.8376
-------------fold:3 training-------------
INFO:__main__:-------------fold:3 training-------------


0.8375634313722826


Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/187] Elapsed 0m 0s (remain 2m 5s) 
Epoch: [1][50/187] Elapsed 0m 16s (remain 0m 43s) 
Epoch: [1][100/187] Elapsed 0m 33s (remain 0m 28s) 
Epoch: [1][150/187] Elapsed 0m 48s (remain 0m 11s) 
Epoch: [1][186/187] Elapsed 1m 0s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 10s) 


Epoch 1 - avg_train_loss: 0.9908  avg_val_loss: 0.6637  time: 66s
INFO:__main__:Epoch 1 - avg_train_loss: 0.9908  avg_val_loss: 0.6637  time: 66s
Epoch 1 - Score: 0.5907
INFO:__main__:Epoch 1 - Score: 0.5907
Epoch 1 - Save Best Score: 0.5907 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.5907 Model


EVAL: [23/24] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [2][0/187] Elapsed 0m 0s (remain 1m 5s) 
Epoch: [2][50/187] Elapsed 0m 15s (remain 0m 41s) 
Epoch: [2][100/187] Elapsed 0m 31s (remain 0m 27s) 
Epoch: [2][150/187] Elapsed 0m 48s (remain 0m 11s) 
Epoch: [2][186/187] Elapsed 1m 0s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 10s) 


Epoch 2 - avg_train_loss: 0.6293  avg_val_loss: 0.5490  time: 66s
INFO:__main__:Epoch 2 - avg_train_loss: 0.6293  avg_val_loss: 0.5490  time: 66s
Epoch 2 - Score: 0.7219
INFO:__main__:Epoch 2 - Score: 0.7219
Epoch 2 - Save Best Score: 0.7219 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7219 Model


EVAL: [23/24] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [3][0/187] Elapsed 0m 0s (remain 0m 51s) 
Epoch: [3][50/187] Elapsed 0m 17s (remain 0m 45s) 
Epoch: [3][100/187] Elapsed 0m 34s (remain 0m 29s) 
Epoch: [3][150/187] Elapsed 0m 50s (remain 0m 11s) 
Epoch: [3][186/187] Elapsed 1m 1s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 10s) 


Epoch 3 - avg_train_loss: 0.4513  avg_val_loss: 0.5567  time: 66s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4513  avg_val_loss: 0.5567  time: 66s
Epoch 3 - Score: 0.7557
INFO:__main__:Epoch 3 - Score: 0.7557
Epoch 3 - Save Best Score: 0.7557 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7557 Model


EVAL: [23/24] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [4][0/187] Elapsed 0m 0s (remain 1m 4s) 
Epoch: [4][50/187] Elapsed 0m 14s (remain 0m 39s) 
Epoch: [4][100/187] Elapsed 0m 32s (remain 0m 27s) 
Epoch: [4][150/187] Elapsed 0m 48s (remain 0m 11s) 
Epoch: [4][186/187] Elapsed 1m 1s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 10s) 


Epoch 4 - avg_train_loss: 0.3306  avg_val_loss: 0.5557  time: 67s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3306  avg_val_loss: 0.5557  time: 67s
Epoch 4 - Score: 0.7422
INFO:__main__:Epoch 4 - Score: 0.7422


EVAL: [23/24] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [5][0/187] Elapsed 0m 0s (remain 1m 35s) 
Epoch: [5][50/187] Elapsed 0m 16s (remain 0m 42s) 
Epoch: [5][100/187] Elapsed 0m 30s (remain 0m 26s) 
Epoch: [5][150/187] Elapsed 0m 49s (remain 0m 11s) 
Epoch: [5][186/187] Elapsed 1m 0s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 10s) 


Epoch 5 - avg_train_loss: 0.1949  avg_val_loss: 0.5505  time: 65s
INFO:__main__:Epoch 5 - avg_train_loss: 0.1949  avg_val_loss: 0.5505  time: 65s
Epoch 5 - Score: 0.8054
INFO:__main__:Epoch 5 - Score: 0.8054
Epoch 5 - Save Best Score: 0.8054 Model
INFO:__main__:Epoch 5 - Save Best Score: 0.8054 Model


EVAL: [23/24] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [6][0/187] Elapsed 0m 0s (remain 2m 4s) 
Epoch: [6][50/187] Elapsed 0m 16s (remain 0m 44s) 
Epoch: [6][100/187] Elapsed 0m 32s (remain 0m 27s) 
Epoch: [6][150/187] Elapsed 0m 47s (remain 0m 11s) 
Epoch: [6][186/187] Elapsed 1m 0s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 10s) 


Epoch 6 - avg_train_loss: 0.1133  avg_val_loss: 0.6221  time: 65s
INFO:__main__:Epoch 6 - avg_train_loss: 0.1133  avg_val_loss: 0.6221  time: 65s
Epoch 6 - Score: 0.7926
INFO:__main__:Epoch 6 - Score: 0.7926


EVAL: [23/24] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [7][0/187] Elapsed 0m 0s (remain 1m 9s) 
Epoch: [7][50/187] Elapsed 0m 17s (remain 0m 46s) 
Epoch: [7][100/187] Elapsed 0m 34s (remain 0m 29s) 
Epoch: [7][150/187] Elapsed 0m 49s (remain 0m 11s) 
Epoch: [7][186/187] Elapsed 1m 1s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 10s) 


Epoch 7 - avg_train_loss: 0.0590  avg_val_loss: 0.6182  time: 66s
INFO:__main__:Epoch 7 - avg_train_loss: 0.0590  avg_val_loss: 0.6182  time: 66s
Epoch 7 - Score: 0.7932
INFO:__main__:Epoch 7 - Score: 0.7932


EVAL: [23/24] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [8][0/187] Elapsed 0m 0s (remain 2m 40s) 
Epoch: [8][50/187] Elapsed 0m 17s (remain 0m 45s) 
Epoch: [8][100/187] Elapsed 0m 31s (remain 0m 27s) 
Epoch: [8][150/187] Elapsed 0m 47s (remain 0m 11s) 
Epoch: [8][186/187] Elapsed 0m 58s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 10s) 


Epoch 8 - avg_train_loss: 0.0264  avg_val_loss: 0.6489  time: 64s
INFO:__main__:Epoch 8 - avg_train_loss: 0.0264  avg_val_loss: 0.6489  time: 64s
Epoch 8 - Score: 0.8177
INFO:__main__:Epoch 8 - Score: 0.8177
Epoch 8 - Save Best Score: 0.8177 Model
INFO:__main__:Epoch 8 - Save Best Score: 0.8177 Model


EVAL: [23/24] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [9][0/187] Elapsed 0m 0s (remain 1m 11s) 
Epoch: [9][50/187] Elapsed 0m 16s (remain 0m 42s) 
Epoch: [9][100/187] Elapsed 0m 30s (remain 0m 26s) 
Epoch: [9][150/187] Elapsed 0m 48s (remain 0m 11s) 
Epoch: [9][186/187] Elapsed 1m 0s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 10s) 


Epoch 9 - avg_train_loss: 0.0193  avg_val_loss: 0.6643  time: 66s
INFO:__main__:Epoch 9 - avg_train_loss: 0.0193  avg_val_loss: 0.6643  time: 66s
Epoch 9 - Score: 0.8160
INFO:__main__:Epoch 9 - Score: 0.8160


EVAL: [23/24] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [10][0/187] Elapsed 0m 0s (remain 0m 50s) 
Epoch: [10][50/187] Elapsed 0m 17s (remain 0m 46s) 
Epoch: [10][100/187] Elapsed 0m 33s (remain 0m 28s) 
Epoch: [10][150/187] Elapsed 0m 50s (remain 0m 11s) 
Epoch: [10][186/187] Elapsed 1m 1s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 10s) 


Epoch 10 - avg_train_loss: 0.0144  avg_val_loss: 0.6586  time: 67s
INFO:__main__:Epoch 10 - avg_train_loss: 0.0144  avg_val_loss: 0.6586  time: 67s
Epoch 10 - Score: 0.8182
INFO:__main__:Epoch 10 - Score: 0.8182
Epoch 10 - Save Best Score: 0.8182 Model
INFO:__main__:Epoch 10 - Save Best Score: 0.8182 Model


EVAL: [23/24] Elapsed 0m 4s (remain 0m 0s) 


Score: 0.8182
INFO:__main__:Score: 0.8182
-------------fold:4 training-------------
INFO:__main__:-------------fold:4 training-------------


0.8182024652612888


Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/187] Elapsed 0m 0s (remain 2m 7s) 
Epoch: [1][50/187] Elapsed 0m 18s (remain 0m 50s) 
Epoch: [1][100/187] Elapsed 0m 34s (remain 0m 29s) 
Epoch: [1][150/187] Elapsed 0m 50s (remain 0m 11s) 
Epoch: [1][186/187] Elapsed 1m 1s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 6s) 


Epoch 1 - avg_train_loss: 0.9375  avg_val_loss: 0.6918  time: 66s
INFO:__main__:Epoch 1 - avg_train_loss: 0.9375  avg_val_loss: 0.6918  time: 66s
Epoch 1 - Score: 0.6265
INFO:__main__:Epoch 1 - Score: 0.6265
Epoch 1 - Save Best Score: 0.6265 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6265 Model


EVAL: [23/24] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [2][0/187] Elapsed 0m 0s (remain 1m 57s) 
Epoch: [2][50/187] Elapsed 0m 17s (remain 0m 47s) 
Epoch: [2][100/187] Elapsed 0m 35s (remain 0m 30s) 
Epoch: [2][150/187] Elapsed 0m 48s (remain 0m 11s) 
Epoch: [2][186/187] Elapsed 0m 59s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 6s) 


Epoch 2 - avg_train_loss: 0.5921  avg_val_loss: 0.5922  time: 64s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5921  avg_val_loss: 0.5922  time: 64s
Epoch 2 - Score: 0.7055
INFO:__main__:Epoch 2 - Score: 0.7055
Epoch 2 - Save Best Score: 0.7055 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7055 Model


EVAL: [23/24] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [3][0/187] Elapsed 0m 0s (remain 1m 20s) 
Epoch: [3][50/187] Elapsed 0m 15s (remain 0m 40s) 
Epoch: [3][100/187] Elapsed 0m 32s (remain 0m 27s) 
Epoch: [3][150/187] Elapsed 0m 49s (remain 0m 11s) 
Epoch: [3][186/187] Elapsed 1m 0s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 7s) 


Epoch 3 - avg_train_loss: 0.4166  avg_val_loss: 0.7348  time: 66s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4166  avg_val_loss: 0.7348  time: 66s
Epoch 3 - Score: 0.6661
INFO:__main__:Epoch 3 - Score: 0.6661


EVAL: [23/24] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [4][0/187] Elapsed 0m 0s (remain 1m 11s) 
Epoch: [4][50/187] Elapsed 0m 16s (remain 0m 44s) 
Epoch: [4][100/187] Elapsed 0m 33s (remain 0m 28s) 
Epoch: [4][150/187] Elapsed 0m 48s (remain 0m 11s) 
Epoch: [4][186/187] Elapsed 1m 2s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 6s) 


Epoch 4 - avg_train_loss: 0.2558  avg_val_loss: 0.5714  time: 67s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2558  avg_val_loss: 0.5714  time: 67s
Epoch 4 - Score: 0.7185
INFO:__main__:Epoch 4 - Score: 0.7185
Epoch 4 - Save Best Score: 0.7185 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7185 Model


EVAL: [23/24] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [5][0/187] Elapsed 0m 1s (remain 3m 7s) 
Epoch: [5][50/187] Elapsed 0m 16s (remain 0m 44s) 
Epoch: [5][100/187] Elapsed 0m 32s (remain 0m 27s) 
Epoch: [5][150/187] Elapsed 0m 50s (remain 0m 12s) 
Epoch: [5][186/187] Elapsed 1m 2s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 6s) 


Epoch 5 - avg_train_loss: 0.1628  avg_val_loss: 0.6365  time: 67s
INFO:__main__:Epoch 5 - avg_train_loss: 0.1628  avg_val_loss: 0.6365  time: 67s
Epoch 5 - Score: 0.7390
INFO:__main__:Epoch 5 - Score: 0.7390
Epoch 5 - Save Best Score: 0.7390 Model
INFO:__main__:Epoch 5 - Save Best Score: 0.7390 Model


EVAL: [23/24] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [6][0/187] Elapsed 0m 0s (remain 1m 8s) 
Epoch: [6][50/187] Elapsed 0m 17s (remain 0m 46s) 
Epoch: [6][100/187] Elapsed 0m 32s (remain 0m 27s) 
Epoch: [6][150/187] Elapsed 0m 48s (remain 0m 11s) 
Epoch: [6][186/187] Elapsed 1m 0s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 6s) 


Epoch 6 - avg_train_loss: 0.0838  avg_val_loss: 0.7011  time: 65s
INFO:__main__:Epoch 6 - avg_train_loss: 0.0838  avg_val_loss: 0.7011  time: 65s
Epoch 6 - Score: 0.7599
INFO:__main__:Epoch 6 - Score: 0.7599
Epoch 6 - Save Best Score: 0.7599 Model
INFO:__main__:Epoch 6 - Save Best Score: 0.7599 Model


EVAL: [23/24] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [7][0/187] Elapsed 0m 0s (remain 1m 8s) 
Epoch: [7][50/187] Elapsed 0m 16s (remain 0m 43s) 
Epoch: [7][100/187] Elapsed 0m 33s (remain 0m 28s) 
Epoch: [7][150/187] Elapsed 0m 49s (remain 0m 11s) 
Epoch: [7][186/187] Elapsed 1m 2s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 6s) 


Epoch 7 - avg_train_loss: 0.0488  avg_val_loss: 0.6934  time: 67s
INFO:__main__:Epoch 7 - avg_train_loss: 0.0488  avg_val_loss: 0.6934  time: 67s
Epoch 7 - Score: 0.7764
INFO:__main__:Epoch 7 - Score: 0.7764
Epoch 7 - Save Best Score: 0.7764 Model
INFO:__main__:Epoch 7 - Save Best Score: 0.7764 Model


EVAL: [23/24] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [8][0/187] Elapsed 0m 0s (remain 1m 17s) 
Epoch: [8][50/187] Elapsed 0m 16s (remain 0m 43s) 
Epoch: [8][100/187] Elapsed 0m 32s (remain 0m 27s) 
Epoch: [8][150/187] Elapsed 0m 48s (remain 0m 11s) 
Epoch: [8][186/187] Elapsed 1m 1s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 6s) 


Epoch 8 - avg_train_loss: 0.0276  avg_val_loss: 0.7729  time: 66s
INFO:__main__:Epoch 8 - avg_train_loss: 0.0276  avg_val_loss: 0.7729  time: 66s
Epoch 8 - Score: 0.7689
INFO:__main__:Epoch 8 - Score: 0.7689


EVAL: [23/24] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [9][0/187] Elapsed 0m 0s (remain 1m 19s) 
Epoch: [9][50/187] Elapsed 0m 15s (remain 0m 40s) 
Epoch: [9][100/187] Elapsed 0m 32s (remain 0m 27s) 
Epoch: [9][150/187] Elapsed 0m 48s (remain 0m 11s) 
Epoch: [9][186/187] Elapsed 0m 59s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 6s) 


Epoch 9 - avg_train_loss: 0.0253  avg_val_loss: 0.7737  time: 64s
INFO:__main__:Epoch 9 - avg_train_loss: 0.0253  avg_val_loss: 0.7737  time: 64s
Epoch 9 - Score: 0.7675
INFO:__main__:Epoch 9 - Score: 0.7675


EVAL: [23/24] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [10][0/187] Elapsed 0m 0s (remain 1m 31s) 
Epoch: [10][50/187] Elapsed 0m 15s (remain 0m 42s) 
Epoch: [10][100/187] Elapsed 0m 33s (remain 0m 28s) 
Epoch: [10][150/187] Elapsed 0m 48s (remain 0m 11s) 
Epoch: [10][186/187] Elapsed 0m 59s (remain 0m 0s) 
EVAL: [0/24] Elapsed 0m 0s (remain 0m 7s) 


Epoch 10 - avg_train_loss: 0.0212  avg_val_loss: 0.7804  time: 64s
INFO:__main__:Epoch 10 - avg_train_loss: 0.0212  avg_val_loss: 0.7804  time: 64s
Epoch 10 - Score: 0.7656
INFO:__main__:Epoch 10 - Score: 0.7656


EVAL: [23/24] Elapsed 0m 4s (remain 0m 0s) 


Score: 0.7764
INFO:__main__:Score: 0.7764
Score: 0.8061
INFO:__main__:Score: 0.8061


0.7763928100884623


In [20]:
A = pd.read_csv(OUTPUT_MODEL_DIR+'oof_df.csv')
A.head()

Unnamed: 0,id,description,label,inputs,kfold,Consultant,Data scientist,Machine learning engineer,Software engineer
0,3,"Work on the technical design, development, re...",2,"Work on the technical design, development, re...",0,-3.479887,-0.398986,5.31131,-0.531013
1,9,Maintain and improve existing predictive model...,0,Maintain and improve existing predictive model...,0,6.215576,-2.32238,-2.795672,-1.636019
2,12,Project manage complex implementation engageme...,3,Project manage complex implementation engageme...,0,-3.344229,-1.206549,0.698157,3.351911
3,13,"Research, prototype, identify, and build predi...",1,"Research, prototype, identify, and build predi...",0,-0.142603,3.677754,-2.261342,-2.550574
4,20,Tie all efforts to ET&amp;O/CT&amp;O strategi...,3,Tie all efforts to ET&amp;O/CT&amp;O strategi...,0,-0.983607,-3.291892,-2.681272,4.695417
