In [1]:
# Mount Google Drive at /content/drive; the input/output paths configured
# later in the notebook live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
import os
import gc
import math
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import re

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,GroupKFold
from sklearn.metrics import log_loss,f1_score

from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
# Locations on the mounted Google Drive.
# INPUT_DIR holds the competition CSV files read by the data-loading cell.
INPUT_DIR = '/content/drive/MyDrive/Competitions/Signate/Student Cup 2022/input/'
# OUTPUT_DIR is the root for everything this notebook writes (the training log goes here).
OUTPUT_DIR = '/content/drive/MyDrive/Competitions/Signate/Student Cup 2022/output/'
# NOTE(review): OUTPUT_SUB_DIR is not referenced in the visible part of the notebook.
OUTPUT_SUB_DIR = os.path.join(OUTPUT_DIR,'Submission')
# Tokenizer, model config, per-fold checkpoints and out-of-fold predictions are saved here.
# The trailing slash matters: other cells build paths by plain string concatenation.
OUTPUT_MODEL_DIR = os.path.join(OUTPUT_DIR,'Model/DeBERTa-base[ver3]/')

In [6]:
class CFG:
    """Global experiment configuration, read as class attributes throughout the notebook.

    Other cells also attach attributes at runtime (``CFG.tokenizer`` after the
    tokenizer is loaded, ``CFG.after_freezed_parameters`` inside ``CustomModel``).
    """
    # When True, train_loop logs per-epoch metrics with wandb.log.
    wandb = False
    # NOTE(review): not referenced in the visible part of the notebook.
    apex = True
    # Hugging Face checkpoint name used for both the tokenizer and the backbone.
    model = 'microsoft/deberta-v3-base'
    # Default seed of seed_everything.
    seed = 42
    # Number of folds created by StratifiedKFold.
    n_splits = 5
    # Maximum token length used when tokenizing (longer texts are truncated).
    max_len = 1024
    # NOTE(review): only referenced by a commented-out nn.Dropout line in CustomModel.
    dropout = 0.2
    # Output width of the classification head (number of classes).
    target_size=4
    # Number of batches whose gradients are accumulated before an optimizer step.
    n_accumulate=1
    # Progress is printed every `print_freq` steps (and on the last step).
    print_freq = 50
    # NOTE(review): not referenced in the visible part of the notebook.
    min_lr=1e-6
    # Scheduler type handled by get_scheduler: 'linear' or 'cosine'.
    scheduler = 'cosine'
    # Training batch size; the validation loader uses twice this value.
    batch_size = 8
    # Worker processes for both DataLoaders.
    num_workers = 2
    # AdamW learning rate.
    lr = 3e-5
    # AdamW weight decay. The attribute name is misspelled ("weigth") but train_loop
    # reads it under exactly this name, so it must not be renamed in isolation.
    weigth_decay = 0.01
    # Number of training epochs per fold.
    epochs = 5
    # Number of folds iterated over by the training driver cell.
    n_fold = 5
    # Subset of fold ids that are actually trained.
    trn_fold = [0, 1, 2, 3, 4]
    # When False, the training driver cell does nothing.
    train = True 
    # Warm-up steps and cosine cycles forwarded to the transformers scheduler factory.
    num_warmup_steps = 0
    num_cycles=0.5
    # NOTE(review): `debug` and `debug_ver2` are not referenced in the visible part of the notebook.
    debug = False
    debug_ver2 = False
    # When True, CustomModel enables gradient checkpointing on the backbone.
    gradient_checkpointing = True
    # When True, CustomModel freezes the embeddings and the first two encoder layers.
    freezing = True

In [7]:
# Loss Func
def criterion(outputs, labels):
    """Return the mean cross-entropy between raw logits and class-index labels.

    Args:
        outputs: Tensor of unnormalised class scores.
        labels: Tensor of integer class indices.

    Returns:
        A scalar loss tensor (default ``mean`` reduction).
    """
    return F.cross_entropy(outputs, labels)

def softmax(z):
    """Row-wise, numerically stable softmax of a 2-D NumPy array.

    Args:
        z: Array of shape (rows, classes); anything that is not 2-D trips the assert.

    Returns:
        Array of the same shape whose rows sum to 1.
    """
    assert len(z.shape) == 2
    # Subtracting the row maximum keeps np.exp from overflowing without changing the result.
    shifted = z - np.max(z, axis=1, keepdims=True)
    exponentials = np.exp(shifted)
    return exponentials / np.sum(exponentials, axis=1, keepdims=True)
"""
def get_score(y_true, y_pred):
    y_pred = softmax(y_pred)
    score = log_loss(y_true, y_pred)
    return round(score, 5)
"""
def get_score(outputs, labels):
    """Macro-averaged F1 between predicted classes and the true labels.

    Args:
        outputs: 2-D array-like of per-class scores (logits or probabilities),
            one row per sample. Only the arg-max of each row is used, so no
            softmax is needed.
        labels: 1-D array-like of true class indices.

    Returns:
        The macro F1 score as a float.
    """
    predicted = np.argmax(np.asarray(outputs), axis=1)
    # scikit-learn's signature is f1_score(y_true, y_pred, ...).
    return f1_score(labels, predicted, average='macro')

def get_logger(filename=OUTPUT_DIR+'train'):
    """Return the module logger, writing bare messages to the console and to a file.

    Args:
        filename: Path of the log file without extension; ``.log`` is appended.

    Returns:
        The ``logging.Logger`` named after this module, set to INFO level.
    """
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # getLogger returns the same object on every call, so re-running this cell
    # would otherwise stack another pair of handlers and print every line twice.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

# Notebook-wide logger; with the default argument it writes to OUTPUT_DIR + 'train.log'.
LOGGER = get_logger()

def seed_everything(seed=CFG.seed):
    """Seed every random number generator used by the notebook.

    Covers Python's ``random``, the hash seed environment variable, NumPy and
    PyTorch (CPU and all CUDA devices), and configures cuDNN for repeatable
    kernel selection.

    Args:
        seed: Integer seed; defaults to ``CFG.seed``.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different algorithms from run to run.
    torch.backends.cudnn.benchmark = False
    
# Seed from the config so that changing CFG.seed actually changes the run.
seed_everything(seed=CFG.seed)

def prepare_input(cfg, text, text_2=None):
    """Tokenize one text (or text pair) to fixed length and return long tensors.

    Args:
        cfg: Object exposing ``tokenizer`` (callable) and ``max_len``.
        text: First text segment.
        text_2: Optional second segment, forwarded positionally to the tokenizer.

    Returns:
        The tokenizer's output mapping with every value replaced by a
        ``torch.long`` tensor.
    """
    encoded = cfg.tokenizer(
        text,
        text_2,
        padding="max_length",
        add_special_tokens=True,
        max_length=cfg.max_len,
        truncation=True,
    )

    # Snapshot the keys first so the mapping is not iterated while being updated.
    for field in list(encoded.keys()):
        encoded[field] = torch.tensor(encoded[field], dtype=torch.long)

    return encoded

In [8]:
def freeze(module):
    """
    Turn off gradient tracking for every parameter of ``module`` (in place).
    """
    for weight in module.parameters():
        weight.requires_grad_(False)
        
def get_freezed_parameters(module):
    """
    List the names of ``module``'s parameters that have ``requires_grad`` switched off,
    in ``named_parameters()`` order.
    """
    return [
        param_name
        for param_name, param in module.named_parameters()
        if not param.requires_grad
    ]

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """
    Register an optimizer-precision override for the weights of the embedding
    sub-modules found on ``embeddings_path``.

    Looks for ``word_embeddings``, ``position_embeddings`` and
    ``token_type_embeddings``; missing attributes are skipped silently.

    NOTE(review): relies on a global ``bnb`` (presumably ``bitsandbytes``) that is
    not imported in the visible part of the notebook - confirm before calling.

    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930
    """
    candidate_names = [f"{kind}_embeddings" for kind in ("word", "position", "token_type")]

    for attr_name in candidate_names:
        if not hasattr(embeddings_path, attr_name):
            continue
        bnb.optim.GlobalOptimManager.get_instance().register_module_override(
            getattr(embeddings_path, attr_name), 'weight', {'optim_bits': optim_bits}
        )

In [9]:
# Load the competition CSV files from INPUT_DIR.
train = pd.read_csv(os.path.join(INPUT_DIR, 'train.csv'))
test = pd.read_csv(os.path.join(INPUT_DIR, 'test.csv'))
submission_df = pd.read_csv(os.path.join(INPUT_DIR, 'submit_sample.csv'))

# Quick sanity check: first rows and shape of each split.
display(train.head())
print(train.shape)
display(test.head())
print(test.shape)

Unnamed: 0,id,description,jobflag
0,0,<li>Develop cutting-edge web applications that...,3
1,1,"<li> Designs and develops high quality, scalab...",3
2,2,<li>Functions as a point person for Network St...,4
3,3,"<li> Work on the technical design, development...",3
4,4,<li>Quantify the resources required for a task...,4


(1516, 3)


Unnamed: 0,id,description
0,1516,<li>Building decision-making models and propos...
1,1517,<li>Educate homeowners on the benefits of sola...
2,1518,"<li><span>Design, develop, document, and imple..."
3,1519,<li>Apply advanced technical expertise and ski...
4,1520,<li>Project manage and deliver against our roa...


(1517, 2)


In [10]:
def remove_tag(x):
    """Return ``x`` with every ``<...>`` span (HTML-style tag) deleted."""
    return re.sub(r"<[^>]*?>", '', x)

def cleaning(texts):
    """Strip HTML tags from every string in ``texts`` and return them as a list.

    Args:
        texts: Iterable of strings (e.g. a pandas Series).

    Returns:
        A new list with one cleaned string per input, in the same order.
    """
    # Replacing every non-alphabetic character with a space is intentionally disabled:
    # clean_punc = re.sub(r'[^a-zA-Z]', ' ', text)
    return [remove_tag(raw_text) for raw_text in texts]



from text_unidecode import unidecode
from typing import Dict, List, Tuple
import codecs

def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    """Codec error handler: encode the offending slice as UTF-8 and resume after it."""
    offending = error.object[error.start : error.end]
    return offending.encode("utf-8"), error.end


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    """Codec error handler: decode the offending bytes as cp1252 and resume after them."""
    offending = error.object[error.start : error.end]
    return offending.decode("cp1252"), error.end

# Register the two handlers above under the names that
# `resolve_encodings_and_normalize` passes as `errors=` to `str.encode` / `bytes.decode`.
# This must run before that function is called, otherwise the handler names are unknown.
codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)

def resolve_encodings_and_normalize(text: str) -> str:
    """Repair mixed utf-8 / cp1252 encoding damage in ``text``, then run ``unidecode`` on it.

    The text is round-tripped through bytes twice; at each step the custom codec
    error handlers ("replace_decoding_with_cp1252" / "replace_encoding_with_utf8")
    take over for the slices the primary codec cannot handle, so they must
    already be registered.
    """
    raw_bytes = text.encode("raw_unicode_escape")
    first_pass = raw_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    cp1252_bytes = first_pass.encode("cp1252", errors="replace_encoding_with_utf8")
    repaired = cp1252_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    return unidecode(repaired)

# Strip HTML tags in place, then store the encoding-repaired text in a separate
# "inputs" column (this is the column the Dataset class reads).
train['description'] = cleaning(train['description'])
test['description'] = cleaning(test['description'])
train['inputs'] = train['description'].apply(resolve_encodings_and_normalize)
test['inputs'] = test['description'].apply(resolve_encodings_and_normalize)

# Rename the target and shift it down by one so class ids are zero-based.
# Guarded on the original column name so that re-running this cell cannot
# subtract 1 from the labels a second time.
if 'jobflag' in train.columns:
    train = train.rename(columns = {"jobflag": "label"})
    train["label"] = train["label"] - 1
train

Unnamed: 0,id,description,label,inputs
0,0,Develop cutting-edge web applications that per...,2,Develop cutting-edge web applications that per...
1,1,"Designs and develops high quality, scalable a...",2,"Designs and develops high quality, scalable a..."
2,2,Functions as a point person for Network Strate...,3,Functions as a point person for Network Strate...
3,3,"Work on the technical design, development, re...",2,"Work on the technical design, development, re..."
4,4,Quantify the resources required for a task/pro...,3,Quantify the resources required for a task/pro...
...,...,...,...,...
1511,1511,"Support detailed reporting, statistical analys...",0,"Support detailed reporting, statistical analys..."
1512,1512,Collaborate with teams to support the ML techn...,1,Collaborate with teams to support the ML techn...
1513,1513,Work with executives and other business leade...,0,Work with executives and other business leade...
1514,1514,Leading design ideation sessions to ensure we ...,2,Leading design ideation sessions to ensure we ...


In [11]:
# Assign every training row to one of CFG.n_splits folds, stratified on the label.
# The fold id of the split in which the row is a validation row is stored in "kfold".
skf = StratifiedKFold(n_splits=CFG.n_splits,shuffle=True,random_state=2022)
for fold, ( _, val_) in enumerate(skf.split(train, train.label)):
    # NOTE(review): val_ holds positional indices but is used with .loc, so this
    # relies on `train` still having its default 0..n-1 index.
    train.loc[val_ , "kfold"] = int(fold)
    
# The column is created by partial assignment above, so cast it to int once it is fully filled.
train["kfold"] = train["kfold"].astype(int)

In [12]:
# Load the tokenizer that matches CFG.model and keep a copy next to the model artifacts.
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_MODEL_DIR+'tokenizer/')
# Attach the tokenizer to the config so other cells can reach it through CFG.
CFG.tokenizer = tokenizer
# NOTE(review): SEP is not referenced in the visible part of the notebook.
SEP = tokenizer.sep_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes the ``inputs`` column on the fly.

    The base class is spelled out in full because this class reuses the name
    ``Dataset``; ``class Dataset(Dataset)`` would subclass the previous
    definition of itself every time the cell is re-run.

    Args:
        df: DataFrame with an ``inputs`` text column and, for labelled data,
            an integer ``label`` column.
        tokenizer: Tokenizer exposing ``encode_plus``.
        max_length: Maximum number of tokens; longer texts are truncated.

    Each item is a dict of plain Python lists (``input_ids``, ``attention_mask``
    and, when the tokenizer returns them, ``token_type_ids``) plus ``target``
    when labels are available. No padding is applied here; that is left to the
    collate function.
    """

    def __init__(self, df, tokenizer, max_length):
        self.df = df
        # Use the arguments that were passed in rather than the CFG / `tokenizer` globals.
        self.max_len = max_length
        self.text = df['inputs'].values
        self.tokenizer = tokenizer
        # Unlabelled frames (e.g. a test split) simply have no targets.
        self.targets = df['label'].values if 'label' in df.columns else None

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        text = self.text[index]
        inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length = self.max_len
        )
        samples = {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
        }
        if self.targets is not None:
            samples['target'] = self.targets[index]

        if 'token_type_ids' in inputs:
            samples['token_type_ids'] = inputs['token_type_ids']
            
        return samples

In [14]:
# Dynamic Padding (Collate)
#collate_fn = DataCollatorWithPadding(tokenizer=CFG.tokenizer)
class Collate:
    """Collate function that pads each batch only to its own longest sequence.

    Args:
        tokenizer: Object exposing ``padding_side`` and ``pad_token_id``.
        isTrain: When True, each sample's ``target`` is collected as well.
    """

    def __init__(self, tokenizer, isTrain=True):
        self.tokenizer = tokenizer
        self.isTrain = isTrain

    def _pad(self, sequence, width, fill_value):
        """Extend ``sequence`` (a list) to ``width`` items on the tokenizer's padding side."""
        filler = (width - len(sequence)) * [fill_value]
        if self.tokenizer.padding_side == "right":
            return sequence + filler
        return filler + sequence

    def __call__(self, batch):
        """Turn a list of sample dicts into a dict of ``torch.long`` tensors."""
        token_ids = [sample["input_ids"] for sample in batch]
        masks = [sample["attention_mask"] for sample in batch]
        if self.isTrain:
            targets = [sample["target"] for sample in batch]

        # Longest sequence in this batch decides the padded width.
        width = max(len(ids) for ids in token_ids)
        pad_id = self.tokenizer.pad_token_id

        output = {
            "input_ids": torch.tensor(
                [self._pad(ids, width, pad_id) for ids in token_ids], dtype=torch.long
            ),
            "attention_mask": torch.tensor(
                [self._pad(mask, width, 0) for mask in masks], dtype=torch.long
            ),
        }
        if self.isTrain:
            output["target"] = torch.tensor(targets, dtype=torch.long)

        return output
    
# Shared collate function; train_loop passes it to both the training and the validation DataLoader.
collate_fn = Collate(CFG.tokenizer, isTrain=True)

In [15]:
class MaxPooling(nn.Module):
    """Max-pool token embeddings over the sequence dimension, ignoring padding.

    Assumes ``last_hidden_state`` is (batch, seq_len, hidden) and
    ``attention_mask`` is (batch, seq_len) with non-zero entries for real
    tokens - TODO confirm against the caller (the class is not used in the
    visible part of the notebook).
    """

    def __init__(self):
        super(MaxPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        """Return the element-wise maximum over dim 1 of the non-padded positions.

        Args:
            last_hidden_state: Token embeddings.
            attention_mask: Padding mask, or ``None`` to pool over every position.

        Returns:
            Tensor with dim 1 reduced away, holding the pooled values.
        """
        if attention_mask is not None:
            # Push padded positions to the lowest representable value so they can never win the max.
            is_padding = attention_mask.unsqueeze(-1) == 0
            lowest = torch.finfo(last_hidden_state.dtype).min
            last_hidden_state = last_hidden_state.masked_fill(is_padding, lowest)
        # torch.max with a dim returns (values, indices); the pooled embedding is the values.
        max_pooling_embeddings, _ = torch.max(last_hidden_state, 1)
        return max_pooling_embeddings

In [16]:
# ====================================================
# Model
# ====================================================
class CustomModel(nn.Module):
    """Pretrained transformer backbone followed by a linear classification head.

    Behaviour is driven by the global ``CFG``:
    ``gradient_checkpointing`` enables checkpointing on the backbone,
    ``freezing`` freezes the embeddings and the first two encoder layers, and
    ``target_size`` sets the number of output classes.
    """

    def __init__(self, model_name):
        super().__init__()
        self.model = AutoModel.from_pretrained(model_name)

        if CFG.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        if CFG.freezing:
            backbone = self.model
            for frozen_part in (backbone.embeddings, backbone.encoder.layer[:2]):
                freeze(frozen_part)
            # Side effect: a one-shot iterator over the still-trainable backbone
            # parameters is published on the global config.
            CFG.after_freezed_parameters = filter(
                lambda parameter: parameter.requires_grad, backbone.parameters()
            )

        self.config = AutoConfig.from_pretrained(model_name)
        # Dropout (CFG.dropout) and MaxPooling were tried here and are currently disabled.
        self.fc = nn.Linear(self.config.hidden_size, CFG.target_size)

    def forward(self, ids, mask):
        """Return class logits computed from the first token's final hidden state."""
        backbone_out = self.model(
            input_ids=ids,
            attention_mask=mask,
            output_hidden_states=False,
        )
        first_token_state = backbone_out[0][:, 0, :]
        return self.fc(first_token_state)

In [17]:
def asMinutes(s):
    """Format a duration in seconds as ``"<minutes>m <seconds>s"`` (fractions are truncated)."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return "%dm %ds" % (whole_minutes, leftover_seconds)

def timeSince(since, percent):
    """Describe elapsed and estimated remaining time.

    Args:
        since: Start timestamp as returned by ``time.time()``.
        percent: Fraction of the work completed so far (must be non-zero).

    Returns:
        A string such as ``"0m 18s (remain 0m 36s)"``.
    """
    elapsed = time.time() - since
    # Linear extrapolation: total = elapsed / fraction done.
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return "%s (remain %s)" % (asMinutes(elapsed), asMinutes(remaining))

def get_scheduler(cfg, optimizer, num_train_steps):
    """Build the learning-rate scheduler selected by ``cfg.scheduler``.

    Args:
        cfg: Config exposing ``scheduler`` ('linear' or 'cosine'),
            ``num_warmup_steps`` and, for 'cosine', ``num_cycles``.
        optimizer: Optimizer whose learning rate is scheduled.
        num_train_steps: Total number of scheduler steps over the whole run.

    Returns:
        The scheduler created by the matching transformers factory.

    Raises:
        ValueError: If ``cfg.scheduler`` is not one of the supported names.
    """
    if cfg.scheduler == 'linear':
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
        )
    elif cfg.scheduler == 'cosine':
        scheduler = get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
        )
    else:
        # Fail with a clear message instead of an UnboundLocalError at the return below.
        raise ValueError(
            f"Unsupported scheduler {cfg.scheduler!r}; expected 'linear' or 'cosine'."
        )
    return scheduler

def train_one_epoch(model, optimizer, scheduler, dataloader, device, epoch):
    """Train ``model`` for one pass over ``dataloader``.

    Gradients are accumulated over ``CFG.n_accumulate`` batches before each
    optimizer (and scheduler) step. Progress is printed every
    ``CFG.print_freq`` steps and on the last step.

    Args:
        model: Module called as ``model(ids, mask)`` and returning logits.
        optimizer: Optimizer to step.
        scheduler: Learning-rate scheduler stepped after every optimizer step, or ``None``.
        dataloader: Yields dicts with ``input_ids``, ``attention_mask`` and ``target`` tensors.
        device: Device the batch tensors are moved to.
        epoch: Zero-based epoch index, used only for the progress message.

    Returns:
        The sample-weighted mean training loss of the epoch, or NaN when the
        dataloader yields no batches.
    """
    model.train()

    dataset_size = 0
    running_loss = 0
    # Defined up front so that an empty dataloader cannot leave it unbound.
    epoch_loss = float('nan')

    start = time.time()

    for step, data in enumerate(dataloader):
        ids = data['input_ids'].to(device, dtype=torch.long)
        mask = data['attention_mask'].to(device, dtype=torch.long)
        targets = data['target'].to(device, dtype=torch.long)

        batch_size = ids.size(0)
        
        outputs = model(ids, mask)
        loss = criterion(outputs, targets)

        # Only the back-propagated loss is scaled for accumulation; the
        # unscaled value is what gets reported, so the logged loss does not
        # shrink when CFG.n_accumulate > 1.
        (loss / CFG.n_accumulate).backward()
        if (step +1) % CFG.n_accumulate == 0:
            optimizer.step()

            optimizer.zero_grad()
            if scheduler is not None:
                scheduler.step()
        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size

        if step % CFG.print_freq == 0 or step == (len(dataloader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  .format(epoch+1, step, len(dataloader), 
                          remain=timeSince(start, float(step+1)/len(dataloader))))

    gc.collect()

    return epoch_loss


@torch.no_grad()
def valid_one_epoch(model, dataloader, device, epoch):
    """Evaluate ``model`` on ``dataloader`` without tracking gradients.

    Args:
        model: Module called as ``model(ids, mask)`` and returning logits.
        dataloader: Yields dicts with ``input_ids``, ``attention_mask`` and ``target`` tensors.
        device: Device the batch tensors are moved to.
        epoch: Unused; kept for signature symmetry with ``train_one_epoch``.

    Returns:
        A tuple ``(mean_loss, logits)``: the sample-weighted mean loss and a
        NumPy array with the model outputs of all batches concatenated in
        loader order.
    """
    model.eval()

    seen_samples = 0
    loss_sum = 0
    batch_logits = []
    started_at = time.time()

    for step, batch in enumerate(dataloader):
        ids = batch['input_ids'].to(device, dtype=torch.long)
        mask = batch['attention_mask'].to(device, dtype=torch.long)
        targets = batch['target'].to(device, dtype=torch.long)

        n_in_batch = ids.size(0)
        logits = model(ids, mask)
        batch_loss = criterion(logits, targets)
        batch_logits.append(logits.to('cpu').numpy())

        # Weight each batch by its size so a smaller last batch is averaged correctly.
        loss_sum += (batch_loss.item()* n_in_batch)
        seen_samples += n_in_batch

        is_last_step = step == (len(dataloader)-1)
        if step % CFG.print_freq == 0 or is_last_step:
            fraction_done = float(step+1)/len(dataloader)
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  .format(step, len(dataloader),
                          remain=timeSince(started_at, fraction_done)))

    pred = np.concatenate(batch_logits)
    epoch_loss = loss_sum / seen_samples

    return epoch_loss, pred

In [18]:
def train_loop(fold):
    """Train and validate one cross-validation fold.

    Rows of the global ``train`` frame whose ``kfold`` equals ``fold`` form the
    validation split; all other rows are used for training. After every epoch
    the macro F1 on the validation split is computed, and the model weights
    together with the validation logits are saved whenever the score improves.

    Args:
        fold: Fold id to hold out for validation.

    Returns:
        The validation DataFrame with four extra columns holding the logits of
        the best epoch, one column per class.
    """
    #wandb.watch(model, log_freq=100)

    LOGGER.info(f'-------------fold:{fold} training-------------')

    train_data = train[train.kfold != fold].reset_index(drop=True)
    valid_data = train[train.kfold == fold].reset_index(drop=True)
    valid_labels = valid_data.label.values

    trainDataset = Dataset(train_data, CFG.tokenizer, CFG.max_len)
    validDataset = Dataset(valid_data, CFG.tokenizer, CFG.max_len)

    train_loader = DataLoader(trainDataset,
                              batch_size = CFG.batch_size,
                              shuffle=True,
                              collate_fn = collate_fn,
                              num_workers = CFG.num_workers,
                              pin_memory = True,
                              drop_last=True)
    
    valid_loader = DataLoader(validDataset,
                              batch_size = CFG.batch_size*2,
                              shuffle=False,
                              collate_fn = collate_fn,
                              num_workers = CFG.num_workers,
                              pin_memory = True,
                              drop_last=False)
    
    model = CustomModel(CFG.model)
    torch.save(model.config, OUTPUT_MODEL_DIR+'config.pth')
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=CFG.lr, weight_decay=CFG.weigth_decay)
    # The scheduler is stepped once per optimizer step, so count those exactly:
    # the loader drops the last incomplete batch, and an optimizer step happens
    # only every CFG.n_accumulate batches.
    steps_per_epoch = len(train_loader) // CFG.n_accumulate
    num_train_steps = steps_per_epoch * CFG.epochs
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    best_model_path = OUTPUT_MODEL_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth"

    # loop
    # Start below any reachable score so the first epoch is always checkpointed;
    # otherwise a run that never scores above 0 would leave no file to load below.
    best_score = -np.inf

    for epoch in range(CFG.epochs):
        start_time = time.time()

        train_epoch_loss = train_one_epoch(model, optimizer, scheduler, train_loader, device, epoch)
        valid_epoch_loss, pred = valid_one_epoch(model, valid_loader, device, epoch)

        score = get_score(pred, valid_labels)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {train_epoch_loss:.4f}  avg_val_loss: {valid_epoch_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')
        if CFG.wandb:
            # NOTE(review): `wandb` is not imported in the visible part of the notebook.
            wandb.log({f"[fold{fold}] epoch": epoch+1, 
                       f"[fold{fold}] avg_train_loss": train_epoch_loss, 
                       f"[fold{fold}] avg_val_loss": valid_epoch_loss,
                       f"[fold{fold}] score": score})
            
        if score > best_score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': pred},
                        best_model_path)
            
    # Reload the validation logits that belong to the best epoch (not the last one).
    predictions = torch.load(best_model_path, 
                             map_location=torch.device('cpu'))['predictions']
    valid_data['Data scientist'] = predictions[:, 0]
    valid_data['Machine learning engineer'] = predictions[:, 1]
    valid_data['Software engineer'] = predictions[:, 2]
    valid_data['Consultant'] = predictions[:, 3]
    
    
    temp = valid_data[['Data scientist','Machine learning engineer','Software engineer','Consultant']].values.tolist()
    print(get_score(temp, valid_data['label'].values))

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_data

In [19]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        """Log the macro F1 of the out-of-fold logits stored in ``oof_df``."""
        labels = oof_df['label'].values
        preds = oof_df[['Data scientist','Machine learning engineer','Software engineer','Consultant']].values.tolist()
        score = get_score(preds, labels)
        LOGGER.info(f'Score: {score:<.4f}')
    
    if CFG.train:
        # Collect the per-fold frames and concatenate once at the end instead of
        # growing a DataFrame inside the loop.
        fold_frames = []
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(fold)
                fold_frames.append(_oof_df)
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
        if fold_frames:
            oof_df = pd.concat(fold_frames).reset_index(drop=True)
            LOGGER.info("========== CV ==========")
            get_result(oof_df)
            oof_df.to_pickle(OUTPUT_MODEL_DIR+'oof_df.pkl')
            oof_df.to_csv(OUTPUT_MODEL_DIR+'oof_df.csv', index=False)
        else:
            # Nothing was trained (CFG.trn_fold selects no fold in range(CFG.n_fold)),
            # so there is no score to compute and nothing to save.
            oof_df = pd.DataFrame()
            LOGGER.info("No fold was trained; skipping CV score and OOF export.")

-------------fold:0 training-------------
Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceCl

Epoch: [1][0/151] Elapsed 0m 1s (remain 3m 50s) 
Epoch: [1][50/151] Elapsed 0m 18s (remain 0m 36s) 
Epoch: [1][100/151] Elapsed 0m 37s (remain 0m 18s) 
Epoch: [1][150/151] Elapsed 0m 54s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) 


Epoch 1 - avg_train_loss: 1.1142  avg_val_loss: 0.9751  time: 59s
Epoch 1 - Score: 0.5358
Epoch 1 - Save Best Score: 0.5358 Model


EVAL: [18/19] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [2][0/151] Elapsed 0m 0s (remain 1m 39s) 
Epoch: [2][50/151] Elapsed 0m 19s (remain 0m 37s) 
Epoch: [2][100/151] Elapsed 0m 36s (remain 0m 17s) 
Epoch: [2][150/151] Elapsed 0m 52s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) 


Epoch 2 - avg_train_loss: 0.7053  avg_val_loss: 0.6704  time: 57s
Epoch 2 - Score: 0.6206
Epoch 2 - Save Best Score: 0.6206 Model


EVAL: [18/19] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [3][0/151] Elapsed 0m 0s (remain 0m 45s) 
Epoch: [3][50/151] Elapsed 0m 20s (remain 0m 39s) 
Epoch: [3][100/151] Elapsed 0m 35s (remain 0m 17s) 
Epoch: [3][150/151] Elapsed 0m 52s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 6s) 


Epoch 3 - avg_train_loss: 0.5051  avg_val_loss: 0.7198  time: 57s
Epoch 3 - Score: 0.6619
Epoch 3 - Save Best Score: 0.6619 Model


EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) 
Epoch: [4][0/151] Elapsed 0m 0s (remain 1m 12s) 
Epoch: [4][50/151] Elapsed 0m 17s (remain 0m 34s) 
Epoch: [4][100/151] Elapsed 0m 35s (remain 0m 17s) 
Epoch: [4][150/151] Elapsed 0m 52s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) 


Epoch 4 - avg_train_loss: 0.3468  avg_val_loss: 0.6991  time: 56s
Epoch 4 - Score: 0.7099
Epoch 4 - Save Best Score: 0.7099 Model


EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) 
Epoch: [5][0/151] Elapsed 0m 0s (remain 1m 24s) 
Epoch: [5][50/151] Elapsed 0m 16s (remain 0m 32s) 
Epoch: [5][100/151] Elapsed 0m 36s (remain 0m 18s) 
Epoch: [5][150/151] Elapsed 0m 53s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 6s) 


Epoch 5 - avg_train_loss: 0.2606  avg_val_loss: 0.7151  time: 57s
Epoch 5 - Score: 0.7119
Epoch 5 - Save Best Score: 0.7119 Model


EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) 


Score: 0.7119
-------------fold:1 training-------------


0.7118982257565999


Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/151] Elapsed 0m 0s (remain 1m 16s) 
Epoch: [1][50/151] Elapsed 0m 18s (remain 0m 36s) 
Epoch: [1][100/151] Elapsed 0m 36s (remain 0m 18s) 
Epoch: [1][150/151] Elapsed 0m 52s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 6s) 


Epoch 1 - avg_train_loss: 1.0491  avg_val_loss: 0.8081  time: 56s
Epoch 1 - Score: 0.5004
Epoch 1 - Save Best Score: 0.5004 Model


EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) 
Epoch: [2][0/151] Elapsed 0m 0s (remain 0m 41s) 
Epoch: [2][50/151] Elapsed 0m 18s (remain 0m 36s) 
Epoch: [2][100/151] Elapsed 0m 34s (remain 0m 17s) 
Epoch: [2][150/151] Elapsed 0m 52s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 6s) 


Epoch 2 - avg_train_loss: 0.7098  avg_val_loss: 0.6714  time: 56s
Epoch 2 - Score: 0.7087
Epoch 2 - Save Best Score: 0.7087 Model


EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) 
Epoch: [3][0/151] Elapsed 0m 0s (remain 0m 44s) 
Epoch: [3][50/151] Elapsed 0m 17s (remain 0m 33s) 
Epoch: [3][100/151] Elapsed 0m 35s (remain 0m 17s) 
Epoch: [3][150/151] Elapsed 0m 52s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 6s) 


Epoch 3 - avg_train_loss: 0.4979  avg_val_loss: 0.5875  time: 56s
Epoch 3 - Score: 0.7090
Epoch 3 - Save Best Score: 0.7090 Model


EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) 
Epoch: [4][0/151] Elapsed 0m 0s (remain 0m 42s) 
Epoch: [4][50/151] Elapsed 0m 17s (remain 0m 34s) 
Epoch: [4][100/151] Elapsed 0m 34s (remain 0m 16s) 
Epoch: [4][150/151] Elapsed 0m 52s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 6s) 


Epoch 4 - avg_train_loss: 0.3523  avg_val_loss: 0.5930  time: 56s
Epoch 4 - Score: 0.7143
Epoch 4 - Save Best Score: 0.7143 Model


EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) 
Epoch: [5][0/151] Elapsed 0m 0s (remain 0m 42s) 
Epoch: [5][50/151] Elapsed 0m 17s (remain 0m 33s) 
Epoch: [5][100/151] Elapsed 0m 33s (remain 0m 16s) 
Epoch: [5][150/151] Elapsed 0m 52s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 6s) 


Epoch 5 - avg_train_loss: 0.2571  avg_val_loss: 0.5967  time: 56s
Epoch 5 - Score: 0.7514
Epoch 5 - Save Best Score: 0.7514 Model


EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) 


Score: 0.7514
-------------fold:2 training-------------


0.7514232290374676


Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/151] Elapsed 0m 0s (remain 0m 55s) 
Epoch: [1][50/151] Elapsed 0m 15s (remain 0m 31s) 
Epoch: [1][100/151] Elapsed 0m 32s (remain 0m 16s) 
Epoch: [1][150/151] Elapsed 0m 49s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 4s) 


Epoch 1 - avg_train_loss: 1.0811  avg_val_loss: 0.6558  time: 54s
Epoch 1 - Score: 0.5814
Epoch 1 - Save Best Score: 0.5814 Model


EVAL: [18/19] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [2][0/151] Elapsed 0m 0s (remain 0m 44s) 
Epoch: [2][50/151] Elapsed 0m 16s (remain 0m 32s) 
Epoch: [2][100/151] Elapsed 0m 32s (remain 0m 16s) 
Epoch: [2][150/151] Elapsed 0m 50s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 4s) 


Epoch 2 - avg_train_loss: 0.6977  avg_val_loss: 0.6328  time: 55s
Epoch 2 - Score: 0.6497
Epoch 2 - Save Best Score: 0.6497 Model


EVAL: [18/19] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [3][0/151] Elapsed 0m 0s (remain 0m 44s) 
Epoch: [3][50/151] Elapsed 0m 15s (remain 0m 31s) 
Epoch: [3][100/151] Elapsed 0m 32s (remain 0m 16s) 
Epoch: [3][150/151] Elapsed 0m 50s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 4s) 


Epoch 3 - avg_train_loss: 0.5157  avg_val_loss: 0.5469  time: 55s
Epoch 3 - Score: 0.7256
Epoch 3 - Save Best Score: 0.7256 Model


EVAL: [18/19] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [4][0/151] Elapsed 0m 0s (remain 0m 56s) 
Epoch: [4][50/151] Elapsed 0m 18s (remain 0m 36s) 
Epoch: [4][100/151] Elapsed 0m 34s (remain 0m 16s) 
Epoch: [4][150/151] Elapsed 0m 49s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 4s) 


Epoch 4 - avg_train_loss: 0.3844  avg_val_loss: 0.5156  time: 54s
Epoch 4 - Score: 0.7632
Epoch 4 - Save Best Score: 0.7632 Model


EVAL: [18/19] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [5][0/151] Elapsed 0m 0s (remain 1m 12s) 
Epoch: [5][50/151] Elapsed 0m 18s (remain 0m 36s) 
Epoch: [5][100/151] Elapsed 0m 33s (remain 0m 16s) 
Epoch: [5][150/151] Elapsed 0m 50s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 4s) 


Epoch 5 - avg_train_loss: 0.3056  avg_val_loss: 0.5257  time: 55s
Epoch 5 - Score: 0.7628


EVAL: [18/19] Elapsed 0m 4s (remain 0m 0s) 
0.7632236262864536


Score: 0.7632
-------------fold:3 training-------------
Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a Ber

Epoch: [1][0/151] Elapsed 0m 0s (remain 1m 13s) 
Epoch: [1][50/151] Elapsed 0m 16s (remain 0m 33s) 
Epoch: [1][100/151] Elapsed 0m 33s (remain 0m 16s) 
Epoch: [1][150/151] Elapsed 0m 50s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 8s) 


Epoch 1 - avg_train_loss: 1.1813  avg_val_loss: 1.0263  time: 55s
Epoch 1 - Score: 0.3566
Epoch 1 - Save Best Score: 0.3566 Model


EVAL: [18/19] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [2][0/151] Elapsed 0m 0s (remain 1m 18s) 
Epoch: [2][50/151] Elapsed 0m 16s (remain 0m 32s) 
Epoch: [2][100/151] Elapsed 0m 32s (remain 0m 16s) 
Epoch: [2][150/151] Elapsed 0m 49s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 8s) 


Epoch 2 - avg_train_loss: 0.8198  avg_val_loss: 0.7334  time: 54s
Epoch 2 - Score: 0.5616
Epoch 2 - Save Best Score: 0.5616 Model


EVAL: [18/19] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [3][0/151] Elapsed 0m 0s (remain 0m 39s) 
Epoch: [3][50/151] Elapsed 0m 15s (remain 0m 30s) 
Epoch: [3][100/151] Elapsed 0m 32s (remain 0m 16s) 
Epoch: [3][150/151] Elapsed 0m 49s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 8s) 


Epoch 3 - avg_train_loss: 0.5647  avg_val_loss: 0.7097  time: 54s
Epoch 3 - Score: 0.6588
Epoch 3 - Save Best Score: 0.6588 Model


EVAL: [18/19] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [4][0/151] Elapsed 0m 0s (remain 0m 53s) 
Epoch: [4][50/151] Elapsed 0m 17s (remain 0m 33s) 
Epoch: [4][100/151] Elapsed 0m 33s (remain 0m 16s) 
Epoch: [4][150/151] Elapsed 0m 50s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 8s) 


Epoch 4 - avg_train_loss: 0.4016  avg_val_loss: 0.7188  time: 55s
Epoch 4 - Score: 0.6580


EVAL: [18/19] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [5][0/151] Elapsed 0m 0s (remain 0m 43s) 
Epoch: [5][50/151] Elapsed 0m 19s (remain 0m 37s) 
Epoch: [5][100/151] Elapsed 0m 35s (remain 0m 17s) 
Epoch: [5][150/151] Elapsed 0m 52s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 8s) 


Epoch 5 - avg_train_loss: 0.3196  avg_val_loss: 0.7233  time: 57s
Epoch 5 - Score: 0.6631
Epoch 5 - Save Best Score: 0.6631 Model


EVAL: [18/19] Elapsed 0m 4s (remain 0m 0s) 


Score: 0.6631
-------------fold:4 training-------------


0.663138384473372


Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/151] Elapsed 0m 0s (remain 0m 47s) 
Epoch: [1][50/151] Elapsed 0m 16s (remain 0m 32s) 
Epoch: [1][100/151] Elapsed 0m 35s (remain 0m 17s) 
Epoch: [1][150/151] Elapsed 0m 50s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 5s) 


Epoch 1 - avg_train_loss: 1.1196  avg_val_loss: 0.8799  time: 55s
Epoch 1 - Score: 0.4967
Epoch 1 - Save Best Score: 0.4967 Model


EVAL: [18/19] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [2][0/151] Elapsed 0m 0s (remain 0m 58s) 
Epoch: [2][50/151] Elapsed 0m 16s (remain 0m 32s) 
Epoch: [2][100/151] Elapsed 0m 33s (remain 0m 16s) 
Epoch: [2][150/151] Elapsed 0m 51s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 5s) 


Epoch 2 - avg_train_loss: 0.7368  avg_val_loss: 0.6930  time: 56s
Epoch 2 - Score: 0.5607
Epoch 2 - Save Best Score: 0.5607 Model


EVAL: [18/19] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [3][0/151] Elapsed 0m 0s (remain 0m 44s) 
Epoch: [3][50/151] Elapsed 0m 15s (remain 0m 31s) 
Epoch: [3][100/151] Elapsed 0m 33s (remain 0m 16s) 
Epoch: [3][150/151] Elapsed 0m 50s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 5s) 


Epoch 3 - avg_train_loss: 0.5406  avg_val_loss: 0.6881  time: 55s
Epoch 3 - Score: 0.6406
Epoch 3 - Save Best Score: 0.6406 Model


EVAL: [18/19] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [4][0/151] Elapsed 0m 0s (remain 1m 17s) 
Epoch: [4][50/151] Elapsed 0m 18s (remain 0m 36s) 
Epoch: [4][100/151] Elapsed 0m 34s (remain 0m 17s) 
Epoch: [4][150/151] Elapsed 0m 51s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 5s) 


Epoch 4 - avg_train_loss: 0.4011  avg_val_loss: 0.6678  time: 55s
Epoch 4 - Score: 0.6885
Epoch 4 - Save Best Score: 0.6885 Model


EVAL: [18/19] Elapsed 0m 4s (remain 0m 0s) 
Epoch: [5][0/151] Elapsed 0m 0s (remain 0m 54s) 
Epoch: [5][50/151] Elapsed 0m 16s (remain 0m 32s) 
Epoch: [5][100/151] Elapsed 0m 33s (remain 0m 16s) 
Epoch: [5][150/151] Elapsed 0m 50s (remain 0m 0s) 
EVAL: [0/19] Elapsed 0m 0s (remain 0m 5s) 


Epoch 5 - avg_train_loss: 0.3102  avg_val_loss: 0.6826  time: 55s
Epoch 5 - Score: 0.6944
Epoch 5 - Save Best Score: 0.6944 Model


EVAL: [18/19] Elapsed 0m 4s (remain 0m 0s) 


Score: 0.6944
Score: 0.7172


0.694370406751439


In [20]:
# Reload the predictions file saved under OUTPUT_MODEL_DIR for a quick sanity check
# (presumably the out-of-fold predictions written by the training loop — confirm).
# The path is built with os.path.join, as the config cell does for its other output
# paths, so it is correct whether or not OUTPUT_MODEL_DIR ends with a separator.
# NOTE(review): the displayed frame carries an 'Unnamed: 0' column, presumably the
# index written by to_csv; it is left as-is so column positions stay stable for any
# later cell — confirm whether index_col=0 is wanted here.
A = pd.read_csv(os.path.join(OUTPUT_MODEL_DIR, 'oof_df.csv'))
A.head()

Unnamed: 0,id,description,label,inputs,kfold,Data scientist,Machine learning engineer,Software engineer,Consultant
0,1,"Designs and develops high quality, scalable a...",2,"Designs and develops high quality, scalable a...",0,-1.025049,-1.079266,3.107621,-1.23359
1,5,Participates in standard business and technica...,2,Participates in standard business and technica...,0,0.331545,-1.323937,1.036907,-0.43477
2,7,"Facilitate pre-sales initiatives, such as live...",3,"Facilitate pre-sales initiatives, such as live...",0,-1.148837,-3.670311,-1.076519,3.777849
3,9,Maintain and improve existing predictive model...,0,Maintain and improve existing predictive model...,0,4.05711,-1.416694,-2.953991,-1.147857
4,13,"Research, prototype, identify, and build predi...",1,"Research, prototype, identify, and build predi...",0,1.457274,1.631755,-0.706486,-2.514197
