In [1]:
# Mount Google Drive into the Colab runtime; the input/output paths used below live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
import os
import gc
import math
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import re

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,GroupKFold
from sklearn.metrics import log_loss,f1_score

from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
# Competition data (train.csv, test.csv, submit_sample.csv are read from here).
INPUT_DIR = '/content/drive/MyDrive/Competitions/Signate/Student Cup 2022/input/'
# Root for everything this notebook writes (the training log goes directly in here).
OUTPUT_DIR = '/content/drive/MyDrive/Competitions/Signate/Student Cup 2022/output/'
# Submission folder; not referenced again in the visible part of the notebook.
OUTPUT_SUB_DIR = os.path.join(OUTPUT_DIR,'Submission')
# Tokenizer copy, model config, per-fold checkpoints and out-of-fold predictions are stored here.
OUTPUT_MODEL_DIR = os.path.join(OUTPUT_DIR,'Model/DeBERTa-base/')

In [6]:
class CFG:
    """Run configuration: a plain namespace of class attributes read by the cells below."""
    wandb = False  # when True, train_loop calls wandb.log (wandb is not imported in the visible source)
    apex = True  # not read anywhere in the visible source
    model = 'microsoft/deberta-v3-base'  # checkpoint name for AutoTokenizer / AutoModel / AutoConfig
    seed = 42  # default seed of seed_everything
    n_splits = 5  # number of StratifiedKFold splits
    max_len = 512  # maximum token length used when tokenising
    dropout = 0.2  # dropout probability before the classification head
    target_size=4  # number of output classes of the classification head
    n_accumulate=1  # gradient-accumulation factor in train_one_epoch
    print_freq = 50  # print progress every this many steps
    min_lr=1e-6  # not read anywhere in the visible source
    scheduler = 'cosine'  # 'linear' or 'cosine', see get_scheduler
    batch_size = 16  # training batch size; the validation loader uses twice this
    num_workers = 2  # DataLoader worker processes
    lr = 3e-5  # AdamW learning rate
    weigth_decay = 0.01  # AdamW weight decay (the attribute name is misspelled; train_loop reads it under this name)
    epochs = 5  # epochs per fold
    n_fold = 5  # the main cell iterates over range(n_fold); kept separately from n_splits
    trn_fold = [0, 1, 2, 3, 4]  # only folds listed here are trained
    train = True  # gate for the training loop in the main cell
    num_warmup_steps = 0  # warm-up steps passed to the LR scheduler
    num_cycles=0.5  # num_cycles passed to the cosine scheduler
    debug = False  # not read anywhere in the visible source
    debug_ver2 = False  # not read anywhere in the visible source
    gradient_checkpointing = True  # CustomModel calls gradient_checkpointing_enable() on the backbone
    freezing = True  # CustomModel freezes the embeddings and the first 2 encoder layers

In [7]:
# Loss Func
def criterion(outputs, labels):
    """Return the mean cross-entropy loss of the logits `outputs` against the class indices `labels`."""
    return F.cross_entropy(outputs, labels)

def softmax(z):
    """Row-wise softmax of a 2-D NumPy array.

    Raises AssertionError when `z` is not two-dimensional.
    """
    assert z.ndim == 2
    # Subtracting each row's maximum keeps exp() from overflowing; the ratio is unchanged.
    exps = np.exp(z - z.max(axis=1, keepdims=True))
    return exps / exps.sum(axis=1, keepdims=True)
"""
def get_score(y_true, y_pred):
    y_pred = softmax(y_pred)
    score = log_loss(y_true, y_pred)
    return round(score, 5)
"""
def get_score(outputs, labels):
    """Macro-averaged F1 of the arg-max class predictions.

    Args:
        outputs: 2-D array-like of per-class scores (logits), one row per sample.
        labels: 1-D array-like of true class indices.

    Returns:
        The macro F1 score as returned by sklearn's ``f1_score``.
    """
    # Softmax is monotonic per row, so the arg-max of the raw scores is the
    # predicted class; this also avoids the deprecated dim-less F.softmax call.
    predicted = np.argmax(np.asarray(outputs), axis=1)
    # sklearn's signature is f1_score(y_true, y_pred, ...).
    return f1_score(labels, predicted, average='macro')

def get_logger(filename=OUTPUT_DIR+'train'):
    """Return the module logger, writing to the console stream and to ``<filename>.log``.

    Args:
        filename: Log file path without the ``.log`` suffix.

    Returns:
        The ``logging.Logger`` named ``__name__`` at INFO level, carrying exactly
        one stream handler and one file handler.
    """
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # getLogger returns the same object on every call, so re-running the cell
    # would otherwise stack handlers and emit every message several times.
    for stale in list(logger.handlers):
        logger.removeHandler(stale)
        stale.close()
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

# Notebook-wide logger; with the default argument it writes to a stream handler and to OUTPUT_DIR + 'train.log'.
LOGGER = get_logger()

def seed_everything(seed=CFG.seed):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN kernels.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    # Only affects hash randomisation of Python processes started after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Cover every visible GPU, not just the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may pick different kernels from run to run; switch it off.
    torch.backends.cudnn.benchmark = False
    
# Use the configured seed so changing CFG.seed actually changes the run.
seed_everything(seed=CFG.seed)

def prepare_input(cfg, text, text_2=None):
    """Tokenise `text` (optionally paired with `text_2`) to a fixed length of `cfg.max_len`.

    Returns the mapping produced by `cfg.tokenizer`, with every value replaced
    by a `torch.long` tensor.
    """
    encoded = cfg.tokenizer(
        text,
        text_2,
        add_special_tokens=True,
        truncation=True,
        padding="max_length",
        max_length=cfg.max_len,
    )

    # Snapshot the keys first, then swap each value for a tensor in the same mapping.
    for field in list(encoded.keys()):
        encoded[field] = torch.tensor(encoded[field], dtype=torch.long)

    return encoded

In [8]:
def freeze(module):
    """
    Turns off gradient tracking for every parameter of `module`.
    """
    for weight in module.parameters():
        weight.requires_grad_(False)
        
def get_freezed_parameters(module):
    """
    Lists the names of the parameters of `module` whose `requires_grad` is False,
    in `named_parameters()` order.
    """
    return [
        name
        for name, weight in module.named_parameters()
        if not weight.requires_grad
    ]

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """
    Registers an `optim_bits` override for the 'weight' of each of the
    word / position / token_type embedding attributes found on `embeddings_path`.

    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930

    NOTE(review): `bnb` is not imported in the visible part of this notebook;
    it must be in scope before this is called with a module that has any of
    those attributes.
    """
    for prefix in ("word", "position", "token_type"):
        attr_name = f"{prefix}_embeddings"
        if not hasattr(embeddings_path, attr_name):
            continue

        manager = bnb.optim.GlobalOptimManager.get_instance()
        manager.register_module_override(
            getattr(embeddings_path, attr_name), 'weight', {'optim_bits': optim_bits}
        )

In [9]:
# Load the competition CSVs from INPUT_DIR.
train = pd.read_csv(os.path.join(INPUT_DIR, 'train.csv'))
test = pd.read_csv(os.path.join(INPUT_DIR, 'test.csv'))
submission_df = pd.read_csv(os.path.join(INPUT_DIR, 'submit_sample.csv'))

# Preview the first rows and the (rows, columns) shape of both frames.
display(train.head())
print(train.shape)
display(test.head())
print(test.shape)

Unnamed: 0,id,description,jobflag
0,0,<li>Develop cutting-edge web applications that...,3
1,1,"<li> Designs and develops high quality, scalab...",3
2,2,<li>Functions as a point person for Network St...,4
3,3,"<li> Work on the technical design, development...",3
4,4,<li>Quantify the resources required for a task...,4


(1516, 3)


Unnamed: 0,id,description
0,1516,<li>Building decision-making models and propos...
1,1517,<li>Educate homeowners on the benefits of sola...
2,1518,"<li><span>Design, develop, document, and imple..."
3,1519,<li>Apply advanced technical expertise and ski...
4,1520,<li>Project manage and deliver against our roa...


(1517, 2)


In [10]:
def remove_tag(x):
    """Return `x` with every `<...>` span (shortest match) deleted."""
    return re.sub(r"<[^>]*?>", '', x)

def cleaning(texts):
    """Strip HTML tags from every string in `texts` and return the results as a new list."""
    # Replacing every non-alphabetic character with a space was left disabled:
    #clean_punc = re.sub(r'[^a-zA-Z]', ' ', text)
    return [remove_tag(text) for text in texts]



from text_unidecode import unidecode
from typing import Dict, List, Tuple
import codecs

def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    """Codec error handler: emit the failing slice as UTF-8 bytes and resume right after it."""
    offending = error.object[error.start : error.end]
    resume_at = error.end
    return offending.encode("utf-8"), resume_at


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    """Codec error handler: decode the failing byte slice as cp1252 and resume right after it."""
    offending = error.object[error.start : error.end]
    resume_at = error.end
    return offending.decode("cp1252"), resume_at

# Register the two handlers above under the names that
# resolve_encodings_and_normalize passes as `errors=` when encoding/decoding.
codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)

def resolve_encodings_and_normalize(text: str) -> str:
    """Repair mixed utf-8 / cp1252 encoding damage in `text`, then pass it through `unidecode`."""
    # Each step falls back to the custom handlers registered with `codecs`
    # instead of raising on characters/bytes the target codec cannot handle.
    raw_bytes = text.encode("raw_unicode_escape")
    first_pass = raw_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    as_cp1252 = first_pass.encode("cp1252", errors="replace_encoding_with_utf8")
    repaired = as_cp1252.decode("utf-8", errors="replace_decoding_with_cp1252")
    return unidecode(repaired)

# Strip HTML tags from the raw descriptions (overwrites the 'description' column).
train['description'] = cleaning(train['description'])
test['description'] = cleaning(test['description'])
# 'inputs' is the text column the Dataset class tokenises: the cleaned description
# passed through resolve_encodings_and_normalize.
train['inputs'] = train['description'].apply(lambda x : resolve_encodings_and_normalize(x))
test['inputs'] = test['description'].apply(lambda x : resolve_encodings_and_normalize(x))
# Rename the target column and map class 4 to 0; other values are kept as they are.
# Presumably jobflag takes the values 1-4, giving labels 0-3 for CFG.target_size = 4 — TODO confirm.
train = train.rename(columns = {"jobflag": "label"})
train["label"] = train["label"].apply(lambda x : 0 if x == 4 else x)
train

Unnamed: 0,id,description,label,inputs
0,0,Develop cutting-edge web applications that per...,3,Develop cutting-edge web applications that per...
1,1,"Designs and develops high quality, scalable a...",3,"Designs and develops high quality, scalable a..."
2,2,Functions as a point person for Network Strate...,0,Functions as a point person for Network Strate...
3,3,"Work on the technical design, development, re...",3,"Work on the technical design, development, re..."
4,4,Quantify the resources required for a task/pro...,0,Quantify the resources required for a task/pro...
...,...,...,...,...
1511,1511,"Support detailed reporting, statistical analys...",1,"Support detailed reporting, statistical analys..."
1512,1512,Collaborate with teams to support the ML techn...,2,Collaborate with teams to support the ML techn...
1513,1513,Work with executives and other business leade...,1,Work with executives and other business leade...
1514,1514,Leading design ideation sessions to ensure we ...,3,Leading design ideation sessions to ensure we ...


In [11]:
# Assign every training row to exactly one stratified validation fold.
skf = StratifiedKFold(n_splits=CFG.n_splits,shuffle=True,random_state=2022)
# Create the column as integers up front; -1 marks "not assigned yet".
train["kfold"] = -1
for fold, ( _, val_) in enumerate(skf.split(train, train.label)):
    # split() yields positional row numbers, so write through iloc rather than
    # loc (loc would only be right while the index is a default RangeIndex).
    train.iloc[val_, train.columns.get_loc("kfold")] = fold

train["kfold"] = train["kfold"].astype(int)
assert (train["kfold"] >= 0).all(), "every row must belong to a validation fold"

In [12]:
# Load the tokenizer belonging to CFG.model and save a copy under OUTPUT_MODEL_DIR + 'tokenizer/'.
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_MODEL_DIR+'tokenizer/')
# Later cells read the tokenizer as CFG.tokenizer.
CFG.tokenizer = tokenizer
# Separator token string; not referenced again in the visible part of the notebook.
SEP = tokenizer.sep_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
class Dataset(Dataset):
    """Map-style dataset that tokenises the 'inputs' column of a DataFrame on access.

    Each item is a dict with un-padded 'input_ids' and 'attention_mask' lists,
    'token_type_ids' when the tokenizer returns them, and 'target' when the
    frame has a 'label' column. Padding is left to the collate function.

    Note: the class keeps the name of the imported torch `Dataset` base class
    because the rest of the notebook constructs it under that name.
    """
    def __init__(self, df, tokenizer, max_length):
        """
        Args:
            df: DataFrame with an 'inputs' text column and, optionally, a 'label' column.
            tokenizer: object exposing `encode_plus(text, truncation=..., add_special_tokens=..., max_length=...)`.
            max_length: maximum number of tokens per sample (longer texts are truncated).
        """
        self.df = df
        # Honour the constructor arguments instead of silently reading CFG / globals.
        self.max_len = max_length
        self.text = df['inputs'].values
        self.tokenizer = tokenizer
        # Unlabelled frames (e.g. the test set) simply produce items without 'target'.
        self.targets = df['label'].values if 'label' in df.columns else None

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        text = self.text[index]
        inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length = self.max_len
        )
        samples = {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
        }

        if self.targets is not None:
            samples['target'] = self.targets[index]

        if 'token_type_ids' in inputs:
            samples['token_type_ids'] = inputs['token_type_ids']

        return samples

In [14]:
# Dynamic Padding (Collate)
#collate_fn = DataCollatorWithPadding(tokenizer=CFG.tokenizer)
class Collate:
    """Collate function that pads each batch to its own longest sequence (dynamic padding)."""

    def __init__(self, tokenizer, isTrain=True):
        # The tokenizer supplies `padding_side` and `pad_token_id`.
        self.tokenizer = tokenizer
        # When True, the samples' 'target' values are batched as well.
        self.isTrain = isTrain

    def _pad(self, sequences, width, fill):
        """Pad every list in `sequences` with `fill` up to `width`, on the tokenizer's padding side."""
        pad_right = self.tokenizer.padding_side == "right"
        padded = []
        for seq in sequences:
            filler = (width - len(seq)) * [fill]
            padded.append(seq + filler if pad_right else filler + seq)
        return padded

    def __call__(self, batch):
        ids = [sample["input_ids"] for sample in batch]
        masks = [sample["attention_mask"] for sample in batch]
        targets = [sample["target"] for sample in batch] if self.isTrain else None

        # Longest token sequence in this batch decides the padded width.
        width = max([len(seq) for seq in ids])

        output = dict()
        output["input_ids"] = torch.tensor(
            self._pad(ids, width, self.tokenizer.pad_token_id), dtype=torch.long
        )
        output["attention_mask"] = torch.tensor(self._pad(masks, width, 0), dtype=torch.long)
        if self.isTrain:
            output["target"] = torch.tensor(targets, dtype=torch.long)

        return output
    
# Collate function shared by the training and validation DataLoaders; isTrain=True keeps the 'target' field.
collate_fn = Collate(CFG.tokenizer, isTrain=True)

In [15]:
class MeanPooling(nn.Module):
    """Average the token embeddings over the positions where the attention mask is non-zero."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask over the hidden dimension so it can weight each embedding.
        weights = attention_mask.unsqueeze(-1).expand_as(last_hidden_state).float()
        token_total = (last_hidden_state * weights).sum(dim=1)
        # Floor the denominator so a fully masked row divides by 1e-9 instead of 0.
        token_count = weights.sum(dim=1).clamp(min=1e-9)
        return token_total / token_count

In [16]:
# ====================================================
# Model
# ====================================================
class CustomModel(nn.Module):
    """Pretrained transformer backbone + masked mean pooling + dropout + linear classifier.

    Construction reads CFG.gradient_checkpointing, CFG.freezing, CFG.dropout
    and CFG.target_size.
    """
    def __init__(self, model_name):
        """
        Args:
            model_name: checkpoint identifier passed to AutoModel / AutoConfig.from_pretrained.
        """
        super(CustomModel, self).__init__()
        # Header (fast or normal)
        self.model = AutoModel.from_pretrained(model_name)

        # Gradient_checkpointing
        if CFG.gradient_checkpointing:
            (self.model).gradient_checkpointing_enable()

        # Freezing
        if CFG.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            # Store a list rather than a bare filter() iterator: an iterator is
            # exhausted after one pass, so a second reader would see nothing.
            CFG.after_freezed_parameters = [
                parameter for parameter in (self.model).parameters() if parameter.requires_grad
            ]

        self.config = AutoConfig.from_pretrained(model_name)
        self.drop = nn.Dropout(p=CFG.dropout)
        self.pooler = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, CFG.target_size)

    def forward(self, ids, mask):
        """Return the class logits produced by the head for token ids `ids` and attention mask `mask`."""
        out = self.model(input_ids=ids,
                         attention_mask=mask,
                         output_hidden_states=False)
        # Pool only over the positions the attention mask marks as valid.
        out = self.pooler(out.last_hidden_state, mask)
        out = self.drop(out)
        outputs = self.fc(out)
        return outputs

In [17]:
def asMinutes(s):
    """Format a duration of `s` seconds as '<minutes>m <seconds>s', both shown as integers."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return "%dm %ds" % (whole_minutes, leftover_seconds)

def timeSince(since, percent):
    """Return '<elapsed> (remain <estimate>)' for work started at `since` that is the fraction `percent` complete."""
    elapsed = time.time() - since
    # Extrapolate the total duration from the fraction done, then subtract what has already passed.
    remaining = elapsed / (percent) - elapsed
    return "%s (remain %s)" % (asMinutes(elapsed), asMinutes(remaining))

def get_scheduler(cfg, optimizer, num_train_steps):
    """Build the learning-rate scheduler selected by `cfg.scheduler`.

    Args:
        cfg: configuration exposing `scheduler` ('linear' or 'cosine'),
            `num_warmup_steps` and, for 'cosine', `num_cycles`.
        optimizer: optimizer whose learning rate is scheduled.
        num_train_steps: total number of scheduler steps over the whole run.

    Returns:
        The scheduler created by the matching transformers helper.

    Raises:
        ValueError: if `cfg.scheduler` names neither supported schedule.
    """
    if cfg.scheduler == 'linear':
        return get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
        )
    if cfg.scheduler == 'cosine':
        return get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
        )
    # Previously an unknown name fell through to an UnboundLocalError on `return scheduler`.
    raise ValueError(f"Unsupported scheduler {cfg.scheduler!r}; expected 'linear' or 'cosine'")

def train_one_epoch(model, optimizer, scheduler, dataloader, device, epoch):
    """Train `model` for one pass over `dataloader` with gradient accumulation.

    Args:
        model: callable as `model(ids, mask)` returning class logits.
        optimizer: optimizer updated every `CFG.n_accumulate` batches.
        scheduler: LR scheduler stepped after each optimizer update, or None.
        dataloader: yields dicts with 'input_ids', 'attention_mask' and 'target' tensors.
        device: device the batch tensors are moved to.
        epoch: zero-based epoch index, used only in the progress print.

    Returns:
        The sample-weighted mean of the (unscaled) batch losses, or 0.0 when
        the dataloader yields no batches.
    """
    model.train()

    dataset_size = 0
    running_loss = 0.0
    epoch_loss = 0.0  # stays defined when the dataloader is empty
    num_batches = len(dataloader)

    start = time.time()

    # Start from clean gradients so nothing carries over from an earlier epoch.
    optimizer.zero_grad()

    for step, data in enumerate(dataloader):
        ids = data['input_ids'].to(device, dtype=torch.long)
        mask = data['attention_mask'].to(device, dtype=torch.long)
        targets = data['target'].to(device, dtype=torch.long)

        batch_size = ids.size(0)

        outputs = model(ids, mask)
        loss = criterion(outputs, targets)

        # Scale only the gradient contribution; `loss` stays unscaled so the
        # reported epoch loss does not shrink with CFG.n_accumulate.
        (loss / CFG.n_accumulate).backward()

        # Also update on the final batch: otherwise the gradients of a trailing
        # partial accumulation group are never applied in this epoch and leak
        # into the first update of the next one.
        is_last_batch = step == num_batches - 1
        if (step + 1) % CFG.n_accumulate == 0 or is_last_batch:
            optimizer.step()

            optimizer.zero_grad()
            if scheduler is not None:
                scheduler.step()
        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size

        if step % CFG.print_freq == 0 or is_last_batch:
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  .format(epoch+1, step, num_batches,
                          remain=timeSince(start, float(step+1)/num_batches)))

    gc.collect()

    return epoch_loss


@torch.no_grad()
def valid_one_epoch(model, dataloader, device, epoch):
    """Evaluate `model` on `dataloader` without gradient tracking.

    Returns a tuple `(loss, pred)`: the sample-weighted mean loss and the
    model outputs of all batches concatenated into one NumPy array.
    `epoch` is accepted for symmetry with train_one_epoch and is not used.
    """
    model.eval()

    seen = 0
    loss_sum = 0
    started = time.time()
    batch_outputs = []

    for step, batch in enumerate(dataloader):
        ids = batch['input_ids'].to(device, dtype=torch.long)
        mask = batch['attention_mask'].to(device, dtype=torch.long)
        targets = batch['target'].to(device, dtype=torch.long)

        n_samples = ids.size(0)
        outputs = model(ids, mask)
        batch_loss = criterion(outputs, targets)
        batch_outputs.append(outputs.cpu().numpy())

        # Weight each batch by its size so a smaller last batch is not over-counted.
        loss_sum += batch_loss.item() * n_samples
        seen += n_samples
        epoch_loss = loss_sum / seen

        total = len(dataloader)
        if step % CFG.print_freq == 0 or step == total - 1:
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  .format(step, total,
                          remain=timeSince(started, float(step+1)/total)))

    pred = np.concatenate(batch_outputs)

    return epoch_loss, pred

In [18]:
def train_loop(fold):
    """Train and validate one cross-validation fold.

    Rows of the global `train` frame with `kfold == fold` form the validation
    set and the rest the training set. After every epoch the validation macro
    F1 is computed; whenever it improves, the model weights and validation
    outputs are saved to `<OUTPUT_MODEL_DIR><model>_fold<fold>_best.pth`.

    Args:
        fold: index of the fold held out for validation.

    Returns:
        The validation DataFrame with four extra columns holding the raw model
        outputs (logits) of the best epoch, one per class.

    Raises:
        RuntimeError: if no epoch was run, so there are no predictions to return.
    """
    #wandb.watch(model, log_freq=100)

    LOGGER.info(f'-------------fold:{fold} training-------------')

    train_data = train[train.kfold != fold].reset_index(drop=True)
    valid_data = train[train.kfold == fold].reset_index(drop=True)
    valid_labels = valid_data.label.values

    trainDataset = Dataset(train_data, CFG.tokenizer, CFG.max_len)
    validDataset = Dataset(valid_data, CFG.tokenizer, CFG.max_len)

    train_loader = DataLoader(trainDataset,
                              batch_size = CFG.batch_size,
                              shuffle=True,
                              collate_fn = collate_fn,
                              num_workers = CFG.num_workers,
                              pin_memory = True,
                              drop_last=True)

    valid_loader = DataLoader(validDataset,
                              batch_size = CFG.batch_size*2,
                              shuffle=False,
                              collate_fn = collate_fn,
                              num_workers = CFG.num_workers,
                              pin_memory = True,
                              drop_last=False)

    model = CustomModel(CFG.model)
    torch.save(model.config, OUTPUT_MODEL_DIR+'config.pth')
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=CFG.lr, weight_decay=CFG.weigth_decay)
    # The scheduler advances once per optimizer update. Count the batches the
    # loader really yields (drop_last=True discards the partial batch) and
    # account for gradient accumulation, so the schedule ends with training.
    steps_per_epoch = math.ceil(len(train_loader) / CFG.n_accumulate)
    num_train_steps = steps_per_epoch * CFG.epochs
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    checkpoint_path = OUTPUT_MODEL_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth"

    # loop
    # -inf (not 0) so the first epoch is always kept, even with a score of 0.
    best_score = float('-inf')
    best_pred = None

    for epoch in range(CFG.epochs):
        start_time = time.time()

        train_epoch_loss = train_one_epoch(model, optimizer, scheduler, train_loader, device, epoch)
        valid_epoch_loss, pred = valid_one_epoch(model, valid_loader, device, epoch)

        score = get_score(pred, valid_labels)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {train_epoch_loss:.4f}  avg_val_loss: {valid_epoch_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')
        if CFG.wandb:
            wandb.log({f"[fold{fold}] epoch": epoch+1,
                       f"[fold{fold}] avg_train_loss": train_epoch_loss,
                       f"[fold{fold}] avg_val_loss": valid_epoch_loss,
                       f"[fold{fold}] score": score})

        if score > best_score:
            best_score = score
            best_pred = pred
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': pred},
                        checkpoint_path)

    if best_pred is None:
        raise RuntimeError(f'fold {fold}: no epoch was run, so there are no predictions (CFG.epochs={CFG.epochs})')

    # Use the predictions kept in memory instead of re-reading the whole
    # checkpoint (weights included) from disk, which could also be a stale
    # file left by an earlier run.
    predictions = best_pred
    valid_data['Consultant'] = predictions[:, 0]
    valid_data['Data scientist'] = predictions[:, 1]
    valid_data['Machine learning engineer'] = predictions[:, 2]
    valid_data['Software engineer'] = predictions[:, 3]


    temp = valid_data[['Consultant','Data scientist','Machine learning engineer','Software engineer']].values.tolist()
    print(get_score(temp, valid_data['label'].values))

    # Drop this function's references first; empty_cache() can only release
    # memory that no live tensor is still using.
    del model, optimizer, scheduler
    torch.cuda.empty_cache()
    gc.collect()

    return valid_data

In [19]:
if __name__ == '__main__':

    def get_result(oof_df):
        """Log the macro F1 of the out-of-fold predictions stored in `oof_df`."""
        labels = oof_df['label'].values
        preds = oof_df[['Consultant','Data scientist','Machine learning engineer','Software engineer']].values.tolist()
        score = get_score(preds, labels)
        LOGGER.info(f'Score: {score:<.4f}')

    if CFG.train:
        # Collect the per-fold frames and concatenate once at the end instead
        # of re-copying a growing DataFrame on every iteration.
        fold_frames = []
        for fold in range(CFG.n_fold):
            if fold not in CFG.trn_fold:
                continue
            _oof_df = train_loop(fold)
            fold_frames.append(_oof_df)
            LOGGER.info(f"========== fold: {fold} result ==========")
            get_result(_oof_df)

        if fold_frames:
            oof_df = pd.concat(fold_frames).reset_index(drop=True)
            LOGGER.info("========== CV ==========")
            get_result(oof_df)
            oof_df.to_pickle(OUTPUT_MODEL_DIR+'oof_df.pkl')
            oof_df.to_csv(OUTPUT_MODEL_DIR+'oof_df.csv', index=False)
        else:
            # Nothing was trained (CFG.trn_fold selects no fold in range(CFG.n_fold));
            # scoring an empty frame would only fail with a KeyError.
            oof_df = pd.DataFrame()
            LOGGER.info("No fold was trained; skipping CV scoring and OOF export.")

-------------fold:0 training-------------
Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.bias', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceCl

Epoch: [1][0/75] Elapsed 0m 2s (remain 3m 22s) 
Epoch: [1][50/75] Elapsed 1m 6s (remain 0m 31s) 
Epoch: [1][74/75] Elapsed 1m 38s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 0s (remain 0m 8s) 


Epoch 1 - avg_train_loss: 1.0340  avg_val_loss: 0.8143  time: 106s
Epoch 1 - Score: 0.5431
Epoch 1 - Save Best Score: 0.5431 Model


EVAL: [9/10] Elapsed 0m 7s (remain 0m 0s) 
Epoch: [2][0/75] Elapsed 0m 1s (remain 2m 1s) 
Epoch: [2][50/75] Elapsed 1m 8s (remain 0m 32s) 
Epoch: [2][74/75] Elapsed 1m 39s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 0s (remain 0m 8s) 


Epoch 2 - avg_train_loss: 0.6275  avg_val_loss: 0.6923  time: 107s
Epoch 2 - Score: 0.6341
Epoch 2 - Save Best Score: 0.6341 Model


EVAL: [9/10] Elapsed 0m 7s (remain 0m 0s) 
Epoch: [3][0/75] Elapsed 0m 1s (remain 2m 7s) 
Epoch: [3][50/75] Elapsed 1m 9s (remain 0m 32s) 
Epoch: [3][74/75] Elapsed 1m 41s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 0s (remain 0m 8s) 


Epoch 3 - avg_train_loss: 0.4730  avg_val_loss: 0.6427  time: 109s
Epoch 3 - Score: 0.6584
Epoch 3 - Save Best Score: 0.6584 Model


EVAL: [9/10] Elapsed 0m 7s (remain 0m 0s) 
Epoch: [4][0/75] Elapsed 0m 1s (remain 1m 49s) 
Epoch: [4][50/75] Elapsed 1m 9s (remain 0m 32s) 
Epoch: [4][74/75] Elapsed 1m 39s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 0s (remain 0m 8s) 


Epoch 4 - avg_train_loss: 0.3266  avg_val_loss: 0.6957  time: 107s
Epoch 4 - Score: 0.6870
Epoch 4 - Save Best Score: 0.6870 Model


EVAL: [9/10] Elapsed 0m 7s (remain 0m 0s) 
Epoch: [5][0/75] Elapsed 0m 1s (remain 1m 56s) 
Epoch: [5][50/75] Elapsed 1m 13s (remain 0m 34s) 
Epoch: [5][74/75] Elapsed 1m 43s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 0s (remain 0m 8s) 


Epoch 5 - avg_train_loss: 0.2499  avg_val_loss: 0.6831  time: 111s
Epoch 5 - Score: 0.6836


EVAL: [9/10] Elapsed 0m 7s (remain 0m 0s) 


Score: 0.6870
-------------fold:1 training-------------


0.6870140706824196


Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.bias', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/75] Elapsed 0m 2s (remain 3m 19s) 
Epoch: [1][50/75] Elapsed 1m 10s (remain 0m 33s) 
Epoch: [1][74/75] Elapsed 1m 39s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 9s) 


Epoch 1 - avg_train_loss: 1.0234  avg_val_loss: 0.7183  time: 107s
Epoch 1 - Score: 0.5396
Epoch 1 - Save Best Score: 0.5396 Model


EVAL: [9/10] Elapsed 0m 6s (remain 0m 0s) 
Epoch: [2][0/75] Elapsed 0m 3s (remain 3m 47s) 
Epoch: [2][50/75] Elapsed 1m 8s (remain 0m 32s) 
Epoch: [2][74/75] Elapsed 1m 42s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 9s) 


Epoch 2 - avg_train_loss: 0.6774  avg_val_loss: 0.6652  time: 110s
Epoch 2 - Score: 0.6722
Epoch 2 - Save Best Score: 0.6722 Model


EVAL: [9/10] Elapsed 0m 6s (remain 0m 0s) 
Epoch: [3][0/75] Elapsed 0m 1s (remain 1m 24s) 
Epoch: [3][50/75] Elapsed 1m 13s (remain 0m 34s) 
Epoch: [3][74/75] Elapsed 1m 42s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 9s) 


Epoch 3 - avg_train_loss: 0.4896  avg_val_loss: 0.6096  time: 110s
Epoch 3 - Score: 0.6757
Epoch 3 - Save Best Score: 0.6757 Model


EVAL: [9/10] Elapsed 0m 6s (remain 0m 0s) 
Epoch: [4][0/75] Elapsed 0m 1s (remain 1m 15s) 
Epoch: [4][50/75] Elapsed 1m 6s (remain 0m 31s) 
Epoch: [4][74/75] Elapsed 1m 43s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 9s) 


Epoch 4 - avg_train_loss: 0.3563  avg_val_loss: 0.5684  time: 111s
Epoch 4 - Score: 0.7439
Epoch 4 - Save Best Score: 0.7439 Model


EVAL: [9/10] Elapsed 0m 6s (remain 0m 0s) 
Epoch: [5][0/75] Elapsed 0m 0s (remain 1m 11s) 
Epoch: [5][50/75] Elapsed 1m 6s (remain 0m 31s) 
Epoch: [5][74/75] Elapsed 1m 42s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 9s) 


Epoch 5 - avg_train_loss: 0.2701  avg_val_loss: 0.5713  time: 110s
Epoch 5 - Score: 0.7574
Epoch 5 - Save Best Score: 0.7574 Model


EVAL: [9/10] Elapsed 0m 6s (remain 0m 0s) 


Score: 0.7574
-------------fold:2 training-------------


0.757379083891724


Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.bias', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/75] Elapsed 0m 0s (remain 1m 7s) 
Epoch: [1][50/75] Elapsed 1m 1s (remain 0m 29s) 
Epoch: [1][74/75] Elapsed 1m 34s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 0s (remain 0m 7s) 


Epoch 1 - avg_train_loss: 1.0619  avg_val_loss: 0.7796  time: 103s
Epoch 1 - Score: 0.4981
Epoch 1 - Save Best Score: 0.4981 Model


EVAL: [9/10] Elapsed 0m 8s (remain 0m 0s) 
Epoch: [2][0/75] Elapsed 0m 0s (remain 1m 7s) 
Epoch: [2][50/75] Elapsed 1m 3s (remain 0m 29s) 
Epoch: [2][74/75] Elapsed 1m 37s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 0s (remain 0m 7s) 


Epoch 2 - avg_train_loss: 0.6943  avg_val_loss: 0.5563  time: 107s
Epoch 2 - Score: 0.6590
Epoch 2 - Save Best Score: 0.6590 Model


EVAL: [9/10] Elapsed 0m 8s (remain 0m 0s) 
Epoch: [3][0/75] Elapsed 0m 1s (remain 1m 35s) 
Epoch: [3][50/75] Elapsed 1m 5s (remain 0m 30s) 
Epoch: [3][74/75] Elapsed 1m 38s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 0s (remain 0m 7s) 


Epoch 3 - avg_train_loss: 0.5145  avg_val_loss: 0.5474  time: 107s
Epoch 3 - Score: 0.6798
Epoch 3 - Save Best Score: 0.6798 Model


EVAL: [9/10] Elapsed 0m 8s (remain 0m 0s) 
Epoch: [4][0/75] Elapsed 0m 1s (remain 1m 16s) 
Epoch: [4][50/75] Elapsed 1m 8s (remain 0m 32s) 
Epoch: [4][74/75] Elapsed 1m 37s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 0s (remain 0m 7s) 


Epoch 4 - avg_train_loss: 0.3743  avg_val_loss: 0.5177  time: 106s
Epoch 4 - Score: 0.7564
Epoch 4 - Save Best Score: 0.7564 Model


EVAL: [9/10] Elapsed 0m 8s (remain 0m 0s) 
Epoch: [5][0/75] Elapsed 0m 1s (remain 1m 43s) 
Epoch: [5][50/75] Elapsed 1m 6s (remain 0m 31s) 
Epoch: [5][74/75] Elapsed 1m 40s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 0s (remain 0m 6s) 


Epoch 5 - avg_train_loss: 0.3035  avg_val_loss: 0.5229  time: 110s
Epoch 5 - Score: 0.7436


EVAL: [9/10] Elapsed 0m 8s (remain 0m 0s) 


Score: 0.7564
-------------fold:3 training-------------


0.7563562433161481


Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.bias', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/75] Elapsed 0m 1s (remain 1m 32s) 
Epoch: [1][50/75] Elapsed 1m 4s (remain 0m 30s) 
Epoch: [1][74/75] Elapsed 1m 37s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 11s) 


Epoch 1 - avg_train_loss: 1.0437  avg_val_loss: 0.7196  time: 107s
Epoch 1 - Score: 0.5516
Epoch 1 - Save Best Score: 0.5516 Model


EVAL: [9/10] Elapsed 0m 9s (remain 0m 0s) 
Epoch: [2][0/75] Elapsed 0m 1s (remain 1m 56s) 
Epoch: [2][50/75] Elapsed 1m 5s (remain 0m 30s) 
Epoch: [2][74/75] Elapsed 1m 37s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 11s) 


Epoch 2 - avg_train_loss: 0.6517  avg_val_loss: 0.6945  time: 107s
Epoch 2 - Score: 0.5760
Epoch 2 - Save Best Score: 0.5760 Model


EVAL: [9/10] Elapsed 0m 9s (remain 0m 0s) 
Epoch: [3][0/75] Elapsed 0m 1s (remain 1m 20s) 
Epoch: [3][50/75] Elapsed 1m 4s (remain 0m 30s) 
Epoch: [3][74/75] Elapsed 1m 36s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 11s) 


Epoch 3 - avg_train_loss: 0.5024  avg_val_loss: 0.6459  time: 106s
Epoch 3 - Score: 0.6529
Epoch 3 - Save Best Score: 0.6529 Model


EVAL: [9/10] Elapsed 0m 9s (remain 0m 0s) 
Epoch: [4][0/75] Elapsed 0m 0s (remain 1m 12s) 
Epoch: [4][50/75] Elapsed 1m 5s (remain 0m 30s) 
Epoch: [4][74/75] Elapsed 1m 37s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 11s) 


Epoch 4 - avg_train_loss: 0.3357  avg_val_loss: 0.6669  time: 107s
Epoch 4 - Score: 0.6785
Epoch 4 - Save Best Score: 0.6785 Model


EVAL: [9/10] Elapsed 0m 9s (remain 0m 0s) 
Epoch: [5][0/75] Elapsed 0m 1s (remain 1m 22s) 
Epoch: [5][50/75] Elapsed 1m 7s (remain 0m 31s) 
Epoch: [5][74/75] Elapsed 1m 40s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 1s (remain 0m 11s) 


Epoch 5 - avg_train_loss: 0.2513  avg_val_loss: 0.6617  time: 110s
Epoch 5 - Score: 0.6684


EVAL: [9/10] Elapsed 0m 9s (remain 0m 0s) 


Score: 0.6785
-------------fold:4 training-------------


0.678541937410931


Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.bias', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/75] Elapsed 0m 1s (remain 1m 22s) 
Epoch: [1][50/75] Elapsed 1m 9s (remain 0m 32s) 
Epoch: [1][74/75] Elapsed 1m 37s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 0s (remain 0m 7s) 


Epoch 1 - avg_train_loss: 1.0890  avg_val_loss: 0.8284  time: 107s
Epoch 1 - Score: 0.4989
Epoch 1 - Save Best Score: 0.4989 Model


EVAL: [9/10] Elapsed 0m 8s (remain 0m 0s) 
Epoch: [2][0/75] Elapsed 0m 1s (remain 1m 53s) 
Epoch: [2][50/75] Elapsed 1m 7s (remain 0m 31s) 
Epoch: [2][74/75] Elapsed 1m 39s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 0s (remain 0m 6s) 


Epoch 2 - avg_train_loss: 0.6909  avg_val_loss: 0.6554  time: 109s
Epoch 2 - Score: 0.5795
Epoch 2 - Save Best Score: 0.5795 Model


EVAL: [9/10] Elapsed 0m 8s (remain 0m 0s) 
Epoch: [3][0/75] Elapsed 0m 0s (remain 1m 5s) 
Epoch: [3][50/75] Elapsed 1m 3s (remain 0m 29s) 
Epoch: [3][74/75] Elapsed 1m 34s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 0s (remain 0m 7s) 


Epoch 3 - avg_train_loss: 0.5062  avg_val_loss: 0.6021  time: 104s
Epoch 3 - Score: 0.6848
Epoch 3 - Save Best Score: 0.6848 Model


EVAL: [9/10] Elapsed 0m 8s (remain 0m 0s) 
Epoch: [4][0/75] Elapsed 0m 1s (remain 1m 48s) 
Epoch: [4][50/75] Elapsed 1m 10s (remain 0m 33s) 
Epoch: [4][74/75] Elapsed 1m 39s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 0s (remain 0m 7s) 


Epoch 4 - avg_train_loss: 0.3566  avg_val_loss: 0.6406  time: 109s
Epoch 4 - Score: 0.6854
Epoch 4 - Save Best Score: 0.6854 Model


EVAL: [9/10] Elapsed 0m 8s (remain 0m 0s) 
Epoch: [5][0/75] Elapsed 0m 0s (remain 1m 13s) 
Epoch: [5][50/75] Elapsed 1m 7s (remain 0m 31s) 
Epoch: [5][74/75] Elapsed 1m 40s (remain 0m 0s) 
EVAL: [0/10] Elapsed 0m 0s (remain 0m 7s) 


Epoch 5 - avg_train_loss: 0.2719  avg_val_loss: 0.6414  time: 109s
Epoch 5 - Score: 0.6971
Epoch 5 - Save Best Score: 0.6971 Model


EVAL: [9/10] Elapsed 0m 8s (remain 0m 0s) 


Score: 0.6971
Score: 0.7159


0.6970886123622193


In [21]:
# Reload the saved `oof_df.csv` from the model output directory and preview it.
# Presumably these are the out-of-fold predictions (the preview has a `kfold`
# column plus one score column per class) -- TODO confirm against the training cell.
# NOTE(review): the preview also shows an 'Unnamed: 0' column, which suggests the
# CSV was written together with its index; confirm before relying on that column.
oof_csv_path = OUTPUT_MODEL_DIR + 'oof_df.csv'
A = pd.read_csv(oof_csv_path)
A.head(n=5)

Unnamed: 0,id,description,label,inputs,kfold,Consultant,Data scientist,Machine learning engineer,Software engineer
0,1,"Designs and develops high quality, scalable a...",3,"Designs and develops high quality, scalable a...",0,-0.237893,-1.248257,-2.090317,2.456362
1,5,Participates in standard business and technica...,3,Participates in standard business and technica...,0,1.741166,-0.316498,-2.358713,0.427342
2,7,"Facilitate pre-sales initiatives, such as live...",0,"Facilitate pre-sales initiatives, such as live...",0,5.083238,-0.571194,-2.836229,-1.561467
3,9,Maintain and improve existing predictive model...,1,Maintain and improve existing predictive model...,0,-0.501314,3.869465,-1.112253,-2.527882
4,13,"Research, prototype, identify, and build predi...",2,"Research, prototype, identify, and build predi...",0,-1.785935,2.116324,2.617582,-1.317208
