In [None]:
!nvidia-smi

Thu Jun 16 04:18:36 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0    23W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!pip install transformers==4.19.1
!pip install sentencepiece==0.1.96

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==4.19.1
  Downloading transformers-4.19.1-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 13.7 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 5.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 67.6 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 55.7 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstal

In [None]:
# ----------------------------------------------
# Load Libraries
# ----------------------------------------------
import pathlib
from pathlib import Path
import sys
import os
import math
import random
import time

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn import functional as F

from transformers import AdamW
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import BertForSequenceClassification, BertConfig, BertModel
from transformers import get_cosine_schedule_with_warmup

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import train_test_split

from tqdm import tqdm

import gc
gc.enable()

# Config

In [None]:
class CFG:
    exp_id = 'exp124'
    input_path = 'input/'
    cpc_path = 'input/'
    model_path = 'microsoft/deberta-v3-large'
    out_base = 'output'
    out_path = f'{out_base}/{exp_id}'
    model_save_path = f'{out_path}/models'
    
    debug = False
    fold1_only = False
    upload_dataset = True
    debug_size = 100
    log_interval = 1822 # 未使用
    seed = 8
    max_len = 92
    learning_rate = 2e-5
    weight_decay = 0.01
    lr_decay = 0.98
    num_classes = 5
    num_fold = 4
    epochs = 5
    train_batch_size = 16
    valid_batch_size = 16
    gradient_accumulation_step = 1
    device = "cuda" if torch.cuda.is_available() else "cpu"

    if debug:
      epochs = 1
      upload_dataset = False

    if fold1_only:
      upload_dataset = False

In [None]:
!mkdir -p {CFG.out_path}
!mkdir -p {CFG.model_save_path}

# Preproc

In [None]:
# ----------------------------------------------
# Set SEED
# ----------------------------------------------
# seed
SEED = CFG.seed
def set_seed(SEED):
    random.seed(SEED)
    np.random.seed(SEED)
    os.environ['PYTHONHASHSEED'] = str(SEED)
    
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.deterministic = True
    
set_seed(SEED)

In [None]:
train_df = pd.read_csv(f"{CFG.input_path}train.csv")
# https://www.kaggle.com/code/gauravbrills/folds-dump-the-two-paths-fix
cpc_texts = torch.load(CFG.cpc_path+"cpc_texts_fixed.pth")
train_df['context_text'] = train_df['context'].map(cpc_texts)
train_df['context_text'] = "[context=" + train_df['context'] + ']' + train_df['context_text']
train_df['context_text'] = "[subgrp=" + train_df['context'].map(lambda x: x[:1]) + ']' + train_df['context_text']
train_df.head()

Unnamed: 0,id,anchor,target,context,score,context_text
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,[subgrp=A][context=A47]HUMAN NECESSITIES. FURN...
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,[subgrp=A][context=A47]HUMAN NECESSITIES. FURN...
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,[subgrp=A][context=A47]HUMAN NECESSITIES. FURN...
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,[subgrp=A][context=A47]HUMAN NECESSITIES. FURN...
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,[subgrp=A][context=A47]HUMAN NECESSITIES. FURN...


In [None]:
!pip install -q iterative-stratification
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

dfx = pd.get_dummies(train_df, columns=["score"]).groupby(["anchor"], as_index=False).sum()
cols = [c for c in dfx.columns if c.startswith("score_") or c == "anchor"]
dfx = dfx[cols]

mskf = MultilabelStratifiedKFold(n_splits=CFG.num_fold, shuffle=True, random_state=42)
labels = [c for c in dfx.columns if c != "anchor"]
dfx_labels = dfx[labels]
dfx["fold"] = -1

for fold, (trn_, val_) in enumerate(mskf.split(dfx, dfx_labels)):
    print(len(trn_), len(val_))
    dfx.loc[val_, "fold"] = fold

train_df = train_df.merge(dfx[["anchor", "fold"]], on="anchor", how="left")

550 183
549 184
550 183
550 183


In [None]:
train_df['fold'].value_counts().sort_index()

0    9379
1    8860
2    8612
3    9622
Name: fold, dtype: int64

In [None]:
print(train_df.shape)
train_df.head()

(36473, 7)


Unnamed: 0,id,anchor,target,context,score,context_text,fold
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,[subgrp=A][context=A47]HUMAN NECESSITIES. FURN...,0
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,[subgrp=A][context=A47]HUMAN NECESSITIES. FURN...,0
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,[subgrp=A][context=A47]HUMAN NECESSITIES. FURN...,0
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,[subgrp=A][context=A47]HUMAN NECESSITIES. FURN...,0
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,[subgrp=A][context=A47]HUMAN NECESSITIES. FURN...,0


In [None]:
train_df['input'] = train_df['anchor'] + '[SEP]' + train_df['target'] + '[SEP]' + train_df['context_text']

In [None]:
train_df = pd.concat([train_df, pd.get_dummies(train_df['score'])], axis='columns')

In [None]:
train_df[[0.0,0.25,0.5,0.75,1.0]].values

array([[0, 0, 1, 0, 0],
       [0, 0, 0, 1, 0],
       [0, 1, 0, 0, 0],
       ...,
       [0, 0, 1, 0, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 1, 0, 0]], dtype=uint8)

In [None]:
train_df.head()

Unnamed: 0,id,anchor,target,context,score,context_text,fold,input,0.0,0.25,0.5,0.75,1.0
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,[subgrp=A][context=A47]HUMAN NECESSITIES. FURN...,0,abatement[SEP]abatement of pollution[SEP][subg...,0,0,1,0,0
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,[subgrp=A][context=A47]HUMAN NECESSITIES. FURN...,0,abatement[SEP]act of abating[SEP][subgrp=A][co...,0,0,0,1,0
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,[subgrp=A][context=A47]HUMAN NECESSITIES. FURN...,0,abatement[SEP]active catalyst[SEP][subgrp=A][c...,0,1,0,0,0
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,[subgrp=A][context=A47]HUMAN NECESSITIES. FURN...,0,abatement[SEP]eliminating process[SEP][subgrp=...,0,0,1,0,0
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,[subgrp=A][context=A47]HUMAN NECESSITIES. FURN...,0,abatement[SEP]forest region[SEP][subgrp=A][con...,1,0,0,0,0


# Tokenizer

In [None]:
subgrp = train_df['context'].map(lambda x: x[:1]).unique()

tokenizer = AutoTokenizer.from_pretrained(CFG.model_path)
tokenizer.add_tokens([f"[context={i}]" for i in cpc_texts.keys()])
tokenizer.add_tokens([f"[subgrp={i}]" for i in subgrp])
len(tokenizer)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


128145

In [None]:
# トークン長分布の確認
"""
token_len = train_df['input'].map(lambda x: tokenizer(x)['input_ids'].__len__())
display(token_len.describe())
token_len.hist(bins=100)
"""

"\ntoken_len = train_df['input'].map(lambda x: tokenizer(x)['input_ids'].__len__())\ndisplay(token_len.describe())\ntoken_len.hist(bins=100)\n"

# Dataset

In [None]:
def prepare_input(tokenizer, text):
    inputs = tokenizer(text,
                           add_special_tokens=True,
                           max_length=CFG.max_len,
                           padding="max_length",
                           truncation=True,
                           return_offsets_mapping=False)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, df):
        self.inputs = df['input'].values
        #self.label = df['score'].values
        self.label = df[[0.0,0.25,0.5,0.75,1.0]].values

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, item):
        inputs = self.inputs[item]
        label = self.label[item]
        outputs = prepare_input(tokenizer, inputs)
        outputs['label'] = torch.tensor(label, dtype=torch.float32)

        return outputs

# Train

In [None]:
# ----------------------------------------------
# Model
# ----------------------------------------------
class AttentionHead(nn.Module):
    def __init__(self, in_features, hidden_dim, num_targets):
        super().__init__()
        self.in_features = in_features
        self.middle_features = hidden_dim
        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)
        self.out_features = hidden_dim

    def forward(self, features):
        att = torch.tanh(self.W(features))
        score = self.V(att)
        attention_weights = torch.softmax(score, dim=1)
        context_vector = attention_weights * features
        context_vector = torch.sum(context_vector, dim=1)

        return context_vector

class PPPMModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.config = AutoConfig.from_pretrained(CFG.model_path)
        self.pre_model = AutoModel.from_pretrained(CFG.model_path, config=self.config)
        self.pre_model.resize_token_embeddings(len(tokenizer))
        self.head = AttentionHead(self.config.hidden_size, self.config.hidden_size,1)
        self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.2)
        self.dropout3 = nn.Dropout(0.3)
        self.dropout4 = nn.Dropout(0.4)
        self.dropout5 = nn.Dropout(0.5)
        self.regressor = nn.Linear(self.config.hidden_size, CFG.num_classes)
    
    def forward(self, inputs):
        pre_out = self.pre_model(**inputs)
        last_hidden_states = pre_out[0]
        last_hidden_states = self.dropout(self.head(last_hidden_states))
        logits1 = self.regressor(self.dropout1(last_hidden_states))
        logits2 = self.regressor(self.dropout2(last_hidden_states))
        logits3 = self.regressor(self.dropout3(last_hidden_states))
        logits4 = self.regressor(self.dropout4(last_hidden_states))
        logits5 = self.regressor(self.dropout5(last_hidden_states))
        logits = (logits1 + logits2 + logits3 + logits4 + logits5) / 5
        return logits

In [None]:
# ----------------------------------------------
# func: valid, predict
# ----------------------------------------------
def predict(model, dataloader):
    model.eval()
    result = np.zeros((len(dataloader.dataset), CFG.num_classes))
    idx = 0
    
    with torch.no_grad():
        for batch_idx, data in enumerate(dataloader):
          inputs = {}
          for k, v in data.items():
            if k != 'label':
              inputs[k] = v.to(CFG.device)            
          output = model(inputs)
          output = nn.Softmax(dim=1)(output)
          result[idx:idx + output.shape[0], :] = output.to('cpu')
          idx += output.shape[0]
            
    return result


def valid_func(model, dataloader):
    model.eval()
    result = np.zeros((len(dataloader.dataset), CFG.num_classes))
    idx = 0
    loss_sum = 0
    bar = tqdm(dataloader, total=len(dataloader))
    
    with torch.no_grad():
        for batch_idx, data in enumerate(bar):
          inputs = {}
          for k, v in data.items():
            if k != 'label':
              inputs[k] = v.to(CFG.device)            
          label = data['label'].to(CFG.device)          
          output = model(inputs)
          output = nn.Softmax(dim=1)(output)
          result[idx:idx + output.shape[0], :] = output.to('cpu')
          idx += output.shape[0]

          loss_sum += nn.CrossEntropyLoss(reduction='sum')(output, label).item()
            
    return loss_sum/(len(dataloader.dataset)), result.reshape((len(dataloader.dataset), CFG.num_classes))

def label_to_score(label):
    return (label*[0,0.25,0.5,0.75,1.0]).sum(axis=1)

def metric_pearson(predictions, labels):
    pred_score = label_to_score(predictions)
    label_score = label_to_score(labels)
    pearson = np.corrcoef(pred_score, label_score)[0][1]       
    return pearson

In [None]:
"""preds = predict(model, valid_loader)
label_to_score(valid_loader.dataset.label)
metric_pearson(preds, valid_loader.dataset.label)"""

'preds = predict(model, valid_loader)\nlabel_to_score(valid_loader.dataset.label)\nmetric_pearson(preds, valid_loader.dataset.label)'

In [None]:
# ----------------------------------------------
# func: train
# ----------------------------------------------
def train_fn(
    model,
    save_path,
    train_loader,
    val_loader,
    optimizer,
    scheduler=None,
    num_epochs=CFG.epochs
):

    best_score = 0
    best_epoch = 0
    running_loss = 0.0
    dataset_size = 0
    log_interval = CFG.log_interval
    oof_preds = None

    start = time.time()

    for epoch in range(num_epochs):
        val_score = None
        model.train()
        bar = tqdm(train_loader, total=len(train_loader))
        for batch_idx, data in enumerate(bar):
          inputs = {}
          for k, v in data.items():
            if k != 'label':
              inputs[k] = v.to(CFG.device)            
          label = data['label'].to(CFG.device)
          batch_size = label.size(0)

          output = model(inputs)
          loss = nn.CrossEntropyLoss()(output, label)
          loss = loss / CFG.gradient_accumulation_step

          loss.backward()

          if (batch_idx + 1) % CFG.gradient_accumulation_step == 0:
            optimizer.step()
            optimizer.zero_grad()
            if scheduler:
                scheduler.step()

          if CFG.debug == True:
            if (batch_idx > 0) & (batch_idx % CFG.debug_size == 0):
                break

          running_loss += (loss.item() * batch_size)
          dataset_size += batch_size            
          total_loss = running_loss / dataset_size
          bar.set_postfix(Epoch=epoch, Loss=loss.item(), TotalLoss=total_loss, LR=optimizer.param_groups[0]['lr'])

        val_start = time.time()
        val_score, predictions = valid_func(model, val_loader)
        pearson = metric_pearson(predictions, val_loader.dataset.label)
        print(f"Epoch {epoch+1}, Step {batch_idx+1}, train_loss: {loss:0.5f}, val_loss: {val_score:0.5f}, pearson: {pearson:0.5f}")
        if pearson > best_score:
            print(f"Model Inproved: {best_score} ----> {pearson}")
            best_score = pearson
            oof_preds = predictions
            torch.save(model.state_dict(), save_path)
        print(f"validation elasped time: {time.time() - val_start: 0.3}")

    print(f"total elasped time: {time.time() - start: 0.3}")
    start = time.time()

    return best_score, oof_preds

# ----------------------------------------------
# create optimizer
# ----------------------------------------------
def create_optimizer(model):
    named_params = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optim_params = []
    for idx_, (name_, params_) in enumerate(named_params):
        weight_decay = 0 if name_ in no_decay else 0.01
        optim_params.append({'params':params_,
                            'weight_decay': weight_decay,
                            })

    return AdamW(optim_params)


# https://www.ai-shift.co.jp/techblog/2145
def create_optimizer_grouped_parameters(model):
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters()
                       if 'lstm' in n
                       or 'cnn' in n
                       or 'regressor' in n],
            "weight_decay": 0.0,
            "lr": 1e-3,
        },
    ]
    num_layers = model.config.num_hidden_layers
    layers = [getattr(model, 'pre_model').embeddings] + list(getattr(model, 'pre_model').encoder.layer)
    layers.reverse()
    lr = CFG.learning_rate
    for layer in layers:
        lr *= CFG.lr_decay
        optimizer_grouped_parameters += [
            {
                "params": [p for n, p in layer.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": CFG.weight_decay,
                "lr": lr,
            },
            {
                "params": [p for n, p in layer.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
                "lr": lr,
            },
        ]
    return AdamW(optimizer_grouped_parameters)


In [None]:
# ----------------------------------------------
# Main Loop
# ----------------------------------------------
val_scores = []
oof_df = pd.DataFrame()

for fold in range(CFG.num_fold): 
    print(f"*** FOLD {fold+1} / {CFG.num_fold}***")

    save_path = f"{CFG.model_save_path}/model_{fold+1}.pth"

    train_data = train_df[train_df['fold'] != fold]
    valid_data = train_df[train_df['fold'] == fold]
    train_set = TrainDataset(train_data)
    valid_set = TrainDataset(valid_data)

    train_loader = DataLoader(train_set,
                            batch_size=CFG.train_batch_size,
                            shuffle=True,
                            drop_last=True,
                            num_workers=2,
                            pin_memory=True)
    valid_loader = DataLoader(valid_set,
                            batch_size=CFG.valid_batch_size,
                            shuffle=False,
                            drop_last=False,
                            num_workers=2,
                            pin_memory=True)

    model = PPPMModel().to(CFG.device)
    optimizer = create_optimizer_grouped_parameters(model)
    #optimizer = AdamW(model.parameters(), lr=CFG.learning_rate)
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_training_steps=CFG.epochs*len(train_loader),
        num_warmup_steps=100
    )

    val_score, val_preds = train_fn(model, save_path, train_loader, valid_loader, optimizer, scheduler=scheduler)
    val_scores.append(val_score)
    val_preds = label_to_score(val_preds)
    valid_data['preds'] = val_preds
    oof_df = pd.concat([oof_df, valid_data])

    del model
    torch.cuda.empty_cache()

    print(val_scores)
    print("Mean:", np.array(val_scores).mean())
    if CFG.fold1_only == True:
        if fold == 0:
            break

*** FOLD 1 / 4***


Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.dense.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 169

Epoch 1, Step 1693, train_loss: 0.90286, val_loss: 1.25005, pearson: 0.83281
Model Inproved: 0 ----> 0.8328123289401977
validation elasped time:  32.8


100%|██████████| 1693/1693 [05:06<00:00,  5.53it/s, Epoch=1, LR=0.000665, Loss=0.415, TotalLoss=0.712]
100%|██████████| 587/587 [00:28<00:00, 20.70it/s]


Epoch 2, Step 1693, train_loss: 0.41523, val_loss: 1.22217, pearson: 0.82621
validation elasped time:  28.4


100%|██████████| 1693/1693 [05:05<00:00,  5.54it/s, Epoch=2, LR=0.000353, Loss=0.25, TotalLoss=0.602]
100%|██████████| 587/587 [00:27<00:00, 21.05it/s]


Epoch 3, Step 1693, train_loss: 0.24956, val_loss: 1.19838, pearson: 0.83282
Model Inproved: 0.8328123289401977 ----> 0.8328216475590209
validation elasped time:  37.8


100%|██████████| 1693/1693 [05:03<00:00,  5.57it/s, Epoch=3, LR=9.77e-5, Loss=0.201, TotalLoss=0.512]
100%|██████████| 587/587 [00:28<00:00, 20.94it/s]


Epoch 4, Step 1693, train_loss: 0.20072, val_loss: 1.19336, pearson: 0.83001
validation elasped time:  28.0


100%|██████████| 1693/1693 [05:05<00:00,  5.54it/s, Epoch=4, LR=0, Loss=0.0647, TotalLoss=0.441]
100%|██████████| 587/587 [00:28<00:00, 20.94it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Epoch 5, Step 1693, train_loss: 0.06471, val_loss: 1.19216, pearson: 0.82594
validation elasped time:  28.0
total elasped time:  1.68e+03
[0.8328216475590209]
Mean: 0.8328216475590209
*** FOLD 2 / 4***


Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.dense.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 172

Epoch 1, Step 1725, train_loss: 0.44879, val_loss: 1.25548, pearson: 0.79829
Model Inproved: 0 ----> 0.7982900276735344
validation elasped time:  53.2


100%|██████████| 1725/1725 [05:12<00:00,  5.51it/s, Epoch=1, LR=0.000665, Loss=0.44, TotalLoss=0.702]
100%|██████████| 554/554 [00:26<00:00, 20.76it/s]


Epoch 2, Step 1725, train_loss: 0.43951, val_loss: 1.23086, pearson: 0.82321
Model Inproved: 0.7982900276735344 ----> 0.8232072468655748
validation elasped time:  31.5


100%|██████████| 1725/1725 [05:12<00:00,  5.52it/s, Epoch=2, LR=0.000353, Loss=0.584, TotalLoss=0.588]
100%|██████████| 554/554 [00:26<00:00, 20.52it/s]


Epoch 3, Step 1725, train_loss: 0.58444, val_loss: 1.20849, pearson: 0.81789
validation elasped time:  27.0


100%|██████████| 1725/1725 [05:12<00:00,  5.52it/s, Epoch=3, LR=9.77e-5, Loss=0.269, TotalLoss=0.496]
100%|██████████| 554/554 [00:26<00:00, 20.85it/s]


Epoch 4, Step 1725, train_loss: 0.26851, val_loss: 1.19852, pearson: 0.81368
validation elasped time:  26.6


100%|██████████| 1725/1725 [05:13<00:00,  5.51it/s, Epoch=4, LR=0, Loss=0.198, TotalLoss=0.426]
100%|██████████| 554/554 [00:27<00:00, 20.47it/s]


Epoch 5, Step 1725, train_loss: 0.19784, val_loss: 1.19838, pearson: 0.81062
validation elasped time:  27.1
total elasped time:  1.73e+03
[0.8328216475590209, 0.8232072468655748]
Mean: 0.8280144472122979
*** FOLD 3 / 4***


Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.dense.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 174

Epoch 1, Step 1741, train_loss: 0.52463, val_loss: 1.22175, pearson: 0.83363
Model Inproved: 0 ----> 0.8336283202210487
validation elasped time:  51.1


100%|██████████| 1741/1741 [05:17<00:00,  5.49it/s, Epoch=1, LR=0.000665, Loss=0.483, TotalLoss=0.712]
100%|██████████| 539/539 [00:26<00:00, 20.56it/s]


Epoch 2, Step 1741, train_loss: 0.48332, val_loss: 1.21119, pearson: 0.84048
Model Inproved: 0.8336283202210487 ----> 0.8404750439073577
validation elasped time:  31.1


100%|██████████| 1741/1741 [05:15<00:00,  5.52it/s, Epoch=2, LR=0.000352, Loss=0.751, TotalLoss=0.602]
100%|██████████| 539/539 [00:25<00:00, 20.86it/s]


Epoch 3, Step 1741, train_loss: 0.75134, val_loss: 1.19163, pearson: 0.83959
validation elasped time:  25.9


100%|██████████| 1741/1741 [05:15<00:00,  5.53it/s, Epoch=3, LR=9.76e-5, Loss=0.299, TotalLoss=0.513]
100%|██████████| 539/539 [00:25<00:00, 20.79it/s]


Epoch 4, Step 1741, train_loss: 0.29881, val_loss: 1.18840, pearson: 0.83491
validation elasped time:  25.9


100%|██████████| 1741/1741 [05:15<00:00,  5.52it/s, Epoch=4, LR=0, Loss=0.332, TotalLoss=0.442]
100%|██████████| 539/539 [00:25<00:00, 20.95it/s]


Epoch 5, Step 1741, train_loss: 0.33156, val_loss: 1.18976, pearson: 0.83253
validation elasped time:  25.7
total elasped time:  1.74e+03
[0.8328216475590209, 0.8232072468655748, 0.8404750439073577]
Mean: 0.8321679794439846
*** FOLD 4 / 4***


Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.dense.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 167

Epoch 1, Step 1678, train_loss: 0.48508, val_loss: 1.24741, pearson: 0.80113
Model Inproved: 0 ----> 0.8011305231470817
validation elasped time:  55.2


100%|██████████| 1678/1678 [05:03<00:00,  5.52it/s, Epoch=1, LR=0.000665, Loss=0.445, TotalLoss=0.702]
100%|██████████| 602/602 [00:28<00:00, 21.06it/s]


Epoch 2, Step 1678, train_loss: 0.44506, val_loss: 1.22871, pearson: 0.80417
Model Inproved: 0.8011305231470817 ----> 0.8041684020635531
validation elasped time:  33.1


100%|██████████| 1678/1678 [05:03<00:00,  5.52it/s, Epoch=2, LR=0.000353, Loss=0.435, TotalLoss=0.591]
100%|██████████| 602/602 [00:28<00:00, 20.85it/s]


Epoch 3, Step 1678, train_loss: 0.43508, val_loss: 1.21518, pearson: 0.80492
Model Inproved: 0.8041684020635531 ----> 0.804920056408412
validation elasped time:  33.8


100%|██████████| 1678/1678 [05:03<00:00,  5.53it/s, Epoch=3, LR=9.77e-5, Loss=0.143, TotalLoss=0.5]
100%|██████████| 602/602 [00:29<00:00, 20.74it/s]


Epoch 4, Step 1678, train_loss: 0.14349, val_loss: 1.21321, pearson: 0.79969
validation elasped time:  29.0


100%|██████████| 1678/1678 [05:09<00:00,  5.41it/s, Epoch=4, LR=0, Loss=0.0451, TotalLoss=0.429]
100%|██████████| 602/602 [00:28<00:00, 20.78it/s]

Epoch 5, Step 1678, train_loss: 0.04515, val_loss: 1.21186, pearson: 0.79644
validation elasped time:  29.0
total elasped time:  1.7e+03
[0.8328216475590209, 0.8232072468655748, 0.8404750439073577, 0.804920056408412]
Mean: 0.8253559986850914





In [None]:
"""for batch_idx, data in enumerate(valid_loader):
  print(batch_idx)
"""

'for batch_idx, data in enumerate(valid_loader):\n  print(batch_idx)\n'

In [None]:
scores = {f'fold{i}':j for i,j in enumerate(val_scores)}
scores['oof'] = np.corrcoef(oof_df['preds'], oof_df['score'])[0][1]
scores = pd.Series(scores)
print(scores)

fold0    0.832822
fold1    0.823207
fold2    0.840475
fold3    0.804920
oof      0.824622
dtype: float64


In [None]:
scores.to_csv(f'{CFG.out_path}/scores.csv')
oof_df.to_csv(f'{CFG.out_path}/oof_df.csv')