In [None]:
# Show which GPU, driver and CUDA version are attached to this runtime.
!nvidia-smi

Fri Jun  3 20:22:10 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   57C    P0    52W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Install the exact library versions this experiment was run with.
!pip install transformers==4.19.1
!pip install sentencepiece==0.1.96

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# ----------------------------------------------
# Load Libraries
# ----------------------------------------------
import pathlib
from pathlib import Path
import sys
import os
import math
import random
import time

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers import AdamW
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import BertForSequenceClassification, BertConfig, BertModel
from transformers import get_cosine_schedule_with_warmup

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import train_test_split

from tqdm import tqdm

import gc
gc.enable()

# Config

In [None]:
class CFG:
    """Experiment configuration: paths, run switches and training hyper-parameters.

    All values are plain class attributes and are read as ``CFG.<name>`` by the
    rest of the notebook.
    """
    # --- paths ---------------------------------------------------------
    exp_id = 'exp067'
    input_path = 'input/pppm/'      # prefix of train.csv
    cpc_path = 'cpc_texts/'         # prefix of cpc_texts_fixed.pth
    model_path = 'roberta-large'    # passed to AutoTokenizer / AutoConfig / AutoModel
    out_base = '/content/drive/MyDrive/Colab_Files/kaggle/pppm/output'
    out_path = f'{out_base}/{exp_id}'      # checkpoints, scores.csv and oof_df.csv go here
    scores_path = f'{out_path}/scores'     # directory is created, but nothing visible writes to it
    
    # --- run switches --------------------------------------------------
    debug = False            # train_fn stops each epoch early (see debug_size)
    fold1_only = False       # main loop stops after the first fold
    upload_dataset = True    # not read anywhere in the visible notebook
    debug_size = 100         # batch index at which a debug epoch is cut short
    log_interval = 1822 # currently unused (original author's note)
    # --- hyper-parameters ----------------------------------------------
    seed = 42
    max_len = 92             # tokenizer max_length (inputs are padded/truncated to it)
    learning_rate = 2e-5     # starting LR for the backbone layers
    weight_decay = 0.01
    lr_decay = 0.96          # LR is multiplied by this for each successive backbone layer
    num_classes = 1          # width of the regressor output
    num_fold = 4
    epochs = 5
    train_batch_size = 16
    valid_batch_size = 16
    gradient_accumulation_step = 1
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Class-body conditionals run once, when the class is defined, and
    # override the defaults above for quick runs.
    if debug:
      epochs = 1
      upload_dataset = False

    if fold1_only:
      upload_dataset = False

In [None]:
# Create the experiment output directories (a no-op when they already exist).
# os.makedirs is used instead of `!mkdir -p {…}` so that paths containing spaces
# or shell metacharacters are handled correctly and the cell is portable.
os.makedirs(CFG.out_path, exist_ok=True)
os.makedirs(CFG.scores_path, exist_ok=True)

# Preprocess

In [None]:
# ----------------------------------------------
# Set SEED
# ----------------------------------------------
# seed
SEED = CFG.seed
def set_seed(SEED):
    """Seed every random number generator the notebook relies on.

    Covers Python's ``random``, NumPy, the hash seed environment variable and
    PyTorch (CPU and all CUDA devices), and asks cuDNN for deterministic kernels.

    Args:
        SEED: integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(SEED)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(SEED)

    torch.backends.cudnn.deterministic = True
    
set_seed(SEED)

In [None]:
# Load the training pairs and attach a text description of each row's context code.
train_df = pd.read_csv(f"{CFG.input_path}train.csv")
# https://www.kaggle.com/code/gauravbrills/folds-dump-the-two-paths-fix
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code —
# only load cpc_texts_fixed.pth from a trusted source.
cpc_texts = torch.load(CFG.cpc_path+"cpc_texts_fixed.pth")
# cpc_texts is used as a lookup from the 'context' code to its text
# (presumably a dict; codes missing from it would become NaN — verify).
train_df['context_text'] = train_df['context'].map(cpc_texts)
display(train_df.head())

Unnamed: 0,id,anchor,target,context,score,context_text
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...


In [None]:
!pip install -q iterative-stratification
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

dfx = pd.get_dummies(train_df, columns=["score"]).groupby(["anchor"], as_index=False).sum()
cols = [c for c in dfx.columns if c.startswith("score_") or c == "anchor"]
dfx = dfx[cols]

mskf = MultilabelStratifiedKFold(n_splits=CFG.num_fold, shuffle=True, random_state=42)
labels = [c for c in dfx.columns if c != "anchor"]
dfx_labels = dfx[labels]
dfx["fold"] = -1

for fold, (trn_, val_) in enumerate(mskf.split(dfx, dfx_labels)):
    print(len(trn_), len(val_))
    dfx.loc[val_, "fold"] = fold

train_df = train_df.merge(dfx[["anchor", "fold"]], on="anchor", how="left")

550 183
549 184
550 183
550 183


In [None]:
# Number of training rows assigned to each fold.
train_df['fold'].value_counts().sort_index()

0    9379
1    8860
2    8612
3    9622
Name: fold, dtype: int64

In [None]:
# Sanity check: shape and first rows after the fold column has been merged in.
print(train_df.shape)
train_df.head()

(36473, 7)


Unnamed: 0,id,anchor,target,context,score,context_text,fold
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,0
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,0
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,0
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,0
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,0


In [None]:
"""train_df['con_grp'] = train_df['context'].map(lambda x: x[:1])
train_df.groupby('fold')['con_grp'].value_counts().sort_index()"""

"train_df['con_grp'] = train_df['context'].map(lambda x: x[:1])\ntrain_df.groupby('fold')['con_grp'].value_counts().sort_index()"

In [None]:
# Build the single text fed to the model: anchor, target and context description
# joined by the literal string '[SEP]'.
# NOTE(review): '[SEP]' is the BERT-style separator, while CFG.model_path is
# 'roberta-large', whose tokenizer presumably uses '</s>' and may split '[SEP]'
# into ordinary sub-word tokens — confirm this is intended (tokenizer.sep_token
# would be the model-agnostic choice, and inference must use the same format).
train_df['input'] = train_df['anchor'] + '[SEP]' + train_df['target'] + '[SEP]' + train_df['context_text']

In [None]:
# Preview the rows including the newly built 'input' column.
train_df.head()

Unnamed: 0,id,anchor,target,context,score,context_text,fold,input
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,0,abatement[SEP]abatement of pollution[SEP]HUMAN...
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,0,abatement[SEP]act of abating[SEP]HUMAN NECESSI...
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,0,abatement[SEP]active catalyst[SEP]HUMAN NECESS...
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,0,abatement[SEP]eliminating process[SEP]HUMAN NE...
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,0,abatement[SEP]forest region[SEP]HUMAN NECESSIT...


# Tokenizer

In [None]:
# Tokenizer matching the backbone; used as a module-level global by TrainDataset.
tokenizer = AutoTokenizer.from_pretrained(CFG.model_path)

In [None]:
# Check the token-length distribution of the inputs.
# (Disabled: the code is wrapped in a bare string literal, so it does not run.)
"""
token_len = train_df['input'].map(lambda x: tokenizer(x)['input_ids'].__len__())
display(token_len.describe())
token_len.hist(bins=100)
"""

"\ntoken_len = train_df['input'].map(lambda x: tokenizer(x)['input_ids'].__len__())\ndisplay(token_len.describe())\ntoken_len.hist(bins=100)\n"

# Dataset

In [None]:
def prepare_input(tokenizer, text):
    """Tokenize one text and convert every returned field to a ``torch.long`` tensor.

    The text is padded/truncated to ``CFG.max_len`` tokens with special tokens added.

    Args:
        tokenizer: callable tokenizer returning a mapping of field name -> list of ints.
        text: the input string.

    Returns:
        The tokenizer's own output object, with each value replaced by a tensor.
    """
    encoded = tokenizer(
        text,
        add_special_tokens=True,
        max_length=CFG.max_len,
        padding="max_length",
        truncation=True,
        return_offsets_mapping=False,
    )
    # Snapshot the keys first, then overwrite each entry in place.
    for field in list(encoded.keys()):
        encoded[field] = torch.tensor(encoded[field], dtype=torch.long)
    return encoded


class TrainDataset(Dataset):
    """Dataset yielding tokenized inputs together with their float ``label``.

    Reads the ``input`` and ``score`` columns of the given frame. Tokenization
    happens lazily per item, using the module-level ``tokenizer``.
    """

    def __init__(self, df):
        # Kept as arrays; ``label`` is also read from outside (metric computation).
        self.inputs = df['input'].values
        self.label = df['score'].values

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, item):
        sample = prepare_input(tokenizer, self.inputs[item])
        sample['label'] = torch.tensor(self.label[item], dtype=torch.float32)
        return sample

# Train

In [None]:
# ----------------------------------------------
# Model
# ----------------------------------------------
class AttentionHead(nn.Module):
    """Additive-attention pooling over dimension 1 of the input.

    A score is computed for every position as ``V(tanh(W(features)))``, the
    scores are soft-maxed along dimension 1, and the input features are summed
    with those weights.

    Args:
        in_features: size of the last dimension of the input features.
        hidden_dim: width of the intermediate projection ``W``.
        num_targets: unused; kept so existing call sites keep working.

    Attributes:
        out_features: size of the pooled vector. This equals ``in_features``
            because the output is a weighted sum of the *input* features
            (previously reported as ``hidden_dim``, which is only correct when
            the two happen to be equal).
    """

    def __init__(self, in_features, hidden_dim, num_targets):
        super().__init__()
        self.in_features = in_features
        self.middle_features = hidden_dim
        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)
        self.out_features = in_features

    def forward(self, features, attention_mask=None):
        """Pool ``features`` along dimension 1.

        Args:
            features: tensor indexed as (batch, position, in_features).
            attention_mask: optional (batch, position) tensor; positions where
                it equals 0 receive zero attention weight. With the default
                ``None`` every position takes part, exactly as before.

        Returns:
            Tensor of shape (batch, in_features).
        """
        att = torch.tanh(self.W(features))
        score = self.V(att)
        if attention_mask is not None:
            # Use the most negative finite value rather than -inf so that a row
            # with every position masked still yields finite weights.
            score = score.masked_fill(
                attention_mask.unsqueeze(-1) == 0,
                torch.finfo(score.dtype).min,
            )
        attention_weights = torch.softmax(score, dim=1)
        context_vector = torch.sum(attention_weights * features, dim=1)

        return context_vector

class PPPMModel(nn.Module):
    """Pretrained backbone + attention pooling head + linear regressor.

    The backbone is loaded from ``CFG.model_path`` with both dropout
    probabilities in its config set to 0.0.
    """

    def __init__(self):
        super().__init__()
        self.num_reinit_layers = 6

        backbone_cfg = AutoConfig.from_pretrained(CFG.model_path)
        backbone_cfg.attention_probs_dropout_prob = 0.0
        backbone_cfg.hidden_dropout_prob = 0.0
        self.config = backbone_cfg

        self.pre_model = AutoModel.from_pretrained(CFG.model_path, config=backbone_cfg)

        width = backbone_cfg.hidden_size
        self.head = AttentionHead(width, width, 1)
        self.regressor = nn.Linear(width, CFG.num_classes)
        # Re-initialisation of the top layers (see initialize) is intentionally
        # not invoked here.

    def forward(self, inputs):
        """Run the backbone on ``inputs`` (expanded as keyword arguments) and regress.

        Returns:
            The regressor output for the pooled first element of the backbone output.
        """
        token_states = self.pre_model(**inputs)[0]
        pooled = self.head(token_states)
        return self.regressor(pooled)

    def initialize(self):
        """Re-initialise the last ``num_reinit_layers`` encoder layers of the backbone."""
        for depth in range(1, self.num_reinit_layers + 1):
            self.pre_model.encoder.layer[-depth].apply(self._init_weight)

    def _init_weight(self, module):
        """Reset an ``nn.Linear``: normal weights (std = initializer_range), zero bias."""
        if not isinstance(module, nn.Linear):
            return
        module.weight.data.normal_(mean=0.0, std=self.pre_model.config.initializer_range)
        if module.bias is not None:
            module.bias.data.zero_()

In [None]:
# ----------------------------------------------
# func: valid, predict
# ----------------------------------------------
def predict(model, dataloader):
    """Run ``model`` over ``dataloader`` and collect its outputs.

    Args:
        model: module called as ``model(inputs)`` with a dict of tensors.
        dataloader: yields dict batches of tensors. A ``'label'`` entry, if
            present, is ignored so that labelled datasets can be used too
            (it was previously forwarded to the model as an input).

    Returns:
        ``np.ndarray`` of shape ``(len(dataloader.dataset), CFG.num_classes)``,
        filled in the order the batches are yielded.
    """
    model.eval()
    result = np.zeros((len(dataloader.dataset), CFG.num_classes))
    idx = 0

    with torch.no_grad():
        for data in dataloader:
            inputs = {k: v.to(CFG.device) for k, v in data.items() if k != 'label'}
            output = model(inputs)
            # Explicit tensor -> numpy conversion before writing into the buffer.
            result[idx:idx + output.shape[0], :] = output.cpu().numpy()
            idx += output.shape[0]

    return result


def valid_mse(model, dataloader):
    """Evaluate ``model`` on a labelled loader.

    Args:
        model: module called as ``model(inputs)`` with a dict of tensors.
        dataloader: yields dict batches containing a ``'label'`` tensor.

    Returns:
        Tuple ``(mse, predictions)``: the summed squared error divided by the
        dataset length, and the predictions reshaped to one dimension.
    """
    model.eval()
    n_samples = len(dataloader.dataset)
    preds = np.zeros((n_samples, CFG.num_classes))
    offset = 0
    sq_err_total = 0
    progress = tqdm(dataloader, total=len(dataloader))

    with torch.no_grad():
        for batch in progress:
            inputs = {
                key: tensor.to(CFG.device)
                for key, tensor in batch.items()
                if key != 'label'
            }
            label = batch['label'].to(CFG.device)
            output = model(inputs)

            stop = offset + output.shape[0]
            preds[offset:stop, :] = output.to('cpu')
            offset = stop

            sq_err_total += nn.MSELoss(reduction='sum')(output.flatten(), label).item()

    return sq_err_total / n_samples, preds.reshape(len(preds))


def metric_pearson(predictions, labels):
    """Return the Pearson correlation coefficient between two 1-D sequences."""
    corr_matrix = np.corrcoef(predictions, labels)
    # Off-diagonal entry of the 2x2 correlation matrix.
    return corr_matrix[0, 1]

In [None]:
# ----------------------------------------------
# func: train
# ----------------------------------------------
def train_fn(
    model,
    save_path,
    train_loader,
    val_loader,
    optimizer,
    scheduler=None,
    num_epochs=CFG.epochs
):
    """Train ``model`` and keep the checkpoint with the best validation Pearson.

    After every epoch the model is evaluated with ``valid_mse``; whenever the
    Pearson correlation improves, the state dict is written to ``save_path``
    and the predictions are remembered.

    Args:
        model: module called as ``model(inputs)`` with a dict of tensors.
        save_path: file the best state dict is saved to.
        train_loader: yields dict batches containing a ``'label'`` tensor.
        val_loader: labelled loader; its ``dataset.label`` is the metric target.
        optimizer: optimizer over the model parameters.
        scheduler: optional LR scheduler, stepped once per optimizer step.
        num_epochs: number of passes over ``train_loader``.

    Returns:
        Tuple ``(best_score, oof_preds)``: the best validation Pearson and the
        validation predictions of that epoch.

    Notes:
        * The first evaluated epoch is always checkpointed, so ``oof_preds`` is
          never ``None`` when ``num_epochs >= 1`` (previously nothing was saved
          if the correlation never exceeded 0 or was NaN).
        * Gradients are cleared at the start of each epoch and the last batch
          of an epoch always triggers an optimizer step, so partially
          accumulated gradients no longer leak into the next epoch.
        * Logged losses are the un-scaled batch loss; ``TotalLoss`` is a
          running mean over all batches seen so far (across epochs).
    """
    criterion = nn.MSELoss()
    accum_steps = CFG.gradient_accumulation_step
    num_batches = len(train_loader)

    best_score = 0
    oof_preds = None
    running_loss = 0.0
    dataset_size = 0

    start = time.time()

    for epoch in range(num_epochs):
        model.train()
        optimizer.zero_grad()
        bar = tqdm(train_loader, total=num_batches)
        for batch_idx, data in enumerate(bar):
            inputs = {k: v.to(CFG.device) for k, v in data.items() if k != 'label'}
            label = data['label'].to(CFG.device)
            batch_size = label.size(0)

            output = model(inputs)
            loss = criterion(output.flatten(), label)
            # Scale only what is back-propagated; `loss` stays un-scaled for logging.
            (loss / accum_steps).backward()

            is_last_batch = (batch_idx + 1) == num_batches
            if (batch_idx + 1) % accum_steps == 0 or is_last_batch:
                optimizer.step()
                optimizer.zero_grad()
                if scheduler:
                    scheduler.step()

            # Debug mode: cut the epoch short after CFG.debug_size batches.
            if CFG.debug and batch_idx > 0 and batch_idx % CFG.debug_size == 0:
                break

            running_loss += (loss.item() * batch_size)
            dataset_size += batch_size
            total_loss = running_loss / dataset_size
            bar.set_postfix(Epoch=epoch, Loss=loss.item(), TotalLoss=total_loss, LR=optimizer.param_groups[0]['lr'])

        val_start = time.time()
        val_score, predictions = valid_mse(model, val_loader)
        pearson = metric_pearson(predictions, val_loader.dataset.label)
        print(f"Epoch {epoch+1}, Step {batch_idx+1}, train_loss: {loss.item():0.5f}, val_loss: {val_score:0.5f}, pearson: {pearson:0.5f}")

        improved = (
            oof_preds is None          # always keep the first evaluated epoch
            or np.isnan(best_score)    # any defined score beats an undefined one
            or pearson > best_score
        )
        if improved:
            print(f"Model Inproved: {best_score} ----> {pearson}")
            best_score = pearson
            oof_preds = predictions
            torch.save(model.state_dict(), save_path)
        print(f"validation elasped time: {time.time() - val_start: 0.3}")

    print(f"total elasped time: {time.time() - start: 0.3}")

    return best_score, oof_preds

# ----------------------------------------------
# create optimizer
# ----------------------------------------------
def create_optimizer(model, weight_decay=0.01):
    """Build an AdamW optimizer with one parameter group per parameter.

    Parameters whose name *contains* one of the ``no_decay`` markers (biases
    and LayerNorm weights/biases) get ``weight_decay = 0``; all others get
    ``weight_decay``. The previous check compared the whole parameter name for
    equality with the markers, which never matches a dotted name such as
    ``encoder.layer.0.output.LayerNorm.weight``, so nothing was ever exempted.

    Args:
        model: module whose ``named_parameters()`` are optimised.
        weight_decay: decay applied to the non-exempt parameters.

    Returns:
        The AdamW optimizer.

    NOTE(review): no learning rate is passed, so AdamW falls back to its own
    default rather than ``CFG.learning_rate`` — confirm before enabling this
    optimizer.
    """
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optim_params = []
    for name_, params_ in model.named_parameters():
        exempt = any(marker in name_ for marker in no_decay)
        optim_params.append({'params': params_,
                             'weight_decay': 0 if exempt else weight_decay,
                             })

    return AdamW(optim_params)


# https://www.ai-shift.co.jp/techblog/2145
def create_optimizer_grouped_parameters(model):
    """Build an AdamW optimizer with layer-wise learning-rate decay.

    Groups, in order:

    1. Every parameter that is not part of ``model.pre_model`` (the pooling
       head and the regressor): ``lr = 1e-3``, no weight decay. The previous
       name filter (``'lstm'``, ``'cnn'``, ``'regressor'``) did not match the
       ``head.*`` parameters, so the attention head was in no group and was
       never updated.
    2. For the backbone embeddings and each encoder layer, walking from the
       last layer down to the embeddings: the learning rate starts at
       ``CFG.learning_rate`` and is multiplied by ``CFG.lr_decay`` before each
       layer is added. Each layer contributes a group with
       ``CFG.weight_decay`` and a group (biases, LayerNorm weights) without.

    Backbone modules other than the embeddings and encoder layers are not
    optimised.

    Args:
        model: module exposing ``pre_model.embeddings`` and
            ``pre_model.encoder.layer``.

    Returns:
        The AdamW optimizer.
    """
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters()
                       if not n.startswith("pre_model.")],
            "weight_decay": 0.0,
            "lr": 1e-3,
        },
    ]
    layers = [model.pre_model.embeddings] + list(model.pre_model.encoder.layer)
    layers.reverse()
    lr = CFG.learning_rate
    for layer in layers:
        lr *= CFG.lr_decay
        decay_params = []
        no_decay_params = []
        for n, p in layer.named_parameters():
            if any(nd in n for nd in no_decay):
                no_decay_params.append(p)
            else:
                decay_params.append(p)
        optimizer_grouped_parameters += [
            {
                "params": decay_params,
                "weight_decay": CFG.weight_decay,
                "lr": lr,
            },
            {
                "params": no_decay_params,
                "weight_decay": 0.0,
                "lr": lr,
            },
        ]
    return AdamW(optimizer_grouped_parameters)


In [None]:
# ----------------------------------------------
# Main Loop
# ----------------------------------------------
val_scores = []
oof_frames = []  # one validation frame (with predictions) per completed fold

for fold in range(CFG.num_fold):
    print(f"*** FOLD {fold+1} / {CFG.num_fold}***")

    save_path = f"{CFG.out_path}/model_{fold+1}.pth"

    train_data = train_df[train_df['fold'] != fold]
    # .copy(): a 'preds' column is added below; writing to a slice of train_df
    # triggers pandas' SettingWithCopyWarning.
    valid_data = train_df[train_df['fold'] == fold].copy()
    train_set = TrainDataset(train_data)
    valid_set = TrainDataset(valid_data)

    train_loader = DataLoader(train_set,
                            batch_size=CFG.train_batch_size,
                            shuffle=True,
                            drop_last=True,
                            num_workers=2,
                            pin_memory=True)
    valid_loader = DataLoader(valid_set,
                            batch_size=CFG.valid_batch_size,
                            shuffle=False,
                            drop_last=False,
                            num_workers=2,
                            pin_memory=True)

    model = PPPMModel().to(CFG.device)
    #optimizer = create_optimizer(model)
    optimizer = create_optimizer_grouped_parameters(model)
    #optimizer = AdamW(model.parameters(), lr=CFG.learning_rate)
    # The scheduler is stepped once per optimizer step, so its length must be
    # counted in optimizer steps, not batches (identical when the accumulation
    # step is 1).
    steps_per_epoch = math.ceil(len(train_loader) / CFG.gradient_accumulation_step)
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_training_steps=CFG.epochs * steps_per_epoch,
        num_warmup_steps=100
    )

    val_score, val_preds = train_fn(model, save_path, train_loader, valid_loader, optimizer, scheduler=scheduler)

    val_scores.append(val_score)
    valid_data['preds'] = val_preds
    oof_frames.append(valid_data)

    # The optimizer and scheduler hold references to the parameters as well;
    # drop all of them before asking CUDA to release cached memory.
    del model, optimizer, scheduler
    gc.collect()
    torch.cuda.empty_cache()

    print(val_scores)
    print("Mean:", np.array(val_scores).mean())
    if CFG.fold1_only and fold == 0:
        break

# Concatenate once instead of growing a DataFrame inside the loop.
oof_df = pd.concat(oof_frames)

*** FOLD 1 / 4***


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 1693/1693 [13:57<00:00,  2.02it/s, Epoch=0, LR=0.000913, Loss=0.029, TotalLoss=0.0343]
100%|██████████| 587/587 [01:25<00:00,  6.87it/s]


Epoch 1, Step 1693, train_loss: 0.02899, val_loss: 0.02885, pearson: 0.79166
Model Inproved: 0 ----> 0.7916612241529974
validation elasped time:  91.2


100%|██████████| 1693/1693 [13:56<00:00,  2.03it/s, Epoch=1, LR=0.000665, Loss=0.0164, TotalLoss=0.0255]
100%|██████████| 587/587 [01:25<00:00,  6.90it/s]


Epoch 2, Step 1693, train_loss: 0.01638, val_loss: 0.02524, pearson: 0.79142
validation elasped time:  85.1


100%|██████████| 1693/1693 [13:57<00:00,  2.02it/s, Epoch=2, LR=0.000353, Loss=0.00595, TotalLoss=0.0199]
100%|██████████| 587/587 [01:25<00:00,  6.90it/s]


Epoch 3, Step 1693, train_loss: 0.00595, val_loss: 0.02483, pearson: 0.79393
Model Inproved: 0.7916612241529974 ----> 0.7939309750767866
validation elasped time:  90.9


100%|██████████| 1693/1693 [13:57<00:00,  2.02it/s, Epoch=3, LR=9.77e-5, Loss=0.003, TotalLoss=0.0159]
100%|██████████| 587/587 [01:25<00:00,  6.90it/s]


Epoch 4, Step 1693, train_loss: 0.00300, val_loss: 0.02491, pearson: 0.79812
Model Inproved: 0.7939309750767866 ----> 0.7981230807332561
validation elasped time:  91.2


100%|██████████| 1693/1693 [13:57<00:00,  2.02it/s, Epoch=4, LR=0, Loss=0.000313, TotalLoss=0.0131]
100%|██████████| 587/587 [01:24<00:00,  6.91it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Epoch 5, Step 1693, train_loss: 0.00031, val_loss: 0.02521, pearson: 0.79557
validation elasped time:  85.0
total elasped time:  4.63e+03
[0.7981230807332561]
Mean: 0.7981230807332561
*** FOLD 2 / 4***


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 1725/1725 [14:13<00:00,  2.02it/s, Epoch=0, LR=0.000913, Loss=0.0187, TotalLoss=0.0365]
100%|██████████| 554/554 [01:20<00:00,  6.91it/s]


Epoch 1, Step 1725, train_loss: 0.01875, val_loss: 0.02856, pearson: 0.78137
Model Inproved: 0 ----> 0.7813744375280561
validation elasped time:  86.5


100%|██████████| 1725/1725 [14:15<00:00,  2.02it/s, Epoch=1, LR=0.000665, Loss=0.0171, TotalLoss=0.0265]
100%|██████████| 554/554 [01:20<00:00,  6.89it/s]


Epoch 2, Step 1725, train_loss: 0.01712, val_loss: 0.02912, pearson: 0.78761
Model Inproved: 0.7813744375280561 ----> 0.7876126281981025
validation elasped time:  86.0


100%|██████████| 1725/1725 [14:14<00:00,  2.02it/s, Epoch=2, LR=0.000353, Loss=0.00456, TotalLoss=0.0207]
100%|██████████| 554/554 [01:20<00:00,  6.87it/s]


Epoch 3, Step 1725, train_loss: 0.00456, val_loss: 0.02745, pearson: 0.78750
validation elasped time:  80.7


100%|██████████| 1725/1725 [14:14<00:00,  2.02it/s, Epoch=3, LR=9.77e-5, Loss=0.00148, TotalLoss=0.0166]
100%|██████████| 554/554 [01:20<00:00,  6.89it/s]


Epoch 4, Step 1725, train_loss: 0.00148, val_loss: 0.02566, pearson: 0.78889
Model Inproved: 0.7876126281981025 ----> 0.788889529750942
validation elasped time:  86.1


100%|██████████| 1725/1725 [14:14<00:00,  2.02it/s, Epoch=4, LR=0, Loss=0.0003, TotalLoss=0.0137]
100%|██████████| 554/554 [01:20<00:00,  6.89it/s]


Epoch 5, Step 1725, train_loss: 0.00030, val_loss: 0.02630, pearson: 0.78608
validation elasped time:  80.5
total elasped time:  4.69e+03
[0.7981230807332561, 0.788889529750942]
Mean: 0.793506305242099
*** FOLD 3 / 4***


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 1741/1741 [14:23<00:00,  2.02it/s, Epoch=0, LR=0.000913, Loss=0.0238, TotalLoss=0.0354]
100%|██████████| 539/539 [01:18<00:00,  6.87it/s]


Epoch 1, Step 1741, train_loss: 0.02379, val_loss: 0.02741, pearson: 0.77882
Model Inproved: 0 ----> 0.7788227631184932
validation elasped time:  84.5


100%|██████████| 1741/1741 [14:23<00:00,  2.02it/s, Epoch=1, LR=0.000665, Loss=0.0252, TotalLoss=0.0261]
100%|██████████| 539/539 [01:18<00:00,  6.88it/s]


Epoch 2, Step 1741, train_loss: 0.02519, val_loss: 0.02878, pearson: 0.79298
Model Inproved: 0.7788227631184932 ----> 0.7929763072531734
validation elasped time:  84.7


100%|██████████| 1741/1741 [14:23<00:00,  2.02it/s, Epoch=2, LR=0.000352, Loss=0.00758, TotalLoss=0.0204]
100%|██████████| 539/539 [01:18<00:00,  6.87it/s]


Epoch 3, Step 1741, train_loss: 0.00758, val_loss: 0.02461, pearson: 0.80708
Model Inproved: 0.7929763072531734 ----> 0.8070817214234481
validation elasped time:  84.2


100%|██████████| 1741/1741 [14:23<00:00,  2.02it/s, Epoch=3, LR=9.76e-5, Loss=0.00597, TotalLoss=0.0164]
100%|██████████| 539/539 [01:18<00:00,  6.87it/s]


Epoch 4, Step 1741, train_loss: 0.00597, val_loss: 0.02454, pearson: 0.80393
validation elasped time:  78.5


100%|██████████| 1741/1741 [14:23<00:00,  2.02it/s, Epoch=4, LR=0, Loss=0.00024, TotalLoss=0.0135]
100%|██████████| 539/539 [01:18<00:00,  6.89it/s]


Epoch 5, Step 1741, train_loss: 0.00024, val_loss: 0.02476, pearson: 0.80254
validation elasped time:  78.3
total elasped time:  4.73e+03
[0.7981230807332561, 0.788889529750942, 0.8070817214234481]
Mean: 0.7980314439692154
*** FOLD 4 / 4***


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 1678/1678 [13:52<00:00,  2.02it/s, Epoch=0, LR=0.000913, Loss=0.0478, TotalLoss=0.0402]
100%|██████████| 602/602 [01:27<00:00,  6.89it/s]


Epoch 1, Step 1678, train_loss: 0.04781, val_loss: 0.02945, pearson: 0.75876
Model Inproved: 0 ----> 0.7587614238050311
validation elasped time:  93.5


100%|██████████| 1678/1678 [13:51<00:00,  2.02it/s, Epoch=1, LR=0.000665, Loss=0.0253, TotalLoss=0.0288]
100%|██████████| 602/602 [01:27<00:00,  6.89it/s]


Epoch 2, Step 1678, train_loss: 0.02530, val_loss: 0.02958, pearson: 0.76017
Model Inproved: 0.7587614238050311 ----> 0.7601720103799546
validation elasped time:  93.0


100%|██████████| 1678/1678 [13:52<00:00,  2.02it/s, Epoch=2, LR=0.000353, Loss=0.0068, TotalLoss=0.0224]
100%|██████████| 602/602 [01:27<00:00,  6.89it/s]


Epoch 3, Step 1678, train_loss: 0.00680, val_loss: 0.02886, pearson: 0.76611
Model Inproved: 0.7601720103799546 ----> 0.7661109996932539
validation elasped time:  93.0


100%|██████████| 1678/1678 [13:51<00:00,  2.02it/s, Epoch=3, LR=9.77e-5, Loss=0.00503, TotalLoss=0.0179]
100%|██████████| 602/602 [01:27<00:00,  6.89it/s]


Epoch 4, Step 1678, train_loss: 0.00503, val_loss: 0.02818, pearson: 0.76832
Model Inproved: 0.7661109996932539 ----> 0.7683177676480741
validation elasped time:  93.4


100%|██████████| 1678/1678 [13:51<00:00,  2.02it/s, Epoch=4, LR=0, Loss=0.000585, TotalLoss=0.0148]
100%|██████████| 602/602 [01:27<00:00,  6.88it/s]


Epoch 5, Step 1678, train_loss: 0.00059, val_loss: 0.02803, pearson: 0.76595
validation elasped time:  87.5
total elasped time:  4.62e+03
[0.7981230807332561, 0.788889529750942, 0.8070817214234481, 0.7683177676480741]
Mean: 0.7906030248889301


In [None]:
"""for batch_idx, data in enumerate(valid_loader):
  print(batch_idx)
"""

'for batch_idx, data in enumerate(valid_loader):\n  print(batch_idx)\n'

In [None]:
# Collect the best Pearson of each fold plus the correlation over all
# out-of-fold predictions into one labelled Series.
fold_scores = {f'fold{fold_idx}': fold_score for fold_idx, fold_score in enumerate(val_scores)}
fold_scores['oof'] = np.corrcoef(oof_df['preds'], oof_df['score'])[0][1]
scores = pd.Series(fold_scores)
print(scores)

fold0    0.798123
fold1    0.788890
fold2    0.807082
fold3    0.768318
oof      0.789483
dtype: float64


In [None]:
# Persist the score summary and the out-of-fold predictions in the experiment directory.
# NOTE(review): CFG.scores_path is created earlier but nothing visible writes to it —
# confirm whether scores.csv was meant to go there.
scores.to_csv(f'{CFG.out_path}/scores.csv')
oof_df.to_csv(f'{CFG.out_path}/oof_df.csv')