In [None]:
# Print the GPU model, driver/CUDA versions and current memory usage of this runtime.
!nvidia-smi

Fri Jun 17 03:37:57 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  A100-SXM4-40GB      Off  | 00000000:00:04.0 Off |                    0 |
| N/A   31C    P0    42W / 400W |      0MiB / 40536MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!pip install transformers==4.19.1
!pip install sentencepiece==0.1.96

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==4.19.1
  Downloading transformers-4.19.1-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 5.0 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 71.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 6.4 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 72.3 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uni

Based on the Kaggle notebook: https://www.kaggle.com/code/ksork6s4/uspppm-bert-for-patents-baseline-train

In [None]:
# ----------------------------------------------
# Load Libraries
# ----------------------------------------------
import pathlib
from pathlib import Path
import sys
import os
import math
import random
import time

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn import functional as F

from transformers import AdamW
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import BertForSequenceClassification, BertConfig, BertModel
from transformers import get_cosine_schedule_with_warmup

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import train_test_split

from tqdm import tqdm

import gc
gc.enable()

# Config

In [None]:
class CFG:
    """Experiment configuration: paths, run switches and training hyper-parameters."""

    # --- experiment identity and file locations ---
    exp_id = 'exp127'
    input_path = 'input/'
    cpc_path = 'input/'
    model_path = 'anferico/bert-for-patents'
    out_base = 'output'
    out_path = f'{out_base}/{exp_id}'
    model_save_path = f'{out_path}/models'

    # --- run switches ---
    debug = False
    fold1_only = False
    debug_size = 100      # in debug mode an epoch stops at the first positive multiple of this batch index
    log_interval = 1822   # unused

    # --- data / model / optimisation settings ---
    seed = 42
    max_len = 92
    learning_rate = 2e-5
    weight_decay = 0.01
    lr_decay = 0.98
    num_classes = 5
    num_fold = 4
    train_batch_size = 16
    valid_batch_size = 16
    gradient_accumulation_step = 1
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # --- values derived from the switches ---
    # A debug run trains for a single epoch; debug and single-fold runs do not upload.
    epochs = 1 if debug else 5
    upload_dataset = not (debug or fold1_only)

In [None]:
# Create the experiment output and checkpoint directories. pathlib is used instead of
# a shell call so paths with spaces or shell metacharacters are handled safely.
for out_dir in (CFG.out_path, CFG.model_save_path):
    Path(out_dir).mkdir(parents=True, exist_ok=True)

# Preproc

In [None]:
# ----------------------------------------------
# Set SEED
# ----------------------------------------------
# seed
SEED = CFG.seed
def set_seed(SEED):
    """Seed every random number generator used in this notebook with ``SEED``.

    Sets ``PYTHONHASHSEED``, seeds Python's ``random``, NumPy and PyTorch
    (CPU, current CUDA device and all CUDA devices), and switches cuDNN to
    deterministic mode.
    """
    os.environ['PYTHONHASHSEED'] = str(SEED)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_rng in seeders:
        seed_rng(SEED)

    torch.backends.cudnn.deterministic = True
    
set_seed(SEED)

In [None]:
train_df = pd.read_csv(f"{CFG.input_path}train.csv")
# https://www.kaggle.com/code/gauravbrills/folds-dump-the-two-paths-fix
# NOTE(security): torch.load unpickles the file, which can execute arbitrary code.
# Only load cpc_texts_fixed.pth from a source you trust.
cpc_texts = torch.load(CFG.cpc_path+"cpc_texts_fixed.pth")
train_df['context_text'] = train_df['context'].map(cpc_texts)

# Fail early: an unmapped context becomes NaN here and would otherwise only surface
# later, when a NaN 'input' value is handed to the tokenizer.
unmapped_contexts = train_df.loc[train_df['context_text'].isna(), 'context'].unique()
assert len(unmapped_contexts) == 0, (
    f"contexts missing from cpc_texts: {list(unmapped_contexts)[:10]}"
)
display(train_df.head())

Unnamed: 0,id,anchor,target,context,score,context_text
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...


In [None]:
!pip install -q iterative-stratification
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

dfx = pd.get_dummies(train_df, columns=["score"]).groupby(["anchor"], as_index=False).sum()
cols = [c for c in dfx.columns if c.startswith("score_") or c == "anchor"]
dfx = dfx[cols]

mskf = MultilabelStratifiedKFold(n_splits=CFG.num_fold, shuffle=True, random_state=42)
labels = [c for c in dfx.columns if c != "anchor"]
dfx_labels = dfx[labels]
dfx["fold"] = -1

for fold, (trn_, val_) in enumerate(mskf.split(dfx, dfx_labels)):
    print(len(trn_), len(val_))
    dfx.loc[val_, "fold"] = fold

train_df = train_df.merge(dfx[["anchor", "fold"]], on="anchor", how="left")

550 183
549 184
550 183
550 183


In [None]:
# Number of training rows assigned to each fold, ordered by fold id.
train_df['fold'].value_counts().sort_index()

0    9379
1    8860
2    8612
3    9622
Name: fold, dtype: int64

In [None]:
# Sanity check: print the frame's shape, then show its first rows.
print(train_df.shape)
train_df.head()

(36473, 7)


Unnamed: 0,id,anchor,target,context,score,context_text,fold
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,0
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,0
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,0
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,0
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,0


In [None]:
# Model input text: anchor, target and context description joined by the literal
# '[SEP]' marker.
sep_marker = '[SEP]'
anchor_and_target = train_df['anchor'] + sep_marker + train_df['target']
train_df['input'] = anchor_and_target + sep_marker + train_df['context_text']

In [None]:
# One-hot encode the score into one column per score level.
# - reindex guarantees all five columns exist, in a fixed order, even when a level is
#   absent from the data (e.g. on a small subset).
# - dropping any existing level columns first makes the cell safe to re-run; otherwise
#   a second run would append duplicate columns and silently widen the label matrix.
score_levels = [0.0, 0.25, 0.5, 0.75, 1.0]
score_onehot = pd.get_dummies(train_df['score']).reindex(columns=score_levels, fill_value=0)
train_df = pd.concat(
    [train_df.drop(columns=score_levels, errors='ignore'), score_onehot],
    axis='columns',
)

In [None]:
# Show the one-hot label matrix (one column per score level).
onehot_columns = [0.0, 0.25, 0.5, 0.75, 1.0]
train_df[onehot_columns].to_numpy()

array([[0, 0, 1, 0, 0],
       [0, 0, 0, 1, 0],
       [0, 1, 0, 0, 0],
       ...,
       [0, 0, 1, 0, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 1, 0, 0]], dtype=uint8)

In [None]:
# Preview the first rows of the prepared training frame.
train_df.head()

Unnamed: 0,id,anchor,target,context,score,context_text,fold,input,0.0,0.25,0.5,0.75,1.0
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,0,abatement[SEP]abatement of pollution[SEP]HUMAN...,0,0,1,0,0
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,0,abatement[SEP]act of abating[SEP]HUMAN NECESSI...,0,0,0,1,0
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,0,abatement[SEP]active catalyst[SEP]HUMAN NECESS...,0,1,0,0,0
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,0,abatement[SEP]eliminating process[SEP]HUMAN NE...,0,0,1,0,0
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,0,abatement[SEP]forest region[SEP]HUMAN NECESSIT...,1,0,0,0,0


# Tokenizer

In [None]:
# Load the tokenizer for the checkpoint named in CFG.model_path.
tokenizer = AutoTokenizer.from_pretrained(CFG.model_path)

In [None]:
# Check the token-length distribution (disabled: wrapped in a string literal, so it does not run)
"""
token_len = train_df['input'].map(lambda x: tokenizer(x)['input_ids'].__len__())
display(token_len.describe())
token_len.hist(bins=100)
"""

"\ntoken_len = train_df['input'].map(lambda x: tokenizer(x)['input_ids'].__len__())\ndisplay(token_len.describe())\ntoken_len.hist(bins=100)\n"

# Dataset

In [None]:
def prepare_input(tokenizer, text):
    """Tokenize ``text`` to exactly ``CFG.max_len`` tokens and tensorize the result.

    Args:
        tokenizer: callable tokenizer accepting the keyword arguments used below.
        text: the string to encode.

    Returns:
        The tokenizer's output mapping, with every value replaced in place by a
        ``torch.long`` tensor.
    """
    encoded = tokenizer(
        text,
        add_special_tokens=True,
        max_length=CFG.max_len,
        padding="max_length",   # pad short inputs up to max_length
        truncation=True,        # cut long inputs down to max_length
        return_offsets_mapping=False,
    )
    for field, values in encoded.items():
        encoded[field] = torch.tensor(values, dtype=torch.long)
    return encoded


class TrainDataset(Dataset):
    """Dataset pairing each tokenized ``input`` string with its one-hot score label.

    Reads the ``'input'`` column and the five score-level columns
    (0.0, 0.25, 0.5, 0.75, 1.0) of ``df``. Tokenization happens per item, using
    the notebook-level ``tokenizer``.
    """

    def __init__(self, df):
        onehot_columns = [0.0, 0.25, 0.5, 0.75, 1.0]
        self.inputs = df['input'].to_numpy()
        self.label = df[onehot_columns].to_numpy()

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, item):
        # Encode the text, then attach the one-hot label as float32 under 'label'.
        sample = prepare_input(tokenizer, self.inputs[item])
        sample['label'] = torch.tensor(self.label[item], dtype=torch.float32)
        return sample

# Train

In [None]:
# ----------------------------------------------
# Model
# ----------------------------------------------
class AttentionHead(nn.Module):
    """Attention pooling: collapse dim 1 of the input into one weighted-sum vector.

    A score per position is computed as ``V(tanh(W(features)))``, normalised with
    a softmax over dim 1, and used to weight-sum the *input* features.

    Args:
        in_features: size of the last dimension of the input.
        hidden_dim: width of the intermediate projection ``W``.
        num_targets: accepted for interface compatibility; not used.
    """

    def __init__(self, in_features, hidden_dim, num_targets):
        super().__init__()
        self.in_features = in_features
        self.middle_features = hidden_dim
        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)
        # forward() returns a weighted sum of the input features, so the output width
        # is in_features (not hidden_dim, as previously reported).
        self.out_features = in_features

    def forward(self, features):
        """Return the attention-weighted sum of ``features`` over dim 1.

        Presumably ``features`` is (batch, seq, in_features), giving a
        (batch, in_features) result — confirm against the caller.
        """
        att = torch.tanh(self.W(features))
        score = self.V(att)
        attention_weights = torch.softmax(score, dim=1)
        context_vector = torch.sum(attention_weights * features, dim=1)

        return context_vector

class PPPMModel(nn.Module):
    """Pretrained encoder followed by a linear head producing ``CFG.num_classes`` logits.

    The head input is the position-0 vector of each of the last four hidden-state
    layers, concatenated (hence ``hidden_size * 4``).
    """

    def __init__(self):
        super().__init__()
        self.config = AutoConfig.from_pretrained(CFG.model_path)
        # Ask the encoder to return every layer's hidden states, not only the last.
        self.config.update({"output_hidden_states": True})
        self.pre_model = AutoModel.from_pretrained(CFG.model_path, config=self.config)
        self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
        # dropout1 .. dropout5 with rates 0.1 .. 0.5
        for number, rate in enumerate((0.1, 0.2, 0.3, 0.4, 0.5), start=1):
            setattr(self, f"dropout{number}", nn.Dropout(rate))
        self.regressor = nn.Linear(self.config.hidden_size*4, CFG.num_classes)

    def forward(self, inputs):
        """Return logits averaged over the five dropout variants of the pooled features."""
        hidden_states = self.pre_model(**inputs)["hidden_states"]
        # Position-0 vectors of the last four layers, last layer first.
        pooled = torch.cat([hidden_states[-depth][:, 0] for depth in (1, 2, 3, 4)], dim=1)
        pooled = self.dropout(pooled)

        # The same linear head is applied under five different dropout rates.
        variants = (self.dropout1, self.dropout2, self.dropout3, self.dropout4, self.dropout5)
        logits_total = None
        for drop in variants:
            variant_logits = self.regressor(drop(pooled))
            logits_total = variant_logits if logits_total is None else logits_total + variant_logits
        return logits_total / 5

In [None]:
# ----------------------------------------------
# func: valid, predict
# ----------------------------------------------
def predict(model, dataloader):
    """Run ``model`` over ``dataloader`` and return class probabilities.

    Returns:
        NumPy array of shape ``(len(dataloader.dataset), CFG.num_classes)`` holding
        the softmax of the model output, filled in iteration order.
    """
    model.eval()
    result = np.zeros((len(dataloader.dataset), CFG.num_classes))
    cursor = 0

    with torch.no_grad():
        for batch in dataloader:
            # Everything except the label is a model input; move it to the device.
            features = {
                name: tensor.to(CFG.device)
                for name, tensor in batch.items()
                if name != 'label'
            }
            probs = torch.softmax(model(features), dim=1)
            stop = cursor + probs.shape[0]
            result[cursor:stop, :] = probs.to('cpu')
            cursor = stop

    return result


def valid_func(model, dataloader):
    """Evaluate ``model`` on ``dataloader``.

    Returns:
        A tuple ``(mean_loss, probabilities)``: the cross-entropy loss averaged
        over all samples, and a NumPy array of shape
        ``(len(dataloader.dataset), CFG.num_classes)`` with the softmax outputs in
        iteration order.
    """
    model.eval()
    num_samples = len(dataloader.dataset)
    result = np.zeros((num_samples, CFG.num_classes))
    idx = 0
    loss_sum = 0
    criterion = nn.CrossEntropyLoss(reduction='sum')
    bar = tqdm(dataloader, total=len(dataloader))

    with torch.no_grad():
        for data in bar:
            inputs = {k: v.to(CFG.device) for k, v in data.items() if k != 'label'}
            label = data['label'].to(CFG.device)
            logits = model(inputs)

            # CrossEntropyLoss applies log-softmax itself, so it must be given the raw
            # logits. Passing softmax probabilities (as before) applies softmax twice
            # and yields a loss that is not comparable to the training loss.
            loss_sum += criterion(logits, label).item()

            probs = torch.softmax(logits, dim=1)
            result[idx:idx + probs.shape[0], :] = probs.to('cpu')
            idx += probs.shape[0]

    return loss_sum / num_samples, result

def label_to_score(label):
    """Collapse per-class rows into scalar scores.

    Each row of ``label`` (one value per score level 0, 0.25, 0.5, 0.75, 1.0) is
    multiplied element-wise by the level values and summed along axis 1. A
    one-hot row therefore gives its level; a probability row gives the expected
    score.
    """
    level_values = np.array([0, 0.25, 0.5, 0.75, 1.0])
    weighted = np.multiply(label, level_values)
    return weighted.sum(axis=1)

def metric_pearson(predictions, labels):
    """Pearson correlation between the scores derived from ``predictions`` and ``labels``.

    Both arguments are converted with ``label_to_score`` first. The result is the
    off-diagonal entry of the 2x2 correlation matrix.
    """
    pred_score, label_score = (label_to_score(rows) for rows in (predictions, labels))
    correlation = np.corrcoef(pred_score, label_score)
    return correlation[0, 1]

In [None]:
# Disabled scratch check: wrapped in a string literal, so none of it executes.
"""preds = predict(model, valid_loader)
label_to_score(valid_loader.dataset.label)
metric_pearson(preds, valid_loader.dataset.label)"""

'preds = predict(model, valid_loader)\nlabel_to_score(valid_loader.dataset.label)\nmetric_pearson(preds, valid_loader.dataset.label)'

In [None]:
# ----------------------------------------------
# func: train
# ----------------------------------------------
def train_fn(
    model,
    save_path,
    train_loader,
    val_loader,
    optimizer,
    scheduler=None,
    num_epochs=CFG.epochs
):
    """Train ``model``, validate after every epoch and checkpoint the best epoch.

    Args:
        model: module called as ``model(inputs)`` with a dict of input tensors.
        save_path: file the best ``state_dict`` is written to.
        train_loader: loader yielding dicts with input tensors plus ``'label'``.
        val_loader: validation loader; its ``dataset.label`` is used as ground truth.
        optimizer: optimizer stepped every ``CFG.gradient_accumulation_step`` batches.
        scheduler: optional LR scheduler, stepped together with the optimizer.
        num_epochs: number of epochs to run.

    Returns:
        ``(best_score, oof_preds)``: the best validation Pearson correlation and
        the validation predictions of the epoch that achieved it.
    """
    best_score = 0
    oof_preds = None
    running_loss = 0.0
    dataset_size = 0
    accumulation_steps = CFG.gradient_accumulation_step
    num_batches = len(train_loader)
    criterion = nn.CrossEntropyLoss()

    start = time.time()

    for epoch in range(num_epochs):
        model.train()
        bar = tqdm(train_loader, total=num_batches)
        for batch_idx, data in enumerate(bar):
            inputs = {k: v.to(CFG.device) for k, v in data.items() if k != 'label'}
            label = data['label'].to(CFG.device)
            batch_size = label.size(0)

            output = model(inputs)
            # Scale so that gradients accumulated over several batches average out.
            loss = criterion(output, label) / accumulation_steps
            loss.backward()

            # Step every `accumulation_steps` batches, and also on the final batch of
            # the epoch, so leftover gradients are neither dropped nor carried into
            # the next epoch.
            is_last_batch = (batch_idx + 1) == num_batches
            if (batch_idx + 1) % accumulation_steps == 0 or is_last_batch:
                optimizer.step()
                optimizer.zero_grad()
                if scheduler:
                    scheduler.step()

            # Debug runs cut each epoch short.
            if CFG.debug and batch_idx > 0 and batch_idx % CFG.debug_size == 0:
                break

            # TotalLoss is a running average since the start of training; it is not
            # reset between epochs.
            running_loss += (loss.item() * batch_size)
            dataset_size += batch_size
            total_loss = running_loss / dataset_size
            bar.set_postfix(Epoch=epoch, Loss=loss.item(), TotalLoss=total_loss, LR=optimizer.param_groups[0]['lr'])

        val_start = time.time()
        val_score, predictions = valid_func(model, val_loader)
        pearson = metric_pearson(predictions, val_loader.dataset.label)
        print(f"Epoch {epoch+1}, Step {batch_idx+1}, train_loss: {loss:0.5f}, val_loss: {val_score:0.5f}, pearson: {pearson:0.5f}")
        # Always keep the first evaluated epoch: otherwise a run whose correlation
        # never exceeds 0 (or is NaN) would return oof_preds=None and save nothing.
        if oof_preds is None or pearson > best_score:
            print(f"Model Inproved: {best_score} ----> {pearson}")
            best_score = pearson
            oof_preds = predictions
            torch.save(model.state_dict(), save_path)
        print(f"validation elasped time: {time.time() - val_start: 0.3}")

    print(f"total elasped time: {time.time() - start: 0.3}")

    return best_score, oof_preds

# ----------------------------------------------
# create optimizer
# ----------------------------------------------
def create_optimizer(model, weight_decay=0.01):
    """Build an AdamW optimizer with one parameter group per parameter.

    Parameters whose name contains ``"bias"``, ``"LayerNorm.bias"`` or
    ``"LayerNorm.weight"`` get no weight decay; all others get ``weight_decay``.

    Args:
        model: module whose ``named_parameters()`` are optimised.
        weight_decay: decay applied to the parameters that are not exempt.

    Returns:
        The ``AdamW`` instance, constructed with its default learning rate.
    """
    no_decay = ("bias", "LayerNorm.bias", "LayerNorm.weight")

    optim_params = []
    for name_, params_ in model.named_parameters():
        # Parameter names are fully qualified (e.g. "...LayerNorm.weight"), so the
        # markers must be matched as substrings. The previous `name_ in no_decay`
        # compared the whole name and never matched.
        exempt = any(marker in name_ for marker in no_decay)
        optim_params.append({
            'params': params_,
            'weight_decay': 0.0 if exempt else weight_decay,
        })

    return AdamW(optim_params)


# https://www.ai-shift.co.jp/techblog/2145
def create_optimizer_grouped_parameters(model, head_lr=1e-3):
    """Build AdamW with a separate head group and layer-wise decayed backbone LRs.

    Group 0 holds every parameter whose name contains ``'lstm'``, ``'cnn'`` or
    ``'regressor'``, with learning rate ``head_lr`` and no weight decay. The
    layers of ``model.pre_model`` (embeddings, then encoder layers) are then
    visited from last to first. The learning rate starts at
    ``CFG.learning_rate`` and is multiplied by ``CFG.lr_decay`` before each
    layer. Within a layer, parameters whose name contains ``"bias"`` or
    ``"LayerNorm.weight"`` get no weight decay; the rest get ``CFG.weight_decay``.

    Args:
        model: module exposing ``pre_model.embeddings`` and ``pre_model.encoder.layer``.
        head_lr: learning rate of the head group.

    Returns:
        The configured ``AdamW`` optimizer.
    """
    no_decay = ["bias", "LayerNorm.weight"]
    head_markers = ('lstm', 'cnn', 'regressor')

    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters()
                       if any(marker in n for marker in head_markers)],
            "weight_decay": 0.0,
            "lr": head_lr,
        },
    ]

    backbone = model.pre_model
    layers = [backbone.embeddings] + list(backbone.encoder.layer)

    lr = CFG.learning_rate
    for layer in reversed(layers):
        lr *= CFG.lr_decay
        decayed, exempt = [], []
        for n, p in layer.named_parameters():
            (exempt if any(nd in n for nd in no_decay) else decayed).append(p)
        optimizer_grouped_parameters += [
            {"params": decayed, "weight_decay": CFG.weight_decay, "lr": lr},
            {"params": exempt, "weight_decay": 0.0, "lr": lr},
        ]
    return AdamW(optimizer_grouped_parameters)


In [None]:
# ----------------------------------------------
# Main Loop
# ----------------------------------------------
val_scores = []
oof_frames = []  # per-fold validation frames, concatenated once after the loop

for fold in range(CFG.num_fold):
    print(f"*** FOLD {fold+1} / {CFG.num_fold}***")

    save_path = f"{CFG.model_save_path}/model_{fold+1}.pth"

    train_data = train_df[train_df['fold'] != fold]
    # Copy: a 'preds' column is added below, and writing into a slice of train_df
    # raises SettingWithCopyWarning.
    valid_data = train_df[train_df['fold'] == fold].copy()
    train_set = TrainDataset(train_data)
    valid_set = TrainDataset(valid_data)

    train_loader = DataLoader(train_set,
                            batch_size=CFG.train_batch_size,
                            shuffle=True,
                            drop_last=True,
                            num_workers=2,
                            pin_memory=True)
    valid_loader = DataLoader(valid_set,
                            batch_size=CFG.valid_batch_size,
                            shuffle=False,
                            drop_last=False,
                            num_workers=2,
                            pin_memory=True)

    model = PPPMModel().to(CFG.device)
    optimizer = create_optimizer_grouped_parameters(model)
    #optimizer = AdamW(model.parameters(), lr=CFG.learning_rate)
    # The scheduler advances once per optimizer step, and an epoch performs at most
    # ceil(batches / accumulation) optimizer steps.
    steps_per_epoch = math.ceil(len(train_loader) / CFG.gradient_accumulation_step)
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_training_steps=CFG.epochs*steps_per_epoch,
        num_warmup_steps=100
    )

    val_score, val_preds = train_fn(model, save_path, train_loader, valid_loader, optimizer, scheduler=scheduler)
    val_scores.append(val_score)
    val_preds = label_to_score(val_preds)
    valid_data['preds'] = val_preds
    oof_frames.append(valid_data)

    # The optimizer and scheduler also reference the model's parameters, so all three
    # must be released before the GPU cache can actually be freed.
    del model, optimizer, scheduler
    gc.collect()
    torch.cuda.empty_cache()

    print(val_scores)
    print("Mean:", np.array(val_scores).mean())
    if CFG.fold1_only and fold == 0:
        break

oof_df = pd.concat(oof_frames)

*** FOLD 1 / 4***


Some weights of the model checkpoint at anferico/bert-for-patents were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 1693/1693 [03:11<00:00,  8.86it/s, Epoch=0, LR=0.000913, Loss=0.514, TotalLoss=0.95]
100%|████████

Epoch 1, Step 1693, train_loss: 0.51354, val_loss: 1.26599, pearson: 0.81584
Model Inproved: 0 ----> 0.8158388648094539
validation elasped time:  26.8


100%|██████████| 1693/1693 [03:11<00:00,  8.85it/s, Epoch=1, LR=0.000665, Loss=0.462, TotalLoss=0.78]
100%|██████████| 587/587 [00:17<00:00, 34.10it/s]


Epoch 2, Step 1693, train_loss: 0.46205, val_loss: 1.22580, pearson: 0.82069
Model Inproved: 0.8158388648094539 ----> 0.8206906985176693
validation elasped time:  22.3


100%|██████████| 1693/1693 [03:11<00:00,  8.85it/s, Epoch=2, LR=0.000353, Loss=0.439, TotalLoss=0.649]
100%|██████████| 587/587 [00:17<00:00, 34.43it/s]


Epoch 3, Step 1693, train_loss: 0.43869, val_loss: 1.20781, pearson: 0.82705
Model Inproved: 0.8206906985176693 ----> 0.8270490402074299
validation elasped time:  22.7


100%|██████████| 1693/1693 [03:11<00:00,  8.85it/s, Epoch=3, LR=9.77e-5, Loss=0.224, TotalLoss=0.542]
100%|██████████| 587/587 [00:16<00:00, 34.71it/s]


Epoch 4, Step 1693, train_loss: 0.22402, val_loss: 1.20181, pearson: 0.82041
validation elasped time:  16.9


100%|██████████| 1693/1693 [03:11<00:00,  8.86it/s, Epoch=4, LR=0, Loss=0.0348, TotalLoss=0.459]
100%|██████████| 587/587 [00:17<00:00, 34.10it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Epoch 5, Step 1693, train_loss: 0.03481, val_loss: 1.19977, pearson: 0.81495
validation elasped time:  17.2
total elasped time:  1.06e+03
[0.8270490402074299]
Mean: 0.8270490402074299
*** FOLD 2 / 4***


Some weights of the model checkpoint at anferico/bert-for-patents were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 1725/1725 [03:15<00:00,  8.82it/s, Epoch=0, LR=0.000913, Loss=0.957, TotalLoss=0.931]
100%|███████

Epoch 1, Step 1725, train_loss: 0.95675, val_loss: 1.25519, pearson: 0.80728
Model Inproved: 0 ----> 0.807275655232252
validation elasped time:  20.3


100%|██████████| 1725/1725 [03:15<00:00,  8.83it/s, Epoch=1, LR=0.000665, Loss=0.715, TotalLoss=0.761]
100%|██████████| 554/554 [00:16<00:00, 34.17it/s]


Epoch 2, Step 1725, train_loss: 0.71482, val_loss: 1.23091, pearson: 0.81561
Model Inproved: 0.807275655232252 ----> 0.8156091596026211
validation elasped time:  20.3


100%|██████████| 1725/1725 [03:15<00:00,  8.82it/s, Epoch=2, LR=0.000353, Loss=0.437, TotalLoss=0.63]
100%|██████████| 554/554 [00:16<00:00, 34.23it/s]


Epoch 3, Step 1725, train_loss: 0.43695, val_loss: 1.22210, pearson: 0.80943
validation elasped time:  16.2


100%|██████████| 1725/1725 [03:15<00:00,  8.82it/s, Epoch=3, LR=9.77e-5, Loss=0.0385, TotalLoss=0.523]
100%|██████████| 554/554 [00:16<00:00, 33.83it/s]


Epoch 4, Step 1725, train_loss: 0.03846, val_loss: 1.21305, pearson: 0.80811
validation elasped time:  16.4


100%|██████████| 1725/1725 [03:16<00:00,  8.79it/s, Epoch=4, LR=0, Loss=0.00685, TotalLoss=0.441]
100%|██████████| 554/554 [00:16<00:00, 34.19it/s]


Epoch 5, Step 1725, train_loss: 0.00685, val_loss: 1.20863, pearson: 0.80544
validation elasped time:  16.2
total elasped time:  1.07e+03
[0.8270490402074299, 0.8156091596026211]
Mean: 0.8213290999050256
*** FOLD 3 / 4***


Some weights of the model checkpoint at anferico/bert-for-patents were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 1741/1741 [03:18<00:00,  8.78it/s, Epoch=0, LR=0.000913, Loss=1.06, TotalLoss=0.949]
100%|████████

Epoch 1, Step 1741, train_loss: 1.06450, val_loss: 1.26002, pearson: 0.81140
Model Inproved: 0 ----> 0.8114031208062441
validation elasped time:  19.7


100%|██████████| 1741/1741 [03:17<00:00,  8.81it/s, Epoch=1, LR=0.000665, Loss=0.745, TotalLoss=0.778]
100%|██████████| 539/539 [00:15<00:00, 34.47it/s]


Epoch 2, Step 1741, train_loss: 0.74500, val_loss: 1.22304, pearson: 0.82414
Model Inproved: 0.8114031208062441 ----> 0.8241441670641664
validation elasped time:  19.8


100%|██████████| 1741/1741 [03:17<00:00,  8.83it/s, Epoch=2, LR=0.000352, Loss=0.208, TotalLoss=0.646]
100%|██████████| 539/539 [00:16<00:00, 33.68it/s]


Epoch 3, Step 1741, train_loss: 0.20812, val_loss: 1.20105, pearson: 0.83364
Model Inproved: 0.8241441670641664 ----> 0.8336366645518293
validation elasped time:  20.2


100%|██████████| 1741/1741 [03:17<00:00,  8.81it/s, Epoch=3, LR=9.76e-5, Loss=0.102, TotalLoss=0.537]
100%|██████████| 539/539 [00:15<00:00, 33.82it/s]


Epoch 4, Step 1741, train_loss: 0.10192, val_loss: 1.19098, pearson: 0.82825
validation elasped time:  15.9


100%|██████████| 1741/1741 [03:17<00:00,  8.80it/s, Epoch=4, LR=0, Loss=0.0413, TotalLoss=0.454]
100%|██████████| 539/539 [00:16<00:00, 33.45it/s]


Epoch 5, Step 1741, train_loss: 0.04131, val_loss: 1.19067, pearson: 0.82529
validation elasped time:  16.1
total elasped time:  1.08e+03
[0.8270490402074299, 0.8156091596026211, 0.8336366645518293]
Mean: 0.8254316214539602
*** FOLD 4 / 4***


Some weights of the model checkpoint at anferico/bert-for-patents were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 1678/1678 [03:10<00:00,  8.79it/s, Epoch=0, LR=0.000913, Loss=1.2, TotalLoss=0.962]
100%|█████████

Epoch 1, Step 1678, train_loss: 1.19902, val_loss: 1.30165, pearson: 0.75854
Model Inproved: 0 ----> 0.7585390510313522
validation elasped time:  21.7


100%|██████████| 1678/1678 [03:10<00:00,  8.83it/s, Epoch=1, LR=0.000665, Loss=0.708, TotalLoss=0.795]
100%|██████████| 602/602 [00:17<00:00, 34.58it/s]


Epoch 2, Step 1678, train_loss: 0.70772, val_loss: 1.25291, pearson: 0.79298
Model Inproved: 0.7585390510313522 ----> 0.7929836125241606
validation elasped time:  21.5


100%|██████████| 1678/1678 [03:10<00:00,  8.83it/s, Epoch=2, LR=0.000353, Loss=0.462, TotalLoss=0.666]
100%|██████████| 602/602 [00:17<00:00, 34.58it/s]


Epoch 3, Step 1678, train_loss: 0.46177, val_loss: 1.23442, pearson: 0.79621
Model Inproved: 0.7929836125241606 ----> 0.7962142730579702
validation elasped time:  21.4


100%|██████████| 1678/1678 [03:10<00:00,  8.82it/s, Epoch=3, LR=9.77e-5, Loss=0.482, TotalLoss=0.557]
100%|██████████| 602/602 [00:17<00:00, 34.19it/s]


Epoch 4, Step 1678, train_loss: 0.48199, val_loss: 1.21801, pearson: 0.79521
validation elasped time:  17.6


100%|██████████| 1678/1678 [03:09<00:00,  8.83it/s, Epoch=4, LR=0, Loss=0.0306, TotalLoss=0.472]
100%|██████████| 602/602 [00:17<00:00, 34.62it/s]

Epoch 5, Step 1678, train_loss: 0.03063, val_loss: 1.21668, pearson: 0.79262
validation elasped time:  17.4
total elasped time:  1.05e+03
[0.8270490402074299, 0.8156091596026211, 0.8336366645518293, 0.7962142730579702]
Mean: 0.8181272843549627





In [None]:
# Disabled scratch check: wrapped in a string literal, so none of it executes.
"""for batch_idx, data in enumerate(valid_loader):
  print(batch_idx)
"""

'for batch_idx, data in enumerate(valid_loader):\n  print(batch_idx)\n'

In [None]:
# Per-fold best Pearson scores plus the correlation over all out-of-fold predictions.
per_fold_scores = {f'fold{i}': j for i, j in enumerate(val_scores)}
oof_score = np.corrcoef(oof_df['preds'], oof_df['score'])[0][1]
scores = pd.Series({**per_fold_scores, 'oof': oof_score})
print(scores)

fold0    0.827049
fold1    0.815609
fold2    0.833637
fold3    0.796214
oof      0.817393
dtype: float64


In [None]:
# Write the score summary and the out-of-fold predictions to the experiment folder.
for frame, file_name in ((scores, 'scores.csv'), (oof_df, 'oof_df.csv')):
    frame.to_csv(f'{CFG.out_path}/{file_name}')