In [None]:
# Print the NVIDIA driver/GPU status of the current runtime.
!nvidia-smi

Wed Jun 15 05:41:10 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    24W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Install pinned versions of the Hugging Face libraries used below.
# NOTE(review): `%pip` targets the kernel's environment more reliably than `!pip` — consider switching.
!pip install transformers==4.19.1
!pip install sentencepiece==0.1.96

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==4.19.1
  Downloading transformers-4.19.1-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 15.9 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 6.5 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 51.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 64.3 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstal

In [None]:
# ----------------------------------------------
# Load Libraries
# ----------------------------------------------
import pathlib
from pathlib import Path
import sys
import os
import math
import random
import time

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn import functional as F

from transformers import AdamW
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import BertForSequenceClassification, BertConfig, BertModel
from transformers import get_cosine_schedule_with_warmup

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import train_test_split

from tqdm import tqdm

import gc
gc.enable()

# Config

In [None]:
class CFG:
    """Static experiment configuration, read everywhere as ``CFG.<name>``."""

    # --- run switches -------------------------------------------------
    debug = False        # shortened run: one epoch, training loop stops early
    fold1_only = False   # stop after the first fold
    debug_size = 100     # batch index at which a debug epoch is cut short

    # --- locations ----------------------------------------------------
    exp_id = 'exp123'
    input_path = 'input/'
    cpc_path = 'input/'
    model_path = 'microsoft/cocolm-large'
    out_base = 'output'
    out_path = f'{out_base}/{exp_id}'
    model_save_path = f'{out_path}/models'

    # --- data / model -------------------------------------------------
    seed = 42
    max_len = 92
    num_classes = 5
    num_fold = 4

    # --- optimisation -------------------------------------------------
    learning_rate = 2e-5
    weight_decay = 0.01
    lr_decay = 0.98
    train_batch_size = 16
    valid_batch_size = 16
    gradient_accumulation_step = 1
    log_interval = 1822  # currently unused
    epochs = 1 if debug else 5

    # Uploading is switched off for any partial run (debug or single fold).
    upload_dataset = not (debug or fold1_only)

    device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Create the experiment output directory and its model sub-directory
# (`-p` makes this a no-op when they already exist).
!mkdir -p {CFG.out_path}
!mkdir -p {CFG.model_save_path}

# Preprocess

In [None]:
# ----------------------------------------------
# Set SEED
# ----------------------------------------------
# seed
SEED = CFG.seed
def set_seed(SEED):
    """Seed every random number generator this notebook relies on.

    Covers Python's ``random``, NumPy, the ``PYTHONHASHSEED`` environment
    variable and PyTorch (CPU and CUDA), and asks cuDNN for deterministic
    kernels.

    Args:
        SEED: integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(SEED)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(SEED)

    torch.backends.cudnn.deterministic = True
    
set_seed(SEED)

In [None]:
# https://www.kaggle.com/code/gauravbrills/folds-dump-the-two-paths-fix
# Lookup used to map each row's `context` code to its text.
# NOTE(review): torch.load unpickles the file — only load it from a trusted source.
cpc_texts = torch.load(CFG.cpc_path+"cpc_texts_fixed.pth")

# Training table, with the mapped text attached as `context_text`.
train_df = pd.read_csv(f"{CFG.input_path}train.csv")
train_df = train_df.assign(context_text=train_df['context'].map(cpc_texts))
display(train_df.head())

Unnamed: 0,id,anchor,target,context,score,context_text
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...


In [None]:
!pip install -q iterative-stratification
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

dfx = pd.get_dummies(train_df, columns=["score"]).groupby(["anchor"], as_index=False).sum()
cols = [c for c in dfx.columns if c.startswith("score_") or c == "anchor"]
dfx = dfx[cols]

mskf = MultilabelStratifiedKFold(n_splits=CFG.num_fold, shuffle=True, random_state=42)
labels = [c for c in dfx.columns if c != "anchor"]
dfx_labels = dfx[labels]
dfx["fold"] = -1

for fold, (trn_, val_) in enumerate(mskf.split(dfx, dfx_labels)):
    print(len(trn_), len(val_))
    dfx.loc[val_, "fold"] = fold

train_df = train_df.merge(dfx[["anchor", "fold"]], on="anchor", how="left")

550 183
549 184
550 183
550 183


In [None]:
train_df['fold'].value_counts().sort_index()

0    9379
1    8860
2    8612
3    9622
Name: fold, dtype: int64

In [None]:
print(train_df.shape)
train_df.head()

(36473, 7)


Unnamed: 0,id,anchor,target,context,score,context_text,fold
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,0
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,0
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,0
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,0
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,0


In [None]:
# Single-string view of each sample: anchor, target and context text joined by a literal '[SEP]'.
# TrainDataset below tokenizes the three fields separately and does not read this column;
# within this notebook only the disabled token-length check refers to it.
train_df['input'] = train_df['anchor'] + '[SEP]' + train_df['target'] + '[SEP]' + train_df['context_text']

In [None]:
# Append one indicator column per distinct `score` value; TrainDataset reads the columns
# keyed 0.0 / 0.25 / 0.5 / 0.75 / 1.0 as the classification target.
# NOTE(review): re-running this cell appends the indicator columns a second time.
train_df = pd.concat([train_df, pd.get_dummies(train_df['score'])], axis='columns')

In [None]:
train_df[[0.0,0.25,0.5,0.75,1.0]].values

array([[0, 0, 1, 0, 0],
       [0, 0, 0, 1, 0],
       [0, 1, 0, 0, 0],
       ...,
       [0, 0, 1, 0, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 1, 0, 0]], dtype=uint8)

In [None]:
train_df.head()

Unnamed: 0,id,anchor,target,context,score,context_text,fold,input,0.0,0.25,0.5,0.75,1.0
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,0,abatement[SEP]abatement of pollution[SEP]HUMAN...,0,0,1,0,0
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,0,abatement[SEP]act of abating[SEP]HUMAN NECESSI...,0,0,0,1,0
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,0,abatement[SEP]active catalyst[SEP]HUMAN NECESS...,0,1,0,0,0
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,0,abatement[SEP]eliminating process[SEP]HUMAN NE...,0,0,1,0,0
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,0,abatement[SEP]forest region[SEP]HUMAN NECESSIT...,1,0,0,0,0


# Tokenizer

In [None]:
# https://github.com/microsoft/COCO-LM/tree/main/huggingface
!git clone https://github.com/microsoft/COCO-LM.git 

sys.path.append('/content/COCO-LM/huggingface')

import torch
from cocolm.modeling_cocolm import COCOLMModel
from cocolm.configuration_cocolm import COCOLMConfig
from cocolm.tokenization_cocolm import COCOLMTokenizer

tokenizer = COCOLMTokenizer.from_pretrained(CFG.model_path)

Cloning into 'COCO-LM'...
remote: Enumerating objects: 1062, done.[K
remote: Counting objects: 100% (1062/1062), done.[K
remote: Compressing objects: 100% (872/872), done.[K
remote: Total 1062 (delta 213), reused 970 (delta 159), pack-reused 0[K
Receiving objects: 100% (1062/1062), 4.15 MiB | 11.70 MiB/s, done.
Resolving deltas: 100% (213/213), done.
Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


Downloading:   0%|          | 0.00/1.30M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/719k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/472 [00:00<?, ?B/s]

In [None]:
# Check the token-length distribution (disabled: the code is kept inside a string literal)
"""
token_len = train_df['input'].map(lambda x: tokenizer(x)['input_ids'].__len__())
display(token_len.describe())
token_len.hist(bins=100)
"""

"\ntoken_len = train_df['input'].map(lambda x: tokenizer(x)['input_ids'].__len__())\ndisplay(token_len.describe())\ntoken_len.hist(bins=100)\n"

# Dataset

In [None]:
def prepare_input(tokenizer, anchor, target, title, max_len=None):
    """Encode one (anchor, target, title) triple as fixed-length model inputs.

    The sequence layout is ``[CLS] anchor [SEP] target [SEP] title [SEP]``,
    truncated or right-padded to exactly ``max_len`` tokens.

    Args:
        tokenizer: object exposing ``encode(text, add_special_tokens=False)``,
            ``cls_token_id`` and ``sep_token_id``; ``pad_token_id`` is used
            for padding when it is available.
        anchor: anchor phrase.
        target: target phrase.
        title: context text.
        max_len: output length; defaults to ``CFG.max_len``.

    Returns:
        dict with ``input_ids`` and ``attention_mask``, both 1-D
        ``torch.long`` tensors of length ``max_len``.

    Note:
        Truncation is a plain cut on the right, so an over-long sample loses
        its trailing ``[SEP]``.
    """
    # https://www.kaggle.com/competitions/us-patent-phrase-to-phrase-matching/discussion/324330
    if max_len is None:
        max_len = CFG.max_len

    anchor_ids = tokenizer.encode(anchor, add_special_tokens=False)
    target_ids = tokenizer.encode(target, add_special_tokens=False)
    title_ids = tokenizer.encode(title, add_special_tokens=False)

    sep = [tokenizer.sep_token_id]
    token_id = [tokenizer.cls_token_id] + anchor_ids + sep + target_ids + sep + title_ids + sep
    token_id = token_id[:max_len]
    token_mask = [1] * len(token_id)

    # Pad with the tokenizer's own pad id (falling back to 0 when it has none)
    # and pad to exactly max_len, whatever its size.
    pad_id = getattr(tokenizer, 'pad_token_id', None)
    if pad_id is None:
        pad_id = 0
    num_pad = max_len - len(token_id)
    token_id = token_id + [pad_id] * num_pad
    token_mask = token_mask + [0] * num_pad

    inputs = {}
    inputs['input_ids'] = torch.tensor(token_id, dtype=torch.long)
    inputs['attention_mask'] = torch.tensor(token_mask, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    """Dataset yielding tokenized phrase triples with one-hot score labels.

    Expects ``df`` to carry the columns ``anchor``, ``target``,
    ``context_text`` and the five indicator columns keyed
    0.0 / 0.25 / 0.5 / 0.75 / 1.0. Tokenization is done per item through
    ``prepare_input`` using the module-level ``tokenizer``.
    """

    def __init__(self, df):
        self.anchor, self.target, self.title = (
            df[column].values for column in ('anchor', 'target', 'context_text')
        )
        self.label = df[[0.0,0.25,0.5,0.75,1.0]].values

    def __len__(self):
        return len(self.label)

    def __getitem__(self, item):
        sample = prepare_input(
            tokenizer,
            self.anchor[item],
            self.target[item],
            self.title[item],
        )
        sample['label'] = torch.tensor(self.label[item], dtype=torch.float32)
        return sample

# Train

In [None]:
# ----------------------------------------------
# Model
# ----------------------------------------------
class AttentionHead(nn.Module):
    """Additive attention pooling over dimension 1 of a feature tensor.

    Each position gets a scalar score ``V(tanh(W(x)))``; the scores are
    softmax-normalised along dimension 1 and used to average the features.

    Args:
        in_features: size of the last dimension of the incoming features.
        hidden_dim: width of the intermediate projection.
        num_targets: accepted for interface compatibility; not used.
    """

    def __init__(self, in_features, hidden_dim, num_targets):
        super().__init__()
        self.in_features = in_features
        self.middle_features = hidden_dim
        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)
        self.out_features = hidden_dim

    def forward(self, features, attention_mask=None):
        """Pool ``features`` into one vector per sample.

        Args:
            features: tensor pooled along dimension 1.
            attention_mask: optional tensor matching the first two dimensions
                of ``features``; positions equal to 0 are excluded from the
                pooling. When omitted every position takes part.

        Returns:
            The attention-weighted sum of ``features`` over dimension 1.
        """
        att = torch.tanh(self.W(features))
        score = self.V(att)
        if attention_mask is not None:
            # Give padded positions the lowest representable score so they get
            # zero weight. A finite value (not -inf) keeps a fully masked row
            # from producing NaN.
            padded = attention_mask.unsqueeze(-1) == 0
            score = score.masked_fill(padded, torch.finfo(score.dtype).min)
        attention_weights = torch.softmax(score, dim=1)
        context_vector = attention_weights * features
        context_vector = torch.sum(context_vector, dim=1)

        return context_vector

class PPPMModel(nn.Module):
    """COCO-LM encoder with attention pooling and a multi-sample-dropout classifier.

    The pooled representation goes through five dropout rates (0.1 to 0.5)
    into one shared linear layer; the five logit sets are averaged.
    """

    def __init__(self):
        super().__init__()
        self.config = COCOLMConfig.from_pretrained(CFG.model_path)
        self.pre_model = COCOLMModel.from_pretrained(CFG.model_path, config=self.config)
        self.head = AttentionHead(self.config.hidden_size, self.config.hidden_size,1)
        self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.2)
        self.dropout3 = nn.Dropout(0.3)
        self.dropout4 = nn.Dropout(0.4)
        self.dropout5 = nn.Dropout(0.5)
        self.regressor = nn.Linear(self.config.hidden_size, CFG.num_classes)
    
    def forward(self, inputs):
        """Return averaged class logits for a batch.

        Args:
            inputs: mapping of keyword arguments for the encoder; its
                ``attention_mask`` entry, when present, is also used to keep
                padded positions out of the pooling.
        """
        pre_out = self.pre_model(**inputs)
        last_hidden_states = pre_out[0]
        # Pool only over real tokens so padding cannot leak into the sentence vector.
        pooled = self.head(last_hidden_states, inputs.get('attention_mask'))
        pooled = self.dropout(pooled)
        logits1 = self.regressor(self.dropout1(pooled))
        logits2 = self.regressor(self.dropout2(pooled))
        logits3 = self.regressor(self.dropout3(pooled))
        logits4 = self.regressor(self.dropout4(pooled))
        logits5 = self.regressor(self.dropout5(pooled))
        logits = (logits1 + logits2 + logits3 + logits4 + logits5) / 5
        return logits

In [None]:
# ----------------------------------------------
# func: valid, predict
# ----------------------------------------------
def predict(model, dataloader):
    """Run inference over ``dataloader`` and return class probabilities.

    Args:
        model: module called with a dict of input tensors; returns class logits.
        dataloader: yields dict batches; a ``'label'`` entry, if present, is ignored.

    Returns:
        ``numpy`` array of shape ``(len(dataloader.dataset), CFG.num_classes)``
        holding the softmax of the logits, in loader order.
    """
    model.eval()
    total = len(dataloader.dataset)
    result = np.zeros((total, CFG.num_classes))
    to_probs = nn.Softmax(dim=1)
    offset = 0

    with torch.no_grad():
        for data in dataloader:
            inputs = {
                key: tensor.to(CFG.device)
                for key, tensor in data.items()
                if key != 'label'
            }
            probs = to_probs(model(inputs))
            stop = offset + probs.shape[0]
            result[offset:stop, :] = probs.to('cpu')
            offset = stop

    return result


def valid_func(model, dataloader):
    """Evaluate ``model`` on a labelled loader.

    Args:
        model: module called with a dict of input tensors; returns class logits.
        dataloader: yields dict batches with a ``'label'`` entry of class
            probabilities (one-hot rows in this notebook).

    Returns:
        Tuple ``(mean_loss, probabilities)``: the cross-entropy averaged over
        all samples, and a ``numpy`` array of shape
        ``(len(dataloader.dataset), CFG.num_classes)`` with the softmax of the logits.
    """
    model.eval()
    num_samples = len(dataloader.dataset)
    result = np.zeros((num_samples, CFG.num_classes))
    criterion = nn.CrossEntropyLoss(reduction='sum')
    idx = 0
    loss_sum = 0
    bar = tqdm(dataloader, total=len(dataloader))

    with torch.no_grad():
        for data in bar:
            inputs = {k: v.to(CFG.device) for k, v in data.items() if k != 'label'}
            label = data['label'].to(CFG.device)
            logits = model(inputs)

            # CrossEntropyLoss applies log-softmax itself, so it must see the
            # raw logits. Feeding it probabilities (as before) applies softmax
            # twice and makes val_loss incomparable with the training loss.
            loss_sum += criterion(logits, label).item()

            probs = torch.softmax(logits, dim=1)
            n = probs.shape[0]
            result[idx:idx + n, :] = probs.cpu().numpy()
            idx += n

    return loss_sum / num_samples, result

def label_to_score(label):
    """Collapse per-class rows into scalar scores.

    Each row of ``label`` (one-hot labels or class probabilities, five
    columns) is weighted by the score of its class and summed.
    """
    class_scores = [0,0.25,0.5,0.75,1.0]
    weighted = label * class_scores
    return weighted.sum(axis=1)

def metric_pearson(predictions, labels):
    """Pearson correlation between predicted and true scalar scores.

    Both arguments are per-class arrays and are first reduced to scores with
    ``label_to_score``.
    """
    correlation = np.corrcoef(label_to_score(predictions), label_to_score(labels))
    return correlation[0][1]

In [None]:
"""preds = predict(model, valid_loader)
label_to_score(valid_loader.dataset.label)
metric_pearson(preds, valid_loader.dataset.label)"""

'preds = predict(model, valid_loader)\nlabel_to_score(valid_loader.dataset.label)\nmetric_pearson(preds, valid_loader.dataset.label)'

In [None]:
# ----------------------------------------------
# func: train
# ----------------------------------------------
def train_fn(
    model,
    save_path,
    train_loader,
    val_loader,
    optimizer,
    scheduler=None,
    num_epochs=CFG.epochs
):
    """Train ``model``, validating after every epoch and checkpointing the best one.

    Args:
        model: module called with a dict of input tensors; returns class logits.
        save_path: file that receives ``model.state_dict()`` whenever the
            validation Pearson score improves.
        train_loader: yields dict batches with a ``'label'`` entry.
        val_loader: validation loader; ``val_loader.dataset.label`` supplies
            the reference labels for the Pearson metric.
        optimizer: optimizer stepped every ``CFG.gradient_accumulation_step``
            batches, and on the last batch of an epoch.
        scheduler: optional LR scheduler, stepped together with the optimizer.
        num_epochs: number of passes over ``train_loader``.

    Returns:
        Tuple ``(best_score, oof_preds)``: the best validation Pearson score
        and the validation probabilities from that epoch. ``oof_preds`` is
        ``None`` only when ``num_epochs`` is 0.
    """
    best_score = 0
    oof_preds = None
    running_loss = 0.0
    dataset_size = 0
    accum_steps = CFG.gradient_accumulation_step
    criterion = nn.CrossEntropyLoss()

    start = time.time()

    for epoch in range(num_epochs):
        model.train()
        # Start each epoch without gradients left over from an interrupted one.
        optimizer.zero_grad()
        num_batches = len(train_loader)
        batch_idx = -1
        last_loss = float('nan')
        bar = tqdm(train_loader, total=num_batches)
        for batch_idx, data in enumerate(bar):
            inputs = {k: v.to(CFG.device) for k, v in data.items() if k != 'label'}
            label = data['label'].to(CFG.device)
            batch_size = label.size(0)

            output = model(inputs)
            loss = criterion(output, label)
            # Scale only the gradient; the logged loss stays per-batch.
            (loss / accum_steps).backward()

            # Also step on the last batch so a trailing partial accumulation
            # is applied instead of being dropped.
            is_last_batch = (batch_idx + 1) == num_batches
            if (batch_idx + 1) % accum_steps == 0 or is_last_batch:
                optimizer.step()
                optimizer.zero_grad()
                if scheduler:
                    scheduler.step()

            last_loss = loss.item()
            running_loss += (last_loss * batch_size)
            dataset_size += batch_size
            total_loss = running_loss / dataset_size
            bar.set_postfix(Epoch=epoch, Loss=last_loss, TotalLoss=total_loss, LR=optimizer.param_groups[0]['lr'])

            # Debug runs stop after CFG.debug_size batches.
            if CFG.debug and batch_idx > 0 and batch_idx % CFG.debug_size == 0:
                break

        val_start = time.time()
        val_score, predictions = valid_func(model, val_loader)
        pearson = metric_pearson(predictions, val_loader.dataset.label)
        print(f"Epoch {epoch+1}, Step {batch_idx+1}, train_loss: {last_loss:0.5f}, val_loss: {val_score:0.5f}, pearson: {pearson:0.5f}")

        # Always keep the first epoch's result, and recover from a NaN best
        # score, so the caller never gets oof_preds=None after training.
        improved = oof_preds is None or np.isnan(best_score) or pearson > best_score
        if improved:
            print(f"Model Inproved: {best_score} ----> {pearson}")
            best_score = pearson
            oof_preds = predictions
            torch.save(model.state_dict(), save_path)
        print(f"validation elasped time: {time.time() - val_start: 0.3}")

    print(f"total elasped time: {time.time() - start: 0.3}")

    return best_score, oof_preds

# ----------------------------------------------
# create optimizer
# ----------------------------------------------
def create_optimizer(model, weight_decay=0.01):
    """Build an AdamW optimizer with one parameter group per parameter.

    Parameters whose name contains ``bias``, ``LayerNorm.bias`` or
    ``LayerNorm.weight`` get no weight decay; every other parameter gets
    ``weight_decay``. The learning rate is left at the optimizer's default.

    Args:
        model: module whose ``named_parameters()`` are optimised.
        weight_decay: decay applied to the parameters not exempted above.

    Returns:
        The configured ``AdamW`` instance.
    """
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optim_params = []
    for name_, params_ in model.named_parameters():
        # Substring match: parameter names are fully qualified
        # (e.g. "encoder.layer.0.output.LayerNorm.weight"), so an exact
        # membership test against `no_decay` would never succeed.
        exempt = any(nd in name_ for nd in no_decay)
        optim_params.append({'params': params_,
                             'weight_decay': 0 if exempt else weight_decay,
                             })

    return AdamW(optim_params)


# https://www.ai-shift.co.jp/techblog/2145
def create_optimizer_grouped_parameters(model):
    """Build AdamW with layer-wise learning-rate decay for the backbone.

    * Every parameter outside ``model.pre_model`` (the attention-pooling head
      and the classifier) goes into one group with ``lr=1e-3`` and no weight
      decay.
    * The backbone's embeddings and encoder layers are walked from the top
      layer down; the learning rate starts at ``CFG.learning_rate`` and is
      multiplied by ``CFG.lr_decay`` at each step. Within a layer, ``bias``
      and ``LayerNorm.weight`` parameters are exempt from weight decay.

    Args:
        model: module exposing ``pre_model.embeddings`` and
            ``pre_model.encoder.layer``.

    Returns:
        The configured ``AdamW`` instance.
    """
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            # Selecting by "not part of the backbone" instead of by a name
            # list also covers the `head.*` parameters, which previously
            # matched no group and therefore were never updated.
            "params": [p for n, p in model.named_parameters()
                       if not n.startswith('pre_model.')],
            "weight_decay": 0.0,
            "lr": 1e-3,
        },
    ]
    # NOTE(review): backbone parameters that live outside `embeddings` and
    # `encoder.layer` (if the architecture has any) are still not optimised — confirm.
    backbone = getattr(model, 'pre_model')
    layers = [backbone.embeddings] + list(backbone.encoder.layer)
    layers.reverse()
    lr = CFG.learning_rate
    for layer in layers:
        lr *= CFG.lr_decay
        optimizer_grouped_parameters += [
            {
                "params": [p for n, p in layer.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": CFG.weight_decay,
                "lr": lr,
            },
            {
                "params": [p for n, p in layer.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
                "lr": lr,
            },
        ]
    return AdamW(optimizer_grouped_parameters)


In [None]:
# ----------------------------------------------
# Main Loop
# ----------------------------------------------
val_scores = []
oof_frames = []  # one validation frame per fold, concatenated once after the loop

for fold in range(CFG.num_fold): 
    print(f"*** FOLD {fold+1} / {CFG.num_fold}***")

    save_path = f"{CFG.model_save_path}/model_{fold+1}.pth"

    train_data = train_df[train_df['fold'] != fold]
    # Explicit copy: the out-of-fold predictions are attached to this frame,
    # and writing to a slice of train_df raises SettingWithCopyWarning.
    valid_data = train_df[train_df['fold'] == fold].copy()
    train_set = TrainDataset(train_data)
    valid_set = TrainDataset(valid_data)

    train_loader = DataLoader(train_set,
                            batch_size=CFG.train_batch_size,
                            shuffle=True,
                            drop_last=True,
                            num_workers=2,
                            pin_memory=True)
    valid_loader = DataLoader(valid_set,
                            batch_size=CFG.valid_batch_size,
                            shuffle=False,
                            drop_last=False,
                            num_workers=2,
                            pin_memory=True)

    model = PPPMModel().to(CFG.device)
    optimizer = create_optimizer_grouped_parameters(model)
    #optimizer = AdamW(model.parameters(), lr=CFG.learning_rate)

    # The scheduler advances once per optimizer step, so size its horizon in
    # optimizer steps rather than batches (identical when accumulation is 1).
    steps_per_epoch = math.ceil(len(train_loader) / CFG.gradient_accumulation_step)
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_training_steps=CFG.epochs*steps_per_epoch,
        num_warmup_steps=100
    )

    val_score, val_preds = train_fn(model, save_path, train_loader, valid_loader, optimizer, scheduler=scheduler)
    val_scores.append(val_score)
    valid_data['preds'] = label_to_score(val_preds)
    oof_frames.append(valid_data)

    # Release the model and the optimizer state before the next fold is built.
    del model, optimizer, scheduler
    gc.collect()
    torch.cuda.empty_cache()

    print(val_scores)
    print("Mean:", np.array(val_scores).mean())
    if CFG.fold1_only and fold == 0:
        break

# Out-of-fold predictions for every fold that ran.
oof_df = pd.concat(oof_frames) if oof_frames else pd.DataFrame()

*** FOLD 1 / 4***


Downloading:   0%|          | 0.00/830M [00:00<?, ?B/s]

Some weights of the model checkpoint at https://huggingface.co/microsoft/cocolm-large/resolve/main/pytorch_model.bin were not used when initializing COCOLMModel: ['scl_head.LayerNorm.weight', 'scl_head.dense.bias', 'scl_head.LayerNorm.bias', 'binary_head.out_proj.bias', 'clm_head.decoder.weight', 'binary_head.out_proj.weight', 'scl_head.dense.weight', 'clm_head.bias', 'clm_head.decoder.bias']
- This IS expected if you are initializing COCOLMModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing COCOLMModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 1693/1693 [08:32<00:00,  3.31it/s, Epoch=0, LR=0.000913, Loss=0.413, TotalLoss=0.938]
100%|██████████| 587/587 [00:53<00:00, 10.89

Epoch 1, Step 1693, train_loss: 0.41336, val_loss: 1.26534, pearson: 0.81399
Model Inproved: 0 ----> 0.8139897061001019
validation elasped time:  59.9


100%|██████████| 1693/1693 [08:32<00:00,  3.30it/s, Epoch=1, LR=0.000665, Loss=0.466, TotalLoss=0.798]
100%|██████████| 587/587 [00:53<00:00, 10.93it/s]


Epoch 2, Step 1693, train_loss: 0.46631, val_loss: 1.23581, pearson: 0.82838
Model Inproved: 0.8139897061001019 ----> 0.8283832896571058
validation elasped time:  59.7


100%|██████████| 1693/1693 [08:32<00:00,  3.30it/s, Epoch=2, LR=0.000353, Loss=0.183, TotalLoss=0.693]
100%|██████████| 587/587 [00:53<00:00, 10.95it/s]


Epoch 3, Step 1693, train_loss: 0.18262, val_loss: 1.21720, pearson: 0.82051
validation elasped time:  53.6


100%|██████████| 1693/1693 [08:31<00:00,  3.31it/s, Epoch=3, LR=9.77e-5, Loss=0.213, TotalLoss=0.602]
100%|██████████| 587/587 [00:53<00:00, 10.99it/s]


Epoch 4, Step 1693, train_loss: 0.21332, val_loss: 1.21122, pearson: 0.81975
validation elasped time:  53.4


100%|██████████| 1693/1693 [08:30<00:00,  3.31it/s, Epoch=4, LR=0, Loss=0.218, TotalLoss=0.528]
100%|██████████| 587/587 [00:53<00:00, 10.92it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Epoch 5, Step 1693, train_loss: 0.21751, val_loss: 1.20592, pearson: 0.81732
validation elasped time:  53.8
total elasped time:  2.84e+03
[0.8283832896571058]
Mean: 0.8283832896571058
*** FOLD 2 / 4***


Some weights of the model checkpoint at https://huggingface.co/microsoft/cocolm-large/resolve/main/pytorch_model.bin were not used when initializing COCOLMModel: ['scl_head.LayerNorm.weight', 'scl_head.dense.bias', 'scl_head.LayerNorm.bias', 'binary_head.out_proj.bias', 'clm_head.decoder.weight', 'binary_head.out_proj.weight', 'scl_head.dense.weight', 'clm_head.bias', 'clm_head.decoder.bias']
- This IS expected if you are initializing COCOLMModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing COCOLMModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 1725/1725 [08:42<00:00,  3.30it/s, Epoch=0, LR=0.000913, Loss=1.23, TotalLoss=0.95]
100%|██████████| 554/554 [00:50<00:00, 10.92it

Epoch 1, Step 1725, train_loss: 1.23208, val_loss: 1.26339, pearson: 0.80005
Model Inproved: 0 ----> 0.8000510688485463
validation elasped time:  57.0


100%|██████████| 1725/1725 [08:41<00:00,  3.31it/s, Epoch=1, LR=0.000665, Loss=0.408, TotalLoss=0.808]
100%|██████████| 554/554 [00:50<00:00, 10.96it/s]


Epoch 2, Step 1725, train_loss: 0.40777, val_loss: 1.24591, pearson: 0.80233
Model Inproved: 0.8000510688485463 ----> 0.8023274264022165
validation elasped time:  56.7


100%|██████████| 1725/1725 [08:41<00:00,  3.31it/s, Epoch=2, LR=0.000353, Loss=0.416, TotalLoss=0.701]
100%|██████████| 554/554 [00:50<00:00, 10.93it/s]


Epoch 3, Step 1725, train_loss: 0.41554, val_loss: 1.22149, pearson: 0.81033
Model Inproved: 0.8023274264022165 ----> 0.810330542968867
validation elasped time:  56.7


100%|██████████| 1725/1725 [08:42<00:00,  3.30it/s, Epoch=3, LR=9.77e-5, Loss=0.183, TotalLoss=0.608]
100%|██████████| 554/554 [00:50<00:00, 10.91it/s]


Epoch 4, Step 1725, train_loss: 0.18252, val_loss: 1.21382, pearson: 0.80900
validation elasped time:  50.8


100%|██████████| 1725/1725 [08:42<00:00,  3.30it/s, Epoch=4, LR=0, Loss=0.135, TotalLoss=0.534]
100%|██████████| 554/554 [00:51<00:00, 10.83it/s]


Epoch 5, Step 1725, train_loss: 0.13478, val_loss: 1.21095, pearson: 0.80282
validation elasped time:  51.2
total elasped time:  2.88e+03
[0.8283832896571058, 0.810330542968867]
Mean: 0.8193569163129863
*** FOLD 3 / 4***


Some weights of the model checkpoint at https://huggingface.co/microsoft/cocolm-large/resolve/main/pytorch_model.bin were not used when initializing COCOLMModel: ['scl_head.LayerNorm.weight', 'scl_head.dense.bias', 'scl_head.LayerNorm.bias', 'binary_head.out_proj.bias', 'clm_head.decoder.weight', 'binary_head.out_proj.weight', 'scl_head.dense.weight', 'clm_head.bias', 'clm_head.decoder.bias']
- This IS expected if you are initializing COCOLMModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing COCOLMModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 1741/1741 [08:46<00:00,  3.31it/s, Epoch=0, LR=0.000913, Loss=1.04, TotalLoss=0.932]
100%|██████████| 539/539 [00:49<00:00, 10.92i

Epoch 1, Step 1741, train_loss: 1.04009, val_loss: 1.29446, pearson: 0.78680
Model Inproved: 0 ----> 0.7867997980879888
validation elasped time:  55.8


100%|██████████| 1741/1741 [08:47<00:00,  3.30it/s, Epoch=1, LR=0.000665, Loss=0.593, TotalLoss=0.795]
100%|██████████| 539/539 [00:49<00:00, 10.92it/s]


Epoch 2, Step 1741, train_loss: 0.59341, val_loss: 1.21701, pearson: 0.82878
Model Inproved: 0.7867997980879888 ----> 0.828775362210246
validation elasped time:  55.5


100%|██████████| 1741/1741 [08:50<00:00,  3.28it/s, Epoch=2, LR=0.000352, Loss=0.798, TotalLoss=0.69]
100%|██████████| 539/539 [00:49<00:00, 10.85it/s]


Epoch 3, Step 1741, train_loss: 0.79821, val_loss: 1.20960, pearson: 0.82548
validation elasped time:  49.7


100%|██████████| 1741/1741 [08:49<00:00,  3.29it/s, Epoch=3, LR=9.76e-5, Loss=0.274, TotalLoss=0.599]
100%|██████████| 539/539 [00:49<00:00, 10.95it/s]


Epoch 4, Step 1741, train_loss: 0.27438, val_loss: 1.19956, pearson: 0.82412
validation elasped time:  49.2


100%|██████████| 1741/1741 [08:49<00:00,  3.29it/s, Epoch=4, LR=0, Loss=0.129, TotalLoss=0.525]
100%|██████████| 539/539 [00:49<00:00, 10.86it/s]


Epoch 5, Step 1741, train_loss: 0.12887, val_loss: 1.19583, pearson: 0.82273
validation elasped time:  49.6
total elasped time:  2.9e+03
[0.8283832896571058, 0.810330542968867, 0.828775362210246]
Mean: 0.8224963982787395
*** FOLD 4 / 4***


Some weights of the model checkpoint at https://huggingface.co/microsoft/cocolm-large/resolve/main/pytorch_model.bin were not used when initializing COCOLMModel: ['scl_head.LayerNorm.weight', 'scl_head.dense.bias', 'scl_head.LayerNorm.bias', 'binary_head.out_proj.bias', 'clm_head.decoder.weight', 'binary_head.out_proj.weight', 'scl_head.dense.weight', 'clm_head.bias', 'clm_head.decoder.bias']
- This IS expected if you are initializing COCOLMModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing COCOLMModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 1678/1678 [08:30<00:00,  3.28it/s, Epoch=0, LR=0.000913, Loss=0.684, TotalLoss=0.917]
100%|██████████| 602/602 [00:55<00:00, 10.86

Epoch 1, Step 1678, train_loss: 0.68435, val_loss: 1.27497, pearson: 0.76787
Model Inproved: 0 ----> 0.7678687356444118
validation elasped time:  62.3


100%|██████████| 1678/1678 [08:31<00:00,  3.28it/s, Epoch=1, LR=0.000665, Loss=0.764, TotalLoss=0.777]
100%|██████████| 602/602 [00:55<00:00, 10.89it/s]


Epoch 2, Step 1678, train_loss: 0.76429, val_loss: 1.25001, pearson: 0.78186
Model Inproved: 0.7678687356444118 ----> 0.7818588342821159
validation elasped time:  61.6


100%|██████████| 1678/1678 [08:32<00:00,  3.28it/s, Epoch=2, LR=0.000353, Loss=0.283, TotalLoss=0.672]
100%|██████████| 602/602 [00:55<00:00, 10.85it/s]


Epoch 3, Step 1678, train_loss: 0.28312, val_loss: 1.22865, pearson: 0.79954
Model Inproved: 0.7818588342821159 ----> 0.7995442210045988
validation elasped time:  61.7


100%|██████████| 1678/1678 [08:32<00:00,  3.28it/s, Epoch=3, LR=9.77e-5, Loss=0.583, TotalLoss=0.582]
100%|██████████| 602/602 [00:55<00:00, 10.88it/s]


Epoch 4, Step 1678, train_loss: 0.58288, val_loss: 1.22425, pearson: 0.79468
validation elasped time:  55.4


100%|██████████| 1678/1678 [08:30<00:00,  3.28it/s, Epoch=4, LR=0, Loss=0.523, TotalLoss=0.51]
100%|██████████| 602/602 [00:55<00:00, 10.84it/s]


Epoch 5, Step 1678, train_loss: 0.52343, val_loss: 1.21974, pearson: 0.79195
validation elasped time:  55.5
total elasped time:  2.85e+03
[0.8283832896571058, 0.810330542968867, 0.828775362210246, 0.7995442210045988]
Mean: 0.8167583539602044


In [None]:
"""for batch_idx, data in enumerate(valid_loader):
  print(batch_idx)
"""

'for batch_idx, data in enumerate(valid_loader):\n  print(batch_idx)\n'

In [None]:
# Per-fold best Pearson scores plus the Pearson score over all out-of-fold predictions.
scores = pd.Series(
    dict(
        [(f'fold{i}', j) for i, j in enumerate(val_scores)]
        + [('oof', np.corrcoef(oof_df['preds'], oof_df['score'])[0][1])]
    )
)
print(scores)

fold0    0.828383
fold1    0.810331
fold2    0.828775
fold3    0.799544
oof      0.815789
dtype: float64


In [None]:
# Write the score summary and the out-of-fold predictions to the experiment's output directory.
scores.to_csv(f'{CFG.out_path}/scores.csv')
oof_df.to_csv(f'{CFG.out_path}/oof_df.csv')