In [1]:
!nvidia-smi

Thu May 26 10:05:59 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    25W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install -q pytorch-lightning wandb sentencepiece
!pip install git+https://github.com/huggingface/transformers.git
!pip install -q --upgrade --force-reinstall --no-deps kaggle

[K     |████████████████████████████████| 584 kB 14.5 MB/s 
[K     |████████████████████████████████| 1.8 MB 74.5 MB/s 
[K     |████████████████████████████████| 1.2 MB 76.3 MB/s 
[K     |████████████████████████████████| 140 kB 90.0 MB/s 
[K     |████████████████████████████████| 596 kB 80.1 MB/s 
[K     |████████████████████████████████| 409 kB 85.7 MB/s 
[K     |████████████████████████████████| 1.1 MB 70.4 MB/s 
[K     |████████████████████████████████| 181 kB 82.3 MB/s 
[K     |████████████████████████████████| 145 kB 87.3 MB/s 
[K     |████████████████████████████████| 63 kB 1.8 MB/s 
[K     |████████████████████████████████| 271 kB 101.1 MB/s 
[K     |████████████████████████████████| 94 kB 4.1 MB/s 
[K     |████████████████████████████████| 144 kB 95.0 MB/s 
[?25h  Building wheel for pathtools (setup.py) ... [?25l[?25hdone
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/huggingf

In [3]:
# Install the Kaggle API credentials from Drive.
# `-p` keeps the cell idempotent: re-running it no longer errors when the directory already exists.
!mkdir -p /root/.kaggle
!cp /content/drive/MyDrive/Colab/kaggle/kaggle.json /root/.kaggle/kaggle.json
# Restrict the credentials file to the current user.
!chmod 600 /root/.kaggle/kaggle.json

In [4]:
import os
import gc
import sys
import json
import itertools
from tqdm.auto import tqdm
import logging
import datetime
import ast
import numpy as np
import pandas as pd
import math
import re
from sklearn import model_selection as sms
from sklearn.preprocessing import LabelEncoder
import scipy as sp

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

import pytorch_lightning as pl
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, LearningRateMonitor
from pytorch_lightning.loggers import WandbLogger

from transformers import AutoConfig, AutoModel, AutoTokenizer, get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup
from transformers.models.deberta_v2.tokenization_deberta_v2_fast import DebertaV2TokenizerFast

import wandb

%env TOKENIZERS_PARALLELISM=true

env: TOKENIZERS_PARALLELISM=true


In [5]:
class Config:
    """Experiment configuration, used as a plain namespace of class attributes.

    `setup()` later attaches runtime paths (input/output dirs, `on_colab`, ...) to this
    class, and `main()` attaches a `logger`.
    """
    # ==============================
    # Globals #
    # ==============================
    competition_name = "us-patent-phrase-to-phrase-matching"
    group = "BERT-for-Patents"
    exp_id = "019"
    debug = False
    inference_only = True
    upload_from_colab = True
    colab_dir = "/content/drive/MyDrive/Colab/kaggle/us-patent-phrase-to-phrase-matching"
    kaggle_json_path = "/root/.kaggle/kaggle.json"
    kaggle_dataset_path = None
    gpus = 1
    seed = 2434
    max_epochs = 5
    accumulate_grad_batches = 2
    precision = 16
    num_fold = 5
    train_fold = [0, 1, 2, 3, 4]  # folds to run
    pretrained = True
    mlm_pretrained = False
    mlm_model_dir = "/content/drive/MyDrive/Colab/kaggle/us-patent-phrase-to-phrase-matching/output/mlm_exp001/model"
    gradient_clip_val = 1
    # ==============================
    # Dataloader #
    # ==============================
    train_batch_size = 8
    valid_batch_size = 32
    test_batch_size = 32
    num_workers = 8
    # ==============================
    # Split #
    # ==============================
    split_name = "StratifiedGroupKFold"
    split_params = {
        "n_splits": num_fold,
        "shuffle": True,
        "random_state": seed,
    }
    # ==============================
    # Model #
    # ==============================
    model_name = "anferico/bert-for-patents"
    max_length = 117
    hidden_size = 1024
    use_backbone_dropout = True
    dropout = 0.2
    initializer_range = 0.02
    # ==============================
    # Loss #
    # ==============================
    loss_name = "MSELoss"
    loss_params = {
        "reduction": "mean",
    }
    # ==============================
    # Optimizer #
    # ==============================
    optimizer_name = "AdamW"
    optimizer_params = {
        "lr": 5e-6,
        "eps": 1e-6,
        "betas": (0.9, 0.999)
    }
    encoder_lr = 5e-6
    decoder_lr = 5e-6
    weight_decay = 0.01
    # ==============================
    # Scheduler #
    # ==============================
    scheduler_name = "cosine-warmup"
    scheduler_warmup_ratio = 0.1
    scheduler_params = {}
    scheduler_interval = "step"
    scheduler_cycle = "one-cycle" # epoch or one-cycle
    # ==============================
    # Callbacks #
    # ==============================
    checkpoint_params = {
        "monitor": "val/pearson_corr",
        "save_top_k": 1,
        "save_weights_only": True,
        "mode": "max",
        "verbose": True,
    }
    early_stopping = False
    early_stopping_params = {
        "monitor": "val/pearson_corr",
        "min_delta": 0.0,
        "patience": 8,
        "verbose": False,
        # Pearson correlation is maximised (same direction as checkpoint_params);
        # "min" would stop training while the metric is still improving.
        "mode": "max",
    }

In [7]:
# ====================================
# Setup #
# ====================================
class Logger:
    """Timestamped logger writing to the console and to `<path>/Experiment.log`.

    ref) https://github.com/ghmagazine/kagglebook/blob/master/ch04-model-interface/code/util.py
    """
    def __init__(self, path):
        """Attach console and file handlers to the logger named `path`.

        `logging.getLogger` returns the same object for the same name, so handlers are only
        created when none are attached yet. Creating them unconditionally would open a new
        file handle on every construction (e.g. on cell re-runs) that is never used or closed.
        """
        self.general_logger = logging.getLogger(path)
        if len(self.general_logger.handlers) == 0:
            stream_handler = logging.StreamHandler()
            file_general_handler = logging.FileHandler(os.path.join(path, 'Experiment.log'))
            self.general_logger.addHandler(stream_handler)
            self.general_logger.addHandler(file_general_handler)
            self.general_logger.setLevel(logging.INFO)

    def info(self, message):
        """Log `message` at INFO level, prefixed with the current time."""
        self.general_logger.info('[{}] - {}'.format(self.now_string(), message))

    @staticmethod
    def now_string():
        """Return the current local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
        return str(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))


def setup(cfg):
    cfg.on_colab = "google.colab" in sys.modules
    if cfg.on_colab:
        # kaggle api
        f = open(Config.kaggle_json_path, 'r')
        json_data = json.load(f)
        os.environ["KAGGLE_USERNAME"] = json_data["username"]
        # set input/output dir
        cfg.input_dir = os.path.join(cfg.colab_dir, "input")
        cfg.train_csv = os.path.join(cfg.input_dir, "train.csv")
        cfg.test_csv = os.path.join(cfg.input_dir, "test.csv")
        cfg.cpc_data = os.path.join(cfg.input_dir, "cpc-data")
        cfg.cpc_codes_csv = os.path.join(cfg.input_dir, "cpc-codes/cpc_codes.csv")
        cfg.sample_submission = os.path.join(cfg.input_dir, "sample_submission.csv")
        cfg.output_dir = os.path.join(cfg.colab_dir, "output")
        cfg.exp_output_dir = os.path.join(cfg.output_dir, f"exp{cfg.exp_id}")
        cfg.model_dir = os.path.join(cfg.exp_output_dir, "model")

        for d in [cfg.output_dir, cfg.exp_output_dir, cfg.model_dir]:
            os.makedirs(d, exist_ok=True)
            
        # wandb
        wandb.login()
    else:
        cfg.input_dir = f"../input/{cfg.competition_name}"
        cfg.train_csv = os.path.join(cfg.input_dir, "train.csv")
        cfg.test_csv = os.path.join(cfg.input_dir, "test.csv")
        cfg.cpc_data = "../input/cpc-data"
        cfg.cpc_codes_csv = "../input/cpc-codes/cpc_codes.csv"
        cfg.sample_submission = os.path.join(cfg.input_dir, "sample_submission.csv")
        cfg.submission = "./"
        cfg.exp_output_dir = f"exp{cfg.exp_id}"
        cfg.model_dir = os.path.join(cfg.exp_output_dir, "model")

        if cfg.kaggle_dataset_path is not None:
            cfg.model_dir = os.path.join(cfg.kaggle_dataset_path, "model")

        for d in [cfg.exp_output_dir, cfg.model_dir]:
            os.makedirs(d, exist_ok=True)

    return cfg


# ====================================
# Preprocess #
# ====================================
def get_tokenizer(cfg):
    """Return the tokenizer for `cfg.model_name`, using a local copy when one exists.

    The local copy lives in `<base>/pretrain_tokenizer`, where `<base>` is
    `cfg.kaggle_dataset_path` if set, else `cfg.exp_output_dir`. When that directory is
    missing the tokenizer is loaded by model name and saved there. RoBERTa models are
    loaded with `trim_offsets=False`; DeBERTa v2/v3 models are reloaded from the local
    copy through `DebertaV2TokenizerFast`.
    """
    base_dir = cfg.exp_output_dir if cfg.kaggle_dataset_path is None else cfg.kaggle_dataset_path
    pretrained_dir = os.path.join(base_dir, "pretrain_tokenizer")
    model_name = cfg.model_name
    is_roberta = "roberta" in model_name

    if os.path.isdir(pretrained_dir):
        # A saved tokenizer exists: reload it from disk.
        if "deberta-v2" in model_name or "deberta-v3" in model_name:
            return DebertaV2TokenizerFast.from_pretrained(pretrained_dir)
        if is_roberta:
            return AutoTokenizer.from_pretrained(pretrained_dir, trim_offsets=False)
        return AutoTokenizer.from_pretrained(pretrained_dir)

    # First use: load by model name and save a copy for later runs.
    if is_roberta:
        tokenizer = AutoTokenizer.from_pretrained(model_name, trim_offsets=False)
    else:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.save_pretrained(pretrained_dir)

    return tokenizer


def get_backbone_config(cfg):
    """Return the backbone config, cached on disk as `model_config.pth`.

    The cache directory is `cfg.exp_output_dir` on Colab. Off Colab it is
    `cfg.kaggle_dataset_path` when that is set, and `cfg.exp_output_dir` otherwise
    (previously a `None` dataset path was passed to `os.path.join` and raised TypeError).

    When the cache file is missing, the config is loaded by model name with
    `output_hidden_states=True` and saved to the cache directory.
    """
    if cfg.on_colab or cfg.kaggle_dataset_path is None:
        cache_dir = cfg.exp_output_dir
    else:
        cache_dir = cfg.kaggle_dataset_path
    cfg_path = os.path.join(cache_dir, "model_config.pth")

    # Check for the exact file that will be loaded, rather than any file whose stem is
    # "model_config" (e.g. a stray model_config.json would previously trigger a failing load).
    if os.path.isfile(cfg_path):
        # NOTE(review): torch.load unpickles the file; only point this at trusted artifacts.
        return torch.load(cfg_path)

    model_config = AutoConfig.from_pretrained(cfg.model_name, output_hidden_states=True)
    torch.save(model_config, cfg_path)

    return model_config


def get_cpc_texts(cfg):
    """Build a mapping from CPC context code (e.g. "A01") to "<section title>. <class title>".

    Context codes are collected from the file names in `<cpc_data>/CPCSchemeXML202105`
    (every `[A-Z]\\d+` match). Titles are read from the per-section files in
    `<cpc_data>/CPCTitleList202202`, where a title line is the code, two tabs, then the title.

    NOTE: the previous implementation removed the "<code>\\t\\t" prefix with
    `str.lstrip(pattern)`. `lstrip` strips a *set of characters*, so it also ate leading
    title letters that occur in the code (e.g. "AGRICULTURE" became "GRICULTURE" for A01).
    Checkpoints trained on the old, truncated text were fitted to slightly different
    inputs; retrain or re-validate them after this fix.

    Raises:
        IndexError: if a section or context code has no title line in its section file.
    """
    def _title_of(code, text):
        # Capture only the title. The lookbehind stops `code` from matching the tail of a
        # longer symbol (e.g. "B" inside "B01B").
        match = re.search(rf'(?<![A-Z0-9]){re.escape(code)}\t\t(.+)', text)
        if match is None:
            raise IndexError(f"no title found for CPC code {code!r}")
        return match.group(1)

    code_pattern = re.compile(r'[A-Z]\d+')
    found_codes = set()
    for file_name in os.listdir(os.path.join(cfg.cpc_data, "CPCSchemeXML202105")):
        found_codes.update(code_pattern.findall(file_name))
    contexts = sorted(found_codes)

    results = {}
    for cpc in ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'Y']:
        with open(os.path.join(cfg.cpc_data, f"CPCTitleList202202/cpc-section-{cpc}_20220201.txt")) as f:
            s = f.read()
        section_title = _title_of(cpc, s)
        for context in contexts:
            if context[0] == cpc:
                results[context] = section_title + ". " + _title_of(context, s)

    return results


def get_input_data(cfg, input_type: str = "train"):
    """Load the train or test CSV and add the model input columns.

    Adds `context_text` (the CPC title text for each `context` code) and `text`
    (`anchor`, `target` and `context_text` joined with "[SEP]"). For training data,
    also adds `score_map`, the score bucket index 0-4.

    Only the first 2000 rows are read when `cfg.debug` is set. Any `input_type`
    other than "train" reads the test CSV.
    """
    is_train = input_type == "train"
    csv_path = cfg.train_csv if is_train else cfg.test_csv
    row_limit = 2000 if cfg.debug else None
    input_df = pd.read_csv(csv_path, nrows=row_limit)

    # Context code -> "<section title>. <class title>"
    context_to_text = get_cpc_texts(cfg)
    input_df["context_text"] = input_df["context"].map(context_to_text)

    separator = "[SEP]"
    input_df["text"] = (
        input_df["anchor"] + separator + input_df["target"] + separator + input_df["context_text"]
    )

    if is_train:
        score_buckets = {0.00: 0, 0.25: 1, 0.50: 2, 0.75: 3, 1.00: 4}
        input_df["score_map"] = input_df["score"].map(score_buckets)

    return input_df


def get_split(cfg, train_df):
    """Assign a validation fold id to every row of `train_df` in a new `fold` column.

    The splitter class is looked up by name (`cfg.split_name`) in
    `sklearn.model_selection` and built with `cfg.split_params`. Rows are stratified on
    `score_map` and grouped by `context`. `train_df` is modified in place and returned.
    """
    splitter_cls = getattr(sms, cfg.split_name)
    splitter = splitter_cls(**cfg.split_params)

    context_groups = train_df["context"].to_numpy()
    train_df["fold"] = -1

    fold_iter = splitter.split(train_df, train_df["score_map"], context_groups)
    for fold_id, (_, valid_idx) in enumerate(fold_iter):
        # NOTE(review): valid_idx is used as index labels via .loc; this presumably relies
        # on train_df having a default RangeIndex — confirm for other callers.
        train_df.loc[valid_idx, "fold"] = int(fold_id)

    return train_df


def get_filname_listdir(directory):
    """Return the names of the entries in `directory` with their last extension removed."""
    stems = []
    for entry in os.listdir(directory):
        stem, _extension = os.path.splitext(entry)
        stems.append(stem)

    return stems


# ====================================
# Dataset #
# ====================================
def get_inputs(cfg, tokenizer, text: str):
    """Tokenize `text`, padded/truncated to `cfg.max_length`, and convert every field to a long tensor.

    Returns the object produced by `tokenizer`, with each value replaced by a
    `torch.long` tensor.
    """
    tokenizer_kwargs = dict(
        max_length=cfg.max_length,
        padding="max_length",
        return_offsets_mapping=False,
        add_special_tokens=True,
        truncation=True,
    )
    encoding = tokenizer(text, **tokenizer_kwargs)

    # Snapshot the keys first; the values are replaced in place.
    for field in list(encoding.keys()):
        encoding[field] = torch.tensor(encoding[field], dtype=torch.long)

    return encoding


class PPPMDataset(Dataset):
    """Dataset yielding tokenized `text` rows, plus the `score` label in the "train" phase."""

    def __init__(self, cfg, tokenizer, input_df: pd.DataFrame, phase: str = "train"):
        self.cfg = cfg
        self.tokenizer = tokenizer
        self.phase = phase
        self.text = input_df["text"].to_numpy()
        # Labels are only kept for the "train" phase.
        self.label = None
        if phase == "train":
            self.label = input_df["score"].to_numpy()

    def __len__(self):
        return len(self.text)

    def __getitem__(self, idx):
        """Return a dict with `input_ids` and `attention_mask` (and `label` when training).

        Raises:
            NotImplementedError: if the phase is neither "train" nor "test".
        """
        if self.phase not in ("train", "test"):
            raise NotImplementedError

        encoded = get_inputs(self.cfg, self.tokenizer, self.text[idx])
        sample = {
            "input_ids": encoded["input_ids"],
            "attention_mask": encoded["attention_mask"],
        }
        if self.phase == "train":
            sample["label"] = torch.tensor(self.label[idx], dtype=torch.float)

        return sample


class PPPMDataModule(pl.LightningDataModule):
    """Lightning data module wrapping the train/valid or test frames in `PPPMDataset`s."""

    def __init__(self, cfg, tokenizer, train_df: pd.DataFrame = None, valid_df: pd.DataFrame = None, test_df: pd.DataFrame = None):
        super().__init__()

        self.cfg = cfg
        self.tokenizer = tokenizer
        self.train_df = train_df
        self.valid_df = valid_df
        self.test_df = test_df

    def _build_dataset(self, frame, phase):
        # Every dataset shares the same cfg and tokenizer.
        return PPPMDataset(
            cfg=self.cfg,
            tokenizer=self.tokenizer,
            input_df=frame,
            phase=phase,
        )

    def _build_loader(self, dataset, batch_size, shuffle):
        # Loader settings common to all phases.
        return DataLoader(
            dataset,
            batch_size=batch_size,
            num_workers=self.cfg.num_workers,
            shuffle=shuffle,
            pin_memory=True,
            drop_last=False,
        )

    def prepare_data(self):
        """Create the test dataset when `test_df` was given, else the train and validation datasets.

        NOTE(review): Lightning's documentation recommends assigning state in `setup()`
        rather than `prepare_data()`; this appears fine for single-device runs — confirm
        before using multi-GPU strategies.
        """
        if self.test_df is not None:
            self.test_dataset = self._build_dataset(self.test_df, "test")
            return

        # Validation rows carry labels too, hence phase "train".
        self.train_dataset = self._build_dataset(self.train_df, "train")
        self.val_dataset = self._build_dataset(self.valid_df, "train")

    def train_dataloader(self):
        return self._build_loader(self.train_dataset, self.cfg.train_batch_size, shuffle=True)

    def val_dataloader(self):
        return self._build_loader(self.val_dataset, self.cfg.valid_batch_size, shuffle=False)

    def predict_dataloader(self):
        return self._build_loader(self.test_dataset, self.cfg.test_batch_size, shuffle=False)


# ====================================
# Model #
# ====================================
class PPPMModel(nn.Module):
    """Transformer backbone followed by LayerNorm, dropout and a single-output linear head.

    The head reads the hidden state at sequence position 0 of `last_hidden_state`.
    """

    def __init__(self, cfg):
        super().__init__()

        self.cfg = cfg
        self.model_config = get_backbone_config(cfg)
        self.backbone = self._build_backbone()
        self.dropout = nn.Dropout(cfg.dropout)
        self.fc = nn.Linear(cfg.hidden_size, 1)
        self._init_weights(self.fc)
        self.layernorm = nn.LayerNorm(cfg.hidden_size)
        self._init_weights(self.layernorm)

    def _build_backbone(self):
        # Priority: MLM-adapted weights, then the pretrained weights, then a randomly initialised model.
        if self.cfg.mlm_pretrained:
            return AutoModel.from_pretrained(self.cfg.mlm_model_dir, config=self.model_config)
        if self.cfg.pretrained:
            return AutoModel.from_pretrained(self.cfg.model_name, config=self.model_config)
        return AutoModel.from_config(self.model_config)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, initializer_range) and reset LayerNorm to identity."""
        std = self.cfg.initializer_range
        if isinstance(module, nn.Linear):
            nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.padding_idx is not None:
                with torch.no_grad():
                    module.weight[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            nn.init.zeros_(module.bias)
            nn.init.ones_(module.weight)

    def forward(self, input_ids, attention_mask=None):
        backbone_out = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        first_token = backbone_out["last_hidden_state"][:, 0]  # (batch_size, hidden_size)

        features = self.dropout(self.layernorm(first_token))
        logits = self.fc(features)  # (batch_size, 1)

        return logits.squeeze(-1)


class PPPMLightningModule(pl.LightningModule):
    """Lightning wrapper around `PPPMModel`: loss, Pearson validation metric, prediction and optimizer setup."""

    def __init__(self, cfg):
        super(PPPMLightningModule, self).__init__()

        self.cfg = cfg
        self.model = PPPMModel(self.cfg)
        self.criterion = get_criterion(self.cfg)

    def setup(self, stage=None):
        """Compute the scheduler's total and warmup step counts when fitting.

        "one-cycle" spans all epochs; "epoch" spans a single epoch.

        Raises:
            NotImplementedError: for any other `cfg.scheduler_cycle`.
        """
        # calculate training total steps
        if stage == "fit":
            if self.cfg.scheduler_cycle == "one-cycle":
                cycle_epochs = self.trainer.max_epochs
            elif self.cfg.scheduler_cycle == "epoch":
                cycle_epochs = 1
            else:
                raise NotImplementedError
            # Optimizer steps per epoch = batches per epoch / accumulated batches, rounded up.
            steps_per_epoch = math.ceil(
                len(self.trainer.datamodule.train_dataloader()) / self.trainer.accumulate_grad_batches
            )
            self.training_steps = steps_per_epoch * cycle_epochs
            self.warmup_steps = int(self.training_steps * self.cfg.scheduler_warmup_ratio)

    def forward(self, input_ids, attention_mask=None):
        return self.model(input_ids, attention_mask)

    def training_step(self, batch, batch_idx):
        input_ids, attention_mask, label = batch["input_ids"], batch["attention_mask"], batch["label"]
        output = self.forward(input_ids, attention_mask)
        loss = self.criterion(output.view(-1, 1), label.view(-1, 1))
        self.log("train/loss", loss, on_epoch=True, logger=True, prog_bar=True)

        return loss

    def validation_step(self, batch, batch_idx):
        input_ids, attention_mask, label = batch["input_ids"], batch["attention_mask"], batch["label"]
        output = self.forward(input_ids, attention_mask)
        loss = self.criterion(output.view(-1, 1), label.view(-1, 1))
        self.log("val/loss", loss, on_epoch=True, logger=True, prog_bar=True)

        return {
            "preds": output.detach(),
            "labels": label.detach(),
            "loss": loss,
        }

    def validation_epoch_end(self, outputs):
        """Log the Pearson correlation between sigmoid(prediction) and label over the whole epoch."""
        # NOTE(review): the loss is computed on the raw model output, but the metric (and
        # predict_step) apply a sigmoid first. With the configured MSELoss this looks
        # inconsistent — confirm it is intended.
        preds = torch.cat([output["preds"] for output in outputs]).sigmoid().cpu().numpy()
        labels = torch.cat([output["labels"] for output in outputs]).cpu().numpy()
        score = get_score(labels, preds)
        self.log("val/pearson_corr", score, logger=True, prog_bar=True)

    def predict_step(self, batch, batch_idx, dataloader_idx=None):
        """Return sigmoid(model output) for the batch as a 1-D tensor of shape (batch_size,)."""
        input_ids, attention_mask = batch["input_ids"], batch["attention_mask"]
        output = self.forward(input_ids, attention_mask)
        output = output.sigmoid()

        # PPPMModel.forward already returns shape (batch_size,). A bare `.squeeze()` would
        # collapse a final batch of size 1 to a 0-d tensor, which the caller's torch.cat
        # cannot concatenate, so keep the result explicitly 1-D.
        return output.reshape(-1)

    def configure_optimizers(self):
        """Build the optimizer with per-group learning rates, plus the LR scheduler when one is configured."""
        optimizer_params = get_optimizer_params(self.model, self.cfg.encoder_lr, self.cfg.decoder_lr, self.cfg.weight_decay)
        optimizer = get_optimizer(self.cfg, optimizer_params)

        if self.cfg.scheduler_name is None:
            return [optimizer]
        else:
            scheduler = get_scheduler(self.cfg, optimizer, num_warmup_steps=self.warmup_steps, num_training_steps=self.training_steps)
            scheduler = {"scheduler": scheduler, "interval": self.cfg.scheduler_interval}

            return [optimizer], [scheduler]


# ====================================
# Criterion, Optimizer, Scheduler #
# ====================================
def get_criterion(cfg):
    """Instantiate the loss named by `cfg.loss_name` with `cfg.loss_params`.

    "SmoothFocalLoss" maps to the class defined in this file; any other name is looked
    up in `torch.nn`.
    """
    if cfg.loss_name == "SmoothFocalLoss":
        return SmoothFocalLoss(**cfg.loss_params)

    loss_cls = getattr(nn, cfg.loss_name)
    return loss_cls(**cfg.loss_params)


def get_optimizer(cfg, parameters):
    """Instantiate the `torch.optim` optimizer named by `cfg.optimizer_name` with `cfg.optimizer_params`."""
    optimizer_cls = getattr(optim, cfg.optimizer_name)

    return optimizer_cls(parameters, **cfg.optimizer_params)


def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
    """Split `model`'s parameters into three optimizer groups.

    1. backbone parameters with weight decay (`encoder_lr`, `weight_decay`)
    2. backbone biases and LayerNorm parameters (`encoder_lr`, no decay)
    3. every parameter whose name does not contain "backbone" (`decoder_lr`, no decay)
    """
    no_decay_markers = ("bias", "LayerNorm.bias", "LayerNorm.weight")

    decayed, undecayed = [], []
    for name, param in model.backbone.named_parameters():
        exempt = any(marker in name for marker in no_decay_markers)
        (undecayed if exempt else decayed).append(param)

    head_params = []
    for name, param in model.named_parameters():
        if "backbone" not in name:
            head_params.append(param)

    return [
        {'params': decayed, 'lr': encoder_lr, 'weight_decay': weight_decay},
        {'params': undecayed, 'lr': encoder_lr, 'weight_decay': 0.0},
        {'params': head_params, 'lr': decoder_lr, 'weight_decay': 0.0},
    ]


def get_scheduler(cfg, optimizer, num_warmup_steps=None, num_training_steps=None):
    """Build the LR scheduler named by `cfg.scheduler_name`.

    "cosine-warmup" and "linear-warmup" use the transformers warmup schedules with the
    given step counts. Any other name is looked up in `torch.optim.lr_scheduler` and
    built from `cfg.scheduler_params` alone.
    """
    name = cfg.scheduler_name
    extra_params = cfg.scheduler_params

    if name not in ("cosine-warmup", "linear-warmup"):
        return getattr(optim.lr_scheduler, name)(optimizer, **extra_params)

    if name == "cosine-warmup":
        build_schedule = get_cosine_schedule_with_warmup
    else:
        build_schedule = get_linear_schedule_with_warmup

    return build_schedule(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps,
        **extra_params
    )


class FocalLoss(nn.Module):
    """
    Focal loss computed from logits: alpha * (1 - exp(-bce))**gamma * bce.

    reference: https://www.kaggle.com/competitions/nbme-score-clinical-patient-notes/discussion/322799
    """
    def __init__(self, reduction='none', alpha=1, gamma=2):
        super().__init__()
        self.reduction = reduction
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        """Return the focal loss, reduced by 'sum' or 'mean'; any other reduction returns it element-wise."""
        bce = F.binary_cross_entropy_with_logits(inputs, targets, reduction='none')
        modulator = (1. - torch.exp(-bce)) ** self.gamma
        focal = self.alpha * modulator * bce

        if self.reduction == 'sum':
            return focal.sum()
        if self.reduction == 'mean':
            return focal.mean()
        return focal


class SmoothFocalLoss(nn.Module):
    """Focal loss applied to label-smoothed targets (targets are pulled towards 0.5)."""

    def __init__(self, reduction='none', alpha=1, gamma=2, smoothing=0.0):
        super().__init__()
        self.reduction = reduction
        # The inner loss stays element-wise; the reduction is applied in forward().
        self.focal_loss = FocalLoss(reduction='none', alpha=alpha, gamma=gamma)
        self.smoothing = smoothing

    @staticmethod
    def _smooth(targets:torch.Tensor, smoothing=0.0):
        """Return `targets * (1 - smoothing) + 0.5 * smoothing`, computed without gradient tracking."""
        assert 0 <= smoothing < 1
        with torch.no_grad():
            smoothed = targets * (1.0 - smoothing) + 0.5 * smoothing
        return smoothed

    def forward(self, inputs, targets):
        """Return the focal loss on smoothed targets, reduced by 'sum' or 'mean'; any other reduction returns it element-wise."""
        smoothed_targets = SmoothFocalLoss._smooth(targets, self.smoothing)
        per_element = self.focal_loss(inputs, smoothed_targets)

        if self.reduction == 'sum':
            return per_element.sum()
        if self.reduction == 'mean':
            return per_element.mean()
        return per_element


# ====================================
# Train & Predict #
# ====================================
def train_fold(cfg, tokenizer, train_df, valid_df, fold):
    """Train one fold.

    Seeds all RNGs, logs to Weights & Biases, and checkpoints into `cfg.model_dir`
    under the name `exp<exp_id>-fold-<fold>`. Trainer, model and datamodule are
    released afterwards and the CUDA cache is emptied.
    """
    # Seed first so that model initialisation is reproducible.
    seed_everything(cfg.seed)

    run_name = f"exp{cfg.exp_id}-fold-{fold}"

    wandb_logger = WandbLogger(
        project=cfg.competition_name,
        group=cfg.group,
        name=run_name,
        job_type=f"exp{cfg.exp_id}",
        reinit=True,
        anonymous="must",
    )

    # Callbacks: checkpointing and LR monitoring, plus optional early stopping.
    callbacks = [
        ModelCheckpoint(
            dirpath=cfg.model_dir,
            filename=run_name,
            **cfg.checkpoint_params,
        ),
        LearningRateMonitor(logging_interval="step"),
    ]
    if cfg.early_stopping:
        callbacks.append(EarlyStopping(**cfg.early_stopping_params))

    datamodule = PPPMDataModule(
        cfg=cfg,
        tokenizer=tokenizer,
        train_df=train_df,
        valid_df=valid_df,
    )
    module = PPPMLightningModule(cfg=cfg)

    trainer = Trainer(
        gpus=cfg.gpus,
        max_epochs=cfg.max_epochs,
        callbacks=callbacks,
        logger=[wandb_logger],
        accumulate_grad_batches=cfg.accumulate_grad_batches,
        precision=cfg.precision,
        benchmark=False,
        gradient_clip_val=cfg.gradient_clip_val,
    )
    trainer.fit(module, datamodule=datamodule)
    wandb.finish(quiet=True)

    # Free GPU memory before the next fold.
    del datamodule, module, trainer
    gc.collect()
    torch.cuda.empty_cache()


def train_cv(cfg, tokenizer, input_df: pd.DataFrame):
    """Train every fold in `cfg.train_fold` and collect out-of-fold predictions.

    A fold is trained only when no file with its checkpoint name exists in
    `cfg.model_dir`. Per-fold and overall Pearson scores are logged through `cfg.logger`.

    Returns:
        The validation rows of all processed folds, with predictions in an `oof` column.
    """
    fold_frames = []

    for fold_id in range(cfg.num_fold):
        if fold_id not in cfg.train_fold:
            continue

        filename = f"exp{cfg.exp_id}-fold-{fold_id}"
        existing_models = get_filname_listdir(cfg.model_dir)

        in_valid = input_df["fold"] == fold_id
        train_df = input_df[~in_valid].reset_index(drop=True)
        valid_df = input_df[in_valid].reset_index(drop=True)

        # training (skipped when a checkpoint for this fold is already present)
        if filename not in existing_models:
            train_fold(
                cfg=cfg,
                tokenizer=tokenizer,
                train_df=train_df,
                valid_df=valid_df,
                fold=fold_id,
            )

        # out-of-fold predictions for this fold's validation rows
        oof_pred = predict(
            cfg=cfg,
            tokenizer=tokenizer,
            input_df=valid_df,
            filename=filename,
        )
        valid_df["oof"] = oof_pred
        fold_frames.append(valid_df)

        fold_score = get_score(valid_df["score"].to_numpy(), oof_pred)
        cfg.logger.info(f"fold{fold_id}-score: {fold_score}")

    oof_df = pd.concat(fold_frames, axis=0).reset_index(drop=True)
    cv_score = get_score(oof_df["score"].to_numpy(), oof_df["oof"].to_numpy())
    cfg.logger.info(f"cv-score: {cv_score}")

    return oof_df


def predict_raw_prediction(cfg, tokenizer, input_df: pd.DataFrame, filename: str):
    """Predict `input_df` with the checkpoint `<cfg.model_dir>/<filename>.ckpt`.

    Returns:
        A 1-D numpy array of sigmoid-activated predictions, one per input row.
    """
    checkpoint_path = os.path.join(cfg.model_dir, filename + ".ckpt")

    # `load_from_checkpoint` is a classmethod that builds the module itself. Call it on
    # the class: constructing a throwaway instance first loaded the backbone twice, and
    # recent Lightning releases reject calling it on an instance.
    lightning_model = PPPMLightningModule.load_from_checkpoint(
        checkpoint_path=checkpoint_path,
        cfg=cfg,
    )

    lightning_datamodule = PPPMDataModule(
        cfg,
        tokenizer=tokenizer,
        test_df=input_df
    )

    trainer = Trainer(
        gpus=cfg.gpus,
    )

    preds = trainer.predict(
        lightning_model,
        datamodule=lightning_datamodule,
        return_predictions=True
    )

    preds = torch.cat(preds).cpu().numpy() # (samples,)

    # Release the model and trainer before the next checkpoint is loaded.
    del lightning_datamodule, lightning_model, trainer

    gc.collect()
    torch.cuda.empty_cache()

    return preds


def predict(cfg, tokenizer, input_df: pd.DataFrame, filename: str):
    """Return predictions for `filename`, cached as `<cfg.exp_output_dir>/<filename>.npy`.

    An existing cache file is loaded as is; otherwise predictions are computed and saved.
    The cache is keyed by `filename` only, not by the contents of `input_df`.
    """
    cache_path = os.path.join(cfg.exp_output_dir, f"{filename}.npy")

    if not os.path.isfile(cache_path):
        preds = predict_raw_prediction(
            cfg=cfg,
            tokenizer=tokenizer,
            input_df=input_df,
            filename=filename
        )
        # np.save appends the ".npy" extension itself.
        np.save(os.path.join(cfg.exp_output_dir, filename), preds)
        return preds

    return np.load(cache_path)


def predict_cv(cfg, tokenizer, input_df: pd.DataFrame):
    """
    Predict with the CV models: average the predictions of every fold in
    `cfg.train_fold` and return a copy of `input_df` with the mean in a `score` column.
    """
    fold_preds = [
        predict_raw_prediction(
            cfg=cfg,
            tokenizer=tokenizer,
            input_df=input_df,
            filename=f"exp{cfg.exp_id}-fold-{fold_id}",
        )
        for fold_id in range(cfg.num_fold)
        if fold_id in cfg.train_fold
    ]

    output_df = input_df.copy()
    output_df["score"] = np.mean(fold_preds, axis=0)

    return output_df


# ====================================
# Metrics #
# ====================================
def get_score(y_true, y_pred):
    """Return the Pearson correlation coefficient between `y_true` and `y_pred`."""
    pearson_result = sp.stats.pearsonr(y_true, y_pred)

    return pearson_result[0]

In [4]:
def _create_kaggle_dataset(dataset_name, upload_dir):
    """
    Publish ``upload_dir`` as a new Kaggle dataset called ``dataset_name``.

    Writes ``dataset-metadata.json`` into ``upload_dir`` and then uploads the
    whole folder through the Kaggle API. The dataset owner is taken from the
    ``KAGGLE_USERNAME`` environment variable.

    Raises:
        KeyError: If ``KAGGLE_USERNAME`` is not set in the environment.
    """
    # Imported lazily: only the upload path needs the kaggle package.
    from kaggle.api.kaggle_api_extended import KaggleApi

    dataset_metadata = {
        'id': f'{os.environ["KAGGLE_USERNAME"]}/{dataset_name}',
        'licenses': [{'name': 'CC0-1.0'}],
        'title': dataset_name,
    }
    with open(os.path.join(upload_dir, 'dataset-metadata.json'), 'w') as f:
        json.dump(dataset_metadata, f, indent=4)

    api = KaggleApi()
    api.authenticate()
    api.dataset_create_new(folder=upload_dir, convert_to_csv=False, dir_mode='tar')


def main(Config):
    """
    Run the experiment end to end.

    Steps, in order: ``setup``; attach a ``Logger``; load the train and test
    frames; split the train frame; build the tokenizer; run ``train_cv``
    unless ``Config.inference_only`` is set; run ``predict_cv`` on the test
    frame; optionally upload ``Config.exp_output_dir`` as a Kaggle dataset;
    optionally write ``submission.csv``.

    Args:
        Config: Experiment configuration. It is replaced by the return value
            of ``setup`` and gets a ``logger`` attribute assigned here. The
            flags ``inference_only``, ``upload_from_colab`` and ``on_colab``
            select which stages run.
    """
    # setup
    Config = setup(Config)
    Config.logger = Logger(Config.exp_output_dir)

    # load dataset
    train_df = get_input_data(Config, input_type="train")
    test_df = get_input_data(Config, input_type="test")

    # split
    train_df = get_split(Config, train_df)

    # tokenizer
    tokenizer = get_tokenizer(Config)

    if not Config.inference_only:
        # training; the return value is not needed by any later step here
        train_cv(
            cfg=Config,
            tokenizer=tokenizer,
            input_df=train_df,
        )

    # predict
    raw_pred_df = predict_cv(
        cfg=Config,
        input_df=test_df,
        tokenizer=tokenizer,
    )

    # upload output to kaggle dataset
    if Config.upload_from_colab:
        _create_kaggle_dataset(
            dataset_name=f"{Config.competition_name}-exp{Config.exp_id}",
            upload_dir=Config.exp_output_dir,
        )

    # make submission (skipped on Colab)
    if not Config.on_colab:
        submission_path = os.path.join(Config.submission, "submission.csv")
        raw_pred_df[["id", "score"]].to_csv(submission_path, index=False)


# Entry point: when this code runs as the main module, execute the whole
# pipeline with `Config` (expected to be defined earlier in the file).
if __name__ == "__main__":
    main(Config)

[34m[1mwandb[0m: Currently logged in as: [33mazupero[0m. Use [1m`wandb login --relogin`[0m to force relogin
Some weights of the model checkpoint at anferico/bert-for-patents were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Predicting: 0it [00:00, ?it/s]

Some weights of the model checkpoint at anferico/bert-for-patents were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at anferico/bert-for-patents were not used when initializing BertModel: ['cls.

Predicting: 0it [00:00, ?it/s]

Some weights of the model checkpoint at anferico/bert-for-patents were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at anferico/bert-for-patents were not used when initializing BertModel: ['cls.

Predicting: 0it [00:00, ?it/s]

Some weights of the model checkpoint at anferico/bert-for-patents were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at anferico/bert-for-patents were not used when initializing BertModel: ['cls.

Predicting: 0it [00:00, ?it/s]

Some weights of the model checkpoint at anferico/bert-for-patents were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at anferico/bert-for-patents were not used when initializing BertModel: ['cls.

Predicting: 0it [00:00, ?it/s]

Starting upload for file model.tar


100%|██████████| 6.42G/6.42G [02:40<00:00, 43.0MB/s]


Upload successful: model.tar (6GB)
Starting upload for file pretrain_tokenizer.tar


100%|██████████| 1.24M/1.24M [00:03<00:00, 346kB/s]


Upload successful: pretrain_tokenizer.tar (1MB)
Starting upload for file model_config.pth


100%|██████████| 2.23k/2.23k [00:05<00:00, 446B/s]


Upload successful: model_config.pth (2KB)
Starting upload for file exp019-fold-0.npy


100%|██████████| 25.9k/25.9k [00:01<00:00, 13.5kB/s]


Upload successful: exp019-fold-0.npy (26KB)
Starting upload for file exp019-fold-1.npy


100%|██████████| 29.9k/29.9k [00:04<00:00, 7.47kB/s]


Upload successful: exp019-fold-1.npy (30KB)
Starting upload for file exp019-fold-2.npy


100%|██████████| 35.8k/35.8k [00:04<00:00, 7.79kB/s]


Upload successful: exp019-fold-2.npy (36KB)
Starting upload for file exp019-fold-3.npy


100%|██████████| 26.3k/26.3k [00:02<00:00, 9.58kB/s]


Upload successful: exp019-fold-3.npy (26KB)
Starting upload for file exp019-fold-4.npy


100%|██████████| 25.2k/25.2k [00:03<00:00, 6.76kB/s]


Upload successful: exp019-fold-4.npy (25KB)
Starting upload for file Experiment.log


100%|██████████| 333/333 [00:01<00:00, 185B/s]  


Upload successful: Experiment.log (333B)
