In [None]:
# Show the NVIDIA GPU(s) visible to this runtime before doing any work.
!nvidia-smi

In [None]:
# Install the training dependencies quietly. NOTE(review): versions are not pinned,
# so a fresh runtime may resolve different releases than the ones this notebook was written for.
!pip install -q pytorch-lightning wandb torchmetrics transformers sentencepiece
# Force-reinstall the latest kaggle package without touching its dependencies.
!pip install -q --upgrade --force-reinstall --no-deps kaggle

In [None]:
!mkdir /root/.kaggle
!cp /content/drive/MyDrive/Colab/kaggle/kaggle.json /root/.kaggle/kaggle.json

In [None]:
# Patch the installed `transformers` package with the DeBERTa-v2/v3 fast tokenizer sources.
import shutil
from pathlib import Path

# NOTE(review): the site-packages path is hard-coded for Python 3.7 — confirm it matches the runtime.
transformers_path = Path("/usr/local/lib/python3.7/dist-packages/transformers")
input_dir = Path("/content/drive/MyDrive/Colab/kaggle/nbme-score-clinical-patient-notes/input/deberta-v2-v3-fast-tokenizer")
deberta_v2_path = transformers_path / "models" / "deberta_v2"

convert_file = input_dir / "convert_slow_tokenizer.py"
conversion_path = transformers_path / convert_file.name

# (source, destination) pairs: the conversion script goes to the package root,
# the two tokenizer modules go to models/deberta_v2.
replacements = [(convert_file, conversion_path)]
for filename in ['tokenization_deberta_v2.py', 'tokenization_deberta_v2_fast.py']:
    replacements.append((input_dir / filename, deberta_v2_path / filename))

for source, filepath in replacements:
    # Remove any existing file first, then drop in the patched copy.
    if filepath.exists():
        filepath.unlink()
    shutil.copy(source, filepath)

In [1]:
import os
import gc
import sys
import json
import itertools
from tqdm.auto import tqdm
import logging
import datetime
import ast
import numpy as np
import pandas as pd
import sklearn.model_selection as sms
from sklearn.metrics import f1_score
import math
import re

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

import pytorch_lightning as pl
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, LearningRateMonitor
from pytorch_lightning.loggers import WandbLogger

from transformers import AutoConfig, AutoModel, AutoTokenizer, get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup
from transformers.models.deberta_v2.tokenization_deberta_v2_fast import DebertaV2TokenizerFast

import wandb

%env TOKENIZERS_PARALLELISM=true

env: TOKENIZERS_PARALLELISM=true


In [5]:
class Config:
    """Experiment settings, read as plain class attributes by the rest of the notebook."""
    # ------------------------------
    # Globals
    # ------------------------------
    competition_name = "nbme-score-clinical-patient-notes"
    group = "DeBERTa-v3-large"
    exp_id = "017"
    debug = False
    inference_only = False
    upload_from_colab = False
    colab_dir = "/content/drive/MyDrive/Colab/kaggle/nbme-score-clinical-patient-notes"
    kaggle_json_path = "/root/.kaggle/kaggle.json"
    kaggle_dataset_path = None
    gpus = 1
    seed = 2434
    max_epochs = 5
    accumulate_grad_batches = 4
    precision = 32
    num_fold = 5
    train_fold = [0, 1, 2]  # folds that are actually run
    # Decision threshold looked up per case (see get_results, which indexes this by case).
    pred_threshold = {
        0: 0.4,
        1: 0.67,
        2: 0.41,
        3: 0.54,
        4: 0.69,
        5: 0.41,
        6: 0.45,
        7: 0.6,
        8: 0.55,
        9: 0.67,
        # best_th: 0.57
    }
    use_pseudo_train = False
    # ------------------------------
    # Dataloader
    # ------------------------------
    train_batch_size = 2
    valid_batch_size = 32
    test_batch_size = 32
    num_workers = 8
    # ------------------------------
    # Split
    # ------------------------------
    split_name = "StratifiedGroupKFold"
    # Evaluated once at class-definition time, so it reflects the `debug`/`num_fold`/`seed` above.
    split_params = dict(
        n_splits=4 if debug else num_fold,
        shuffle=True,
        random_state=seed,
    )
    # ------------------------------
    # Model
    # ------------------------------
    model_name = "microsoft/deberta-v3-large"
    max_length = 512
    hidden_size = 1024
    num_class = 1
    use_backbone_dropout = True
    dropout = 0.2
    initializer_range = 0.02
    # NOTE(review): nn.LSTM applies `dropout` between stacked layers only, so with
    # num_layers=1 this value presumably has no effect — confirm.
    lstm_params = dict(
        num_layers=1,
        batch_first=True,
        bidirectional=True,
        dropout=0.2,
    )
    # ------------------------------
    # Loss
    # ------------------------------
    loss_name = "BCEWithLogitsLoss"
    loss_params = dict(reduction="none")
    # ------------------------------
    # Optimizer
    # ------------------------------
    optimizer_name = "AdamW"
    optimizer_params = dict(
        lr=2e-5,
        weight_decay=1e-2,
        eps=1e-6,
        betas=(0.9, 0.999),
    )
    # Per-group values handed to get_optimizer_params.
    encoder_lr = 2e-5
    decoder_lr = 2e-5
    weight_decay = 0.01
    # ------------------------------
    # Scheduler
    # ------------------------------
    scheduler_name = "cosine-warmup"
    scheduler_warmup_ratio = 0.1
    scheduler_params = dict()
    scheduler_interval = "step"
    scheduler_cycle = "one-cycle"  # epoch or one-cycle
    # ------------------------------
    # Callbacks
    # ------------------------------
    checkpoint_params = dict(
        monitor="val/micro-F1",
        save_top_k=1,
        save_weights_only=True,
        mode="max",
        verbose=True,
    )
    early_stopping = False
    early_stopping_params = dict(
        monitor="val/loss",
        min_delta=0.0,
        patience=8,
        verbose=False,
        mode="min",
    )

In [6]:
# ====================================
# Setup #
# ====================================
class Logger:
    """Timestamped INFO logger that writes to the console and to `<path>/Experiment.log`.

    ref) https://github.com/ghmagazine/kagglebook/blob/master/ch04-model-interface/code/util.py

    Args:
        path: Directory that receives `Experiment.log`; also used as the logger name.
    """
    def __init__(self, path):
        self.general_logger = logging.getLogger(path)
        # `logging.getLogger` returns the same object for the same name, so handlers are
        # attached only the first time. They are also only *created* in that case:
        # building a FileHandler opens the log file, and an unused one would leak the handle.
        if len(self.general_logger.handlers) == 0:
            stream_handler = logging.StreamHandler()
            file_general_handler = logging.FileHandler(os.path.join(path, 'Experiment.log'))
            self.general_logger.addHandler(stream_handler)
            self.general_logger.addHandler(file_general_handler)
            self.general_logger.setLevel(logging.INFO)

    def info(self, message):
        """Log `message` at INFO level, prefixed with the current time."""
        self.general_logger.info('[{}] - {}'.format(self.now_string(), message))

    @staticmethod
    def now_string():
        """Return the current local time formatted as `YYYY-MM-DD HH:MM:SS`."""
        return str(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))


def setup(cfg):
    cfg.on_colab = "google.colab" in sys.modules
    if cfg.on_colab:
        # kaggle api
        f = open(Config.kaggle_json_path, 'r')
        json_data = json.load(f)
        os.environ["KAGGLE_USERNAME"] = json_data["username"]
        # set input/output dir
        cfg.input_dir = os.path.join(cfg.colab_dir, "input")
        cfg.train_csv = os.path.join(cfg.input_dir, "cleaned_train.csv")
        cfg.external_train_csv = os.path.join(cfg.input_dir, "external_exact_match_train.csv")
        cfg.features_csv = os.path.join(cfg.input_dir, "features.csv")
        cfg.patient_notes_csv = os.path.join(cfg.input_dir, "patient_notes.csv")
        cfg.test_csv = os.path.join(cfg.input_dir, "test.csv")
        cfg.sample_submission = os.path.join(cfg.input_dir, "sample_submission.csv")
        cfg.output_dir = os.path.join(cfg.colab_dir, "output")
        cfg.exp_output_dir = os.path.join(cfg.output_dir, f"exp{cfg.exp_id}")
        cfg.model_dir = os.path.join(cfg.exp_output_dir, "model")

        for d in [cfg.output_dir, cfg.exp_output_dir, cfg.model_dir]:
            os.makedirs(d, exist_ok=True)
            
        # wandb
        wandb.login()
    else:
        cfg.input_dir = f"../input/{cfg.competition_name}"
        cfg.train_csv = os.path.join(cfg.input_dir, "train.csv")
        cfg.features_csv = os.path.join(cfg.input_dir, "features.csv")
        cfg.patient_notes_csv = os.path.join(cfg.input_dir, "patient_notes.csv")
        cfg.test_csv = os.path.join(cfg.input_dir, "test.csv")
        cfg.sample_submission = os.path.join(cfg.input_dir, "sample_submission.csv")
        cfg.submission = "./"
        cfg.exp_output_dir = f"exp{cfg.exp_id}"
        cfg.model_dir = os.path.join(cfg.exp_output_dir, "model")

        if cfg.kaggle_dataset_path is not None:
            cfg.model_dir = os.path.join(cfg.kaggle_dataset_path, "model")

        for d in [cfg.exp_output_dir, cfg.model_dir]:
            os.makedirs(d, exist_ok=True)

    return cfg


# ====================================
# Preprocess #
# ====================================
def get_input_data(cfg, input_type="train"):
    """Load the train or test rows and join the feature texts and patient notes onto them.

    Args:
        cfg: Config object providing `train_csv`, `test_csv`, `features_csv`,
            `patient_notes_csv`, `debug` and (for training) `external_train_csv`.
        input_type: "train" loads `cfg.train_csv` plus a sample of the external data;
            any other value loads `cfg.test_csv`.

    Returns:
        A DataFrame with the feature/patient-note columns merged in and `pn_history`
        passed through `clean_feature_text_for_preprocess`.
    """
    input_df = pd.read_csv(cfg.train_csv) if input_type == "train" else pd.read_csv(cfg.test_csv)
    if cfg.debug and input_type != "test":
        # Debug mode: keep only rows belonging to the first 100 distinct patient notes.
        input_df = input_df[input_df["pn_num"].isin(input_df["pn_num"].unique()[:100])].reset_index(drop=True)

    # Read the lookup tables from the paths on `cfg`, like every other path in this
    # function, rather than from the global Config class.
    feature_texts_df = pd.read_csv(cfg.features_csv)
    patient_notes_df = pd.read_csv(cfg.patient_notes_csv)

    if input_type == "train":
        external_df = pd.read_csv(cfg.external_train_csv)
        # Fixed-size, fixed-seed sample of the external rows. `sample` draws without
        # replacement, so the file must contain at least this many rows.
        external_df = external_df.sample(14300 * 4, random_state=2434)
        input_df = pd.concat([input_df, external_df], axis=0).reset_index(drop=True)
        # Both columns are stored as Python-literal strings in the CSVs.
        input_df["annotation"] = input_df["annotation"].apply(ast.literal_eval)
        input_df["location"] = input_df["location"].apply(ast.literal_eval)

    input_df = input_df.merge(feature_texts_df, on=["feature_num", "case_num"], how="left")
    input_df = input_df.merge(patient_notes_df, on=["pn_num", "case_num"], how="left")

    input_df["pn_history"] = input_df["pn_history"].apply(clean_feature_text_for_preprocess)

    return input_df


def get_and_merge_external_data(cfg, train_df: pd.DataFrame, fold: int):
    """Append the external training rows stored for `fold` to `train_df`.

    Reads `external_train_fold_<fold>.csv` from `cfg.input_dir`, parses its
    `annotation`/`location` columns from Python-literal strings, cleans
    `pn_history`, and returns the concatenation with a fresh index.
    """
    external_path = os.path.join(cfg.input_dir, f"external_train_fold_{fold}.csv")
    external_df = pd.read_csv(external_path)

    # Same column order as the per-column assignments this replaces.
    for column, transform in (
        ("annotation", ast.literal_eval),
        ("location", ast.literal_eval),
        ("pn_history", clean_feature_text_for_preprocess),
    ):
        external_df[column] = external_df[column].apply(transform)

    return pd.concat([train_df, external_df], axis=0).reset_index(drop=True)


def get_split(cfg, train_df):
    """Assign a validation fold id to every row of `train_df`.

    The splitter class named by `cfg.split_name` is looked up in
    `sklearn.model_selection` and built with `cfg.split_params`. Rows are
    stratified on `case_num` and grouped by `pn_num`.

    Args:
        cfg: Config object providing `split_name` and `split_params`.
        train_df: Training frame; a `fold` column is added to it in place.

    Returns:
        The same `train_df`, with an integer `fold` column (-1 for rows that never
        appear in a validation split).
    """
    splitter = getattr(sms, cfg.split_name)(**cfg.split_params)

    groups = train_df["pn_num"].to_numpy()
    # The splitter yields *positional* indices, so fill a positional array instead of
    # writing through `.loc`, which would only be correct on a default RangeIndex.
    folds = np.full(len(train_df), -1, dtype=np.int64)

    for fold_id, (_, valid_idx) in enumerate(splitter.split(train_df, train_df["case_num"], groups)):
        folds[valid_idx] = fold_id

    train_df["fold"] = folds

    return train_df


def get_filname_listdir(dirctory):
    """Return the entries of `dirctory` with their final extension removed."""
    stems = []
    for entry in os.listdir(dirctory):
        stem, _extension = os.path.splitext(entry)
        stems.append(stem)
    return stems


def get_tokenizer(cfg):
    """Load the tokenizer for `cfg.model_name`, using a local copy when one exists.

    The local copy lives in `<base>/pretrain_tokenizer`, where `<base>` is
    `cfg.kaggle_dataset_path` if set and `cfg.exp_output_dir` otherwise. When that
    directory does not exist the tokenizer is loaded by model name and saved there.
    """
    base_dir = cfg.exp_output_dir if cfg.kaggle_dataset_path is None else cfg.kaggle_dataset_path
    pretrained_dir = os.path.join(base_dir, "pretrain_tokenizer")

    is_cached = os.path.isdir(pretrained_dir)
    source = pretrained_dir if is_cached else cfg.model_name

    # The tokenizer class is chosen from the model name, whichever source is loaded.
    if ("deberta-v2" in cfg.model_name) or ("deberta-v3" in cfg.model_name):
        tokenizer = DebertaV2TokenizerFast.from_pretrained(source)
    elif "roberta" in cfg.model_name:
        tokenizer = AutoTokenizer.from_pretrained(source, trim_offsets=False)
    else:
        tokenizer = AutoTokenizer.from_pretrained(source)

    if not is_cached:
        tokenizer.save_pretrained(pretrained_dir)

    return tokenizer


def get_backbone(cfg):
    """Load the pretrained backbone for `cfg.model_name`, using a local copy when one exists.

    The local copy lives in `<base>/pretrain_model`, where `<base>` is
    `cfg.kaggle_dataset_path` if set and `cfg.exp_output_dir` otherwise. When that
    directory does not exist the model is loaded by name and saved there.
    If `cfg.use_backbone_dropout` is false, both dropout probabilities in the model
    config are set to 0.0 before the weights are loaded.
    """
    base_dir = cfg.exp_output_dir if cfg.kaggle_dataset_path is None else cfg.kaggle_dataset_path
    pretrained_dir = os.path.join(base_dir, "pretrain_model")

    is_cached = os.path.isdir(pretrained_dir)
    source = pretrained_dir if is_cached else cfg.model_name

    model_config = AutoConfig.from_pretrained(source)
    if not cfg.use_backbone_dropout:
        model_config.attention_probs_dropout_prob = 0.0
        model_config.hidden_dropout_prob = 0.0
    backbone = AutoModel.from_pretrained(source, config=model_config)

    if not is_cached:
        backbone.save_pretrained(pretrained_dir)

    return backbone


def clean_feature_text_for_preprocess(text: str):
    """Normalise hyphenated patterns in `text`.

    reference: https://www.kaggle.com/code/theoviel/roberta-strikes-back

    Substitutions are applied in order; each replacement has the same length as
    the text it replaces, so character positions in `text` are unchanged.
    """
    substitutions = (
        ('I-year', '1-year'),
        ('-OR-', " or "),
        ('-', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text


# ====================================
# Dataset #
# ====================================
def get_inputs(cfg, text: str, feature_text: str, tokenizer):
    """Tokenize `text` and `feature_text` as a pair and convert every field to a long tensor.

    The pair is padded to `cfg.max_length`. The tokenizer's output object is
    updated in place and returned.
    NOTE(review): no truncation argument is passed, so inputs longer than
    `cfg.max_length` are presumably not cut — confirm against the tokenizer.
    """
    encoding = tokenizer(
        text,
        feature_text,
        max_length=cfg.max_length,
        padding="max_length",
        return_offsets_mapping=False,
    )

    # Snapshot the keys so the mapping can be rewritten while walking it.
    for field in list(encoding.keys()):
        encoding[field] = torch.tensor(encoding[field], dtype=torch.long)

    return encoding


def get_label(cfg, text: str, locations: list, tokenizer):
    """Build per-token labels for `text` from character-span annotations.

    `text` is tokenized on its own, padded to `cfg.max_length`. Every position
    whose sequence id is not 0 gets label -1, the rest start at 0, and tokens
    covered by an annotated span are set to 1.

    Args:
        cfg: Config object providing `max_length`.
        text: The text to tokenize.
        locations: List of strings, each holding one or more "start end" character
            spans separated by ";".
        tokenizer: Tokenizer whose output supports `offset_mapping` and `sequence_ids()`.

    Returns:
        A float tensor with one label per token position (values -1, 0 or 1).
    """
    encoding = tokenizer(
        text,
        max_length=cfg.max_length,
        padding="max_length",
        return_offsets_mapping=True,
        # add_special_tokens=True
    )
    
    offset_mapping = encoding["offset_mapping"]
    # Positions that do not belong to sequence 0 are marked -1; the training and
    # validation steps mask out -1 labels when averaging the loss.
    ignore_idx = np.where(np.array(encoding.sequence_ids()) != 0)[0]
    label = np.zeros(len(offset_mapping))
    label[ignore_idx] = -1

    if len(locations) != 0:
        for location in locations:
            # One location string may contain several ";"-separated "start end" spans.
            for loc in [s.split() for s in location.split(";")]:
                start_idx = -1
                end_idx = -1
                start, end = int(loc[0]), int(loc[1])
                for idx in range(len(offset_mapping)):
                    # The DeBERTa tokenizer includes the preceding whitespace in a token, hence the +1
                    # start_idx: the token just before the first token that begins after `start`.
                    if (start_idx == -1) & (start < offset_mapping[idx][0]):
                        start_idx = idx - 1
                    # end_idx: one past the first token whose end offset reaches `end` (exclusive slice bound).
                    if (end_idx == -1) & (end <= offset_mapping[idx][1]):
                        end_idx = idx + 1
                # No token begins after `start`: fall back to the end index.
                if start_idx == -1:
                    start_idx = end_idx
                # Only label the span when both bounds were found.
                if (start_idx != -1) & (end_idx != -1):
                    label[start_idx: end_idx] = 1
    
    return torch.tensor(label, dtype=torch.float)


class NBMEDataset(Dataset):
    """Dataset yielding tokenized (patient note, feature text) pairs.

    Args:
        cfg: Config object forwarded to `get_inputs` / `get_label`.
        input_df: Frame with `pn_history` and `feature_text` columns, plus a
            `location` column when `phase` is "train".
        tokenizer: Tokenizer forwarded to `get_inputs` / `get_label`.
        phase: "train" yields labels as well; "test" yields inputs only.
    """
    def __init__(self, cfg, input_df: pd.DataFrame, tokenizer, phase: str = "train"):
        self.cfg = cfg
        self.input_df = input_df
        self.tokenizer = tokenizer
        self.phase = phase
        self.pn_histories = self.input_df["pn_history"].to_numpy()
        self.feature_texts = self.input_df["feature_text"].to_numpy()
        # Compare by value: `is` tests object identity and only works for strings
        # that happen to be interned.
        self.locations = self.input_df["location"].to_numpy() if self.phase == "train" else None

    def __len__(self):
        return len(self.input_df)

    def __getitem__(self, idx):
        """Return a dict with `input_ids`, `attention_mask` and, for "train", `labels`.

        Raises:
            NotImplementedError: if `phase` is neither "train" nor "test".
        """
        if self.phase not in ("train", "test"):
            raise NotImplementedError

        inputs = get_inputs(
            self.cfg,
            self.pn_histories[idx],
            self.feature_texts[idx],
            self.tokenizer,
        )
        sample = {
            "input_ids": inputs["input_ids"],
            "attention_mask": inputs["attention_mask"],
        }

        if self.phase == "train":
            sample["labels"] = get_label(
                self.cfg,
                self.pn_histories[idx],
                self.locations[idx],
                self.tokenizer,
            )

        return sample


class NBMEDataModule(pl.LightningDataModule):
    """Lightning data module wrapping `NBMEDataset` for training, validation and prediction.

    Pass `train_df` and `valid_df` for fitting, or `test_df` for prediction; the
    presence of `test_df` decides which datasets `prepare_data` builds.
    """
    def __init__(self, cfg, tokenizer, train_df: pd.DataFrame = None, valid_df: pd.DataFrame = None, test_df: pd.DataFrame = None):
        super().__init__()

        self.cfg = cfg
        self.tokenizer = tokenizer
        self.train_df = train_df
        self.valid_df = valid_df
        self.test_df = test_df

    def prepare_data(self):
        """Build the datasets: test only when `test_df` is given, otherwise train and validation."""
        # NOTE(review): datasets are stored on `self` here rather than in `setup` —
        # confirm this is intended if more than one process is ever used.
        if self.test_df is not None:
            self.test_dataset = NBMEDataset(
                cfg=self.cfg,
                input_df=self.test_df,
                tokenizer=self.tokenizer,
                phase="test"
            )
            return

        self.train_dataset = NBMEDataset(
            cfg=self.cfg,
            input_df=self.train_df,
            tokenizer=self.tokenizer,
            phase="train"
        )
        # Validation rows also carry labels, so they use the "train" phase too.
        self.val_dataset = NBMEDataset(
            cfg=self.cfg,
            input_df=self.valid_df,
            tokenizer=self.tokenizer,
            phase="train"
        )

    def _make_loader(self, dataset, batch_size, shuffle):
        """Create a DataLoader with the settings shared by all three splits."""
        return DataLoader(
            dataset,
            batch_size=batch_size,
            num_workers=self.cfg.num_workers,
            shuffle=shuffle,
            pin_memory=True,
            drop_last=False,
        )

    def train_dataloader(self):
        return self._make_loader(self.train_dataset, self.cfg.train_batch_size, shuffle=True)

    def val_dataloader(self):
        return self._make_loader(self.val_dataset, self.cfg.valid_batch_size, shuffle=False)

    def predict_dataloader(self):
        return self._make_loader(self.test_dataset, self.cfg.test_batch_size, shuffle=False)


# ====================================
# Model #
# ====================================
class NBMEModel(nn.Module):
    """Token-level classifier: pretrained backbone -> LSTM -> dropout -> linear head.

    The LSTM is configured by `cfg.lstm_params`. The head takes
    `2 * cfg.hidden_size` features, i.e. it expects a bidirectional LSTM.
    """
    def __init__(self, cfg):
        super().__init__()

        self.cfg = cfg
        self.backbone = get_backbone(self.cfg)
        self.dropout = nn.Dropout(self.cfg.dropout)
        self.lstm = nn.LSTM(self.cfg.hidden_size, self.cfg.hidden_size, **self.cfg.lstm_params)
        self.classifier = nn.Linear(self.cfg.hidden_size * 2, self.cfg.num_class)
        # Head first, then the name-based pass over all parameters.
        self._init_weights(self.classifier)
        self._reinitialize()

    def _init_weights(self, module):
        """Initialise a Linear, Embedding or LayerNorm module using `cfg.initializer_range`."""
        std = self.cfg.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def _reinitialize(self):
        """
        Tensorflow/Keras-like initialization, selected by substrings of the parameter name.
        """
        for name, param in self.named_parameters():
            tensor = param.data

            if 'lstm' in name:
                if 'weight_ih' in name:
                    nn.init.xavier_uniform_(tensor)
                elif 'weight_hh' in name:
                    nn.init.orthogonal_(tensor)
                elif 'bias_ih' in name:
                    tensor.fill_(0)
                    # Second quarter of the bias vector is the forget gate: set it to 1
                    quarter = param.size(0) // 4
                    tensor[quarter:2 * quarter].fill_(1)
                elif 'bias_hh' in name:
                    tensor.fill_(0)
                continue

            # NOTE(review): nothing defined in this class is named 'fc'; this only fires
            # if some parameter name (e.g. inside the backbone) contains 'fc' — confirm.
            if 'fc' in name:
                if 'weight' in name:
                    nn.init.xavier_uniform_(tensor)
                elif 'bias' in name:
                    tensor.fill_(0)

    def forward(self, input_ids, attention_mask=None):
        """Return per-token logits of shape (batch_size, seq_len, num_class)."""
        backbone_out = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        hidden = backbone_out[0]  # last hidden states: (batch_size, seq_len, hidden_size)
        hidden, _ = self.lstm(hidden)
        logits = self.classifier(self.dropout(hidden))

        return logits


class NBMELightningModule(pl.LightningModule):
    """Lightning wrapper around `NBMEModel`: masked loss, validation scoring and prediction.

    Args:
        cfg: Config object (model, loss, optimizer and scheduler settings).
        tokenizer: Tokenizer used to map token predictions back to characters
            during validation.
        valid_df: Validation frame; its `pn_history` column is read during validation.
        valid_labels: Ground-truth spans passed to `get_score` during validation.
    """
    def __init__(self, cfg, tokenizer=None, valid_df=None, valid_labels=None):
        super().__init__()

        self.cfg = cfg
        self.model = NBMEModel(self.cfg)
        self.criterion = get_criterion(self.cfg)
        self.tokenizer = tokenizer
        self.valid_df = valid_df
        self.valid_labels = valid_labels

    def setup(self, stage=None):
        """Compute the scheduler's total and warmup step counts when fitting.

        Raises:
            NotImplementedError: if `cfg.scheduler_cycle` is neither "one-cycle" nor "epoch".
        """
        if stage == "fit":
            if self.cfg.scheduler_cycle == "one-cycle":
                num_epochs = self.trainer.max_epochs
            elif self.cfg.scheduler_cycle == "epoch":
                num_epochs = 1
            else:
                raise NotImplementedError
            # Optimizer steps per epoch = batches per epoch / accumulation factor, rounded up.
            steps_per_epoch = math.ceil(
                len(self.trainer.datamodule.train_dataloader()) / self.trainer.accumulate_grad_batches
            )
            self.training_steps = steps_per_epoch * num_epochs
            # Use 0 (not None) when no warmup is configured, so the value stays an integer step count.
            ratio = self.cfg.scheduler_warmup_ratio
            self.warmup_steps = int(self.training_steps * ratio) if ratio else 0

    def forward(self, input_ids, attention_mask):
        return self.model(input_ids, attention_mask)

    def _masked_loss(self, y_preds, labels):
        """Mean of the per-token loss over positions whose label is not -1."""
        flat_labels = labels.view(-1, 1)
        loss = self.criterion(y_preds.view(-1, 1), flat_labels)
        return torch.masked_select(loss, flat_labels != -1).mean()

    def training_step(self, batch, batch_idx):
        input_ids, attention_mask, labels = batch["input_ids"], batch["attention_mask"], batch["labels"]
        y_preds = self.forward(input_ids, attention_mask)
        loss = self._masked_loss(y_preds, labels)
        self.log("train/loss", loss, on_step=True, on_epoch=True, logger=True, prog_bar=True)

        return loss

    def validation_step(self, batch, batch_idx):
        input_ids, attention_mask, labels = batch["input_ids"], batch["attention_mask"], batch["labels"]
        y_preds = self.forward(input_ids, attention_mask)
        loss = self._masked_loss(y_preds, labels)
        self.log("val/loss", loss, on_step=True, on_epoch=True, logger=True, prog_bar=True)

        return {
            "loss": loss,
            # Store probabilities, as predict_step does: validation_epoch_end applies
            # a 0.5 threshold, which is only meaningful on sigmoid outputs.
            "preds": y_preds.detach().sigmoid()
        }

    def validation_epoch_end(self, outputs):
        """Turn token probabilities into character spans and log the resulting score."""
        # squeeze(-1) removes only the trailing class axis; a bare squeeze() would also
        # drop the batch axis when a single sample is present.
        preds = torch.cat([output["preds"] for output in outputs]).squeeze(-1).cpu().numpy()
        char_preds = get_token_probs_to_char_probs(self.valid_df["pn_history"].to_numpy(), preds, self.tokenizer)
        results = get_results(self.cfg, char_preds, th=0.5)
        preds = get_predictions(results)
        score = get_score(self.valid_labels, preds)
        self.log("val/micro-F1", score, logger=True, prog_bar=True)

    def predict_step(self, batch, batch_idx, dataloader_idx=None):
        """Return per-token probabilities with the trailing class axis removed."""
        input_ids, attention_mask = batch["input_ids"], batch["attention_mask"]
        y_preds = self.forward(input_ids, attention_mask)
        y_preds = y_preds.sigmoid()

        # Keep the batch axis even for a batch of one, so per-batch outputs can be
        # concatenated afterwards.
        return y_preds.squeeze(-1)

    def configure_optimizers(self):
        """Build the optimizer and, when `cfg.scheduler_name` is set, its scheduler."""
        optimizer_params = get_optimizer_params(self.model, self.cfg.encoder_lr, self.cfg.decoder_lr, self.cfg.weight_decay)
        optimizer = get_optimizer(self.cfg, optimizer_params)

        if self.cfg.scheduler_name is None:
            return [optimizer]
        else:
            scheduler = get_scheduler(self.cfg, optimizer, num_warmup_steps=self.warmup_steps, num_training_steps=self.training_steps)
            scheduler = {"scheduler": scheduler, "interval": self.cfg.scheduler_interval}

            return [optimizer], [scheduler]


# ====================================
# Criterion, Optimizer, Scheduler #
# ====================================
def get_criterion(cfg):
    """Instantiate the `torch.nn` loss named by `cfg.loss_name` with `cfg.loss_params`."""
    loss_cls = getattr(nn, cfg.loss_name)
    return loss_cls(**cfg.loss_params)


def get_optimizer(cfg, parameters):
    """Instantiate the `torch.optim` optimizer named by `cfg.optimizer_name`.

    `cfg.optimizer_params` are passed as keyword defaults; values set inside
    individual parameter groups take precedence over them.
    """
    optimizer_cls = getattr(optim, cfg.optimizer_name)
    return optimizer_cls(parameters, **cfg.optimizer_params)


def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
    """Split `model`'s parameters into three optimizer groups.

    1. Backbone parameters whose name contains none of the no-decay markers:
       `encoder_lr`, `weight_decay`.
    2. Backbone parameters whose name contains a no-decay marker: `encoder_lr`, no decay.
    3. Every parameter whose name does not contain "backbone": `decoder_lr`, no decay.
    """
    no_decay = ("bias", "LayerNorm.bias", "LayerNorm.weight")

    backbone_decay, backbone_no_decay = [], []
    for name, param in model.backbone.named_parameters():
        is_no_decay = any(marker in name for marker in no_decay)
        (backbone_no_decay if is_no_decay else backbone_decay).append(param)

    head_params = []
    for name, param in model.named_parameters():
        if "backbone" not in name:
            head_params.append(param)

    return [
        {'params': backbone_decay, 'lr': encoder_lr, 'weight_decay': weight_decay},
        {'params': backbone_no_decay, 'lr': encoder_lr, 'weight_decay': 0.0},
        {'params': head_params, 'lr': decoder_lr, 'weight_decay': 0.0},
    ]


def get_scheduler(cfg, optimizer, num_warmup_steps=None, num_training_steps=None):
    """Build the learning-rate scheduler named by `cfg.scheduler_name`.

    "cosine-warmup" and "linear-warmup" use the transformers warmup schedules and
    receive the step counts. Any other name is looked up in
    `torch.optim.lr_scheduler` and built from `cfg.scheduler_params` alone.
    """
    name = cfg.scheduler_name
    extra = cfg.scheduler_params

    if name not in ("cosine-warmup", "linear-warmup"):
        scheduler_cls = getattr(optim.lr_scheduler, name)
        return scheduler_cls(optimizer, **extra)

    if name == "cosine-warmup":
        build = get_cosine_schedule_with_warmup
    else:
        build = get_linear_schedule_with_warmup

    return build(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps,
        **extra
    )


# ====================================
# Train & Predict #
# ====================================
def train_fold(cfg, train_df, valid_df, tokenizer, fold, valid_labels):
    """Train one fold and save its checkpoint under `cfg.model_dir`.

    Seeds everything with `cfg.seed`, opens a wandb run named after the experiment
    and fold, fits the Lightning module, then closes the run and releases GPU memory.

    Args:
        cfg: Config object.
        train_df: Training rows for this fold.
        valid_df: Validation rows for this fold.
        tokenizer: Tokenizer shared by the data module and the model.
        fold: Fold id, used in the run name and the checkpoint file name.
        valid_labels: Ground-truth spans handed to the Lightning module for scoring.
    """
    # Seed
    seed_everything(cfg.seed)

    # Wandb
    wandb_logger = WandbLogger(
        project=cfg.competition_name,
        group=cfg.group,
        name=f"exp{cfg.exp_id}-fold-{fold}",
        job_type=f"exp{cfg.exp_id}",
        reinit=True,
        anonymous="must",
    )

    # Model Checkpoint
    # The file name has no epoch suffix; which checkpoints are kept is governed by cfg.checkpoint_params.
    checkpoint = ModelCheckpoint(
        dirpath=cfg.model_dir,
        # filename=f"exp{cfg.exp_id}-fold-{fold}" + "-{epoch}",
        filename=f"exp{cfg.exp_id}-fold-{fold}",
        **cfg.checkpoint_params,
    )

    # Learning Rate
    lr_monitor = LearningRateMonitor(logging_interval="step")
    callbacks = [checkpoint, lr_monitor]

    # Early Stopping
    if cfg.early_stopping:
        early_stopping = EarlyStopping(**cfg.early_stopping_params)
        callbacks += [early_stopping]
    
    # DataModule
    lightning_datamodule = NBMEDataModule(
        cfg=cfg,
        tokenizer=tokenizer,
        train_df=train_df,
        valid_df=valid_df,
    )

    # Model
    lightning_model = NBMELightningModule(
        cfg,
        tokenizer,
        valid_df,
        valid_labels,
    )

    # Trainer
    trainer = Trainer(
        gpus=cfg.gpus,
        max_epochs=cfg.max_epochs,
        callbacks=callbacks,
        logger=[wandb_logger],
        accumulate_grad_batches=cfg.accumulate_grad_batches,
        precision=cfg.precision,
        # deterministic=True,
        benchmark=False,
    )

    trainer.fit(lightning_model, datamodule=lightning_datamodule)
    wandb.finish(quiet=True)

    # Drop the references and clear the CUDA cache before the next fold starts.
    del lightning_datamodule, lightning_model, trainer

    gc.collect()
    torch.cuda.empty_cache()


def train_cv(cfg, input_df, tokenizer):
    """Train (where needed) and evaluate every fold listed in `cfg.train_fold`.

    For each selected fold: build the train/validation frames, train unless a
    checkpoint with the fold's name already exists in `cfg.model_dir`, compute
    out-of-fold character probabilities and run the per-fold scoring. Afterwards
    the pooled out-of-fold predictions are scored with the per-case thresholds in
    `cfg.pred_threshold` and the result is logged through `cfg.logger`.

    Args:
        cfg: Config object.
        input_df: Full training frame with a `fold` column.
        tokenizer: Tokenizer shared by training and prediction.
    """
    oof_char_probs = []
    # Validation frames are collected here and concatenated once after the loop,
    # instead of re-concatenating a growing DataFrame on every fold.
    valid_frames = []

    for fold_id in range(cfg.num_fold):
        if fold_id not in cfg.train_fold:
            continue

        filename = f"exp{cfg.exp_id}-fold-{fold_id}"
        filelist = get_filname_listdir(cfg.model_dir)

        train_df = input_df[input_df["fold"] != fold_id].reset_index(drop=True)
        if cfg.use_pseudo_train:
            train_df = get_and_merge_external_data(cfg, train_df, fold_id) # merge external train
        valid_df = input_df[input_df["fold"] == fold_id].reset_index(drop=True)
        valid_df["labels"] = create_labels_for_scoring(valid_df)

        # training (skipped when a file with this fold's name is already in model_dir)
        if filename not in filelist:
            train_fold(
                cfg=cfg,
                train_df=train_df,
                valid_df=valid_df,
                tokenizer=tokenizer,
                fold=fold_id,
                valid_labels=valid_df["labels"].to_numpy(),
            )

        # oof
        char_probs = predict(
            cfg=cfg,
            input_df=valid_df,
            tokenizer=tokenizer,
            filename=filename,
            labels=valid_df["labels"].to_numpy(),
        )
        # scoring and optimize threshold for each case
        get_score_and_threshold(cfg, char_probs, valid_df, fold_id)

        oof_char_probs += char_probs
        valid_frames.append(valid_df)

    # Fall back to an empty frame when no fold was selected.
    true_df = pd.concat(valid_frames, axis=0) if valid_frames else pd.DataFrame()

    get_score_and_threshold(cfg, oof_char_probs, true_df.reset_index(drop=True), "cv")
    results = get_results(cfg, oof_char_probs, cases=true_df["case_num"].to_list())
    preds = get_predictions(results)
    oof_score = get_score(true_df["labels"].to_list(), preds)
    cfg.logger.info(f"optimized case-threshold cv-score: {oof_score}")


def predict_raw_prediction(cfg, input_df, tokenizer, filename, labels=None):
    """Run the checkpoint `<cfg.model_dir>/<filename>.ckpt` over `input_df`.

    Args:
        cfg: Config object.
        input_df: Rows to predict on; they are wrapped in a "test"-phase dataset.
        tokenizer: Tokenizer used by the data module.
        filename: Checkpoint file name without the `.ckpt` extension.
        labels: Optional ground-truth spans, stored on the loaded module.

    Returns:
        A NumPy array holding the concatenated `predict_step` outputs of all batches.
    """
    checkpoint_path = os.path.join(cfg.model_dir, filename + ".ckpt")

    # `load_from_checkpoint` is a classmethod that constructs the module itself, so
    # call it on the class. Building an instance first only to discard it would load
    # the backbone twice.
    lightning_model = NBMELightningModule.load_from_checkpoint(
        checkpoint_path=checkpoint_path,
        cfg=cfg,
        tokenizer=tokenizer,
        valid_df=input_df,
        valid_labels=labels,
    )

    lightning_datamodule = NBMEDataModule(
        cfg,
        tokenizer=tokenizer,
        test_df=input_df
    )

    trainer = Trainer(
        gpus=cfg.gpus,
    )

    preds = trainer.predict(
        lightning_model,
        datamodule=lightning_datamodule,
        return_predictions=True
    )

    # One tensor per batch, joined along the sample axis.
    preds = torch.cat(preds).cpu().numpy()

    # Drop the references and clear the CUDA cache before returning.
    del lightning_datamodule, lightning_model, trainer

    gc.collect()
    torch.cuda.empty_cache()

    return preds
    

def predict(cfg, input_df, tokenizer, filename, labels):
    """Return character-level probabilities for `input_df`, caching the token-level output.

    Token-level predictions are loaded from `<cfg.exp_output_dir>/<filename>.npy`
    when that file exists; otherwise they are computed from the checkpoint and
    saved there. An existing cache file is reused as-is.
    """
    cache_path = os.path.join(cfg.exp_output_dir, f"{filename}.npy")

    if not os.path.isfile(cache_path):
        preds = predict_raw_prediction(cfg, input_df, tokenizer, filename, labels)
        np.save(os.path.join(cfg.exp_output_dir, filename), preds)
    else:
        preds = np.load(cache_path)

    texts = input_df["pn_history"].to_numpy()
    return get_token_probs_to_char_probs(texts, preds, tokenizer)


def predict_cv(cfg, input_df, tokenizer):
    """
    Predict with the CV models.

    Runs every fold checkpoint listed in `cfg.train_fold` over `input_df`,
    averages the character-level probabilities across folds, and converts them to
    span strings using the per-case thresholds in `cfg.pred_threshold`.

    Returns:
        A copy of `input_df` with the predicted spans in a `location` column.
    """
    fold_char_probs = []
    for fold_id in range(cfg.num_fold):
        if fold_id in cfg.train_fold:
            filename = f"exp{cfg.exp_id}-fold-{fold_id}"
            preds = predict_raw_prediction(cfg, input_df, tokenizer, filename)
            char_preds = get_token_probs_to_char_probs(input_df["pn_history"].to_numpy(), preds, tokenizer)
            fold_char_probs.append(char_preds)

    # Each fold yields one array per text, and texts differ in length. Average sample
    # by sample rather than calling np.mean on the nested list, which would first have
    # to build a ragged array.
    mean_char_probs = [np.mean(per_fold, axis=0) for per_fold in zip(*fold_char_probs)]
    results = get_results(cfg, mean_char_probs, cases=input_df["case_num"].to_list())

    output_df = input_df.copy()
    output_df["location"] = results

    return output_df


def get_token_probs_to_char_probs(texts, predictions, tokenizer):
    """
    Reshape predictions from token level to character level.

    Each text is tokenized again (with special tokens and offsets) and the
    prediction of every token is written to the character range given by that
    token's offset mapping. Characters not covered by any token stay 0.

    Args:
        texts: Iterable of note strings.
        predictions: Per-text sequences of token predictions, in token order.
        tokenizer: Callable returning a mapping with an "offset_mapping" entry.

    Returns:
        list of np.ndarray: One array per text, with `len(text)` entries.
    """
    char_probs = [np.zeros(len(text)) for text in texts]

    for buffer, text, token_preds in zip(char_probs, texts, predictions):
        offsets = tokenizer(
            text,
            add_special_tokens=True,
            return_offsets_mapping=True
        )["offset_mapping"]

        # zip stops at the shorter sequence, so surplus predictions are ignored.
        # Offsets are used as-is; a leading space inside a token span is kept.
        for span, token_pred in zip(offsets, token_preds):
            buffer[span[0]: span[1]] = token_pred

    return char_probs


def get_results(cfg, char_probs, th=0.5, cases=None):
    """
    Convert char-level probabilities to ";"-separated span strings.

    Characters whose probability is >= the threshold are selected, their
    indices are shifted by +1, and each run of consecutive indices is written
    as "<first> <last>".

    Args:
        cfg: Configuration object; `pred_threshold[case]` is read when `cases`
            is given.
        char_probs: Iterable of per-character probability arrays.
        th (float): Threshold used for every sample when `cases` is empty/None.
        cases: Optional list of case ids, one per sample. When given, the
            threshold of each sample is `cfg.pred_threshold[case]` and `th`
            is ignored.

    Returns:
        list of str: One span string per sample ("" when nothing is selected).
    """
    def _to_span_string(char_prob, threshold):
        positions = np.where(char_prob >= threshold)[0] + 1
        # Consecutive positions share the same (position - running counter) key.
        counter = itertools.count()
        runs = [
            list(members)
            for _, members in itertools.groupby(positions, key=lambda pos: pos - next(counter))
        ]
        return ";".join([f"{min(run)} {max(run)}" for run in runs])

    if cases:
        # Lazy lookup: a threshold is only fetched for samples that are consumed.
        thresholds = (cfg.pred_threshold[case] for case in cases)
    else:
        thresholds = itertools.repeat(th)

    return [
        _to_span_string(char_prob, threshold)
        for char_prob, threshold in zip(char_probs, thresholds)
    ]


def get_predictions(results):
    """
    Convert span strings into lists of [start, end] pairs.

    Example:
        '3 4;7 9;12 13' -> [[3, 4], [7, 9], [12, 13]]

    Args:
        results: Iterable of ";"-separated span strings; "" means no span.

    Returns:
        list: One list of [start, end] integer pairs per input string.
    """
    def _parse(result):
        if result == "":
            return []
        spans = []
        for chunk in result.split(';'):
            fields = chunk.split()
            spans.append([int(fields[0]), int(fields[1])])
        return spans

    return [_parse(result) for result in results]


def create_labels_for_scoring(df):
    """
    Build the ground-truth span lists used for scoring.

    Every row's "location" value is a list of location strings; an entry may
    itself hold several ";"-separated spans. All spans of a row are collected
    as [start, end] integer pairs.

    Example:
        ['0 1', '3 4;7 8'] -> [[0, 1], [3, 4], [7, 8]]

    The rows are read in order straight from the column, so the result does
    not depend on the frame's index labels, and `df` is not modified.

    Args:
        df (pd.DataFrame): Frame with a "location" column of lists of strings.

    Returns:
        list: One list of [start, end] pairs per row (empty for empty locations).
    """
    truths = []
    for locations in df['location']:
        truth = []
        if locations:
            # Join then split so that entries containing several
            # ";"-separated spans are flattened as well.
            for loc in [s.split() for s in ';'.join(locations).split(';')]:
                start, end = int(loc[0]), int(loc[1])
                truth.append([start, end])
        truths.append(truth)

    return truths


# ====================================
# Metrics #
# ====================================
def micro_f1(preds, truths):
    """
    Micro-averaged F1 over binary arrays.

    All per-sample arrays are concatenated into one long vector each, so the
    score aggregates over every element of every sample.

    Args:
        preds (list of arrays): Binary predictions, one array per sample.
        truths (list of arrays): Binary ground truths, one array per sample.

    Returns:
        float: F1 score.
    """
    flat_preds = np.concatenate(preds)
    flat_truths = np.concatenate(truths)

    return f1_score(y_true=flat_truths, y_pred=flat_preds)


def spans_to_binary(spans, length=None):
    """
    Convert spans to a binary array marking which positions lie inside a span.

    Args:
        spans (list of lists of two ints): [start, end] pairs; `start` is
            inclusive and `end` exclusive (array slicing semantics).
        length (int, optional): Size of the output array. Defaults to the
            largest value found in `spans`, or 0 when `spans` is empty.

    Returns:
        np array [length]: Binarized spans.
    """
    if length is None:
        # np.max raises on an empty sequence, so handle "no spans" explicitly.
        length = np.max(spans) if len(spans) else 0

    binary = np.zeros(length)
    for start, end in spans:
        binary[start:end] = 1

    return binary


def span_micro_f1(preds, truths):
    """
    Micro-averaged F1 on spans.

    Each (prediction, truth) pair is binarized to a common length (the largest
    value found in either span list) and the binary arrays are scored
    together. Pairs where both sides are empty are left out.

    Args:
        preds (list of lists of two ints): Prediction spans.
        truths (list of lists of two ints): Ground truth spans.

    Returns:
        float: f1 score.
    """
    binary_preds, binary_truths = [], []

    for pred_spans, truth_spans in zip(preds, truths):
        has_pred = bool(len(pred_spans))
        has_truth = bool(len(truth_spans))
        if not has_pred and not has_truth:
            continue

        pred_end = np.max(pred_spans) if has_pred else 0
        truth_end = np.max(truth_spans) if has_truth else 0
        length = max(pred_end, truth_end)

        binary_preds.append(spans_to_binary(pred_spans, length))
        binary_truths.append(spans_to_binary(truth_spans, length))

    return micro_f1(binary_preds, binary_truths)


def get_score(y_true, y_pred):
    """
    Score span predictions against ground-truth spans.

    Both arguments are forwarded, in this order, to `span_micro_f1`.

    Args:
        y_true (list of lists of two ints): Ground truth spans per sample.
        y_pred (list of lists of two ints): Predicted spans per sample.

    Returns:
        float: Span-level micro F1.
    """
    return span_micro_f1(y_true, y_pred)


def optimize_threshold(cfg, valid_labels, char_probs):
    """
    Grid-search the probability threshold that maximizes the span F1.

    Candidates are np.arange(0.40, 0.70, 0.01), each rounded to two decimals.
    On ties the first (lowest) candidate wins.

    Args:
        cfg: Configuration object forwarded to `get_results`.
        valid_labels: Ground-truth spans per sample.
        char_probs: Char-level probabilities per sample.

    Returns:
        tuple: (best threshold, best score). Stays at (0.5, 0.0) when no
        candidate scores above 0.
    """
    best_thres, best_score = 0.5, 0.0

    for raw_th in np.arange(0.40, 0.70, 0.01):
        candidate = np.round(raw_th, 2)
        span_strings = get_results(cfg, char_probs, th=candidate)
        score = get_score(valid_labels, get_predictions(span_strings))

        if score > best_score:
            best_thres, best_score = candidate, score

    return best_thres, best_score


def get_score_and_threshold(cfg, pred_char_probs, valid_df, fold_id):
    """
    Score and optimize the threshold per case and over all rows, then log it.

    Nothing is returned; every result is written through `cfg.logger.info`.

    Args:
        cfg: Configuration object; `logger` is used here and the object is
            forwarded to `optimize_threshold`.
        pred_char_probs: Char-level probabilities, one entry per row of
            `valid_df`, in row order.
        valid_df (pd.DataFrame): Frame with "case_num" and "labels" columns.
            It is copied, so the caller's frame is left unchanged.
        fold_id: Fold identifier; the value "cv" drops the "fold ..." prefix
            from the log lines.
    """
    valid_df = valid_df.copy()
    valid_df["pred_char_probs"] = pred_char_probs

    log_prefix = "" if fold_id == "cv" else f"fold {fold_id}: "

    for case in valid_df["case_num"].unique():
        # Select rows with a boolean mask. Feeding index *labels* to the
        # positional .iloc only works when the index happens to be 0..n-1.
        case_df = valid_df[valid_df["case_num"] == case]
        case_labels = case_df["labels"].to_list()
        case_char_probs = case_df["pred_char_probs"].to_list()
        best_thres, best_score = optimize_threshold(cfg, case_labels, case_char_probs)
        cfg.logger.info(f"{log_prefix}case_num: {case} best_th: {best_thres}  score: {best_score:.5f}")

    best_thres, best_score = optimize_threshold(cfg, valid_df["labels"].to_list(), pred_char_probs)
    cfg.logger.info(f"{log_prefix}best_th: {best_thres}  score: {best_score:.5f}")


# ====================================
# Pseudo labeling #
# ====================================
def get_input_data_for_pseudo_labeling(cfg, nrows: int = None):
    """
    Build the unlabeled (note, feature) table used for pseudo labeling.

    Patient notes whose `pn_num` does not occur in the training CSV are paired
    with every feature of the same case.

    Args:
        cfg: Configuration object; `train_csv`, `features_csv` and
            `patient_notes_csv` are read from it.
        nrows (int, optional): When given, only this many distinct notes
            (sampled with a fixed random state) are kept.

    Returns:
        pd.DataFrame: Columns "id", "case_num", "pn_num", "feature_num",
        "feature_text", "pn_history".
    """
    train_df = pd.read_csv(cfg.train_csv)
    # Read every path from the cfg that was passed in, not from the global Config.
    feature_texts_df = pd.read_csv(cfg.features_csv)
    patient_notes_df = pd.read_csv(cfg.patient_notes_csv)

    # Keep only the notes that have no annotation in the training data.
    train_pn_idx = list(train_df["pn_num"].unique())
    extract_train_df = patient_notes_df[~patient_notes_df["pn_num"].isin(train_pn_idx)].reset_index(drop=True)
    extract_train_df = extract_train_df.merge(feature_texts_df, on=["case_num"], how="left")
    # id = zero-padded pn_num (5 digits) + "_" + zero-padded feature_num (3 digits)
    extract_train_df["id"] = extract_train_df["pn_num"].astype(str).str.zfill(5) + "_" + extract_train_df["feature_num"].astype(str).str.zfill(3)
    extract_train_df["pn_history"] = extract_train_df["pn_history"].apply(clean_feature_text_for_preprocess)
    extract_train_df = extract_train_df.reindex(columns=["id", "case_num", "pn_num", "feature_num", "feature_text", "pn_history"])

    if nrows is not None:
        # Sample whole notes so that every feature row of a chosen note is kept.
        select_idx = extract_train_df.drop_duplicates(subset="pn_num").sample(n=nrows, random_state=2434)["pn_num"].to_list()
        extract_train_df = extract_train_df[extract_train_df["pn_num"].isin(select_idx)].reset_index(drop=True)

    return extract_train_df


def create_external_input(pred_df):
    """
    Turn predicted locations into training-style rows with annotation texts.

    Note: `pred_df` is modified in place. It gains the columns "predict",
    "annotation" and "len_annotation", and its "location" column is replaced
    by the list obtained from splitting the string on ";".

    Args:
        pred_df (pd.DataFrame): Frame with "location" (span strings) and
            "pn_history" columns.

    Returns:
        pd.DataFrame: Rows with at least one annotation, re-indexed from 0,
        without the helper columns "predict" and "len_annotation".
    """
    pred_df["predict"] = get_predictions(pred_df["location"].to_list())

    # Cut the text of every predicted span out of its note.
    histories = pred_df["pn_history"].to_numpy()
    span_lists = pred_df["predict"].to_numpy()
    pred_df["annotation"] = [
        [history[span[0]: span[1]] for span in spans]
        for history, spans in zip(histories, span_lists)
    ]

    pred_df["location"] = pred_df["location"].apply(lambda x: x.split(";"))
    pred_df["len_annotation"] = pred_df["annotation"].apply(len)

    has_annotation = pred_df["len_annotation"] != 0
    kept_df = pred_df[has_annotation].reset_index(drop=True)

    return kept_df.drop(columns=["predict", "len_annotation"], axis=1)


def predict_for_pseudo_labeling(cfg, input_df, tokenizer):
    """
    Create pseudo labels with each fold model separately.

    For every fold id in `range(cfg.num_fold)` that is listed in
    `cfg.train_fold`, the fold's checkpoint predicts `input_df` and the
    resulting rows are written to
    `<cfg.input_dir>/external_train_fold_<fold_id>.csv`.

    Args:
        cfg: Configuration object (`num_fold`, `train_fold`, `exp_id`,
            `input_dir` are read here).
        input_df: Frame with "pn_history" and "case_num" columns.
        tokenizer: Tokenizer forwarded to the prediction helpers.
    """
    for fold_id in range(cfg.num_fold):
        if fold_id not in cfg.train_fold:
            continue

        checkpoint_name = f"exp{cfg.exp_id}-fold-{fold_id}"
        token_preds = predict_raw_prediction(cfg, input_df, tokenizer, checkpoint_name)
        char_preds = get_token_probs_to_char_probs(input_df["pn_history"].to_numpy(), token_preds, tokenizer)
        span_strings = get_results(cfg, char_preds, cases=input_df["case_num"].to_list())

        # Work on a copy: create_external_input modifies the frame it receives.
        fold_df = input_df.copy()
        fold_df["location"] = span_strings
        fold_df = create_external_input(fold_df)
        fold_df["fold"] = fold_id

        csv_path = os.path.join(cfg.input_dir, f"external_train_fold_{fold_id}.csv")
        fold_df.to_csv(csv_path, index=False)

        del fold_df

In [None]:
def main(Config):
    """
    Run the whole pipeline: setup, optional training, prediction, export.

    Steps, in order:
      1. Pass `Config` through `setup` and attach a logger to the result.
      2. Load the train and test data and add the CV split to the train data.
      3. Train the CV models unless `Config.inference_only` is set.
      4. Predict the test data with `predict_cv`.
      5. If `Config.upload_from_colab` is set, upload `Config.exp_output_dir`
         as a new Kaggle dataset.
      6. If `Config.on_colab` is not set, write `submission.csv` into
         `Config.submission`.

    Args:
        Config: Configuration object. Inside this function the name refers to
            the value returned by `setup`.
    """
    # setup
    Config = setup(Config)
    Config.logger = Logger(Config.exp_output_dir)
    # load dataset
    train_df = get_input_data(Config, input_type="train")
    test_df = get_input_data(Config, input_type="test")
    # submission_df = pd.read_csv(Config.sample_submission)
    # extract_train_df = get_input_data_for_pseudo_labeling(Config, nrows=3000)

    # split
    train_df = get_split(Config, train_df)

    # tokenizer
    tokenizer = get_tokenizer(Config)

    if not Config.inference_only:
        # training
        train_cv(
            cfg=Config,
            input_df=train_df,
            tokenizer=tokenizer,
        )

    # predict
    raw_pred_df = predict_cv(
        cfg=Config,
        input_df=test_df,
        tokenizer=tokenizer,
    )

    # pseudo-labeling (disabled; needs extract_train_df from the commented-out line above)
    # predict_for_pseudo_labeling(cfg=Config, input_df=extract_train_df, tokenizer=tokenizer)

    # upload output to kaggle dataset
    if Config.upload_from_colab:
        # Imported here so the kaggle package is only needed when uploading.
        from kaggle.api.kaggle_api_extended import KaggleApi

        def dataset_create_new(dataset_name, upload_dir):
            """Write dataset-metadata.json into `upload_dir` and create a new Kaggle dataset from it."""
            dataset_metadata = {}
            # The KAGGLE_USERNAME environment variable must be set, otherwise this raises KeyError.
            dataset_metadata['id'] = f'{os.environ["KAGGLE_USERNAME"]}/{dataset_name}'
            dataset_metadata['licenses'] = [{'name': 'CC0-1.0'}]
            dataset_metadata['title'] = dataset_name
            with open(os.path.join(upload_dir, 'dataset-metadata.json'), 'w') as f:
                json.dump(dataset_metadata, f, indent=4)
            api = KaggleApi()
            api.authenticate()
            api.dataset_create_new(folder=upload_dir, convert_to_csv=False, dir_mode='tar')

        dataset_create_new(dataset_name=f"{Config.competition_name}-exp{Config.exp_id}", upload_dir=Config.exp_output_dir)

    # make submission (skipped when running on Colab)
    if not Config.on_colab:
        raw_pred_df[["id", "location"]].to_csv(os.path.join(Config.submission, "submission.csv"), index=False)


# Entry point: run the full pipeline. `Config` is expected to be defined in an
# earlier cell of the notebook.
if __name__ == "__main__":
    main(Config)

[34m[1mwandb[0m: Currently logged in as: [33mazupero[0m (use `wandb login --relogin` to force relogin)
  "num_layers={}".format(dropout, num_layers))
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

[2022-04-09 12:25:11] - fold 0: case_num: 0 best_th: 0.4  score: 0.90663
[2022-04-09 12:25:13] - fold 0: case_num: 1 best_th: 0.47  score: 0.90981
[2022-04-09 12:25:17] - fold 0: case_num: 2 best_th: 0.43  score: 0.82591
[2022-04-09 12:25:36] - fold 0: case_num: 3 best_th: 0.59  score: 0.94845
[2022-04-09 12:25:42] - fold 0: case_num: 4 best_th: 0.51  score: 0.90612
[2022-04-09 12:25:53] - fold 0: case_num: 5 best_th: 0.48  score: 0.92352
[2022-04-09 12:25:56] - fold 0: case_num: 6 best_th: 0.51  score: 0.92517
[2022-04-09 12:26:02] - fold 0: case_num: 7 best_th: 0.69  score: 0.91518
[2022-04-09 12:26:10] - fold 0: case_num: 8 best_th: 0.48  score: 0.93578
[2022-04-09 12:26:20] - fold 0: case_num: 9 best_th: 0.55  score: 0.88229
[2022-04-09 12:27:44] - fold 0: best_th: 0.48  score: 0.91440
  "num_layers={}".format(dropout, num_layers))
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_R

Predicting: 0it [00:00, ?it/s]

[2022-04-09 12:39:09] - fold 1: case_num: 0 best_th: 0.67  score: 0.92388
[2022-04-09 12:39:10] - fold 1: case_num: 1 best_th: 0.69  score: 0.88784
[2022-04-09 12:39:14] - fold 1: case_num: 2 best_th: 0.55  score: 0.88204
[2022-04-09 12:39:31] - fold 1: case_num: 3 best_th: 0.49  score: 0.95340
[2022-04-09 12:39:37] - fold 1: case_num: 4 best_th: 0.69  score: 0.91541
[2022-04-09 12:39:49] - fold 1: case_num: 5 best_th: 0.46  score: 0.93943
[2022-04-09 12:39:53] - fold 1: case_num: 6 best_th: 0.61  score: 0.89638
[2022-04-09 12:39:59] - fold 1: case_num: 7 best_th: 0.68  score: 0.91178
[2022-04-09 12:40:06] - fold 1: case_num: 8 best_th: 0.58  score: 0.93930
[2022-04-09 12:40:16] - fold 1: case_num: 9 best_th: 0.69  score: 0.87623
[2022-04-09 12:41:39] - fold 1: best_th: 0.56  score: 0.92220
  "num_layers={}".format(dropout, num_layers))
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_

Predicting: 0it [00:00, ?it/s]

[2022-04-09 12:53:03] - fold 2: case_num: 0 best_th: 0.61  score: 0.90819
[2022-04-09 12:53:06] - fold 2: case_num: 1 best_th: 0.66  score: 0.88610
[2022-04-09 12:53:10] - fold 2: case_num: 2 best_th: 0.62  score: 0.87924
[2022-04-09 12:53:28] - fold 2: case_num: 3 best_th: 0.55  score: 0.94873
[2022-04-09 12:53:35] - fold 2: case_num: 4 best_th: 0.63  score: 0.91864
[2022-04-09 12:53:46] - fold 2: case_num: 5 best_th: 0.59  score: 0.92242
[2022-04-09 12:53:49] - fold 2: case_num: 6 best_th: 0.49  score: 0.90959
[2022-04-09 12:53:55] - fold 2: case_num: 7 best_th: 0.68  score: 0.91496
[2022-04-09 12:54:03] - fold 2: case_num: 8 best_th: 0.69  score: 0.94151
[2022-04-09 12:54:14] - fold 2: case_num: 9 best_th: 0.69  score: 0.87346
[2022-04-09 12:55:37] - fold 2: best_th: 0.69  score: 0.91867
[2022-04-09 12:55:54] - case_num: 0 best_th: 0.61  score: 0.91183
[2022-04-09 12:56:01] - case_num: 1 best_th: 0.54  score: 0.89058
[2022-04-09 12:56:15] - case_num: 2 best_th: 0.5  score: 0.85834
[

Predicting: 0it [00:00, ?it/s]

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:01, ?it/s]

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]