In [1]:
!nvidia-smi

Sun May  1 12:52:38 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P0    25W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install -q pytorch-lightning wandb torchmetrics transformers sentencepiece
!pip install -q --upgrade --force-reinstall --no-deps kaggle

[K     |████████████████████████████████| 582 kB 4.2 MB/s 
[K     |████████████████████████████████| 1.8 MB 76.3 MB/s 
[K     |████████████████████████████████| 408 kB 77.2 MB/s 
[K     |████████████████████████████████| 4.0 MB 45.2 MB/s 
[K     |████████████████████████████████| 1.2 MB 56.1 MB/s 
[K     |████████████████████████████████| 596 kB 64.6 MB/s 
[K     |████████████████████████████████| 136 kB 88.0 MB/s 
[K     |████████████████████████████████| 1.1 MB 58.7 MB/s 
[K     |████████████████████████████████| 144 kB 82.9 MB/s 
[K     |████████████████████████████████| 181 kB 72.8 MB/s 
[K     |████████████████████████████████| 63 kB 1.9 MB/s 
[K     |████████████████████████████████| 77 kB 5.2 MB/s 
[K     |████████████████████████████████| 6.6 MB 49.0 MB/s 
[K     |████████████████████████████████| 895 kB 53.9 MB/s 
[K     |████████████████████████████████| 144 kB 86.8 MB/s 
[K     |████████████████████████████████| 271 kB 84.4 MB/s 
[K     |████████████████████

In [3]:
!mkdir /root/.kaggle
!cp /content/drive/MyDrive/Colab/kaggle/kaggle.json /root/.kaggle/kaggle.json

In [4]:
# import deberta-v2-v3-fast-tokenizer
import shutil
from pathlib import Path

transformers_path = Path("/usr/local/lib/python3.7/dist-packages/transformers")
input_dir = Path("/content/drive/MyDrive/Colab/kaggle/nbme-score-clinical-patient-notes/input/deberta-v2-v3-fast-tokenizer")

convert_file = input_dir / "convert_slow_tokenizer.py"
conversion_path = transformers_path/convert_file.name

if conversion_path.exists():
    conversion_path.unlink()

shutil.copy(convert_file, transformers_path)
deberta_v2_path = transformers_path / "models" / "deberta_v2"

for filename in ['tokenization_deberta_v2.py', 'tokenization_deberta_v2_fast.py']:
    filepath = deberta_v2_path/filename
    
    if filepath.exists():
        filepath.unlink()

    shutil.copy(input_dir/filename, filepath)

In [5]:
import os
import gc
import sys
import json
import itertools
from tqdm.auto import tqdm
import logging
import datetime
import ast
import numpy as np
import pandas as pd
import sklearn.model_selection as sms
from sklearn.metrics import f1_score
import math
import re
import lightgbm as lgb

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

import pytorch_lightning as pl
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, LearningRateMonitor
from pytorch_lightning.loggers import WandbLogger

from transformers import AutoConfig, AutoModel, AutoTokenizer, get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup
from transformers.models.deberta_v2.tokenization_deberta_v2_fast import DebertaV2TokenizerFast

import wandb

%env TOKENIZERS_PARALLELISM=true

env: TOKENIZERS_PARALLELISM=true


In [6]:
class Config:
    # ==============================
    # Globals #
    # ==============================
    competition_name = "nbme-score-clinical-patient-notes"
    group = "DeBERTa-v3-large"
    exp_id = "018"
    debug = False
    inference_only = False
    upload_from_colab = False
    colab_dir = "/content/drive/MyDrive/Colab/kaggle/nbme-score-clinical-patient-notes"
    kaggle_json_path = "/root/.kaggle/kaggle.json"
    kaggle_dataset_path = None
    gpus = 1
    seed = 2434
    max_epochs = 5
    accumulate_grad_batches = 4
    precision = 32
    num_fold = 5
    train_fold = [0,1,2,3,4] # 実行するfold
    pred_threshold = {
        0: 0.52,
        1: 0.55,
        2: 0.45,
        3: 0.69,
        4: 0.4,
        5: 0.4,
        6: 0.43,
        7: 0.63,
        8: 0.64,
        9: 0.52,
        # best_th: 0.57
    }
    use_pseudo_train = False
    # ==============================
    # Dataloader #
    # ==============================
    train_batch_size = 2
    valid_batch_size = 32
    test_batch_size = 32
    num_workers = 8
    # ==============================
    # Split #
    # ==============================
    split_name = "StratifiedGroupKFold"
    split_params = {
        "n_splits": num_fold if not debug else 4,
        "shuffle": True,
        "random_state": seed,
    }
    # ==============================
    # Model #
    # ==============================
    model_name = "microsoft/deberta-v3-large"
    max_length = 512
    hidden_size = 1024
    num_class = 1
    use_backbone_dropout = True
    dropout = 0.2
    initializer_range = 0.02
    lstm_params = {
        "num_layers": 1,
        "batch_first": True,
        "bidirectional": True,
        "dropout": 0.2,
    }
    # ==============================
    # Loss #
    # ==============================
    loss_name = "BCEWithLogitsLoss"
    loss_params = {
        "reduction": "none"
    }
    # ==============================
    # Optimizer #
    # ==============================
    optimizer_name = "AdamW"
    optimizer_params = {
        "lr": 2e-5,
        "weight_decay": 1e-2,
        "eps": 1e-6,
        "betas": (0.9, 0.999)
    }
    encoder_lr = 2e-5
    decoder_lr = 2e-5
    weight_decay = 0.01
    # ==============================
    # Scheduler #
    # ==============================
    scheduler_name = "cosine-warmup"
    scheduler_warmup_ratio = 0.1
    scheduler_params = {}
    scheduler_interval = "step"
    scheduler_cycle = "one-cycle" # epoch or one-cycle
    # ==============================
    # Callbacks #
    # ==============================
    checkpoint_params = {
        "monitor": "val/micro-F1",
        "save_top_k": 1,
        "save_weights_only": True,
        "mode": "max",
        "verbose": True,
    }
    early_stopping = False
    early_stopping_params = {
        "monitor": "val/loss",
        "min_delta": 0.0,
        "patience": 8,
        "verbose": False,
        "mode": "min",
    }
    # ==============================
    # LightGBM #
    # ==============================
    lgb_params = {
        "objective": "binary",
        "metric": "binary_logloss",
        "boosting_type": "gbdt",
        "learning_rate": 0.01,
        "max_depth": 4,
        "num_leaves": int(2**4*0.7),
        "num_threads": 8,
        # "lambda_l1": 0.1,
        # "lambda_l2": 0.1,
        # "bagging_fraction": 0.5,
        # "bagging_freq": 3,
        "feature_fraction": 0.8,
        "min_data_in_leaf": 20,
        "verbosity": -1,
        "num_iterations": 10000,
        "early_stopping_round": 100,
    }

In [7]:
# ====================================
# Setup #
# ====================================
class Logger:
    """ ref) https://github.com/ghmagazine/kagglebook/blob/master/ch04-model-interface/code/util.py"""
    def __init__(self, path):
        self.general_logger = logging.getLogger(path)
        stream_handler = logging.StreamHandler()
        file_general_handler = logging.FileHandler(os.path.join(path, 'Experiment.log'))
        if len(self.general_logger.handlers) == 0:
            self.general_logger.addHandler(stream_handler)
            self.general_logger.addHandler(file_general_handler)
            self.general_logger.setLevel(logging.INFO)

    def info(self, message):
        # display time
        self.general_logger.info('[{}] - {}'.format(self.now_string(), message))

    @staticmethod
    def now_string():
        return str(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))


def setup(cfg):
    cfg.on_colab = "google.colab" in sys.modules
    if cfg.on_colab:
        # kaggle api
        f = open(Config.kaggle_json_path, 'r')
        json_data = json.load(f)
        os.environ["KAGGLE_USERNAME"] = json_data["username"]
        # set input/output dir
        cfg.input_dir = os.path.join(cfg.colab_dir, "input")
        cfg.train_csv = os.path.join(cfg.input_dir, "cleaned_train.csv")
        cfg.external_train_csv = os.path.join(cfg.input_dir, "external_exact_match_train.csv")
        cfg.features_csv = os.path.join(cfg.input_dir, "features.csv")
        cfg.patient_notes_csv = os.path.join(cfg.input_dir, "patient_notes.csv")
        cfg.test_csv = os.path.join(cfg.input_dir, "test.csv")
        cfg.sample_submission = os.path.join(cfg.input_dir, "sample_submission.csv")
        cfg.output_dir = os.path.join(cfg.colab_dir, "output")
        cfg.exp_output_dir = os.path.join(cfg.output_dir, f"exp{cfg.exp_id}")
        cfg.model_dir = os.path.join(cfg.exp_output_dir, "model")

        for d in [cfg.output_dir, cfg.exp_output_dir, cfg.model_dir]:
            os.makedirs(d, exist_ok=True)
            
        # wandb
        wandb.login()
    else:
        cfg.input_dir = f"../input/{cfg.competition_name}"
        cfg.train_csv = os.path.join(cfg.input_dir, "train.csv")
        cfg.features_csv = os.path.join(cfg.input_dir, "features.csv")
        cfg.patient_notes_csv = os.path.join(cfg.input_dir, "patient_notes.csv")
        cfg.test_csv = os.path.join(cfg.input_dir, "test.csv")
        cfg.sample_submission = os.path.join(cfg.input_dir, "sample_submission.csv")
        cfg.submission = "./"
        cfg.exp_output_dir = f"exp{cfg.exp_id}"
        cfg.model_dir = os.path.join(cfg.exp_output_dir, "model")

        if cfg.kaggle_dataset_path is not None:
            cfg.model_dir = os.path.join(cfg.kaggle_dataset_path, "model")

        for d in [cfg.exp_output_dir, cfg.model_dir]:
            os.makedirs(d, exist_ok=True)

    return cfg


# ====================================
# Preprocess #
# ====================================
def get_input_data(cfg, input_type="train"):
    input_df = pd.read_csv(cfg.train_csv) if input_type == "train" else pd.read_csv(cfg.test_csv)
    if cfg.debug and input_type != "test":
        input_df = input_df[input_df["pn_num"].isin(input_df["pn_num"].unique()[:100])].reset_index(drop=True)
    
    feature_texts_df = pd.read_csv(Config.features_csv)
    patient_notes_df = pd.read_csv(Config.patient_notes_csv)

    if input_type == "train":
        # external_df = pd.read_csv(cfg.external_train_csv)
        # external_df = external_df.sample(14300 * 4, random_state=2434)
        # input_df = pd.concat([input_df, external_df], axis=0).reset_index(drop=True)
        input_df["annotation"] = input_df["annotation"].apply(ast.literal_eval)
        input_df["location"] = input_df["location"].apply(ast.literal_eval)
    
    input_df = input_df.merge(feature_texts_df, on=["feature_num", "case_num"], how="left")
    input_df = input_df.merge(patient_notes_df, on=["pn_num", "case_num"], how="left")

    input_df["pn_history"] = input_df["pn_history"].apply(clean_feature_text_for_preprocess)

    return input_df


def get_and_merge_external_data(cfg, train_df: pd.DataFrame, fold: int):
    input_df = pd.read_csv(os.path.join(cfg.input_dir, f"external_train_fold_{fold}.csv"))
    input_df = input_df.sample(20000, random_state=2434)
    input_df["annotation"] = input_df["annotation"].apply(ast.literal_eval)
    input_df["location"] = input_df["location"].apply(ast.literal_eval)
    input_df["pn_history"] = input_df["pn_history"].apply(clean_feature_text_for_preprocess)

    train_df = pd.concat([train_df, input_df], axis=0).reset_index(drop=True)

    return train_df


def get_split(cfg, train_df):
    split_name = cfg.split_name
    split_params = cfg.split_params
    splitter = sms.__getattribute__(split_name)(**split_params)

    groups = train_df["pn_num"].to_numpy()
    train_df["fold"] = -1

    for fold_id, (train_idx, valid_idx) in enumerate(splitter.split(train_df, train_df["case_num"], groups)):
        train_df.loc[valid_idx, "fold"] = int(fold_id)

    return train_df


def get_filname_listdir(dirctory):
    listdir = os.listdir(dirctory)
    out_lst = [os.path.splitext(d)[0] for d in listdir]
    return out_lst


def get_tokenizer(cfg):
    if cfg.kaggle_dataset_path is not None:
        pretrained_dir = os.path.join(cfg.kaggle_dataset_path, "pretrain_tokenizer")
    else:
        pretrained_dir = os.path.join(cfg.exp_output_dir, "pretrain_tokenizer")

    if not os.path.isdir(pretrained_dir):
        # deberta-v2 or deberta-v3
        if ("deberta-v2" in cfg.model_name) or ("deberta-v3" in cfg.model_name):
            tokenizer = DebertaV2TokenizerFast.from_pretrained(cfg.model_name)
        # except for ("roberta", "deberta-v2", "deberta-v3")
        elif "roberta" not in cfg.model_name:
            tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)
        # roberta
        else:
            tokenizer = AutoTokenizer.from_pretrained(cfg.model_name, trim_offsets=False)

        tokenizer.save_pretrained(pretrained_dir)

    else:
        # deberta-v2 or deberta-v3
        if ("deberta-v2" in cfg.model_name) or ("deberta-v3" in cfg.model_name):
            tokenizer = DebertaV2TokenizerFast.from_pretrained(pretrained_dir)
        # except for ("roberta", "deberta-v2", "deberta-v3")
        elif "roberta" not in cfg.model_name:
            tokenizer = AutoTokenizer.from_pretrained(pretrained_dir)
        # roberta
        else:
            tokenizer = AutoTokenizer.from_pretrained(pretrained_dir, trim_offsets=False)

    return tokenizer


def get_backbone(cfg):
    if cfg.kaggle_dataset_path is not None:
        pretrained_dir = os.path.join(cfg.kaggle_dataset_path, "pretrain_model")
    else:
        pretrained_dir = os.path.join(cfg.exp_output_dir, "pretrain_model")

    if not os.path.isdir(pretrained_dir):
        model_config = AutoConfig.from_pretrained(cfg.model_name)
        if not cfg.use_backbone_dropout:
            model_config.attention_probs_dropout_prob = 0.0
            model_config.hidden_dropout_prob = 0.0
        backbone = AutoModel.from_pretrained(cfg.model_name, config=model_config)

        backbone.save_pretrained(pretrained_dir)

    else:
        model_config = AutoConfig.from_pretrained(pretrained_dir)
        if not cfg.use_backbone_dropout:
            model_config.attention_probs_dropout_prob = 0.0
            model_config.hidden_dropout_prob = 0.0
        backbone = AutoModel.from_pretrained(pretrained_dir, config=model_config)

    return backbone


def clean_feature_text_for_preprocess(text: str):
    """
    reference: https://www.kaggle.com/code/theoviel/roberta-strikes-back
    """
    text = re.sub('I-year', '1-year', text)
    text = re.sub('-OR-', " or ", text)
    text = re.sub('-', ' ', text)

    return text


# ====================================
# Dataset #
# ====================================
def get_inputs(cfg, text: str, feature_text: str, tokenizer):
    encoding = tokenizer(
        text,
        feature_text,
        max_length=cfg.max_length,
        padding="max_length",
        return_offsets_mapping=False,
        # add_special_tokens=True
    )

    for k, v in encoding.items():
        encoding[k] = torch.tensor(v, dtype=torch.long)

    return encoding


def get_label(cfg, text: str, locations: list, tokenizer):
    encoding = tokenizer(
        text,
        max_length=cfg.max_length,
        padding="max_length",
        return_offsets_mapping=True,
        # add_special_tokens=True
    )
    
    offset_mapping = encoding["offset_mapping"]
    ignore_idx = np.where(np.array(encoding.sequence_ids()) != 0)[0]
    label = np.zeros(len(offset_mapping))
    label[ignore_idx] = -1

    if len(locations) != 0:
        for location in locations:
            for loc in [s.split() for s in location.split(";")]:
                start_idx = -1
                end_idx = -1
                start, end = int(loc[0]), int(loc[1])
                for idx in range(len(offset_mapping)):
                    # DeBERTaのTokenizerは前の空白も含めるため+1する
                    if (start_idx == -1) & (start < offset_mapping[idx][0]):
                        start_idx = idx - 1
                    if (end_idx == -1) & (end <= offset_mapping[idx][1]):
                        end_idx = idx + 1
                if start_idx == -1:
                    start_idx = end_idx
                if (start_idx != -1) & (end_idx != -1):
                    label[start_idx: end_idx] = 1
    
    return torch.tensor(label, dtype=torch.float)


class NBMEDataset(Dataset):
    def __init__(self, cfg, input_df: pd.DataFrame, tokenizer, phase: str = "train"):
        self.cfg = cfg
        self.input_df = input_df
        self.tokenizer = tokenizer
        self.phase = phase
        self.pn_histories = self.input_df["pn_history"].to_numpy()
        self.feature_texts = self.input_df["feature_text"].to_numpy()
        self.locations = self.input_df["location"].to_numpy() if self.phase is "train" else None

    def __len__(self):
        return len(self.input_df)

    def __getitem__(self, idx):
        if self.phase == "train":
            inputs = get_inputs(
                self.cfg,
                self.pn_histories[idx],
                self.feature_texts[idx],
                self.tokenizer,
            )
            label = get_label(
                self.cfg,
                self.pn_histories[idx],
                self.locations[idx],
                self.tokenizer,
            )

            return {
                "input_ids": inputs["input_ids"],
                "attention_mask": inputs["attention_mask"],
                "labels": label,
            }

        elif self.phase == "test":
            inputs = get_inputs(
                self.cfg,
                self.pn_histories[idx],
                self.feature_texts[idx],
                self.tokenizer,
            )

            return {
                "input_ids": inputs["input_ids"],
                "attention_mask": inputs["attention_mask"],
            }
        else:
            raise NotImplementedError


class NBMEDataModule(pl.LightningDataModule):
    def __init__(self, cfg, tokenizer, train_df: pd.DataFrame = None, valid_df: pd.DataFrame = None, test_df: pd.DataFrame = None):
        super(NBMEDataModule, self).__init__()

        self.cfg = cfg
        self.tokenizer = tokenizer
        self.train_df = train_df
        self.valid_df = valid_df
        self.test_df = test_df

    def prepare_data(self):
        if self.test_df is None:
            self.train_dataset = NBMEDataset(
                cfg=self.cfg,
                input_df=self.train_df,
                tokenizer=self.tokenizer,
                phase="train"
            )
            self.val_dataset = NBMEDataset(
                cfg=self.cfg,
                input_df=self.valid_df,
                tokenizer=self.tokenizer,
                phase="train"
            )
        else:
            self.test_dataset = NBMEDataset(
                cfg=self.cfg,
                input_df=self.test_df,
                tokenizer=self.tokenizer,
                phase="test"
            )

    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.cfg.train_batch_size,
            num_workers=self.cfg.num_workers,
            shuffle=True,
            pin_memory=True,
            drop_last=False,
        )
    
    def val_dataloader(self):
        return DataLoader(
            self.val_dataset,
            batch_size=self.cfg.valid_batch_size,
            num_workers=self.cfg.num_workers,
            shuffle=False,
            pin_memory=True,
            drop_last=False,
        )

    def predict_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=self.cfg.test_batch_size,
            num_workers=self.cfg.num_workers,
            shuffle=False,
            pin_memory=True,
            drop_last=False,
        )


# ====================================
# Model #
# ====================================
class NBMEModel(nn.Module):
    def __init__(self, cfg):
        super(NBMEModel, self).__init__()

        self.cfg = cfg
        self.backbone = get_backbone(self.cfg)
        self.dropout = nn.Dropout(self.cfg.dropout)
        self.lstm = nn.LSTM(self.cfg.hidden_size, self.cfg.hidden_size, **self.cfg.lstm_params)
        self.classifier = nn.Linear(self.cfg.hidden_size * 2, self.cfg.num_class)
        self._init_weights(self.classifier)
        self._reinitialize()

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.cfg.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.cfg.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def _reinitialize(self):
        """
        Tensorflow/Keras-like initialization
        """
        for name, p in self.named_parameters():
            if 'lstm' in name:
                if 'weight_ih' in name:
                    nn.init.xavier_uniform_(p.data)
                elif 'weight_hh' in name:
                    nn.init.orthogonal_(p.data)
                elif 'bias_ih' in name:
                    p.data.fill_(0)
                    # Set forget-gate bias to 1
                    n = p.size(0)
                    p.data[(n // 4):(n // 2)].fill_(1)
                elif 'bias_hh' in name:
                    p.data.fill_(0)
            elif 'fc' in name:
                if 'weight' in name:
                    nn.init.xavier_uniform_(p.data)
                elif 'bias' in name:
                    p.data.fill_(0)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask) # (batch_size, seq_len, hidden_size)
        x = outputs[0] # extract last_hidden_states
        x, _ = self.lstm(x)
        x = self.dropout(x)
        x = self.classifier(x) # (batch_size, seq_len, num_class)

        return x


class NBMELightningModule(pl.LightningModule):
    def __init__(self, cfg, tokenizer=None, valid_df=None, valid_labels=None):
        super(NBMELightningModule, self).__init__()

        self.cfg = cfg
        self.model = NBMEModel(self.cfg)
        self.criterion = get_criterion(self.cfg)
        self.tokenizer = tokenizer
        self.valid_df = valid_df
        self.valid_labels = valid_labels

    def setup(self, stage=None):
        # calculate training total steps
        if stage == "fit":
            if self.cfg.scheduler_cycle == "one-cycle":
                self.training_steps = math.ceil(len(self.trainer.datamodule.train_dataloader()) / self.trainer.accumulate_grad_batches) * self.trainer.max_epochs
            elif self.cfg.scheduler_cycle == "epoch":
                self.training_steps = math.ceil(len(self.trainer.datamodule.train_dataloader()) / self.trainer.accumulate_grad_batches) * 1
            else:
                raise NotImplementedError
            self.warmup_steps = int(self.training_steps * self.cfg.scheduler_warmup_ratio) if self.cfg.scheduler_warmup_ratio else None
    
    def forward(self, input_ids, attention_mask):
        return self.model(input_ids, attention_mask)

    def training_step(self, batch, batch_idx):
        input_ids, attention_mask, labels = batch["input_ids"], batch["attention_mask"], batch["labels"]
        y_preds = self.forward(input_ids, attention_mask)
        loss = self.criterion(y_preds.view(-1, 1), labels.view(-1, 1))
        mask = (labels.view(-1, 1) != -1)
        loss = torch.masked_select(loss, mask).mean()
        self.log("train/loss", loss, on_step=True, on_epoch=True, logger=True, prog_bar=True)

        return loss

    def validation_step(self, batch, batch_idx):
        input_ids, attention_mask, labels = batch["input_ids"], batch["attention_mask"], batch["labels"]
        y_preds = self.forward(input_ids, attention_mask)
        loss = self.criterion(y_preds.view(-1, 1), labels.view(-1, 1))
        mask = (labels.view(-1, 1) != -1)
        loss = torch.masked_select(loss, mask).mean()
        self.log("val/loss", loss, on_step=True, on_epoch=True, logger=True, prog_bar=True)

        return {
            "loss": loss,
            "preds": y_preds.detach()
        }

    def validation_epoch_end(self, outputs):
        preds = torch.cat([output["preds"] for output in outputs]).squeeze().cpu().numpy()
        char_preds = get_token_probs_to_char_probs(self.valid_df["pn_history"].to_numpy(), preds, self.tokenizer)
        results = get_results(self.cfg, char_preds, th=0.5)
        preds = get_predictions(results)
        score = get_score(self.valid_labels, preds)
        self.log("val/micro-F1", score, logger=True, prog_bar=True)

    def predict_step(self, batch, batch_idx, dataloader_idx=None):
        input_ids, attention_mask = batch["input_ids"], batch["attention_mask"]
        y_preds = self.forward(input_ids, attention_mask)
        y_preds = y_preds.sigmoid()

        return y_preds.squeeze()

    def configure_optimizers(self):
        optimizer_params = get_optimizer_params(self.model, self.cfg.encoder_lr, self.cfg.decoder_lr, self.cfg.weight_decay)
        optimizer = get_optimizer(self.cfg, optimizer_params)

        if self.cfg.scheduler_name is None:
            return [optimizer]
        else:
            scheduler = get_scheduler(self.cfg, optimizer, num_warmup_steps=self.warmup_steps, num_training_steps=self.training_steps)
            scheduler = {"scheduler": scheduler, "interval": self.cfg.scheduler_interval}

            return [optimizer], [scheduler]


# ====================================
# Criterion, Optimizer, Scheduler #
# ====================================
def get_criterion(cfg):
    loss_name = cfg.loss_name
    loss_params = cfg.loss_params

    return nn.__getattribute__(loss_name)(**loss_params)


def get_optimizer(cfg, parameters):
    optimizer_name = cfg.optimizer_name
    optimizer_params = cfg.optimizer_params

    return optim.__getattribute__(optimizer_name)(parameters, **optimizer_params)


def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
    # param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in model.backbone.named_parameters() if not any(nd in n for nd in no_decay)],
            'lr': encoder_lr, 'weight_decay': weight_decay},
        {'params': [p for n, p in model.backbone.named_parameters() if any(nd in n for nd in no_decay)],
            'lr': encoder_lr, 'weight_decay': 0.0},
        {'params': [p for n, p in model.named_parameters() if "backbone" not in n],
            'lr': decoder_lr, 'weight_decay': 0.0}
    ]

    return optimizer_parameters


def get_scheduler(cfg, optimizer, num_warmup_steps=None, num_training_steps=None):
    scheduler_name = cfg.scheduler_name
    scheduler_params = cfg.scheduler_params

    if scheduler_name == "cosine-warmup":
        return get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_training_steps,
            **scheduler_params
        )
    elif scheduler_name == "linear-warmup":
        return get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_training_steps,
            **scheduler_params
        )
    else:
        return optim.lr_scheduler.__getattribute__(scheduler_name)(optimizer, **scheduler_params)


# ====================================
# Train & Predict #
# ====================================
def train_fold(cfg, train_df, valid_df, tokenizer, fold, valid_labels):
    # Seed
    seed_everything(cfg.seed)

    # Wandb
    wandb_logger = WandbLogger(
        project=cfg.competition_name,
        group=cfg.group,
        name=f"exp{cfg.exp_id}-fold-{fold}",
        job_type=f"exp{cfg.exp_id}",
        reinit=True,
        anonymous="must",
    )

    # Model Checkpoint
    checkpoint = ModelCheckpoint(
        dirpath=cfg.model_dir,
        # filename=f"exp{cfg.exp_id}-fold-{fold}" + "-{epoch}",
        filename=f"exp{cfg.exp_id}-fold-{fold}",
        **cfg.checkpoint_params,
    )

    # Learning Rate
    lr_monitor = LearningRateMonitor(logging_interval="step")
    callbacks = [checkpoint, lr_monitor]

    # Early Stopping
    if cfg.early_stopping:
        early_stopping = EarlyStopping(**cfg.early_stopping_params)
        callbacks += [early_stopping]
    
    # DataModule
    lightning_datamodule = NBMEDataModule(
        cfg=cfg,
        tokenizer=tokenizer,
        train_df=train_df,
        valid_df=valid_df,
    )

    # Model
    lightning_model = NBMELightningModule(
        cfg,
        tokenizer,
        valid_df,
        valid_labels,
    )

    # Trainer
    trainer = Trainer(
        gpus=cfg.gpus,
        max_epochs=cfg.max_epochs,
        callbacks=callbacks,
        logger=[wandb_logger],
        accumulate_grad_batches=cfg.accumulate_grad_batches,
        precision=cfg.precision,
        # deterministic=True,
        benchmark=False,
    )

    trainer.fit(lightning_model, datamodule=lightning_datamodule)
    wandb.finish(quiet=True)

    del lightning_datamodule, lightning_model, trainer

    gc.collect()
    torch.cuda.empty_cache()


def train_cv(cfg, input_df, tokenizer):
    oof_char_probs = []
    true_df = pd.DataFrame()

    for fold_id in range(cfg.num_fold):
        if fold_id in cfg.train_fold:
            filename = f"exp{cfg.exp_id}-fold-{fold_id}"
            filelist = get_filname_listdir(cfg.model_dir)

            train_df = input_df[input_df["fold"] != fold_id].reset_index(drop=True)
            if cfg.use_pseudo_train:
                train_df = get_and_merge_external_data(cfg, train_df, fold_id) # merge external train
            valid_df = input_df[input_df["fold"] == fold_id].reset_index(drop=True)
            valid_df["labels"] = create_labels_for_scoring(valid_df)

            # training
            if not filename in filelist:
                train_fold(
                    cfg=cfg,
                    train_df=train_df,
                    valid_df=valid_df,
                    tokenizer=tokenizer,
                    fold=fold_id,
                    valid_labels=valid_df["labels"].to_numpy(),
                )

            # oof
            char_probs = predict(
                cfg=cfg,
                input_df=valid_df,
                tokenizer=tokenizer,
                filename=filename,
                labels=valid_df["labels"].to_numpy(),
            )
            # scoring and optimize threshodl for each case
            get_score_and_threshold(cfg, char_probs, valid_df, fold_id)
            
            oof_char_probs += char_probs
            true_df = pd.concat([true_df, valid_df], axis=0)

    get_score_and_threshold(cfg, oof_char_probs, true_df.reset_index(drop=True), "cv")
    results = get_results(cfg, oof_char_probs, cases=true_df["case_num"].to_list())
    preds = get_predictions(results)
    oof_score = get_score(true_df["labels"].to_list(), preds)
    cfg.logger.info(f"optimized case-threshold cv-score: {oof_score}")

    # true_df = true_df.sort_values(by="id", ascending=True).reset_index(drop=True)

    return true_df.reset_index(drop=True), oof_char_probs


def predict_raw_prediction(cfg, input_df, tokenizer, filename, labels=None):
    checkpoint_path = os.path.join(cfg.model_dir, filename + ".ckpt")

    lightning_model = NBMELightningModule(
        cfg,
        tokenizer,
        input_df,
        labels,
    )

    lightning_model = lightning_model.load_from_checkpoint(
        checkpoint_path=checkpoint_path,
        cfg=cfg,
    )

    lightning_datamodule = NBMEDataModule(
        cfg,
        tokenizer=tokenizer,
        test_df=input_df
    )

    trainer = Trainer(
        gpus=cfg.gpus,
    )

    preds = trainer.predict(
        lightning_model,
        datamodule=lightning_datamodule,
        return_predictions=True
    )

    preds = torch.cat(preds).cpu().numpy() # (sample, max_seq, num_class)

    del lightning_datamodule, lightning_model, trainer

    gc.collect()
    torch.cuda.empty_cache()
    
    return preds
    

def predict(cfg, input_df, tokenizer, filename, labels):
    file_path = os.path.join(cfg.exp_output_dir, f"{filename}.npy")
    
    if os.path.isfile(file_path):
        preds = np.load(file_path)
    else:
        preds = predict_raw_prediction(cfg, input_df, tokenizer, filename, labels)
        np.save(os.path.join(cfg.exp_output_dir, filename), preds)

    char_probs = get_token_probs_to_char_probs(input_df["pn_history"].to_numpy(), preds, tokenizer)

    return char_probs


def predict_cv(cfg, input_df, tokenizer):
    """
    CVモデルで予測
    """
    fold_preds = []
    for fold_id in range(cfg.num_fold):
        if fold_id in cfg.train_fold:
            filename = f"exp{cfg.exp_id}-fold-{fold_id}"
            preds = predict_raw_prediction(cfg, input_df, tokenizer, filename)
            char_preds = get_token_probs_to_char_probs(input_df["pn_history"].to_numpy(), preds, tokenizer)
            fold_preds.append(char_preds)

    fold_preds = np.mean(fold_preds, axis=0)
    results = get_results(cfg, fold_preds, cases=input_df["case_num"].to_list())

    output_df = input_df.copy()
    output_df["location"] = results
    
    return output_df


def get_token_probs_to_char_probs(texts, predictions, tokenizer):
    """
    予測値をtoken-level -> char-levelに変形
    """
    results = [np.zeros(len(t)) for t in texts]
    for i, (text, prediction) in enumerate(zip(texts, predictions)):
        encoded = tokenizer(
            text, 
            add_special_tokens=True,
            return_offsets_mapping=True
        )
        
        for idx, (offset_mapping, pred) in enumerate(zip(encoded['offset_mapping'], prediction)):
            start = offset_mapping[0]
            end = offset_mapping[1]

            # 先行するスペースがあればスパンから除く
            # if text[start] == " ":
            #     start = start + 1
            
            results[i][start: end] = pred
    
    return results


def get_results(cfg, char_probs, th=0.5, cases=None):
    """
    ";"区切りのスパンに変換
    """
    results = []
    if cases:
        for char_prob, case in zip(char_probs, cases):
            th = cfg.pred_threshold[case]
            result = np.where(char_prob >= th)[0] + 1 # DeBERTa系Tokenizerの先頭のスペース分スライド
            result = [list(g) for _, g in itertools.groupby(result, key=lambda n, c=itertools.count(): n - next(c))]
            result = [f"{min(r)} {max(r)}" for r in result]
            result = ";".join(result)
            results.append(result)
    else:
        for char_prob in char_probs:
            result = np.where(char_prob >= th)[0] + 1
            result = [list(g) for _, g in itertools.groupby(result, key=lambda n, c=itertools.count(): n - next(c))]
            result = [f"{min(r)} {max(r)}" for r in result]
            result = ";".join(result)
            results.append(result)
    
    return results


def get_predictions(results):
    """
    各スパンのリストを要素とするリストに変換
    '3 4;7 9;12 13' -> [[3, 4], [7, 9], [12, 13]]
    """
    predictions = []
    for result in results:
        prediction = []
        if result != "":
            for loc in [s.split() for s in result.split(';')]:
                start, end = int(loc[0]), int(loc[1])
                prediction.append([start, end])
        predictions.append(prediction)
    
    return predictions


def create_labels_for_scoring(df):
    # example: ['0 1', '3 4'] -> ['0 1; 3 4']
    df = df.copy()
    df['location_for_create_labels'] = [ast.literal_eval(f'[]')] * len(df)
    for i in range(len(df)):
        lst = df.loc[i, 'location']
        if lst:
            new_lst = ';'.join(lst)
            df.loc[i, 'location_for_create_labels'] = ast.literal_eval(f'[["{new_lst}"]]')
    # create labels
    truths = []
    for location_list in df['location_for_create_labels'].values:
        truth = []
        if len(location_list) > 0:
            location = location_list[0]
            for loc in [s.split() for s in location.split(';')]:
                start, end = int(loc[0]), int(loc[1])
                truth.append([start, end])
        truths.append(truth)
    
    return truths


# ====================================
# Metrics #
# ====================================
def micro_f1(preds, truths):
    """
    Micro f1 on binary arrays.

    Args:
        preds (list of lists of ints): Predictions.
        truths (list of lists of ints): Ground truths.

    Returns:
        float: f1 score.
    """
    # Micro : aggregating over all instances
    preds = np.concatenate(preds)
    truths = np.concatenate(truths)

    return f1_score(truths, preds)


def spans_to_binary(spans, length=None):
    """
    Converts spans to a binary array indicating whether each character is in the span.

    Args:
        spans (list of lists of two ints): Spans.

    Returns:
        np array [length]: Binarized spans.
    """
    length = np.max(spans) if length is None else length
    binary = np.zeros(length)
    for start, end in spans:
        binary[start:end] = 1
    
    return binary


def span_micro_f1(preds, truths):
    """
    Micro f1 on spans.

    Args:
        preds (list of lists of two ints): Prediction spans.
        truths (list of lists of two ints): Ground truth spans.

    Returns:
        float: f1 score.
    """
    bin_preds = []
    bin_truths = []
    for pred, truth in zip(preds, truths):
        if not len(pred) and not len(truth):
            continue
        length = max(np.max(pred) if len(pred) else 0, np.max(truth) if len(truth) else 0)
        bin_preds.append(spans_to_binary(pred, length))
        bin_truths.append(spans_to_binary(truth, length))
    
    return micro_f1(bin_preds, bin_truths)


def get_score(y_true, y_pred):
    score = span_micro_f1(y_true, y_pred)

    return score


def optimize_threshold(cfg, valid_labels, char_probs):
    best_thres = 0.5
    best_score = 0.0
    for th in np.arange(0.40, 0.70, 0.01):
        th = np.round(th, 2)
        results = get_results(cfg, char_probs, th=th)
        preds = get_predictions(results)
        score = get_score(valid_labels, preds)

        if best_score < score:
            best_thres = th
            best_score = score

    return best_thres, best_score


def get_score_and_threshold(cfg, pred_char_probs, valid_df, fold_id):
    """
    case毎 & 全体のスコアリングと閾値の最適化
    """
    class_scores = {}
    valid_df = valid_df.copy()
    valid_df["pred_char_probs"] = pred_char_probs

    for case in valid_df["case_num"].unique():
        case_idx = valid_df.query('case_num == @case').index
        case_labels = valid_df.iloc[case_idx]["labels"].to_list()
        case_char_probs = valid_df.iloc[case_idx]["pred_char_probs"].to_list()
        best_thres, best_score = optimize_threshold(cfg, case_labels, case_char_probs)
        if fold_id != "cv":
            cfg.logger.info(f"fold {fold_id}: case_num: {case} best_th: {best_thres}  score: {best_score:.5f}")
        else:
            cfg.logger.info(f"case_num: {case} best_th: {best_thres}  score: {best_score:.5f}")

    best_thres, best_score = optimize_threshold(cfg, valid_df["labels"].to_list(), pred_char_probs)
    if fold_id != "cv":
        cfg.logger.info(f"fold {fold_id}: best_th: {best_thres}  score: {best_score:.5f}")
    else:
        cfg.logger.info(f"best_th: {best_thres}  score: {best_score:.5f}")


# ====================================
# Pseudo labeling #
# ====================================
def get_input_data_for_pseudo_labeling(cfg, nrows: int = None):
    train_df = pd.read_csv(cfg.train_csv)
    feature_texts_df = pd.read_csv(Config.features_csv)
    patient_notes_df = pd.read_csv(Config.patient_notes_csv)

    train_pn_idx = list(train_df["pn_num"].unique())
    extract_train_df = patient_notes_df[~patient_notes_df["pn_num"].isin(train_pn_idx)].reset_index(drop=True)
    extract_train_df = extract_train_df.merge(feature_texts_df, on=["case_num"], how="left")
    extract_train_df["id"] = extract_train_df["pn_num"].astype(str).str.zfill(5) + "_" + extract_train_df["feature_num"].astype(str).str.zfill(3)
    extract_train_df["pn_history"] = extract_train_df["pn_history"].apply(clean_feature_text_for_preprocess)
    extract_train_df = extract_train_df.reindex(columns=["id", "case_num", "pn_num", "feature_num", "feature_text", "pn_history"])

    if nrows is not None:
        select_idx = extract_train_df.drop_duplicates(subset="pn_num").sample(n=nrows, random_state=2434)["pn_num"].to_list()
        extract_train_df = extract_train_df[extract_train_df["pn_num"].isin(select_idx)].reset_index(drop=True)

    return extract_train_df


def create_external_input(pred_df):
    pred_df["predict"] = get_predictions(pred_df["location"].to_list())

    all_annotation_texts = []
    for history, locations in zip(pred_df["pn_history"].to_numpy(), pred_df["predict"].to_numpy()):
        sample_annotation_texts = []
        for loc in locations:
            start, end = loc[0], loc[1]
            annotion_text = history[start: end]
            sample_annotation_texts.append(annotion_text)
        all_annotation_texts.append(sample_annotation_texts)

    pred_df["annotation"] = all_annotation_texts
    pred_df["location"] = pred_df["location"].apply(lambda x: x.split(";"))

    pred_df["len_annotation"] = pred_df["annotation"].apply(len)
    pred_df = pred_df[pred_df["len_annotation"] != 0].reset_index(drop=True)

    return pred_df.drop(columns=["predict", "len_annotation"], axis=1)


def predict_for_pseudo_labeling(cfg, input_df, tokenizer):
    """
    pseudo-labeling for each fold model
    """
    for fold_id in range(cfg.num_fold):
        if fold_id in cfg.train_fold:
            filename = f"exp{cfg.exp_id}-fold-{fold_id}"
            preds = predict_raw_prediction(cfg, input_df, tokenizer, filename)
            char_preds = get_token_probs_to_char_probs(input_df["pn_history"].to_numpy(), preds, tokenizer)
            results = get_results(cfg, char_preds, cases=input_df["case_num"].to_list())

            output_df = input_df.copy()
            output_df["location"] = results

            output_df = create_external_input(output_df)
            output_df["fold"] = fold_id
            output_df.to_csv(os.path.join(cfg.input_dir, f"external_train_fold_{fold_id}.csv"), index=False)

            del output_df

In [8]:
# def main(Config):
#     # setup
#     Config = setup(Config)
#     Config.logger = Logger(Config.exp_output_dir)
#     # load dataset
#     train_df = get_input_data(Config, input_type="train")
#     test_df = get_input_data(Config, input_type="test")
#     # submission_df = pd.read_csv(Config.sample_submission)
#     # extract_train_df = get_input_data_for_pseudo_labeling(Config, nrows=5000)

#     # split
#     train_df = get_split(Config, train_df)

#     # tokenizer
#     tokenizer = get_tokenizer(Config)

#     if not Config.inference_only:
#         # training
#         train_cv(
#             cfg=Config,
#             input_df=train_df,
#             tokenizer=tokenizer,
#         )

#     # predict
#     raw_pred_df = predict_cv(
#         cfg=Config,
#         input_df=test_df,
#         tokenizer=tokenizer,
#     )

#     # pseudo-labeling
#     # predict_for_pseudo_labeling(cfg=Config, input_df=extract_train_df, tokenizer=tokenizer)

#     # upload output to kaggle dataset
#     if Config.upload_from_colab:
#         from kaggle.api.kaggle_api_extended import KaggleApi

#         def dataset_create_new(dataset_name, upload_dir):
#             dataset_metadata = {}
#             dataset_metadata['id'] = f'{os.environ["KAGGLE_USERNAME"]}/{dataset_name}'
#             dataset_metadata['licenses'] = [{'name': 'CC0-1.0'}]
#             dataset_metadata['title'] = dataset_name
#             with open(os.path.join(upload_dir, 'dataset-metadata.json'), 'w') as f:
#                 json.dump(dataset_metadata, f, indent=4)
#             api = KaggleApi()
#             api.authenticate()
#             api.dataset_create_new(folder=upload_dir, convert_to_csv=False, dir_mode='tar')

#         dataset_create_new(dataset_name=f"{Config.competition_name}-exp{Config.exp_id}", upload_dir=Config.exp_output_dir)

#     # make submission
#     if not Config.on_colab:
#         raw_pred_df[["id", "location"]].to_csv(os.path.join(Config.submission, "submission.csv"), index=False)


# if __name__ == "__main__":
#     main(Config)

In [9]:
# setup
Config = setup(Config)
Config.logger = Logger(Config.exp_output_dir)

# load dataset
train_df = get_input_data(Config, input_type="train")
test_df = get_input_data(Config, input_type="test")


# split
train_df = get_split(Config, train_df)

# tokenizer
tokenizer = get_tokenizer(Config)


def predict_cv(cfg, input_df, tokenizer):
    """
    CVモデルで予測
    """
    fold_preds = []
    for fold_id in range(cfg.num_fold):
        if fold_id in cfg.train_fold:
            filename = f"exp{cfg.exp_id}-fold-{fold_id}"
            preds = predict_raw_prediction(cfg, input_df, tokenizer, filename)
            char_preds = get_token_probs_to_char_probs(input_df["pn_history"].to_numpy(), preds, tokenizer)
            fold_preds.append(char_preds)

    fold_preds = np.mean(fold_preds, axis=0)
    results = get_results(cfg, fold_preds, cases=input_df["case_num"].to_list())

    output_df = input_df.copy()
    output_df["location"] = results
    
    return output_df, fold_preds


# predict
raw_pred_df, pred_char_probs = predict_cv(
    cfg=Config,
    input_df=test_df,
    tokenizer=tokenizer,
)

true_df, train_char_probs = train_cv(
    cfg=Config,
            input_df=train_df,
            tokenizer=tokenizer,
)

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
  "num_layers={}".format(dropout, num_layers))
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: /content/lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

[2022-05-01 12:57:51] - fold 0: case_num: 0 best_th: 0.65  score: 0.88050
[2022-05-01 12:57:52] - fold 0: case_num: 1 best_th: 0.69  score: 0.91163
[2022-05-01 12:57:53] - fold 0: case_num: 2 best_th: 0.47  score: 0.86321
[2022-05-01 12:57:55] - fold 0: case_num: 3 best_th: 0.56  score: 0.90957
[2022-05-01 12:57:55] - fold 0: case_num: 4 best_th: 0.4  score: 0.92525
[2022-05-01 12:57:57] - fold 0: case_num: 5 best_th: 0.66  score: 0.80361
[2022-05-01 12:57:58] - fold 0: case_num: 6 best_th: 0.51  score: 0.87941
[2022-05-01 12:57:58] - fold 0: case_num: 7 best_th: 0.65  score: 0.87472
[2022-05-01 12:58:00] - fold 0: case_num: 8 best_th: 0.57  score: 0.92547
[2022-05-01 12:58:01] - fold 0: case_num: 9 best_th: 0.59  score: 0.93555
[2022-05-01 12:58:12] - fold 0: best_th: 0.63  score: 0.88441
[2022-05-01 12:58:17] - fold 1: case_num: 0 best_th: 0.56  score: 0.90600
[2022-05-01 12:58:18] - fold 1: case_num: 1 best_th: 0.52  score: 0.89292
[2022-05-01 12:58:19] - fold 1: case_num: 2 best_th

## rulebase

In [10]:
# df = true_df[true_df["feature_num"] == 916].copy().reset_index(drop=True)
df = true_df.copy()
df["span_length"] = df["annotation"].apply(lambda x: len(x))
df = df[df["span_length"] != 0].reset_index(drop=True)
df["length"] = df["annotation"].apply(lambda x: len(x[0]))
df.sort_values(by="span_length")

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,feature_text,pn_history,fold,labels,span_length,length
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724],Family-history-of-MI-OR-Family-history-of-myoc...,HPI: 17yo M presents with palpitations. Patien...,0,"[[696, 724]]",1,28
6307,10888_104,1,10888,104,[20 y.o],[41 47],20-year,Suzanne Powelton is an otherwise healthy 20 y....,3,"[[41, 47]]",1,6
6305,10888_102,1,10888,102,[Last sexual activity 9 months ago],[843 876],Not-sexually-active,Suzanne Powelton is an otherwise healthy 20 y....,3,"[[843, 876]]",1,33
6304,10888_100,1,10888,100,[denies vaginal discharge],[601 607;654 671],No-vaginal-discharge,Suzanne Powelton is an otherwise healthy 20 y....,3,"[[601, 607], [654, 671]]",1,24
6303,10847_112,1,10847,112,[female],[24 30],Female,HPI: Patient is a 20 yo female who presents wi...,3,"[[24, 30]]",1,6
...,...,...,...,...,...,...,...,...,...,...,...,...
6910,44866_400,4,44866,400,"[Denies sweating, Denies consptiation, diarrhe...","[458 473, 458 464;475 497, 458 464;499 502, 45...",Lack-of-other-thyroid-symptoms,Karin Moore is a 45 year old female with an ep...,3,"[[458, 473], [458, 464], [475, 497], [458, 464...",6,15
992,44641_400,4,44641,400,"[denies hair loss, denies hot or cold intolera...","[123 129;226 235, 123 129;199 222, 123 129;190...",Lack-of-other-thyroid-symptoms,Ms. Moore si a 45 yo woman with chief complain...,0,"[[123, 129], [226, 235], [123, 129], [199, 222...",6,16
5449,73296_702,7,73296,702,"[problems with her periods, periods have been ...","[34 59, 107 134, 107 124;139 144, 146 167, 173...",heavy-periods-OR-irregular-periods,35 year old female complaining of problems wit...,2,"[[34, 59], [107, 134], [107, 124], [139, 144],...",6,25
6824,42520_400,4,42520,400,"[denies palpatations, denies sweating, denies ...","[492 498;525 537, 492 498;598 606, 492 498;616...",Lack-of-other-thyroid-symptoms,"Pt is a 45 yo F with a CC of two weeks of ""ner...",3,"[[492, 498], [525, 537], [492, 498], [598, 606...",7,19


In [10]:
results = get_results(Config, train_char_probs, th=0.53)
results = get_predictions(results)
true_df["pred"] = results
true_df

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,feature_text,pn_history,fold,labels,pred
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724],Family-history-of-MI-OR-Family-history-of-myoc...,HPI: 17yo M presents with palpitations. Patien...,0,"[[696, 724]]","[[696, 724]]"
1,00016_001,0,16,1,"[mom with ""thyroid disease]",[668 693],Family-history-of-thyroid-disorder,HPI: 17yo M presents with palpitations. Patien...,0,"[[668, 693]]","[[668, 693]]"
2,00016_002,0,16,2,[chest pressure],[203 217],Chest-pressure,HPI: 17yo M presents with palpitations. Patien...,0,"[[203, 217]]","[[203, 217]]"
3,00016_003,0,16,3,"[intermittent episodes, episode]","[70 91, 176 183]",Intermittent-symptoms,HPI: 17yo M presents with palpitations. Patien...,0,"[[70, 91], [176, 183]]","[[70, 91]]"
4,00016_004,0,16,4,[felt as if he were going to pass out],[222 258],Lightheaded,HPI: 17yo M presents with palpitations. Patien...,0,"[[222, 258]]","[[222, 258]]"
...,...,...,...,...,...,...,...,...,...,...,...
14295,95330_912,9,95330,912,"[mother migraines, FH migraines]","[641 647;651 660, 637 639;651 660]",Family-history-of-migraines,Ms. Madden is a 20 yo female presenting w/ the...,4,"[[641, 647], [651, 660], [637, 639], [651, 660]]","[[636, 639], [641, 647], [651, 660]]"
14296,95330_913,9,95330,913,"[Ms, female]","[0 2, 22 28]",Female,Ms. Madden is a 20 yo female presenting w/ the...,4,"[[0, 2], [22, 28]]","[[22, 28]]"
14297,95330_914,9,95330,914,[photophobia],[270 281],Photophobia,Ms. Madden is a 20 yo female presenting w/ the...,4,"[[270, 281]]","[[270, 281]]"
14298,95330_915,9,95330,915,[No sick contacts],[340 356],No-known-illness-contacts,Ms. Madden is a 20 yo female presenting w/ the...,4,"[[340, 356]]","[[340, 356]]"


In [11]:
def get_seq(preds, labels):
    pred = set(sum([list(range(pred[0], pred[1])) for pred in preds], []))
    labels = set(sum([list(range(label[0], label[1])) for label in labels], []))
    return len(pred), len(labels)

len_pred = []
len_labels = []
for loc, labels in tqdm(zip(true_df["pred"].values, true_df["labels"].values), total=len(true_df)):
    seq_pred, seq_labels = get_seq(loc, labels)
    len_pred.append(seq_pred)
    len_labels.append(seq_labels)

true_df["len_pred"] = len_pred
true_df["len_labels"] = len_labels
true_df["len_diff"] = true_df["len_pred"] - true_df["len_labels"]
true_df.sort_values(by="len_diff", ascending=False).head(10)

  0%|          | 0/14300 [00:00<?, ?it/s]

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,feature_text,pn_history,fold,labels,pred,len_pred,len_labels,len_diff
1511,52835_505,5,52835,505,[2 weeks ago seen by the ED work-up normal],[330 341;351 362;365 367;406 413;482 488],Recent-visit-to-emergency-department-with-nega...,Pt is a 26 y/o female w/ no significant PMHx w...,0,"[[330, 341], [351, 362], [365, 367], [406, 413...","[[330, 362], [364, 488]]",156,37,119
7303,54660_505,5,54660,505,[],[],Recent-visit-to-emergency-department-with-nega...,"CC: ""heart racing""\r\nHPI: Ms. Whelan is a 26 ...",2,[],"[[416, 524]]",108,0,108
3681,22081_212,2,22081,212,[],[],Irregular-flow-OR-Irregular-frequency-OR-Irreg...,CC: Irregular peroids\r\n\r\nHPI: 44 year olf ...,1,[],"[[105, 136], [147, 153], [389, 428], [433, 446]]",89,0,89
1497,51939_509,5,51939,509,[],[],Increased-frequency-recently,A case of 26 year old female patient comes in ...,0,[],"[[67, 86], [167, 229]]",81,0,81
7181,52390_509,5,52390,509,[],[],Increased-frequency-recently,Ms. Whelan is a 26 yo woman with history of pa...,2,[],"[[74, 83], [90, 113], [115, 162]]",79,0,79
7195,52588_505,5,52588,505,[],[],Recent-visit-to-emergency-department-with-nega...,elie welan 26 year female having palpitaion fo...,2,[],"[[128, 201]]",73,0,73
10285,56697_505,5,56697,505,[],[],Recent-visit-to-emergency-department-with-nega...,26 year old woman with 3 week hx of racing hea...,3,[],"[[231, 264], [278, 316]]",71,0,71
1678,55153_510,5,55153,510,[],[],Associated-feeling-of-impending-doom,"26 year old female, has come to the outpatient...",0,[],"[[181, 252]]",71,0,71
3550,21591_200,2,21591,200,[have been 28 day cycles],[150 173],Prior-normal-periods,Dolores Montgomery is a 44 year old female wit...,1,"[[150, 173]]","[[150, 173], [175, 212], [217, 248]]",91,23,68
3409,20492_212,2,20492,212,[],[],Irregular-flow-OR-Irregular-frequency-OR-Irreg...,44 y/o F comes in due to irregular periods tha...,1,[],"[[171, 237]]",66,0,66


In [12]:
agg_df = true_df.groupby(["case_num", "feature_num"], as_index=False)[["len_pred"]].quantile(q=0.75)
agg_df["len_pred"] = agg_df["len_pred"].astype(int)
agg_df = agg_df.rename(columns={"len_pred": "pred_mean"})
agg_df

Unnamed: 0,case_num,feature_num,pred_mean
0,0,0,21
1,0,1,27
2,0,2,14
3,0,3,25
4,0,4,15
...,...,...,...
138,9,912,20
139,9,913,6
140,9,914,18
141,9,915,19


In [13]:
true_df = true_df.merge(agg_df, on=["case_num", "feature_num"], how="left")
true_df

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,feature_text,pn_history,fold,labels,pred,len_pred,len_labels,len_diff,pred_mean
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724],Family-history-of-MI-OR-Family-history-of-myoc...,HPI: 17yo M presents with palpitations. Patien...,0,"[[696, 724]]","[[696, 724]]",28,28,0,21
1,00016_001,0,16,1,"[mom with ""thyroid disease]",[668 693],Family-history-of-thyroid-disorder,HPI: 17yo M presents with palpitations. Patien...,0,"[[668, 693]]","[[668, 693]]",25,25,0,27
2,00016_002,0,16,2,[chest pressure],[203 217],Chest-pressure,HPI: 17yo M presents with palpitations. Patien...,0,"[[203, 217]]","[[203, 217]]",14,14,0,14
3,00016_003,0,16,3,"[intermittent episodes, episode]","[70 91, 176 183]",Intermittent-symptoms,HPI: 17yo M presents with palpitations. Patien...,0,"[[70, 91], [176, 183]]","[[70, 91]]",21,28,-7,25
4,00016_004,0,16,4,[felt as if he were going to pass out],[222 258],Lightheaded,HPI: 17yo M presents with palpitations. Patien...,0,"[[222, 258]]","[[222, 258]]",36,36,0,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14295,95330_912,9,95330,912,"[mother migraines, FH migraines]","[641 647;651 660, 637 639;651 660]",Family-history-of-migraines,Ms. Madden is a 20 yo female presenting w/ the...,4,"[[641, 647], [651, 660], [637, 639], [651, 660]]","[[636, 639], [641, 647], [651, 660]]",18,17,1,20
14296,95330_913,9,95330,913,"[Ms, female]","[0 2, 22 28]",Female,Ms. Madden is a 20 yo female presenting w/ the...,4,"[[0, 2], [22, 28]]","[[22, 28]]",6,8,-2,6
14297,95330_914,9,95330,914,[photophobia],[270 281],Photophobia,Ms. Madden is a 20 yo female presenting w/ the...,4,"[[270, 281]]","[[270, 281]]",11,11,0,18
14298,95330_915,9,95330,915,[No sick contacts],[340 356],No-known-illness-contacts,Ms. Madden is a 20 yo female presenting w/ the...,4,"[[340, 356]]","[[340, 356]]",16,16,0,19


In [51]:
def fix_pred(preds, pred_mean):
    pred = set(sum([list(range(pred[0], pred[1])) for pred in preds], []))
    pred = sorted(list(pred))
    x = pred[:-pred_mean]

    if len(x) != 0:
        result = []
        tmp = [x[0]]
        for i in range(len(x)-1):
            if x[i+1] - x[i] == 1:
                tmp.append(x[i+1])
            else:
                if len(tmp) > 0:
                    result.append(tmp)
                tmp = []
                tmp.append(x[i+1])
        result.append(tmp)
    else:
        result = []
    output = []
    for r in result:
        start, end = r[0], r[-1]
        output.append([start, end])

    return output

In [73]:
true_df["diff_mean"] = true_df["len_pred"] - true_df["pred_mean"]
df = true_df[true_df["diff_mean"] > 0].reset_index(drop=True)
df

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,feature_text,pn_history,fold,labels,pred,len_pred,len_labels,len_diff,pred_mean,diff_mean
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724],Family-history-of-MI-OR-Family-history-of-myoc...,HPI: 17yo M presents with palpitations. Patien...,0,"[[696, 724]]","[[696, 724]]",28,28,0,21,7
1,00016_004,0,16,4,[felt as if he were going to pass out],[222 258],Lightheaded,HPI: 17yo M presents with palpitations. Patien...,0,"[[222, 258]]","[[222, 258]]",36,36,0,15,21
2,00016_006,0,16,6,"[adderall, adderrall, adderrall]","[321 329, 404 413, 652 661]",Adderall-use,HPI: 17yo M presents with palpitations. Patien...,0,"[[321, 329], [404, 413], [652, 661]]","[[321, 329], [404, 413], [652, 661]]",26,26,0,16,10
3,00016_009,0,16,9,"[palpitations, heart beating/pounding]","[26 38, 96 118]",heart-pounding-OR-heart-racing,HPI: 17yo M presents with palpitations. Patien...,0,"[[26, 38], [96, 118]]","[[26, 38], [97, 109], [111, 134]]",47,34,13,32,15
4,00041_002,0,41,2,[PRESSURE ON HER CHEST],[263 284],Chest-pressure,17 Y/O M CAME TO THE CLINIC C/O HEART POUNDING...,0,"[[263, 284]]","[[263, 284]]",21,21,0,14,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2797,95128_914,9,95128,914,[Feels better with the light turnes out],[301 339],Photophobia,20 year odl female c/o headaches x few hrs. He...,4,"[[301, 339]]","[[301, 339]]",38,38,0,18,20
2798,95243_902,9,95243,902,[1 day],[40 45],1-day-duration-OR-2-days-duration,20 y/o F with no PMH is presenting with 1 day ...,4,"[[40, 45]]","[[40, 45], [91, 100]]",14,5,9,9,5
2799,95243_914,9,95243,914,[sensitive to the light],[268 290],Photophobia,20 y/o F with no PMH is presenting with 1 day ...,4,"[[268, 290]]","[[268, 290]]",22,22,0,18,4
2800,95330_904,9,95330,904,"[HA around her head, HA diffuse]","[53 55;225 240, 53 55;207 214]",Global-headache-OR-diffuse-headache,Ms. Madden is a 20 yo female presenting w/ the...,4,"[[53, 55], [225, 240], [53, 55], [207, 214]]","[[207, 219], [225, 240]]",27,24,3,25,2


In [79]:
results = []
for pred, pred_mean in zip(df["pred"], df["pred_mean"]):
    results.append(fix_pred(pred, int(pred_mean*0.5)))
df["fix_pred"] = results
df

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,feature_text,pn_history,fold,labels,pred,len_pred,len_labels,len_diff,pred_mean,diff_mean,fix_preds,fix_pred
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724],Family-history-of-MI-OR-Family-history-of-myoc...,HPI: 17yo M presents with palpitations. Patien...,0,"[[696, 724]]","[[696, 724]]",28,28,0,21,7,"[[696, 713]]","[[696, 713]]"
1,00016_004,0,16,4,[felt as if he were going to pass out],[222 258],Lightheaded,HPI: 17yo M presents with palpitations. Patien...,0,"[[222, 258]]","[[222, 258]]",36,36,0,15,21,"[[222, 250]]","[[222, 250]]"
2,00016_006,0,16,6,"[adderall, adderrall, adderrall]","[321 329, 404 413, 652 661]",Adderall-use,HPI: 17yo M presents with palpitations. Patien...,0,"[[321, 329], [404, 413], [652, 661]]","[[321, 329], [404, 413], [652, 661]]",26,26,0,16,10,"[[321, 328], [404, 412], [652, 652]]","[[321, 328], [404, 412], [652, 652]]"
3,00016_009,0,16,9,"[palpitations, heart beating/pounding]","[26 38, 96 118]",heart-pounding-OR-heart-racing,HPI: 17yo M presents with palpitations. Patien...,0,"[[26, 38], [96, 118]]","[[26, 38], [97, 109], [111, 134]]",47,34,13,32,15,"[[26, 37], [97, 108], [111, 117]]","[[26, 37], [97, 108], [111, 117]]"
4,00041_002,0,41,2,[PRESSURE ON HER CHEST],[263 284],Chest-pressure,17 Y/O M CAME TO THE CLINIC C/O HEART POUNDING...,0,"[[263, 284]]","[[263, 284]]",21,21,0,14,7,"[[263, 276]]","[[263, 276]]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2797,95128_914,9,95128,914,[Feels better with the light turnes out],[301 339],Photophobia,20 year odl female c/o headaches x few hrs. He...,4,"[[301, 339]]","[[301, 339]]",38,38,0,18,20,"[[301, 329]]","[[301, 329]]"
2798,95243_902,9,95243,902,[1 day],[40 45],1-day-duration-OR-2-days-duration,20 y/o F with no PMH is presenting with 1 day ...,4,"[[40, 45]]","[[40, 45], [91, 100]]",14,5,9,9,5,"[[40, 44], [91, 95]]","[[40, 44], [91, 95]]"
2799,95243_914,9,95243,914,[sensitive to the light],[268 290],Photophobia,20 y/o F with no PMH is presenting with 1 day ...,4,"[[268, 290]]","[[268, 290]]",22,22,0,18,4,"[[268, 280]]","[[268, 280]]"
2800,95330_904,9,95330,904,"[HA around her head, HA diffuse]","[53 55;225 240, 53 55;207 214]",Global-headache-OR-diffuse-headache,Ms. Madden is a 20 yo female presenting w/ the...,4,"[[53, 55], [225, 240], [53, 55], [207, 214]]","[[207, 219], [225, 240]]",27,24,3,25,2,"[[207, 218], [225, 227]]","[[207, 218], [225, 227]]"


In [80]:
get_score(df["labels"].to_list(), df["fix_pred"].to_list())

0.6483502057899166

## lightgbm

In [115]:
# postprocessの前処理
def get_character_prob_df(input_df: pd.DataFrame, char_probs: list, threshold: float = 0.5):
    char_df = []
    for id, proba, fold in tqdm(zip(input_df["id"].values, char_probs, input_df["fold"].values), total=len(input_df["id"].values)):
        tmp_df = pd.DataFrame({
            "id": [id for i in range(len(proba))],
            "fold": [fold for i in range(len(proba))],
            "char_idx": [i for i in range(len(proba))],
            "proba": proba
        }
        )
        char_df.append(tmp_df)

    char_df = pd.concat(char_df, axis=0)
    char_df = char_df.reset_index(drop=True)
    char_df["pred_label"] = (char_df["proba"] >= threshold).astype(int)
    char_df["diff"] = char_df["pred_label"] - char_df.groupby("id")["pred_label"].shift(1, fill_value=0).astype(int)

    return char_df


def get_character_prob_label(input_df: pd.DataFrame, span_df: pd.DataFrame):
    feature_df = span_df.groupby(["id", "split_id"])["char_idx"].agg(["min", "max"]).reset_index()
    feature_df["location"] = (feature_df["min"] + 1).astype(str) + " " + (feature_df["max"] + 1).astype(str)
    feature_df["location"] = get_predictions(feature_df["location"])
    feature_df = feature_df.merge(input_df[["id", "labels", "case_num", "feature_num", "fold"]], on="id", how="left")
    feature_df = feature_df.drop(columns=["min", "max"], axis=1)

    # labeling
    tp_labels = []
    for idx, row in feature_df.iterrows():
        pred = row["location"]
        labels = row["labels"]
        cnt = 0
        for label in labels:
            if (pred[0][0] == label[0]) and (pred[0][1] == label[1]):
                cnt += 1
        if cnt > 0:
            tp_labels.append(1)
        else:
            tp_labels.append(0)

    feature_df["label"] = tp_labels

    return feature_df


def create_feature(train_df: pd.DataFrame, span_df: pd.DataFrame):
    feature_df = train_df[["id", "split_id", "case_num", "feature_num"]].copy()
    # aggregate
    agg_df = span_df.groupby(["id", "split_id"])["proba"].agg(
        [
         "min", 
         "max", 
         "mean",
         "median",
         "std", 
         "count",
         "first",
         "last",
    ])
    agg_cols = agg_df.columns

    feature_df = feature_df.merge(agg_df.reset_index(), on=["id", "split_id"], how="left")
    # lag
    # lag = 1
    # for col in agg_cols:
    #     feature_df[f"{col}_lag_{lag}"] = feature_df.groupby(["id"])[col].shift(lag)

    return feature_df.drop(columns=["id", "split_id"], axis=1)


# character-levelの予測をdataframeに加工
char_df = get_character_prob_df(true_df, train_char_probs, threshold=0.52)

# 予測spanを作成
span_df = char_df[char_df["pred_label"] == 1].reset_index(drop=True)
span_df["split_id"] = span_df.groupby("id")["diff"].cumsum()

# 教師ラベル(ハードラベリング)を作成
span_train_df = get_character_prob_label(true_df, span_df)

  0%|          | 0/14300 [00:00<?, ?it/s]

In [116]:
def get_tp_ratio(pred, labels):
    pred = pred[0]
    pred_set = set(list(range(pred[0], pred[1])))
    label_set = set(sum([list(range(label[0], label[1])) for label in labels], []))
    tp = pred_set & label_set
    try:
        tp_ratio = len(tp) / len(pred_set)
    except ZeroDivisionError as e:
        tp_ratio = 0.0
    return tp_ratio

tp_ratios = []
for loc, labels in tqdm(zip(span_train_df["location"].values, span_train_df["labels"].values), total=len(span_train_df)):
    tp_ratio = get_tp_ratio(loc, labels)
    tp_ratios.append(tp_ratio)

span_train_df["tp_ratio"] = tp_ratios

  0%|          | 0/14163 [00:00<?, ?it/s]

In [117]:
def train_lgb(cfg, input_df: pd.DataFrame, span_df: pd.DataFrame):
    oof = np.zeros((len(input_df),))
    models = []
    for i in range(cfg.num_fold):
        train_df, valid_df = input_df[input_df["fold"] != i], input_df[input_df["fold"] == i]
        y_train, y_valid = train_df["tp_ratio"].values, valid_df["tp_ratio"].values

        X_train = create_feature(train_df, span_df)
        X_valid = create_feature(valid_df, span_df)

        train_set = lgb.Dataset(X_train, y_train)
        valid_set = lgb.Dataset(X_valid, y_valid, reference=train_set)

        model = lgb.train(
            params=cfg.lgb_params,
            train_set=train_set,
            valid_sets=[train_set, valid_set],
            valid_names=["train", "valid"],
            verbose_eval=100,
        )

        y_oof = model.predict(X_valid, num_iteration=model.best_iteration)
        oof[valid_df.index] = y_oof
        models.append(model)

    return oof, models

oof, models = train_lgb(Config, span_train_df, span_df)

span_train_df["lgb_pred"] = oof
span_train_df



Training until validation scores don't improve for 100 rounds.
[100]	train's binary_logloss: 0.221427	valid's binary_logloss: 0.242945
[200]	train's binary_logloss: 0.207437	valid's binary_logloss: 0.23829
[300]	train's binary_logloss: 0.201651	valid's binary_logloss: 0.238525
Early stopping, best iteration is:
[215]	train's binary_logloss: 0.206313	valid's binary_logloss: 0.238152
Training until validation scores don't improve for 100 rounds.
[100]	train's binary_logloss: 0.226995	valid's binary_logloss: 0.225803
[200]	train's binary_logloss: 0.213681	valid's binary_logloss: 0.213144
[300]	train's binary_logloss: 0.208167	valid's binary_logloss: 0.210405
[400]	train's binary_logloss: 0.204517	valid's binary_logloss: 0.210547
Early stopping, best iteration is:
[337]	train's binary_logloss: 0.206691	valid's binary_logloss: 0.210122
Training until validation scores don't improve for 100 rounds.
[100]	train's binary_logloss: 0.227884	valid's binary_logloss: 0.221917
[200]	train's binary_l

Unnamed: 0,id,split_id,location,labels,case_num,feature_num,fold,label,tp_ratio,lgb_pred
0,00016_000,1,"[[696, 724]]","[[696, 724]]",0,0,0,1,1.0,0.898259
1,00016_001,1,"[[668, 693]]","[[668, 693]]",0,1,0,1,1.0,0.973786
2,00016_002,1,"[[203, 217]]","[[203, 217]]",0,2,0,1,1.0,0.947860
3,00016_003,1,"[[70, 91]]","[[70, 91], [176, 183]]",0,3,0,1,1.0,0.964698
4,00016_004,1,"[[222, 258]]","[[222, 258]]",0,4,0,1,1.0,0.664758
...,...,...,...,...,...,...,...,...,...,...
14158,95333_910,1,"[[576, 594]]","[[576, 594]]",9,910,3,1,1.0,0.980313
14159,95333_913,1,"[[34, 39]]",[],9,913,3,0,0.0,0.984015
14160,95333_914,1,"[[274, 282]]","[[274, 282]]",9,914,3,1,1.0,0.986269
14161,95333_915,1,"[[421, 437]]","[[421, 437]]",9,915,3,1,1.0,0.978969


In [118]:
# postprocessによるTP予測からFPのspanの予測値を置換
span_train_df["is_tp"] = (span_train_df["lgb_pred"] >= 0.4).astype(int)
fp_df = span_train_df[span_train_df["is_tp"] == 0].reset_index(drop=True)

fp_indicies = []
for id, loc in tqdm(zip(fp_df["id"], fp_df["location"]), total=len(fp_df)):
    start_idx, end_idx = loc[0][0], loc[0][1]
    # fp_idx = char_df[(char_df["id"] == id) & (char_df["char_idx"] >= start_idx - 1) & (char_df["char_idx"] <= end_idx - 1)].index
    fp_idx = char_df.query('id == @id and (char_idx - 1) >= @start_idx and char_idx <= (@end_idx - 1)').index
    # fp_indicies.append(fp_idx)
    char_df.loc[fp_idx, "proba"] = 0.0

  0%|          | 0/24 [00:00<?, ?it/s]

In [119]:
pp_char_probs = []
for idx, group in char_df.groupby(["fold", "id"]):
    pp_char_probs.append(group["proba"].values)

In [120]:
results = get_results(Config, pp_char_probs, th=0.52)
preds = get_predictions(results)
oof_score = get_score(true_df["labels"].to_list(), preds)
oof_score

0.8832939320269458

In [11]:
# def visualize_feature_importance(models, feat_train_df)
#     '''LightGBMのfeature importanceを可視化
#     '''
#     feature_importance_df = pd.DataFrame()
#     for i, model in enumerate(models):
#         _df = pd.DataFrame()
#         _df['feature_importance'] = model.feature_importance(importance_type="gain")
#         _df['feature'] = feat_train_df.columns
#         _df['model_no'] = i + 1
#         feature_importance_df = pd.concat([feature_importance_df, _df], 
#                                         axis=0, ignore_index=True)

#     order = feature_importance_df.groupby('feature')\
#         .mean()[['feature_importance']]\
#         .sort_values('feature_importance', ascending=False).index[:50]
    
#     # fig = px.box(
#     #     feature_importance_df.query("feature in @order"),
#     #     x="feature_importance",
#     #     y="feature",
#     #     category_orders={"feature": order},
#     #     width=1250,
#     #     height=900,
#     #     title="Top 50 feature importance",
#     # )
#     # fig.update_yaxes(showgrid=True)
#     # fig.show()

#     feature_importance_df = feature_importance_df.groupby("feature")[["feature_importance"]].mean().sort_values(by="feature_importance", ascending=False)

#     return feature_importance_df

# feature_importance_df = visualize_feature_importance(models, create_feature(span_train_df, span_df))
# feature_importance_df

In [12]:
# # postprocessによるTP予測からFPのspanの予測値を置換
# for id, loc in zip(span_train_df["id"], span_train_df["location"]):
#     start_idx, end_idx = loc[0][0], loc[0][1]
#     fp_idx = char_df[(char_df["id"] == id) & (char_df["char_idx"] >= start_idx - 1) & (char_df["char_idx"] <= end_idx - 1)].index
#     char_df.loc[fp_idx, "proba"] = 0.0

In [13]:
# def get_result(char_probs, th=0.5):
#     results = []
#     for char_prob in char_probs:
#         result = np.where(char_prob >= th)[0] + 1
#         result = [list(g) for _, g in itertools.groupby(result, key=lambda n, c=itertools.count(): n - next(c))]
#         result = [f"{min(r)} {max(r)}" for r in result]
#         result = ";".join(result)
#         results.append(result)

#     return results


# get_result(np.array(df["proba"]).reshape(5,-1))