In [1]:
import os
import gc
import sys
import json
import itertools
from tqdm.auto import tqdm
import logging
import datetime
import ast

import numpy as np
import pandas as pd
import sklearn.model_selection as sms
from sklearn.metrics import f1_score
import math
import re

# import torch
# import torch.nn as nn
# from torch.nn import functional as F
# from torch.utils.data import Dataset, DataLoader
# import torch.optim as optim

# import pytorch_lightning as pl
# from pytorch_lightning import Trainer, seed_everything
# from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, LearningRateMonitor
# from pytorch_lightning.loggers import WandbLogger

# from transformers import AutoConfig, AutoModel, AutoTokenizer, get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup

# import wandb

In [2]:
class Config:
    """Flat namespace of experiment settings (exp 004, roberta-large).

    All values are plain class attributes. ``split_params`` is evaluated once,
    while the class body runs, so changing ``debug`` or ``num_fold`` on the
    class afterwards does not update it.
    """

    # ---------------- Globals ----------------
    competition_name = "nbme-score-clinical-patient-notes"
    group = "RoBERTa-large"
    exp_id = "004"
    debug = False
    inference_only = True
    upload_from_colab = False
    colab_dir = "/content/drive/MyDrive/Colab/kaggle/nbme-score-clinical-patient-notes"
    kaggle_json_path = "/root/.kaggle/kaggle.json"
    kaggle_dataset_path = None
    gpus = 1
    seed = 2434
    max_epochs = 5
    accumulate_grad_batches = 2
    precision = 32
    num_fold = 5
    train_fold = [0, 1, 2, 3, 4]  # folds to run

    # ---------------- Dataloader ----------------
    train_batch_size = 4
    valid_batch_size = 32
    test_batch_size = 32
    num_workers = 8

    # ---------------- Split ----------------
    split_name = "GroupKFold"
    # "shuffle" / "random_state" are intentionally left out of the split params.
    split_params = dict(n_splits=4 if debug else num_fold)

    # ---------------- Model ----------------
    model_name = "roberta-large"
    max_length = 512
    hidden_size = 1024
    num_class = 1
    dropout = 0.2
    initializer_range = 0.02

    # ---------------- Loss ----------------
    loss_name = "BCEWithLogitsLoss"
    loss_params = dict(reduction="none")

    # ---------------- Optimizer ----------------
    optimizer_name = "AdamW"
    optimizer_params = dict(
        lr=1e-4,
        weight_decay=1e-2,
        eps=1e-6,
        betas=(0.9, 0.999),
    )
    encoder_lr = 2e-5
    decoder_lr = 2e-5
    weight_decay = 0.01

    # ---------------- Scheduler ----------------
    scheduler_name = "cosine-warmup"
    scheduler_warmup_ratio = 0.1
    scheduler_params = dict()
    scheduler_interval = "step"

    # ---------------- Callbacks ----------------
    checkpoint_params = dict(
        monitor="val/micro-F1",
        save_top_k=1,
        save_weights_only=True,
        mode="max",
        verbose=True,
    )
    early_stopping = False
    early_stopping_params = dict(
        monitor="val/loss",
        min_delta=0.0,
        patience=8,
        verbose=False,
        mode="min",
    )

In [7]:
# ====================================
# Setup #
# ====================================
class Logger:
    """Timestamped INFO logger writing to the console and to ``<path>/Experiment.log``.

    ref) https://github.com/ghmagazine/kagglebook/blob/master/ch04-model-interface/code/util.py
    """

    def __init__(self, path):
        """Fetch (or create) the logger named ``path`` and attach handlers once.

        Args:
            path: Existing directory that will hold ``Experiment.log``; it is
                also used as the logger's name.
        """
        self.general_logger = logging.getLogger(path)
        # getLogger() returns the same object for the same name, so a second
        # Logger(path) (e.g. re-running the cell) finds handlers already
        # attached. Build the handlers only on first setup: FileHandler opens
        # its file as soon as it is constructed, so creating one that is never
        # attached would leak an open file handle.
        if not self.general_logger.handlers:
            self.general_logger.addHandler(logging.StreamHandler())
            self.general_logger.addHandler(
                logging.FileHandler(os.path.join(path, 'Experiment.log'))
            )
            self.general_logger.setLevel(logging.INFO)

    def info(self, message):
        """Log ``message`` at INFO level, prefixed with the current timestamp."""
        self.general_logger.info('[{}] - {}'.format(self.now_string(), message))

    @staticmethod
    def now_string():
        """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')


def setup(cfg):
    cfg.on_colab = "google.colab" in sys.modules
    if cfg.on_colab:
        # kaggle api
        # f = open(Config.kaggle_json_path, 'r')
        # json_data = json.load(f)
        # os.environ["KAGGLE_USERNAME"] = json_data["username"]
        # set input/output dir
        cfg.input_dir = os.path.join(cfg.colab_dir, "input")
        cfg.train_csv = os.path.join(cfg.input_dir, "cleaned_train.csv")
        cfg.features_csv = os.path.join(cfg.input_dir, "features.csv")
        cfg.patient_notes_csv = os.path.join(cfg.input_dir, "patient_notes.csv")
        cfg.test_csv = os.path.join(cfg.input_dir, "test.csv")
        cfg.sample_submission = os.path.join(cfg.input_dir, "sample_submission.csv")
        cfg.output_dir = os.path.join(cfg.colab_dir, "output")
        cfg.exp_output_dir = os.path.join(cfg.output_dir, f"exp{cfg.exp_id}")
        cfg.model_dir = os.path.join(cfg.exp_output_dir, "model")

        for d in [cfg.output_dir, cfg.exp_output_dir, cfg.model_dir]:
            os.makedirs(d, exist_ok=True)
            
        # wandb
        # wandb.login()
    else:
        cfg.input_dir = f"../input/{cfg.competition_name}"
        cfg.train_csv = os.path.join(cfg.input_dir, "train.csv")
        cfg.features_csv = os.path.join(cfg.input_dir, "features.csv")
        cfg.patient_notes_csv = os.path.join(cfg.input_dir, "patient_notes.csv")
        cfg.test_csv = os.path.join(cfg.input_dir, "test.csv")
        cfg.sample_submission = os.path.join(cfg.input_dir, "sample_submission.csv")
        cfg.submission = "./"
        cfg.exp_output_dir = f"exp{cfg.exp_id}"
        cfg.model_dir = os.path.join(cfg.exp_output_dir, "model")

        if cfg.kaggle_dataset_path is not None:
            cfg.model_dir = os.path.join(cfg.kaggle_dataset_path, "model")

        for d in [cfg.exp_output_dir, cfg.model_dir]:
            os.makedirs(d, exist_ok=True)

    return cfg

In [13]:
# Resolve input/output paths for the current environment. setup() mutates the
# Config class in place and returns that same class, so the rebinding is a no-op
# on the name but makes the dependency on setup() explicit.
Config = setup(Config)
# Load the three tables from the paths that setup() attached to Config.
train_df = pd.read_csv(Config.train_csv)
pn_notes_df = pd.read_csv(Config.patient_notes_csv)
features_df = pd.read_csv(Config.features_csv)
# Preview the first rows of the training table.
train_df.head()

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location
0,00016_000,0,16,0,['dad with recent heart attcak'],['696 724']
1,00016_001,0,16,1,"['mom with ""thyroid disease']",['668 693']
2,00016_002,0,16,2,['chest pressure'],['203 217']
3,00016_003,0,16,3,"['intermittent episodes', 'episode']","['70 91', '176 183']"
4,00016_004,0,16,4,['felt as if he were going to pass out'],['222 258']


In [12]:
# (rows, columns) of the training table.
train_df.shape

(14300, 6)

In [9]:
# Number of distinct pn_num values present in the training table.
train_df.pn_num.nunique()

1000

In [11]:
# Display the patient-notes table (columns: pn_num, case_num, pn_history).
pn_notes_df

Unnamed: 0,pn_num,case_num,pn_history
0,0,0,"17-year-old male, has come to the student heal..."
1,1,0,17 yo male with recurrent palpitations for the...
2,2,0,Dillon Cleveland is a 17 y.o. male patient wit...
3,3,0,a 17 yo m c/o palpitation started 3 mos ago; \...
4,4,0,17yo male with no pmh here for evaluation of p...
...,...,...,...
42141,95330,9,Ms. Madden is a 20 yo female presenting w/ the...
42142,95331,9,A 20 YO F CAME COMPLAIN A DULL 8/10 HEADACHE T...
42143,95332,9,Ms. Madden is a 20yo female who presents with ...
42144,95333,9,Stephanie madden is a 20 year old woman compla...


In [14]:
# Display the features table (columns: feature_num, case_num, feature_text).
features_df

Unnamed: 0,feature_num,case_num,feature_text
0,0,0,Family-history-of-MI-OR-Family-history-of-myoc...
1,1,0,Family-history-of-thyroid-disorder
2,2,0,Chest-pressure
3,3,0,Intermittent-symptoms
4,4,0,Lightheaded
...,...,...,...
138,912,9,Family-history-of-migraines
139,913,9,Female
140,914,9,Photophobia
141,915,9,No-known-illness-contacts


In [17]:
# Left-join every patient note to all features sharing its case_num, giving one
# row per (note, feature) pair. The result is only displayed, not stored.
pn_notes_df.merge(features_df, on=["case_num"], how="left")

Unnamed: 0,pn_num,case_num,pn_history,feature_num,feature_text
0,0,0,"17-year-old male, has come to the student heal...",0,Family-history-of-MI-OR-Family-history-of-myoc...
1,0,0,"17-year-old male, has come to the student heal...",1,Family-history-of-thyroid-disorder
2,0,0,"17-year-old male, has come to the student heal...",2,Chest-pressure
3,0,0,"17-year-old male, has come to the student heal...",3,Intermittent-symptoms
4,0,0,"17-year-old male, has come to the student heal...",4,Lightheaded
...,...,...,...,...,...
626897,95334,9,patient is a 20 yo F who presents with a heada...,912,Family-history-of-migraines
626898,95334,9,patient is a 20 yo F who presents with a heada...,913,Female
626899,95334,9,patient is a 20 yo F who presents with a heada...,914,Photophobia
626900,95334,9,patient is a 20 yo F who presents with a heada...,915,No-known-illness-contacts
