# About this notebook
- Deberta-base starter code
- pip wheels are [here](https://www.kaggle.com/yasufuminakama/nbme-pip-wheels)
- Training notebook is [here](https://www.kaggle.com/yasufuminakama/nbme-deberta-base-baseline-train)

If this notebook is helpful, feel free to upvote :)

In [1]:
# The following is necessary if you want to use the fast tokenizer for deberta v2 or v3
# This must be done before importing transformers
import shutil
from pathlib import Path

import importlib.util

# Locate the `transformers` package of the *running* interpreter instead of hard-coding
# one machine's site-packages (previously: Kaggle's /opt/conda, a Windows Anaconda
# install, and a local conda env). find_spec() on a top-level package name does not
# import it, so the "patch before importing transformers" requirement still holds.
_transformers_spec = importlib.util.find_spec("transformers")
if _transformers_spec is None or not _transformers_spec.submodule_search_locations:
    raise ModuleNotFoundError("transformers is not installed in the current environment")
transformers_path = Path(list(_transformers_spec.submodule_search_locations)[0])

# Directory holding the replacement tokenizer sources
# (on Kaggle this was ../input/deberta-v2-3-fast-tokenizer).
input_dir = Path("/home/artem/jupyter_files/NBME_patient_notes/debetra_tokenizer")

# 1) Replace transformers/convert_slow_tokenizer.py.
convert_file = input_dir / "convert_slow_tokenizer.py"
conversion_path = transformers_path/convert_file.name

# Remove the installed copy first so the copy below always creates a fresh file.
if conversion_path.exists():
    conversion_path.unlink()

shutil.copy(convert_file, transformers_path)

# 2) Replace the slow and fast DeBERTa-v2 tokenizer modules.
deberta_v2_path = transformers_path / "models" / "deberta_v2"

for filename in ['tokenization_deberta_v2.py', 'tokenization_deberta_v2_fast.py']:
    filepath = deberta_v2_path/filename
    if filepath.exists():
        filepath.unlink()

    shutil.copy(input_dir/filename, filepath)


# CFG

In [2]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Notebook-wide settings, read everywhere as plain class attributes."""

    # --- locations -------------------------------------------------
    path = "/home/artem/jupyter_files/NBME_patient_notes/"
    config_path = path + 'config.pth'

    # --- model -----------------------------------------------------
    model = "microsoft/deberta-v3-large"
    fc_dropout = 0.2
    max_len = 512  # initial value; a later cell overwrites it from the tokenized data

    # --- data loading ----------------------------------------------
    num_workers = 4
    batch_size = 4

    # --- cross-validation / reproducibility ------------------------
    seed = 42
    n_fold = 5
    trn_fold = [0, 1, 2, 3, 4]

    # --- loop bookkeeping ------------------------------------------
    gradient_accumulation_steps = 1
    print_freq = 100

# Library

In [3]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

#os.system('pip uninstall -y transformers')
#os.system('python -m pip install --no-index --find-links=../input/nbme-pip-wheels transformers')
#os.system('pip install tokenizers')
#os.system('pip install transformers')
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true
from IPython.display import clear_output

# Pick the compute device. The second GPU (cuda:1) is preferred when the machine has
# one; otherwise fall back to cuda:0, and to the CPU when CUDA is unavailable.
# The CUDA-only calls are guarded so the cell no longer crashes on a CPU-only or
# single-GPU machine.
if torch.cuda.is_available():
    _gpu_index = 1 if torch.cuda.device_count() > 1 else 0
    device = torch.device(f'cuda:{_gpu_index}')
    torch.cuda.set_device(device)
    print(torch.cuda.get_device_name())
else:
    device = torch.device('cpu')
    print('CUDA is not available, running on CPU')

tokenizers.__version__: 0.11.6
transformers.__version__: 4.17.0
env: TOKENIZERS_PARALLELISM=true
Tesla K40m


# tokenizer

In [4]:
# ====================================================
# tokenizer
# ====================================================
#CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.path+'tokenizer/')


# Import the fast tokenizer class from the module file that the first cell copied into
# transformers/models/deberta_v2/.
from transformers.models.deberta_v2.tokenization_deberta_v2_fast import DebertaV2TokenizerFast
# Load the tokenizer files stored under <CFG.path>/tokenizer/.
tokenizer = DebertaV2TokenizerFast.from_pretrained(CFG.path+'tokenizer/')

#tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')
# Expose the tokenizer through CFG as well; helpers below read it as cfg.tokenizer.
CFG.tokenizer = tokenizer

# Helper functions for scoring

In [5]:
# From https://www.kaggle.com/theoviel/evaluation-metric-folds-baseline

def micro_f1(preds, truths):
    """
    Micro-averaged F1 over a collection of binary arrays.

    Every array in ``preds`` / ``truths`` is joined end-to-end into one long
    vector, and a single F1 score is computed over that vector.

    Args:
        preds (list of lists of ints): Binary predictions, one array per sample.
        truths (list of lists of ints): Binary ground truths, one array per sample.

    Returns:
        float: f1 score.
    """
    flat_preds = np.concatenate(preds)
    flat_truths = np.concatenate(truths)
    # f1_score expects (y_true, y_pred) in that order.
    return f1_score(flat_truths, flat_preds)


def spans_to_binary(spans, length=None):
    """
    Rasterise [start, end) spans into a 0/1 mask over character positions.

    Args:
        spans (list of lists of two ints): Spans as [start, end] pairs; end is exclusive.
        length (int, optional): Size of the mask. When omitted, the largest
            value found in ``spans`` is used.

    Returns:
        np array [length]: 1.0 where a position is covered by any span, else 0.0.
    """
    if length is None:
        length = np.max(spans)
    mask = np.zeros(length)
    for span in spans:
        lo, hi = span
        mask[lo:hi] = 1
    return mask


def span_micro_f1(preds, truths):
    """
    Micro f1 computed at character level from span lists.

    Each (prediction, truth) pair is converted to two binary masks of equal
    length (the largest offset seen in either list) and scored with
    ``micro_f1``. Pairs where both lists are empty are skipped.

    Args:
        preds (list of lists of two ints): Prediction spans, one list per sample.
        truths (list of lists of two ints): Ground truth spans, one list per sample.

    Returns:
        float: f1 score.
    """
    pred_masks, truth_masks = [], []
    for pred, truth in zip(preds, truths):
        if len(pred) == 0 and len(truth) == 0:
            continue
        pred_extent = np.max(pred) if len(pred) else 0
        truth_extent = np.max(truth) if len(truth) else 0
        length = max(pred_extent, truth_extent)
        pred_masks.append(spans_to_binary(pred, length))
        truth_masks.append(spans_to_binary(truth, length))
    return micro_f1(pred_masks, truth_masks)

def asMinutes(s):
    """Format a duration given in seconds as ``'<minutes>m <seconds>s'``."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '%dm %ds' % (minutes, seconds)

def timeSince(since, percent):
    """
    Describe elapsed and estimated remaining time of a running job.

    Args:
        since (float): Start timestamp as returned by ``time.time()``.
        percent (float): Fraction of the work already done; must be non-zero.

    Returns:
        str: ``'<elapsed> (remain <remaining>)'``, both formatted by ``asMinutes``.
    """
    elapsed = time.time() - since
    # Linear extrapolation: total time = elapsed / fraction done.
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(remaining))


In [6]:
def create_labels_for_scoring(df):
    """
    Build ground-truth character spans for scoring from ``df['location']``.

    Each row's ``location`` is a list of strings such as ``'0 1'`` or
    ``'5 7;9 12'`` (several ``'start end'`` pairs separated by ``;``).

    Rows are walked in positional order, so the frame may carry any index
    (the previous implementation addressed rows with ``df.loc[i]`` and only
    worked on a 0..n-1 index).

    Side effect (kept for backward compatibility): a column
    ``location_for_create_labels`` is written to ``df``; it holds ``[]`` for
    rows without locations, otherwise a one-element list with all of the
    row's locations joined by ``;`` (example: ['0 1', '3 4'] -> ['0 1;3 4']).

    Args:
        df (pd.DataFrame): Frame with a ``location`` column of lists of strings.

    Returns:
        list: One entry per row, each a list of ``[start, end]`` int pairs.
    """
    joined_locations = []
    truths = []
    for lst in df['location'].values:
        truth = []
        if lst:
            joined = ';'.join(lst)
            joined_locations.append([joined])
            for loc in (segment.split() for segment in joined.split(';')):
                truth.append([int(loc[0]), int(loc[1])])
        else:
            joined_locations.append([])
        truths.append(truth)
    # dtype=object keeps every cell a Python list; index=df.index aligns row by row.
    df['location_for_create_labels'] = pd.Series(joined_locations, index=df.index, dtype=object)
    return truths


def get_char_probs(texts, predictions, tokenizer):
    """
    Spread per-token probabilities onto the characters each token covers.

    Args:
        texts: Iterable of note strings.
        predictions: Per-text sequences of token-level probabilities, in the
            order of the tokens produced by ``tokenizer(text)``.
        tokenizer: Callable returning a mapping with an ``'offset_mapping'``
            entry (list of (start, end) character offsets per token).

    Returns:
        list of np arrays: One array per text, of length ``len(text)``.
    """
    char_probs = [np.zeros(len(text)) for text in texts]
    for probs, text, token_probs in zip(char_probs, texts, predictions):
        encoding = tokenizer(text,
                             add_special_tokens=True,
                             return_offsets_mapping=True)
        # zip stops at the shorter sequence, so any surplus predictions are ignored.
        for span, token_prob in zip(encoding['offset_mapping'], token_probs):
            probs[span[0]:span[1]] = token_prob
    return char_probs


def get_results(char_probs, th=0.5):
    """
    Threshold character probabilities and encode the selected runs as text.

    For every array, positions with probability >= ``th`` are shifted by +1,
    grouped into runs of consecutive values, and written as
    ``'first last'`` pairs joined by ``;`` (empty string when nothing passes).

    Args:
        char_probs: Iterable of 1-D np arrays of character probabilities.
        th (float): Inclusive probability threshold.

    Returns:
        list of str: One encoded string per input array.
    """
    results = []
    for char_prob in char_probs:
        positions = np.where(char_prob >= th)[0] + 1
        runs = []
        for pos in positions:
            # Extend the current run while positions stay consecutive.
            if runs and pos == runs[-1][1] + 1:
                runs[-1][1] = pos
            else:
                runs.append([pos, pos])
        results.append(";".join(f"{first} {last}" for first, last in runs))
    return results


def get_predictions(results):
    """
    Decode ``'start end;start end'`` strings into lists of int pairs.

    Args:
        results: Iterable of strings as produced by ``get_results``.

    Returns:
        list: One list per input string; ``[]`` for an empty string,
        otherwise ``[[start, end], ...]``.
    """
    predictions = []
    for result in results:
        if result == "":
            predictions.append([])
            continue
        pieces = (chunk.split() for chunk in result.split(';'))
        predictions.append([[int(piece[0]), int(piece[1])] for piece in pieces])
    return predictions

# Utils

In [7]:
# ====================================================
# Utils
# ====================================================
def get_score(y_true, y_pred):
    """Return the span-level micro F1 of the two span collections (delegates to ``span_micro_f1``)."""
    return span_micro_f1(y_true, y_pred)


def get_logger(filename='inference'):
    """
    Return this module's logger, writing bare messages to the console and to a file.

    The logger is looked up by ``__name__``, so repeated calls (for example
    when the notebook cell is re-run) receive the *same* logger object.
    Handlers left over from an earlier call are removed and closed first;
    otherwise every call would add another pair and each message would be
    printed several times.

    Args:
        filename (str): Log file path without extension; ``.log`` is appended.

    Returns:
        logging.Logger: Logger at INFO level with exactly two handlers.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # Drop handlers installed by a previous call to avoid duplicated output
    # and leaked file handles.
    for stale in list(logger.handlers):
        logger.removeHandler(stale)
        stale.close()
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

# Notebook-wide logger; with the default filename it also writes to inference.log.
LOGGER = get_logger()

def seed_everything(seed=42):
    """
    Seed every random number source used in this notebook.

    Covers Python's ``random``, NumPy, PyTorch (CPU and current CUDA device),
    exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.

    Args:
        seed (int): Seed value applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True
    
# Seed all generators once, before any data or model code runs.
# NOTE(review): the literal 42 matches CFG.seed but is not read from it.
seed_everything(seed=42)

# OOF

# Data Loading

In [8]:
# ====================================================
# Data Loading
# ====================================================
# All competition CSVs live in <CFG.path>/Data. Deriving the directory from CFG.path
# keeps the dataset location configurable in one place instead of repeating an
# absolute path for every file (earlier variants pointed at Kaggle's
# ../input/nbme-score-clinical-patient-notes and at a Colab Drive folder).
DATA_DIR = Path(CFG.path) / 'Data'

train = pd.read_csv(DATA_DIR / 'train.csv')
# 'annotation' and 'location' are read as strings; parse them back into Python lists.
train['annotation'] = train['annotation'].apply(ast.literal_eval)
train['location'] = train['location'].apply(ast.literal_eval)

features = pd.read_csv(DATA_DIR / 'features.csv')


def preprocess_features(features):
    """
    Overwrite the feature text of row label 27 with a corrected wording.

    The frame is modified in place and also returned.

    Args:
        features (pd.DataFrame): Features table with a ``feature_text`` column.

    Returns:
        pd.DataFrame: The same ``features`` object.
    """
    features.loc[27, 'feature_text'] = "Last-Pap-smear-1-year-ago"
    return features


features = preprocess_features(features)

patient_notes = pd.read_csv(DATA_DIR / 'patient_notes.csv')

print(f"train.shape: {train.shape}")
display(train.head())
print(f"features.shape: {features.shape}")
display(features.head())
print(f"patient_notes.shape: {patient_notes.shape}")
display(patient_notes.head())

train.shape: (14300, 6)


Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724]
1,00016_001,0,16,1,"[mom with ""thyroid disease]",[668 693]
2,00016_002,0,16,2,[chest pressure],[203 217]
3,00016_003,0,16,3,"[intermittent episodes, episode]","[70 91, 176 183]"
4,00016_004,0,16,4,[felt as if he were going to pass out],[222 258]


features.shape: (143, 3)


Unnamed: 0,feature_num,case_num,feature_text
0,0,0,Family-history-of-MI-OR-Family-history-of-myoc...
1,1,0,Family-history-of-thyroid-disorder
2,2,0,Chest-pressure
3,3,0,Intermittent-symptoms
4,4,0,Lightheaded


patient_notes.shape: (42146, 3)


Unnamed: 0,pn_num,case_num,pn_history
0,0,0,"17-year-old male, has come to the student heal..."
1,1,0,17 yo male with recurrent palpitations for the...
2,2,0,Dillon Cleveland is a 17 y.o. male patient wit...
3,3,0,a 17 yo m c/o palpitation started 3 mos ago; \...
4,4,0,17yo male with no pmh here for evaluation of p...


In [9]:
# Attach the feature text to every training row (keyed by feature and case).
# NOTE(review): `train` is rebound to the merged frame, so running this cell twice
# merges the already-merged frame again -- re-run the loading cell first.
train = train.merge(features, on=['feature_num', 'case_num'], how='left')
# Attach the patient note text (keyed by note and case).
train = train.merge(patient_notes, on=['pn_num', 'case_num'], how='left')
display(train.head())

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,feature_text,pn_history
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724],Family-history-of-MI-OR-Family-history-of-myoc...,HPI: 17yo M presents with palpitations. Patien...
1,00016_001,0,16,1,"[mom with ""thyroid disease]",[668 693],Family-history-of-thyroid-disorder,HPI: 17yo M presents with palpitations. Patien...
2,00016_002,0,16,2,[chest pressure],[203 217],Chest-pressure,HPI: 17yo M presents with palpitations. Patien...
3,00016_003,0,16,3,"[intermittent episodes, episode]","[70 91, 176 183]",Intermittent-symptoms,HPI: 17yo M presents with palpitations. Patien...
4,00016_004,0,16,4,[felt as if he were going to pass out],[222 258],Lightheaded,HPI: 17yo M presents with palpitations. Patien...


In [10]:
# incorrect annotation
# Manual corrections for mislabelled training rows, kept as data:
# (row label in `train`, annotation literal, location literal).
# Each literal is parsed with ast.literal_eval and written to the row exactly as
# the one-assignment-per-line version did (annotation first, then location).
_ANNOTATION_FIXES = [
    (338, '[["father heart attack"]]', '[["764 783"]]'),
    (621, '[["for the last 2-3 months"]]', '[["77 100"]]'),
    (655, '[["no heat intolerance"], ["no cold intolerance"]]', '[["285 292;301 312"], ["285 287;296 312"]]'),
    (1262, '[["mother thyroid problem"]]', '[["551 557;565 580"]]'),
    (1265, '[[\'felt like he was going to "pass out"\']]', '[["131 135;181 212"]]'),
    (1396, '[["stool , with no blood"]]', '[["259 280"]]'),
    (1591, '[["diarrhoe non blooody"]]', '[["176 184;201 212"]]'),
    (1615, '[["diarrhea for last 2-3 days"]]', '[["249 257;271 288"]]'),
    (1664, '[["no vaginal discharge"]]', '[["822 824;907 924"]]'),
    (1714, '[["started about 8-10 hours ago"]]', '[["101 129"]]'),
    (1929, '[["no blood in the stool"]]', '[["531 539;549 561"]]'),
    (2134, '[["last sexually active 9 months ago"]]', '[["540 560;581 593"]]'),
    (2191, '[["right lower quadrant pain"]]', '[["32 57"]]'),
    (2553, '[["diarrhoea no blood"]]', '[["308 317;376 384"]]'),
    (3124, '[["sweating"]]', '[["549 557"]]'),
    (3858, '[["previously as regular"], ["previously eveyr 28-29 days"], ["previously lasting 5 days"], ["previously regular flow"]]', '[["102 123"], ["102 112;125 141"], ["102 112;143 157"], ["102 112;159 171"]]'),
    (4373, '[["for 2 months"]]', '[["33 45"]]'),
    (4763, '[["35 year old"]]', '[["5 16"]]'),
    (4782, '[["darker brown stools"]]', '[["175 194"]]'),
    (4908, '[["uncle with peptic ulcer"]]', '[["700 723"]]'),
    (6016, '[["difficulty falling asleep"]]', '[["225 250"]]'),
    (6192, '[["helps to take care of aging mother and in-laws"]]', '[["197 218;236 260"]]'),
    (6380, '[["No hair changes"], ["No skin changes"], ["No GI changes"], ["No palpitations"], ["No excessive sweating"]]', '[["480 482;507 519"], ["480 482;499 503;512 519"], ["480 482;521 531"], ["480 482;533 545"], ["480 482;564 582"]]'),
    (6562, '[["stressed due to taking care of her mother"], ["stressed due to taking care of husbands parents"]]', '[["290 320;327 337"], ["290 320;342 358"]]'),
    (6862, '[["stressor taking care of many sick family members"]]', '[["288 296;324 363"]]'),
    (7022, '[["heart started racing and felt numbness for the 1st time in her finger tips"]]', '[["108 182"]]'),
    (7422, '[["first started 5 yrs"]]', '[["102 121"]]'),
    (8876, '[["No shortness of breath"]]', '[["481 483;533 552"]]'),
    (9027, '[["recent URI"], ["nasal stuffines, rhinorrhea, for 3-4 days"]]', '[["92 102"], ["123 164"]]'),
    (9938, '[["irregularity with her cycles"], ["heavier bleeding"], ["changes her pad every couple hours"]]', '[["89 117"], ["122 138"], ["368 402"]]'),
    (9973, '[["gaining 10-15 lbs"]]', '[["344 361"]]'),
    (10513, '[["weight gain"], ["gain of 10-16lbs"]]', '[["600 611"], ["607 623"]]'),
    (11551, '[["seeing her son knows are not real"]]', '[["386 400;443 461"]]'),
    (11677, '[["saw him once in the kitchen after he died"]]', '[["160 201"]]'),
    (12124, '[["tried Ambien but it didnt work"]]', '[["325 337;349 366"]]'),
    (12279, '[["heard what she described as a party later than evening these things did not actually happen"]]', '[["405 459;488 524"]]'),
    (12289, '[["experienced seeing her son at the kitchen table these things did not actually happen"]]', '[["353 400;488 524"]]'),
    (13238, '[["SCRACHY THROAT"], ["RUNNY NOSE"]]', '[["293 307"], ["321 331"]]'),
    (13297, '[["without improvement when taking tylenol"], ["without improvement when taking ibuprofen"]]', '[["182 221"], ["182 213;225 234"]]'),
    (13299, '[["yesterday"], ["yesterday"]]', '[["79 88"], ["409 418"]]'),
    (13845, '[["headache global"], ["headache throughout her head"]]', '[["86 94;230 236"], ["86 94;237 256"]]'),
    (14083, '[["headache generalized in her head"]]', '[["56 64;156 179"]]'),
]

for _row, _annotation_src, _location_src in _ANNOTATION_FIXES:
    train.loc[_row, 'annotation'] = ast.literal_eval(_annotation_src)
    train.loc[_row, 'location'] = ast.literal_eval(_location_src)

In [11]:
# Load the extra training rows (described by the author as: fixed typos, full abbreviations).
# NOTE(review): read_pickle can execute arbitrary code while unpickling -- only load a
# file you produced yourself.
extra_train = pd.read_pickle('/home/artem/jupyter_files/NBME_patient_notes/typos/extra_train.pkl')
# Original rows come first, extra rows after; ignore_index renumbers the result 0..n-1.
# The CV-split cell relies on this order when it copies folds back onto `train`.
train_full = pd.concat([train,extra_train],ignore_index = True)

In [12]:
# Number of annotation entries per row; create_label() treats 0 as "no span to label".
train['annotation_length'] = train['annotation'].apply(len)
train_full['annotation_length'] = train_full['annotation'].apply(len)
# Distribution of annotation counts in the original training rows.
display(train['annotation_length'].value_counts())

1    8185
0    4399
2    1292
3     287
4      99
5      27
6       9
7       1
8       1
Name: annotation_length, dtype: int64

In [13]:
# ====================================================
# CV split
# ====================================================
# Group by patient note so that all rows of one note fall into the same fold.
Fold = GroupKFold(n_splits=CFG.n_fold)
groups = train_full['pn_num'].values
for n, (train_index, val_index) in enumerate(Fold.split(train_full, train_full['location'], groups)):
    # val_index is positional; it is used as a label here, which matches because
    # train_full was built with ignore_index=True.
    train_full.loc[val_index, 'fold'] = int(n)
train_full['fold'] = train_full['fold'].astype(int)
display(train_full.groupby('fold').size())
# The first len(train) rows of train_full are the original training rows, in order.
train['fold'] = train_full['fold'].to_numpy()[:len(train)]

fold
0    2955
1    2955
2    2955
3    2955
4    2955
dtype: int64

# Dataset

In [14]:
# ====================================================
# Define max_len
# ====================================================
def _token_lengths(texts):
    """Token count of every text, tokenized without special tokens (shows a progress bar)."""
    return [
        len(tokenizer(text, add_special_tokens=False)['input_ids'])
        for text in tqdm(texts, total=len(texts))
    ]


# Longest patient note, measured in tokens.
text_col = 'pn_history'
pn_history_lengths = _token_lengths(train_full[text_col].fillna("").values)
LOGGER.info(f'{text_col} max(lengths): {max(pn_history_lengths)}')

# Longest feature description, measured in tokens.
text_col = 'feature_text'
features_lengths = _token_lengths(features[text_col].fillna("").values)
LOGGER.info(f'{text_col} max(lengths): {max(features_lengths)}')

# Room for the longest note + longest feature text + the three special tokens.
CFG.max_len = max(pn_history_lengths) + max(features_lengths) + 3 # cls & sep & sep
LOGGER.info(f"max_len: {CFG.max_len}")

  0%|          | 0/14775 [00:00<?, ?it/s]

pn_history max(lengths): 309


  0%|          | 0/143 [00:00<?, ?it/s]

feature_text max(lengths): 28
max_len: 340


In [15]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text, feature_text):
    """
    Tokenize a (note, feature) pair into fixed-length model inputs.

    The padding length is taken from the ``cfg`` argument (``cfg.max_len``);
    the function previously ignored ``cfg`` for this and read the global
    ``CFG`` instead.

    Args:
        cfg: Config object providing ``tokenizer`` and ``max_len``.
        text (str): Patient note.
        feature_text (str): Feature description paired with the note.

    Returns:
        The tokenizer's output mapping with every value converted to a
        ``torch.long`` tensor.
    """
    inputs = cfg.tokenizer(text, feature_text,
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           padding="max_length",
                           return_offsets_mapping=False)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


def create_label(cfg, text, annotation_length, location_list):
    """
    Build the per-token target vector for one patient note.

    Tokens whose sequence id is not 0 (special and padding tokens) get -1,
    tokens inside an annotated span get 1, all others 0.

    The padding length comes from the ``cfg`` argument (``cfg.max_len``);
    the function previously read the global ``CFG`` for it.

    Args:
        cfg: Config object providing ``tokenizer`` and ``max_len``.
        text (str): Patient note.
        annotation_length (int): Number of annotations; 0 skips span labelling.
        location_list (list of str): Character spans as ``'start end'``,
            several spans of one annotation separated by ``;``.

    Returns:
        torch.Tensor: Float tensor with one value per token.
    """
    encoded = cfg.tokenizer(text,
                            add_special_tokens=True,
                            max_length=cfg.max_len,
                            padding="max_length",
                            return_offsets_mapping=True)
    offset_mapping = encoded['offset_mapping']
    ignore_idxes = np.where(np.array(encoded.sequence_ids()) != 0)[0]
    label = np.zeros(len(offset_mapping))
    label[ignore_idxes] = -1
    if annotation_length != 0:
        for location in location_list:
            for loc in [s.split() for s in location.split(';')]:
                start_idx = -1
                end_idx = -1
                start, end = int(loc[0]), int(loc[1])
                for idx in range(len(offset_mapping)):
                    # First token starting strictly after `start`: the span
                    # begins in the token before it.
                    if (start_idx == -1) and (start < offset_mapping[idx][0]):
                        start_idx = idx - 1
                    # First token ending at or after `end`: exclusive end index.
                    if (end_idx == -1) and (end <= offset_mapping[idx][1]):
                        end_idx = idx + 1
                # No token starts after `start`; fall back to the end index.
                if start_idx == -1:
                    start_idx = end_idx
                if (start_idx != -1) and (end_idx != -1):
                    label[start_idx:end_idx] = 1
    return torch.tensor(label, dtype=torch.float)


class TrainDataset(Dataset):
    """Dataset yielding ``(inputs, label)`` for each (patient note, feature) row of a frame."""

    def __init__(self, cfg, df):
        """Keep ``cfg`` and pull the needed columns out of ``df`` as arrays."""
        self.cfg = cfg
        self.feature_texts = df['feature_text'].values
        self.pn_historys = df['pn_history'].values
        self.annotation_lengths = df['annotation_length'].values
        self.locations = df['location'].values

    def __len__(self):
        """Number of rows."""
        return len(self.feature_texts)

    def __getitem__(self, item):
        """Tokenize row ``item`` and build its token-level label."""
        note = self.pn_historys[item]
        inputs = prepare_input(self.cfg, note, self.feature_texts[item])
        label = create_label(self.cfg,
                             note,
                             self.annotation_lengths[item],
                             self.locations[item])
        return inputs, label

# Model

In [16]:
# ====================================================
# Model
# ====================================================
class CustomModel(nn.Module):
    """Transformer backbone followed by dropout and a one-logit-per-token linear head."""

    def __init__(self, cfg, config_path=None, pretrained=False):
        """
        Args:
            cfg: Config object providing ``model`` (checkpoint name) and ``fc_dropout``.
            config_path: Path to a model config saved with ``torch.save``; when
                ``None`` the config is fetched for ``cfg.model``.
            pretrained (bool): ``True`` loads the pretrained backbone weights;
                ``False`` only builds the architecture from the config (weights
                are expected to come from ``load_state_dict`` afterwards).
        """
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
        else:
            # NOTE(review): torch.load unpickles the file -- only load trusted configs.
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            # AutoModel cannot be instantiated directly (its constructor raises);
            # from_config() is the supported way to build an untrained model.
            self.model = AutoModel.from_config(self.config)
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, 1)
        self._init_weights(self.fc)

    def _init_weights(self, module):
        """Initialise ``module`` according to its type, using ``config.initializer_range``."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Run the backbone on ``inputs`` and return the first element of its output."""
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        return last_hidden_states

    def forward(self, inputs):
        """Return the head's output (last dimension 1) for the backbone features."""
        feature = self.feature(inputs)
        output = self.fc(self.fc_dropout(feature))
        return output

# inference

In [17]:
# ====================================================
# inference
# ====================================================
def inference_fn(valid_loader, model, device):
    """
    Run ``model`` over a loader of input-only batches and collect probabilities.

    Args:
        valid_loader: Iterable with ``len()`` yielding mappings of tensors.
        model: Module called as ``model(inputs)``; returns logits.
        device: Device on which the model and every batch are placed.

    Returns:
        np.ndarray: Sigmoid of the logits for all batches, concatenated
        along the first axis.
    """
    batch_probs = []
    model.eval()
    model.to(device)
    for inputs in tqdm(valid_loader, total=len(valid_loader)):
        for name, tensor in inputs.items():
            inputs[name] = tensor.to(device)
        with torch.no_grad():
            logits = model(inputs)
        batch_probs.append(logits.sigmoid().to('cpu').numpy())
    return np.concatenate(batch_probs)

def valid_fn(valid_loader, model, criterion, device):
    """
    Run ``model`` over a labelled loader and return the predicted probabilities.

    Only predictions are produced. The previous version also computed a masked
    loss on every batch and then discarded it (and read the global ``CFG``
    while doing so); that dead work has been removed.

    Args:
        valid_loader: Iterable yielding ``(inputs, labels)`` batches, where
            ``inputs`` is a mapping of tensors.
        model: Module called as ``model(inputs)``; returns logits. The caller
            is responsible for placing it on ``device``.
        criterion: Unused; kept so existing call sites keep working.
        device: Device on which every input batch is placed.

    Returns:
        np.ndarray: Sigmoid of the logits for all batches, concatenated
        along the first axis.
    """
    model.eval()
    preds = []
    for inputs, _labels in valid_loader:
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

In [18]:
# Validate on the original training rows assigned to fold 0.
fold = 0
valid_folds = train[train['fold'] == fold].reset_index(drop=True)

valid_texts = valid_folds['pn_history'].values
# NOTE(review): valid_labels is not used again in this cell (no score is computed here);
# the call also adds a 'location_for_create_labels' column to valid_folds.
valid_labels = create_labels_for_scoring(valid_folds)
valid_dataset = TrainDataset(CFG, valid_folds)

# shuffle=False / drop_last=False keep predictions aligned row-for-row with valid_folds.
valid_loader = DataLoader(valid_dataset,
                          batch_size=CFG.batch_size,
                          shuffle=False,
                          num_workers=CFG.num_workers, pin_memory=True, drop_last=False)


# NOTE(review): `predictions` stays empty -- the append below is commented out.
predictions = []
# NOTE(review): the loop variable rebinds `fold`, and the unconditional `break` at the
# end means only the first entry of CFG.trn_fold is ever processed.
for fold in CFG.trn_fold:
    # pretrained=True loads the pretrained backbone weights before the fine-tuned
    # checkpoint below replaces them via load_state_dict.
    model = CustomModel(CFG, config_path=CFG.config_path, pretrained=True)
    #model = CustomModel(CFG, config_path=None, pretrained=True)

    # Checkpoint file name: CFG.model with '/' replaced by '-', plus the fold number.
    checkpoint = torch.load(CFG.path+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))
    model.load_state_dict(checkpoint['model'])
    model.to(device)
    #prediction = inference_fn(test_loader, model, device)
    criterion = nn.BCEWithLogitsLoss(reduction="none")
    prediction = valid_fn(valid_loader, model, criterion, device)
    # One row of CFG.max_len token probabilities per validation row.
    prediction = prediction.reshape((len(valid_folds), CFG.max_len))
    # Token probabilities -> character probabilities -> span strings -> [start, end] lists.
    char_probs = get_char_probs(valid_texts, prediction, CFG.tokenizer)
    #char_probs = get_char_probs(test['pn_history'].values, prediction, CFG.tokenizer)
    results = get_results(char_probs, th=0.5)
    preds = get_predictions(results)

    #predictions.append(char_probs)
    # Release the model and intermediate arrays before any further work.
    del model, checkpoint, prediction, char_probs; gc.collect()
    torch.cuda.empty_cache()
    break
#predictions = np.mean(predictions, axis=0)

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [23]:
# `preds` holds a different number of spans per row (possibly none), so it is ragged.
# dtype=object stores it as an array of Python lists; without it recent NumPy versions
# raise ValueError for ragged input instead of building an object array.
preds_np = np.array(preds, dtype=object)
with open(f'preds_fold{fold}.npy','wb') as f:
    np.save(f, preds_np)

Unnamed: 0,annotation
0,[father heart attach]
1,[Mother thyroid disease]
2,[chest pressure]
3,"[episodes, episodes]"
4,[]
...,...
2851,[Mom has a Hx of migraines]
2852,[F]
2853,[sensitive to the light]
2854,[]


In [1]:
import numpy as np
# The saved predictions are an object array (ragged lists), which NumPy stores with
# pickle; np.load refuses such files unless allow_pickle=True is passed.
# Only enable this for files written by this notebook -- unpickling untrusted data
# can execute arbitrary code.
# NOTE(review): the relative path is resolved against the kernel's working directory,
# which must be the one the file was saved from.
array = np.load('preds_fold0.npy', allow_pickle=True)
print(array)

FileNotFoundError: [Errno 2] No such file or directory: 'preds_fold0.npy'

In [36]:
def getAnnotations(df,preds):
    """
    Cut the predicted spans out of each row's patient note.

    Args:
        df (pd.DataFrame): Frame with a ``pn_history`` column; row ``i`` of
            ``preds`` is looked up with ``df.loc[i]``.
        preds (list): Per-row lists of ``[start, end]`` character spans.

    Returns:
        list: Per-row lists of note substrings (``[]`` for rows without spans).
    """
    pred_annots = []
    for row, spans in enumerate(preds):
        snippets = []
        for span in spans:
            note = df.loc[row]['pn_history']
            snippets.append(note[span[0]:span[1]])
        pred_annots.append(snippets)
    return pred_annots

In [66]:
# Show the full note text of row 1161.
# NOTE(review): `for_disp` is not defined in any cell shown in this notebook -- this
# line only works with leftover kernel state.
for_disp.loc[1161]['pn_history']

'CC: "pain in mid epigastric area"\r\n35 yo M c/o 2 mo of constant, intermittent epigastric abdominal pain. Pain is 5/10 and is present at least 2X daily and wakes him up 3X each night.  There are no precipitating or exacerbating factors. Pain was alleviated by tums in past but no longer helps. patient has no past medical history but reports that an uncle had h/o peptic ulcer disease. Patient reports darker stools for 2 months, + nausea but denies vomitting. There has been no changes in frequency or texture of stools. He feels bloated after food. \r\nPMH: backache and spasms PSH: none SH: recently divorced and is stressed about that. Recently stopped alcohol but has smoked 1/2 pk for 15 yrs. no illicuit drugs use. \r\nFH: Uncle - bleeding ulcers. Health maintenance: Last visit to physician was long time ago. \r\nROS: negative except as in HPI'

In [34]:
import pandas as pd
import numpy as np
import re
# Toy note containing two literal backslash-n sequences, together with two
# hand-written [start, end] spans, used to exercise MistakesProcessingFold0.
df = pd.DataFrame({'pn_history': [r'\nhello, my \nname is Artem']})
preds = [[[0, 6], [1, 15]]]
display(df)
def _is_word_char(ch):
    """Return True when ``ch`` is a digit or a letter."""
    return ch.isdigit() or ch.isalpha()


def _adjust_span_start(history, start):
    """Move ``start`` so the span begins on a whole word and not on a ``\\n`` marker."""
    if history[start:start + 2] == '\\n':
        # The span begins exactly on a literal backslash-n marker: skip it.
        start += 2
    else:
        # Walk left to the first character of the word the span starts in.
        while start > 0 and _is_word_char(history[start - 1]):
            start -= 1
    # The span may begin on the "n" of a backslash-n marker: step past it.
    if history[start:start + 1] == 'n' and start != 0 and history[start - 1] == '\\':
        start += 1
    return start


def _split_span_at_newlines(history, start, end):
    """Split ``[start, end]`` at every literal backslash-n found inside it."""
    pieces = []
    seg_start = start
    found = False
    for match in re.finditer(r"\\n", history[start:end]):
        found = True
        # match offsets are relative to history[start:end]; make them absolute.
        seg_end = start + match.start()
        # Drop trailing non-word characters, never moving before the segment start.
        while seg_end > seg_start and not _is_word_char(history[seg_end - 1]):
            seg_end -= 1
        if seg_end > seg_start:
            pieces.append([seg_start, seg_end])
        # The next segment starts right after this marker.
        seg_start = start + match.end()
    if not found:
        pieces.append([start, end])
    elif seg_start < end:
        pieces.append([seg_start, end])
    return pieces


def MistakesProcessingFold0(df,preds):
    """Clean up predicted character spans for every note in ``df``.

    For each ``[start, end]`` span the start is first adjusted: a leading
    literal backslash-n (the two characters ``\\`` and ``n``) is skipped,
    otherwise the start is extended left to the beginning of the word it
    falls in. The span is then split at every literal backslash-n it
    contains; the piece before each marker has trailing non-alphanumeric
    characters removed, and empty pieces are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``pn_history`` column holding the note texts. Rows are
        paired with ``preds`` by position.
    preds : sequence
        ``preds[i]`` is a (possibly empty) sequence of ``[start, end]``
        offsets into the text of the i-th row of ``df``.

    Returns
    -------
    list of list of [int, int]
        One list of adjusted spans per entry of ``preds``; always a list,
        including when ``df`` is empty.
    """
    new_preds = [[] for _ in range(len(preds))]
    for i in range(len(df)):
        if len(preds[i]) != 0:
            history = df['pn_history'].iloc[i]
            for one_loc in preds[i]:
                start = _adjust_span_start(history, one_loc[0])
                new_preds[i].extend(_split_span_at_newlines(history, start, one_loc[1]))
    # Return only after every row has been processed.
    return new_preds
    
# Run the span post-processing on the toy frame and spans defined in this cell
# and print the adjusted spans.
print(MistakesProcessingFold0(df,preds))           

Unnamed: 0,pn_history
0,"\nhello, my \nname is Artem"


h
n
hello, my \nn
[[[2, 6], [2, 11], [14, 15]]]


In [25]:
# Raw string: every "\n" here is a backslash followed by "n" (two characters),
# not a real newline.
a = r'\nhello, my \nname is Artem'
# Show characters 1..14 of the sample text.
print(a[1:15])
# The pattern r'\\n' matches a literal backslash followed by "n".
re.findall(r'\\n',a)

nhello, my \nn


['\\n', '\\n']

In [71]:
# Index labels of the validation frame as a plain Python list.
valid_folds.index.tolist()

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [69]:
# Slice the predicted spans out of every validation note and attach both the
# extracted texts and the raw spans to a copy of the validation frame.
pred_annots = getAnnotations(valid_folds, preds)
valid_w_preds = valid_folds.assign(
    annotations_preds=pred_annots,
    location_preds=preds,
)

# Keep only the rows whose predicted annotation texts differ from the labels.
mistakes_df = valid_w_preds.loc[
    valid_w_preds['annotation'] != valid_w_preds['annotations_preds']
]
print(len(mistakes_df))

# Columns shown when inspecting a mistake.
for_disp = mistakes_df[['annotation','annotations_preds','location','location_preds','pn_history','feature_text']]

display(for_disp.iloc[600:700])

825


Unnamed: 0,annotation,annotations_preds,location,location_preds,pn_history,feature_text
1975,[Exercise induced Asthma],"[asthma, Exercise induced Asthma]",[599 622],"[[32, 38], [599, 622]]",Mr. Smith is a 17 yo M w/PMH of asthma who pre...,Exercise-induced-asthma
1977,[1 day],"[1 day of, began when he awoke yesterday]",[57 62],"[[57, 65], [114, 143]]",Mr. Smith is a 17 yo M w/PMH of asthma who pre...,Duration-x-1-day
1984,[17yo],[7yo],[0 4],"[[1, 4]]",17yo m presents with 1 day h/o sharp chest pai...,17-year
1985,[URI 3-4 days ago with congestion],[Recent URI 3-4 days ago with congestion],[257 289],"[[250, 289]]",17yo m presents with 1 day h/o sharp chest pai...,Recent-upper-respiratory-symptoms
1986,[7-8/10 with deep breath],[Intensity 7-8/10 with deep breath],[60 83],"[[50, 83]]",17yo m presents with 1 day h/o sharp chest pai...,Worse-with-deep-breath-OR-pleuritic
1988,"[chest pain, pain chest]","[chest pain, pain]","[37 47, 97 101;126 131]","[[37, 47], [97, 101]]",17yo m presents with 1 day h/o sharp chest pai...,Chest-pain
1989,"[1 day h/o, began yesterday]","[1 day h/o, began when p, woke up yesterday]","[21 30, 150 155;172 181]","[[21, 30], [150, 162], [164, 181]]",17yo m presents with 1 day h/o sharp chest pai...,Duration-x-1-day
1996,[17 year old],[7 year old],[0 11],"[[1, 11]]",17 year old male presenting with sharp stabbin...,17-year
1997,[stuffy nose for past 3-4 days],"[stuffy nose, past 3-4 days]",[318 329;347 364],"[[318, 329], [351, 364]]",17 year old male presenting with sharp stabbin...,Recent-upper-respiratory-symptoms
2004,[inhaler didn't help.],[using his inhaler but it didn't help],[236 243;251 263],"[[226, 262]]",17 year old male presenting with sharp stabbin...,No-relief-with-asthma-inhaler


In [28]:
# Show the full note text of the validation row labelled 0.
display(valid_folds.loc[0, 'pn_history'])

'17 yo M college student comes to the clinic due to heart pounding.  He states the he has had episodes of heart pounding for the last 3-4 months.  Nothing makes it better or worse and it goes away on its own.  he denies any chest pain but does report chest pressure during some of the episodes.  He denies warmth or sweating, recent illness, abdominal pain or N/V, tingling in extremities, recent illness or trauma.  He is new to college and has been taking aderol a few times a week.  The last time he took aderol was 2 days ago, which was also the last time he had episode of palpitations.  \r\nROS: negative except for above\r\nPMH, PSH: none, medications: aderol self administered.  knda, FH: Mother thyroid disease, father heart attach at 52 is okay now.  SH: no changes in weight or appetite, he is on the meal plan and tries to be healthy, he plays intermural soccer and runs, no smoking, alcohol hx.  Triend marijuanna, uses aderol. Stress college'

In [20]:
# Show the predicted [start, end] spans: one (possibly empty) list per row.
display(preds)

[[[716, 735]],
 [[692, 714]],
 [[250, 264]],
 [[93, 101]],
 [],
 [[298, 311]],
 [[457, 463], [655, 661], [928, 934]],
 [],
 [],
 [[51, 65], [105, 119], [577, 589]],
 [[120, 143]],
 [[1, 5]],
 [[6, 7]],
 [[682, 697]],
 [[652, 680]],
 [[411, 425], [455, 460]],
 [],
 [[374, 385]],
 [[462, 468], [483, 498]],
 [],
 [[387, 402]],
 [],
 [[40, 52], [104, 116], [161, 183], [185, 208]],
 [[68, 90]],
 [],
 [[24, 25]],
 [[483, 500]],
 [[455, 480]],
 [[177, 191]],
 [[20, 28], [124, 135]],
 [[287, 314], [701, 716]],
 [],
 [[555, 563]],
 [],
 [],
 [[42, 55], [63, 82]],
 [],
 [[8, 14]],
 [[15, 16]],
 [[621, 626], [631, 637]],
 [[587, 602], [610, 616]],
 [[362, 381]],
 [[177, 195]],
 [],
 [],
 [[813, 821]],
 [[398, 417]],
 [],
 [[96, 115], [121, 133]],
 [[161, 176]],
 [],
 [[21, 25]],
 [[490, 513]],
 [[525, 551]],
 [[207, 221]],
 [[125, 136]],
 [[247, 268]],
 [],
 [[388, 401]],
 [[223, 242]],
 [],
 [[16, 30], [70, 83]],
 [],
 [[4, 9]],
 [[10, 11]],
 [[603, 623]],
 [[581, 601]],
 [[244, 265]],
 [[276, 2