In [1]:
!pip install -q transformers

[K     |████████████████████████████████| 3.8 MB 32.0 MB/s 
[K     |████████████████████████████████| 895 kB 55.7 MB/s 
[K     |████████████████████████████████| 67 kB 4.8 MB/s 
[K     |████████████████████████████████| 6.5 MB 43.0 MB/s 
[K     |████████████████████████████████| 596 kB 44.7 MB/s 
[?25h

In [2]:
import os
import sys
import json
import itertools
from tqdm.auto import tqdm
import logging
import datetime
import ast
import re

import numpy as np
import pandas as pd
import sklearn.model_selection as sms
import math

import torch
import torch.nn as nn

from transformers import AutoConfig, AutoModel, AutoTokenizer

In [3]:
input_dir = "/content/drive/MyDrive/Colab/kaggle/nbme-score-clinical-patient-notes/input"
train_df = pd.read_csv(os.path.join(input_dir, "train.csv"))
features_df = pd.read_csv(os.path.join(input_dir, "features.csv"))
patient_notes_df = pd.read_csv(os.path.join(input_dir, "patient_notes.csv"))
test_df = pd.read_csv(os.path.join(input_dir, "test.csv"))
sample_submission = pd.read_csv(os.path.join(input_dir, "sample_submission.csv"))

train_df["annotation"] = train_df["annotation"].apply(ast.literal_eval)
train_df["location"] = train_df["location"].apply(ast.literal_eval)

In [4]:
train_df.head()

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724]
1,00016_001,0,16,1,"[mom with ""thyroid disease]",[668 693]
2,00016_002,0,16,2,[chest pressure],[203 217]
3,00016_003,0,16,3,"[intermittent episodes, episode]","[70 91, 176 183]"
4,00016_004,0,16,4,[felt as if he were going to pass out],[222 258]


In [5]:
train_df.tail(20)

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location
14280,95330_914,9,95330,914,[photophobia],[270 281]
14281,95330_915,9,95330,915,[No sick contacts],[340 356]
14282,95330_916,9,95330,916,[Felt warm],[358 367]
14283,95333_900,9,95333,900,"[Did not respond to ibuprofen, Did not respond...","[212 240, 212 230;242 249]"
14284,95333_901,9,95333,901,[20 year old],[22 33]
14285,95333_902,9,95333,902,[yesterday],[76 85]
14286,95333_903,9,95333,903,[],[]
14287,95333_904,9,95333,904,[],[]
14288,95333_905,9,95333,905,[Neck stiffness],[338 352]
14289,95333_906,9,95333,906,[vomiting],[365 373]


In [20]:
def create_labels_for_scoring(df):
    # example: ['0 1', '3 4'] -> ['0 1; 3 4']
    df = df.copy()
    df['location_for_create_labels'] = [ast.literal_eval(f'[]')] * len(df)
    for i in range(len(df)):
        lst = df.loc[i, 'location']
        if lst:
            new_lst = ';'.join(lst)
            df.loc[i, 'location_for_create_labels'] = ast.literal_eval(f'[["{new_lst}"]]')
    # create labels
    truths = []
    for location_list in df['location_for_create_labels'].values:
        truth = []
        if len(location_list) > 0:
            location = location_list[0]
            for loc in [s.split() for s in location.split(';')]:
                start, end = int(loc[0]), int(loc[1])
                truth.append([start, end])
        truths.append(truth)
    return truths

create_labels_for_scoring(train_df.iloc[:10])

[[[696, 724]],
 [[668, 693]],
 [[203, 217]],
 [[70, 91], [176, 183]],
 [[222, 258]],
 [],
 [[321, 329], [404, 413], [652, 661]],
 [],
 [],
 [[26, 38], [96, 118]]]

In [6]:
test_df

Unnamed: 0,id,case_num,pn_num,feature_num
0,00016_000,0,16,0
1,00016_001,0,16,1
2,00016_002,0,16,2
3,00016_003,0,16,3
4,00016_004,0,16,4


In [7]:
features_df

Unnamed: 0,feature_num,case_num,feature_text
0,0,0,Family-history-of-MI-OR-Family-history-of-myoc...
1,1,0,Family-history-of-thyroid-disorder
2,2,0,Chest-pressure
3,3,0,Intermittent-symptoms
4,4,0,Lightheaded
...,...,...,...
138,912,9,Family-history-of-migraines
139,913,9,Female
140,914,9,Photophobia
141,915,9,No-known-illness-contacts


In [8]:
patient_notes_df

Unnamed: 0,pn_num,case_num,pn_history
0,0,0,"17-year-old male, has come to the student heal..."
1,1,0,17 yo male with recurrent palpitations for the...
2,2,0,Dillon Cleveland is a 17 y.o. male patient wit...
3,3,0,a 17 yo m c/o palpitation started 3 mos ago; \...
4,4,0,17yo male with no pmh here for evaluation of p...
...,...,...,...
42141,95330,9,Ms. Madden is a 20 yo female presenting w/ the...
42142,95331,9,A 20 YO F CAME COMPLAIN A DULL 8/10 HEADACHE T...
42143,95332,9,Ms. Madden is a 20yo female who presents with ...
42144,95333,9,Stephanie madden is a 20 year old woman compla...


In [9]:
train_df = train_df.merge(features_df, on=["feature_num", "case_num"], how="left")
train_df = train_df.merge(patient_notes_df, on=["pn_num", "case_num"], how="left")

def process_feature_text(text):
    text = re.sub('I-year', '1-year', text)
    text = re.sub('-OR-', " or ", text)
    text = re.sub('-', ' ', text)
    return text

train_df["pn_history"] = train_df["pn_history"].apply(process_feature_text)
train_df

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,feature_text,pn_history
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724],Family-history-of-MI-OR-Family-history-of-myoc...,HPI: 17yo M presents with palpitations. Patien...
1,00016_001,0,16,1,"[mom with ""thyroid disease]",[668 693],Family-history-of-thyroid-disorder,HPI: 17yo M presents with palpitations. Patien...
2,00016_002,0,16,2,[chest pressure],[203 217],Chest-pressure,HPI: 17yo M presents with palpitations. Patien...
3,00016_003,0,16,3,"[intermittent episodes, episode]","[70 91, 176 183]",Intermittent-symptoms,HPI: 17yo M presents with palpitations. Patien...
4,00016_004,0,16,4,[felt as if he were going to pass out],[222 258],Lightheaded,HPI: 17yo M presents with palpitations. Patien...
...,...,...,...,...,...,...,...,...
14295,95333_912,9,95333,912,[],[],Family-history-of-migraines,Stephanie madden is a 20 year old woman compla...
14296,95333_913,9,95333,913,[],[],Female,Stephanie madden is a 20 year old woman compla...
14297,95333_914,9,95333,914,[photobia],[274 282],Photophobia,Stephanie madden is a 20 year old woman compla...
14298,95333_915,9,95333,915,[no sick contacts],[421 437],No-known-illness-contacts,Stephanie madden is a 20 year old woman compla...


In [10]:
train_df.tail(30)

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,feature_text,pn_history
14270,95330_904,9,95330,904,"[HA around her head, HA diffuse]","[53 55;225 240, 53 55;207 214]",Global-headache-OR-diffuse-headache,Ms. Madden is a 20 yo female presenting w/ the...
14271,95330_905,9,95330,905,[],[],Neck-pain,Ms. Madden is a 20 yo female presenting w/ the...
14272,95330_906,9,95330,906,[vomited],[318 325],Vomiting,Ms. Madden is a 20 yo female presenting w/ the...
14273,95330_907,9,95330,907,[No rashes],[377 379;419 425],No-rash,Ms. Madden is a 20 yo female presenting w/ the...
14274,95330_908,9,95330,908,[nausea],[306 312],Nausea,Ms. Madden is a 20 yo female presenting w/ the...
14275,95330_909,9,95330,909,[],[],viral-symptoms-OR-rhinorrhea-OR-scratchy-throat,Ms. Madden is a 20 yo female presenting w/ the...
14276,95330_910,9,95330,910,[],[],Shares-an-apartment,Ms. Madden is a 20 yo female presenting w/ the...
14277,95330_911,9,95330,911,[],[],Meningococcal-vaccine-status-unknown,Ms. Madden is a 20 yo female presenting w/ the...
14278,95330_912,9,95330,912,"[mother migraines, FH migraines]","[641 647;651 660, 637 639;651 660]",Family-history-of-migraines,Ms. Madden is a 20 yo female presenting w/ the...
14279,95330_913,9,95330,913,"[Ms, female]","[0 2, 22 28]",Female,Ms. Madden is a 20 yo female presenting w/ the...


In [11]:
# model_name = "microsoft/deberta-base"
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# tokenizer = AutoTokenizer.from_pretrained(model_name, trim_offsets=False)

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [21]:
AutoConfig.from_pretrained("microsoft/deberta-base")

Downloading:   0%|          | 0.00/474 [00:00<?, ?B/s]

DebertaConfig {
  "_name_or_path": "microsoft/deberta-base",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "c2p",
    "p2c"
  ],
  "position_biased_input": false,
  "relative_attention": true,
  "transformers_version": "4.17.0",
  "type_vocab_size": 0,
  "vocab_size": 50265
}

In [12]:
encoding = tokenizer(
    train_df["pn_history"][14295],
    train_df["feature_text"][14295],
    # max_length=400,
    # padding="max_length",
    return_offsets_mapping=True,
)
encoding

{'input_ids': [0, 25093, 4134, 324, 475, 23004, 16, 10, 291, 76, 793, 693, 13689, 9, 19344, 4, 23689, 880, 2350, 662, 8, 34, 57, 562, 3007, 4, 38776, 5891, 4285, 700, 741, 45931, 6, 3007, 19, 3051, 6, 33842, 81, 6, 117, 32216, 16158, 2433, 4, 6553, 45, 2519, 7, 34154, 658, 1001, 22132, 6, 255, 4360, 225, 1168, 6, 50, 3581, 4, 20722, 5069, 9, 17190, 33693, 6, 53, 45, 43676, 24938, 6, 117, 34705, 4, 36994, 2088, 11696, 452, 4, 33224, 37760, 4, 234, 17498, 102, 8, 23600, 4, 440, 24719, 219, 1825, 50, 1109, 19279, 1825, 6, 117, 8269, 6, 117, 4736, 9872, 4, 50121, 50118, 500, 3196, 35, 3680, 2430, 50121, 50118, 5683, 725, 35, 4146, 50121, 50118, 3888, 725, 35, 4146, 50121, 50118, 725, 16497, 35, 4146, 50121, 50118, 21243, 35, 27178, 797, 13106, 50121, 50118, 3684, 11249, 918, 35, 4146, 50121, 50118, 29971, 35, 1364, 23, 7798, 205, 1400, 6, 1074, 19, 929, 877, 6, 117, 45510, 1043, 3036, 6, 4400, 2678, 132, 155, 6696, 15, 12729, 6, 4401, 2161, 4989, 155, 204, 24944, 10, 186, 6, 5912, 2171, 8,

In [13]:
# nakama baseline
def create_label(tokenizer, text_idx):
    encoding = tokenizer(train_df["pn_history"][text_idx], max_length=500, padding="max_length", return_offsets_mapping=True)
    location_list = train_df["location"][text_idx]
    annotation_length = len(location_list)
    offset_mapping = encoding["offset_mapping"]

    ignore_idx = np.where(np.array(encoding.sequence_ids()) != 0)[0]
    label = np.zeros(len(offset_mapping))
    label[ignore_idx] = -1

    if annotation_length != 0:
        for location in location_list:
            for loc in [s.split() for s in location.split(";")]:
                print("loc :", loc)
                start_idx = -1
                end_idx = -1
                start, end = int(loc[0]), int(loc[1])
                for idx in range(len(offset_mapping)):
                    if (start_idx == -1) & (start < offset_mapping[idx][0]):
                        start_idx = idx - 1
                        # print("start_idx :", start_idx)
                    if (end_idx == -1) & (end <= offset_mapping[idx][1]):
                        end_idx = idx + 1
                        print("end_idx :", end_idx)
                if start_idx == -1:
                    print(f"start_idx == end_idx in {text_idx}")
                    start_idx = end_idx
                    print("start_idx :", start_idx)
                if (start_idx != 1) & (end_idx != 1):
                    print("span")
                    print("start_idx :", start_idx)
                    print("end_idx :", end_idx)
                    label[start_idx: end_idx] = 1
    return label


def create_label_v2(tokenizer, text_idx):
    encoding = tokenizer(train_df["pn_history"][text_idx], max_length=500, padding="max_length", return_offsets_mapping=True)
    location_list = train_df["location"][text_idx]
    annotation_length = len(location_list)
    offset_mapping = encoding["offset_mapping"]

    ignore_idx = np.where(np.array(encoding.sequence_ids()) != 0)[0]
    label = np.zeros(len(offset_mapping))
    label[ignore_idx] = -1

    if annotation_length != 0:
        for location in location_list:
            for loc in [s.split() for s in location.split(";")]:
                # print("loc :", loc)
                start_idx = -1
                end_idx = -1
                start, end = int(loc[0]), int(loc[1])
                for idx in range(len(offset_mapping)):
                    if (start_idx == -1) & (start <= offset_mapping[idx][0] + 1):
                        start_idx = idx
                        print("start_idx :", start_idx)
                    if (end_idx == -1) & (end <= offset_mapping[idx][1]):
                        end_idx = idx + 1
                        print("end_idx :", end_idx)
                if start_idx == -1:
                    print(f"start_idx == end_idx in {text_idx}")
                    start_idx = end_idx
                    print("start_idx :", start_idx)
                if (start_idx != 1) & (end_idx != 1):
                    print("span")
                    print("start_idx :", start_idx)
                    print("end_idx :", end_idx)
                    label[start_idx: end_idx] = 1
    return label

create_label_v2(tokenizer, 1417)

start_idx : 123
end_idx : 124
span
start_idx : 123
end_idx : 124
start_idx : 132
end_idx : 133
span
start_idx : 132
end_idx : 133


array([-1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1

In [14]:
def get_char_probs(texts, predictions, tokenizer):
    """
    予測値をtoken-level -> char-levelに変形
    """
    results = [np.zeros(len(t)) for t in texts]
    for i, (text, prediction) in enumerate(zip(texts, predictions)):
        encoded = tokenizer(text, 
                            add_special_tokens=True,
                            return_offsets_mapping=True)
        for idx, (offset_mapping, pred) in enumerate(zip(encoded['offset_mapping'], prediction)):
            start = offset_mapping[0]
            end = offset_mapping[1]
            results[i][start:end] = pred
    return results


def get_results(char_probs, th=0.5):
    results = []
    for char_prob in char_probs:
        result = np.where(char_prob >= th)[0] + 1
        result = [list(g) for _, g in itertools.groupby(result, key=lambda n, c=itertools.count(): n - next(c))]
        result = [f"{min(r)} {max(r)}" for r in result]
        result = ";".join(result)
        results.append(result)
    return results


def get_predictions(results):
    predictions = []
    for result in results:
        prediction = []
        if result != "":
            for loc in [s.split() for s in result.split(';')]:
                start, end = int(loc[0]), int(loc[1])
                prediction.append([start, end])
        predictions.append(prediction)
    return predictions

In [15]:
pred = np.array([0.1, 0.4, 0.5, 0.5, 0.2, 0.1, 0.6, 0.7, 0.6, 0.1, 0.4, 0.5, 0.5])
result = np.where(pred >= 0.5)[0] + 1
result = [list(g) for _, g in itertools.groupby(result, key=lambda n, c=itertools.count(): n - next(c))]
result

[[3, 4], [7, 8, 9], [12, 13]]

In [16]:
result = [f"{min(r)} {max(r)}" for r in result]
result

['3 4', '7 9', '12 13']

In [17]:
result = ";".join(result)
result

'3 4;7 9;12 13'

In [18]:
prediction = []
if result != "":
    for loc in [s.split() for s in result.split(';')]:
        start, end = int(loc[0]), int(loc[1])
        prediction.append([start, end])
prediction

[[3, 4], [7, 9], [12, 13]]