In [1]:
import numpy as np 
import pandas as pd
import seaborn as sns 
import plotly.express as px
import matplotlib.pyplot as plt 
import plotly.graph_objects as go
import spacy
import warnings 
import wordcloud 




import os
# Walk the Kaggle input directory and print the full path of every file found.
# The sub-directory list is bound to a throwaway name other than `_` so that
# IPython's "last output" variable is not overwritten.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))



/kaggle/input/nbme-score-clinical-patient-notes/sample_submission.csv
/kaggle/input/nbme-score-clinical-patient-notes/patient_notes.csv
/kaggle/input/nbme-score-clinical-patient-notes/features.csv
/kaggle/input/nbme-score-clinical-patient-notes/train.csv
/kaggle/input/nbme-score-clinical-patient-notes/test.csv
/kaggle/input/nbmebertv1/label_encoder.dill
/kaggle/input/nbmebertv1/labels.npy
/kaggle/input/nbmebertv1/sequences.npy
/kaggle/input/nbmebertv1/masks.npy
/kaggle/input/nbmebertv1/model.h5
/kaggle/input/nbmebertv1/my_tokenizer/config.json
/kaggle/input/nbmebertv1/my_tokenizer/tokenizer.json
/kaggle/input/nbmebertv1/my_tokenizer/tokenizer_config.json
/kaggle/input/nbmebertv1/my_tokenizer/special_tokens_map.json
/kaggle/input/nbmebertv1/my_tokenizer/vocab.txt


## **Step 1: Gather data, determine the method of data collection and provenance of the data**

The data source I have selected is from the National Board of Medical Examiners (NBME) and is a collection of around 40,000 patient notes (some of the patient notes are annotated). The data is available on Kaggle at the following [link](https://www.kaggle.com/competitions/nbme-score-clinical-patient-notes/data). 

## **Step 2: Identify a Deep Learning Problem**

**Project Topic:** The goal of this project is to reduce the need for human and financial resources involved in scoring patient note exams for medical students and residents. In order to do this, we must create a model that can identify specific clinical concepts in patient notes to determine if students identified and documented the correct observations when examining and interviewing the standardized patient. 

This is a natural language processing (NLP) problem. 

## **Step 3: Exploratory Data Analysis (EDA) - Inspect, Visualize, and Clean the Data**


In [2]:
# Read in the data: every competition CSV lives in the same input directory.
NBME_DIR = "../input/nbme-score-clinical-patient-notes"

df_train = pd.read_csv(f"{NBME_DIR}/train.csv")
df_test = pd.read_csv(f"{NBME_DIR}/test.csv")
df_features = pd.read_csv(f"{NBME_DIR}/features.csv")
df_patient_notes = pd.read_csv(f"{NBME_DIR}/patient_notes.csv")
df_sample_submission = pd.read_csv(f"{NBME_DIR}/sample_submission.csv")


In [3]:
df_train.shape

(14300, 6)

In [4]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14300 entries, 0 to 14299
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           14300 non-null  object
 1   case_num     14300 non-null  int64 
 2   pn_num       14300 non-null  int64 
 3   feature_num  14300 non-null  int64 
 4   annotation   14300 non-null  object
 5   location     14300 non-null  object
dtypes: int64(3), object(3)
memory usage: 670.4+ KB


In [5]:
df_train.head()

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location
0,00016_000,0,16,0,['dad with recent heart attcak'],['696 724']
1,00016_001,0,16,1,"['mom with ""thyroid disease']",['668 693']
2,00016_002,0,16,2,['chest pressure'],['203 217']
3,00016_003,0,16,3,"['intermittent episodes', 'episode']","['70 91', '176 183']"
4,00016_004,0,16,4,['felt as if he were going to pass out'],['222 258']


We can see above that the training data has 14,300 rows with 6 columns. Definitions for these 6 columns are below. 

`id` - Unique identifier for each patient note / feature pair.

`case_num` - The case to which this patient note belongs.

`pn_num` - The patient note annotated in this row.

`feature_num` - The feature annotated in this row.

`annotation` - The text(s) within a patient note indicating a feature. A feature may be indicated multiple times within a single note.

`location` - Character spans indicating the location of each annotation within the note. Multiple spans may be needed to represent an annotation, in which case the spans are delimited by a semicolon.


In [6]:
df_test.shape

(5, 4)

In [7]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           5 non-null      object
 1   case_num     5 non-null      int64 
 2   pn_num       5 non-null      int64 
 3   feature_num  5 non-null      int64 
dtypes: int64(3), object(1)
memory usage: 288.0+ bytes


In [8]:
df_test.head()

Unnamed: 0,id,case_num,pn_num,feature_num
0,00016_000,0,16,0
1,00016_001,0,16,1
2,00016_002,0,16,2
3,00016_003,0,16,3
4,00016_004,0,16,4


The test set only has 5 rows with 4 of the columns from the training data. Annotation and location are not included. 

In [9]:
df_patient_notes.shape 


(42146, 3)

In [11]:
df_patient_notes.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42146 entries, 0 to 42145
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   pn_num      42146 non-null  int64 
 1   case_num    42146 non-null  int64 
 2   pn_history  42146 non-null  object
dtypes: int64(2), object(1)
memory usage: 987.9+ KB


In [12]:
df_patient_notes.head()


Unnamed: 0,pn_num,case_num,pn_history
0,0,0,"17-year-old male, has come to the student heal..."
1,1,0,17 yo male with recurrent palpitations for the...
2,2,0,Dillon Cleveland is a 17 y.o. male patient wit...
3,3,0,a 17 yo m c/o palpitation started 3 mos ago; \...
4,4,0,17yo male with no pmh here for evaluation of p...


In [13]:
print(df_patient_notes["pn_history"].iloc[111])

17 y/o previously healthy male presents for episodic palpitations. They last 3-4 minutes and resolve on their own. They happen both at rest and during exercise - he has not noticed any pattern to their recurrence. Last time this happened he got light-headed and sweaty and felt chest pressure about five minutes into playing basketball. He denies shortness of breath. Endorses drinking coffee and red bull regularly.
PMH: none
PSH: none
Allergies: none
Meds: adderall from roommate 1-2 times a week for the past 7-8 months
FHx: no sudden death at a young age in the family, dad had a heart attack @ 52 but recovered well
Social: drinks 3-4 beers on weekends, denies smoking, tried pot once otherwise no drug use, started college 7-8 months ago, lives on campus with a roommate, unsure of what he wants to study


In [14]:
print(df_patient_notes["pn_history"].iloc[222])

Mr. Cleveland is a 17 yo boy who presents with 2-3 months of episodic heart pounding. Currently he feels fine but has episodes where his heart beats fast and feels like it "is jumping out of his chest". No association with exercise and cannot identify any triggers, occurs randomly. Two days ago, he had an episode of heart pounding where he felt a pressure in the center of his chest, SOB, and light-headed. He denies syncope, diaphoresis, weight changes, fever, chills, nausea, heat or cold intolerance. 
ROS: Negative except as above
PMHx: None
PSHx: None
FHx: Father had an MI last year; mother has a "thyroid problem"
Meds: Recently started using his roommate's Adderall, has used it in the past but uses more frequently now in college. 
SHx: Full-time college student, lives with a roommate. Drinks 3-4 beers on the weekends, denies use of tobacco or other drugs. Runs 2-3 miles a day and plays basketball. No acute stressors


In [17]:
print(df_patient_notes["pn_history"].iloc[11111])

HPI: 35 YO M C/O OF EPIGASTRIC ABDOMINAL PAIN OF 2 MONTHS . PAIN IS BURNING, INTERMITTENT, NONRADIATING, RELIEVED BY ANTACIDS. NOTHING WORSENS IT, NO RELATION TO FOOD. PAIN OCCURS TWICE A DAY. THE PATIENT ALSO REPORTS MELENA, AMD BLOATING. NO CHANGES IN WEIGHT OR APPETITE BUT HE HAS REDUCED THE AMOUNT  OF FOOD HE TAKES IN BECAUSE OF BLOATING AFTER MEALS. NO HISTORY OF FEVER OR DIARRHEA OR CONSTIPATION. PAIN ALSO WAKES HIM UP FROM SLEEP AND THIS HAS CAUSED FATIGUE AND AFFECTED HIS WORK. NO CHEST PAIN OR SHORTNESS OF BREATH. 
ROS: NEGATIVE EXCEPT AS ABOVE
ALLERGIES: NKDA
MEDICATIONS: ANTACIDS
PMH: HAS HAD BACK PAIN AND MUSCLE SPASMS FROM CONSTRUCTION WORK
PSH: NONE
FH: UNCLE HAD A BLEEDING ULCER
SH: SMOKES 1 PPD FOR 18 YEARS, DRINKS 2-3 BEERS PER WEEK, WORKS AS A CONSTRUCTION WORKER


In [18]:
print(df_patient_notes["pn_history"].iloc[22222])

26 yof with h/o palpitations. Onset was 5 years ago but has gotten worse in the last 3 weeks. No alleviating factors, no aggravating factors, no triggers to her palpitations. 3 months ago she has moved into a new condominum and lost her job 2 months ago. Associated with SOB, throat tightness, feeling more hot, clammy hands, nausea, fatgue, concentration deficit. No chest pain HA, vomiting, fever, urinary/bowel issue, sleep change, appetite change, unexpected wt loss. 2 wks ago was in ED for same complaint: CBC, CMP, ECG, cardiac enzymes were normal
ROS: negative except as above
Allergy: none; meds:none
FH: none 
SH: unemployed, lives alone, no tobacco, drinking, illicit use; sexually active with 1 male, uses condoms, not STD hx


The patient notes data has 42,146 rows with 3 columns, the most important of which is the patient note created by the student stored in `pn_history`.

In the above output you can also see a few examples of patient notes from different case numbers. 

Next, I would like to understand if the notes are distributed evenly among the 10 patient cases. 

In [22]:
# Count the rows of the patient-notes table per case and show them as a bar chart.
notes_counts = df_patient_notes.groupby("case_num").count()

# Tick positions 0..9 and their "Case N" labels are generated instead of typed out.
case_ticks = list(range(10))

fig = px.bar(
    data_frame=notes_counts,
    x=notes_counts.index,
    y='pn_num',
    color="pn_num",
    color_continuous_scale="Cividis",
)
fig.update_layout(
    title=dict(
        text='Distribution of Patient Notes by Patient Case',
        y=0.95,
        x=0.48,
        xanchor='center',
        yanchor='top',
    ),
    xaxis=dict(
        tickmode='array',
        tickvals=case_ticks,
        ticktext=[f'Case {case}' for case in case_ticks],
    ),
    template="plotly_white",
)
fig.show()

Here we can see the notes are not evenly distributed among the cases, with Case 3 having the most notes. 

In [23]:
df_features.shape


(143, 3)

In [24]:
df_features.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143 entries, 0 to 142
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   feature_num   143 non-null    int64 
 1   case_num      143 non-null    int64 
 2   feature_text  143 non-null    object
dtypes: int64(2), object(1)
memory usage: 3.5+ KB


In [25]:
df_features.head()

Unnamed: 0,feature_num,case_num,feature_text
0,0,0,Family-history-of-MI-OR-Family-history-of-myoc...
1,1,0,Family-history-of-thyroid-disorder
2,2,0,Chest-pressure
3,3,0,Intermittent-symptoms
4,4,0,Lightheaded


In [26]:
df_features["feature_text"].iloc[0]

'Family-history-of-MI-OR-Family-history-of-myocardial-infarction'

In the above output, we can see that the features dataframe has 143 rows and 3 columns. Two of these columns, `feature_num` and `case_num`, also appear in the other dataframes; the third, `feature_text`, is new and holds the text of the concepts we are attempting to identify. 

In the above output you can also see an example of the feature text that corresponds to the 1st example patient note above. 

Similar to the patient notes, I would like to see if the features are evenly distributed across the 10 cases. 

In [28]:
# Count the rows of the features table per case and show them as a bar chart.
feature_counts = df_features.groupby("case_num").count()

# Tick positions 0..9 and their "Case N" labels are generated instead of typed out.
case_ticks = list(range(10))

fig = px.bar(
    data_frame=feature_counts,
    x=feature_counts.index,
    y='feature_num',
    color="feature_num",
    color_continuous_scale="Cividis",
)
fig.update_layout(
    title=dict(
        text='Distribution of Features by Case',
        y=0.95,
        x=0.48,
        xanchor='center',
        yanchor='top',
    ),
    xaxis=dict(
        tickmode='array',
        tickvals=case_ticks,
        ticktext=[f'Case {case}' for case in case_ticks],
    ),
    template="plotly_white",
)
fig.show()


They appear to be more evenly distributed, but there are still some cases (case 4 and case 7) that have significantly fewer features than the others.

Next, I want to take a closer look at the patient level data. There should be (according to the data description) 1000 patients with annotations for the target features in the training dataset. I want to verify this and see what the annotations look like. 

In [29]:
# Number of distinct patient notes (pn_num values) in the training dataset

df_train["pn_num"].nunique()

1000

In [43]:
ex_patient = df_train[df_train["pn_num"] == 16]

ex_patient

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location
0,00016_000,0,16,0,['dad with recent heart attcak'],['696 724']
1,00016_001,0,16,1,"['mom with ""thyroid disease']",['668 693']
2,00016_002,0,16,2,['chest pressure'],['203 217']
3,00016_003,0,16,3,"['intermittent episodes', 'episode']","['70 91', '176 183']"
4,00016_004,0,16,4,['felt as if he were going to pass out'],['222 258']
5,00016_005,0,16,5,[],[]
6,00016_006,0,16,6,"['adderall', 'adderrall', 'adderrall']","['321 329', '404 413', '652 661']"
7,00016_007,0,16,7,[],[]
8,00016_008,0,16,8,[],[]
9,00016_009,0,16,9,"['palpitations', 'heart beating/pounding']","['26 38', '96 118']"


In [44]:
# Print the example note in blue and its annotations in green using ANSI
# escape codes. The colour is reset at the end so that output printed by
# later cells is not tinted green.
ANSI_BLUE, ANSI_GREEN, ANSI_RESET = '\033[94m', '\033[92m', '\033[0m'

example_note_text = df_patient_notes[df_patient_notes["pn_num"] == 16]["pn_history"].iloc[0]

print(f"{ANSI_BLUE}Patient Notes - ")
print(ANSI_BLUE, example_note_text)
print("------------")
print(f"{ANSI_GREEN}Annotations:")
for annotation_text in ex_patient["annotation"]:
    print(ANSI_GREEN, annotation_text)
print(ANSI_RESET, end="")

[94mPatient Notes - 
[94m HPI: 17yo M presents with palpitations. Patient reports 3-4 months of intermittent episodes of "heart beating/pounding out of my chest." 2 days ago during a soccer game had an episode, but this time had chest pressure and felt as if he were going to pass out (did not lose conciousness). Of note patient endorses abusing adderall, primarily to study (1-3 times per week). Before recent soccer game, took adderrall night before and morning of game. Denies shortness of breath, diaphoresis, fevers, chills, headache, fatigue, changes in sleep, changes in vision/hearing, abdominal paun, changes in bowel or urinary habits. 
PMHx: none
Rx: uses friends adderrall
FHx: mom with "thyroid disease," dad with recent heart attcak
All: none
Immunizations: up to date
SHx: Freshmen in college. Endorses 3-4 drinks 3 nights / week (on weekends), denies tabacco, endorses trying marijuana. Sexually active with girlfriend x 1 year, uses condoms
------------
[92mAnnotaions:
[92m ['d

In the above output you can see the patient data, an example note, and the accompanying annotations that correspond to the first example case from the beginning of this notebook. 

There are several missing annotations above. I would like to see how many are missing in the dataset. 

In [46]:
# Number of missing annotations: rows whose location is the literal string '[]'

(df_train["location"] == '[]').sum()

4399

In [52]:
import ast


def _parse_list_literal(value):
    """Turn a stringified Python list (e.g. "['70 91']") into a real list.

    ``ast.literal_eval`` only accepts Python literals, whereas ``eval`` would
    execute any expression that happened to be stored in the CSV.
    Non-string values are returned unchanged, so re-running this cell after
    the columns have already been parsed is a no-op instead of a TypeError.
    """
    return ast.literal_eval(value) if isinstance(value, str) else value


df_train["location"] = df_train["location"].apply(_parse_list_literal)
df_train['annotation'] = df_train['annotation'].apply(_parse_list_literal)

In [57]:
# Annotation Visualization
# Highlights every annotated character span of patient note 16 with displaCy.
# Assumes the `location` column was already parsed into lists of "start end"
# strings by the earlier parsing cell.

ex_patient = df_train[df_train["pn_num"] == 16]
location = ex_patient["location"]
annotation = ex_patient["annotation"]

# Flatten all spans of all features, splitting each "start end" string once.
span_parts = [span.split() for span_list in location for span in span_list]
start_pos = [parts[0] for parts in span_parts]
end_pos = [parts[1] for parts in span_parts]

# displaCy's manual mode takes character offsets plus a label per entity.
ents = [
    {'start': int(begin), 'end': int(finish), "label": "Annotation"}
    for begin, finish in zip(start_pos, end_pos)
]

note_text = df_patient_notes[df_patient_notes["pn_num"] == 16]["pn_history"].iloc[0]
doc = {'text': note_text, "ents": ents}

colors = {"Annotation": "linear-gradient(90deg, #aa9cfc, #fc9ce7)"}
options = {"colors": colors}
spacy.displacy.render(doc, style="ent", options=options, manual=True, jupyter=True);

### EDA Summary: Please see comments throughout. 

## Step 4: Perform Analysis Using Deep Learning Models of your Choice, Present Discussion, and Conclusions 

### **Model 1: BERT**

In [2]:
# Importing libraries needed for modeling 

from sklearn.preprocessing import  LabelEncoder
from tqdm.auto import tqdm
import random
import tensorflow as tf
#import tensorflow.keras as keras
#import tensorflow.keras.layers as layers
import dill
#import tensorflow.keras.backend as K
from tqdm.auto import tqdm
from tensorflow.keras import mixed_precision
from transformers import AutoTokenizer, AutoConfig,TFAutoModel
import json

2024-03-04 00:51:52.531776: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-04 00:51:52.531900: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-04 00:51:52.652590: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [17]:
print("TensorFlow version:", tf.__version__)

TensorFlow version: 2.15.0


In [3]:
# Pick a tf.distribute strategy: try to connect to a TPU first and fall back to
# MirroredStrategy when the TPU resolver raises ValueError.
try: # detect TPUs
    tpu  = tf.distribute.cluster_resolver.TPUClusterResolver.connect() # TPU detection
    tf.config.experimental_connect_to_cluster(tpu )
    tf.tpu.experimental.initialize_tpu_system(tpu )
    strategy = tf.distribute.TPUStrategy(tpu )
    print('Using TPU')
except ValueError: # detect GPUs
    # No TPU could be resolved; `tpu` is set to None so the name always exists.
    tpu = None
    strategy = tf.distribute.MirroredStrategy() # for GPU or multi-GPU machines
    #strategy = tf.distribute.get_strategy() # default strategy that works on CPU and single GPU
    #strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() # for clusters of multi-GPU machines

print("Number of accelerators: ", strategy.num_replicas_in_sync)


# Notebook-wide shorthands: the tf.data autotune sentinel and the replica count.
AUTO = tf.data.experimental.AUTOTUNE
REPLICAS = strategy.num_replicas_in_sync
print(f'REPLICAS: {REPLICAS}')

Number of accelerators:  1
REPLICAS: 1


In [4]:
# Seed Python, NumPy and TensorFlow random number generators with the same value.
seed=777
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)
# Request automatic mixed precision both through the environment variable and
# through the grappler optimizer option.
# NOTE(review): the environment variable is set after TensorFlow was imported;
# confirm whether it still has an effect at this point.
os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '1'
tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})
print('Mixed precision enabled')

Mixed precision enabled


In [5]:
TRAIN = False

In [6]:
# Fresh load of data in case EDA altered anything 

df_features = pd.read_csv("../input/nbme-score-clinical-patient-notes/features.csv")
df_patient_notes = pd.read_csv("../input/nbme-score-clinical-patient-notes/patient_notes.csv")
df_test = pd.read_csv("../input/nbme-score-clinical-patient-notes/test.csv")
df_train= pd.read_csv("../input/nbme-score-clinical-patient-notes/train.csv")
df_sample_submission= pd.read_csv("../input/nbme-score-clinical-patient-notes/sample_submission.csv")

In [7]:
# Attach the note text (pn_history) and the feature description (feature_text)
# to every train/test row. No `how=` is passed, so pandas' default join type
# applies to both merges.
# NOTE: this cell rebinds df_train/df_test, so re-running it without first
# re-running the loading cell above merges the already-merged frames again.
df_test = df_test.merge(df_patient_notes,on=['case_num','pn_num']).merge(df_features,on=['case_num','feature_num'])
df_train = df_train.merge(df_patient_notes,on=['case_num','pn_num']).merge(df_features,on=['case_num','feature_num'])

In [8]:
df_train.head()

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,pn_history,feature_text
0,00016_000,0,16,0,['dad with recent heart attcak'],['696 724'],HPI: 17yo M presents with palpitations. Patien...,Family-history-of-MI-OR-Family-history-of-myoc...
1,00016_001,0,16,1,"['mom with ""thyroid disease']",['668 693'],HPI: 17yo M presents with palpitations. Patien...,Family-history-of-thyroid-disorder
2,00016_002,0,16,2,['chest pressure'],['203 217'],HPI: 17yo M presents with palpitations. Patien...,Chest-pressure
3,00016_003,0,16,3,"['intermittent episodes', 'episode']","['70 91', '176 183']",HPI: 17yo M presents with palpitations. Patien...,Intermittent-symptoms
4,00016_004,0,16,4,['felt as if he were going to pass out'],['222 258'],HPI: 17yo M presents with palpitations. Patien...,Lightheaded


In [9]:
MODEL_NAME = 'bert-base-uncased'
DATA_PATH = "../input/nbmebertv1"
DATA_EXISTS = os.path.exists(DATA_PATH)
SEQUENCE_LENGTH = 512

In [10]:
# Load the tokenizer and model config from the pre-computed dataset when it is
# available; otherwise download them by model name and save local copies into
# ./my_tokenizer so they can be reused.
# NOTE(review): `normalization=True` is forwarded to the tokenizer as an extra
# keyword argument; confirm that it has any effect for this tokenizer.
if DATA_EXISTS:
    tokenizer = AutoTokenizer.from_pretrained(DATA_PATH+"/my_tokenizer/",normalization=True)
    config = AutoConfig.from_pretrained(DATA_PATH+"/my_tokenizer/config.json")
else:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME,normalization=True)
    config = AutoConfig.from_pretrained(MODEL_NAME)
    tokenizer.save_pretrained('my_tokenizer')
    config.save_pretrained('my_tokenizer')

In [11]:
# Class vocabulary: one "EMPTY" class for unannotated tokens plus one class per
# feature_num found in the features table.
EMPTY =  'EMPTY'
CLASSES = [EMPTY,]+df_features.feature_num.unique().tolist()

if DATA_EXISTS:
    # SECURITY: dill, like pickle, can execute arbitrary code while loading.
    # Only load encoder files that come from a trusted dataset.
    with open(DATA_PATH+"/label_encoder.dill",'rb') as encoder_file:
        label_encoder = dill.load(encoder_file)
else:
    # Fit a fresh encoder on the class vocabulary and persist it for later runs.
    label_encoder = LabelEncoder()
    label_encoder.fit(CLASSES)
    # The context manager guarantees the file is flushed and closed.
    with open('label_encoder.dill','wb') as encoder_file:
        dill.dump(label_encoder, encoder_file)

# Encode feature_num as class indices for both frames.
df_train['TARGET']= label_encoder.transform(df_train['feature_num'])
df_test['TARGET']= label_encoder.transform(df_test['feature_num'])
N_CLASSES = len(label_encoder.classes_)
EMPTY_IDX = label_encoder.transform([EMPTY,]) [0]

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [12]:
def decode_location(locations):
    """Parse a stringified location list into sorted (start, end) tuples.

    ``locations`` is text such as ``"['70 91', '176 183']"`` or
    ``"['26 38;96 118']"``. Brackets and quotes are dropped, commas and
    semicolons both act as span separators, and each remaining
    ``"start end"`` fragment becomes an ``(int, int)`` tuple. Empty fragments
    are ignored, so ``"[]"`` yields an empty list.

    Returns the spans ordered by start offset; spans that share a start keep
    their original relative order.
    """
    # Strip the list punctuation in a single pass.
    stripped = locations.translate(str.maketrans('', '', "[]'"))

    spans = []
    for fragment in stripped.replace(',', ';').split(';'):
        if not fragment:
            continue
        begin, finish = fragment.split()
        spans.append((int(begin), int(finish)))

    spans.sort(key=lambda span: span[0])
    return spans

In [13]:
if DATA_EXISTS:
    # Reuse the arrays stored in the pre-computed dataset. Passing the path
    # lets NumPy open and close the file itself.
    sequences = np.load(DATA_PATH+"/sequences.npy")
    masks = np.load(DATA_PATH+"/masks.npy")
    labels = np.load(DATA_PATH+"/labels.npy")
else:
    sequences, labels, masks = [], [], []
    # One training example per patient note: all rows of a note share pn_history.
    for _pn_num, gdf in tqdm(df_train.groupby('pn_num')):
        pn_history  = gdf.iloc[0].pn_history

        tokens = tokenizer.encode_plus(pn_history, max_length=SEQUENCE_LENGTH, padding='max_length',truncation=True, return_offsets_mapping=True)
        sequence = tokens['input_ids']
        attention_mask = tokens['attention_mask']
        # Every token starts as EMPTY and is overwritten where a span covers it.
        label = np.full(SEQUENCE_LENGTH, EMPTY_IDX)

        # BUILD THE TARGET ARRAY
        offsets = tokens['offset_mapping']
        label_empty = True
        for _row_index, row in gdf.iterrows():
            # Parse the row's spans once instead of once per token.
            spans = decode_location(row.location)
            if not spans:
                continue
            for i, (w_start, w_end) in enumerate(offsets):
                # Tokens with an empty character range can never be labelled.
                if w_start >= w_end:
                    continue
                # A token gets the row's class when it lies fully inside a span.
                if any(w_start >= start and end >= w_end for start, end in spans):
                    label[i] = row.TARGET
                    label_empty = False
        # Notes in which no token received a label are left out of training.
        if not label_empty:
            sequences.append(sequence)
            masks.append(attention_mask)
            labels.append(label)

    sequences = np.array(sequences).astype(np.int32)
    masks = np.array(masks).astype(np.uint8)
    labels = np.array(tf.keras.utils.to_categorical(labels,N_CLASSES)).astype(np.uint8)

    # Saving by path guarantees the files are flushed and closed.
    np.save("sequences.npy", sequences)
    np.save("masks.npy", masks)
    np.save("labels.npy", labels)

In [23]:
class CastAttentionLayer(tf.keras.layers.Layer):
    """Keras layer that casts its input tensor to int32."""

    def call(self, inputs):
        """Return ``inputs`` cast to ``tf.int32``."""
        return tf.cast(inputs, tf.int32)



In [36]:
def build_model():
    """Build the token-classification model: transformer backbone + softmax head.

    The model takes two int32 inputs of length ``SEQUENCE_LENGTH`` named
    ``tokens`` and ``attention`` and outputs, for every token position, a
    softmax distribution over ``N_CLASSES``.

    When ``DATA_EXISTS`` is true the backbone is created from the saved config
    only (the caller is expected to load weights afterwards); otherwise the
    pretrained ``MODEL_NAME`` checkpoint is loaded.
    """
    token_ids = tf.keras.layers.Input(shape=(SEQUENCE_LENGTH,), name='tokens', dtype=tf.int32)
    attention_mask = tf.keras.layers.Input(shape=(SEQUENCE_LENGTH,), name='attention', dtype=tf.int32)

    if DATA_EXISTS:
        saved_config = AutoConfig.from_pretrained(DATA_PATH+"/my_tokenizer/config.json")
        encoder = TFAutoModel.from_config(saved_config)
    else:
        hub_config = AutoConfig.from_pretrained(MODEL_NAME)
        encoder = TFAutoModel.from_pretrained(MODEL_NAME, config=hub_config)

    # The first element of the backbone output is fed to the classification head;
    # the attention mask is passed to the backbone rather than applied by hand.
    hidden_states = encoder(token_ids, attention_mask=attention_mask)[0]
    regularized = tf.keras.layers.Dropout(0.2)(hidden_states)
    class_probs = tf.keras.layers.Dense(N_CLASSES, activation='softmax')(regularized)

    return tf.keras.Model(inputs=[token_ids, attention_mask], outputs=class_probs)


In [15]:
# Train the BERT model only when TRAIN is set; otherwise this cell does nothing
# and the next cell loads saved weights instead.
if TRAIN:
    # Build and compile inside the distribution strategy's scope.
    with strategy.scope():
        model = build_model()

        # Stop when the training loss has not improved for 3 epochs. No
        # validation data is passed to fit(), so only the training loss exists.
        callback = tf.keras.callbacks.EarlyStopping(monitor='loss',mode='min', patience=3)

        # Compile the model
        model.compile(optimizer=tf.keras.optimizers.Adam(1e-5),
                      loss=tf.keras.losses.categorical_crossentropy,metrics=['acc',])

        history = model.fit((sequences,masks),labels,
                            batch_size=12,
                            epochs=10,
                            callbacks=[callback,])

        # Only the weights are stored; build_model() must be called before loading them.
        model.save_weights(f'model.h5')


In [None]:
if not TRAIN:
    model = build_model()
    model.load_weights(DATA_PATH+"/model.h5")

In [39]:
# Tokenize each test note once and remember, per note, which encoded targets
# (feature classes) were requested and which submission row id each maps to.
test_sequences, test_masks, test_offsets = [], [],[]
row_ids = []
targets = []

for g1 in tqdm(df_test.groupby('pn_num')):
    # g1 is a (pn_num, group frame) pair; all rows of a group share pn_history.
    gdf = g1[1]
    pn_history  = gdf.iloc[0].pn_history
    targets.append([])
    row_ids.append([])
    
    test_tokens = tokenizer.encode_plus(pn_history, max_length=SEQUENCE_LENGTH, padding='max_length',truncation=True, return_offsets_mapping=True)
    test_sequence = test_tokens['input_ids']
    test_attention_mask = test_tokens['attention_mask'] 

    # Keep the token -> character offsets of this note alongside its tokens.
    offset = test_tokens['offset_mapping']
    
    # Collect the requested targets and their row ids for this note.
    for index, row in gdf.iterrows():
        targets[-1].append(row.TARGET)
        row_ids[-1].append(row.id)
         
    test_sequences.append(test_sequence)
    test_masks.append(test_attention_mask)
    test_offsets.append(offset)

test_sequences = np.array(test_sequences).astype(np.int32)
test_masks = np.array(test_masks).astype(np.uint8)
# One dict per note mapping encoded target -> submission row id.
targets_to_row_ids = [dict(zip(a,b)) for a,b in zip(targets,row_ids)]

  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
preds = model.predict((test_sequences,test_masks),batch_size=16)
preds = np.argmax(preds,axis=-1)

### **Model 2: RoBERTa**


In [41]:
# Fresh read in of the data again in case any previous steps were destructive 

df_features = pd.read_csv("../input/nbme-score-clinical-patient-notes/features.csv")
df_patient_notes = pd.read_csv("../input/nbme-score-clinical-patient-notes/patient_notes.csv")
df_test = pd.read_csv("../input/nbme-score-clinical-patient-notes/test.csv")
df_train= pd.read_csv("../input/nbme-score-clinical-patient-notes/train.csv")
df_sample_submission= pd.read_csv("../input/nbme-score-clinical-patient-notes/sample_submission.csv")

In [42]:
df_test = df_test.merge(df_patient_notes,on=['case_num','pn_num']).merge(df_features,on=['case_num','feature_num'])
df_train = df_train.merge(df_patient_notes,on=['case_num','pn_num']).merge(df_features,on=['case_num','feature_num'])

In [43]:
df_train.head()

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,pn_history,feature_text
0,00016_000,0,16,0,['dad with recent heart attcak'],['696 724'],HPI: 17yo M presents with palpitations. Patien...,Family-history-of-MI-OR-Family-history-of-myoc...
1,00016_001,0,16,1,"['mom with ""thyroid disease']",['668 693'],HPI: 17yo M presents with palpitations. Patien...,Family-history-of-thyroid-disorder
2,00016_002,0,16,2,['chest pressure'],['203 217'],HPI: 17yo M presents with palpitations. Patien...,Chest-pressure
3,00016_003,0,16,3,"['intermittent episodes', 'episode']","['70 91', '176 183']",HPI: 17yo M presents with palpitations. Patien...,Intermittent-symptoms
4,00016_004,0,16,4,['felt as if he were going to pass out'],['222 258'],HPI: 17yo M presents with palpitations. Patien...,Lightheaded


In [55]:
MODEL_NAME = 'roberta-base'
TRAIN = True # True to train and evaluate the model
RESET = True # True to retrain the same model without using committed model files, generates tokens data
INPUT = '/kaggle/input/nbmebertv1' 

In [56]:
# Resolve the directory holding previously committed model files.
# os.path.join discards the '../input' prefix when INPUT is already an absolute
# path (e.g. '/kaggle/input/nbmebertv1') and prepends it when INPUT is a bare
# dataset name; the empty last component keeps the trailing separator.
input_path = os.path.join('../input', INPUT, '')
output_path = f'./'

if RESET:
    # Resetting: ignore committed files and regenerate the data arrays
    data_exists = False
else:
    # Not resetting: reuse the committed model files when they are present
    data_exists = os.path.exists(f'{input_path}/model.h5')

In [57]:
os.listdir('/kaggle/input/nbmebertv1')


['label_encoder.dill',
 'labels.npy',
 'sequences.npy',
 'masks.npy',
 'my_tokenizer',
 'model.h5']

In [58]:
TRAIN_SPLIT = 0.8
BATCH_SIZE = 12
EPOCHS = 20
SEQUENCE_LENGTH = 512
SEED = 999

In [59]:
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [73]:
# Select the GPU allocator and the tf.data auto-shard policy.
# AutoShardPolicy.DATA shards a distributed dataset by element; it does not
# disable sharding (AutoShardPolicy.OFF would).
# NOTE(review): TF_GPU_ALLOCATOR is set here after TensorFlow has already been
# imported and used; confirm that it still takes effect at this point.
os.environ["TF_GPU_ALLOCATOR"] = "cuda_malloc_async"
options = tf.data.Options()
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.DATA

# Sets output size for plots
plt.rcParams["figure.figsize"] = (15, 10)

In [60]:
# Class vocabulary: one "EMPTY" class for unannotated tokens plus one class per
# feature_num found in the features table.
EMPTY =  'EMPTY'
CLASSES = [EMPTY,]+df_features.feature_num.unique().tolist()

# NOTE(review): this branch is controlled by DATA_EXISTS / DATA_PATH from the
# BERT section, not by this section's `data_exists` / `input_path`, so RESET
# does not affect it. Confirm this is intended.
if DATA_EXISTS:
    # SECURITY: dill, like pickle, can execute arbitrary code while loading.
    # Only load encoder files that come from a trusted dataset.
    with open(DATA_PATH+"/label_encoder.dill",'rb') as encoder_file:
        label_encoder = dill.load(encoder_file)
else:
    # Fit a fresh encoder on the class vocabulary and persist it for later runs.
    label_encoder = LabelEncoder()
    label_encoder.fit(CLASSES)
    # The context manager guarantees the file is flushed and closed.
    with open('label_encoder.dill','wb') as encoder_file:
        dill.dump(label_encoder, encoder_file)

# Encode feature_num as class indices for both frames.
df_train['TARGET']= label_encoder.transform(df_train['feature_num'])
df_test['TARGET']= label_encoder.transform(df_test['feature_num'])
N_CLASSES = len(label_encoder.classes_)
EMPTY_IDX = label_encoder.transform([EMPTY,]) [0]

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [61]:
if data_exists:
    # If the data exists, load the tokenizer from disk
    tokenizer = AutoTokenizer.from_pretrained(f'{input_path}/my_tokenizer',normalization=True)
else:
    # Otherwise download the tokenizer by model name and save a local copy
    # under output_path so it can be reused.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME,normalization=True)
    tokenizer.save_pretrained(f'{output_path}/my_tokenizer')

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [66]:
if data_exists:
    # Loads data if it already exists on disk. Passing the path lets NumPy
    # open and close the file itself.
    sequences = np.load(f'{input_path}/sequences.npy')
    masks = np.load(f'{input_path}/masks.npy')
    labels = np.load(f'{input_path}/labels.npy')
else:
    # Initialize the lists
    sequences, labels, masks = [], [], []
    # Groups dataframe by 'pn_num' and iterates over each (pn_num, frame) pair
    for _pn_num, gdf in tqdm(df_train.groupby('pn_num')):
        # Get whole history note text for each patient
        pn_history  = gdf.iloc[0].pn_history

        # Apply the tokenizer on the patient history text
        tokens = tokenizer.encode_plus(
            pn_history,
            max_length=SEQUENCE_LENGTH,
            padding='max_length',
            truncation=True,
            return_offsets_mapping=True)
        # Token ids and attention mask, padded/truncated to the sequence length
        sequence = tokens['input_ids']
        attention_mask = tokens['attention_mask']
        # Every token starts as EMPTY and is overwritten where a span covers it
        label = np.full(SEQUENCE_LENGTH, EMPTY_IDX)

        # BUILDS THE TARGET ARRAY
        # Character offsets of every token in the note
        offsets = tokens['offset_mapping']
        label_empty = True
        for _row_index, row in gdf.iterrows():
            # Parse the row's spans once instead of once per token
            spans = decode_location(row.location)
            if not spans:
                continue
            for i, (w_start, w_end) in enumerate(offsets):
                # Tokens with an empty character range can never be labelled
                if w_start >= w_end:
                    continue
                # A token gets the row's class when it lies fully inside a span
                if any(w_start >= start and end >= w_end for start, end in spans):
                    label[i] = row.TARGET
                    label_empty = False
        # Notes in which no token received a label are left out of training
        if not label_empty:
            sequences.append(sequence)
            masks.append(attention_mask)
            labels.append(label)

    # Formats the data
    sequences = np.array(sequences).astype(np.int32)
    masks = np.array(masks).astype(np.uint8)
    labels = np.array(tf.keras.utils.to_categorical(labels,N_CLASSES)).astype(np.uint8)

    # Saves the data to disk; saving by path guarantees the files are closed
    np.save(f'{output_path}/sequences.npy', sequences)
    np.save(f'{output_path}/masks.npy', masks)
    np.save(f'{output_path}/labels.npy', labels)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [67]:
pn_history

'Stephanie madden is a 20 year old woman complaining of headache. Pain began yesterday morning and has been getting worse. Dull constant ache bilaterally, worse with walking, bending over, no alleviating factors. Did not respond to ibuprofen, Tylenol, or sleep. Complains of photobia, but not phonophobia, no aura. Subjective fever today. Neck stiffness. Nausea and vomiting. No dizzyness or lightheadedness, no weakness, no sick contacts.\r\nROS: otherwise negative\r\nPMH: none\r\nPSH: none\r\nHosp: none\r\nMed: Birth control pill\r\nAllergies: none\r\nSoc: works at sporting good store, lives with roomate, no tobacoo, etoh 2-3 drinks on weekends, marijuna 3-4 joints a week, sexually active and uses condoms'

In [68]:
np.array(sequence)

array([    0, 25093,  4134,   324,   475, 23004,    16,    10,   291,
          76,   793,   693, 13689,     9, 19344,     4, 23689,   880,
        2350,   662,     8,    34,    57,   562,  3007,     4, 38776,
        5891,  4285,   700,   741, 45931,     6,  3007,    19,  3051,
           6, 33842,    81,     6,   117, 32216, 16158,  2433,     4,
        6553,    45,  2519,     7, 34154,   658,  1001, 22132,     6,
         255,  4360,   225,  1168,     6,    50,  3581,     4, 20722,
        5069,     9, 17190, 33693,     6,    53,    45, 43676, 24938,
           6,   117, 34705,     4, 36994,  2088, 11696,   452,     4,
       33224, 37760,     4,   234, 17498,   102,     8, 23600,     4,
         440, 24719,   219,  1825,    50,  1109, 19279,  1825,     6,
         117,  8269,     6,   117,  4736,  9872,     4, 50121, 50118,
         500,  3196,    35,  3680,  2430, 50121, 50118,  5683,   725,
          35,  4146, 50121, 50118,  3888,   725,    35,  4146, 50121,
       50118,   725,

In [69]:
# Builds the TensorFlow dataset with the embeddings arrays
train_dataset = tf.data.Dataset.from_tensor_slices(
        ((sequences, masks), labels))

In [70]:
def get_dataset_partitions_tf(ds, ds_size, train_split=0.8, val_split=0.1, test_split=0.1, shuffle=True, shuffle_size=10000):
    """Split a dataset into consecutive train / validation / test partitions.

    Args:
        ds: Dataset-like object exposing ``shuffle``, ``take`` and ``skip``
            (the notebook passes a ``tf.data.Dataset``).
        ds_size: Number of elements in ``ds``; used to size the partitions.
        train_split: Fraction of ``ds_size`` assigned to the training partition.
        val_split: Fraction of ``ds_size`` assigned to the validation partition.
        test_split: Fraction reserved for testing. It is only used to validate
            that the three fractions sum to 1; the test partition itself is
            whatever remains after the train and validation elements.
        shuffle: Whether to shuffle ``ds`` (fixed seed) before splitting.
        shuffle_size: Buffer size passed to ``ds.shuffle``.

    Returns:
        Tuple ``(train_ds, val_ds, test_ds)``.

    Raises:
        AssertionError: If the three fractions do not sum to 1.
    """
    import math

    # Compare with a tolerance: sums such as 0.7 + 0.2 + 0.1 can miss 1.0 by a
    # floating-point rounding error and would fail an exact equality check.
    assert math.isclose(train_split + val_split + test_split, 1.0), (
        'train_split, val_split and test_split must sum to 1')

    if shuffle:
        # Fixed seed gives the same split between runs. Reshuffling on every
        # iteration must be disabled: otherwise each pass over the data draws a
        # new order and the take/skip partitions below stop being disjoint
        # across epochs (training elements leak into validation).
        ds = ds.shuffle(shuffle_size, seed=12, reshuffle_each_iteration=False)

    train_size = int(train_split * ds_size)
    val_size = int(val_split * ds_size)

    train_ds = ds.take(train_size)
    val_ds = ds.skip(train_size).take(val_size)
    # Everything after the train and validation elements.
    test_ds = ds.skip(train_size).skip(val_size)

    return train_ds, val_ds, test_ds

In [71]:
# Number of encoded notes = length of the first axis of `sequences`.
samples = sequences.shape[0]

# Split into train / validation. test_split is 0, so the fractions passed here
# must satisfy the helper's "sum to 1" check (val_split is hard-coded to 0.2).
# The third return value is the leftover dataset after the train and validation
# elements; it is bound to `test_df` but not used in this cell.
train_dataset, val_dataset, test_df = get_dataset_partitions_tf(
    train_dataset, samples, train_split=TRAIN_SPLIT, val_split=0.2, test_split=0)

train_dataset, val_dataset

(<_TakeDataset element_spec=((TensorSpec(shape=(512,), dtype=tf.int32, name=None), TensorSpec(shape=(512,), dtype=tf.uint8, name=None)), TensorSpec(shape=(512, 144), dtype=tf.uint8, name=None))>,
 <_TakeDataset element_spec=((TensorSpec(shape=(512,), dtype=tf.int32, name=None), TensorSpec(shape=(512,), dtype=tf.uint8, name=None)), TensorSpec(shape=(512, 144), dtype=tf.uint8, name=None))>)

In [74]:
# Turn both splits into repeating streams, batched by BATCH_SIZE, and attach
# the dataset `options` object to each of them.
train_dataset, val_dataset = (
    split.repeat().batch(BATCH_SIZE).with_options(options)
    for split in (train_dataset, val_dataset)
)

train_dataset, val_dataset

(<_OptionsDataset element_spec=((TensorSpec(shape=(None, 512), dtype=tf.int32, name=None), TensorSpec(shape=(None, 512), dtype=tf.uint8, name=None)), TensorSpec(shape=(None, 512, 144), dtype=tf.uint8, name=None))>,
 <_OptionsDataset element_spec=((TensorSpec(shape=(None, 512), dtype=tf.int32, name=None), TensorSpec(shape=(None, 512), dtype=tf.uint8, name=None)), TensorSpec(shape=(None, 512, 144), dtype=tf.uint8, name=None))>)

In [75]:
# Number of full batches per epoch for training and validation.
# Keras documents `steps_per_epoch` / `validation_steps` as integers, so the
# counts are computed with integer arithmetic rather than float floor-division.
# The validation count uses `samples - train size` instead of
# `samples * (1 - TRAIN_SPLIT)` to avoid floating-point error in the subtraction.
steps_per_epoch = int(samples * TRAIN_SPLIT) // BATCH_SIZE
validation_steps = (samples - int(samples * TRAIN_SPLIT)) // BATCH_SIZE

steps_per_epoch, validation_steps

(66.0, 16.0)

In [76]:
# Stop training once the validation loss has failed to improve for 3 epochs.
# Validation data is supplied to model.fit, so early stopping tracks `val_loss`
# (generalization) rather than the training loss. `restore_best_weights` rolls
# the model back to the best epoch instead of keeping the weights from the
# last (worse) epochs run during the patience window.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=3,
    restore_best_weights=True)

In [81]:
# Callback list handed to fit_model (and from there to model.fit);
# early stopping is the only callback in use.
callbacks = [early_stopping]

In [82]:
# Resolve the transformer configuration.
if not data_exists:
    # Nothing saved yet: fetch the config by model name and store a copy
    # under the output directory.
    config = AutoConfig.from_pretrained(MODEL_NAME)
    config.save_pretrained(f'{output_path}/my_tokenizer')
else:
    # Saved data is available: read the config file from the input directory.
    config = AutoConfig.from_pretrained(f'{input_path}/my_tokenizer/config.json')


In [90]:
def build_model(data_exists, config, dropout_rate=0.2, learning_rate=5e-5):
    """Assemble and compile the per-token classification model.

    The model takes two integer inputs of length ``SEQUENCE_LENGTH`` (token ids
    and attention mask), runs them through a transformer backbone, applies
    dropout and a softmax ``Dense`` layer with ``N_CLASSES`` units, and is
    compiled with Adam, categorical cross-entropy and a micro-averaged F1 metric.

    Args:
        data_exists: When truthy, the backbone is built from ``config`` only;
            otherwise it is loaded by name (``MODEL_NAME``) with ``config``.
        config: Transformer configuration object for the backbone.
        dropout_rate: Dropout rate applied to the backbone output.
        learning_rate: Learning rate for the Adam optimizer.

    Returns:
        The compiled ``tf.keras.Model``.
    """
    # Input layers: token ids and attention mask, both of fixed length
    tokens_input = tf.keras.layers.Input(shape=(SEQUENCE_LENGTH,), name='tokens', dtype=tf.int32)
    attention_input = tf.keras.layers.Input(shape=(SEQUENCE_LENGTH,), name='attention', dtype=tf.int32)

    if data_exists:
        # Builds the backbone architecture from the config alone.
        # NOTE(review): per the Hugging Face docs, from_config does not load
        # pretrained weights — confirm that weights are loaded elsewhere
        # before this model is trained or used for prediction.
        backbone = TFAutoModel.from_config(config)
    else:
        # Loads the backbone by model name, using the supplied config
        backbone = TFAutoModel.from_pretrained(MODEL_NAME, config=config)

    # First element of the backbone output
    # (presumably the per-token hidden states — confirm)
    out = backbone(input_ids=tokens_input, attention_mask=attention_input)[0]
    out = tf.keras.layers.Dropout(dropout_rate)(out)
    # Softmax over N_CLASSES along the last axis
    out = tf.keras.layers.Dense(N_CLASSES, activation='softmax')(out)

    # Generates the model structure
    model = tf.keras.Model(inputs=[tokens_input, attention_input], outputs=out)

    # Compiles the model
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate),
        loss=tf.keras.losses.categorical_crossentropy,
        metrics=[tfa.metrics.F1Score(num_classes=N_CLASSES, average='micro')])

    return model


In [91]:
def fit_model(model, train_ds, val_ds, steps_per_epoch, validation_steps, callbacks, epochs=10):
    """Train ``model`` and return it together with the result of ``model.fit``.

    Args:
        model: Object exposing a Keras-style ``fit`` method.
        train_ds: Training data, passed to ``fit`` as the first positional argument.
        val_ds: Validation data (``validation_data``).
        steps_per_epoch: Number of training steps per epoch.
        validation_steps: Number of validation steps per epoch.
        callbacks: Callbacks forwarded to ``fit``.
        epochs: Number of epochs to train for.

    Returns:
        Tuple ``(model, history)`` where ``history`` is what ``model.fit`` returned.
    """
    fit_options = {
        'validation_data': val_ds,
        'callbacks': callbacks,
        'epochs': epochs,
        'steps_per_epoch': steps_per_epoch,
        'validation_steps': validation_steps,
    }
    history = model.fit(train_ds, **fit_options)
    return model, history


In [None]:
if TRAIN:
    # Model construction and training both run inside the strategy scope
    with strategy.scope():
        model = build_model(data_exists, config)
        model, history = fit_model(
            model, train_dataset, val_dataset,
            steps_per_epoch, validation_steps, callbacks,
            epochs=EPOCHS)


In [None]:
if TRAIN:
    # Plot the per-epoch F1 score recorded for the training and validation data
    for metric_key in ('f1_score', 'val_f1_score'):
        plt.plot(history.history[metric_key])
    plt.title('model f1_score')
    plt.xlabel('epoch')
    plt.ylabel('f1_score')
    plt.legend(['train', 'val'], loc='upper left')
    plt.show()

In [None]:
# Predict on the encoded test notes and keep the index of the highest-scoring
# class along the last axis.
# NOTE(review): `model` is only assigned by the training cell when TRAIN is
# truthy — confirm this cell is not expected to run with TRAIN disabled.
preds = np.argmax(
    model.predict((test_sequences, test_masks), batch_size=16),
    axis=-1)

### **Results and Analysis**

In summary, I attempted to solve this NLP problem with two different approaches: BERT and RoBERTa. RoBERTa performed better, which was expected since it builds on BERT. I iterated on both models multiple times; performance improved slightly, but the gains were not significant.


### **Discussion and Conclusion**

A key lesson for me from this project is how much time is needed to develop models that perform well. Another takeaway is that it is important to be very familiar with the framework you decide to use (TensorFlow vs. PyTorch, etc.), and gaining that familiarity can be very time-consuming as well. Ultimately, I think the reason these models did not work as well as I would have liked is that I did not have enough time to refine them. The main way to improve in the future is to spend more time on the problem, or perhaps to try a different approach, since NLP solutions are currently being developed at a rapid pace.