In [1]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m34.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m50.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import f1_score, accuracy_score

In [3]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [4]:
###key hyperparameters
max_length_tokenization = 3072 ## max length of token IDs
padding_tokentization = True ## whether to pad the token IDs

**0. Load Input data, split the text to sentences for BERT & Tokenize to IDs, onehot encoding of lables**

In [5]:
##get all labels (top 50 as of now)
all_labels = pd.read_csv('/content/drive/MyDrive/CS769_SH/MI_DATA/top_x_codes.txt', header=None)
#all_labels = pd.read_csv('MIMIC_DATA/top_x_codes.txt', header=None)
all_labels.columns = ["ICD9_CODE"]
all_labels = all_labels['ICD9_CODE'].tolist()

In [6]:
##create hierarchy of labels from all labels (when we use ICD Taxonomy)
hier_mapping = {} ##dictionary to store hierarchy mapping between ICD codes and their parents
for i in range(len(all_labels)):
    split_data = all_labels[i].split('.') ##split the ICD code by '.' to get the hierarchy
    if len(split_data) == 1: ##if the ICD code doesn't have a child
        hier_mapping[all_labels[i]] = all_labels[i] ##the parent is the ICD code itself
    else:
        hier_mapping[all_labels[i]] = split_data[0]


In [7]:
## create updated all labels list using mapping (distinct keys and values of the mapping)
all_labels_updated = list (set(hier_mapping.keys()) | set(hier_mapping.values()))
all_labels_updated.sort()
#all_labels_updated

In [8]:
len(all_labels_updated)

85

In [9]:
##mapping of all labels to their parents

indices_mapping_to_parents = [0]*len(all_labels_updated) ##list to store the indices of the parents of all labels
for i in range(len(all_labels_updated)):
    if all_labels_updated[i] in hier_mapping.keys():
        indices_mapping_to_parents[i] = all_labels_updated.index(hier_mapping[all_labels_updated[i]])
    else:
        indices_mapping_to_parents[i] = i ##for parents, the index is the same as the index of the label

#indices_mapping_to_parents


In [10]:
## only child indices (Original ICD Codes that are not parents)
only_child_indices = []

for i in range(len(all_labels_updated)):
    if all_labels_updated[i] in hier_mapping.keys():
        only_child_indices.append(i)

len(only_child_indices)

50

In [11]:
def split_text_to_sentences(text, tokenizer,max_length = 128, padding = False, token_length = 3072 ):
    text = text.replace('"', '') ## to remove quotes at the beginning and end of the text
    #text = text.split(' ') ## to split the text into words
    encoded_dict = tokenizer(text, padding=padding, truncation=True, max_length=token_length, add_special_tokens=True)
    input_ids = encoded_dict['input_ids']
    sentences = []
    attention_masks = []
    token_type_ids = []
    for i in range(0, len(input_ids), max_length):
        sentences.append(input_ids[i:i+max_length]) ## last sentence may be less than max_length
        attention_masks.append(encoded_dict['attention_mask'][i:i+max_length])
        token_type_ids.append(encoded_dict['token_type_ids'][i:i+max_length])
        if len(sentences[-1]) < max_length: ## if last sentence is less than max_length, pad it with [PAD]
            # tl =len(sentences[-1][0]) ## length of token IDs generated by tokenizer
            # sentences[-1] = sentences[-1] + [[0]*tl]*(max_length - len(sentences[-1])) ## pad with 0
            # attention_masks[-1] = attention_masks[-1] + [[0]*tl]*(max_length - len(attention_masks[-1])) ## pad with 0
            # token_type_ids[-1] = token_type_ids[-1] + [[0]*tl]*(max_length - len(token_type_ids[-1])) ## pad with 0

            sentences[-1] = sentences[-1] + [0]*(max_length - len(sentences[-1])) ## pad with 0
            attention_masks[-1] = attention_masks[-1] + [0]*(max_length - len(attention_masks[-1])) ## pad with 0
            token_type_ids[-1] = token_type_ids[-1] + [0]*(max_length - len(token_type_ids[-1])) ## pad with 0

    return [sentences, attention_masks, token_type_ids]

def labels_to_one_hot(labels, all_labels, hier_mapping):
    one_hot_labels = []
    for label in labels:
        one_hot = [0]*len(all_labels)
        for code in label:
            one_hot[all_labels.index(code)] = 1
            ##update parent code to 1
            parent_code = hier_mapping[code]
            one_hot[all_labels.index(parent_code)] = 1
        one_hot_labels.append(one_hot)
    return one_hot_labels
    

In [12]:
def data_pull(df, all_labels, padding_tokentization, max_length_tokenization, hier_mapping ):
    tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_Discharge_Summary_BERT", use_fast=True)
    notes = df['TEXT'].apply(lambda x: split_text_to_sentences(x, tokenizer, 128,padding_tokentization, max_length_tokenization)) #list of list of sentences
    labels = df['ICD9_CODE'].apply(lambda x: x.split(';')) ##list of ICD9 codes
    notes = notes.values.tolist() ## convert to list
    labels = labels.values.tolist() ## convert to list
    one_hot_labels = labels_to_one_hot(labels, all_labels, hier_mapping)

    return notes, one_hot_labels

In [13]:
train_df = pd.read_csv('/content/drive/MyDrive/CS769_SH/MI_DATA/train_ds_notes.csv')
#train_df = pd.read_csv('MIMIC_DATA/train_ds_notes.csv')



In [14]:
train_df[:40000]['length'].max()

2291

In [15]:
train_notes, train_labels = data_pull(train_df[:40000], all_labels_updated ,padding_tokentization, max_length_tokenization, hier_mapping)


Downloading (…)lve/main/config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

In [16]:
dev_df = pd.read_csv('/content/drive/MyDrive/CS769_SH/MI_DATA/dev_ds_notes.csv')
#dev_df = pd.read_csv('MIMIC_DATA/dev_ds_notes.csv')
#TAKE one in 6 rows
dev_df = dev_df[:5000]
dev_df = dev_df.iloc[::6, :]
dev_notes, dev_labels = data_pull(dev_df, all_labels_updated,padding_tokentization, max_length_tokenization, hier_mapping)


In [17]:
##clearning memory
import gc
gc.collect()
del train_df
del dev_df

**1. Main Model Definition (BERT + LabelAttention + Loss)**

In [18]:
class ICD9_Detection(nn.Module):
    def __init__(self, num_labels,indices_mapping_to_parents):
        super(ICD9_Detection, self).__init__()
        self.bert = AutoModel.from_pretrained("emilyalsentzer/Bio_Discharge_Summary_BERT")
        self.dropout = nn.Dropout(0.1)
        self.linear_z = nn.Linear(768, 768)
        self.linear_a = nn.Linear(768, num_labels)
        self.linear_o = nn.Linear(768, num_labels)
        self.num_labels = num_labels
        self.indices_mapping_to_parents = indices_mapping_to_parents

    def forward(self, input_ids, attention_mask, token_type_ids):
        outputs = self.bert(input_ids.view(-1, 128), attention_mask=attention_mask.view(-1, 128), token_type_ids=token_type_ids.view(-1, 128))
        last_hidden_state = outputs[0].view(input_ids.shape[0],input_ids.shape[1]*input_ids.shape[2], 768) ##shape: (b, s*c, 768)
        z = torch.tanh(self.linear_z(last_hidden_state)) ##shape: (b, s*c, 768)
        a = torch.softmax(self.linear_a(z), dim=1).transpose(1,2) ##shape: (b, num_labels, s*c) weights for each label
        d = torch.matmul(a, last_hidden_state) ##shape: (b, num_labels, 768) weighted sum for each label (check matmul once again)
        logits = self.linear_o.weight.mul(d).sum(dim=2) ##shape: (b, num_labels) logits for each label

        ##c-hmcnn specific layers
        parent_logits = logits[:, self.indices_mapping_to_parents] ##shape: (b, num_labels) logits for each label's parent
        #print("parent_logits.shape", parent_logits.shape)
        h_logits = torch.cat((logits.unsqueeze(2), parent_logits.unsqueeze(2)), dim=2) ##shape: (b, num_labels, 2) logits for each label and its parent
        #print("h_logits.shape", h_logits.shape)

        ##MIN (not MAX) Constraint Module
        mcm = torch.min(h_logits, dim = 2)[0] ##shape: (b, num_labels) min of label and its parent logits
        #print("mcm.shape", mcm.shape)


        return logits, mcm
    
    def loss(self, logits, mcm, labels):

        ##loss
        b_logits = labels * logits ##shape: (b, num_labels) logits for each label multiplied by label's one hot
        b_parent_logits = b_logits[:, self.indices_mapping_to_parents] ##shape: (b, num_labels) logits for each label's parent multiplied by label's one hot
        b_h_logits = torch.cat((b_logits.unsqueeze(2), b_parent_logits.unsqueeze(2)), dim=2) ##shape: (b, num_labels, 2) logits for each label and its parent multiplied by label's one hot
        b_mcm = torch.min(b_h_logits, dim = 2)[0] ##shape: (b, num_labels) min logits for each label and its parent multiplied by label's one hot
        # print("b_mcm.shape", b_mcm.shape)
        # print("labels.shape", labels.shape)

        input_to_loss = ((1-labels)*mcm) + (labels*b_mcm) ##shape: (b, num_labels) input to loss function

        loss = nn.BCEWithLogitsLoss()
        return loss(input_to_loss.view(-1, self.num_labels), labels.view(-1, self.num_labels))

    


**2. Train the model**

In [19]:
def batch_convertor(training_data, training_labels, batch_size = 32):##add padding when # sentences differes across batches
    all_batches_sentences, all_batches_attention, all_batches_tokentype = [], [], []
    all_batches_labels = []
    for i in range(0, len(training_data), batch_size):
        batch = training_data[i:i+batch_size]
        max_num_sentences = max([len(x[0]) for x in batch])
        batch_sentences, batch_attention_masks, batch_token_type_ids = [], [], []
        for j in range(len(batch)):
            num_sentences = len(batch[j][0])
            batch_sentences.append(batch[j][0] + [[0]*128]*(max_num_sentences - num_sentences))
            batch_attention_masks.append(batch[j][1] + [[0]*128]*(max_num_sentences - num_sentences))
            batch_token_type_ids.append(batch[j][2] + [[0]*128]*(max_num_sentences - num_sentences))

        all_batches_sentences.append(torch.tensor(batch_sentences))
        all_batches_attention.append(torch.tensor(batch_attention_masks))
        all_batches_tokentype.append(torch.tensor(batch_token_type_ids))
        all_batches_labels.append(torch.tensor(training_labels[i:i+batch_size]))
    
    return [all_batches_sentences, all_batches_attention,all_batches_tokentype], all_batches_labels



In [20]:
## training by taking one example at a time
def train(model,training_notes, labels, num_epochs, optimizer, only_child_indices,device, model_save_path = None, dev_notes = None, dev_labels = None, threshold=0.2):
    print('Training started')
    for epoch in range(num_epochs):
        epoch_loss = []
        model_save_path_curr = model_save_path
        #model.train()
        for i in range(len(training_notes[0])):
            note = training_notes[0][i].to(device) ##batched sentences of shape(batch_size, max_num_sentences, max_length=128)
            attention_mask = training_notes[1][i].to(device)
            token_type_ids = training_notes[2][i].to(device)
            label = labels[i].to(device)
            logits, mcm = model(note, attention_mask, token_type_ids)
            #print(logits, label)
            loss = model.loss(logits, mcm, label.float())
            #accuracy = model.accuracy(logits, label)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            torch.cuda.empty_cache()
            del note, attention_mask, token_type_ids, logits, mcm
            if i%2000 == 0:
                dev_micro_f1, dev_macro_f1 = evaluate_model(model, dev_notes, dev_labels,only_child_indices, device)
                print('Epoch: {}, #Batches: {}, Loss: {}, Dev Micro F1: {}, Dev Macro F1: {}'.format(epoch, i, loss.item(), dev_micro_f1, dev_macro_f1))

            epoch_loss.append(loss.item())
            del loss
        print('Epoch: {}, Loss: {}'.format(epoch, np.mean(epoch_loss)))
        ###save model
        model_save_path_curr = model_save_path_curr + str(epoch) + '.pt'
        torch.save(model.state_dict(), model_save_path_curr)
        print('Model saved to {}'.format(model_save_path_curr))

    return model

def evaluate_model (model, dev_notes, dev_labels,only_child_indices, device=torch.device("cpu"), threshold=0.2):
    #model.eval()
    with torch.no_grad():
        predictions = []
        for i in range(len(dev_notes[0])):
            note = dev_notes[0][i].to(device)
            attention_mask = dev_notes[1][i].to(device)
            token_type_ids = dev_notes[2][i].to(device)
            #label = torch.tensor(dev_labels[i]).to(device)
            logits, mcm = model(note, attention_mask, token_type_ids)
            #loss = model.loss(logits, label.float())
            for ll in range(len(mcm)):#for each example in the batch ## for C-HMCNN, using mcm for predictions instead of logits
                logits_i = torch.sigmoid(mcm[ll])
                logits_i = [1 if x>threshold else 0 for x in logits_i]
                #logits_i = logits_i.cpu().detach()
                predictions.append(logits_i) ## appending for each example in the batch/dataset
    
    ##f1_score calc
    del logits, note, attention_mask,token_type_ids
    #take only child indices
    predictions = torch.tensor(predictions)[:, only_child_indices]
    dev_labels = torch.tensor(dev_labels)[:, only_child_indices]


    micro_f1 = f1_score(dev_labels.numpy(), predictions.numpy(), average='micro')
    macro_f1 = f1_score(dev_labels.numpy(), predictions.numpy(), average='macro')

    #print('Micro F1: {}, Macro F1: {}'.format(micro_f1, macro_f1))

    return micro_f1, macro_f1


In [21]:
##train the model
if torch.cuda.is_available():
    device = torch.device("cuda")
    print('GPU device is available')
else:
    device = torch.device("cpu")
    print('GPU device is not available, using CPU instead')
    
model = ICD9_Detection(len(all_labels_updated), indices_mapping_to_parents ).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=5e-6)
model_save_path = '/content/drive/MyDrive/CS769_SH/coherent_saved_models/chmcnn_lr2_run_5_sh_icd_pl_weights_'
#model_save_path = 'test_models/test_'

##load pre-trained weights
model.load_state_dict(torch.load('/content/drive/MyDrive/CS769_SH/coherent_saved_models/chmcnn_lr2_run_3_sh_icd_pl_weights_1.pt'))


GPU device is available


Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at emilyalsentzer/Bio_Discharge_Summary_BERT were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

In [22]:
##batch data generation for training
training_notes_batches, training_labels_batches = batch_convertor(train_notes, train_labels, batch_size = 4)


In [23]:
dev_notes_batches, dev_labels_batches = batch_convertor(dev_notes, dev_labels, batch_size = 4)

In [24]:
len(training_notes_batches[0]), len(training_labels_batches), len(dev_notes_batches[0]), len(dev_labels_batches)

(10000, 10000, 209, 209)

In [25]:
model = train(model, training_notes_batches, training_labels_batches, num_epochs =1, optimizer= optimizer, only_child_indices=only_child_indices, device = device , model_save_path=model_save_path, dev_notes = dev_notes_batches, dev_labels = dev_labels, threshold=0.25)

Training started
Epoch: 0, #Batches: 0, Loss: 0.3036505877971649, Dev Micro F1: 0.6137907017238378, Dev Macro F1: 0.5868382930688713
Epoch: 0, #Batches: 2000, Loss: 0.05320281535387039, Dev Micro F1: 0.6432200078155529, Dev Macro F1: 0.5981086748004657
Epoch: 0, #Batches: 4000, Loss: 0.1370682418346405, Dev Micro F1: 0.6373736362726454, Dev Macro F1: 0.5944250418855465
Epoch: 0, #Batches: 6000, Loss: 0.08296418935060501, Dev Micro F1: 0.6468153830624811, Dev Macro F1: 0.6012169891948882
Epoch: 0, #Batches: 8000, Loss: 0.11709914356470108, Dev Micro F1: 0.6432845058799257, Dev Macro F1: 0.5976840818479106
Epoch: 0, Loss: 0.10569393743956461
Model saved to /content/drive/MyDrive/CS769_SH/coherent_saved_models/chmcnn_lr2_run_5_sh_icd_pl_weights_0.pt


In [None]:
len(dev_notes_batches[0]), len(dev_labels_batches)

(625, 625)

In [26]:
for i in [0.25, 0.275, 0.3, 0.325, 0.35, 0.375, 0.4, 0.425, 0.45]:
  micro_f1, macro_f1 = evaluate_model(model, dev_notes_batches, dev_labels, only_child_indices, device, threshold=i)
  print("for threshold ", i, "micro_f1: ", micro_f1, "macro_f1: ", macro_f1)

for threshold  0.25 micro_f1:  0.6283304581176923 macro_f1:  0.5913785972352414
for threshold  0.275 micro_f1:  0.6319084417489974 macro_f1:  0.5945449525612626
for threshold  0.3 micro_f1:  0.6339517255136645 macro_f1:  0.5937542916754909
for threshold  0.325 micro_f1:  0.6363913441024077 macro_f1:  0.593445182600125
for threshold  0.35 micro_f1:  0.636982065553494 macro_f1:  0.5921401963276568
for threshold  0.375 micro_f1:  0.637941853168793 macro_f1:  0.5924527919084775
for threshold  0.4 micro_f1:  0.640118618936666 macro_f1:  0.5921836408684367
for threshold  0.425 micro_f1:  0.6410228859997852 macro_f1:  0.5913872867883019
for threshold  0.45 micro_f1:  0.642055972993575 macro_f1:  0.590196885448609


Testing

In [27]:
test_df = pd.read_csv('/content/drive/MyDrive/CS769_SH/MI_DATA/test_ds_notes.csv')
#TAKE one in 6 rows
test_df = test_df[:5013]
#test_df = test_df.iloc[::6, :]
test_notes, test_labels = data_pull(test_df, all_labels_updated ,padding_tokentization, max_length_tokenization, hier_mapping)

In [28]:
test_notes_batches, test_labels_batches = batch_convertor(test_notes, test_labels, batch_size = 4)

In [29]:
len(test_notes_batches[0])

1254

In [46]:
##choosen 0.4 as this seems to have a good balance between micro and macro F1 scores
micro_f1, macro_f1 = evaluate_model(model, test_notes_batches, test_labels,only_child_indices, device, threshold=0.4)
print("for threshold ", 0.4, "macro_f1: ", macro_f1, "micro_f1: ", micro_f1)

for threshold  0.4 macro_f1:  0.5986642749476602 micro_f1:  0.6449552114436498


In [43]:
test_df[:5013]['length'].max()

2303