In [None]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.3-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m39.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m56.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.3


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import f1_score, accuracy_score

In [None]:
# Mount Google Drive at /content/drive; every dataset and checkpoint path used in
# this notebook lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
### Key hyperparameters
max_length_tokenization = 3072 ## max number of token IDs kept per note (passed as the tokenizer's max_length; longer notes are truncated)
padding_tokentization = True ## forwarded unchanged as the tokenizer's `padding` argument

**0. Load input data, split the text into sentences for BERT, tokenize to IDs, and one-hot encode the labels**

In [None]:
## Load the label vocabulary (top 50 codes as of now): a headerless one-column file
## with one ICD9 code per row.
all_labels = pd.read_csv('/content/drive/MyDrive/CS769_SH/MI_DATA/top_x_codes.txt', header=None)
all_labels.columns = ["ICD9_CODE"]
# From here on `all_labels` is a plain Python list; a code's position in this list
# is its column index in the one-hot label vectors.
all_labels = all_labels['ICD9_CODE'].tolist()

In [None]:
def split_text_to_sentences(text, tokenizer,max_length = 128, padding = False, token_length = 3072 ):
    """Tokenize one note and cut the token stream into fixed-width chunks.

    Args:
        text: Raw note text. Every double-quote character is stripped first.
        tokenizer: Callable returning a mapping with 'input_ids' and
            'attention_mask' (and optionally 'token_type_ids') for a single text.
        max_length: Width of each chunk ("sentence") in tokens.
        padding: Forwarded to the tokenizer's `padding` argument.
        token_length: Forwarded to the tokenizer's `max_length`; the note is
            truncated to this many tokens before chunking.

    Returns:
        [sentences, attention_masks, token_type_ids]; each is a list of chunks and
        every chunk has exactly `max_length` entries. The final chunk is padded
        with the tokenizer's pad id (0 if the tokenizer does not expose one) for
        the token IDs, and with 0 for the attention mask and token types.
    """
    text = text.replace('"', '')  ## drops every double quote, not only leading/trailing ones
    encoded_dict = tokenizer(text, padding=padding, truncation=True, max_length=token_length, add_special_tokens=True)
    input_ids = encoded_dict['input_ids']
    attention_mask = encoded_dict['attention_mask']
    # Some tokenizers do not emit token_type_ids; fall back to an all-zero segment.
    if 'token_type_ids' in encoded_dict:
        segment_ids = encoded_dict['token_type_ids']
    else:
        segment_ids = [0] * len(input_ids)

    # Use the tokenizer's own pad id instead of assuming it is 0.
    pad_id = getattr(tokenizer, 'pad_token_id', None)
    if pad_id is None:
        pad_id = 0

    sentences = []
    attention_masks = []
    token_type_ids = []
    for start in range(0, len(input_ids), max_length):
        stop = start + max_length
        for values, fill, bucket in (
            (input_ids, pad_id, sentences),
            (attention_mask, 0, attention_masks),
            (segment_ids, 0, token_type_ids),
        ):
            chunk = list(values[start:stop])
            # Only the last chunk can be short; right-pad it to the full width.
            chunk.extend([fill] * (max_length - len(chunk)))
            bucket.append(chunk)

    return [sentences, attention_masks, token_type_ids]

def labels_to_one_hot(labels, all_labels):
    """Convert per-example code lists into multi-hot vectors.

    Args:
        labels: Iterable of code lists, one list per example.
        all_labels: Ordered label vocabulary; a code's position here is its
            column in the output vectors.

    Returns:
        A list with one `len(all_labels)`-long list of 0/1 ints per example.

    Raises:
        ValueError: If an example contains a code that is not in `all_labels`.
    """
    # Build the code -> column lookup once instead of scanning the list per code.
    # setdefault keeps the FIRST position of a duplicated code, like list.index.
    column_of = {}
    for position, code in enumerate(all_labels):
        column_of.setdefault(code, position)

    width = len(all_labels)
    one_hot_labels = []
    for label in labels:
        one_hot = [0]*width
        for code in label:
            if code not in column_of:
                raise ValueError('label code {!r} is not in all_labels'.format(code))
            one_hot[column_of[code]] = 1
        one_hot_labels.append(one_hot)
    return one_hot_labels
    

In [None]:
def data_pull(df, all_labels, padding_tokentization, max_length_tokenization ):
    """Turn a notes DataFrame into chunked token inputs and multi-hot labels.

    Reads the 'TEXT' column (note text) and the 'ICD9_CODE' column
    (';'-separated codes) of `df`.

    Returns:
        (notes, one_hot_labels): `notes` holds one
        [sentences, attention_masks, token_type_ids] triple per row, produced by
        split_text_to_sentences with 128-token chunks; `one_hot_labels` holds one
        multi-hot list per row, ordered like `all_labels`.
    """
    bert_tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_Discharge_Summary_BERT", use_fast=True)

    # Tokenize and chunk every note, in row order.
    notes = [
        split_text_to_sentences(note_text, bert_tokenizer, 128, padding_tokentization, max_length_tokenization)
        for note_text in df['TEXT']
    ]

    # Each row stores its codes as a single ';'-joined string.
    code_lists = [joined_codes.split(';') for joined_codes in df['ICD9_CODE']]

    return notes, labels_to_one_hot(code_lists, all_labels)

In [None]:
# Training split. Later cells read its 'TEXT', 'ICD9_CODE' and 'length' columns.
train_df = pd.read_csv('/content/drive/MyDrive/CS769_SH/MI_DATA/train_ds_notes.csv')




In [None]:
# Largest value of the 'length' column within the first 40,000 training rows.
train_df[:40000]['length'].max()

2291

In [None]:
# Tokenize/chunk the first 40,000 training notes and build their multi-hot labels.
train_notes, train_labels = data_pull(train_df[:40000], all_labels,padding_tokentization, max_length_tokenization)


Downloading (…)lve/main/config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

In [None]:
dev_df = pd.read_csv('/content/drive/MyDrive/CS769_SH/MI_DATA/dev_ds_notes.csv')
# Keep only the first 5000 dev rows.
dev_df = dev_df[:5000]
# Disabled alternative (every 6th row): dev_df = dev_df.iloc[::6, :]
dev_notes, dev_labels = data_pull(dev_df, all_labels,padding_tokentization, max_length_tokenization)


In [None]:
## Clearing memory: drop the raw DataFrames first, then run the collector, so the
## frames are already unreferenced when collection happens.
import gc
del train_df
del dev_df
gc.collect();

**1. Main Model Definition (BERT + LabelAttention + Loss)**

In [None]:
class ICD9_Detection(nn.Module):
    """BERT encoder followed by per-label attention and a per-label linear scorer.

    Each document arrives as chunks of token IDs with shape
    (batch, num_chunks, chunk_len). Every chunk is encoded independently by BERT,
    the token states of all chunks are concatenated per document, and each label
    attends over that sequence to produce one logit.

    Args:
        num_labels: Number of output labels.
        mask_padding: When True, positions whose attention_mask is 0 are excluded
            from the label attention. Defaults to False, which reproduces the
            original behaviour (padding positions receive attention weight).
    """

    def __init__(self, num_labels, mask_padding=False):
        super().__init__()
        self.bert = AutoModel.from_pretrained("emilyalsentzer/Bio_Discharge_Summary_BERT")
        # Take the width from the encoder config rather than hard-coding it.
        hidden_size = self.bert.config.hidden_size
        # Not applied in forward(); holds no parameters, kept so the module layout is unchanged.
        self.dropout = nn.Dropout(0.1)
        self.linear_z = nn.Linear(hidden_size, hidden_size)
        self.linear_a = nn.Linear(hidden_size, num_labels)
        self.linear_o = nn.Linear(hidden_size, num_labels)
        self.num_labels = num_labels
        self.mask_padding = mask_padding

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return logits of shape (batch, num_labels).

        All three inputs must share the shape (batch, num_chunks, chunk_len).
        """
        batch_size, num_chunks, chunk_len = input_ids.shape
        # Fold the chunk axis into the batch axis so BERT encodes each chunk on its own.
        outputs = self.bert(
            input_ids.reshape(-1, chunk_len),
            attention_mask=attention_mask.reshape(-1, chunk_len),
            token_type_ids=token_type_ids.reshape(-1, chunk_len),
        )
        last_hidden_state = outputs[0].reshape(batch_size, num_chunks * chunk_len, -1) ##shape: (b, s*c, hidden)
        z = torch.tanh(self.linear_z(last_hidden_state)) ##shape: (b, s*c, hidden)
        scores = self.linear_a(z) ##shape: (b, s*c, num_labels)
        if self.mask_padding:
            # The dtype's most negative finite value yields exactly zero weight after
            # softmax while avoiding NaN if every position of a document is padding.
            is_padding = attention_mask.reshape(batch_size, -1, 1) == 0
            scores = scores.masked_fill(is_padding, torch.finfo(scores.dtype).min)
        a = torch.softmax(scores, dim=1).transpose(1,2) ##shape: (b, num_labels, s*c) weights for each label
        d = torch.matmul(a, last_hidden_state) ##shape: (b, num_labels, hidden) weighted sum for each label
        # Per-label dot product with linear_o's weight rows; linear_o.bias is not used.
        logits = self.linear_o.weight.mul(d).sum(dim=2) ##shape: (b, num_labels)
        return logits

    def loss(self, logits, labels):
        """Mean binary cross-entropy between logits and float multi-hot labels."""
        loss = nn.BCEWithLogitsLoss()
        return loss(logits.view(-1, self.num_labels), labels.view(-1, self.num_labels))

    


**2. Train the model**

In [None]:
def batch_convertor(training_data, training_labels, batch_size = 32):
    """Group documents into tensor batches, padding to a common chunk count.

    Documents in one batch may have different numbers of chunks ("sentences");
    shorter documents are extended with all-zero chunks so each batch becomes a
    rectangular tensor.

    Args:
        training_data: List of [sentences, attention_masks, token_type_ids]
            triples, one per document; each element is a list of equal-width chunks.
        training_labels: List of multi-hot label lists aligned with `training_data`.
        batch_size: Number of documents per batch (the last batch may be smaller).

    Returns:
        ([sentence_batches, attention_batches, tokentype_batches], label_batches),
        where every entry is a list holding one tensor per batch.
    """
    all_batches_sentences, all_batches_attention, all_batches_tokentype = [], [], []
    all_batches_labels = []
    for start in range(0, len(training_data), batch_size):
        batch = training_data[start:start+batch_size]
        max_num_sentences = max(len(doc[0]) for doc in batch)
        # Take the chunk width from the data instead of assuming 128; the fallback
        # only applies when no document in the batch has any chunk.
        chunk_len = next((len(chunk) for doc in batch for chunk in doc[0]), 128)

        batch_sentences, batch_attention_masks, batch_token_type_ids = [], [], []
        for doc in batch:
            # The same number of zero chunks is appended to all three fields.
            filler = [[0]*chunk_len]*(max_num_sentences - len(doc[0]))
            batch_sentences.append(doc[0] + filler)
            batch_attention_masks.append(doc[1] + filler)
            batch_token_type_ids.append(doc[2] + filler)

        all_batches_sentences.append(torch.tensor(batch_sentences))
        all_batches_attention.append(torch.tensor(batch_attention_masks))
        all_batches_tokentype.append(torch.tensor(batch_token_type_ids))
        all_batches_labels.append(torch.tensor(training_labels[start:start+batch_size]))

    return [all_batches_sentences, all_batches_attention,all_batches_tokentype], all_batches_labels



In [None]:
## training loop: one optimizer step per pre-built batch
def train(model,training_notes, labels, num_epochs, optimizer, device, model_save_path = None, dev_notes = None, dev_labels = None, threshold=0.2):
    """Train `model` over pre-batched data, with periodic dev evaluation and per-epoch checkpoints.

    Args:
        model: Module callable as model(input_ids, attention_mask, token_type_ids)
            and exposing model.loss(logits, labels).
        training_notes: [sentence_batches, attention_batches, tokentype_batches],
            each a list with one tensor per batch.
        labels: List with one label tensor per batch, aligned with `training_notes`.
        num_epochs: Number of passes over the batches.
        optimizer: Optimizer already bound to the model's parameters.
        device: Device every batch is moved to.
        model_save_path: Path prefix for checkpoints; '<prefix><epoch>.pt' is
            written after each epoch. When None, nothing is saved.
        dev_notes, dev_labels: Dev data passed to evaluate_model. When either is
            None, the periodic dev evaluation is skipped.
        threshold: Probability cut-off forwarded to evaluate_model.

    Returns:
        The same `model` object, trained in place.

    Note:
        The train/eval mode of `model` is not changed here; it runs in whatever
        mode the caller left it in.
    """
    print('Training started')
    can_evaluate = dev_notes is not None and dev_labels is not None
    for epoch in range(num_epochs):
        epoch_loss = []
        for i in range(len(training_notes[0])):
            note = training_notes[0][i].to(device) ##batched sentences of shape(batch_size, max_num_sentences, max_length)
            attention_mask = training_notes[1][i].to(device)
            token_type_ids = training_notes[2][i].to(device)
            label = labels[i].to(device)
            logits = model(note, attention_mask, token_type_ids)
            loss = model.loss(logits, label.float())
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            batch_loss = loss.item()
            # Drop the per-batch tensors first so empty_cache can actually release them.
            del note, attention_mask, token_type_ids, label, logits, loss
            torch.cuda.empty_cache()
            if can_evaluate and i%2500 == 0:
                # Forward the caller's threshold rather than evaluate_model's default.
                dev_micro_f1, dev_macro_f1 = evaluate_model(model, dev_notes, dev_labels, device, threshold=threshold)
                print('Epoch: {}, #Batches: {}, Loss: {}, Dev Micro F1: {}, Dev Macro F1: {}'.format(epoch, i, batch_loss, dev_micro_f1, dev_macro_f1))

            epoch_loss.append(batch_loss)
        print('Epoch: {}, Loss: {}'.format(epoch, np.mean(epoch_loss)))
        ###save model
        if model_save_path is not None:
            model_save_path_curr = model_save_path + str(epoch) + '.pt'
            torch.save(model.state_dict(), model_save_path_curr)
            print('Model saved to {}'.format(model_save_path_curr))

    return model

def evaluate_model (model, dev_notes, dev_labels, device=torch.device("cpu"), threshold=0.2):
    """Score `model` on pre-batched notes and return (micro_f1, macro_f1).

    Args:
        model: Module callable as model(input_ids, attention_mask, token_type_ids)
            returning logits of shape (batch, num_labels).
        dev_notes: [sentence_batches, attention_batches, tokentype_batches], each a
            list with one tensor per batch.
        dev_labels: Flat (un-batched) list of multi-hot label lists, in the same
            example order as the batches.
        device: Device every batch is moved to before the forward pass.
        threshold: A label is predicted when sigmoid(logit) > threshold.

    Returns:
        Tuple (micro_f1, macro_f1) computed with sklearn's f1_score.

    Raises:
        ValueError: If `dev_notes` contains no batches.

    Note:
        The model's train/eval mode is left untouched.
        NOTE(review): calling model.eval() here and restoring with model.train()
        afterwards would also flip sub-modules that are currently in eval mode -
        confirm the intended mode handling before adding it.
    """
    batch_predictions = []
    with torch.no_grad():
        for i in range(len(dev_notes[0])):
            note = dev_notes[0][i].to(device)
            attention_mask = dev_notes[1][i].to(device)
            token_type_ids = dev_notes[2][i].to(device)
            logits = model(note, attention_mask, token_type_ids)
            # Threshold the whole batch in one tensor op and keep the result on CPU.
            batch_predictions.append((torch.sigmoid(logits) > threshold).long().cpu())

    if not batch_predictions:
        raise ValueError('evaluate_model received no batches to score')

    ##f1_score calc
    y_pred = torch.cat(batch_predictions, dim=0).numpy()
    y_true = torch.tensor(dev_labels).numpy()
    micro_f1 = f1_score(y_true, y_pred, average='micro')
    macro_f1 = f1_score(y_true, y_pred, average='macro')

    return micro_f1, macro_f1


In [None]:
## Set up the device, model and optimizer, then warm-start from a saved checkpoint
if torch.cuda.is_available():
    device = torch.device("cuda")
    print('GPU device is available')
else:
    device = torch.device("cpu")
    print('GPU device is not available, using CPU instead')

model = ICD9_Detection(len(all_labels)).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=5e-6)
## checkpoints are written as '<model_save_path><epoch>.pt'
model_save_path = '/content/drive/MyDrive/CS769_SH/saved_models/run_4_sh_icd_pl_weights_'

## Load the weights saved by the previous run. map_location remaps the stored
## tensors onto the active device, so a checkpoint saved on GPU also loads on CPU.
model.load_state_dict(torch.load('/content/drive/MyDrive/CS769_SH/saved_models/run_3_sh_icd_pl_weights_0.pt', map_location=device))


GPU device is available


Some weights of the model checkpoint at emilyalsentzer/Bio_Discharge_Summary_BERT were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

In [None]:
## Batch the training data: 4 documents per batch, each batch padded to its own
## maximum number of chunks.
training_notes_batches, training_labels_batches = batch_convertor(train_notes, train_labels, batch_size = 4)


In [None]:
## Batch the dev data (8 documents per batch). Only dev_notes_batches is used for
## evaluation; labels are passed to evaluate_model as the flat dev_labels list.
dev_notes_batches, dev_labels_batches = batch_convertor(dev_notes, dev_labels, batch_size = 8)

In [None]:
## Sanity check: number of batches for (train notes, train labels, dev notes, dev labels).
len(training_notes_batches[0]), len(training_labels_batches), len(dev_notes_batches[0]), len(dev_labels_batches)

(10000, 10000, 105, 105)

In [None]:
## Fine-tune for 10 epochs. dev_labels is the flat (un-batched) label list, which is
## the form evaluate_model converts with torch.tensor.
model = train(model, training_notes_batches, training_labels_batches, num_epochs =10, optimizer= optimizer, device = device , model_save_path=model_save_path, dev_notes = dev_notes_batches, dev_labels = dev_labels, threshold=0.25)

In [None]:
## Number of dev batches (notes, labels).
## NOTE(review): the recorded count here differs from the one recorded a few cells
## earlier, so the saved outputs likely come from different runs - re-run top to
## bottom to confirm.
len(dev_notes_batches[0]), len(dev_labels_batches)

(625, 625)

In [None]:
## Sweep the decision threshold on the dev set to choose the cut-off used for testing.
for i in [0.25, 0.3, 0.35, 0.4]:
  micro_f1, macro_f1 = evaluate_model(model, dev_notes_batches, dev_labels, device, threshold=i)
  print("for threshold ", i, "micro_f1: ", micro_f1, "macro_f1: ", macro_f1)

for threshold  0.25 micro_f1:  0.6337883122190437 macro_f1:  0.591586620381232
for threshold  0.3 micro_f1:  0.638507209499576 macro_f1:  0.5930755076319021
for threshold  0.35 micro_f1:  0.6422230393444346 macro_f1:  0.5939907651118143
for threshold  0.4 micro_f1:  0.6428069416662161 macro_f1:  0.5916769231424361


**3. Testing**

In [None]:
test_df = pd.read_csv('/content/drive/MyDrive/CS769_SH/MI_DATA/test_ds_notes.csv')
# Keep only the first 5013 test rows.
test_df = test_df[:5013]
# Disabled alternative (every 6th row): test_df = test_df.iloc[::6, :]
test_notes, test_labels = data_pull(test_df, all_labels,padding_tokentization, max_length_tokenization)

In [None]:
## Batch the test data (8 documents per batch); labels are evaluated from the flat
## test_labels list, not from test_labels_batches.
test_notes_batches, test_labels_batches = batch_convertor(test_notes, test_labels, batch_size = 8)

In [None]:
## Number of test batches.
len(test_notes_batches[0])

627

In [None]:
## Final test-set scores at threshold 0.35. Note this print lists macro before
## micro, the reverse of the dev sweep.
micro_f1, macro_f1 = evaluate_model(model, test_notes_batches, test_labels, device, threshold=0.35)
print("for threshold ", 0.35, "macro_f1: ", macro_f1, "micro_f1: ", micro_f1)

for threshold  0.35 macro_f1:  0.5952312652015899 micro_f1:  0.645972658896281
