In [None]:
!pip install -q transformers

[K     |████████████████████████████████| 2.8 MB 4.3 MB/s 
[K     |████████████████████████████████| 895 kB 50.4 MB/s 
[K     |████████████████████████████████| 3.3 MB 49.3 MB/s 
[K     |████████████████████████████████| 50 kB 7.3 MB/s 
[K     |████████████████████████████████| 636 kB 66.0 MB/s 
[?25h

In [None]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.model_selection import train_test_split
import transformers
import torch
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig
from transformers import AutoTokenizer, AutoModel

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import classification_report


In [None]:
# Setting up the device for GPU usage

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
np.random.seed(123)
torch.manual_seed(123)
torch.cuda.manual_seed_all(123)

## Load data

In [None]:
#change to where you store mimic3 data
MIMIC_3_DIR = '/content/drive/MyDrive/Colab Notebooks/MSc-Individual-Project/datasets'

train_df = pd.read_csv('%s/train_10.csv' % MIMIC_3_DIR)
eval_df = pd.read_csv('%s/dev_10.csv' % MIMIC_3_DIR)
test_df = pd.read_csv('%s/test_10.csv' % MIMIC_3_DIR)

train_df.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,CATEGORY,TEXT,LABELS,length
0,17341,151110,Nursing/other,rsbi,584.9;427.31,1
1,61638,103816,Nursing,title,414.01,1
2,61638,103816,General,title,414.01,1
3,23706,186321,Nursing/other,npn,401.9;428.0;530.81,1
4,55265,191108,General,title,530.81;584.9;427.31,1


In [None]:
full_df = pd.concat([train_df, eval_df, test_df], ignore_index=True)


 ## Preprocess Data

In [None]:
# split labels by ";", then convert to list
def split_lab (x):
    #print(x)
    return x.split(";")

full_df['LABELS'] = full_df['LABELS'].apply(split_lab)
#full_df['TEXT'] = full_df['TEXT'].apply(split_lab)

full_df.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,CATEGORY,TEXT,LABELS,length
0,17341,151110,Nursing/other,rsbi,"[584.9, 427.31]",1
1,61638,103816,Nursing,title,[414.01],1
2,61638,103816,General,title,[414.01],1
3,23706,186321,Nursing/other,npn,"[401.9, 428.0, 530.81]",1
4,55265,191108,General,title,"[530.81, 584.9, 427.31]",1


In [None]:
#load multi label binarizer for one-hot encoding
mlb = MultiLabelBinarizer(sparse_output=True)



In [None]:
#change label to one-hot encoding per code
full_df = full_df.join(
            pd.DataFrame.sparse.from_spmatrix(
                mlb.fit_transform(full_df.pop('LABELS')),
                columns=mlb.classes_))

full_df.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,CATEGORY,TEXT,length,250.00,272.4,401.9,414.01,427.31,428.0,518.81,530.81,584.9,599.0
0,17341,151110,Nursing/other,rsbi,1,0,0,0,0,1,0,0,0,1,0
1,61638,103816,Nursing,title,1,0,0,0,1,0,0,0,0,0,0
2,61638,103816,General,title,1,0,0,0,1,0,0,0,0,0,0
3,23706,186321,Nursing/other,npn,1,0,0,1,0,0,1,0,1,0,0
4,55265,191108,General,title,1,0,0,0,0,1,0,0,1,1,0


In [None]:
# Convert columns to list of one hot encoding
icd_classes_50 = mlb.classes_

full_df['labels'] = full_df[icd_classes_50].values.tolist()
#train_df.sort_values(['length'], ascending=False, inplace=True)
full_df


Unnamed: 0,SUBJECT_ID,HADM_ID,CATEGORY,TEXT,length,250.00,272.4,401.9,414.01,427.31,428.0,518.81,530.81,584.9,599.0,labels
0,17341,151110,Nursing/other,rsbi,1,0,0,0,0,1,0,0,0,1,0,"[0, 0, 0, 0, 1, 0, 0, 0, 1, 0]"
1,61638,103816,Nursing,title,1,0,0,0,1,0,0,0,0,0,0,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0]"
2,61638,103816,General,title,1,0,0,0,1,0,0,0,0,0,0,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0]"
3,23706,186321,Nursing/other,npn,1,0,0,1,0,0,1,0,1,0,0,"[0, 0, 1, 0, 0, 1, 0, 1, 0, 0]"
4,55265,191108,General,title,1,0,0,0,0,1,0,0,1,1,0,"[0, 0, 0, 0, 1, 0, 0, 1, 1, 0]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294460,97158,152158,Discharge summary,admission date discharge date date of birth se...,4644,0,0,1,0,0,0,1,0,0,0,"[0, 0, 1, 0, 0, 0, 1, 0, 0, 0]"
294461,99650,199859,Discharge summary,admission date discharge date date of birth se...,5126,0,0,0,1,1,1,1,0,1,1,"[0, 0, 0, 1, 1, 1, 1, 0, 1, 1]"
294462,93623,187232,Discharge summary,admission date discharge date date of birth se...,5171,0,1,1,0,0,0,0,0,0,0,"[0, 1, 1, 0, 0, 0, 0, 0, 0, 0]"
294463,96260,110058,Discharge summary,admission date discharge date date of birth se...,5173,0,0,0,0,0,0,1,0,0,1,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 1]"


In [None]:
full_df.HADM_ID.unique().shape

(9446,)

In [None]:
full_df = full_df.drop(full_df[full_df['length']<300].index)

In [None]:
train_df, test_df = train_test_split(full_df, test_size=0.2)
train_df, eval_df = train_test_split(train_df, test_size=0.2)

In [None]:
train_df.sort_values(['length'], inplace=True)
eval_df.sort_values(['length'], inplace=True)
test_df.sort_values(['length'], inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
train_df = train_df.drop(train_df[train_df['CATEGORY']!='Nursing'].index)
eval_df = eval_df.drop(eval_df[eval_df['CATEGORY']!='Nursing'].index)
test_df = test_df.drop(test_df[test_df['CATEGORY']!='Nursing'].index)

In [None]:
train_df

Unnamed: 0,SUBJECT_ID,HADM_ID,CATEGORY,TEXT,length,250.00,272.4,401.9,414.01,427.31,428.0,518.81,530.81,584.9,599.0,labels
155219,67906,122154,Nursing,respiratory failure acute not ards doctor last...,300,0,0,0,0,1,0,1,0,0,0,"[0, 0, 0, 0, 1, 0, 1, 0, 0, 0]"
155141,73831,197368,Nursing,full code yo obese m with dm hospital transfer...,300,1,0,0,0,0,0,1,0,1,0,"[1, 0, 0, 0, 0, 0, 1, 0, 1, 0]"
155078,49930,144435,Nursing,abdomen a abdomen remains open with colosplast...,300,0,0,0,0,0,0,1,0,0,0,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0]"
155163,59156,163180,Nursing,chief complaint year old male with pmhx of mm ...,300,0,0,1,0,0,0,1,0,1,0,"[0, 0, 1, 0, 0, 0, 1, 0, 1, 0]"
155238,50819,182115,Nursing,pt initially presented to osh in with c o chro...,300,0,1,0,1,1,0,0,0,0,1,"[0, 1, 0, 1, 1, 0, 0, 0, 0, 1]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246899,84461,146684,Nursing,yo male with pmh including dm2 cri bl afib chf...,1287,0,0,0,0,1,1,0,0,1,0,"[0, 0, 0, 0, 1, 1, 0, 0, 1, 0]"
247100,84838,117609,Nursing,fluid collections thought related to his pseud...,1365,1,0,0,0,1,1,1,0,0,0,"[1, 0, 0, 0, 1, 1, 1, 0, 0, 0]"
247101,84838,117609,Nursing,fluid collections thought related to his pseud...,1365,1,0,0,0,1,1,1,0,0,0,"[1, 0, 0, 0, 1, 1, 1, 0, 0, 0]"
247207,84461,146684,Nursing,yo male with pmh including dm2 cri bl afib chf...,1424,0,0,0,0,1,1,0,0,1,0,"[0, 0, 0, 0, 1, 1, 0, 0, 1, 0]"


In [None]:
train_df.HADM_ID.unique().shape

(1818,)

In [None]:
#convert into 2 columns dataframe
train_df = pd.DataFrame(train_df, columns=['TEXT', 'labels'])
train_df.columns=['text', 'labels']
train_df.head()

eval_df = pd.DataFrame(eval_df, columns=['TEXT', 'labels'])
eval_df.columns=['text', 'labels']
eval_df.head()

test_df = pd.DataFrame(test_df, columns=['HADM_ID', 'TEXT', 'labels'])
test_df.columns=['id', 'text', 'labels']


In [None]:
train_df.reset_index(drop=True, inplace=True)
eval_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)
test_df.head()

Unnamed: 0,id,text,labels
0,122154,y o m with a history of severe mixed obstructi...,"[0, 0, 0, 0, 1, 0, 1, 0, 0, 0]"
1,170495,42m w pancreatitis s p drainage of pseudo cyst...,"[1, 1, 1, 1, 0, 0, 1, 0, 0, 0]"
2,115903,y o male s p fall off foot high scaffolding la...,"[1, 0, 0, 1, 0, 0, 1, 0, 0, 0]"
3,169694,y o male w hx of cholecystitis s p percutaneou...,"[1, 0, 1, 0, 1, 0, 1, 0, 0, 1]"
4,173216,year old woman with h o small cell lung cancer...,"[0, 1, 1, 0, 0, 0, 0, 0, 0, 1]"


### Set Model Parameters

In [None]:
# Defining some key variables to configure model training
MAX_LEN = 512
TRAIN_BATCH_SIZE = 12
VALID_BATCH_SIZE = 8
TEST_BATCH_SIZE = 8
EPOCHS = 10
LEARNING_RATE = 3e-05

#set tokenizer
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

Downloading:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

### Preparing Dataloader

In [None]:
#custom dataset for BERT class
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        
        '''
            set text as training data
            set labels as targets
        '''
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.labels
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [None]:
#load df to dataset

training_set = CustomDataset(train_df, tokenizer, MAX_LEN)
valid_set = CustomDataset(eval_df, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_df, tokenizer, MAX_LEN)

In [None]:
#data loader
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': False
                }

val_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False
                }

test_params = {'batch_size': TEST_BATCH_SIZE,
                'shuffle': False
                }

training_loader = DataLoader(training_set, **train_params)
valid_loader = DataLoader(valid_set, **val_params)
testing_loader = DataLoader(testing_set, **test_params)

### Create model class from pretrained model

In [None]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model. 

class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()

        self.l1 = transformers.AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT", return_dict=False)
        #self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased', return_dict=False)
        
        
        self.l2 = torch.nn.Dropout(0.3)
        

        self.l3 = torch.nn.Linear(768, 10)
    
    def forward(self, ids, mask, token_type_ids):
#        print("ids: ", ids.size(), "mask: ", mask.size(), "token type ids: ", token_type_ids.size())
        _, output_1= self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output

model = BERTClass()
model.to(device)

Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    

In [None]:
#loss function
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [None]:
#optimizer
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

### Train fine-tuning model

In [None]:
def train(epoch):
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()
        
    print(f'Epoch: {epoch}, Training Loss:  {loss.item()}')

In [None]:
# Evaluate the model

def validation(epoch):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    losses=[]
    with torch.no_grad():
        for _, data in enumerate(valid_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            loss = loss_fn(outputs, targets)
            losses.append(loss.item())

            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    print(f'Epoch: {epoch}, Validation Loss:  {np.mean(losses):.2f}')
    return fin_outputs, fin_targets, losses

In [None]:
start_epoch=0
DIR = '/content/drive/MyDrive/Colab Notebooks/MSc-Individual-Project/'
resume = True     
if resume:
    if os.path.isfile(f"%s/models/models_nurs_epoch{start_epoch}.pth" % DIR):
        print("Resume from checkpoint...")
        checkpoint = torch.load(f"%s/models/models_nurs_epoch{start_epoch}.pth" % DIR)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        initepoch = checkpoint['epoch']
        print("====>loaded checkpoint (epoch{})".format(checkpoint['epoch']))
    else:
        print("====>no checkpoint found.")
        initepoch = 0

for epoch in tqdm(range(EPOCHS)):
    train(epoch)
    validation(epoch)

    if (epoch+start_epoch+1)%5 == 0:
        checkpoint = {"model_state_dict": model.state_dict(),
                      "optimizer_state_dict": optimizer.state_dict(),
                      "epoch": epoch+start_epoch+1}
        path_checkpoint = f"%s/models/models_nurs_epoch{epoch+start_epoch+1}.pth" % DIR
        torch.save(checkpoint, path_checkpoint)



====>no checkpoint found.


  0%|          | 0/10 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch: 0, Training Loss:  0.10012388229370117


 10%|█         | 1/10 [12:06<1:49:02, 726.97s/it]

Epoch: 0, Validation Loss:  0.39
Epoch: 1, Training Loss:  0.02855929546058178


 20%|██        | 2/10 [24:14<1:36:56, 727.08s/it]

Epoch: 1, Validation Loss:  0.23
Epoch: 2, Training Loss:  0.016132690012454987


 30%|███       | 3/10 [36:21<1:24:50, 727.17s/it]

Epoch: 2, Validation Loss:  0.16
Epoch: 3, Training Loss:  0.007916566915810108


 40%|████      | 4/10 [48:28<1:12:42, 727.12s/it]

Epoch: 3, Validation Loss:  0.14
Epoch: 4, Training Loss:  0.00865910854190588
Epoch: 4, Validation Loss:  0.13


 50%|█████     | 5/10 [1:00:52<1:01:05, 733.07s/it]

Epoch: 5, Training Loss:  0.004419534932821989


 60%|██████    | 6/10 [1:12:59<48:44, 731.12s/it]  

Epoch: 5, Validation Loss:  0.13
Epoch: 6, Training Loss:  0.0023375898599624634


 70%|███████   | 7/10 [1:25:05<36:28, 729.60s/it]

Epoch: 6, Validation Loss:  0.14
Epoch: 7, Training Loss:  0.001670807832852006


 80%|████████  | 8/10 [1:37:12<24:17, 728.68s/it]

Epoch: 7, Validation Loss:  0.14
Epoch: 8, Training Loss:  0.0010582499671727419


 90%|█████████ | 9/10 [1:49:20<12:08, 728.31s/it]

Epoch: 8, Validation Loss:  0.14
Epoch: 9, Training Loss:  0.00038989351014606655
Epoch: 9, Validation Loss:  0.15


100%|██████████| 10/10 [2:01:44<00:00, 730.50s/it]


In [None]:

DIR = '/content/drive/MyDrive/Colab Notebooks/MSc-Individual-Project/'

checkpoint = torch.load(f"%s/models/models_nurs_epoch5.pth" % DIR)
model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>


### Model Evaluation

In [None]:
# Evaluate the model

def evaluation():
    model.eval()

    fin_targets=[]
    fin_outputs=[]
    losses=[]
    with torch.no_grad():
        for _, data in enumerate(valid_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            loss = loss_fn(outputs, targets)
            losses.append(loss.item())

            fin_targets.extend(targets.cpu().detach().numpy())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy())
    print(f'Loss:  {np.mean(losses):.2f}')
    return fin_outputs, fin_targets, losses

In [None]:
dev_out, dev_tar, losses = evaluation()



Loss:  0.06


In [None]:
# Evaluate the model
def testing():
    model.eval()

    fin_targets=[]
    fin_outputs=[]
    losses=[]
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            loss = loss_fn(outputs, targets)
            losses.append(loss.item())

            fin_targets.extend(targets.cpu().detach().numpy())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy())
    print(f'Loss:  {np.mean(losses):.2f}')

    return fin_outputs, fin_targets, losses

In [None]:

test_out, targets, losses = testing()
outputs = np.array(test_out) >= 0.5
accuracy = metrics.accuracy_score(targets, outputs)
f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
f1_score_macro = metrics.f1_score(targets, outputs, average='macro')

print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Loss:  0.13
F1 Score (Micro) = 0.9388246843068135
F1 Score (Macro) = 0.9331258183654849


In [None]:
ruc_auc_score_micro = metrics.roc_auc_score(targets, outputs, average='micro')
ruc_auc_score_macro = metrics.roc_auc_score(targets, outputs, average='macro')

print(f"RUC AUC Score (Micro) = {ruc_auc_score_micro}")
print(f"RUC AUC Score (Macro) = {ruc_auc_score_macro}")

RUC AUC Score (Micro) = 0.9579052380290392
RUC AUC Score (Macro) = 0.9536846179738522


In [None]:
print(classification_report(targets, outputs, target_names=icd_classes_50, digits=4))

              precision    recall  f1-score   support

      250.00     0.8999    0.9165    0.9081       814
       272.4     0.8868    0.9222    0.9041       900
       401.9     0.9049    0.9392    0.9217      1448
      414.01     0.9452    0.9462    0.9457       893
      427.31     0.9714    0.9563    0.9638      1349
       428.0     0.9579    0.9608    0.9593      1301
      518.81     0.9594    0.9821    0.9706      1734
      530.81     0.9315    0.8760    0.9029       605
       584.9     0.9314    0.9418    0.9366      1341
       599.0     0.9190    0.9178    0.9184       766

   micro avg     0.9343    0.9434    0.9388     11151
   macro avg     0.9307    0.9359    0.9331     11151
weighted avg     0.9346    0.9434    0.9388     11151
 samples avg     0.9327    0.9403    0.9299     11151



  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
test_df['prediction'] = test_out
test_df['tar'] = targets

In [None]:
test_df

Unnamed: 0,id,text,labels,prediction,tar
0,122154,y o m with a history of severe mixed obstructi...,"[0, 0, 0, 0, 1, 0, 1, 0, 0, 0]","[0.0023793322, 0.00422487, 0.0025364656, 0.004...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, ..."
1,170495,42m w pancreatitis s p drainage of pseudo cyst...,"[1, 1, 1, 1, 0, 0, 1, 0, 0, 0]","[0.9856963, 0.97852844, 0.99405295, 0.977105, ...","[1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
2,115903,y o male s p fall off foot high scaffolding la...,"[1, 0, 0, 1, 0, 0, 1, 0, 0, 0]","[0.9967751, 0.026014842, 0.5737061, 0.9268575,...","[1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
3,169694,y o male w hx of cholecystitis s p percutaneou...,"[1, 0, 1, 0, 1, 0, 1, 0, 0, 1]","[0.99545175, 0.02555917, 0.9973022, 0.02012788...","[1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, ..."
4,173216,year old woman with h o small cell lung cancer...,"[0, 1, 1, 0, 0, 0, 0, 0, 0, 1]","[0.050301515, 0.9673785, 0.8236653, 0.00031972...","[0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...
3791,124288,61yr old female w esld pbc primary biliary cir...,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]","[0.0039596874, 0.0031641496, 0.0045863343, 0.0...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
3792,124288,61yr old female w esld pbc primary biliary cir...,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]","[0.0039596874, 0.0031641512, 0.004586341, 0.00...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
3793,146684,yo male with pmh including dm2 cri bl afib chf...,"[0, 0, 0, 0, 1, 1, 0, 0, 1, 0]","[0.003638031, 0.009747761, 0.004910932, 0.0174...","[0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, ..."
3794,146684,yo male with pmh including dm2 cri bl afib chf...,"[0, 0, 0, 0, 1, 1, 0, 0, 1, 0]","[0.0035016658, 0.046437673, 0.0052691936, 0.01...","[0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, ..."


In [None]:
note_count_dict = test_df.groupby('id').size().to_dict()
test_df['note_count'] = test_df['id'].map(note_count_dict)

In [None]:
test_df['out_bool'] = [(test_df['prediction'][i]>=0.5).astype(int) for i in test_df.index]

In [None]:
out_freq_dict = test_df.groupby('id').out_bool.apply(np.sum).to_dict()
test_df['num_pred'] = test_df['id'].map(out_freq_dict)
test_df['num_pred'] = [(test_df['num_pred'][i]>=0.4*test_df['note_count'][i]).astype(int) for i in test_df.index]

In [None]:
df_freq = test_df.drop_duplicates('id')

In [None]:
out_freq = np.vstack([df_freq['num_pred'][i] for i in df_freq.index])
targets = np.vstack([df_freq['tar'][i] for i in df_freq.index])

#targets = dev_tar
accuracy = metrics.accuracy_score(targets, out_freq)
f1_score_micro = metrics.f1_score(targets, out_freq, average='micro')
f1_score_macro = metrics.f1_score(targets, out_freq, average='macro')

print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

F1 Score (Micro) = 0.8947997609085475
F1 Score (Macro) = 0.8913302881788173


In [None]:
print(classification_report(targets, out_freq, target_names=icd_classes_50, digits=4))

              precision    recall  f1-score   support

      250.00     0.8169    0.8689    0.8421       267
       272.4     0.7978    0.9068    0.8488       322
       401.9     0.8615    0.9038    0.8821       530
      414.01     0.9079    0.9377    0.9226       305
      427.31     0.9537    0.9511    0.9524       368
       428.0     0.9013    0.9494    0.9248       356
      518.81     0.9030    0.9760    0.9381       334
      530.81     0.8667    0.8325    0.8492       203
       584.9     0.8647    0.9081    0.8859       359
       599.0     0.8485    0.8869    0.8673       221

   micro avg     0.8737    0.9170    0.8948      3265
   macro avg     0.8722    0.9121    0.8913      3265
weighted avg     0.8747    0.9170    0.8950      3265
 samples avg     0.8801    0.9105    0.8828      3265



  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
ruc_auc_score_micro = metrics.roc_auc_score(targets, out_freq, average='micro')
ruc_auc_score_macro = metrics.roc_auc_score(targets, out_freq, average='macro')

print(f"RUC AUC Score (Micro) = {ruc_auc_score_micro}")
print(f"RUC AUC Score (Macro) = {ruc_auc_score_macro}")

RUC AUC Score (Micro) = 0.9337422303009288
RUC AUC Score (Macro) = 0.9303691950157826


In [None]:
out_mean_dict = test_df.groupby('id').prediction.apply(np.mean).to_dict()
test_df['out_mean'] = test_df['id'].map(out_mean_dict)
test_df

Unnamed: 0,id,text,labels,prediction,tar,note_count,out_bool,num_pred,out_mean
0,122154,y o m with a history of severe mixed obstructi...,"[0, 0, 0, 0, 1, 0, 1, 0, 0, 0]","[0.0023793322, 0.00422487, 0.0025364656, 0.004...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, ...",37,"[0, 0, 0, 0, 1, 0, 1, 0, 0, 0]","[0, 0, 0, 0, 1, 0, 1, 0, 0, 0]","[0.004831251, 0.01380863, 0.014626026, 0.00978..."
1,170495,42m w pancreatitis s p drainage of pseudo cyst...,"[1, 1, 1, 1, 0, 0, 1, 0, 0, 0]","[0.9856963, 0.97852844, 0.99405295, 0.977105, ...","[1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...",1,"[1, 1, 1, 1, 0, 0, 1, 0, 0, 0]","[1, 1, 1, 1, 0, 0, 1, 0, 0, 0]","[0.9856963, 0.97852844, 0.99405295, 0.977105, ..."
2,115903,y o male s p fall off foot high scaffolding la...,"[1, 0, 0, 1, 0, 0, 1, 0, 0, 0]","[0.9967751, 0.026014842, 0.5737061, 0.9268575,...","[1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...",4,"[1, 0, 1, 1, 0, 0, 1, 0, 0, 0]","[1, 0, 1, 1, 0, 0, 1, 0, 0, 0]","[0.98116803, 0.03601263, 0.34381956, 0.9561689..."
3,169694,y o male w hx of cholecystitis s p percutaneou...,"[1, 0, 1, 0, 1, 0, 1, 0, 0, 1]","[0.99545175, 0.02555917, 0.9973022, 0.02012788...","[1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, ...",15,"[1, 0, 1, 0, 1, 0, 1, 0, 0, 1]","[1, 0, 1, 0, 1, 0, 1, 0, 0, 1]","[0.94792134, 0.020325365, 0.96252936, 0.023237..."
4,173216,year old woman with h o small cell lung cancer...,"[0, 1, 1, 0, 0, 0, 0, 0, 0, 1]","[0.050301515, 0.9673785, 0.8236653, 0.00031972...","[0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1,"[0, 1, 1, 0, 0, 0, 0, 0, 0, 0]","[0, 1, 1, 0, 0, 0, 0, 0, 0, 0]","[0.050301515, 0.9673785, 0.8236653, 0.00031972..."
...,...,...,...,...,...,...,...,...,...
3791,124288,61yr old female w esld pbc primary biliary cir...,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]","[0.0039596874, 0.0031641496, 0.0045863343, 0.0...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...",8,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]","[0.0036042677, 0.0036779707, 0.0044286577, 0.0..."
3792,124288,61yr old female w esld pbc primary biliary cir...,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]","[0.0039596874, 0.0031641512, 0.004586341, 0.00...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...",8,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]","[0.0036042677, 0.0036779707, 0.0044286577, 0.0..."
3793,146684,yo male with pmh including dm2 cri bl afib chf...,"[0, 0, 0, 0, 1, 1, 0, 0, 1, 0]","[0.003638031, 0.009747761, 0.004910932, 0.0174...","[0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, ...",29,"[0, 0, 0, 0, 1, 1, 0, 0, 1, 0]","[0, 0, 0, 0, 1, 1, 0, 0, 1, 0]","[0.020554462, 0.021448856, 0.046053637, 0.0698..."
3794,146684,yo male with pmh including dm2 cri bl afib chf...,"[0, 0, 0, 0, 1, 1, 0, 0, 1, 0]","[0.0035016658, 0.046437673, 0.0052691936, 0.01...","[0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, ...",29,"[0, 0, 0, 0, 1, 1, 0, 0, 1, 0]","[0, 0, 0, 0, 1, 1, 0, 0, 1, 0]","[0.020554462, 0.021448856, 0.046053637, 0.0698..."


In [None]:
df_mean = test_df.drop_duplicates('id')

In [None]:
out_mean = np.vstack([df_mean['out_mean'][i]>=0.5 for i in df_mean.index])
targets = np.vstack([df_mean['tar'][i] for i in df_mean.index])
#targets = dev_tar
accuracy = metrics.accuracy_score(targets, out_mean)
f1_score_micro = metrics.f1_score(targets, out_mean, average='micro')
f1_score_macro = metrics.f1_score(targets, out_mean, average='macro')

print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

F1 Score (Micro) = 0.9027756711663887
F1 Score (Macro) = 0.8989990673320142


In [None]:
print(classification_report(targets, out_mean, target_names=icd_classes_50, digits=4))

              precision    recall  f1-score   support

      250.00     0.8524    0.8652    0.8587       267
       272.4     0.8130    0.8913    0.8504       322
       401.9     0.8838    0.9038    0.8937       530
      414.01     0.9223    0.9344    0.9283       305
      427.31     0.9614    0.9484    0.9549       368
       428.0     0.9205    0.9438    0.9320       356
      518.81     0.9255    0.9671    0.9458       334
      530.81     0.8883    0.8227    0.8542       203
       584.9     0.8926    0.9025    0.8975       359
       599.0     0.8667    0.8824    0.8744       221

   micro avg     0.8942    0.9115    0.9028      3265
   macro avg     0.8927    0.9061    0.8990      3265
weighted avg     0.8950    0.9115    0.9029      3265
 samples avg     0.8926    0.9055    0.8884      3265



  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
ruc_auc_score_micro = metrics.roc_auc_score(targets, out_mean, average='micro')
ruc_auc_score_macro = metrics.roc_auc_score(targets, out_mean, average='macro')

print(f"RUC AUC Score (Micro) = {ruc_auc_score_micro}")
print(f"RUC AUC Score (Macro) = {ruc_auc_score_macro}")

RUC AUC Score (Micro) = 0.9356169397170293
RUC AUC Score (Macro) = 0.9321886860644197
