In [1]:
!pip install -q transformers

[K     |████████████████████████████████| 2.8 MB 14.4 MB/s 
[K     |████████████████████████████████| 636 kB 22.0 MB/s 
[K     |████████████████████████████████| 50 kB 4.1 MB/s 
[K     |████████████████████████████████| 3.3 MB 27.3 MB/s 
[K     |████████████████████████████████| 895 kB 24.8 MB/s 
[?25h

In [2]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.model_selection import train_test_split
import transformers
import torch
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig
from transformers import AutoTokenizer, AutoModel

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import classification_report


In [3]:
# Setting up the device for GPU usage

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
np.random.seed(123)
torch.manual_seed(123)
torch.cuda.manual_seed_all(123)

## Load data

In [6]:
#change to where you store mimic3 data
MIMIC_3_DIR = '/content/drive/MyDrive/Colab Notebooks/MSc-Individual-Project/datasets'

train_df = pd.read_csv('%s/train_10.csv' % MIMIC_3_DIR)
eval_df = pd.read_csv('%s/dev_10.csv' % MIMIC_3_DIR)
test_df = pd.read_csv('%s/test_10.csv' % MIMIC_3_DIR)

train_df.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,CATEGORY,TEXT,LABELS,length
0,17341,151110,Nursing/other,rsbi,584.9;427.31,1
1,61638,103816,Nursing,title,414.01,1
2,61638,103816,General,title,414.01,1
3,23706,186321,Nursing/other,npn,401.9;428.0;530.81,1
4,55265,191108,General,title,530.81;584.9;427.31,1


In [7]:
full_df = pd.concat([train_df, eval_df, test_df], ignore_index=True)


 ## Preprocess Data

In [8]:
# split labels by ";", then convert to list
def split_lab (x):
    #print(x)
    return x.split(";")

full_df['LABELS'] = full_df['LABELS'].apply(split_lab)
#full_df['TEXT'] = full_df['TEXT'].apply(split_lab)

full_df.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,CATEGORY,TEXT,LABELS,length
0,17341,151110,Nursing/other,rsbi,"[584.9, 427.31]",1
1,61638,103816,Nursing,title,[414.01],1
2,61638,103816,General,title,[414.01],1
3,23706,186321,Nursing/other,npn,"[401.9, 428.0, 530.81]",1
4,55265,191108,General,title,"[530.81, 584.9, 427.31]",1


In [9]:
#load multi label binarizer for one-hot encoding
mlb = MultiLabelBinarizer(sparse_output=True)



In [10]:
#change label to one-hot encoding per code
full_df = full_df.join(
            pd.DataFrame.sparse.from_spmatrix(
                mlb.fit_transform(full_df.pop('LABELS')),
                columns=mlb.classes_))

full_df.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,CATEGORY,TEXT,length,250.00,272.4,401.9,414.01,427.31,428.0,518.81,530.81,584.9,599.0
0,17341,151110,Nursing/other,rsbi,1,0,0,0,0,1,0,0,0,1,0
1,61638,103816,Nursing,title,1,0,0,0,1,0,0,0,0,0,0
2,61638,103816,General,title,1,0,0,0,1,0,0,0,0,0,0
3,23706,186321,Nursing/other,npn,1,0,0,1,0,0,1,0,1,0,0
4,55265,191108,General,title,1,0,0,0,0,1,0,0,1,1,0


In [11]:
# Convert columns to list of one hot encoding
icd_classes_50 = mlb.classes_

full_df['labels'] = full_df[icd_classes_50].values.tolist()
#train_df.sort_values(['length'], ascending=False, inplace=True)
full_df


Unnamed: 0,SUBJECT_ID,HADM_ID,CATEGORY,TEXT,length,250.00,272.4,401.9,414.01,427.31,428.0,518.81,530.81,584.9,599.0,labels
0,17341,151110,Nursing/other,rsbi,1,0,0,0,0,1,0,0,0,1,0,"[0, 0, 0, 0, 1, 0, 0, 0, 1, 0]"
1,61638,103816,Nursing,title,1,0,0,0,1,0,0,0,0,0,0,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0]"
2,61638,103816,General,title,1,0,0,0,1,0,0,0,0,0,0,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0]"
3,23706,186321,Nursing/other,npn,1,0,0,1,0,0,1,0,1,0,0,"[0, 0, 1, 0, 0, 1, 0, 1, 0, 0]"
4,55265,191108,General,title,1,0,0,0,0,1,0,0,1,1,0,"[0, 0, 0, 0, 1, 0, 0, 1, 1, 0]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294460,97158,152158,Discharge summary,admission date discharge date date of birth se...,4644,0,0,1,0,0,0,1,0,0,0,"[0, 0, 1, 0, 0, 0, 1, 0, 0, 0]"
294461,99650,199859,Discharge summary,admission date discharge date date of birth se...,5126,0,0,0,1,1,1,1,0,1,1,"[0, 0, 0, 1, 1, 1, 1, 0, 1, 1]"
294462,93623,187232,Discharge summary,admission date discharge date date of birth se...,5171,0,1,1,0,0,0,0,0,0,0,"[0, 1, 1, 0, 0, 0, 0, 0, 0, 0]"
294463,96260,110058,Discharge summary,admission date discharge date date of birth se...,5173,0,0,0,0,0,0,1,0,0,1,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 1]"


In [12]:
full_df.HADM_ID.unique().shape

(9446,)

In [13]:
full_df = full_df.drop(full_df[full_df['length']<300].index)

In [15]:
train_df, test_df = train_test_split(full_df, test_size=0.2)
train_df, eval_df = train_test_split(train_df, test_size=0.2)

In [16]:
train_df.sort_values(['length'], inplace=True)
eval_df.sort_values(['length'], inplace=True)
test_df.sort_values(['length'], inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [17]:
train_df = train_df.drop(train_df[train_df['CATEGORY']!='Radiology'].index)
eval_df = eval_df.drop(eval_df[eval_df['CATEGORY']!='Radiology'].index)
test_df = test_df.drop(test_df[test_df['CATEGORY']!='Radiology'].index)

In [18]:
train_df

Unnamed: 0,SUBJECT_ID,HADM_ID,CATEGORY,TEXT,length,250.00,272.4,401.9,414.01,427.31,428.0,518.81,530.81,584.9,599.0,labels
155043,13494,164195,Radiology,pm ct abdomen w o contrast ct pelvis w o contr...,300,0,0,0,0,0,0,1,0,0,1,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 1]"
155053,59736,153856,Radiology,am last name un intestinal tube placement w fl...,300,0,1,1,0,1,1,0,0,0,0,"[0, 1, 1, 0, 1, 1, 0, 0, 0, 0]"
232247,88090,196729,Radiology,pm ct c spine w o contrast clip clip number ra...,300,0,1,0,0,1,1,0,1,1,1,"[0, 1, 0, 0, 1, 1, 0, 1, 1, 1]"
155220,619,167213,Radiology,pm perc nephrosto clip clip number radiology r...,300,0,0,0,0,0,0,1,0,0,1,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 1]"
155124,18484,119517,Radiology,am picc line placment sch clip clip number rad...,300,1,0,0,0,0,1,0,0,0,1,"[1, 0, 0, 0, 0, 1, 0, 0, 0, 1]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204436,53968,124365,Radiology,pm mr cervical spine w o contrast mr thoracic ...,1579,0,0,0,0,0,0,0,0,0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
293432,93233,107354,Radiology,am mr name13 stitle w w o contrast mr name13 s...,1587,1,1,1,1,1,1,0,0,0,1,"[1, 1, 1, 1, 1, 1, 0, 0, 0, 1]"
247672,86377,153623,Radiology,pm month day year chest w w o c recons non cor...,1787,1,1,1,0,0,1,0,0,0,0,"[1, 1, 1, 0, 0, 1, 0, 0, 0, 0]"
205698,15803,192549,Radiology,am carot cereb hospital1 clip clip number radi...,1913,0,0,0,0,1,1,0,1,0,0,"[0, 0, 0, 0, 1, 1, 0, 1, 0, 0]"


In [19]:
train_df.HADM_ID.unique().shape

(3331,)

In [20]:
#convert into 2 columns dataframe
train_df = pd.DataFrame(train_df, columns=['TEXT', 'labels'])
train_df.columns=['text', 'labels']
train_df.head()

eval_df = pd.DataFrame(eval_df, columns=['TEXT', 'labels'])
eval_df.columns=['text', 'labels']
eval_df.head()

test_df = pd.DataFrame(test_df, columns=['HADM_ID', 'TEXT', 'labels'])
test_df.columns=['id', 'text', 'labels']


In [21]:
train_df.reset_index(drop=True, inplace=True)
eval_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)
test_df.head()

Unnamed: 0,id,text,labels
0,135559,pm pelvis limited pelvis u s transvaginal clip...,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0]"
1,183196,pm ct chest w contrast clip clip number radiol...,"[1, 0, 0, 1, 0, 0, 0, 0, 0, 0]"
2,180782,am picc line placment sch clip clip number rad...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]"
3,192214,am ct head w contrast clip clip number radiolo...,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0]"
4,146593,am bx needle liver by radiologist guidance loc...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]"


### Set Model Parameters

In [22]:
# Defining some key variables to configure model training
MAX_LEN = 512
TRAIN_BATCH_SIZE = 12
VALID_BATCH_SIZE = 8
TEST_BATCH_SIZE = 8
EPOCHS = 5
LEARNING_RATE = 3e-05

#set tokenizer
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

Downloading:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

### Preparing Dataloader

In [23]:
#custom dataset for BERT class
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        
        '''
            set text as training data
            set labels as targets
        '''
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.labels
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [24]:
#load df to dataset

training_set = CustomDataset(train_df, tokenizer, MAX_LEN)
valid_set = CustomDataset(eval_df, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_df, tokenizer, MAX_LEN)

In [25]:
#data loader
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': False
                }

val_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False
                }

test_params = {'batch_size': TEST_BATCH_SIZE,
                'shuffle': False
                }

training_loader = DataLoader(training_set, **train_params)
valid_loader = DataLoader(valid_set, **val_params)
testing_loader = DataLoader(testing_set, **test_params)

### Create model class from pretrained model

In [26]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model. 

class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()

        self.l1 = transformers.AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT", return_dict=False)
        #self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased', return_dict=False)
        
        
        self.l2 = torch.nn.Dropout(0.3)
        

        self.l3 = torch.nn.Linear(768, 10)
    
    def forward(self, ids, mask, token_type_ids):
#        print("ids: ", ids.size(), "mask: ", mask.size(), "token type ids: ", token_type_ids.size())
        _, output_1= self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output

model = BERTClass()
model.to(device)

Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    

In [27]:
#loss function
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [28]:
#optimizer
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

### Train fine-tuning model

In [29]:
def train(epoch):
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()
        
    print(f'Epoch: {epoch}, Training Loss:  {loss.item()}')

In [30]:
# Evaluate the model

def validation(epoch):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    losses=[]
    with torch.no_grad():
        for _, data in enumerate(valid_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            loss = loss_fn(outputs, targets)
            losses.append(loss.item())

            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    print(f'Epoch: {epoch}, Validation Loss:  {np.mean(losses):.2f}')
    return fin_outputs, fin_targets, losses

In [None]:
start_epoch=5
DIR = '/content/drive/MyDrive/Colab Notebooks/MSc-Individual-Project/'
resume = True     
if resume:
    if os.path.isfile(f"%s/models/models_rad_epoch{start_epoch}.pth" % DIR):
        print("Resume from checkpoint...")
        checkpoint = torch.load(f"%s/models/models_rad_epoch{start_epoch}.pth" % DIR)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        initepoch = checkpoint['epoch']
        print("====>loaded checkpoint (epoch{})".format(checkpoint['epoch']))
    else:
        print("====>no checkpoint found.")
        initepoch = 0

for epoch in tqdm(range(EPOCHS)):
    train(epoch)
    validation(epoch)

    if (epoch+start_epoch+1) >8:
        checkpoint = {"model_state_dict": model.state_dict(),
                      "optimizer_state_dict": optimizer.state_dict(),
                      "epoch": epoch+start_epoch+1}
        path_checkpoint = f"%s/models/models_rad_epoch{epoch+start_epoch+1}.pth" % DIR
        torch.save(checkpoint, path_checkpoint)



Resume from checkpoint...
====>loaded checkpoint (epoch5)




Epoch: 0, Training Loss:  0.41276055574417114


 20%|██        | 1/5 [06:13<24:55, 373.89s/it]

Epoch: 0, Validation Loss:  0.50
Epoch: 1, Training Loss:  0.35901501774787903


 40%|████      | 2/5 [12:27<18:42, 374.02s/it]

Epoch: 1, Validation Loss:  0.52
Epoch: 2, Training Loss:  0.30958372354507446


 60%|██████    | 3/5 [18:42<12:28, 374.12s/it]

Epoch: 2, Validation Loss:  0.54
Epoch: 3, Training Loss:  0.2604365646839142
Epoch: 3, Validation Loss:  0.56


 80%|████████  | 4/5 [25:02<06:16, 376.35s/it]

Epoch: 4, Training Loss:  0.21886220574378967
Epoch: 4, Validation Loss:  0.61


100%|██████████| 5/5 [31:36<00:00, 379.39s/it]


In [31]:

DIR = '/content/drive/MyDrive/Colab Notebooks/MSc-Individual-Project/'

checkpoint = torch.load(f"%s/models/models_rad_epoch10.pth" % DIR)
model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>


### Model Evaluation

In [None]:
# Evaluate the model

def evaluation():
    model.eval()

    fin_targets=[]
    fin_outputs=[]
    losses=[]
    with torch.no_grad():
        for _, data in enumerate(valid_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            loss = loss_fn(outputs, targets)
            losses.append(loss.item())

            fin_targets.extend(targets.cpu().detach().numpy())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy())
    print(f'Loss:  {np.mean(losses):.2f}')
    return fin_outputs, fin_targets, losses

In [None]:
dev_out, dev_tar, losses = evaluation()



Loss:  0.06


In [32]:
# Evaluate the model
def testing():
    model.eval()

    fin_targets=[]
    fin_outputs=[]
    losses=[]
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            loss = loss_fn(outputs, targets)
            losses.append(loss.item())

            fin_targets.extend(targets.cpu().detach().numpy())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy())
    print(f'Loss:  {np.mean(losses):.2f}')

    return fin_outputs, fin_targets, losses

In [33]:

test_out, targets, losses = testing()
outputs = np.array(test_out) >= 0.5
accuracy = metrics.accuracy_score(targets, outputs)
f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
f1_score_macro = metrics.f1_score(targets, outputs, average='macro')

print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Loss:  0.60
F1 Score (Micro) = 0.47821991994348956
F1 Score (Macro) = 0.4268194053330652


In [34]:
ruc_auc_score_micro = metrics.roc_auc_score(targets, outputs, average='micro')
ruc_auc_score_macro = metrics.roc_auc_score(targets, outputs, average='macro')

print(f"RUC AUC Score (Micro) = {ruc_auc_score_micro}")
print(f"RUC AUC Score (Macro) = {ruc_auc_score_macro}")

RUC AUC Score (Micro) = 0.6574898952554749
RUC AUC Score (Macro) = 0.6313074681140318


In [35]:
print(classification_report(targets, outputs, target_names=icd_classes_50, digits=4))

              precision    recall  f1-score   support

      250.00     0.3401    0.1965    0.2491       341
       272.4     0.3610    0.2667    0.3067       375
       401.9     0.6519    0.5763    0.6118       871
      414.01     0.5376    0.3717    0.4396       269
      427.31     0.4966    0.5960    0.5418       495
       428.0     0.4583    0.6227    0.5280       432
      518.81     0.4909    0.6110    0.5444       527
      530.81     0.2803    0.1623    0.2056       228
       584.9     0.4873    0.5044    0.4957       458
       599.0     0.4779    0.2707    0.3456       399

   micro avg     0.4955    0.4621    0.4782      4395
   macro avg     0.4582    0.4178    0.4268      4395
weighted avg     0.4878    0.4621    0.4655      4395
 samples avg     0.4796    0.4753    0.4412      4395



  _warn_prf(average, modifier, msg_start, len(result))


In [36]:
test_df['prediction'] = test_out
test_df['tar'] = targets

In [37]:
test_df

Unnamed: 0,id,text,labels,prediction,tar
0,135559,pm pelvis limited pelvis u s transvaginal clip...,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0]","[0.0707193, 0.4297712, 0.8514224, 0.048973903,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
1,183196,pm ct chest w contrast clip clip number radiol...,"[1, 0, 0, 1, 0, 0, 0, 0, 0, 0]","[0.019412117, 0.0073652496, 0.10875898, 0.1892...","[1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,180782,am picc line placment sch clip clip number rad...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]","[0.15476291, 0.040511917, 0.5545002, 0.0435407...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."
3,192214,am ct head w contrast clip clip number radiolo...,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0]","[0.020170476, 0.0411623, 0.07619405, 0.0305041...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
4,146593,am bx needle liver by radiologist guidance loc...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]","[0.13316026, 0.11097865, 0.035170946, 0.057293...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."
...,...,...,...,...,...
1840,107946,pm cta abd w w o c recons cta pelvis w w o c r...,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]","[0.17107764, 0.06228826, 0.3029179, 0.01553796...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
1841,131143,pm pulmonary angio clip clip number radiology ...,"[1, 0, 0, 0, 0, 1, 1, 0, 0, 0]","[0.8107605, 0.025928473, 0.6501448, 0.02214550...","[1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, ..."
1842,174861,pm mr first name titles last name titles fx p ...,"[0, 1, 0, 1, 1, 1, 0, 0, 0, 0]","[0.2857634, 0.06713392, 0.013770675, 0.1710461...","[0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, ..."
1843,125824,am messenertic clip clip number radiology reas...,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0]","[0.03213908, 0.03304323, 0.1829054, 0.00357507...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."


In [38]:
note_count_dict = test_df.groupby('id').size().to_dict()
test_df['note_count'] = test_df['id'].map(note_count_dict)

In [39]:
test_df['out_bool'] = [(test_df['prediction'][i]>=0.5).astype(int) for i in test_df.index]

In [40]:
out_freq_dict = test_df.groupby('id').out_bool.apply(np.sum).to_dict()
test_df['num_pred'] = test_df['id'].map(out_freq_dict)
test_df['num_pred'] = [(test_df['num_pred'][i]>=0.4*test_df['note_count'][i]).astype(int) for i in test_df.index]

In [41]:
df_freq = test_df.drop_duplicates('id')

In [42]:
out_freq = np.vstack([df_freq['num_pred'][i] for i in df_freq.index])
targets = np.vstack([df_freq['tar'][i] for i in df_freq.index])

#targets = dev_tar
accuracy = metrics.accuracy_score(targets, out_freq)
f1_score_micro = metrics.f1_score(targets, out_freq, average='micro')
f1_score_macro = metrics.f1_score(targets, out_freq, average='macro')

print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

F1 Score (Micro) = 0.47584166784054094
F1 Score (Macro) = 0.42442692160766365


In [43]:
print(classification_report(targets, out_freq, target_names=icd_classes_50, digits=4))

              precision    recall  f1-score   support

      250.00     0.3158    0.1915    0.2384       282
       272.4     0.3710    0.2939    0.3280       313
       401.9     0.6385    0.5912    0.6139       702
      414.01     0.5269    0.3729    0.4367       236
      427.31     0.4754    0.6137    0.5358       409
       428.0     0.4646    0.6463    0.5406       376
      518.81     0.4538    0.6146    0.5221       384
      530.81     0.2672    0.1615    0.2013       192
       584.9     0.4808    0.5348    0.5063       374
       599.0     0.4247    0.2582    0.3211       306

   micro avg     0.4791    0.4726    0.4758      3574
   macro avg     0.4419    0.4278    0.4244      3574
weighted avg     0.4707    0.4726    0.4625      3574
 samples avg     0.4683    0.4869    0.4389      3574



  _warn_prf(average, modifier, msg_start, len(result))


In [44]:
ruc_auc_score_micro = metrics.roc_auc_score(targets, out_freq, average='micro')
ruc_auc_score_macro = metrics.roc_auc_score(targets, out_freq, average='macro')

print(f"RUC AUC Score (Micro) = {ruc_auc_score_micro}")
print(f"RUC AUC Score (Macro) = {ruc_auc_score_macro}")

RUC AUC Score (Micro) = 0.6537062081044721
RUC AUC Score (Macro) = 0.6269679328559374


In [45]:
out_mean_dict = test_df.groupby('id').prediction.apply(np.mean).to_dict()
test_df['out_mean'] = test_df['id'].map(out_mean_dict)
test_df


Unnamed: 0,id,text,labels,prediction,tar,note_count,out_bool,num_pred,out_mean
0,135559,pm pelvis limited pelvis u s transvaginal clip...,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0]","[0.0707193, 0.4297712, 0.8514224, 0.048973903,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...",1,"[0, 0, 1, 0, 0, 1, 0, 0, 1, 0]","[0, 0, 1, 0, 0, 1, 0, 0, 1, 0]","[0.0707193, 0.4297712, 0.8514224, 0.048973903,..."
1,183196,pm ct chest w contrast clip clip number radiol...,"[1, 0, 0, 1, 0, 0, 0, 0, 0, 0]","[0.019412117, 0.0073652496, 0.10875898, 0.1892...","[1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1,"[0, 0, 0, 0, 1, 1, 0, 0, 0, 0]","[0, 0, 0, 0, 1, 1, 0, 0, 0, 0]","[0.019412117, 0.0073652496, 0.10875898, 0.1892..."
2,180782,am picc line placment sch clip clip number rad...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]","[0.15476291, 0.040511917, 0.5545002, 0.0435407...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...",2,"[0, 0, 1, 0, 0, 0, 0, 0, 1, 0]","[0, 0, 1, 0, 0, 0, 0, 0, 1, 0]","[0.09120376, 0.022708733, 0.37408617, 0.036525..."
3,192214,am ct head w contrast clip clip number radiolo...,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0]","[0.020170476, 0.0411623, 0.07619405, 0.0305041...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...",3,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0]","[0.017007744, 0.04108332, 0.043928098, 0.06916..."
4,146593,am bx needle liver by radiologist guidance loc...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]","[0.13316026, 0.11097865, 0.035170946, 0.057293...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...",1,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]","[0.13316026, 0.11097865, 0.035170946, 0.057293..."
...,...,...,...,...,...,...,...,...,...
1840,107946,pm cta abd w w o c recons cta pelvis w w o c r...,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]","[0.17107764, 0.06228826, 0.3029179, 0.01553796...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...",2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]","[0.12603074, 0.11720364, 0.25521073, 0.0285639..."
1841,131143,pm pulmonary angio clip clip number radiology ...,"[1, 0, 0, 0, 0, 1, 1, 0, 0, 0]","[0.8107605, 0.025928473, 0.6501448, 0.02214550...","[1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, ...",1,"[1, 0, 1, 0, 0, 0, 0, 0, 0, 0]","[1, 0, 1, 0, 0, 0, 0, 0, 0, 0]","[0.8107605, 0.025928473, 0.6501448, 0.02214550..."
1842,174861,pm mr first name titles last name titles fx p ...,"[0, 1, 0, 1, 1, 1, 0, 0, 0, 0]","[0.2857634, 0.06713392, 0.013770675, 0.1710461...","[0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, ...",1,"[0, 0, 0, 0, 1, 1, 0, 0, 1, 0]","[0, 0, 0, 0, 1, 1, 0, 0, 1, 0]","[0.2857634, 0.06713392, 0.013770675, 0.1710461..."
1843,125824,am messenertic clip clip number radiology reas...,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0]","[0.03213908, 0.03304323, 0.1829054, 0.00357507...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...",2,"[0, 0, 0, 0, 1, 1, 1, 0, 1, 0]","[1, 0, 0, 0, 1, 1, 1, 1, 1, 0]","[0.28648278, 0.06495934, 0.10469299, 0.0757804..."


In [46]:
df_mean = test_df.drop_duplicates('id')

In [47]:
out_mean = np.vstack([df_mean['out_mean'][i]>=0.5 for i in df_mean.index])
targets = np.vstack([df_mean['tar'][i] for i in df_mean.index])
#targets = dev_tar
accuracy = metrics.accuracy_score(targets, out_mean)
f1_score_micro = metrics.f1_score(targets, out_mean, average='micro')
f1_score_macro = metrics.f1_score(targets, out_mean, average='macro')

print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

F1 Score (Micro) = 0.4680415386865584
F1 Score (Macro) = 0.415433399981763


In [48]:
print(classification_report(targets, out_mean, target_names=icd_classes_50, digits=4))

              precision    recall  f1-score   support

      250.00     0.3176    0.1667    0.2186       282
       272.4     0.3705    0.2652    0.3091       313
       401.9     0.6412    0.5627    0.5994       702
      414.01     0.5380    0.3602    0.4315       236
      427.31     0.4919    0.5941    0.5382       409
       428.0     0.4793    0.6144    0.5385       376
      518.81     0.4665    0.5807    0.5174       384
      530.81     0.2762    0.1510    0.1953       192
       584.9     0.4885    0.5134    0.5007       374
       599.0     0.4364    0.2353    0.3057       306

   micro avg     0.4903    0.4477    0.4680      3574
   macro avg     0.4506    0.4044    0.4154      3574
weighted avg     0.4791    0.4477    0.4534      3574
 samples avg     0.4723    0.4587    0.4289      3574



  _warn_prf(average, modifier, msg_start, len(result))


In [49]:
ruc_auc_score_micro = metrics.roc_auc_score(targets, out_mean, average='micro')
ruc_auc_score_macro = metrics.roc_auc_score(targets, out_mean, average='macro')

print(f"RUC AUC Score (Micro) = {ruc_auc_score_micro}")
print(f"RUC AUC Score (Macro) = {ruc_auc_score_macro}")

RUC AUC Score (Micro) = 0.6490367489563685
RUC AUC Score (Macro) = 0.6232357815183714
