## Modules + File Paths

In [None]:
from transformers import BertTokenizer, TFBertModel, BertConfig
import numpy as np
import pickle
import re

In [None]:
# Machine-specific locations: edit both paths to match where the archives were extracted.
biobert = 'D:/Downloads/LengthOfStay_Tests/pucpr/biobertpt-all' #folder where the BioBertPT files were unzipped
folder_path = 'D:/Downloads/Results/Stage_6' #folder where the BRATECA V1 files were unzipped

# NOTE(review): pickle.load can execute arbitrary code stored in the file; only
# load pickles that were produced by a trusted earlier stage of this pipeline.
# Exam dictionary: tokenize() below reads each value as
# [0] exam feature vector, [1] length-of-stay flag, [2] outcome string.
with (open(folder_path+'/exam_feature_dict.pkl', "rb")) as openfile:
    adm_exam_features = pickle.load(openfile)
# Note dictionary: tokenize() below reads value[0] as a sequence of notes and
# takes the text of each note from note[0].
with (open(folder_path+'/note_dict.pkl', "rb")) as openfile:
    cn_features = pickle.load(openfile)

## Constructing Test Sets

In [None]:
# Keep only the keys that appear in BOTH dictionaries (exam features and notes).
# A list built by walking the (insertion-ordered) exam dict is used instead of a
# set intersection: set iteration order is not guaranteed to be the same from one
# interpreter run to the next (e.g. str keys under hash randomisation), and the
# order of these keys decides which examples end up in train/val/test below.
shared_keys = [key for key in adm_exam_features if key in cn_features]

In [None]:
def tokenize(exam_dict, note_dict, patient_keys, tokenizer, max_length=512):
    """Build the model inputs and both label vectors for a set of admissions.

    Args:
        exam_dict: mapping key -> sequence where [0] is the exam feature
            vector, [1] is a truthy/falsy length-of-stay flag and [2] is the
            outcome string ('Alta', 'Obito', or anything else).
        note_dict: mapping key -> sequence whose [0] is an iterable of notes;
            the text of every note is read from note[0].
        patient_keys: iterable of keys present in both dictionaries. The rows
            of every returned array follow its iteration order.
        tokenizer: object exposing ``encode_plus`` that returns a mapping with
            'input_ids' and 'attention_mask' (e.g. a BertTokenizer).
        max_length: length every note is padded/truncated to (default 512,
            the value that used to be hard-coded).

    Returns:
        Tuple of five numpy arrays, row-aligned with ``patient_keys``:
        exam features (float64), token ids (int32), attention masks (int32),
        length-of-stay labels (int32; 1 when the flag is truthy, else 0) and
        mortality labels (int32; 0 for 'Alta', 1 for 'Obito', 2 otherwise).
    """
    #any outcome string missing from this table is encoded as 2
    mortality_codes = {'Alta': 0, 'Obito': 1}
    exam_features, note_tokens, token_masks, los_labels, mort_labels = [],[],[],[],[]
    for key in patient_keys:
        exam_entry = exam_dict[key]

        #bert input: all notes of the admission glued together without a separator;
        #join() avoids the quadratic cost of repeated string concatenation
        concatenated_note = ''.join(note[0] for note in note_dict[key][0])
        #replace symbol characters with spaces before tokenizing
        preprocessed_note = re.sub('[@#$%&*<>]',' ',concatenated_note)
        inputs = tokenizer.encode_plus(preprocessed_note, add_special_tokens=True, max_length=max_length,
                                       padding='max_length', return_attention_mask=True, truncation=True)

        #append to NN input arrays
        exam_features.append(exam_entry[0])
        note_tokens.append(inputs['input_ids'])
        token_masks.append(inputs['attention_mask'])
        los_labels.append(1 if exam_entry[1] else 0)
        mort_labels.append(mortality_codes.get(exam_entry[2], 2))

    return (np.asarray(exam_features, dtype='float64'),
            np.asarray(note_tokens, dtype='int32'),
            np.asarray(token_masks, dtype='int32'),
            np.asarray(los_labels, dtype='int32'),
            np.asarray(mort_labels, dtype='int32'))

In [None]:
# Load the tokenizer from the local BioBertPT folder.
# truncation_side='left' removes tokens from the START of an over-long input, so
# the end of the concatenated notes is what survives truncation; padding is
# appended on the right.
# do_lower_case is a boolean option: pass True rather than the string 'True'
# (any non-empty string, including 'False', would be treated as truthy).
tokenizer = BertTokenizer.from_pretrained(biobert, do_lower_case=True, truncation_side='left',
                                          padding_side='right')

In [None]:
# Tokenize every admission that has both exam features and notes; the five
# returned arrays are row-aligned (row i of each one belongs to the same key).
exam_features, note_tokens, token_masks, los_labels, mort_labels = tokenize(adm_exam_features,cn_features,
                                                                                shared_keys,tokenizer)

In [None]:
# The raw dictionaries are no longer needed once the arrays exist; rebinding
# both names to None lets the loaded objects be garbage-collected.
adm_exam_features = cn_features = None

In [None]:
# Class balance of the length-of-stay labels before any balancing is applied.
print('Length of Stay Prediction\nLabel 0 = Less or equal to 7 days\nLabel 1 = More than 7 days\n')
print('Quantity of Label 0 Examples:', sum(1 for label in los_labels if label == 0))
print('Quantity of Label 1 Examples:', sum(1 for label in los_labels if label == 1))
print('Total Examples:', len(los_labels))
# Positives per negative, rounded to 2 decimals; passed to make_datasets later
# to size the proportional test set.
los_prop = round(sum(1 for label in los_labels if label == 1)/sum(1 for label in los_labels if label == 0),2)
print('Positive-to-Negative: ',los_prop,':1',sep='')

In [None]:
# Class balance of the mortality labels before any balancing is applied.
# tokenize() encodes outcomes other than 'Alta'/'Obito' as 2: those rows are
# included in 'Total Examples' but in neither of the two class counts.
print('Mortality Prediction\nLabel 0 = Discharge\nLabel 1 = Death\n')
print('Quantity of Label 0 Examples:', sum(1 for label in mort_labels if label == 0))
print('Quantity of Label 1 Examples:', sum(1 for label in mort_labels if label == 1))
print('Total Examples:', len(mort_labels))
# Positives per negative, rounded to 2 decimals; passed to make_datasets later
# to size the proportional test set.
mort_prop = round(sum(1 for label in mort_labels if label == 1)/sum(1 for label in mort_labels if label == 0),2)
print('Positive-to-Negative: ',mort_prop,':1',sep='')

In [None]:
#balance dataset
def balance_dataset(labels):
    """Split example indexes into a 1:1 balanced subset plus the left-overs.

    Every label-1 index is kept. Label-0 indexes are taken one per positive:
    first those that appear after a still-unmatched positive, then, if
    positives are still unmatched once the scan ends, label-0 indexes are
    moved over from the end of the left-over list.

    Args:
        labels: sequence of labels; 1 is positive, 0 is negative. Any other
            value (e.g. the 2 that tokenize() uses for other outcomes) is never
            selected as a negative and always stays in the left-over list.

    Returns:
        (indexes_false, indexes_true, indexes_rest) -- note the order:
        negatives first. If there are fewer negatives than positives,
        indexes_false is simply shorter than indexes_true.
    """
    unmatched = 0 #positives seen so far that do not yet have a negative partner
    indexes_false = [] #indexes of negatives in the balanced dataset
    indexes_true = []  #indexes of positives in the balanced dataset
    indexes_rest = []  #indexes of examples left out of the balanced dataset
    for i,label in enumerate(labels):
        if label == 1: #positives are assumed to be the minority class: always keep them
            indexes_true.append(i)
            unmatched += 1
        elif label == 0 and unmatched > 0: #a positive is waiting for a partner
            indexes_false.append(i)
            unmatched -= 1
        else: #surplus negative, or a label that is neither 0 nor 1
            indexes_rest.append(i)

    #top-up: walk the left-overs from the end, moving real negatives only.
    #The original popped blindly, which could promote a non-0 label to the
    #negative class and raised IndexError once the left-overs ran out.
    position = len(indexes_rest) - 1
    while len(indexes_true) > len(indexes_false) and position >= 0:
        if labels[indexes_rest[position]] == 0:
            indexes_false.append(indexes_rest.pop(position))
        position -= 1
    return indexes_false, indexes_true, indexes_rest

In [None]:
# balance_dataset returns (negatives, positives, left-overs), in that order.
los_false, los_true, los_rest = balance_dataset(los_labels)

In [None]:
# Sizes of the balanced length-of-stay subset and of what was left out of it.
print('Length of Stay Prediction\nLabel 0 = Less or equal to 7 days\nLabel 1 = More than 7 days\n')
print('Balanced Quantity of Label 0:',len(los_false))
print('Balanced Quantity of Label 1:',len(los_true))
print('Total Balanced Examples:', len(los_false)+len(los_true))
print('Total Left-over Examples:', len(los_rest))

In [None]:
# balance_dataset returns (negatives, positives, left-overs), in that order.
mort_false, mort_true, mort_rest = balance_dataset(mort_labels)

In [None]:
# Sizes of the balanced mortality subset and of what was left out of it.
print('Mortality Prediction\nLabel 0 = Discharge\nLabel 1 = Death\n')
print('Balanced Quantity of Label 0:',len(mort_false))
print('Balanced Quantity of Label 1:',len(mort_true))
print('Total Balanced Examples:', len(mort_false)+len(mort_true))
print('Total Left-over Examples:', len(mort_rest))

In [None]:
def _gather_examples(order, exam_features, note_tokens, token_masks, labels):
    """Return [tokens, masks, exam, labels] numpy arrays holding the rows listed in ``order``."""
    return [
        np.array([note_tokens[i] for i in order]),
        np.array([token_masks[i] for i in order]),
        np.array([exam_features[i] for i in order]),
        np.array([labels[i] for i in order]),
    ]


#join features and labels into the test sets by using the indexes
def make_datasets(indexes_true, indexes_false, indexes_rest,exam_features,note_tokens,token_masks,labels,prop):
    """Assemble train/test/validation splits plus a class-proportional test set.

    The i-th negative is paired with the i-th positive. The first 70% of the
    pairs go to training, the next 10% to validation and the remainder to the
    balanced test set; inside every split the rows alternate negative,
    positive. The proportional test set is the balanced test set followed by
    extra label-0 examples taken from ``indexes_rest`` (in order) until it
    holds about ``positives / prop`` negatives, or the left-overs run out.

    Args:
        indexes_true: indexes of the positive examples (at least as many as
            ``indexes_false``, otherwise an IndexError is raised).
        indexes_false: indexes of the negative examples.
        indexes_rest: indexes of examples outside the balanced subset; only
            those whose label is 0 are eligible for the proportional test set.
        exam_features, note_tokens, token_masks, labels: row-aligned,
            index-addressable sequences or arrays.
        prop: positive-to-negative ratio of the full dataset. A value <= 0
            means "no upper bound": every eligible left-over is added.

    Returns:
        [training_data, testing_data, val_data, prop_testing_data]; each item
        is a list [token ids, attention masks, exam features, labels] of numpy
        arrays.
    """
    TRAIN_PERCENTAGE = 0.7 #percentage of dataset that will be used to train the model
    VAL_PERCENTAGE = 0.1 #percentage of dataset that will be used to validate the model
    pair_count = len(indexes_false)
    train_end = round(pair_count*TRAIN_PERCENTAGE) #first pair that is NOT in the train split
    val_end = train_end+round(pair_count*VAL_PERCENTAGE) #first pair that is NOT in the validation split

    def interleave(start, stop):
        #indexes of pairs [start, stop), ordered negative, positive, negative, positive, ...
        order = []
        for i in range(start, min(stop, pair_count)):
            order.append(indexes_false[i])
            order.append(indexes_true[i])
        return order

    train_order = interleave(0, train_end)
    val_order = interleave(train_end, val_end)
    test_order = interleave(val_end, pair_count)

    #proportional test set: the balanced test set plus extra NEGATIVES from the left-overs
    positives_in_test = len(test_order)//2 #the balanced test set is half positives, half negatives
    if prop > 0:
        wanted_negatives = round(positives_in_test/prop)
    else:
        wanted_negatives = float('inf') #ratio of 0: avoid ZeroDivisionError, take every left-over negative
    extra_needed = wanted_negatives-positives_in_test #the balanced part already holds positives_in_test negatives
    extra_order = []
    for key in indexes_rest:
        if len(extra_order) >= extra_needed:
            break
        if labels[key] == 0: #skip anything that is not a real negative (e.g. label 2)
            extra_order.append(key)

    #turn every split into numpy arrays for use with TensorFlow
    columns = (exam_features, note_tokens, token_masks, labels)
    training_data = _gather_examples(train_order, *columns)
    testing_data = _gather_examples(test_order, *columns)
    val_data = _gather_examples(val_order, *columns)
    prop_testing_data = _gather_examples(test_order+extra_order, *columns)

    return [training_data, testing_data, val_data, prop_testing_data]

In [None]:
# Careful with argument order: balance_dataset returns (false, true, rest) but
# make_datasets takes (true, false, rest); the positives are passed first here.
# Each result is [training_data, testing_data, val_data, prop_testing_data].
los_testset = make_datasets(los_true, los_false, los_rest, exam_features, note_tokens, token_masks, los_labels,los_prop)
mort_testset = make_datasets(mort_true, mort_false, mort_rest, exam_features, note_tokens, token_masks, mort_labels,mort_prop)

In [None]:
def print_stats(balanced_train_labels,balanced_test_labels,proportional_test_labels):
    """Print the size and the positive-to-negative ratio of three label sets.

    For the balanced train labels, the balanced test labels and the
    proportional test labels (in that order) two lines are printed: the number
    of examples, then the count of labels equal to 1 divided by the count of
    labels equal to 0, rounded to two decimals. A set with no label equal to 0
    raises ZeroDivisionError after its total line has been printed.
    """
    sections = (
        ('Total Balanced Train Examples:', 'Balanced Training Positive-to-Negative: ', balanced_train_labels),
        ('Total Balanced Test Examples:', 'Balanced Testing Positive-to-Negative: ', balanced_test_labels),
        ('Total Proportional Test Examples:', 'Proportional Positive-to-Negative: ', proportional_test_labels),
    )
    for total_caption, ratio_caption, split_labels in sections:
        print(total_caption, len(split_labels))
        negatives = sum(1 for value in split_labels if value == 0)
        positives = sum(1 for value in split_labels if value == 1)
        print(ratio_caption, round(positives/negatives,2), ':1.0', sep='')

In [None]:
# Outer index picks the split ([0] train, [1] balanced test, [3] proportional
# test); inner index [3] is the labels array of that split.
print('Length of Stay Prediction\nLabel 0 = Less or equal to 7 days\nLabel 1 = More than 7 days\n')
print_stats(los_testset[0][3],los_testset[1][3],los_testset[3][3])

In [None]:
# Outer index picks the split ([0] train, [1] balanced test, [3] proportional
# test); inner index [3] is the labels array of that split.
print('Mortality Prediction\nLabel 0 = Discharge\nLabel 1 = Death\n')
print_stats(mort_testset[0][3],mort_testset[1][3],mort_testset[3][3])

In [None]:
# Write both dataset bundles into the same folder the input pickles were read from.
# (The handle is called note_file, but what is written are the dataset lists
# returned by make_datasets.)
with open(folder_path+'/los_testset.pkl', 'wb') as note_file:
    pickle.dump(los_testset, note_file, protocol=pickle.HIGHEST_PROTOCOL)
with open(folder_path+'/mort_testset.pkl', 'wb') as note_file:
    pickle.dump(mort_testset, note_file, protocol=pickle.HIGHEST_PROTOCOL)