In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from sklearn.metrics import matthews_corrcoef
import os
from transformers import BertTokenizer
from sklearn.metrics import roc_auc_score, classification_report,average_precision_score
from sklearn.metrics import matthews_corrcoef, confusion_matrix, recall_score, accuracy_score
from scipy.special import expit as sigmoid

In [None]:
dirs_dct = dict(list(pd.read_csv('../directory_paths.csv')['paths'].apply(eval)))
checkpoints_dir = dirs_dct['checkpoints_dir']

In [2]:
bert_tokenizer = BertTokenizer.from_pretrained("bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12")

In [3]:
bio_tokenizer =  BertTokenizer.from_pretrained("bert-base-uncased")

In [4]:
i =196
# print(bio_tokenizer.vocab[i], bio_tokenizer.vocab[i])
bio_tokenizer.encode('this is a test hypertension sentence') ==bert_tokenizer.encode('this is a test hypertension sentence')

True

In [5]:
EXP_NAME = 'FINALLogistic'

# wandb.init(settings=wandb.Settings(start_method="fork"))

for i in range(100):
    if EXP_NAME +'_'+str(i)+'.csv' not in os.listdir(checkpoints_dir):
        EXP_NAME = EXP_NAME +'_'+str(i)
        break
print('experiment name is '+ EXP_NAME)


experiment name is FINALLogistic_0


In [6]:
def bert_encode(data):
    encoded =  bert_tokenizer.encode_plus(
            text=data,  # Preprocess sentence
            add_special_tokens=True,        # Add `[CLS]` and `[SEP]`
            return_attention_mask=True,      # Return attention mask
            return_token_type_ids = True,
            truncation=True
            )
    encoded=encoded['input_ids']
    encoded = encoded 
    encoded = encoded + [bert_tokenizer.pad_token_id]*(512-len(encoded))
    return np.asarray(encoded)

In [7]:
train_df = pd.read_csv(os.path.join(dirs_dct['data_dir'],'los_nicu_train_admissions.csv'))
val_df = pd.read_csv(os.path.join(dirs_dct['data_dir'],'los_nicu_val_admissions.csv'))
test_df = pd.read_csv(os.path.join(dirs_dct['data_dir'],'los_nicu_test_admissions.csv'))

train_df['encoded'] = train_df['TEXT'].apply(bert_encode)
val_df['encoded'] = val_df['TEXT'].apply(bert_encode)

test_df['encoded'] = test_df['TEXT'].apply(bert_encode)

train_df['TEXT']= train_df['encoded'].apply(lambda x: bert_tokenizer.decode(x))

val_df['TEXT']= val_df['encoded'].apply(lambda x: bert_tokenizer.decode(x))

test_df['TEXT']= test_df['encoded'].apply(lambda x: bert_tokenizer.decode(x))

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [9]:
def create_bag_of_words(encoded_text):
    bow = [0]*len(bert_tokenizer.vocab)
    for x in encoded_text:
        bow[x]=1
    return bow

In [10]:
len(bert_tokenizer.vocab.values())
# list(bert_tokenizer.vocab.values())[-1]

30522

In [11]:
train_inputs = train_df['encoded'].apply(create_bag_of_words)
val_inputs = val_df['encoded'].apply(create_bag_of_words)
test_inputs = test_df['encoded'].apply(create_bag_of_words)

In [12]:
train_inputs = np.stack(train_inputs.values)
val_inputs = np.stack(val_inputs.values)
test_inputs = np.stack(test_inputs.values)

In [14]:
import torch
from torch.autograd import Variable
import torchvision.transforms as transforms
import torchvision.datasets as dsets
from torch import nn, optim
from torch.nn import functional as F

# Defining neural network structure
class BoWClassifier(nn.Module):  # inheriting from nn.Module!

    def __init__(self, num_labels, vocab_size):
        # needs to be done everytime in the nn.module derived class
        super(BoWClassifier, self).__init__()

        # Define the parameters that are needed for linear model ( Ax + b)
        self.linear = nn.Linear(vocab_size, num_labels)

        # NOTE! The non-linearity log softmax does not have parameters! So we don't need
        # to worry about that here

    def forward(self, bow_vec): # Defines the computation performed at every call.
        # Pass the input through the linear layer,
        # then pass that through log_softmax.

        return F.log_softmax(self.linear(bow_vec), dim=1)

In [15]:
batch_size = 100
n_iters = 3000
epochs = n_iters / (len(train_inputs) / batch_size)
input_dim = len(bert_tokenizer.vocab)
output_dim = 2
lr_rate = 0.001

In [16]:
bow_nn_model = BoWClassifier(output_dim, input_dim)
# bow_nn_model.to(device)

# Loss Function
loss_function = nn.NLLLoss()
# Optimizer initlialization
optimizer = optim.SGD(bow_nn_model.parameters(), lr=0.01)

In [17]:
train_df['label'].dtype

dtype('int64')

In [18]:
train_dataset = list(zip(train_inputs, train_df['label']))
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)

In [19]:
val_dataset = list(zip(val_inputs, val_df['label']))
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=64, shuffle=True)
test_dataset = list(zip(test_inputs, test_df['label']))
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=64, shuffle=True)

In [20]:
def save(model, save_dir, save_prefix, steps):
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)
    save_prefix = os.path.join(save_dir, save_prefix)
    save_path = '{}_steps_{}.pt'.format(save_prefix, steps)
    torch.save(model.state_dict(), save_path)

In [21]:
import time
start_time = time.time()

# Train the model
best_mcc = 0
for epoch in range(100):
    for i, (ex, label) in enumerate(train_loader):
        # Step 1. Remember that PyTorch accumulates gradients.
        # We need to clear them out before each instance
        bow_nn_model.zero_grad()

        # Step 2. Make BOW vector for input features and target label
        bow_vec = ex.float()
        target = label
        
        # Step 3. Run the forward pass.
        probs = bow_nn_model(bow_vec)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss = loss_function(probs, target)
        loss.backward()
        optimizer.step()
    if epoch%1==0:
        # calculate Accuracy
        metrics_dict={}
        for name, data_iter in {'val_':val_loader, 'test_':test_loader}.items():        
            correct = 0
            total = 0
            all_targets = []
            all_preds = []
            all_probs = []
            for i, (ex, label) in enumerate(data_iter):
                bow_vec = ex.float()
                target = label
                probs = bow_nn_model(bow_vec)

                _, predicted = torch.max(probs.data, 1)
                total+= target.size(0)
                all_targets = all_targets + list(target)
                all_preds = all_preds + list(predicted)
                all_probs = all_probs + probs.tolist()
                # for gpu, bring the predicted and labels back to cpu fro python operations to work
                correct+= (predicted == target).sum()
            accuracy = 100 * correct/total
            
            metrics_dict[name+'accuracy'] = accuracy
            metrics_dict[name+'mcc'] = matthews_corrcoef(all_targets, all_preds)
            probs=sigmoid(all_probs)
            all_targets_2d = [[1,0] if x==0 else [0,1] for x in all_targets]
            metrics_dict[name+'accuracy'] =  accuracy_score(all_targets, all_preds)
            # print(all_targets)
            # print(all_preds)
            metrics_dict[name+'auc_avg'] = roc_auc_score(all_targets_2d, sigm)
            metrics_dict[name+'auprc_avg'] = average_precision_score(all_targets_2d, probs)
            metrics_dict[name+'auc_1'] = roc_auc_score(all_targets, probs[:,1])
            metrics_dict[name+'auprc_1'] = average_precision_score(all_targets, probs[:,1])

            class_report = classification_report(all_targets,all_preds,output_dict=True) 
            metrics_dict[name+'precision'] = class_report['1']['precision']
            metrics_dict[name+'recall'] = class_report['1']['recall']
            metrics_dict[name+'f1-score'] = class_report['1']['f1-score']
            metrics_dict[name+'support'] = class_report['1']['support']
            metrics_dict[name+'specificity'] = class_report['0']['recall']
            metrics_dict['epoch'] = epoch
        fn = EXP_NAME+'_metrics.csv'
        dir = os.path.join(checkpoints_dir, 'logistic')
        df_dict = {k:[v] for k,v in metrics_dict.items()}
        if fn not in os.listdir(dir):
            pd.DataFrame(df_dict).to_csv(dir+fn, mode='a',header=True)
        else:
            pd.DataFrame(df_dict).to_csv(dir+fn, mode='a',header=False)
        mcc = metrics_dict['val_mcc']

        print("Iteration: {}. Loss: {}. Accuracy: {}. MCC: {}".format(epoch, loss.item(), accuracy, mcc))
        if mcc>best_mcc:
            best_mcc = mcc
            print('saving', epoch)
            save(bow_nn_model, os.path.join(checkpoints_dir, 'logistic'), 'best', epoch)
            
print("Time taken to train the model: " + str(time.time() - start_time))

Iteration: 0. Loss: 0.4293558895587921. Accuracy: 74.61660766601562. MCC: 0.1579019072146128
saving 0
Iteration: 1. Loss: 0.4160304367542267. Accuracy: 81.54415893554688. MCC: 0.5348744569110886
saving 1
Iteration: 2. Loss: 0.18994273245334625. Accuracy: 82.12586212158203. MCC: 0.5441962458449998
saving 2
Iteration: 3. Loss: 0.35016241669654846. Accuracy: 83.9238510131836. MCC: 0.5723336075727167
saving 3
Iteration: 4. Loss: 0.7666645646095276. Accuracy: 78.90005493164062. MCC: 0.4321800310385275
Iteration: 5. Loss: 0.4497051239013672. Accuracy: 82.39026641845703. MCC: 0.5644724433513655
Iteration: 6. Loss: 0.28951457142829895. Accuracy: 85.56319427490234. MCC: 0.6211892558732252
saving 6
Iteration: 7. Loss: 0.3974981904029846. Accuracy: 85.72183990478516. MCC: 0.611922224214253
Iteration: 8. Loss: 0.0648617148399353. Accuracy: 83.76520538330078. MCC: 0.5880767930077943
Iteration: 9. Loss: 0.13615325093269348. Accuracy: 84.29402160644531. MCC: 0.5854980356375117
Iteration: 10. Loss: 0.