In [122]:
# Data Science
import re
import csv
import json
import itertools
from tqdm import tqdm
import numpy as np
import pandas as pd
import seaborn as sns
from pandasgui import show
import matplotlib.pyplot as plt

# General
import os
import sys
import time
import math
import logging
import time
import random
from datetime import date
import warnings
current_date = date.today()
warnings.filterwarnings("ignore")

# ML
from sklearn.model_selection import train_test_split

# deep learning libraries
import torch
import transformers
from simpletransformers.classification import ClassificationModel, ClassificationArgs

# hyperparameter optimization
import optuna
from optuna.samplers import TPESampler
import optuna.visualization.matplotlib as oviz

spacy.prefer_gpu()
%load_ext autotime
nlp = spacy.load("en_core_web_sm")
nlp.max_length = 3500000

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 844 ms


In [121]:
# set seeds to make computations deterministic
np.random.seed(1234)
torch.manual_seed(1234)

# check CUDA availability
cuda_available = torch.cuda.is_available()
print("Is CUDA available? ", "Yes" if cuda_available else "No")

Is CUDA available?  No
time: 0 ns


In [123]:
# configure logging options
logging.basicConfig(level = logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

time: 15 ms


## Train-Test-Split

In [133]:
always_patterns = pd.read_csv("input_optimized.csv") 
manual_review = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\test_and_validation.csv")
manual_review = manual_review[['patient_id', 'sequence', 'annotator_label']]
always_patterns = always_patterns[['patient_id', 'sequence', 'annotator_label']]

time: 344 ms


In [134]:
always_patterns

Unnamed: 0,patient_id,sequence,annotator_label
0,Z15564314,s other free text-see phs viewer social histor...,1
1,Z10171706,------- fusion: no sleep disturbance: no socia...,1
2,Z8935348,ain spasm). 30 tablet 0 unknown (outside pharm...,1
3,Z12212893,------- on 112mcg dose. maria will check dose ...,2
4,Z9598376,------- (vibramycin) 100 mg capsule take 1 cap...,1
...,...,...,...
8045,Z6595984,------- gnitive concerns. interim history pati...,2
8046,Z10123442,"------- rm and well perfused, no evidence of e...",0
8047,Z6694483,: 77.7 kg (171 lb 4.8 oz) spo2: 98% 99% 98% cu...,0
8048,Z9195765,mr. donovan lives with his wife with whom he h...,1


time: 15 ms


In [136]:
tokenizer = BertTokenizer.from_pretrained('emilyalsentzer/Bio_ClinicalBERT', 
                                          do_lower_case=True)

time: 1.45 s


In [137]:
model = LongformerForSequenceClassification.from_pretrained(
    'emilyalsentzer/Bio_ClinicalBERT',
    num_labels = 3, # The number of output labels--3 for multi-label classification.
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

NameError: name 'LongformerForSequenceClassification' is not defined

time: 47 ms


In [135]:
def split_data(trial):
    # stratify across sequences with and without always pattern matches and class_label (Y, N, NTR)
    
    # stratifying across sequences with always pattern
    X_train, X_other = train_test_split(always_patterns, random_state = 0,test_size = 0.1, stratify = always_patterns["annotator_label"].to_numpy())

    X_valid, X_test = train_test_split(X_other, random_state = 0, test_size = 0.25, stratify = X_other["annotator_label"].to_numpy())
    
    # stratifying across sequences without always pattern
    X_train_2, X_other_2 = train_test_split(manual_review, random_state = 0,test_size = 0.6, stratify = manual_review["annotator_label"].to_numpy())

    X_valid_2, X_test_2 = train_test_split(X_other_2, random_state = 0, test_size = (0.25/0.6), stratify = X_test_2["annotator_label"].to_numpy())
    
    # combining to get final train, test, validation splits
    X_train = X_train.append(X_train_2)
    X_valid = X_valid.append(X_valid_2)
    X_test = X_test.append(X_test_2)

    return X_train, X_valid, X_test

time: 0 ns


In [37]:
def define_model(trial, trial_dir):
    # set learning rate
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True)

    # define model name
    model_type = "bert"
    model_name = "emilyalsentzer/Bio_ClinicalBERT"
    max_seq_length = 512

    model_args = ClassificationArgs (
        ## NLP ARGUMENTS
        sliding_window = False,
        learning_rate = learning_rate, # default 4e-5
        adam_epsilon = 1e-8, # default 1e-8
        train_batch_size = 8, # default 8
        eval_batch_size = 4, # default 8
        num_train_epochs = 3,  # default 1 (number of epochs model will be trained for)
        do_lower_case = False, # default False
        max_seq_length = max_seq_length, # default 128 (maximum sequence length the model will support)

        ## TRAINING LOOP
        logging_steps = 50, # default 50
        manual_seed = 1234, # default None (necessary for reproducible results)
        n_gpu = 0, # default 1 (number of GPUs to use)
        save_steps = 2000, # default 2000 (save a model checkpoint at every specified number of steps)
        output_dir = trial_dir, 
        overwrite_output_dir = True, # default False (if True, then the trained model will be saved to the ouput_dir and will overwrite existing saved models in the same directory)

        ## EVALUATE DURING TRAINING
        evaluate_during_training = True, # default False
        evaluate_during_training_steps = 2000, # default  2000  
        evaluate_during_training_verbose = True, # default False

        ## EARLY STOPPING
        use_early_stopping = True, # default False
        early_stopping_delta = 0, # default 0 (improvement over best_eval_loss necessary to count as a better checkpoint)
        early_stopping_metric = "auc", # default eval_loss 
        early_stopping_metric_minimize = True, # default True
        early_stopping_patience = 2, # default value 3 (terminate training after these many epochs if there is no improvement in early_stopping_metric then early_stopping_delta)
    )

    # create the classification model
    model = ClassificationModel (
        model_type, model_name,
        args = model_args,
        use_cuda = cuda_available
    )
    
    return model

time: 46 ms


In [None]:
def objective(trial):
    # log time
    start_time = time.localtime()

    # log message
    print("\n-------- TRIAL #{} --------".format(trial.number))

    # create output directory
    trial_dir = "../../BigDataSets/Results/Model/trial_{}".format(trial.number)
    if os.path.isdir(trial_dir):
        shutil.rmtree(trial_dir)
        print("\n>>> {}: Removing Directory {}\n".format(time.strftime("%H:%M:%S", time.localtime()), trial_dir))
    os.mkdir(trial_dir)

    # log message
    print("\n>>> {}: Preparing Data\n".format(time.strftime("%H:%M:%S", time.localtime())))

    train_data, val_data, test_data = prepare_data(trial)
    
    # save test dataset to file
    f = open(Path(trial_dir, "data_{}.pkl".format(trial.number)), "wb")
    pickle.dump([train_data, val_data, test_data], f)
    f.close()
    
    # log message
    print("\n>>> {}: Defining Model\n".format(time.strftime("%H:%M:%S", time.localtime())))

    model = define_model(trial, trial_dir)
    
        # log message
    print("\n>>> {}: Started Training\n".format(time.strftime("%H:%M:%S", time.localtime())))

    # train model
    model.train_model(
        train_data,
        eval_df = val_data,
        auc = sk.roc_auc_score,
        acc = sk.accuracy_score
    )
    
    results, model_outputs, wrong_predictions = model.eval_model(
        val_data,
        auc = sk.roc_auc_score,
        acc = sk.accuracy_score
    )
    
    # save to file
    f = open(Path(trial_dir, "training_results_{}.pkl".format(trial.number)), "wb")
    pickle.dump([model, results, model_outputs, wrong_predictions], f)
    f.close()

## Training Loop

In [114]:
seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

def evaluate(dataloader_val):
    model.eval()
    
    loss_val_total = 0
    predictions, true_vals, probabilities, = [], [], []
    
    for batch in dataloader_val:
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
        probabilities.append(nnf.softmax(logits, dim = 1))
    
    loss_val_avg = loss_val_total / len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis = 0)
    true_vals = np.concatenate(true_vals, axis = 0)
    probabilities = np.concatenate(probabilities, axis = 0)
            
    return loss_val_avg, predictions, true_vals, probabilities

time: 0 ns


In [117]:
for epoch in tqdm(range(1, epochs+1)):
    model.train()
    
    loss_train_total = 0
    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    
    for batch in progress_bar:
        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       

        outputs = model(**inputs)
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
    torch.save(model.state_dict(), f'data_volume/finetuned_BERT_epoch_{epoch}.model')
    
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total / len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    test_loss, predictions, true_vals, probabilities = evaluate(dataloader_test)
    
    acc = metrics.accuracy_score(true_vals, predictions)
    auc = metrics.roc_auc_score(true_vals, probabilities, average='weighted', multi_class='ovr')
    
    tqdm.write(f'Test loss: {test_loss}')
    tqdm.write(f'acc: {acc}')
    tqdm.write(f'auc: {auc}')

  0%|                                                                                            | 0/3 [00:00<?, ?it/s]
Epoch 1:   0%|                                                                                | 0/2768 [00:00<?, ?it/s][A
  0%|                                                                                            | 0/3 [00:00<?, ?it/s][A


RuntimeError: The expanded size of the tensor (1024) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [3, 1024].  Tensor sizes: [1, 512]

time: 313 ms
