In [None]:
# base libraries
import numpy as np
import pandas as pd
import regex as re
import itertools
import sklearn.metrics as sk
from functools import reduce

# deep learning libraries
import torch
import transformers
from sklearn.model_selection import train_test_split
from simpletransformers.classification import ClassificationModel, ClassificationArgs

# hyperparameter optimization
import optuna
from optuna.samplers import TPESampler
import optuna.visualization.matplotlib as oviz

# file system manipulation
import os
import shutil
from pathlib import Path
import pickle
import copy

# logging
import logging
import sys
import time

In [None]:
# storage locations: input data, and the results folder nested inside it
data_dir = Path("Storage/Bert/")
results_dir = data_dir / "Results"

In [None]:
# seed the PyTorch and NumPy random number generators with the same fixed value
torch.manual_seed(42)
np.random.seed(42)

# report whether PyTorch can see a CUDA device
cuda_available = torch.cuda.is_available()
print("Is CUDA available? ", ("No", "Yes")[cuda_available])

In [None]:
# keep the "transformers" logger at WARNING, and configure root logging at INFO
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
logging.basicConfig(level = logging.INFO)

In [None]:
def prepare_data(trial, data_dir = "Storage/Bert"):
    """Load the pre-split train / validation / test sets and unify their column names.

    Parameters
    ----------
    trial
        The trial object forwarded by `objective`; it is not used here and is
        kept only so existing callers keep working.
    data_dir : str or path-like, optional
        Directory containing `train_8_11.csv`, `valid_8_11.csv` and
        `test_8_11.csv`. Defaults to "Storage/Bert", the location that was
        previously hard-coded.

    Returns
    -------
    tuple of pandas.DataFrame
        `(train_data, val_data, test_data)`, each with the columns
        `["PatientID", "text", "labels"]`.

    Raises
    ------
    ValueError
        If one of the files does not contain exactly three columns.
    """
    # NOTE(review): an earlier, commented-out version of this function built the
    # splits inline with stratified train_test_split calls over always_patterns.csv
    # and manual_review.csv. Presumably the *_8_11.csv files are the saved result
    # of that procedure -- confirm before relying on it.
    column_names = ["PatientID", "text", "labels"]
    split_files = ("train_8_11.csv", "valid_8_11.csv", "test_8_11.csv")

    frames = []
    for file_name in split_files:
        csv_path = Path(data_dir, file_name)
        frame = pd.read_csv(csv_path)

        # renaming is positional, so fail with a readable message on an unexpected layout
        if frame.shape[1] != len(column_names):
            raise ValueError(
                "{} has {} columns, expected {} ({})".format(
                    csv_path, frame.shape[1], len(column_names), ", ".join(column_names)
                )
            )

        frame.columns = list(column_names)
        frames.append(frame)

    train_data, val_data, test_data = frames
    return train_data, val_data, test_data

In [None]:
def define_model(trial, trial_dir):
    """Sample one trial's hyperparameters and build the matching classifier.

    Four values are drawn from `trial` (learning rate, Adam epsilon, number of
    training epochs, early-stopping patience), echoed to stdout, and combined
    with the fixed settings below into a `ClassificationArgs`.

    Parameters
    ----------
    trial
        Object providing `suggest_float` / `suggest_int` and recording the draws.
    trial_dir
        Directory handed to the model as `output_dir`.

    Returns
    -------
    ClassificationModel
        A 3-label "bert" model initialised from "emilyalsentzer/Bio_ClinicalBERT";
        CUDA is used when the module-level `cuda_available` flag is true.
    """
    # draw the tunable values; the order of the calls and the parameter names are unchanged
    # NOTE(review): the epsilon is registered under the misspelled name "adam_epilson";
    # it is left untouched because the name is a runtime key recorded by the study.
    lr = trial.suggest_float("learning_rate", 1e-8, 1e-4, log = True)
    eps = trial.suggest_float("adam_epilson", 1e-8, 1e-4, log = True)
    n_epochs = trial.suggest_int("num_train_epochs", 1, 3)
    patience = trial.suggest_int("early_stopping_patience", 1, 3)

    print(f"- Learning Rate: {lr}")
    print(f"- Adam Epsilon: {eps}")
    print(f"- Training Epochs: {n_epochs}")
    print(f"- Early Stopping Patience: {patience}")

    # text / optimiser settings
    text_settings = dict(
        sliding_window = False,
        learning_rate = lr,
        adam_epsilon = eps,
        train_batch_size = 8,
        eval_batch_size = 4,
        num_train_epochs = n_epochs,
        do_lower_case = False,
        max_seq_length = 512,
    )

    # training-loop settings: logging cadence, seed, GPU count, checkpointing
    loop_settings = dict(
        logging_steps = 50,
        manual_seed = 1234,
        n_gpu = 2,
        save_steps = 2000,
        output_dir = trial_dir,
        overwrite_output_dir = True,
    )

    # evaluation during training
    eval_settings = dict(
        evaluate_during_training = True,
        evaluate_during_training_steps = 2000,
        evaluate_during_training_verbose = True,
    )

    # early stopping on the evaluation loss (lower is better)
    stopping_settings = dict(
        use_early_stopping = True,
        early_stopping_delta = 0,
        early_stopping_metric = "eval_loss",
        early_stopping_metric_minimize = True,
        early_stopping_patience = patience,
    )

    model_args = ClassificationArgs(
        **text_settings,
        **loop_settings,
        **eval_settings,
        **stopping_settings,
    )

    # build and hand back the classifier
    return ClassificationModel(
        "bert", "emilyalsentzer/Bio_ClinicalBERT",
        num_labels = 3,
        args = model_args,
        use_cuda = cuda_available,
    )

In [None]:
def objective(trial):
    """Run one trial: train a model, score it on the validation set, return accuracy.

    Everything the trial produces is written under
    `Storage/Bert/Results/trial_<number>/`, which is emptied first if it exists:

    - `data_<number>.pkl`: pickled `[train_data, val_data, test_data]`
    - `training_results_<number>.pkl`: pickled
      `[model, results, model_outputs, wrong_predictions]`
    - `sequence_probabilities<number>.csv`: per-row top probability and predicted class

    Parameters
    ----------
    trial
        Trial object supplied by the study; its `number` names the output
        directory and it is forwarded to `prepare_data` and `define_model`.

    Returns
    -------
    float
        Accuracy on the validation set (the value the study optimises).

    Raises
    ------
    AssertionError
        If the training labels do not contain exactly three distinct values.
    """
    def stamp(when = None):
        # HH:MM:SS prefix for progress messages; defaults to the current local time
        return time.strftime("%H:%M:%S", time.localtime() if when is None else when)

    # log time
    start_time = time.localtime()

    print("\n-------- TRIAL #{} --------".format(trial.number))

    # start every trial from an empty output directory
    trial_dir = "Storage/Bert/Results/trial_{}".format(trial.number)
    if os.path.isdir(trial_dir):
        shutil.rmtree(trial_dir)
        print("\n>>> {}: Removing Directory {}\n".format(stamp(), trial_dir))
    os.makedirs(trial_dir)

    print("\n>>> {}: Preparing Data\n".format(stamp()))
    train_data, val_data, test_data = prepare_data(trial)

    # the model is built with num_labels = 3, so the training set must expose three classes
    # (the previous check compared inside len() and therefore passed for any non-empty set)
    n_labels = len(train_data["labels"].unique())
    assert n_labels == 3, "expected 3 distinct training labels, found {}".format(n_labels)

    # save the datasets used by this trial
    with open(Path(trial_dir, "data_{}.pkl".format(trial.number)), "wb") as f:
        pickle.dump([train_data, val_data, test_data], f)

    print("\n>>> {}: Defining Model\n".format(stamp()))
    model = define_model(trial, trial_dir)

    print("\n>>> {}: Started Training\n".format(stamp()))
    model.train_model(
        train_data,
        eval_df = val_data,
    )

    print("\n>>> {}: Started Evaluation on Validation Set\n".format(stamp()))
    results, model_outputs, wrong_predictions = model.eval_model(val_data)

    # save the trained model together with its evaluation outputs
    with open(Path(trial_dir, "training_results_{}.pkl".format(trial.number)), "wb") as f:
        pickle.dump([model, results, model_outputs, wrong_predictions], f)

    print(">>> {}: Get Sequence Probabilities\n".format(stamp()))
    # assumes model_outputs holds one row of class scores per validation row -- TODO confirm
    val_prob_list, max_prob_list, val_pred_list = _softmax_predictions(model_outputs)

    cw_probs = pd.DataFrame({
        "PatientID": val_data["PatientID"],
        "Prob": max_prob_list,
        "Pred": val_pred_list,
    })
    cw_probs.to_csv(trial_dir + "/sequence_probabilities{}.csv".format(trial.number))

    # compute metrics: weighted one-vs-rest AUC on the probabilities, accuracy on the predictions
    true_labels = val_data["labels"].to_list()
    best_auc = sk.roc_auc_score(true_labels, val_prob_list, multi_class = "ovr", average = "weighted")
    best_acc = sk.accuracy_score(true_labels, cw_probs["Pred"].to_list())

    print(">>> {}: Current AUC: {}\n".format(stamp(), best_auc))
    print(">>> {}: Current ACC: {}\n".format(stamp(), best_acc))
    print(">>> {}: Start Training Time\n".format(stamp(start_time)))
    print(">>> {}: Finish Training Time\n".format(stamp()))

    return best_acc


def _softmax_predictions(model_outputs):
    """Convert rows of raw class scores into probabilities, top probability and predicted class.

    Returns three parallel lists: the softmax probabilities of every row (as
    plain floats), the largest probability of every row, and the index of that
    largest probability (the first one on ties).
    """
    prob_rows, top_probs, pred_classes = [], [], []
    for scores in model_outputs:
        # softmax across the entries of this single row
        probs = torch.softmax(torch.from_numpy(scores), dim = 0).tolist()
        top = max(probs)

        prob_rows.append(probs)
        top_probs.append(top)
        pred_classes.append(probs.index(top))
    return prob_rows, top_probs, pred_classes

In [None]:
# Show Optuna's log messages on stdout. The handler is attached only once, so
# re-running this cell does not print every message several times.
optuna_logger = optuna.logging.get_logger("optuna")
if not any(
    isinstance(handler, logging.StreamHandler) and getattr(handler, "stream", None) is sys.stdout
    for handler in optuna_logger.handlers
):
    optuna_logger.addHandler(logging.StreamHandler(sys.stdout))

# unique identifier of the study
study_name = "slat-study" 

# create the study database, or resume the existing one under the same name
storage_name = "sqlite:///{}.db".format("Storage/Bert/Results/" + study_name)
study = optuna.create_study(direction = "maximize", sampler = TPESampler(seed = 1234, multivariate = True), study_name = study_name, storage = storage_name, load_if_exists = True)
study.optimize(objective, n_trials = 20, gc_after_trial = True)

pruned_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.PRUNED]
complete_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]

print("\nStudy Statistics:")
print("- Finished Trials: ", len(study.trials))
print("- Pruned Trials: ", len(pruned_trials))
print("- Complete Trials: ", len(complete_trials))

print("\nBest Trial:")
if complete_trials:
    best_trial = study.best_trial

    print("- Number: ", best_trial.number)
    print("- Value: ", best_trial.value)
    print("- Hyperparameters: ")

    for key, value in best_trial.params.items():
        print("   - {}: {}".format(key, value))
else:
    # study.best_trial raises when no trial has completed; report that instead
    print("- No completed trials, so there is no best trial to report.")