In [24]:
# Data Science
import re
import csv
import json
import itertools
from tqdm import tqdm
import numpy as np
import pandas as pd
import seaborn as sns
from pandasgui import show
import matplotlib.pyplot as plt

# General
import os
import sys
import time
import math
import logging
import time
import random
from datetime import date
import warnings
current_date = date.today()
warnings.filterwarnings("ignore")

# ML
from sklearn.model_selection import train_test_split

# deep learning libraries
import torch
import transformers
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from transformers import BertTokenizer, BertForSequenceClassification
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# hyperparameter optimization
import optuna
from optuna.samplers import TPESampler
import optuna.visualization.matplotlib as oviz

In [2]:
# set seeds to make computations deterministic
np.random.seed(1234)
torch.manual_seed(1234)

# check CUDA availability
cuda_available = torch.cuda.is_available()
print("Is CUDA available? ", "Yes" if cuda_available else "No")

Is CUDA available?  No


In [3]:
# configure logging options
logging.basicConfig(level = logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

## Preprocessing

In [4]:
always_patterns = pd.read_csv("input_optimized.csv") 
manual_review = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\test_and_validation.csv")
manual_review = manual_review[['patient_id', 'sequence', 'annotator_label']]
always_patterns = always_patterns[['patient_id', 'sequence', 'annotator_label']]
df = pd.concat([manual_review, always_patterns])

In [5]:
df = df.reset_index(drop = True)

In [6]:
tokenizer = BertTokenizer.from_pretrained('emilyalsentzer/Bio_ClinicalBERT', 
                                          do_lower_case = True)

In [7]:
model = BertForSequenceClassification.from_pretrained("emilyalsentzer/Bio_ClinicalBERT", 
                                                      num_labels = 3, 
                                                      output_attentions = False, 
                                                      output_hidden_states = False)

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model

In [8]:
def preprocessing(df):
    input_ids = [] # Tokenize all of the sentences and map the tokens to thier word IDs.
    lengths = []  # Record the length of each sequence (after truncating to 512).

    # For every sentence...
    for sen in df.sequence: 
        # `encode` will:
        #   (1) Tokenize the sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        encoded_sent = tokenizer.encode (
                            sen,                     
                            add_special_tokens = True, 
                            pad_to_max_length = True, 
                            max_length = 1024,
                            truncation = True
                       )   

        # Add the encoded sentence to the list.
        input_ids.append(encoded_sent)
        # Record the truncated length.
        lengths.append(len(encoded_sent))

    print('DONE.')
    print('{:,} notes sample.'.format(len(input_ids)))
    
    return input_ids, lengths

In [9]:
always_input_ids, always_lengths = preprocessing(always_patterns)

DONE.
8,050 notes sample.


In [10]:
manual_input_ids, manual_lengths = preprocessing(manual_review)

DONE.
606 notes sample.


In [11]:
always_input_ids = pad_sequences(always_input_ids, maxlen = 1024, dtype="long", 
                          value=0, truncating="post", padding="post")

In [12]:
manual_input_ids = pad_sequences(manual_input_ids, maxlen = 1024, dtype="long", 
                          value=0, truncating="post", padding="post")

In [13]:
def attention_masks(input_ids):
    # Create attention masks
    attention_masks = []
    for sent in input_ids: 
        # Create the attention mask.
        #   - If a token ID is 0, then it's padding, set the mask to 0.
        #   - If a token ID is > 0, then it's a real token, set the mask to 1.
        att_mask = [int(token_id > 0) for token_id in sent]  
        # Store the attention mask for this sentence.
        attention_masks.append(att_mask)
    return attention_masks

In [14]:
always_attention_masks = attention_masks(always_input_ids)
manual_attention_masks = attention_masks(manual_input_ids)

## Train-Validation-Test Split

In [36]:
def split(input_ids, attention_mask):
    def helper(X, y, X_2, y_2):
        # stratiftying on df with sequences that have always pattern matches 
        y_label = y.to_numpy()
        X_train, X_test_valid, y_train, y_test_valid = train_test_split(X, y, random_state = 0, test_size = 0.1, stratify = y_label)

        y_test_valid_label = y_test_valid.to_numpy()
        X_valid, X_test, y_valid, y_test = train_test_split(X_test_valid, y_test_valid, random_state = 0, test_size = 0.25, stratify = y_test_valid_label)

        # stratiftying on df with sequences that don't have always pattern matches
        y_label_2 = y_2.to_numpy()
        X_train_2, X_test_valid_2, y_train_2, y_test_valid_2 = train_test_split(X_2, y_2, random_state = 0, test_size = 0.6, stratify = y_label_2)

        y_test_valid_label_2 = y_test_valid_2.to_numpy()
        X_valid_2, X_test_2, y_valid_2, y_test_2 = train_test_split(X_test_valid_2, y_test_valid_2, random_state = 0, test_size = (0.25/0.6), stratify = y_test_valid_label_2)
        
        # combining
        X_train = np.concatenate((X_train, X_train_2), axis = 0)
        y_train.append(y_train_2)
        prin
        X_test = np.concatenate((X_test, X_test_2), axis = 0)
        y_test.append(y_test_2)

        X_valid = np.concatenate((X_valid, X_valid_2), axis = 0)
        y_valid.append(y_valid_2)
        
        return X_train, y_train, X_valid, y_valid, X_test, y_test 
        
    if (input_ids == True):
        # doing split on input_ids
        X = always_input_ids
        y = always_patterns["annotator_label"]
        X_2 = manual_input_ids
        y_2 = manual_review["annotator_label"] 
        
        X_train, y_train, X_valid, y_valid, X_test, y_test = helper(X, y, X_2, y_2)
        return X_train, y_train, X_valid, y_valid, X_test, y_test
    
    elif (attention_mask == True):
        # doing split on attention masks
        X = always_attention_masks
        y = always_patterns["annotator_label"]
        X_2 = manual_attention_masks
        y_2 = manual_review["annotator_label"]
        
        X_train, y_train, X_valid, y_valid, X_test, y_test = helper(X, y, X_2, y_2)
        return X_train, y_train, X_valid, y_valid, X_test, y_test

In [37]:
train_input, train_label, valid_input, valid_label, test_input, test_label = split(True, False)

In [31]:
train_mask, _, valid_mask, _, test_mask, _ = split(False, True)

In [32]:
# Convert all inputs and labels into torch tensors, the required datatype 
train_inputs = torch.tensor(train_input)
validation_inputs = torch.tensor(valid_input)
test_inputs = torch.tensor(test_input)

train_labels = torch.tensor(train_label.to_list())
validation_labels = torch.tensor(valid_label.to_list())
test_labels = torch.tensor(test_label.to_list())

train_masks = torch.tensor(train_mask)
validation_masks = torch.tensor(valid_mask)
test_masks = torch.tensor(test_mask)

In [27]:
train_inputs.shape

torch.Size([7487, 1024])

In [33]:
train_masks.shape

torch.Size([7487, 1024])

In [38]:
train_labels.shape

torch.Size([7245])

In [34]:
# The DataLoader needs to know our batch size for training, so we specify it here.
# For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32.
batch_size = 4

# Create the DataLoader for our training set.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

AssertionError: Size mismatch between tensors

In [None]:
print('{:,} negative (no cognitive impairment)'.format(len(df[df["annotator_label"] == 0])))
print('{:,} neither (not relevant to cognitive impairment)'.format(len(df[df["annotator_label"] == 1])))
print('{:,} positive (cognitive impairment)'.format(len(df[df["annotator_label"] == 2])))

print('Min length:    {:,} tokens'.format(min(lengths)))
print('Max length:    {:,} tokens'.format(max(lengths)))
print('Median length: {:,} tokens'.format(np.median(lengths)))



In [None]:
def split_data(trial):
    # stratify across sequences with and without always pattern matches and class_label (Y, N, NTR)
    
    # stratifying across sequences with always pattern
    X_train, X_other = train_test_split(always_patterns, random_state = 0,test_size = 0.1, stratify = always_patterns["annotator_label"].to_numpy())

    X_valid, X_test = train_test_split(X_other, random_state = 0, test_size = 0.25, stratify = X_other["annotator_label"].to_numpy())
    
    # stratifying across sequences without always pattern
    X_train_2, X_other_2 = train_test_split(manual_review, random_state = 0,test_size = 0.6, stratify = manual_review["annotator_label"].to_numpy())

    X_valid_2, X_test_2 = train_test_split(X_other_2, random_state = 0, test_size = (0.25/0.6), stratify = X_test_2["annotator_label"].to_numpy())
    
    # combining to get final train, test, validation splits
    X_train = X_train.append(X_train_2)
    X_valid = X_valid.append(X_valid_2)
    X_test = X_test.append(X_test_2)

    return X_train, X_valid, X_test

In [None]:
def define_model(trial, trial_dir):
    # set learning rate
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True)

    # define model name
    model_type = "bert"
    model_name = "emilyalsentzer/Bio_ClinicalBERT"
    max_seq_length = 512

    model_args = ClassificationArgs (
        ## NLP ARGUMENTS
        sliding_window = False,
        learning_rate = learning_rate, # default 4e-5
        adam_epsilon = 1e-8, # default 1e-8
        train_batch_size = 8, # default 8
        eval_batch_size = 4, # default 8
        num_train_epochs = 3,  # default 1 (number of epochs model will be trained for)
        do_lower_case = False, # default False
        max_seq_length = max_seq_length, # default 128 (maximum sequence length the model will support)

        ## TRAINING LOOP
        logging_steps = 50, # default 50
        manual_seed = 1234, # default None (necessary for reproducible results)
        n_gpu = 0, # default 1 (number of GPUs to use)
        save_steps = 2000, # default 2000 (save a model checkpoint at every specified number of steps)
        output_dir = trial_dir, 
        overwrite_output_dir = True, # default False (if True, then the trained model will be saved to the ouput_dir and will overwrite existing saved models in the same directory)

        ## EVALUATE DURING TRAINING
        evaluate_during_training = True, # default False
        evaluate_during_training_steps = 2000, # default  2000  
        evaluate_during_training_verbose = True, # default False

        ## EARLY STOPPING
        use_early_stopping = True, # default False
        early_stopping_delta = 0, # default 0 (improvement over best_eval_loss necessary to count as a better checkpoint)
        early_stopping_metric = "auc", # default eval_loss 
        early_stopping_metric_minimize = True, # default True
        early_stopping_patience = 2, # default value 3 (terminate training after these many epochs if there is no improvement in early_stopping_metric then early_stopping_delta)
    )

    # create the classification model
    model = ClassificationModel (
        model_type, model_name,
        args = model_args,
        use_cuda = cuda_available
    )
    
    return model

In [None]:
X_2 = manual_review["sequence"]
y_2 = manual_review["annotator_label"]

y_label_2 = y_2.to_numpy()
X_train_2, X_test_valid_2, y_train_2, y_test_valid_2 = train_test_split(X_2,y_2,random_state=0,test_size=0.6, stratify=y_label_2)

y_test_valid_label_2 = y_test_valid_2.to_numpy()
X_valid_2, X_test_2, y_valid_2, y_test_2 = train_test_split(X_test_valid_2, y_test_valid_2, random_state=0, test_size=(0.25/0.6), stratify=y_test_valid_label_2)

In [None]:
label_dict