In [55]:
# Data Science
import re
import csv
import json
import itertools
from tqdm import tqdm
import numpy as np
import pandas as pd
import seaborn as sns
from pandasgui import show
import matplotlib.pyplot as plt

# General
import os
import sys
import time
import math
import logging
import time
import random
from datetime import date
import warnings
current_date = date.today()
warnings.filterwarnings("ignore")

# ML
from sklearn.model_selection import train_test_split

# deep learning libraries
import torch
import transformers
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from transformers import AdamW, BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# hyperparameter optimization
import optuna
from optuna.samplers import TPESampler
import optuna.visualization.matplotlib as oviz

In [58]:
# set seeds to make computations deterministic
np.random.seed(1234)
torch.manual_seed(1234)

# Identify Colab GPU or CPU as device
if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


In [3]:
# configure logging options
logging.basicConfig(level = logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

## Preprocessing

In [4]:
always_patterns = pd.read_csv("input_optimized.csv") 
manual_review = pd.read_csv(r"C:\Users\MIND_DS\Dropbox (Partners HealthCare)\NLP\Tanish\APOE-SLAT\Modeling\test_and_validation.csv")
manual_review = manual_review[['patient_id', 'sequence', 'annotator_label']]
always_patterns = always_patterns[['patient_id', 'sequence', 'annotator_label']]
df = pd.concat([manual_review, always_patterns])

In [5]:
df = df.reset_index(drop = True)

In [6]:
tokenizer = BertTokenizer.from_pretrained('emilyalsentzer/Bio_ClinicalBERT', 
                                          do_lower_case = True)

In [7]:
model = BertForSequenceClassification.from_pretrained("emilyalsentzer/Bio_ClinicalBERT", 
                                                      num_labels = 3, 
                                                      output_attentions = False, 
                                                      output_hidden_states = False)

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model

In [8]:
def preprocessing(df):
    input_ids = [] # Tokenize all of the sentences and map the tokens to their word IDs
    lengths = []  # Record the length of each sequence 

    # For every sentence...
    for sen in df.sequence: 
        # `encode` will:
        #   (1) Tokenize the sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        encoded_sent = tokenizer.encode (
                            sen,                     
                            add_special_tokens = True, 
                            pad_to_max_length = True, 
                            max_length = 1024,
                            truncation = True
                       )   

        # Add the encoded sentence to the list.
        input_ids.append(encoded_sent)
        # Record the truncated length.
        lengths.append(len(encoded_sent))

    print('DONE.')
    print('{:,} notes sample.'.format(len(input_ids)))
    
    return input_ids, lengths

In [9]:
always_input_ids, always_lengths = preprocessing(always_patterns)

DONE.
8,050 notes sample.


In [10]:
manual_input_ids, manual_lengths = preprocessing(manual_review)

DONE.
606 notes sample.


In [11]:
always_input_ids = pad_sequences(always_input_ids, maxlen = 1024, dtype="long", 
                          value=0, truncating="post", padding="post")

In [12]:
manual_input_ids = pad_sequences(manual_input_ids, maxlen = 1024, dtype="long", 
                          value=0, truncating="post", padding="post")

In [13]:
def attention_masks(input_ids):
    # Create attention masks
    attention_masks = []
    for sent in input_ids: 
        # Create the attention mask.
        #   - If a token ID is 0, then it's padding, set the mask to 0.
        #   - If a token ID is > 0, then it's a real token, set the mask to 1.
        att_mask = [int(token_id > 0) for token_id in sent]  
        # Store the attention mask for this sentence.
        attention_masks.append(att_mask)
    return attention_masks

In [14]:
always_attention_masks = attention_masks(always_input_ids)
manual_attention_masks = attention_masks(manual_input_ids)

## Train-Validation-Test Split

In [44]:
def split(input_ids, attention_mask):
    def helper(X, y, X_2, y_2):
        # stratiftying on df with sequences that have always pattern matches 
        y_label = y.to_numpy()
        X_train, X_test_valid, y_train, y_test_valid = train_test_split(X, y, random_state = 0, test_size = 0.1, stratify = y_label)

        y_test_valid_label = y_test_valid.to_numpy()
        X_valid, X_test, y_valid, y_test = train_test_split(X_test_valid, y_test_valid, random_state = 0, test_size = 0.25, stratify = y_test_valid_label)

        # stratiftying on df with sequences that don't have always pattern matches
        y_label_2 = y_2.to_numpy()
        X_train_2, X_test_valid_2, y_train_2, y_test_valid_2 = train_test_split(X_2, y_2, random_state = 0, test_size = 0.6, stratify = y_label_2)

        y_test_valid_label_2 = y_test_valid_2.to_numpy()
        X_valid_2, X_test_2, y_valid_2, y_test_2 = train_test_split(X_test_valid_2, y_test_valid_2, random_state = 0, test_size = (0.25/0.6), stratify = y_test_valid_label_2)
        
        
        # combining
        X_train = np.concatenate((X_train, X_train_2), axis = 0)
        y_train = y_train.append(y_train_2)

        X_test = np.concatenate((X_test, X_test_2), axis = 0)
        y_test = y_test.append(y_test_2)

        X_valid = np.concatenate((X_valid, X_valid_2), axis = 0)
        y_valid = y_valid.append(y_valid_2)
        
        return X_train, y_train, X_valid, y_valid, X_test, y_test 
        
    if (input_ids == True):
        # doing split on input_ids
        X = always_input_ids
        y = always_patterns["annotator_label"]
        X_2 = manual_input_ids
        y_2 = manual_review["annotator_label"] 
        
        X_train, y_train, X_valid, y_valid, X_test, y_test = helper(X, y, X_2, y_2)
        return X_train, y_train, X_valid, y_valid, X_test, y_test
    
    elif (attention_mask == True):
        # doing split on attention masks
        X = always_attention_masks
        y = always_patterns["annotator_label"]
        X_2 = manual_attention_masks
        y_2 = manual_review["annotator_label"]
        
        X_train, y_train, X_valid, y_valid, X_test, y_test = helper(X, y, X_2, y_2)
        return X_train, y_train, X_valid, y_valid, X_test, y_test

In [45]:
train_input, train_label, valid_input, valid_label, test_input, test_label = split(True, False)

In [46]:
train_mask, _, valid_mask, _, test_mask, _ = split(False, True)

In [47]:
# Convert all inputs and labels into torch tensors, the required datatype 
train_inputs = torch.tensor(train_input)
validation_inputs = torch.tensor(valid_input)
test_inputs = torch.tensor(test_input)

train_labels = torch.tensor(train_label.to_list())
validation_labels = torch.tensor(valid_label.to_list())
test_labels = torch.tensor(test_label.to_list())

train_masks = torch.tensor(train_mask)
validation_masks = torch.tensor(valid_mask)
test_masks = torch.tensor(test_mask)

In [51]:
# The DataLoader needs to know our batch size for training, so we specify it here.
# For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32.
batch_size = 4

# Create the DataLoader for our training set.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [56]:
# Note: AdamW is a class from the huggingface library (as opposed to pytorch) 
# I believe the 'W' stands for 'Weight Decay fix"
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )
# Number of training epochs (authors recommend between 2 and 4)
epochs = 3

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs
# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [59]:
## Training loop
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

# Helper function for formatting elapsed times.

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))


# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Store the average loss after each epoch so we can plot them.
loss_values = []

# For each epoch...
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_loss = 0

    # Put the model into training mode. Don't be mislead--the call to 
    # `train` just changes the *mode*, it doesn't *perform* the training.
    # `dropout` and `batchnorm` layers behave differently during training
    # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 100 batches.
        if step % 10 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch from our dataloader. 
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using the 
        # `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Always clear any previously calculated gradients before performing a
        # backward pass. PyTorch doesn't do this automatically because 
        # accumulating the gradients is "convenient while training RNNs". 
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()        

        # Perform a forward pass (evaluate the model on this training batch).
        # This will return the loss (rather than the model output) because we
        # have provided the `labels`.
        # The documentation for this `model` function is here: 
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        outputs = model(b_input_ids, 
                    token_type_ids=None, 
                    attention_mask=b_input_mask, 
                    labels=b_labels)
        
        # The call to `model` always returns a tuple, so we need to pull the 
        # loss value out of the tuple.
        loss = outputs[0]

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value 
        # from the tensor.
        total_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)            
    
    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))
        
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Tracking variables 
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        
        # Telling the model not to compute or store gradients, saving memory and
        # speeding up validation
        with torch.no_grad():        

            # Forward pass, calculate logit predictions.
            # This will return the logits rather than the loss because we have
            # not provided labels.
            # token_type_ids is the same as the "segment ids", which 
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            # The documentation for this `model` function is here: 
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask)
        
        # Get the "logits" output by the model. The "logits" are the output
        # values prior to applying an activation function like the softmax.
        logits = outputs[0]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        # Calculate the accuracy for this batch of test sentences.
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        
        # Accumulate the total accuracy.
        eval_accuracy += tmp_eval_accuracy

        # Track the number of batches
        nb_eval_steps += 1

    # Report the final accuracy for this validation run.
    print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")


Training...


RuntimeError: The expanded size of the tensor (1024) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [4, 1024].  Tensor sizes: [1, 512]