In [None]:
# Mount Google Drive inside the Colab VM; the CSV files and checkpoints used
# below are addressed through this mount point.
from google.colab import drive

drive.mount(
    '/content/gdrive',
    force_remount=True,
)

Mounted at /content/gdrive


In [None]:
import pandas as pd
import torch
import time
import datetime
import transformers
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel,AdamW,BertForSequenceClassification
import random
import numpy as np
from transformers import get_linear_schedule_with_warmup
import os

BioBERT is a variant of the BERT model from Korea University and Clova AI. The researchers extended the pre-training corpora of the original BERT with PubMed and PMC. PubMed is a database of biomedical citations and abstracts, whereas PMC is an electronic archive of full-text journal articles. Their contribution was a biomedical language representation model that can handle tasks such as relation extraction and drug discovery, to name a few. With a pre-trained model that covers both general and biomedical domain corpora, developers and practitioners can now capture biomedical terms that would have been incredibly difficult for a general language model to comprehend.

In [None]:
## load pretrained biobert tokenizer
# BioBERT v1.1 is built on the *cased* BERT-base vocabulary, so the text must
# not be lower-cased before WordPiece lookup; lower-casing would map
# capitalised biomedical terms onto different (or unknown) vocabulary entries
# than the ones the checkpoint was pre-trained with.
tokenizer = BertTokenizer.from_pretrained('biobert_v1.1_pubmed', do_lower_case=False)

In [None]:
## load pretrained biobert model
# The encoder weights come from the BioBERT checkpoint; the classification
# head is freshly initialised with one output per label in `output_map` (26).
model = BertForSequenceClassification.from_pretrained(
    "biobert_v1.1_pubmed",
    num_labels=26,
)

Some weights of the model checkpoint at biobert_v1.1_pubmed were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initia

In [None]:
# Tokenisation / batching hyper-parameters.
MAX_LEN = 200            # every abstract is padded/truncated to this many tokens
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 4

# Raw splits; each CSV is read for its 'abstract' and 'categories' columns.
train = pd.read_csv('/content/gdrive/My Drive/lily/train.csv')
test = pd.read_csv('/content/gdrive/My Drive/lily/test.csv')

# Category name -> integer class id (26 distinct ids, 0..25).
output_map = {'Animal Diseases': 0,
 'Bacterial Infections and Mycoses': 3,
 'Cardiovascular Diseases': 1,
 'Chemically-Induced Disorders': 2,
 'Congenital Hereditary and Neonatal Diseases and Abnormalities': 4,
 'Digestive System Diseases': 5,
 'Disorders of Environmental Origin': 22,
 'Endocrine System Diseases': 6,
 'Eye Diseases': 7,
 'Female Urogenital Diseases and Pregnancy Complications': 8,
 'Hemic and Lymphatic Diseases': 9,
 'Immune System Diseases': 10,
 'Male Urogenital Diseases': 11,
 'Musculoskeletal Diseases': 24,
 'Neoplasms': 12,
 'Nervous System Diseases': 13,
 'Nutritional and Metabolic Diseases': 14,
 'Occupational Diseases': 15,
 'Otorhinolaryngologic Diseases': 16,
 'Parasitic Diseases': 17,
 'Pathological Conditions and Signs and Symptoms': 18,
 'Respiratory Tract Diseases': 19,
 'Skin and Connective Tissue Diseases': 25,
 'Stomatognathic Diseases': 20,
 'Virus Diseases': 21,
 'Wounds and Injuries': 23}


def _encode_categories(frame, split_name):
    """Return ``frame['categories']`` translated to integer class ids.

    ``Series.map`` silently yields NaN for any value missing from
    ``output_map`` (which also turns the column into floats), so unmapped
    labels are reported here instead of surfacing later as a confusing
    tensor-conversion error during training.

    Raises:
        ValueError: if the split contains a missing or unknown category.
    """
    encoded = frame['categories'].map(output_map)
    unmapped = frame.loc[encoded.isna(), 'categories'].unique()
    if len(unmapped) > 0:
        raise ValueError(
            "{} split has categories missing from output_map: {}".format(
                split_name, list(unmapped)))
    return encoded.astype('int64')


train['categories'] = _encode_categories(train, 'train')
test['categories'] = _encode_categories(test, 'test')

**Preparing the Dataset and Dataloader**



We will start by defining a few key variables that will be used later during the training/fine-tuning stage, followed by the creation of the custom dataset class (`BioBertDataModel`), which defines how the text is pre-processed before it is sent to the neural network. We will also define the DataLoaders that feed the data in batches to the neural network for training and evaluation. Dataset and DataLoader are constructs of the PyTorch library for defining and controlling the data pre-processing and its passage to the neural network.

**Dataloader**


* A DataLoader is used to create the training and validation loaders that feed data to the neural network in a defined manner. This is needed because the whole dataset cannot be loaded into memory at once, so the amount of data loaded into memory and then passed to the neural network needs to be controlled.
* This control is achieved using the parameters such as batch_size and max_len.
* Training and Validation dataloaders are used in the training and validation part of the flow respectively

In [None]:
class BioBertDataModel(Dataset):
    """Map-style dataset that tokenizes abstracts on the fly.

    Rows are taken from the module-level ``train`` / ``test`` DataFrames,
    which must expose an ``'abstract'`` (text) column and a ``'categories'``
    (integer class id) column.

    Args:
        type: ``"train"`` selects the ``train`` frame; any other value selects
            ``test``. (The name shadows the builtin but is kept so existing
            positional/keyword callers keep working.)
        tokenizer: object exposing ``encode_plus`` that returns a mapping with
            ``'input_ids'`` and ``'attention_mask'``.
        max_len: length every encoded sequence is padded/truncated to.
    """

    def __init__(self, type, tokenizer, max_len):
        # Zero-argument super(): the previous explicit form named a class
        # (`BertDataModel`) that does not exist and raised NameError.
        super().__init__()
        self.tokenizer = tokenizer
        self.max_len = max_len
        frame = train if type == "train" else test
        self.len = len(frame)
        self.abstract = list(frame['abstract'])
        self.category = list(frame['categories'])

    def __getitem__(self, index):
        """Return ``{'ids', 'mask', 'targets'}`` long tensors for one row."""
        encoded = self.tokenizer.encode_plus(
            self.abstract[index],
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # `padding='max_length'` replaces the deprecated
            # `pad_to_max_length=True` with the same fixed-length result.
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )

        return {
            'ids': torch.tensor(encoded['input_ids'], dtype=torch.long),
            'mask': torch.tensor(encoded['attention_mask'], dtype=torch.long),
            'targets': torch.tensor(self.category[index], dtype=torch.long)
        }

    def __len__(self):
        """Number of rows in the selected split."""
        return self.len

In [None]:

# Wrap both splits in tokenizing datasets.
training_set = BioBertDataModel('train', tokenizer, MAX_LEN)
testing_set = BioBertDataModel('test', tokenizer, MAX_LEN)


# Training batches are reshuffled every epoch.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# Evaluation does not benefit from shuffling; a fixed order keeps the
# per-batch validation metrics reproducible and independent of RNG state.
test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [None]:
# Seed every RNG the run touches so shuffling and initialisation repeat.
seed_val = 42
for seed_fn in (random.seed,
                np.random.seed,
                torch.manual_seed,
                torch.cuda.manual_seed_all):
    seed_fn(seed_val)

# AdamW over all model parameters.
#   lr  : args.learning_rate - default is 5e-5, our notebook had 2e-5
#   eps : args.adam_epsilon  - default is 1e-8
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# One scheduler step is taken per training batch, so the schedule spans
# (batches per epoch) * (number of epochs) steps.
epochs = 4
total_steps = len(training_loader) * epochs

# Linear decay of the learning rate with no warm-up
# (num_warmup_steps = 0 is the default value in run_glue.py).
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

Below is our training loop. There's a lot going on, but fundamentally for each pass in our loop we have a training phase and a validation phase.


**Training:**
- Unpack our data inputs and labels
- Load data onto the GPU for acceleration
- Clear out the gradients calculated in the previous pass. 
    - In pytorch the gradients accumulate by default (useful for things like RNNs) unless you explicitly clear them out.
- Forward pass (feed input data through the network)
- Backward pass (backpropagation)
- Tell the network to update parameters with optimizer.step()
- Track variables for monitoring progress

**Evaluation:**
- Unpack our data inputs and labels
- Load data onto the GPU for acceleration
- Forward pass (feed input data through the network)
- Compute loss on our validation data and track variables for monitoring progress

Pytorch hides all of the detailed calculations from us, but we've commented the code to point out which of the above steps are happening on each line. 



In [None]:
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose arg-max class equals its label.

    `preds` holds one row of class scores per example; `labels` is an array
    of class ids that is flattened before the comparison.
    """
    predicted = np.argmax(preds, axis=1).reshape(-1)
    expected = labels.reshape(-1)
    hits = (predicted == expected).sum()
    return hits / len(expected)

def format_time(elapsed):
    """Render a duration given in seconds as an ``h:mm:ss`` string.

    The value is rounded to the nearest whole second first; the text is
    whatever ``str(datetime.timedelta)`` produces for that many seconds.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
### store model weights in output_dir
# NOTE: despite its name, `output_dir` is the full path of the checkpoint
# *file*: the training loop passes it directly to `torch.save` for the model
# state dict. The optimizer and scheduler states are written next to it, in
# the same `biobert` folder.
output_dir = '/content/gdrive/My Drive/lily/biobert/state_dict_model.pt'

In [None]:
# Fine-tune for `epochs` passes over the training data, validating and
# checkpointing after every pass.

# Use the GPU when one is available. `Module.to` moves the parameters in place,
# so the optimizer created earlier keeps pointing at the same parameter objects.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# All three checkpoint files live in the folder that contains `output_dir`
# (which is itself a file path); make sure it exists before the first save.
checkpoint_dir = os.path.dirname(output_dir)
os.makedirs(checkpoint_dir, exist_ok=True)


def _run_batch(batch):
    """Move one batch to `device`, run the model, return (loss, logits, targets)."""
    ids = batch['ids'].to(device, dtype=torch.long)
    mask = batch['mask'].to(device, dtype=torch.long)
    targets = batch['targets'].to(device, dtype=torch.long)
    # Because labels are supplied, the model returns the loss followed by the
    # logits (the outputs prior to activation). See
    # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
    outputs = model(ids,
                    token_type_ids=None,
                    attention_mask=mask,
                    labels=targets)
    # Positional indexing works both for the plain tuple returned by older
    # transformers releases and for the ModelOutput returned by newer ones
    # (tuple-unpacking a ModelOutput would yield its keys, not its tensors).
    return outputs[0], outputs[1], targets


total_t0 = time.time()
training_stats = []
# The linear scheduler was sized for exactly `epochs` passes; iterating any
# longer would train with a learning rate of zero.
for epoch_i in range(epochs):
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    t0 = time.time()
    total_train_loss = 0
    # Put the model into training mode.
    model.train()
    # For each batch of training data...
    for data in training_loader:
        # Always clear any previously calculated gradients before performing a
        # backward pass; PyTorch accumulates gradients by default.
        model.zero_grad()
        # Forward pass (evaluate the model on this training batch).
        loss, _, _ = _run_batch(data)
        # `loss` is a tensor holding a single value; `.item()` extracts the
        # Python number so the epoch average can be computed afterwards.
        total_train_loss += loss.item()
        # Backward pass to calculate the gradients.
        loss.backward()
        # Clip the norm of the gradients to 1.0 to help prevent the
        # "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # Exactly one parameter update per batch, followed by one
        # learning-rate update.
        optimizer.step()
        scheduler.step()
    avg_train_loss = total_train_loss / len(training_loader)
    training_time = format_time(time.time() - t0)
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))
    print("")
    print("Running Validation...")

    t0 = time.time()
    model.eval()
    total_eval_accuracy = 0
    total_eval_examples = 0
    total_eval_loss = 0

    for data in testing_loader:
        # No compute graph is needed for evaluation (it is only used for
        # backprop), so skip building it.
        with torch.no_grad():
            loss, logits, targets = _run_batch(data)
        total_eval_loss += loss.item()
        logits = logits.detach().cpu().numpy()
        label_ids = targets.to('cpu').numpy()

        # Weight each batch's accuracy by its size so that a smaller final
        # batch does not count as much as a full one.
        batch_examples = len(label_ids)
        total_eval_accuracy += flat_accuracy(logits, label_ids) * batch_examples
        total_eval_examples += batch_examples

    avg_val_accuracy = total_eval_accuracy / total_eval_examples
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(testing_loader)

    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )
    # Save model, optimizer and scheduler state after each epoch (each save
    # overwrites the previous epoch's files).
    torch.save(model.state_dict(), output_dir)
    torch.save(optimizer.state_dict(), os.path.join(checkpoint_dir, 'optimizer.pt'))
    torch.save(scheduler.state_dict(), os.path.join(checkpoint_dir, 'scheduler.pt'))
print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...





  Average training loss: 0.93
  Training epcoh took: 0:38:42

Running Validation...
  Accuracy: 0.82
  Validation Loss: 0.81
  Validation took: 0:03:04





Training...

  Average training loss: 0.63
  Training epcoh took: 0:39:04

Running Validation...
  Accuracy: 0.83
  Validation Loss: 0.83
  Validation took: 0:03:09

Training...

  Average training loss: 0.43
  Training epcoh took: 0:38:57

Running Validation...
  Accuracy: 0.83
  Validation Loss: 0.95
  Validation took: 0:03:04

Training...

  Average training loss: 0.28
  Training epcoh took: 0:38:30

Running Validation...
  Accuracy: 0.83
  Validation Loss: 1.01
  Validation took: 0:03:04

Training...

  Average training loss: 0.21
  Training epcoh took: 0:38:28

Running Validation...
  Accuracy: 0.83
  Validation Loss: 1.01
  Validation took: 0:03:04

Training...

  Average training loss: 0.21
  Training epcoh took: 0:38:26

Running Validation...
  Accuracy: 0.83
  Validation Loss: 1.01
  Validation took: 0:03:04

Training...

  Average training loss: 0.21
  Training epcoh took: 0:38:16

Running Validation...
  Accuracy: 0.83
  Validation Loss: 1.00
  Validation took: 0:03:04

Tra