In [None]:
# Mount Google Drive under /content/gdrive; the CSV files and checkpoints used by the
# cells below are read from and written to paths beneath this mount point.
from google.colab import drive
drive.mount('/content/gdrive',force_remount=True)

Mounted at /content/gdrive


Importing Python Libraries and preparing the environment
At this step we will be importing the libraries and modules needed to run our script. Libraries are:

* Pandas
* Pytorch
* Pytorch Utils for Dataset and Dataloader
* Transformers
* DistilBERT Model and Tokenizer


In [None]:
!pip install transformers



In [None]:
# Importing the libraries needed
import pandas as pd
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertModel, DistilBertTokenizer,AdamW
import os

In [None]:
from torch import cuda

# Pick the device string used by every later `.to(device)` call:
# the GPU when CUDA reports one is available, the CPU otherwise.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Tokenisation / batching settings shared by the cells below.
MAX_LEN = 512  # passed to the tokenizer as max_length by the dataset class
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 8
# NOTE(review): EPOCHS and LEARNING_RATE are not referenced by any cell shown in this
# notebook; the optimizer cell sets its own learning rate and epoch count.
EPOCHS = 1
LEARNING_RATE = 1e-05
# The tokenizer vocabulary must match the encoder checkpoint loaded by the model class
# ("distilbert-base-uncased"); the cased vocabulary produces token ids that index the
# wrong rows of the uncased embedding table.
# NOTE(review): weights fine-tuned with the previous cased tokenizer should be retrained.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




In [None]:
# Load the train and test splits from the mounted Drive folder.
# The cells below read the 'abstract' (text) and 'categories' (label) columns.
train = pd.read_csv('/content/gdrive/My Drive/lily/train.csv')
test = pd.read_csv('/content/gdrive/My Drive/lily/test.csv')

In [None]:
# Preview ten randomly chosen rows of the training frame.
train.sample(10)

Unnamed: 0,abstract,categories
12239,Dural arteriovenous fistula (DAVF) of the ante...,4
16039,Peptide hormone somatostatin and its receptors...,5
38924,Orthoreoviruses have been associated with dise...,21
32296,Periostin is a 90-kDa member of the fasciclin-...,16
31628,Vitamin D deficiency has been implicated in th...,14
40857,Free radicals produced during cancer radiother...,23
6881,Given that prolonged exposure to extreme clima...,1
16464,Wireless capsule endoscopy (WCE) can directly ...,5
36650,Cell-based therapies to augment endothelial ce...,18
37864,Evaluation of pediatric obstructive sleep apne...,19


In [None]:
# List the distinct values of the label column, in order of first appearance.
train['categories'].unique()

array(['Animal Diseases', 'Cardiovascular Diseases',
       'Chemically-Induced Disorders', 'Bacterial Infections and Mycoses',
       'Congenital Hereditary and Neonatal Diseases and Abnormalities',
       'Digestive System Diseases', 'Endocrine System Diseases',
       'Eye Diseases',
       'Female Urogenital Diseases and Pregnancy Complications',
       'Hemic and Lymphatic Diseases', 'Immune System Diseases',
       'Male Urogenital Diseases', 'Neoplasms', 'Nervous System Diseases',
       'Nutritional and Metabolic Diseases', 'Occupational Diseases',
       'Otorhinolaryngologic Diseases', 'Parasitic Diseases',
       'Pathological Conditions and Signs and Symptoms',
       'Respiratory Tract Diseases', 'Stomatognathic Diseases',
       'Virus Diseases', 'Disorders of Environmental Origin',
       'Wounds and Injuries', 'Musculoskeletal Diseases',
       'Skin and Connective Tissue Diseases'], dtype=object)

In [None]:
# Derive a label -> integer id mapping from the order in which labels first appear.
# NOTE: the next cell reassigns `output_map` to a fixed literal, so this derived
# mapping is superseded; it also only makes sense while the column still holds the
# original string labels (i.e. before the encoding cell has been run).
output_map = {cat:i for i,cat in enumerate(train['categories'].unique())}

In [None]:
# Fixed category-name -> class-id table (26 classes), listed in class-id order so the
# id of each row is easy to verify by eye.
output_map = {
    'Animal Diseases': 0,
    'Cardiovascular Diseases': 1,
    'Chemically-Induced Disorders': 2,
    'Bacterial Infections and Mycoses': 3,
    'Congenital Hereditary and Neonatal Diseases and Abnormalities': 4,
    'Digestive System Diseases': 5,
    'Endocrine System Diseases': 6,
    'Eye Diseases': 7,
    'Female Urogenital Diseases and Pregnancy Complications': 8,
    'Hemic and Lymphatic Diseases': 9,
    'Immune System Diseases': 10,
    'Male Urogenital Diseases': 11,
    'Neoplasms': 12,
    'Nervous System Diseases': 13,
    'Nutritional and Metabolic Diseases': 14,
    'Occupational Diseases': 15,
    'Otorhinolaryngologic Diseases': 16,
    'Parasitic Diseases': 17,
    'Pathological Conditions and Signs and Symptoms': 18,
    'Respiratory Tract Diseases': 19,
    'Stomatognathic Diseases': 20,
    'Virus Diseases': 21,
    'Disorders of Environmental Origin': 22,
    'Wounds and Injuries': 23,
    'Musculoskeletal Diseases': 24,
    'Skin and Connective Tissue Diseases': 25,
}

In [None]:
def _encode_categories(frame, mapping):
    """Return ``frame['categories']`` converted to integer class ids.

    Args:
        frame: DataFrame with a 'categories' column.
        mapping: dict from category name to integer class id.

    Returns:
        An int64 Series of class ids. If the column is already of an integer
        dtype it is returned unchanged, so re-running the cell is harmless
        (mapping integer ids through ``mapping`` again would yield only NaN).

    Raises:
        ValueError: if any value in the column has no entry in ``mapping``.
    """
    labels = frame['categories']
    if pd.api.types.is_integer_dtype(labels):
        return labels
    encoded = labels.map(mapping)
    unmapped = encoded.isna()
    if unmapped.any():
        # Fail here with the offending values instead of letting NaN labels reach
        # the tensor conversion in the dataset class.
        unknown = sorted(set(labels[unmapped].astype(str)))
        raise ValueError(f"categories missing from the label mapping: {unknown}")
    return encoded.astype('int64')


train['categories'] = _encode_categories(train, output_map)
test['categories'] = _encode_categories(test, output_map)

**Preparing the Dataset and Dataloader**



We will start by defining a few key variables that will be used later during the training/fine-tuning stage, followed by the creation of the `DistillationBertTrain` dataset class, which defines how the text is pre-processed before it is sent to the neural network. We will also define the Dataloader that feeds the data to the neural network in batches for training and processing. Dataset and Dataloader are constructs of the PyTorch library for defining and controlling data pre-processing and its passage to the neural network.

**Dataloader**


* Dataloader is used to create the training and validation dataloaders that load data into the neural network in a defined manner. This is needed because all the data from the dataset cannot be loaded into memory at once, hence the amount of data loaded into memory and then passed to the neural network needs to be controlled.
* This control is achieved using the parameters such as batch_size and max_len.
* Training and Validation dataloaders are used in the training and validation part of the flow respectively

In [None]:
class DistillationBertTrain(Dataset):
    """Torch ``Dataset`` that yields tokenised abstracts with their class ids.

    The rows come from the module-level ``train`` frame when ``type == "train"``
    and from the module-level ``test`` frame for any other value.

    NOTE: ``train`` / ``test`` are looked up as globals when an instance is
    constructed. A later cell defines a function that is also named ``train``,
    so instances must be created before that cell runs.
    """

    def __init__(self,type,tokenizer, max_len):
        """Store the tokenizer settings and copy the selected split into lists.

        Args:
            type: "train" selects the training frame; anything else selects the
                test frame.
            tokenizer: object exposing ``encode_plus`` (used in ``__getitem__``).
            max_len: value passed to the tokenizer as ``max_length``.
        """
        self.tokenizer = tokenizer
        self.max_len = max_len
        frame = train if type == "train" else test
        self.len = len(frame)
        self.abstract = list(frame['abstract'])
        self.category = list(frame['categories'])

    def __getitem__(self, index):
        """Tokenise the abstract at ``index`` and return it with its label.

        Returns:
            dict with long tensors under 'ids' (token ids), 'mask' (attention
            mask) and 'targets' (class id).
        """
        text = self.abstract[index]
        label = self.category[index]
        encoded = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
            padding='max_length',
            # token_type_ids are not requested: only input_ids and attention_mask
            # are read from the result below.
            truncation=True
        )

        return {
            'ids': torch.tensor(encoded['input_ids'], dtype=torch.long),
            'mask': torch.tensor(encoded['attention_mask'], dtype=torch.long),
            'targets': torch.tensor(label, dtype=torch.long)
        }

    def __len__(self):
        """Number of rows in the selected split."""
        return self.len

In [None]:
# Build one dataset per split; both share the tokenizer and the maximum length.
training_set = DistillationBertTrain(type='train', tokenizer=tokenizer, max_len=MAX_LEN)
testing_set = DistillationBertTrain(type='test', tokenizer=tokenizer, max_len=MAX_LEN)

In [None]:

# Training batches are reshuffled every epoch.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# Evaluation batches keep the row order of the test frame so that per-row outputs
# can be aligned with it; shuffling is not needed when no weights are updated.
test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

**Creating the Neural Network for Fine Tuning**


**Neural Network**
* We will be creating a neural network with the DistillBERTClass.
* This network will have the DistilBERT Language model followed by a dropout and finally a Linear layer to obtain the final outputs.
* The data will be fed to the DistilBERT Language model as defined in the dataset.
* The final layer's outputs are what will be compared to the category label to determine the accuracy of the model's predictions.
* We will initiate an instance of the network called model. This instance will be used for training and then to save the final trained model for future inference.

**Loss Function and Optimizer**


* The Loss Function and Optimizer are defined in the cells that follow.
* The Loss Function is used to calculate the difference between the output produced by the model and the actual output.
* Optimizer is used to update the weights of the neural network to improve its performance.

In [None]:
class DistillBERTClass(torch.nn.Module):
    """DistilBERT encoder followed by a small classification head.

    The head is Linear -> ReLU -> Dropout -> Linear and is applied to the
    encoder output at sequence position 0.

    Args:
        num_classes: size of the output layer (default 26).
        dropout: dropout probability used in the head (default 0.3).
        pretrained_name: checkpoint name passed to
            ``DistilBertModel.from_pretrained`` (default
            "distilbert-base-uncased").

    The sub-module attribute names (``l1``, ``pre_classifier``, ``dropout``,
    ``classifier``) determine the state-dict keys and must stay as they are for
    previously saved weights to load.
    """

    def __init__(self, num_classes=26, dropout=0.3, pretrained_name="distilbert-base-uncased"):
        super().__init__()
        # Width of the head's input; assumed to equal the encoder's hidden size
        # for the default checkpoint -- TODO confirm when changing pretrained_name.
        hidden_size = 768
        self.l1 = DistilBertModel.from_pretrained(pretrained_name)
        self.pre_classifier = torch.nn.Linear(hidden_size, hidden_size)
        self.dropout = torch.nn.Dropout(dropout)
        self.classifier = torch.nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the un-normalised class scores for a batch.

        Args:
            input_ids: token-id tensor forwarded to the encoder.
            attention_mask: mask tensor forwarded to the encoder.

        Returns:
            Tensor whose last dimension has ``num_classes`` entries.
        """
        encoder_out = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = encoder_out[0]
        # Keep only sequence position 0 (presumably the [CLS] token) per example.
        pooled = hidden_state[:, 0]
        pooled = self.pre_classifier(pooled)
        pooled = torch.relu(pooled)
        pooled = self.dropout(pooled)
        return self.classifier(pooled)

In [None]:
# Location of the saved model weights. It is assigned here because this cell needs it
# and, on a fresh kernel, no earlier cell has defined it yet.
output_dir = '/content/gdrive/My Drive/lily/state_dict_model.pt'

model = DistillBERTClass()
# Resume from a previous run when a checkpoint exists; otherwise keep the freshly
# initialised model so that the very first run does not fail on a missing file.
if os.path.exists(output_dir):
    # map_location lets weights saved on a GPU runtime load on a CPU-only one.
    model.load_state_dict(torch.load(output_dir, map_location=device))
model.to(device)

In [None]:
from transformers import get_linear_schedule_with_warmup

# Number of full passes over the training data made by the training loop below.
epochs = 10

# The scheduler is stepped once per batch, so its horizon is
# [batches per epoch] x [epochs] (not the number of training samples).
total_steps = epochs * len(training_loader)

# AdamW over every model parameter with a fixed learning rate and epsilon.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Linear learning-rate schedule over `total_steps` with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [None]:
### Checkpoint locations: model weights, plus optimizer / scheduler state in the same folder
output_dir = '/content/gdrive/My Drive/lily/state_dict_model.pt'
checkpoint_dir = '/content/gdrive/My Drive/lily'


def _load_state_if_present(target, *filenames):
    """Load the most recently modified existing checkpoint into ``target``.

    Args:
        target: object exposing ``load_state_dict`` (optimizer or scheduler).
        *filenames: candidate file names inside ``checkpoint_dir``.

    Returns:
        True when a state dict was loaded, False when none of the files exist
        (e.g. on the very first training run).
    """
    candidates = [os.path.join(checkpoint_dir, name) for name in filenames]
    existing = [path for path in candidates if os.path.exists(path)]
    if not existing:
        return False
    newest = max(existing, key=os.path.getmtime)
    # map_location lets state saved on a GPU runtime load on a CPU-only one.
    target.load_state_dict(torch.load(newest, map_location=device))
    return True


## Restore optimizer and scheduler state to continue training from a previous checkpoint
_load_state_if_present(optimizer, 'optimizer.pt')
# Checkpoints written by earlier runs may carry the misspelled name 'schedular.pt';
# accept either spelling and use whichever file was written most recently.
_load_state_if_present(scheduler, 'scheduler.pt', 'schedular.pt')
loss_function = torch.nn.CrossEntropyLoss()

def calcuate_accu(big_idx, targets):
    """Count the positions at which ``big_idx`` and ``targets`` hold equal values.

    Args:
        big_idx: tensor of predicted class indices.
        targets: tensor of reference class indices.

    Returns:
        The number of matching positions as a plain Python number.
    """
    matches = big_idx.eq(targets)
    return matches.sum().item()

In [None]:
def train(epoch):
    """Run one pass over ``training_loader``, updating the model, then checkpoint.

    For every batch: forward pass, cross-entropy loss, backward pass, gradient
    clipping to norm 1.0, optimizer step and scheduler step. Running loss and
    accuracy are printed every 500 batches and once more at the end of the pass.
    After the pass the model, optimizer and scheduler state dicts are saved.

    Reads the module-level ``model``, ``training_loader``, ``device``,
    ``loss_function``, ``optimizer``, ``scheduler`` and ``output_dir``.

    NOTE: defining this function rebinds the module-level name ``train``, which
    earlier cells use for the training DataFrame.

    Args:
        epoch: epoch index, used only in the printed messages.

    Returns:
        None.
    """
    checkpoint_dir = '/content/gdrive/My Drive/lily'
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for i,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        # detach() (instead of the legacy `.data`) keeps the accuracy bookkeeping
        # out of the autograd graph.
        _, big_idx = torch.max(outputs.detach(), dim=1)
        n_correct += calcuate_accu(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)

        if i%500==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples
            print(f"Training Loss per {i} steps: {loss_step}")
            print(f"Training Accuracy per {i} steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    torch.save(model.state_dict(), output_dir)
    torch.save(optimizer.state_dict(), os.path.join(checkpoint_dir, 'optimizer.pt'))
    # Saved as 'scheduler.pt' -- the name the checkpoint-restore cell reads; the
    # previous spelling 'schedular.pt' was never picked up when resuming.
    torch.save(scheduler.state_dict(), os.path.join(checkpoint_dir, 'scheduler.pt'))

    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f'The Total Accuracy for Epoch {epoch}: {epoch_accu}')
    print(f"Training Loss {epoch}: {epoch_loss}")
    print(f"Training Accuracy {epoch}: {epoch_accu}")

    return

In [None]:
# Call the training function once per epoch; each call makes a full pass over the
# training loader and writes the model / optimizer / scheduler checkpoints.
for epoch in range(epochs):
    train(epoch)

**Fine Tuning the Model**  



After all the effort of loading and preparing the data and datasets, creating the model, and defining its loss and optimizer, this is probably the easiest step in the process.

Here we define a training function that trains the model on the training dataset created above a specified number of times (`epochs`). An epoch is one complete pass of the training data through the network.

Following events happen in this function to fine tune the neural network:

* The dataloader passes data to the model based on the batch size.
* Subsequent output from the model and the actual category are compared to calculate the loss.
* Loss value is used to optimize the weights of the neurons in the network.
* After every 500 steps the running loss and accuracy are printed in the console.