In [None]:
from transformers import BertTokenizer,BertModel
import torch
import pandas as pd
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn as nn
import matplotlib.pyplot as plt
import torch.optim as optim
import numpy as np
from sklearn.model_selection import train_test_split

# Alternative checkpoint (BioBERT) -- uncomment these two lines and comment out
# the 'bert-base-uncased' pair below to use it instead.
#bert = BertModel.from_pretrained('dmis-lab/biobert-large-cased-v1.1')
#bert_tokenizer = BertTokenizer.from_pretrained('dmis-lab/biobert-large-cased-v1.1')

# The tokenizer and the encoder are loaded from the same checkpoint name.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert = BertModel.from_pretrained('bert-base-uncased')

# Size of the hidden vectors produced by the pretrained encoder.
embedding_dim = bert.config.hidden_size

## YOU MAY CHANGE THESE HYPERPARAMETERS
MAX_LEN = 256            # fixed sequence length handed to ClinicalDataset
BATCH_SIZE = 16          # examples per DataLoader batch
MAX_EPOCH = 5            # number of passes of the training loop
LEARNING_RATE = 1e-5     # step size given to the Adam optimizer
DROPOUT_RATIO = 0        # dropout ratio passed to the Classifier
LABEL_NUM = 5            # number of classes the Classifier outputs
TEST_TRAIN_RATIO = 0.3   # fraction of rows held out by train_test_split


In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
## if you want to use TPU on Colab, run this code
# Optional cell: skip it when training on CPU/GPU. Running it rebinds `device`,
# replacing the CUDA/CPU device selected in the previous cell.

# The wheel file name pins torch_xla 1.8.1 built for CPython 3.7 on linux x86_64.
# NOTE(review): presumably this has to match the runtime's torch / Python versions -- confirm.
!pip install cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.8.1-cp37-cp37m-linux_x86_64.whl
import torch_xla
import torch_xla.core.xla_model as xm

# Every later `.to(device)` call in the notebook now targets the XLA device.
device = xm.xla_device()

In [None]:
# Load the tab-separated training file. It has no header row, so pandas names the
# columns 0 and 1; ClinicalDataset renames them to "label" and "text".
text_data = pd.read_csv('train.dat', sep='\t', header=None)

# Fixed seed so the train/validation split -- and therefore the reported
# validation metrics -- are the same on every Restart & Run All.
SPLIT_SEED = 42
df_train, df_test = train_test_split(
    text_data, test_size=TEST_TRAIN_RATIO, random_state=SPLIT_SEED
)

# Renumber the rows 0..n-1: ClinicalDataset looks rows up with .loc[index] using
# the positional indices that DataLoader generates.
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

## Prepare Dataset for BERT

### TODO : Preprocess input data

#### example : when maximum length is 8

- original input sentence - "I really love you"
- tokenizing (use ``bert_tokenizer.tokenize``) -  ['i', 'really', 'love', 'you']
- Add special tokens - ['[CLS]', 'i', 'really', 'love', 'you', '[SEP]'] (length = 6)
- Add padding tokens to fit maximum length - ['[CLS]', 'i', 'really', 'love', 'you', '[SEP]', '[PAD]', '[PAD]']
- Convert tokens to id (use ``bert_tokenizer.convert_tokens_to_ids``)
- Make an attention mask that marks real tokens with 1 and padding tokens with 0 - [1,1,1,1,1,1,0,0]

You may choose another (even simpler) way to preprocess the input text. See https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.PreTrainedTokenizer or https://huggingface.co/transformers/model_doc/bert.html#berttokenizer


In [None]:
class ClinicalDataset(Dataset):
    """Map-style dataset that turns (label, text) rows into fixed-length BERT inputs.

    Args:
        dataframe: frame whose column ``0`` holds the label and column ``1`` the text.
            It is not modified; the dataset works on its own renamed copy.
        maxlen: total sequence length of every example, including the ``[CLS]`` and
            ``[SEP]`` tokens. Longer texts are truncated, shorter ones padded.
        tokenizer: object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)`` (e.g. a ``BertTokenizer``).

    Raises:
        ValueError: if ``maxlen`` is too small to hold ``[CLS]`` and ``[SEP]``.
    """

    def __init__(self, dataframe, maxlen, tokenizer):

        if maxlen < 2:
            raise ValueError("maxlen must be at least 2 to hold [CLS] and [SEP]")

        # rename() returns a new frame; reset_index() guarantees that the positional
        # indices produced by a DataLoader are valid .loc labels even when the
        # caller passes a frame with a shuffled index.
        self.df = dataframe.rename(columns={0: "label", 1: "text"}).reset_index(drop=True)
        #Initialize the BERT tokenizer
        self.tokenizer = tokenizer

        # Shift labels down by one (presumably the file uses 1-based classes, while
        # nn.CrossEntropyLoss expects class indices starting at 0).
        self.df['label'] = self.df['label'] - 1

        self.maxlen = maxlen

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return ``(sequence, attention_mask, label)`` for row ``index``.

        ``sequence`` and ``attention_mask`` are 1-D long tensors of length
        ``maxlen``; ``label`` is a 0-dim long tensor.
        """

        #Selecting the sentence and label at the specified index in the data frame
        sentence = self.df.loc[index, 'text']
        label = self.df.loc[index, 'label']

        # Tokenize, leaving room for the two special tokens.
        tokens = self.tokenizer.tokenize(sentence)[:self.maxlen - 2]
        tokens = ['[CLS]'] + tokens + ['[SEP]']

        # Pad up to the fixed length; the mask is 1 on real tokens and 0 on padding.
        real_len = len(tokens)
        pad_len = self.maxlen - real_len
        tokens = tokens + ['[PAD]'] * pad_len

        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        sequence = torch.tensor(token_ids, dtype=torch.long)
        attention_mask = torch.tensor([1] * real_len + [0] * pad_len, dtype=torch.long)
        label = torch.tensor(int(label), dtype=torch.long)

        return sequence, attention_mask, label

In [None]:
#Creating instances of training and validation set
train_set = ClinicalDataset(df_train, maxlen=MAX_LEN, tokenizer=bert_tokenizer)
val_set = ClinicalDataset(df_test, maxlen=MAX_LEN, tokenizer=bert_tokenizer)

#Creating instances of training and validation dataloaders
# The training loader reshuffles every epoch so the optimizer does not see the
# same batch composition and order on each pass; validation order does not
# affect the metrics, so it stays sequential.
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
val_loader = DataLoader(val_set, batch_size=BATCH_SIZE, num_workers=2)

## Define Neural Network

### TODO : Define layers and data flow 

``nn.Dropout`` can be used to prevent overfitting (parameter: dropout ratio).

From ``bert.config.to_dict()['hidden_size']``, we can obtain the size of the embedding vectors used in the pretrained BERT model.

Here is information about the PyTorch modules that are used as layers (e.g. ``nn.Linear``):
https://pytorch.org/docs/stable/nn.html

Here is more information about BERT model implementation on PyTorch : https://huggingface.co/transformers/model_doc/bert.html#bertmodel




In [None]:
class Classifier(nn.Module):
    """Sequence classifier: BERT encoder -> dropout -> linear layer over the first token.

    Args:
        bert: encoder module. It must expose ``config.hidden_size`` and, when called
            with ``input_ids`` / ``attention_mask``, return an output whose element
            ``0`` is the per-token hidden states of shape (batch, seq_len, hidden_size).
        output_len: number of classes (size of the returned logit vector).
        dropout: dropout ratio applied to the sentence representation.
    """

    def __init__(self,bert,output_len,dropout):
        super(Classifier, self).__init__()

        ## Define Layers
        # Registering the encoder as a submodule makes net.parameters() include
        # its weights, so it is fine-tuned together with the head.
        self.bert = bert
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(bert.config.hidden_size, output_len)

    def forward(self, sequence, attention_masks):
        """Return unnormalised class scores of shape (batch, output_len)."""

        ##Define data flow in neural network
        outputs = self.bert(input_ids=sequence, attention_mask=attention_masks)

        # Hidden state of the first token ([CLS]) is used as the sentence vector.
        cls_state = outputs[0][:, 0, :]

        # Raw logits: nn.CrossEntropyLoss applies log-softmax itself.
        return self.fc(self.dropout(cls_state))


In [None]:
# Build the classifier on top of the pretrained encoder and move it to the
# selected device (Module.to returns the module itself).
#device = torch.device('cpu')
net = Classifier(bert, LABEL_NUM, DROPOUT_RATIO).to(device)

# Multi-class objective computed from the classifier's output and integer labels.
criterion = nn.CrossEntropyLoss()

# Adam updates every parameter registered on `net`.
optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)

## Train the model

In [None]:
def get_accuracy(output, labels):
    """Count the rows of ``output`` whose highest-scoring column equals the label.

    Args:
        output: 2-D tensor of per-class scores, one row per example.
        labels: tensor of class indices; size-1 dimensions are squeezed away
            before the comparison.

    Returns:
        0-dim tensor holding the number of correct predictions (a count,
        not a ratio).
    """
    predicted = torch.max(output.data, dim=1).indices
    is_correct = predicted == labels.squeeze()
    return is_correct.sum()

def evaluate(net, criterion, dataloader):
    """Score ``net`` on ``dataloader`` without updating any weights.

    Uses the notebook-level ``device`` and ``get_accuracy``. The model is left in
    eval mode when this returns.

    Args:
        net: model called as ``net(sequence, attention_mask)``.
        criterion: loss called as ``criterion(output, labels)``.
        dataloader: iterable of ``(sequence, attention_mask, labels)`` batches.

    Returns:
        ``(accuracy, mean_loss)``: fraction of correctly classified examples and
        the loss averaged over batches, both Python floats.

    Raises:
        ValueError: if ``dataloader`` yields no batches (previously this surfaced
            as an unexplained ZeroDivisionError).
    """
    net.eval()

    loss_sum = 0.0
    n_batches = 0
    n_correct = 0
    n_examples = 0

    with torch.no_grad():
        for seq, attn_masks, labels in dataloader:
            seq, attn_masks, labels = seq.to(device), attn_masks.to(device), labels.to(device)
            output = net(seq, attn_masks)

            # squeeze(-1) is a no-op unless the last dimension has size 1.
            loss_sum += criterion(output.squeeze(-1), labels.long()).item()

            # Convert to a Python int right away so no device tensor is kept
            # alive across batches.
            n_correct += int(get_accuracy(output, labels))
            n_examples += labels.size(0)
            n_batches += 1

    if n_batches == 0:
        raise ValueError("evaluate() received a dataloader that yielded no batches")

    return n_correct / n_examples, loss_sum / n_batches



In [None]:
def train(net, criterion, optimizer, dataloader):
    """Run one training epoch over ``dataloader`` and update ``net`` in place.

    Uses the notebook-level ``device`` and ``get_accuracy``. Running averages are
    printed every 100 iterations.

    Args:
        net: model called as ``net(sequence, attention_mask)``.
        criterion: loss called as ``criterion(output, labels)``.
        optimizer: optimizer holding the parameters of ``net``.
        dataloader: iterable of ``(sequence, attention_mask, labels)`` batches.

    Returns:
        ``(accuracy, mean_loss)`` accumulated over the whole epoch, as Python
        floats. Accuracy is measured on the outputs computed before each
        optimizer step.

    Raises:
        ValueError: if ``dataloader`` yields no batches (previously this surfaced
            as an UnboundLocalError on ``acc``).
    """
    net.train()

    loss_sum = 0.0
    n_batches = 0
    n_correct = 0
    n_examples = 0

    for i, (sequence, attention_mask, labels) in enumerate(dataloader):

        # Clear gradients left over from the previous iteration.
        optimizer.zero_grad()

        sequence, attention_mask, labels = sequence.to(device), attention_mask.to(device), labels.to(device)

        output = net(sequence, attention_mask)

        # squeeze(-1) is a no-op unless the last dimension has size 1.
        loss = criterion(output.squeeze(-1), labels.long())

        ##loss.backward() calculate gradients of each parameters
        loss.backward()

        ##optimizer.step() updates learnable parameters in the NN using calculated gradients
        optimizer.step()

        # Reuse the loss that was already computed instead of evaluating the
        # criterion a second time on the same output.
        loss_sum += loss.item()
        n_correct += int(get_accuracy(output, labels))
        n_examples += labels.size(0)
        n_batches += 1

        if (i + 1) % 100 == 0:
            print("Iteration {} complete. Loss : {} Accuracy : {}".format(i+1, loss_sum / n_batches, n_correct / n_examples))

    if n_batches == 0:
        raise ValueError("train() received a dataloader that yielded no batches")

    return n_correct / n_examples, loss_sum / n_batches
    
  

In [None]:
# Per-epoch history of the metrics; the plotting cell reads the accuracy lists.
train_acc_list, train_loss_list = [], []
test_acc_list, test_loss_list = [], []

for epoch in range(MAX_EPOCH):
    # One pass over the training data, then score the held-out split.
    train_acc, train_loss = train(net, criterion, optimizer, train_loader)
    test_acc, test_loss = evaluate(net, criterion, val_loader)

    train_acc_list.append(train_acc)
    train_loss_list.append(train_loss)
    test_acc_list.append(test_acc)
    test_loss_list.append(test_loss)

    print("Epoch {} complete! Validation Loss : {} Validation Accuracy : {}".format(epoch+1,test_loss,test_acc))

In [None]:
# One x position per recorded epoch. Using the history length (rather than
# MAX_EPOCH) keeps x and y the same size even if MAX_EPOCH was edited or the
# training loop was stopped early.
x = np.arange(1, len(train_acc_list) + 1)

# Label both curves and the axes so the figure can be read on its own.
plt.plot(x, train_acc_list, label='train accuracy')
plt.plot(x, test_acc_list, label='validation accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.xticks(x)
plt.legend()
plt.show()