<a href="https://colab.research.google.com/github/akbism/COVID-QA/blob/main/QuestionClassification/1_BERT_Question_Classifier_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook fine-tunes a BERT model to classify question types and then runs inference with the fine-tuned model.

# Setting up Google Drive

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')
%cd gdrive/My\ Drive/Colab\ Notebooks/LJMU/covidqa/biobert-pytorch/question-classification

Mounted at /content/gdrive
/content/gdrive/My Drive/Colab Notebooks/LJMU/covidqa/biobert-pytorch/question-classification


In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.9.1-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 4.9 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 44.7 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 53.5 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 60.5 MB/s 
[?25hCollecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninsta

In [None]:
import random

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import f1_score
from torch.utils.data import TensorDataset
from tqdm.notebook import tqdm
from transformers import BertForSequenceClassification
from transformers import BertTokenizer
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')




# Defining evaluation functions

In [None]:
### Evaluation utility functions
def f1_score_func(preds, labels):
    """Return the weighted F1 score of argmax predictions against the true labels.

    Args:
        preds: 2-D array of per-class scores, one row per sample.
        labels: array of true class ids, one per sample.
    """
    return f1_score(
        labels.flatten(),
        np.argmax(preds, axis=1).flatten(),  # predicted class = highest-scoring column
        average='weighted',
    )

def accuracy_per_class(preds, labels, label_map=None):
    """Print, for every class present in ``labels``, how many samples were predicted correctly.

    Args:
        preds: 2-D array of per-class scores, one row per sample; the predicted
            class is the argmax of each row.
        labels: array of integer class ids, one per sample.
        label_map: optional ``{class_name: class_id}`` mapping used to print class
            names. When omitted, the notebook-level ``label_dict`` is looked up at
            call time, so existing ``accuracy_per_class(preds, labels)`` calls
            behave as before.

    Returns:
        None. For each class, ``Class: <name>`` and ``Accuracy: <correct>/<total>``
        are written to stdout.
    """
    if label_map is None:
        label_map = label_dict
    id_to_name = {class_id: name for name, class_id in label_map.items()}

    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        class_mask = labels_flat == label
        n_correct = int(np.sum(preds_flat[class_mask] == label))
        n_total = int(np.sum(class_mask))
        print(f'Class: {id_to_name[label]}')
        print(f'Accuracy: {n_correct}/{n_total}\n')

# Data pre-processing

In [None]:
### Training / Test Data preparation
def _load_question_file(path):
    """Parse a question-classification file into a DataFrame.

    Each line is split at its first space into a ``COARSE:fine`` type tag and the
    question text (the text keeps whatever trailing newline the line had).

    Returns:
        DataFrame with columns ``Question``, ``QType``, ``QType-Coarse`` and
        ``QType-Fine``, one row per line of the file.
    """
    # Read-only mode is sufficient, and the context manager closes the handle.
    with open(path, 'r') as handle:
        frame = pd.DataFrame(handle.readlines(), columns=['Question'])

    # Split once per line, then fan the two halves out into their columns.
    tag_and_text = frame.Question.apply(lambda line: line.split(' ', 1))
    frame['QType'] = tag_and_text.apply(lambda parts: parts[0])
    frame['Question'] = tag_and_text.apply(lambda parts: parts[1])
    frame['QType-Coarse'] = frame.QType.apply(lambda tag: tag.split(':')[0])
    frame['QType-Fine'] = frame.QType.apply(lambda tag: tag.split(':')[1])
    return frame


# The file name spelling ('traininig') matches the path the notebook reads from.
train = _load_question_file('datasets/UIUCQuestionClassification/traininig_dataset.txt')
test = _load_question_file('datasets/UIUCQuestionClassification/validation_dataset.txt')

# Class ids follow the order in which coarse classes first appear in the training data.
possible_labels = train['QType-Coarse'].unique()
label_dict = {possible_label: index for index, possible_label in enumerate(possible_labels)}

# Fail loudly if the test set has a class the model will never be trained on,
# instead of leaving an unmapped string in the label column.
unseen_labels = set(test['QType-Coarse']) - set(label_dict)
if unseen_labels:
    raise ValueError(f'Test set contains classes absent from the training set: {sorted(unseen_labels)}')

train['label'] = train['QType-Coarse'].map(label_dict)
test['label'] = test['QType-Coarse'].map(label_dict)
print(label_dict)
print(train['QType-Coarse'].value_counts())

{'DESC': 0, 'ENTY': 1, 'ABBR': 2, 'HUM': 3, 'NUM': 4, 'LOC': 5}
ENTY    1250
HUM     1223
DESC    1162
NUM      896
LOC      835
ABBR      86
Name: QType-Coarse, dtype: int64


In [None]:
# Display the (rows, columns) shape of the training frame.
train.shape

(5452, 5)

In [None]:
# Display the (rows, columns) shape of the test frame.
test.shape

(500, 5)

# Train-Validation split

In [None]:
### Training Validation Data split
from sklearn.model_selection import train_test_split

# Hold out 5% of the training rows for validation, stratifying on the class id.
# The split is done on row indices so the rows can be tagged in place below.
X_train, X_val, y_train, y_val = train_test_split(
    train.index.values,
    train.label.values,
    test_size=0.05,
    random_state=42,
    stratify=train.label.values,
)

# Tag every row with the part it landed in; 'not_set' is the placeholder value.
train['data_type'] = 'not_set'
for part_name, row_index in (('train', X_train), ('val', X_val)):
    train.loc[row_index, 'data_type'] = part_name

# Row counts per class and part, shown as the cell output.
train.groupby(['QType-Coarse', 'label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Question,QType,QType-Fine
QType-Coarse,label,data_type,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ABBR,2,train,82,82,82
ABBR,2,val,4,4,4
DESC,0,train,1104,1104,1104
DESC,0,val,58,58,58
ENTY,1,train,1187,1187,1187
ENTY,1,val,63,63,63
HUM,3,train,1162,1162,1162
HUM,3,val,61,61,61
LOC,5,train,793,793,793
LOC,5,val,42,42,42


# Data Preparation

In [None]:
# Every question is padded / truncated to this many tokens.
MAX_SEQ_LENGTH = 256

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          do_lower_case=True)


def _encode_questions(questions):
    """Tokenize an iterable of question strings into fixed-length PyTorch tensors.

    Returns:
        The tokenizer's batch encoding; its ``input_ids`` and ``attention_mask``
        entries are read below.
    """
    return tokenizer.batch_encode_plus(
        list(questions),
        add_special_tokens=True,
        return_attention_mask=True,
        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`;
        # `truncation=True` states explicitly the truncation that the tokenizer
        # otherwise applies by default with a warning.
        padding='max_length',
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
        return_tensors='pt',
    )


train_rows = train[train.data_type == 'train']
val_rows = train[train.data_type == 'val']

encoded_data_train = _encode_questions(train_rows.Question.values)
encoded_data_val = _encode_questions(val_rows.Question.values)
encoded_data_test = _encode_questions(test.Question.values)

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(train_rows.label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(val_rows.label.values)

input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']
labels_test = torch.tensor(test.label.values)

# Each dataset item is an (input_ids, attention_mask, label) triple.
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)
dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


# Model Training

In [None]:
epochs = 5
batch_size = 32

# Seed every RNG *before* the model is built: `from_pretrained` reports that some
# classifier weights are newly initialised, so seeding only afterwards leaves the
# starting weights different on every run.
seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

### Load pre-trained model for fine-tuning
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)
model.to(device)
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
# Training batches are drawn in random order; validation batches keep dataset order.
dataloader_train = DataLoader(dataset_train,
                              sampler=RandomSampler(dataset_train),
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val,
                                   sampler=SequentialSampler(dataset_val),
                                   batch_size=batch_size)

from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(),
                  lr=2e-5,
                  eps=1e-8)

# Linear schedule with no warm-up steps; it is stepped once per training batch,
# hence one scheduler step for every batch of every epoch.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

# Training loop and validation

In [None]:
### Evaluation on the dataset with labels
import random
import numpy as np

# Seed Python's `random`, NumPy and PyTorch (CPU plus all CUDA devices) with a
# single value so the code that follows draws the same random numbers on each run.
seed_val = 17
for seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_rng(seed_val)

def evaluate(model, dataloader_val):
    """Run ``model`` over ``dataloader_val`` without gradient tracking.

    Each batch is read as ``(input_ids, attention_mask, labels)`` and moved to the
    notebook-level ``device`` before the forward pass.

    Returns:
        A tuple ``(mean_loss, logits, labels)``: the loss averaged over the number
        of batches, and NumPy arrays of all logits and all true labels
        concatenated along the first axis.
    """
    model.eval()
    running_loss = 0
    logit_chunks = []
    label_chunks = []

    for batch in dataloader_val:
        input_ids, attention_mask, labels = tuple(b.to(device) for b in batch)[:3]

        with torch.no_grad():
            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            labels=labels)

        # With labels supplied, the first output is the loss and the second the logits.
        loss, logits = outputs[0], outputs[1]
        running_loss += loss.item()

        logit_chunks.append(logits.detach().cpu().numpy())
        label_chunks.append(labels.cpu().numpy())

    mean_loss = running_loss / len(dataloader_val)
    return (
        mean_loss,
        np.concatenate(logit_chunks, axis=0),
        np.concatenate(label_chunks, axis=0),
    )

# Fine-tune for `epochs` passes over the training data; after each pass, save a
# checkpoint and report training loss, validation loss and weighted F1.
for epoch in tqdm(range(1, epochs+1)):

    model.train()

    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        outputs = model(**inputs)

        loss = outputs[0]
        batch_loss = loss.item()
        loss_train_total += batch_loss
        loss.backward()

        # Cap the total gradient norm at 1.0 before the optimizer update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # Show the batch loss as-is. It used to be divided by len(batch), but
        # `batch` is the (input_ids, attention_mask, labels) tuple, so that
        # divided by 3 rather than by the number of samples.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(batch_loss)})

    # One checkpoint directory per epoch.
    model.save_pretrained(f'model/epoch{epoch}/')

    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    val_loss, predictions, true_vals = evaluate(model, dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=162.0, style=ProgressStyle(description_widt…


Epoch 1
Training loss: 0.8251463004652365
Validation loss: 0.39755426347255707
F1 Score (Weighted): 0.8797620870047512


HBox(children=(FloatProgress(value=0.0, description='Epoch 2', max=162.0, style=ProgressStyle(description_widt…


Epoch 2
Training loss: 0.19230041793191138
Validation loss: 0.26939594083362156
F1 Score (Weighted): 0.9374955201362895


HBox(children=(FloatProgress(value=0.0, description='Epoch 3', max=162.0, style=ProgressStyle(description_widt…


Epoch 3
Training loss: 0.08897053601542189
Validation loss: 0.3313591645823585
F1 Score (Weighted): 0.9230753267305372


HBox(children=(FloatProgress(value=0.0, description='Epoch 4', max=162.0, style=ProgressStyle(description_widt…


Epoch 4
Training loss: 0.053462315140935926
Validation loss: 0.37889328930113053
F1 Score (Weighted): 0.9266726507901899


HBox(children=(FloatProgress(value=0.0, description='Epoch 5', max=162.0, style=ProgressStyle(description_widt…


Epoch 5
Training loss: 0.03869186655940189
Validation loss: 0.38009888927141827
F1 Score (Weighted): 0.9266726507901899



# Model inference

In [None]:
### Load the fine-tuned model
# NOTE(review): the training loop saves checkpoints under 'model/epoch<N>/', while
# this loads from 'model/' -- confirm that a checkpoint has been placed there.
model = BertForSequenceClassification.from_pretrained("model/").to(device)

print(device)

cuda


In [None]:
# Display the class-name -> class-id mapping built during data preparation.
label_dict

{'ABBR': 2, 'DESC': 0, 'ENTY': 1, 'HUM': 3, 'LOC': 5, 'NUM': 4}

### Prediction function

In [None]:
#### Prediction Function
def predict(test_question_series, model=model, batch_size=3, label_dict=label_dict):
    """Predict the class name of each question.

    Args:
        test_question_series: questions to classify; a pandas Series or any other
            iterable of strings.
        model: sequence-classification model to run. Defaults to the ``model``
            that exists when this cell is executed.
        batch_size: number of questions per forward pass.
        label_dict: ``{class_name: class_id}`` mapping; it is inverted to turn
            predicted ids back into names.

    Returns:
        NumPy array with one predicted class name per input question, in input
        order (an empty array for empty input).
    """
    questions = list(test_question_series)
    if not questions:
        # np.vectorize cannot infer an output type from zero inputs.
        return np.array([], dtype=str)

    id_to_label = {v: k for k, v in label_dict.items()}
    encoded = tokenizer.batch_encode_plus(
        questions,
        add_special_tokens=True,
        return_attention_mask=True,
        # Replaces the deprecated `pad_to_max_length=True` and makes the
        # truncation to `max_length` explicit.
        padding='max_length',
        truncation=True,
        max_length=256,
        return_tensors='pt'
    )
    # Tensors stay on the CPU here and are moved to the device one batch at a
    # time, rather than copying the whole encoded set to the device up front.
    dataset = TensorDataset(encoded['input_ids'], encoded['attention_mask'])
    dataloader = DataLoader(dataset,
                            sampler=SequentialSampler(dataset),
                            batch_size=batch_size)

    model.eval()
    logit_chunks = []
    for batch in dataloader:
        input_ids, attention_mask = tuple(b.to(device) for b in batch)[:2]
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logit_chunks.append(outputs['logits'].detach().cpu().numpy())

    all_logits = np.concatenate(logit_chunks, axis=0)
    predicted_ids = np.argmax(all_logits, axis=1).flatten()
    return np.vectorize(id_to_label.get)(predicted_ids)

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
import numpy as np

In [None]:
# Classify every test question, then cross-tabulate the predictions (rows)
# against the coarse question types from the test file (columns).
temp = predict(test.Question)
pd.crosstab(index=temp, columns=test['QType-Coarse'])