# BERT Model: Manual Fine-tuning

#### Imports:

In [1]:
import json
import pickle
import numpy as np

from tqdm import tqdm
from tqdm.autonotebook import tqdm
from collections import Counter

from datasets import load_dataset

import torch
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

import torch.nn as nn
from torch.nn import CrossEntropyLoss
from torch.nn.functional import softmax

from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

  from tqdm.autonotebook import tqdm


#### Data Loading and Preparation:

In [2]:
# Location of the tagged-sequence JSON file (records are read from its "data" field):
data_file_path = '../data/biloc_tagged_sequences.json'

# Parameters for the train-test split:
# share of rows held out for testing, and the seed used for the shuffle
random_seed = 42
test_size = 0.15

# Load dataset from json file; the loaded rows are exposed under the 'train' key:
raw_dataset = load_dataset('json', data_files=data_file_path, field='data')

# Split dataset into train and test sets and show the resulting structure:
datasets = raw_dataset['train'].train_test_split(test_size=test_size, seed=random_seed)
print("Dataset Structure:", datasets, sep="\n")

Dataset Structure:
DatasetDict({
    train: Dataset({
        features: ['id', 'ner_tags', 'split_tokens'],
        num_rows: 8646
    })
    test: Dataset({
        features: ['id', 'ner_tags', 'split_tokens'],
        num_rows: 1526
    })
})


#### Tokenize Data for BERT Model:

In [3]:
# Load the pre-trained tokenizer for the "bert-base-cased" checkpoint.
# It is handed to tokenize_and_align_labels when the dataset is mapped.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [4]:
# Tokenizes pre-split words and lines the word-level tags up with the produced tokens:
def tokenize_and_align_labels(tokenizer, examples):
    """Tokenize a batch of pre-split sentences and attach per-token labels.

    Args:
        tokenizer: Callable tokenizer whose result supports
            ``word_ids(batch_index=...)`` and item assignment.
        examples: Batch mapping with "split_tokens" (words per sentence) and
            "ner_tags" (one tag per word).

    Returns:
        The tokenizer output with an added "labels" entry: one list per
        sentence, holding the tag of the word each token came from, or -100
        where the token has no source word (word id is None).
    """
    encoding = tokenizer(
        examples["split_tokens"],
        truncation=True,
        padding="max_length",
        is_split_into_words=True,
        return_tensors="pt",
    )

    aligned = []
    for row, tags in enumerate(examples["ner_tags"]):
        row_labels = []
        for word_idx in encoding.word_ids(batch_index=row):
            if word_idx is None:
                # No source word for this position: mark it with -100.
                row_labels.append(-100)
            else:
                # Every token of a word inherits that word's tag.
                row_labels.append(tags[word_idx])
        aligned.append(row_labels)

    encoding["labels"] = aligned
    return encoding

# Converts batch input to tensor
def convert_to_tensors(batch):
    """Return a new dict with every value of ``batch`` converted to a tensor.

    Args:
        batch: Mapping from column name to array-like data (nested lists,
            numbers, NumPy arrays or tensors).

    Returns:
        A dict with the same keys whose values are ``torch.Tensor`` objects.
    """
    # torch.as_tensor also accepts values that are already tensors.
    return {key: torch.as_tensor(value) for key, value in batch.items()}

In [5]:
# Tokenize Dataset:
# each batch is passed through tokenize_and_align_labels together with the shared tokenizer
def _tokenize_batch(examples):
    return tokenize_and_align_labels(tokenizer, examples)


tokenized_datasets = datasets.map(_tokenize_batch, batched=True)

#### Tokenized Dataset Formatting for Model:

In [6]:
# Format dataset for use with Pytorch:
# only the columns listed here are returned, as torch tensors
model_columns = ['input_ids', 'attention_mask', 'token_type_ids', 'labels']
tokenized_datasets.set_format(type='torch', columns=model_columns)

In [7]:
# Create Pytorch DataLoader Objects for Train and Test Sets:
train_dataset, test_dataset = tokenized_datasets["train"], tokenized_datasets["test"]

# Both loaders use the same batch size; only the training loader shuffles.
loader_batch_size = 2
train_loader = DataLoader(train_dataset, batch_size=loader_batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=loader_batch_size)

#### Load in pre-trained BERT Model:

In [8]:
# Model Parameters:
# Name of the pre-trained checkpoint to load.
model_name = "bert-base-cased"  
# Number of output classes: sets the width of the classifier head and is used
# to reshape the logits before the loss is computed.
num_labels = 165

# Loads in default model from HuggingFace:
# NOTE(review): AutoModel presumably returns the bare encoder without a task head,
# so num_labels here likely only ends up on the config - confirm against the
# transformers documentation.
bert_model = AutoModel.from_pretrained(model_name, num_labels=num_labels)

#### Model, Fine-tuner, and Optimizer:

In [9]:
# Wraps a pre-trained encoder in a torch.nn.Module and adds a token classification head:
class CustomNERModel(nn.Module):
    """Per-token classifier stacked on top of a pre-trained BERT encoder.

    Attributes:
        bert: The encoder passed to the constructor.
        classifier: Linear layer mapping each hidden state to ``num_labels`` scores.
    """

    def __init__(self, bert_model, num_labels):
        """Store the encoder and build the classification layer.

        Args:
            bert_model: Encoder exposing ``config.hidden_size`` and returning an
                object with ``last_hidden_state`` when called.
            num_labels: Number of output classes per token.
        """
        super().__init__()
        hidden_size = bert_model.config.hidden_size
        self.bert = bert_model
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        """Run the encoder and return the classifier scores for every token."""
        encoded = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        return self.classifier(encoded.last_hidden_state)


In [10]:
# Initialization of custom fine-tuned BERT model:
# wraps the pre-trained encoder with a new linear classification head.
model = CustomNERModel(bert_model, num_labels)
# Switch read by the training cell: True runs the fine-tuning loop,
# False skips training and loads saved weights from a checkpoint file instead.
train_mode = False

In [11]:
# Initialization of Optimizer and Loss Function:
# AdamW receives every parameter of the wrapped model (encoder and classifier head).
optimizer = AdamW(model.parameters(), lr=5e-5)
# Cross-entropy with default settings; the -100 labels written during tokenization
# match its default ignore_index, so those positions do not contribute to the loss.
loss_function = nn.CrossEntropyLoss()

#### Model Training:

In [12]:
# Checkpoint Function:
def checkpoint(model, filename):
    """Write the model's state_dict to ``filename``.

    The parent directory is created first when it does not exist yet, so a
    missing checkpoint folder cannot abort a run at the first save.

    Args:
        model: Module whose ``state_dict()`` is saved.
        filename: Destination path of the checkpoint file.
    """
    import os  # local import so the function does not rely on a file-level import

    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    torch.save(model.state_dict(), filename)
    
def resume(model, filename):
    """Load the weights stored in ``filename`` into ``model`` in place.

    The checkpoint is first mapped to the CPU so that a file written on a GPU
    machine can also be restored where no GPU is available; ``load_state_dict``
    then copies the values into the model's existing parameters.

    Args:
        model: Module whose parameters are overwritten.
        filename: Path of a checkpoint file holding a state_dict.
    """
    state_dict = torch.load(filename, map_location="cpu")
    model.load_state_dict(state_dict)

In [13]:
# Use the GPU when one is available, otherwise fall back to the CPU:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(device)

if train_mode:
    num_epochs = 50
    checkpoint_every = 10  # number of completed epochs between two checkpoints
    save_path = './checkpoints/model'

    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")
        for batch in progress_bar:
            # Everything except the labels is forwarded to the model as keyword arguments.
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
            labels = batch['labels'].to(device)
            optimizer.zero_grad()
            logits = model(**inputs)
            # Flatten to (tokens, classes) vs. (tokens,) as expected by the loss function.
            loss = loss_function(logits.view(-1, num_labels), labels.view(-1))
            loss.backward()
            optimizer.step()
            batch_loss = loss.item()
            total_loss += batch_loss
            progress_bar.set_postfix(loss=f"{batch_loss:.4f}")

        # Generates Model Checkpoint Every 10 Epochs.
        # `epoch` is 0-based, so count completed epochs: files model1..model5 are
        # written after epochs 10, 20, ..., 50, which includes the final model.
        completed_epochs = epoch + 1
        if completed_epochs % checkpoint_every == 0:
            checkpoint(model, save_path + str(completed_epochs // checkpoint_every))

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_loss:.4f}")
else:
    # Skip training and restore the weights stored in the first checkpoint file.
    model_path = './checkpoints/model'
    resume(model, model_path + "1")

cuda


#### Model Prediction:

In [14]:
true_labels_list = []
pred_labels_list = []

# Put the model in evaluation mode before predicting: after the training branch it
# is still in train mode, where any dropout layers would randomly perturb the outputs.
model.eval()

with torch.no_grad():
    for batch in test_loader:
        # Everything except the labels is forwarded to the model as keyword arguments.
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        logits = model(**inputs)
        # Highest-scoring class per token.
        predictions = torch.argmax(logits, dim=-1)

        # The labels are only compared on the host, so they never need to visit the device.
        true_labels_list.append(batch['labels'].cpu().numpy())
        pred_labels_list.append(predictions.cpu().numpy())

# axis=None flattens all batches into one 1-D array per list.
true_labels_flat = np.concatenate(true_labels_list, axis=None)
pred_labels_flat = np.concatenate(pred_labels_list, axis=None)


#### Prediction Analysis: 

In [15]:
# Keep only the positions that carry a real label (-100 marks positions to ignore):
mask = np.not_equal(true_labels_flat, -100)
true_labels_filtered, pred_labels_filtered = true_labels_flat[mask], pred_labels_flat[mask]

# Accuracy plus support-weighted precision / recall / F1 over the kept positions:
accuracy = accuracy_score(true_labels_filtered, pred_labels_filtered)
prf_scores = precision_recall_fscore_support(
    true_labels_filtered,
    pred_labels_filtered,
    average='weighted',
)
precision, recall, f1 = prf_scores[:3]

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


Accuracy: 0.9269
Precision: 0.9236
Recall: 0.9269
F1 Score: 0.9234


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### Model Parameters:

In [16]:
# Count all parameter elements in one pass, tracking the trainable share separately:
total_params = 0
trainable_params = 0
for param in model.parameters():
    element_count = param.numel()
    total_params += element_count
    if param.requires_grad:
        trainable_params += element_count
non_trainable_params = total_params - trainable_params

print(f"Total Parameters: {total_params}")
print(f"Trainable Parameters: {trainable_params}")
print(f"Non-trainable Parameters: {non_trainable_params}")


Total Parameters: 108437157
Trainable Parameters: 108437157
Non-trainable Parameters: 0


#### Save True and Predicted Labels for Analysis

In [17]:
# Pickle the filtered true and predicted label arrays, one file each:
for out_path, label_array in (
    ("../data/true_labels.ob", true_labels_filtered),
    ("../data/pre_labels.ob", pred_labels_filtered),
):
    with open(out_path, 'wb') as fp:
        pickle.dump(label_array, fp)