# LoRA Fine-Tuning BERT from Scratch

This notebook performs both **standard fine-tuning** and **LoRA (Low-Rank Adaptation) fine-tuning** on **BERT models** using custom implementations **coded from scratch**. 

The fine-tuning **objective** is to optimize the BERT models on the specific task of **predicting** whether a **question submitted** to **StackOverflow** will be **closed**.

BERT variants: 
* **BERT-base**: BERT base uncased
* **BERT Overflow**: BERT-base pretrained on 152 million sentences from Stack Overflow
* **BERT Tabular**: BERT-base modified to accept tabular features
* **LoRA BERT**:
    * **rank 1**
    * **rank 8**
    * **last four layers with rank 8**

Dataset: https://www.kaggle.com/competitions/predict-closed-questions-on-stack-overflow/data

### Import Libraries

In [1]:
# add python path to include src directory
import sys
sys.path.insert(0, '../src')

# standard library imports
from dataclasses import dataclass
from pathlib import Path
from typing import Tuple
import math

# related third party imports
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from torch.utils.data import DataLoader
from transformers import BertTokenizer
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import tqdm

# local library specific imports
from bert_from_scratch import BertForSequenceClassification as MyBertForSequenceClassification
from lora_from_scratch import (
    LinearLoRA,
    create_lora,
    add_lora_layers,
    freeze_model,
    unfreeze_model,
    create_linear,
    merge_lora_layers,
)

## STEP 1: Data Preprocessing

In [23]:
def collate_fn(batch):
    """Assemble a list of dataset rows into one batch for the DataLoader.

    The text of each row is kept as a plain string (collected into a list), while the
    tabular feature vectors and the labels of the rows are stacked into tensors.

    Args:
        batch: list of rows, each a mapping with 'text', 'tabular' and 'label' keys.

    Returns:
        dict with the keys 'text' (list), 'tabular' (stacked tensor) and 'label' (stacked tensor).
    """

    texts, tabular_rows, label_rows = [], [], []
    for sample in batch:
        texts.append(sample['text'])
        tabular_rows.append(torch.tensor(sample['tabular']))
        label_rows.append(torch.tensor(sample['label']))

    return {
        'text': texts,
        'tabular': torch.stack(tabular_rows),
        'label': torch.stack(label_rows),
    }


df = pd.read_csv("../data/train-sample.csv")

# dict mapping the status strings to binary labels: 0 = open, 1 = closed (for any of the four reasons)
string_to_int = {
    'open': 0,
    'not a real question': 1,
    'off topic': 1,
    'not constructive': 1,
    'too localized': 1
}

# add new features to dataframe
df['OpenStatusInt'] = df['OpenStatus'].map(string_to_int)  # convert class strings to integers

# fail early: a status that is missing from the mapping becomes NaN, which would otherwise
# only surface much later as a confusing dtype error inside the loss function
unmapped_mask = df['OpenStatusInt'].isna()
if unmapped_mask.any():
    raise ValueError(f"Unmapped OpenStatus values: {df.loc[unmapped_mask, 'OpenStatus'].unique().tolist()}")

# vectorized string/column operations instead of row-wise apply (same values, far fewer Python calls)
df['BodyLength'] = df['BodyMarkdown'].str.split(" ").str.len()  # number of space-separated words in body text
df['TitleLength'] = df['Title'].str.split(" ").str.len()  # number of space-separated words in title text
df['TitleConcatWithBody'] = df['Title'] + " " + df['BodyMarkdown']  # combine title and body text
tag_columns = ['Tag1', 'Tag2', 'Tag3', 'Tag4', 'Tag5']
df['NumberOfTags'] = df[tag_columns].notna().sum(axis=1)  # number of non-missing tags
df['PostCreationDate'] = pd.to_datetime(df['PostCreationDate'])  # convert string to datetime
df['OwnerCreationDate'] = pd.to_datetime(df['OwnerCreationDate'], format='mixed')  # convert string to datetime
df['DayDifference'] = (df['PostCreationDate'] - df['OwnerCreationDate']).dt.days  # days between account creation and post creation 

# list of col names with tabular data 
tabular_feature_list = [
    'ReputationAtPostCreation',  
    'BodyLength', 
    'TitleLength', 
    'NumberOfTags',
    'DayDifference',
]

# place the desired data from the dataframe into a dictionary
data_dict = {
    'text': df.TitleConcatWithBody.tolist(),
    'tabular': df[tabular_feature_list].values,
    'label': df.OpenStatusInt.tolist(),
}

# load data into hugging face dataset object
dataset_stackoverflow = Dataset.from_dict(data_dict)

# define the indices at which to split the dataset into train/validation/test (80% / 10% / 10%)
n_samples = len(dataset_stackoverflow)
split_idx1 = int(n_samples * 0.8)
split_idx2 = int(n_samples * 0.9)

# shuffle the dataset (fixed seed so the split is reproducible)
shuffled_dataset = dataset_stackoverflow.shuffle(seed=42)

# split dataset training/validation/test
train_dataset = shuffled_dataset.select(range(split_idx1))
val_dataset = shuffled_dataset.select(range(split_idx1, split_idx2))
test_dataset = shuffled_dataset.select(range(split_idx2, n_samples))

# calculate mean and std of each tabular feature on the training split only;
# the column is converted to a tensor once and reused for both statistics
train_tabular = torch.tensor(train_dataset['tabular'], dtype=torch.float32)
mean_train = train_tabular.mean(dim=0)
std_train = train_tabular.std(dim=0)

# define a function to apply standard scaling to the tabular data
def standard_scale(example):
    """Standardize the 'tabular' features of one example with the training-split statistics.

    Subtracts `mean_train` and divides by `std_train` (both computed at module level),
    writes the result back under the 'tabular' key and returns the same example.
    """
    raw_features = torch.tensor(example['tabular'])
    centered_features = raw_features - mean_train
    example['tabular'] = centered_features / std_train
    return example

# apply the standard scaling function to the tabular features of every split
train_dataset, val_dataset, test_dataset = [
    split.map(standard_scale) for split in (train_dataset, val_dataset, test_dataset)
]

# load the datasets into a dataloader (all three shuffle and share the same collate function)
loader_kwargs = {'shuffle': True, 'collate_fn': collate_fn}
train_dataloader = DataLoader(train_dataset, batch_size=32, **loader_kwargs)
val_dataloader = DataLoader(val_dataset, batch_size=32, **loader_kwargs)
test_dataloader = DataLoader(test_dataset, batch_size=8, **loader_kwargs)

Map:   0%|          | 0/112217 [00:00<?, ? examples/s]

Map:   0%|          | 0/14027 [00:00<?, ? examples/s]

Map:   0%|          | 0/14028 [00:00<?, ? examples/s]

#### Trainer Class

In [5]:
class BertTrainer:
    """ A training and evaluation loop for PyTorch models with a BERT like architecture. """
    
    
    def __init__(
        self, 
        model,
        tokenizer,
        train_dataloader,
        eval_dataloader=None,
        epochs=1,
        lr=5e-04,
        output_dir='./',
        output_filename='model_state_dict.pt',
        save=False,
        tabular=False,
    ):
        """
        Args:
            model: torch.nn.Module: = A PyTorch model with a BERT like architecture,
            tokenizer: = A BERT tokenizer for tokenizing text input,
            train_dataloader: torch.utils.data.DataLoader = 
                A dataloader containing the training data with "text" and "label" keys
                (a "tabular" key is only read when tabular=True),
            eval_dataloader: torch.utils.data.DataLoader = 
                A dataloader containing the evaluation data with "text" and "label" keys
                (a "tabular" key is only read when tabular=True),
            epochs: int = An integer representing the number epochs to train,
            lr: float = A float representing the learning rate for the optimizer,
            output_dir: str = A string representing the directory path to save the model,
            output_filename: string = A string used as the prefix of the checkpoint file name;
                checkpoints are written as "<output_filename>_epoch_<epoch>.pt",
            save: bool = A boolean representing whether or not to save the model,
            tabular: bool = A boolean representing whether or not the BERT model is modified to accept tabular data,
        """
        
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model = model.to(self.device)
        self.tokenizer = tokenizer
        self.train_dataloader = train_dataloader
        self.eval_dataloader = eval_dataloader
        self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=lr)
        self.loss_fn = nn.CrossEntropyLoss()
        self.output_dir = output_dir
        self.output_filename = output_filename
        self.save = save
        self.eval_loss = float('inf')  # tracks the lowest loss so as to only save the best model  
        self.epochs = epochs
        self.epoch_best_model = 0  # tracks which epoch the lowest loss is in so as to only save the best model
        self.tabular = tabular
    
    def train(self, evaluate=False):
        """ Calls the batch iterator to train and optionally evaluate the model after every epoch."""
        for epoch in range(self.epochs):
            self.iteration(epoch, self.train_dataloader)
            if evaluate and self.eval_dataloader is not None:
                self.iteration(epoch, self.eval_dataloader, train=False)

    def evaluate(self):
        """ Calls the batch iterator to evaluate the model on the evaluation dataloader.

        Raises:
            ValueError: if the trainer was created without an eval_dataloader.
        """
        if self.eval_dataloader is None:
            raise ValueError("evaluate() requires an eval_dataloader, but none was provided.")
        self.iteration(0, self.eval_dataloader, train=False)

    def _forward(self, batch):
        """ Tokenizes the text of a batch and returns the logits of the model for it."""
        batch_t = self.tokenizer(
            batch['text'],
            padding='max_length', 
            max_length=512, 
            truncation=True,
            return_tensors='pt', 
        )
        model_inputs = {
            "input_ids": batch_t["input_ids"].to(self.device),
            "token_type_ids": batch_t["token_type_ids"].to(self.device),
            "attention_mask": batch_t["attention_mask"].to(self.device),
        }

        # the tabular features are only read for tabular models, so text-only
        # dataloaders do not need to provide a "tabular" key
        if self.tabular:
            model_inputs["tabular_vectors"] = batch["tabular"].to(self.device)

        return self.model(**model_inputs)

    def _save_checkpoint(self, epoch, avg_loss_epoch):
        """ Saves model and optimizer state for `epoch` and removes the previous best checkpoint."""

        # create directory and filepaths
        dir_path = Path(self.output_dir)
        dir_path.mkdir(parents=True, exist_ok=True)
        file_path = dir_path / f"{self.output_filename}_epoch_{epoch}.pt"

        # delete previous best model from hard drive (pure Python replacement for `rm -f`,
        # so the class also works outside IPython and on platforms without `rm`)
        if epoch > 0:
            file_path_best_model = dir_path / f"{self.output_filename}_epoch_{self.epoch_best_model}.pt"
            file_path_best_model.unlink(missing_ok=True)

        # save model
        torch.save({
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict()
        }, file_path)

        # update the new best loss and epoch
        self.eval_loss = avg_loss_epoch
        self.epoch_best_model = epoch
    
    def iteration(self, epoch, data_loader, train=True):
        """ Iterates through one epoch of training or evaluation.

        Args:
            epoch: int = epoch index, used for the progress bar and the checkpoint file name,
            data_loader: dataloader yielding batches with "text" and "label" keys,
            train: bool = True to update the weights, False to only compute the metrics.

        Prints the accumulated metrics and, when `save` is enabled and the evaluation loss
        improved on the best loss seen so far, writes a checkpoint.
        """
        
        # initialize variables
        loss_accumulated = 0.
        correct_accumulated = 0
        samples_accumulated = 0
        preds_all = []
        labels_all = []
        
        # switch between training and evaluation behavior of the model
        self.model.train(train)
        
        # progress bar
        mode = "train" if train else "eval"
        batch_iter = tqdm.tqdm(
            data_loader,
            desc=f"EP ({mode}) {epoch}",
            total=len(data_loader),
            bar_format="{l_bar}{r_bar}"
        )
        
        # iterate through batches of the dataset
        for batch in batch_iter:
            labels = batch["label"].to(self.device)

            # gradients are only tracked while training; evaluation skips building the autograd graph
            with torch.set_grad_enabled(train):
                logits = self._forward(batch)
                loss = self.loss_fn(logits, labels)
    
            # compute gradient and update weights
            if train:
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
            
            # calculate the number of correct predictions
            preds = logits.argmax(dim=-1)
            correct = preds.eq(labels).sum().item()
            
            # accumulate batch metrics and outputs
            loss_accumulated += loss.item()
            correct_accumulated += correct
            samples_accumulated += len(labels)
            preds_all.append(preds.detach())
            labels_all.append(labels.detach())
        
        # concatenate all batch tensors into one tensor and move to cpu for compatibility with sklearn metrics
        preds_all = torch.cat(preds_all, dim=0).cpu()
        labels_all = torch.cat(labels_all, dim=0).cpu()
        
        # metrics
        accuracy = accuracy_score(labels_all, preds_all)
        precision = precision_score(labels_all, preds_all, average='macro')
        recall = recall_score(labels_all, preds_all, average='macro')
        f1 = f1_score(labels_all, preds_all, average='macro')
        avg_loss_epoch = loss_accumulated / len(data_loader)
        
        # print metrics to console
        print(
            f"samples={samples_accumulated}, \
            correct={correct_accumulated}, \
            acc={round(accuracy, 4)}, \
            recall={round(recall, 4)}, \
            prec={round(precision,4)}, \
            f1={round(f1, 4)}, \
            loss={round(avg_loss_epoch, 4)}"
        )    
        
        # save the model if the evaluation loss is lower than the previous best epoch 
        if self.save and not train and avg_loss_epoch < self.eval_loss:
            self._save_checkpoint(epoch, avg_loss_epoch)

## STEP 2: Fine-Tuning

### BERT-base

#### Instantiate the pretrained model.

In [4]:
# load the tokenizer and the pretrained weights of the same checkpoint
tokenizer_base = BertTokenizer.from_pretrained('bert-base-uncased')

# these config values are the defaults; they are only spelled out for explicitness
bert_base = MyBertForSequenceClassification.from_pretrained(
    config_args=dict(vocab_size=30522, n_classes=2),
    model_type='bert-base-uncased',
)

Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Loading weights from pretrained model: bert-base-uncased


Downloading pytorch_model.bin:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

#### Fine-Tune the model. 

The trainer saves the model from the epoch with the lowest validation loss.

In [6]:
# BERT-base: fine-tune all weights and validate after every epoch;
# save=True keeps the checkpoint of the epoch with the lowest validation loss
trainer_bert_base = BertTrainer(
    model=bert_base,
    tokenizer=tokenizer_base,
    train_dataloader=train_dataloader,
    eval_dataloader=val_dataloader,
    epochs=5,
    lr=5e-06,
    output_dir='../models/bert_base_fine_tuned',
    output_filename='bert_base',
    save=True,
)
trainer_bert_base.train(evaluate=True)

EP (train) 0: 100%|| 3507/3507 [48:24<00:00,  1.21it/s]


samples=112217,             correct=85975,             acc=0.7661,             recall=0.7662,             prec=0.7663,             f1=0.7661,             loss=0.4903


EP (eval) 0: 100%|| 439/439 [02:25<00:00,  3.01it/s]


samples=14027,             correct=11013,             acc=0.7851,             recall=0.7853,             prec=0.7856,             f1=0.7851,             loss=0.4643


EP (train) 1: 100%|| 3507/3507 [48:27<00:00,  1.21it/s]


samples=112217,             correct=89764,             acc=0.7999,             recall=0.7999,             prec=0.7999,             f1=0.7999,             loss=0.4392


EP (eval) 1: 100%|| 439/439 [02:25<00:00,  3.01it/s]


samples=14027,             correct=11034,             acc=0.7866,             recall=0.7869,             prec=0.7877,             f1=0.7865,             loss=0.462


EP (train) 2: 100%|| 3507/3507 [48:29<00:00,  1.21it/s]


samples=112217,             correct=92032,             acc=0.8201,             recall=0.8201,             prec=0.8201,             f1=0.8201,             loss=0.4056


EP (eval) 2: 100%|| 439/439 [02:25<00:00,  3.01it/s]


samples=14027,             correct=11081,             acc=0.79,             recall=0.7901,             prec=0.7902,             f1=0.79,             loss=0.4624


EP (train) 3: 100%|| 3507/3507 [48:31<00:00,  1.20it/s]


samples=112217,             correct=94453,             acc=0.8417,             recall=0.8417,             prec=0.8417,             f1=0.8417,             loss=0.3671


EP (eval) 3: 100%|| 439/439 [02:26<00:00,  3.00it/s]


samples=14027,             correct=10886,             acc=0.7761,             recall=0.7767,             prec=0.7816,             f1=0.7752,             loss=0.4853


EP (train) 4: 100%|| 3507/3507 [48:34<00:00,  1.20it/s]


samples=112217,             correct=97186,             acc=0.8661,             recall=0.8661,             prec=0.8661,             f1=0.8661,             loss=0.3244


EP (eval) 4: 100%|| 439/439 [02:25<00:00,  3.01it/s]

samples=14027,             correct=10998,             acc=0.7841,             recall=0.7842,             prec=0.7842,             f1=0.7841,             loss=0.5039





#### Evaluate the model from the best epoch on the test dataset.


Replace the checkpoint path passed to `torch.load` in the cell below with the `.pt` file that the trainer saved in the `/models` directory (its file name contains the epoch with the lowest validation loss).

In [18]:
# copy weights from the saved fine-tuned model
# map_location='cpu' allows a checkpoint that was saved on a GPU to be loaded on a machine
# without one; load_state_dict then copies the values into the model's existing parameters
state_dict = torch.load(
    '../models/bert_base_fine_tuned/bert_base_epoch_1.pt',  # replace with .pt file from models dir
    map_location='cpu',
)
bert_base.load_state_dict(state_dict["model_state_dict"])

# trainer - only used for evaluation here, so the test set is passed as the eval dataloader
# and nothing is saved
trainer_bert_base = BertTrainer(
    bert_base,
    tokenizer_base,
    lr=5e-06,
    epochs=5,
    train_dataloader=train_dataloader,
    eval_dataloader=test_dataloader,
    output_dir='../models/bert_base_fine_tuned',
    output_filename='bert_base',
    save=False,
)

# evaluate on test set
trainer_bert_base.evaluate()

EP (eval) 0: 100%|| 439/439 [02:25<00:00,  3.01it/s]

samples=14028,             correct=11022,             acc=0.7857,             recall=0.7855,             prec=0.7862,             f1=0.7855,             loss=0.4594





### BERTOverflow

In [4]:
# the tokenizer and the model weights are loaded from the same checkpoint name
overflow_checkpoint = 'lanwuwei/BERTOverflow_stackoverflow_github'
tokenizer_overflow = BertTokenizer.from_pretrained(overflow_checkpoint)
bert_overflow = MyBertForSequenceClassification.from_pretrained(
    config_args=dict(vocab_size=82000, n_classes=2, pad_token_id=0),
    model_type=overflow_checkpoint,
)

Downloading vocab.txt:   0%|          | 0.00/644k [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/3.16k [00:00<?, ?B/s]

Loading weights from pretrained model: lanwuwei/BERTOverflow_stackoverflow_github


Downloading pytorch_model.bin:   0%|          | 0.00/568M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at lanwuwei/BERTOverflow_stackoverflow_github and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# BERTOverflow: fine-tune all weights and validate after every epoch;
# save=True keeps the checkpoint of the epoch with the lowest validation loss
trainer_bert_overflow = BertTrainer(
    model=bert_overflow,
    tokenizer=tokenizer_overflow,
    train_dataloader=train_dataloader,
    eval_dataloader=val_dataloader,
    epochs=5,
    lr=5e-06,
    output_dir='../models/bert_overflow_fine_tuned',
    output_filename='bert_overflow',
    save=True,
)
trainer_bert_overflow.train(evaluate=True)

EP (train) 0: 100%|| 3507/3507 [47:24<00:00,  1.23it/s]


samples=112217,             correct=78125,             acc=0.6962,             recall=0.6962,             prec=0.6967,             f1=0.696,             loss=0.5828


EP (eval) 0: 100%|| 439/439 [02:17<00:00,  3.18it/s]


samples=14027,             correct=10303,             acc=0.7345,             recall=0.7347,             prec=0.7352,             f1=0.7344,             loss=0.5331


EP (train) 1: 100%|| 3507/3507 [47:25<00:00,  1.23it/s]


samples=112217,             correct=84117,             acc=0.7496,             recall=0.7496,             prec=0.7496,             f1=0.7496,             loss=0.5139


EP (eval) 1: 100%|| 439/439 [02:17<00:00,  3.18it/s]


samples=14027,             correct=10500,             acc=0.7486,             recall=0.7487,             prec=0.749,             f1=0.7485,             loss=0.5107


EP (train) 2: 100%|| 3507/3507 [47:27<00:00,  1.23it/s]


samples=112217,             correct=86506,             acc=0.7709,             recall=0.7709,             prec=0.7709,             f1=0.7709,             loss=0.4854


EP (eval) 2: 100%|| 439/439 [02:17<00:00,  3.18it/s]


samples=14027,             correct=10605,             acc=0.756,             recall=0.7562,             prec=0.7564,             f1=0.756,             loss=0.5035


EP (train) 3: 100%|| 3507/3507 [47:28<00:00,  1.23it/s]


samples=112217,             correct=88250,             acc=0.7864,             recall=0.7864,             prec=0.7864,             f1=0.7864,             loss=0.46


EP (eval) 3: 100%|| 439/439 [02:18<00:00,  3.17it/s]


samples=14027,             correct=10627,             acc=0.7576,             recall=0.7574,             prec=0.7579,             f1=0.7574,             loss=0.5025


EP (train) 4: 100%|| 3507/3507 [47:27<00:00,  1.23it/s]


samples=112217,             correct=89663,             acc=0.799,             recall=0.799,             prec=0.799,             f1=0.799,             loss=0.439


EP (eval) 4: 100%|| 439/439 [02:18<00:00,  3.18it/s]

samples=14027,             correct=10459,             acc=0.7456,             recall=0.7464,             prec=0.7539,             f1=0.7439,             loss=0.5247





In [16]:
# copy weights from the saved fine-tuned model
# map_location='cpu' allows a checkpoint that was saved on a GPU to be loaded on a machine
# without one; load_state_dict then copies the values into the model's existing parameters
state_dict = torch.load(
    '../models/bert_overflow_fine_tuned/bert_overflow_epoch_3.pt',  # replace with .pt file from models dir
    map_location='cpu',
)
bert_overflow.load_state_dict(state_dict["model_state_dict"])

# trainer - only used for evaluation here, so the test set is passed as the eval dataloader
trainer_bert_overflow = BertTrainer(
    bert_overflow,
    tokenizer_overflow,
    lr=5e-06,
    epochs=5,
    train_dataloader=train_dataloader,
    eval_dataloader=test_dataloader,
    output_dir='../models/bert_overflow_fine_tuned',
    output_filename='bert_overflow',
    save=False,
)

# evaluate on test set
trainer_bert_overflow.evaluate()

EP (eval) 0: 100%|| 439/439 [02:18<00:00,  3.17it/s]

samples=14028,             correct=10685,             acc=0.7617,             recall=0.7619,             prec=0.7624,             f1=0.7616,             loss=0.5061





### BERT Tabular

In [4]:
class TabularBertForSequenceClassification(torch.nn.Module):
    """
    Sequence classifier that wraps a BERT model and feeds its classification head with the
    pooler output (pooled last hidden state) concatenated with a vector of tabular features.
    
    """

    def __init__(self, bert_model, tabular_size):
        """
        Args:
            bert_model: torch.nn.Module: = A PyTorch model with a BERT like architecture whose forward pass
                returns a pair with the pooled embedding first, and whose config exposes emb_size and n_classes
            tabular_size: int = Number of tabular features appended to the pooled embedding
        """
        super().__init__()
        self.bert_model = bert_model

        # sizes are read from the config of the wrapped model
        wrapped_config = bert_model.config
        self.emb_size = wrapped_config.emb_size
        self.n_classes = wrapped_config.n_classes

        # the head replaces the wrapped model's classifier and sees embedding + tabular features
        head_in_features = self.emb_size + tabular_size
        self.classifier_replacement = torch.nn.Linear(head_in_features, self.n_classes)


    def forward(self, input_ids, token_type_ids, tabular_vectors, attention_mask=None):
        """ Returns the logits computed from the pooled embedding joined with the tabular features."""
        pooled_embedding, _unused = self.bert_model(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
        )

        # join along the feature (last) dimension, then classify
        head_input = torch.cat((pooled_embedding, tabular_vectors), dim=-1)
        return self.classifier_replacement(head_input)

In [5]:
tokenizer_base = BertTokenizer.from_pretrained('bert-base-uncased')

# NOTE(review): "return_pooler_output" presumably makes the model return the pooled embedding
# that TabularBertForSequenceClassification unpacks - confirm in bert_from_scratch
bert_base = MyBertForSequenceClassification.from_pretrained(
    model_type='bert-base-uncased',
    config_args={"vocab_size": 30522, "n_classes": 2, "return_pooler_output": True}
)

# width of the tabular feature vector; the row is indexed first so that only a single example
# is read instead of materializing the whole 'tabular' column
tabular_size = len(train_dataset[0]['tabular'])
bert_base_tabular = TabularBertForSequenceClassification(bert_base, tabular_size)

Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Loading weights from pretrained model: bert-base-uncased


Downloading pytorch_model.bin:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [7]:
# BERT-base with tabular features: tabular=True makes the trainer pass the tabular vectors to the model;
# save=True keeps the checkpoint of the epoch with the lowest validation loss
trainer_bert_base_tabular = BertTrainer(
    model=bert_base_tabular,
    tokenizer=tokenizer_base,
    train_dataloader=train_dataloader,
    eval_dataloader=val_dataloader,
    epochs=5,
    lr=5e-06,
    output_dir='../models/bert_base_fine_tuned_tabular',
    output_filename='bert_base_tabular',
    save=True,
    tabular=True,
)
trainer_bert_base_tabular.train(evaluate=True)

EP (train) 0: 100%|| 3507/3507 [48:13<00:00,  1.21it/s]


samples=112217,             correct=86075,             acc=0.767,             recall=0.767,             prec=0.7671,             f1=0.767,             loss=0.4894


EP (eval) 0: 100%|| 439/439 [02:24<00:00,  3.03it/s]


samples=14027,             correct=10972,             acc=0.7822,             recall=0.7821,             prec=0.7823,             f1=0.7821,             loss=0.4636


EP (train) 1: 100%|| 3507/3507 [48:13<00:00,  1.21it/s]


samples=112217,             correct=89652,             acc=0.7989,             recall=0.7989,             prec=0.799,             f1=0.7989,             loss=0.4401


EP (eval) 1: 100%|| 439/439 [02:24<00:00,  3.03it/s]


samples=14027,             correct=11103,             acc=0.7915,             recall=0.7916,             prec=0.7915,             f1=0.7915,             loss=0.4561


EP (train) 2: 100%|| 3507/3507 [48:12<00:00,  1.21it/s]


samples=112217,             correct=91891,             acc=0.8189,             recall=0.8189,             prec=0.8189,             f1=0.8189,             loss=0.4073


EP (eval) 2: 100%|| 439/439 [02:24<00:00,  3.03it/s]


samples=14027,             correct=11075,             acc=0.7895,             recall=0.7893,             prec=0.7904,             f1=0.7893,             loss=0.4566


EP (train) 3: 100%|| 3507/3507 [48:13<00:00,  1.21it/s]


samples=112217,             correct=94457,             acc=0.8417,             recall=0.8417,             prec=0.8417,             f1=0.8417,             loss=0.3682


EP (eval) 3: 100%|| 439/439 [02:24<00:00,  3.03it/s]


samples=14027,             correct=11014,             acc=0.7852,             recall=0.7848,             prec=0.7867,             f1=0.7848,             loss=0.4814


EP (train) 4: 100%|| 3507/3507 [48:10<00:00,  1.21it/s]


samples=112217,             correct=97005,             acc=0.8644,             recall=0.8644,             prec=0.8644,             f1=0.8644,             loss=0.3245


EP (eval) 4: 100%|| 439/439 [02:25<00:00,  3.03it/s]

samples=14027,             correct=10966,             acc=0.7818,             recall=0.7815,             prec=0.7829,             f1=0.7814,             loss=0.5054





In [8]:
# copy weights from the saved fine-tuned model
# map_location='cpu' allows a checkpoint that was saved on a GPU to be loaded on a machine
# without one; load_state_dict then copies the values into the model's existing parameters
state_dict = torch.load(
    '../models/bert_base_fine_tuned_tabular/bert_base_tabular_epoch_1.pt',  # replace with .pt file from models dir
    map_location='cpu',
)
bert_base_tabular.load_state_dict(state_dict["model_state_dict"])

# trainer - only used for evaluation here, so the test set is passed as the eval dataloader
trainer_bert_base_tabular = BertTrainer(
    bert_base_tabular,
    tokenizer_base,
    lr=5e-06,
    epochs=5,
    train_dataloader=train_dataloader,
    eval_dataloader=test_dataloader,
    output_dir='../models/bert_base_fine_tuned_tabular',
    output_filename='bert_base_tabular',
    save=False,
    tabular=True,
)

# evaluate on test set
trainer_bert_base_tabular.evaluate()

EP (eval) 0: 100%|| 1754/1754 [02:33<00:00, 11.46it/s]

samples=14028,             correct=11064,             acc=0.7887,             recall=0.7887,             prec=0.7887,             f1=0.7887,             loss=0.4558





### LoRA BERT-base
#### rank = 8

In [11]:
# start again from freshly loaded pretrained weights
tokenizer_base = BertTokenizer.from_pretrained('bert-base-uncased')
bert_base = MyBertForSequenceClassification.from_pretrained(
    # these config values are the defaults; they are only spelled out for explicitness
    config_args=dict(vocab_size=30522, n_classes=2),
    model_type='bert-base-uncased',
)

# LoRA hyperparameters of this experiment
lora_rank = 8
lora_scaling_alpha = 16

# inject the LoRA layers into the model, then freeze the non-LoRA parameters
add_lora_layers(bert_base, r=lora_rank, lora_alpha=lora_scaling_alpha)
freeze_model(bert_base)

Loading weights from pretrained model: bert-base-uncased


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Count how many of the parameters in the model are trainable.

In [16]:
# collect the size of every parameter tensor together with its trainable flag
param_sizes = [(param.numel(), param.requires_grad) for param in bert_base.parameters()]

# total number of parameters and the subset that still receives gradients
n_params = sum(size for size, _ in param_sizes)
n_trainable_params = sum(size for size, is_trainable in param_sizes if is_trainable)

print(f"Total parameters: {n_params}")
print(f"Trainable parameters: {n_trainable_params}")
print(f"Percentage trainable: {round(n_trainable_params / n_params * 100, 2)}%")

Total parameters: 109778690
Trainable parameters: 296450
Percentage trainable: 0.27%


In [7]:
# BERT-base with LoRA on all layers, rank = 8
trainer_bert_base_lora = BertTrainer(
    bert_base,
    tokenizer_base,
    # data: fit on the training split, monitor on the validation split
    train_dataloader=train_dataloader,
    eval_dataloader=val_dataloader,
    # optimisation settings
    lr=5e-04,
    epochs=5,
    # checkpointing: save=True with the target directory and filename stem
    save=True,
    output_dir='../models/bert_base_fine_tuned_lora_r8',
    output_filename='bert_base_lora_r8',
)

# Run training with evaluation enabled.
trainer_bert_base_lora.train(evaluate=True)

EP (train) 0: 100%|| 3507/3507 [38:32<00:00,  1.52it/s]


samples=112217,             correct=86105,             acc=0.7673,             recall=0.7673,             prec=0.7674,             f1=0.7673,             loss=0.4879


EP (eval) 0: 100%|| 439/439 [02:28<00:00,  2.95it/s]


samples=14027,             correct=10730,             acc=0.765,             recall=0.764,             prec=0.776,             f1=0.7621,             loss=0.4885


EP (train) 1: 100%|| 3507/3507 [38:33<00:00,  1.52it/s]


samples=112217,             correct=89196,             acc=0.7949,             recall=0.7949,             prec=0.7949,             f1=0.7949,             loss=0.4482


EP (eval) 1: 100%|| 439/439 [02:29<00:00,  2.94it/s]


samples=14027,             correct=10908,             acc=0.7776,             recall=0.7777,             prec=0.7777,             f1=0.7776,             loss=0.468


EP (train) 2: 100%|| 3507/3507 [38:33<00:00,  1.52it/s]


samples=112217,             correct=90999,             acc=0.8109,             recall=0.8109,             prec=0.8109,             f1=0.8109,             loss=0.4214


EP (eval) 2: 100%|| 439/439 [02:28<00:00,  2.95it/s]


samples=14027,             correct=10986,             acc=0.7832,             recall=0.7834,             prec=0.7839,             f1=0.7831,             loss=0.4723


EP (train) 3: 100%|| 3507/3507 [38:35<00:00,  1.51it/s]


samples=112217,             correct=92469,             acc=0.824,             recall=0.824,             prec=0.824,             f1=0.824,             loss=0.3965


EP (eval) 3: 100%|| 439/439 [02:29<00:00,  2.94it/s]


samples=14027,             correct=10937,             acc=0.7797,             recall=0.7799,             prec=0.7802,             f1=0.7797,             loss=0.4713


EP (train) 4: 100%|| 3507/3507 [38:34<00:00,  1.52it/s]


samples=112217,             correct=94023,             acc=0.8379,             recall=0.8379,             prec=0.8379,             f1=0.8379,             loss=0.3718


EP (eval) 4: 100%|| 439/439 [02:28<00:00,  2.95it/s]

samples=14027,             correct=10862,             acc=0.7744,             recall=0.7745,             prec=0.7747,             f1=0.7743,             loss=0.5011





In [17]:
# Restore the epoch-1 checkpoint of the rank-8 LoRA run.
# map_location="cpu" lets a checkpoint saved from a GPU run be deserialized on
# any machine; load_state_dict then copies the values into the model's own
# parameters, whatever device they live on.
state_dict = torch.load(
    "../models/bert_base_fine_tuned_lora_r8/bert_base_lora_r8_epoch_1.pt",
    map_location="cpu",
)
bert_base.load_state_dict(state_dict["model_state_dict"])

# merge weights, then undo the freeze applied before LoRA training
merge_lora_layers(bert_base)
unfreeze_model(bert_base)

# create directory and filepaths
dir_path = Path("../models/bert_base_fine_tuned_lora_r8/merged")
dir_path.mkdir(parents=True, exist_ok=True)
file_path = dir_path / "bert_base_lora_r8_epoch_1_merged.pt"

# save model (same {"model_state_dict": ...} layout as the checkpoint read above)
torch.save({
    "model_state_dict": bert_base.state_dict(),
}, file_path)

In [9]:
# Score the merged rank-8 model on the held-out test split.
# Only evaluate() is called on this trainer and save=False, so lr / epochs /
# output_* are presumably unused here -- TODO confirm against BertTrainer.
trainer_bert_base_lora_r8 = BertTrainer(
    bert_base,
    tokenizer_base,
    # data: the evaluation loader is the test split
    train_dataloader=train_dataloader,
    eval_dataloader=test_dataloader,
    # optimisation settings
    lr=5e-06,
    epochs=5,
    # checkpointing disabled
    save=False,
    output_dir='../models/bert_base_fine_tuned_lora_r8',
    output_filename='bert_base_lora_r8',
)

trainer_bert_base_lora_r8.evaluate()

EP (eval) 0: 100%|| 439/439 [02:22<00:00,  3.07it/s]

samples=14028,             correct=10985,             acc=0.7831,             recall=0.7831,             prec=0.7831,             f1=0.7831,             loss=0.4643





### LoRA BERT-base
#### rank = 1

In [18]:
# Start again from the pretrained tokenizer and the from-scratch BERT classifier.
tokenizer_base = BertTokenizer.from_pretrained('bert-base-uncased')
bert_base = MyBertForSequenceClassification.from_pretrained(
    model_type='bert-base-uncased',
    # these are the default configs; they are spelled out only for explicitness
    config_args=dict(vocab_size=30522, n_classes=2),
)

# Inject LoRA layers with rank 1 (alpha 2), then freeze the model
# (the non-LoRA parameters, per the rank-8 section).
add_lora_layers(bert_base, lora_alpha=2, r=1)
freeze_model(bert_base)

Loading weights from pretrained model: bert-base-uncased


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [19]:
# Total number of scalar weights across every parameter tensor of the model.
n_params = sum(param.numel() for _, param in bert_base.named_parameters())

# Same count, restricted to tensors that still receive gradients.
n_trainable_params = sum(
    param.numel()
    for _, param in bert_base.named_parameters()
    if param.requires_grad
)

print(f"Total parameters: {n_params}")
print(f"Trainable parameters: {n_trainable_params}")
print(f"Percentage trainable: {round(n_trainable_params / n_params * 100, 2)}%")

Total parameters: 109520642
Trainable parameters: 38402
Percentage trainable: 0.04%


In [6]:
# BERT-base with LoRA on all layers, rank = 1
trainer_bert_base_lora_r1 = BertTrainer(
    bert_base,
    tokenizer_base,
    # data: fit on the training split, monitor on the validation split
    train_dataloader=train_dataloader,
    eval_dataloader=val_dataloader,
    # optimisation settings
    lr=5e-04,
    epochs=5,
    # checkpointing: save=True with the target directory and filename stem
    save=True,
    output_dir='../models/bert_base_fine_tuned_lora_r1',
    output_filename='bert_base_lora_r1',
)

# Run training with evaluation enabled.
trainer_bert_base_lora_r1.train(evaluate=True)

EP (train) 0: 100%|| 3507/3507 [38:25<00:00,  1.52it/s]


samples=112217,             correct=85181,             acc=0.7591,             recall=0.7591,             prec=0.7592,             f1=0.7591,             loss=0.4981


EP (eval) 0: 100%|| 439/439 [02:27<00:00,  2.97it/s]


samples=14027,             correct=10844,             acc=0.7731,             recall=0.7732,             prec=0.7733,             f1=0.7731,             loss=0.4921


EP (train) 1: 100%|| 3507/3507 [38:25<00:00,  1.52it/s]


samples=112217,             correct=88112,             acc=0.7852,             recall=0.7852,             prec=0.7852,             f1=0.7852,             loss=0.4616


EP (eval) 1: 100%|| 439/439 [02:28<00:00,  2.95it/s]


samples=14027,             correct=10799,             acc=0.7699,             recall=0.7706,             prec=0.7767,             f1=0.7688,             loss=0.4891


EP (train) 2: 100%|| 3507/3507 [38:24<00:00,  1.52it/s]


samples=112217,             correct=89344,             acc=0.7962,             recall=0.7962,             prec=0.7962,             f1=0.7962,             loss=0.4453


EP (eval) 2: 100%|| 439/439 [02:28<00:00,  2.96it/s]


samples=14027,             correct=10937,             acc=0.7797,             recall=0.7798,             prec=0.7798,             f1=0.7797,             loss=0.47


EP (train) 3: 100%|| 3507/3507 [38:22<00:00,  1.52it/s]


samples=112217,             correct=90204,             acc=0.8038,             recall=0.8038,             prec=0.8038,             f1=0.8038,             loss=0.431


EP (eval) 3: 100%|| 439/439 [02:28<00:00,  2.96it/s]


samples=14027,             correct=10917,             acc=0.7783,             recall=0.7781,             prec=0.7788,             f1=0.7781,             loss=0.4723


EP (train) 4: 100%|| 3507/3507 [38:27<00:00,  1.52it/s]


samples=112217,             correct=91059,             acc=0.8115,             recall=0.8115,             prec=0.8115,             f1=0.8115,             loss=0.418


EP (eval) 4: 100%|| 439/439 [02:28<00:00,  2.95it/s]

samples=14027,             correct=10891,             acc=0.7764,             recall=0.7764,             prec=0.7764,             f1=0.7764,             loss=0.4778





In [6]:
# Restore the epoch-2 checkpoint of the rank-1 LoRA run.
# map_location="cpu" lets a checkpoint saved from a GPU run be deserialized on
# any machine; load_state_dict then copies the values into the model's own
# parameters, whatever device they live on.
state_dict = torch.load(
    "../models/bert_base_fine_tuned_lora_r1/bert_base_lora_r1_epoch_2.pt",
    map_location="cpu",
)
bert_base.load_state_dict(state_dict["model_state_dict"])

# merge weights, then undo the freeze applied before LoRA training
merge_lora_layers(bert_base)
unfreeze_model(bert_base)

# create directory and filepaths
dir_path = Path("../models/bert_base_fine_tuned_lora_r1/merged")
dir_path.mkdir(parents=True, exist_ok=True)
file_path = dir_path / "bert_base_lora_r1_epoch_2_merged.pt"

# save model (same {"model_state_dict": ...} layout as the checkpoint read above)
torch.save({
    "model_state_dict": bert_base.state_dict(),
}, file_path)

In [8]:
# Score the merged rank-1 model on the held-out test split.
# Only evaluate() is called on this trainer and save=False, so lr / epochs /
# output_* are presumably unused here -- TODO confirm against BertTrainer.
trainer_bert_base_lora_r1 = BertTrainer(
    bert_base,
    tokenizer_base,
    # data: the evaluation loader is the test split
    train_dataloader=train_dataloader,
    eval_dataloader=test_dataloader,
    # optimisation settings
    lr=5e-06,
    epochs=5,
    # checkpointing disabled
    save=False,
    output_dir='../models/bert_base_fine_tuned_lora_r1',
    output_filename='bert_base_lora_r1',
)

trainer_bert_base_lora_r1.evaluate()

EP (eval) 0: 100%|| 1754/1754 [02:54<00:00, 10.04it/s]

samples=14028,             correct=10954,             acc=0.7809,             recall=0.7808,             prec=0.7809,             f1=0.7808,             loss=0.4648





### LoRA BERT-base
#### Last 4 layers only, rank = 8

In [20]:
# Start again from the pretrained tokenizer and the from-scratch BERT classifier.
tokenizer_base = BertTokenizer.from_pretrained('bert-base-uncased')
bert_base = MyBertForSequenceClassification.from_pretrained(
    model_type='bert-base-uncased',
    # these are the default configs; they are spelled out only for explicitness
    config_args=dict(vocab_size=30522, n_classes=2),
)

# Inject rank-8 LoRA layers while ignoring layers 0-7, so that only the last
# four layers get adapters (per this section's heading), then freeze the model.
add_lora_layers(
    bert_base,
    ignore_layers=list(range(8)),
    lora_alpha=16,
    r=8,
)
freeze_model(bert_base)

Loading weights from pretrained model: bert-base-uncased


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [21]:
# Total number of scalar weights across every parameter tensor of the model.
n_params = sum(param.numel() for _, param in bert_base.named_parameters())

# Same count, restricted to tensors that still receive gradients.
n_trainable_params = sum(
    param.numel()
    for _, param in bert_base.named_parameters()
    if param.requires_grad
)

print(f"Total parameters: {n_params}")
print(f"Trainable parameters: {n_trainable_params}")
print(f"Percentage trainable: {round(n_trainable_params / n_params * 100, 2)}%")

Total parameters: 109582082
Trainable parameters: 99842
Percentage trainable: 0.09%


In [9]:
# BERT-base with LoRA on the last 4 layers only, rank = 8
trainer_bert_base_lora_r8_last_4_layers = BertTrainer(
    bert_base,
    tokenizer_base,
    # data: fit on the training split, monitor on the validation split
    train_dataloader=train_dataloader,
    eval_dataloader=val_dataloader,
    # optimisation settings
    lr=5e-04,
    epochs=5,
    # checkpointing: save=True with the target directory and filename stem
    save=True,
    output_dir='../models/bert_base_fine_tuned_lora_r8_last_4_layers',
    output_filename='bert_base_lora_r8_last_4_layers',
)

# Run training with evaluation enabled.
trainer_bert_base_lora_r8_last_4_layers.train(evaluate=True)

EP (train) 0: 100%|| 3507/3507 [25:33<00:00,  2.29it/s]


samples=112217,             correct=85706,             acc=0.7638,             recall=0.7638,             prec=0.7638,             f1=0.7637,             loss=0.494


EP (eval) 0: 100%|| 439/439 [02:26<00:00,  3.00it/s]


samples=14027,             correct=10821,             acc=0.7714,             recall=0.7713,             prec=0.7717,             f1=0.7713,             loss=0.4892


EP (train) 1: 100%|| 3507/3507 [25:27<00:00,  2.30it/s]


samples=112217,             correct=88350,             acc=0.7873,             recall=0.7873,             prec=0.7873,             f1=0.7873,             loss=0.4602


EP (eval) 1: 100%|| 439/439 [02:25<00:00,  3.02it/s]


samples=14027,             correct=10911,             acc=0.7779,             recall=0.7773,             prec=0.7822,             f1=0.7767,             loss=0.4802


EP (train) 2: 100%|| 3507/3507 [25:27<00:00,  2.30it/s]


samples=112217,             correct=89349,             acc=0.7962,             recall=0.7962,             prec=0.7962,             f1=0.7962,             loss=0.445


EP (eval) 2: 100%|| 439/439 [02:25<00:00,  3.01it/s]


samples=14027,             correct=10958,             acc=0.7812,             recall=0.7813,             prec=0.7814,             f1=0.7812,             loss=0.4718


EP (train) 3: 100%|| 3507/3507 [25:29<00:00,  2.29it/s]


samples=112217,             correct=90434,             acc=0.8059,             recall=0.8059,             prec=0.8059,             f1=0.8059,             loss=0.4287


EP (eval) 3: 100%|| 439/439 [02:26<00:00,  3.01it/s]


samples=14027,             correct=10912,             acc=0.7779,             recall=0.7785,             prec=0.7819,             f1=0.7774,             loss=0.4825


EP (train) 4: 100%|| 3507/3507 [25:28<00:00,  2.29it/s]


samples=112217,             correct=91471,             acc=0.8151,             recall=0.8151,             prec=0.8151,             f1=0.8151,             loss=0.4124


EP (eval) 4: 100%|| 439/439 [02:25<00:00,  3.02it/s]


samples=14027,             correct=10928,             acc=0.7791,             recall=0.7789,             prec=0.7793,             f1=0.7789,             loss=0.4691


In [11]:
# Restore the epoch-4 checkpoint of the last-4-layers rank-8 LoRA run.
# map_location="cpu" lets a checkpoint saved from a GPU run be deserialized on
# any machine; load_state_dict then copies the values into the model's own
# parameters, whatever device they live on.
state_dict = torch.load(
    "../models/bert_base_fine_tuned_lora_r8_last_4_layers/bert_base_lora_r8_last_4_layers_epoch_4.pt",
    map_location="cpu",
)
bert_base.load_state_dict(state_dict["model_state_dict"])

# merge weights, then undo the freeze applied before LoRA training
merge_lora_layers(bert_base)
unfreeze_model(bert_base)

# create directory and filepaths
dir_path = Path("../models/bert_base_fine_tuned_lora_r8_last_4_layers/merged")
dir_path.mkdir(parents=True, exist_ok=True)
file_path = dir_path / "bert_base_lora_r8_last_4_layers_epoch_4_merged.pt"

# save model (same {"model_state_dict": ...} layout as the checkpoint read above)
torch.save({
    "model_state_dict": bert_base.state_dict(),
}, file_path)

In [12]:
# Score the merged last-4-layers rank-8 model on the held-out test split.
# Only evaluate() is called on this trainer and save=False, so lr / epochs /
# output_* are presumably unused here -- TODO confirm against BertTrainer.
trainer_bert_base_lora_r8_last_4_layers = BertTrainer(
    bert_base,
    tokenizer_base,
    lr=5e-06,
    epochs=5,
    # Keep the training split as the training loader, as in the other
    # evaluation cells: the test split must never be used as training data,
    # even if .train() is later called on this trainer by accident.
    train_dataloader=train_dataloader,
    eval_dataloader=test_dataloader,
    output_dir='../models/bert_base_fine_tuned_lora_r8_last_4_layers',
    output_filename='bert_base_lora_r8_last_4_layers',
    save=False,
)

trainer_bert_base_lora_r8_last_4_layers.evaluate()

EP (eval) 0: 100%|| 1754/1754 [02:53<00:00, 10.10it/s]

samples=14028,             correct=10958,             acc=0.7812,             recall=0.7813,             prec=0.7817,             f1=0.7811,             loss=0.4713



