# Lightweight Fine-Tuning Project

This cell summarizes the choices made for this project:

* PEFT technique: DoRA: "By employing DoRA, we enhance both the learning capacity and training stability of LoRA while avoiding any additional inference overhead."
* Model: ModernBERT: "In addition to strong downstream performance, ModernBERT is also the most speed and memory efficient encoder and is designed for inference on common GPUs."
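* Evaluation approach: The task is binary classification (Republican vs. Democrat) and the two classes are roughly evenly split, so the base and fine-tuned models are compared using the F1 score.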
* Fine-tuning dataset: https://huggingface.co/datasets/Jacobvs/PoliticalTweets

## Loading and Evaluating a Foundation Model

TODO: In the cells below, load your chosen pre-trained Hugging Face model and evaluate its performance prior to fine-tuning. This step includes loading an appropriate tokenizer and dataset.

In [2]:
# Install dependencies (restart the session/kernel after running this cell)
!pip install transformers datasets peft accelerate optuna -U -q

In [3]:
# Imports
import torch
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    DataCollatorWithPadding
)
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, PeftModel
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
    precision_recall_fscore_support
)

2025-02-06 17:07:09.975664: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:

# Configuration: constants shared by the cells below
MODEL_ID = "answerdotai/ModernBERT-base"  # Hub id of the foundation model
DATASET_NAME = "Jacobvs/PoliticalTweets"  # Hub id of the fine-tuning dataset
ADAPTER_PATH = "./modernbert-political-dora-best"  # used as the training output_dir
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # GPU when one is available

In [5]:

# Turn TorchDynamo off for the whole session.
# NOTE(review): presumably a workaround for torch.compile problems with
# ModernBERT in this environment — confirm it is still needed.
torch._dynamo.config.disable = True

In [6]:
# 1. Load and prepare dataset
# Only the "train" split of the Hub dataset is used; it is divided 80/20 into
# train/test subsets, with a fixed seed so the split is reproducible.
dataset = load_dataset(DATASET_NAME)
split_dataset = dataset['train'].train_test_split(test_size=0.2, seed=42)

In [7]:

# 2. Initialize model components
# The tokenizer and the sequence-classification model come from the same
# checkpoint. The classification layer is newly initialized, so predictions
# are not meaningful until the model has been fine-tuned.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
base_model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_ID,
    num_labels=2,
    id2label={0: "Republican", 1: "Democrat"},
    # Pass the inverse mapping as well; supplying only id2label leaves the
    # config's label2id out of sync with the label names.
    label2id={"Republican": 0, "Democrat": 1},
).to(DEVICE)

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:

# 3.1 Tokenize the dataset before evaluation
def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Sequences are truncated to at most 128 tokens. No `return_tensors` is
    requested, so the outputs stay as plain lists; padding and tensor
    conversion are left to the `DataCollatorWithPadding` used at batch time.

    Args:
        examples: Batch mapping that contains a "text" entry.

    Returns:
        The tokenizer output for the batch.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, max_length=128)
    return encoded

# Drop the raw columns once tokenization is done. Each column must be its own
# list entry: a single comma-joined string is treated as one (non-existent)
# column name and makes `map` fail. Names are filtered against the actual
# schema so a column that is absent is simply skipped.
# NOTE(review): the label column is deliberately not listed here — presumably
# it is named "labels"; confirm against the dataset schema.
columns_to_remove = [
    column
    for column in ("index", "date", "id", "username", "text", "party")
    if column in split_dataset["train"].column_names
]

tokenized_dataset = split_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=columns_to_remove,
)

# 3.2 Evaluate base model before fine-tuning
def evaluate_model(model, dataset_split, split_name="Test"):
    """Run `model` over `dataset_split` and report classification quality.

    Prints a banner naming `split_name`, then a scikit-learn classification
    report, and finally draws the confusion matrix via `plot_confusion_matrix`.

    Args:
        model: Sequence-classification model handed to the Hugging Face Trainer.
        dataset_split: Tokenized split passed to `Trainer.predict`; its labels
            are read back from the prediction output's `label_ids`.
        split_name: Text interpolated into the printed banner.
    """
    rule = "=" * 40
    print(f"\n{rule}")
    print(f"Evaluating {split_name} Set")
    print(rule)

    # Prediction-only Trainer: no training data is attached and all logging
    # integrations (including wandb) are disabled.
    evaluator = Trainer(
        args=TrainingArguments(
            output_dir="./pre_fine_tuning_results",
            report_to="none",
        ),
        model=model,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
        processing_class=tokenizer,
    )

    result = evaluator.predict(dataset_split)
    y_true = result.label_ids
    # Predicted class = index of the largest logit in each row.
    y_pred = np.argmax(result.predictions, axis=1)

    report = classification_report(
        y_true, y_pred, target_names=["Republican", "Democrat"]
    )
    print(report)
    plot_confusion_matrix(y_true, y_pred)

def plot_confusion_matrix(y_true, y_pred):
    """Draw the confusion matrix for the two party classes.

    Args:
        y_true: Ground-truth class ids.
        y_pred: Predicted class ids.
    """
    class_names = ["Republican", "Democrat"]
    matrix = confusion_matrix(y_true, y_pred)
    display = ConfusionMatrixDisplay(
        confusion_matrix=matrix,
        display_labels=class_names,
    )
    display.plot(cmap=plt.cm.Blues)
    plt.title("Confusion Matrix")
    plt.show()

# Baseline: evaluate the model on the tokenized test split before any
# fine-tuning, so the fine-tuned results have something to be compared against
evaluate_model(base_model, tokenized_dataset["test"], "Base Model")

## Performing Parameter-Efficient Fine-Tuning

TODO: In the cells below, create a PEFT model from your loaded model, run a training loop, and save the PEFT model weights.

In [None]:
# 4. Optimized DoRA configuration
# DoRA is configured through PEFT's LoraConfig with `use_dora=True`.
dora_config = LoraConfig(
    r=8,  # adapter rank
    lora_alpha=32,  # adapter scaling factor
    # NOTE(review): presumably ModernBERT's fused attention projection (Wqkv),
    # MLP input (Wi) and output (Wo) layers — confirm against the module names.
    target_modules=["Wqkv", "Wi", "Wo"],
    lora_dropout=0.2,
    bias="none",  # bias terms are not trained
    task_type="SEQ_CLS",  # sequence classification
    # Modules trained in full (not through adapters) and saved with the adapter
    modules_to_save=["classifier", "model.final_norm"],
    use_dora=True,
    init_lora_weights="gaussian",
    inference_mode=False  # adapters are created in training mode
)

In [None]:
# 5. Apply DoRA adapters
# Wrap the foundation model with the adapter configuration, then report how
# many parameters are trainable.
model = get_peft_model(base_model, dora_config)
model.print_trainable_parameters()

In [None]:
# 6. Training arguments
training_args = TrainingArguments(
    output_dir=ADAPTER_PATH,  # checkpoints are written here
    learning_rate=5e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    num_train_epochs=8,  # upper bound; early stopping may end training sooner
    weight_decay=0.1,
    logging_steps=20,
    # Evaluate and checkpoint on the same 50-step cadence
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    optim="adamw_torch",
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    report_to="none",  # no external logging integrations
    push_to_hub=False,
    # Keep the checkpoint with the lowest evaluation loss
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    gradient_accumulation_steps=1,
    group_by_length=True,
    dataloader_num_workers=2,
    torch_compile=False
)

In [None]:
# 7. Metrics calculation
def compute_metrics(p):
    """Compute accuracy and macro-averaged precision/recall/F1.

    Only macro averages and overall accuracy are reported, which keeps the
    evaluation logs short.

    Args:
        p: Prediction object exposing `predictions` (per-class scores) and
            `label_ids` (ground-truth class ids).

    Returns:
        Dict with "accuracy", "macro_precision", "macro_recall", "macro_f1".
    """
    y_true = p.label_ids
    # Predicted class = index of the largest score in each row.
    y_pred = np.argmax(p.predictions, axis=1)

    macro_precision, macro_recall, macro_f1, _support = (
        precision_recall_fscore_support(
            y_true, y_pred, average="macro", zero_division=0
        )
    )

    metrics = {"accuracy": (y_pred == y_true).mean()}
    metrics["macro_precision"] = macro_precision
    metrics["macro_recall"] = macro_recall
    metrics["macro_f1"] = macro_f1
    return metrics

In [None]:
# 8. Early stopping
# Configured with a patience of 3 and an improvement threshold of 0.001.
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=3,
    early_stopping_threshold=0.001
)

In [None]:
# 9. Initialize Trainer
# NOTE(review): the "test" split is also used as eval_dataset, so checkpoint
# selection and early stopping see the same data the final scores are
# reported on — confirm this is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),  # pads each batch
    processing_class=tokenizer
)

In [None]:
# 10. Run training
# The return value of `trainer.train()` is kept in `train_results`.
train_results = trainer.train()

In [None]:
# 11. Save the trained model and tokenizer
# Both are written to the same directory so they can be reloaded together.
final_model_path = "./modernbert-political-dora-final"
trainer.save_model(final_model_path)
tokenizer.save_pretrained(final_model_path)


## Performing Inference with a PEFT Model

TODO: In the cells below, load the saved PEFT model weights and evaluate the performance of the trained PEFT model. Be sure to compare the results to the results from prior to fine-tuning.

In [None]:

# Reload the saved adapter onto a *fresh* copy of the foundation model.
# `get_peft_model` injected the DoRA layers into `base_model` in place during
# training, so reusing that object here would load the saved adapter on top of
# an already-adapted model instead of the original checkpoint.
fresh_base_model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_ID,
    num_labels=2,
    id2label={0: "Republican", 1: "Democrat"},
    label2id={"Republican": 0, "Democrat": 1},
)
model = PeftModel.from_pretrained(fresh_base_model, final_model_path)
# Fold the adapter weights into the base weights so inference runs on a plain
# Transformers model.
model = model.merge_and_unload().to(DEVICE)

In [None]:
# 12. Post-training evaluation
# Same test split and same evaluation routine as the baseline run, so the two
# reports can be compared directly.
print("\nEvaluating Fine-tuned Model on Test Set")
evaluate_model(model, tokenized_dataset["test"], "Fine-tuned Model")

In [None]:

# 14. Error analysis
# Predict on the test split, then print up to three misclassified tweets with
# their true and predicted party.
test_output = trainer.predict(tokenized_dataset["test"])
test_preds = np.argmax(test_output.predictions, axis=1)
test_labels = test_output.label_ids

misclassified = np.where(test_preds != test_labels)[0]
if misclassified.size > 0:
    print("\nError Analysis:")
    for idx in misclassified[:3]:
        # Cast the NumPy index to a Python int before indexing the dataset
        row = int(idx)
        # The raw tweet lives in the "text" column (the one that was
        # tokenized); this dataset has no "sms" column.
        original_text = split_dataset["test"][row]["text"]
        print(f"\nError Case {idx+1}:")
        print(f"Text: {original_text[:200]}{'...' if len(original_text)>200 else ''}")
        # id2label is keyed by plain ints, so cast the NumPy scalars
        print(f"True: {model.config.id2label[int(test_labels[row])]}")
        print(f"Predicted: {model.config.id2label[int(test_preds[row])]}")
else:
    print("\nPerfect classification on test set!")