<a href="https://colab.research.google.com/github/amomack123/App-Review-Responder/blob/main/bert/Mine_BERT_Based_Uncased.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ================================================================
# CELL 0 — Install dependencies, import everything, load dataset
# ================================================================

!pip install -q transformers datasets scikit-learn peft

import time
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

from datasets import load_dataset
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    TrainingArguments,
    Trainer
)
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, f1_score
from peft import LoraConfig, get_peft_model, TaskType

# ------------------------------------------------
# Device
# ------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# ------------------------------------------------
# Load IMDb Dataset
# ------------------------------------------------
print("Loading IMDb dataset...")
dataset = load_dataset("imdb")

model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )

print("Tokenizing IMDb data... this may take a few minutes.")
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Rename labels & set PyTorch format
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"]
)

tokenized_train_dataset = tokenized_dataset["train"]
tokenized_eval_dataset  = tokenized_dataset["test"]

print(f"✓ IMDb train: {len(tokenized_train_dataset)} samples")
print(f"✓ IMDb test:  {len(tokenized_eval_dataset)} samples")

# Show a sample
example = tokenized_train_dataset[0]
decoded = tokenizer.decode(example["input_ids"][:60])
print("\nSample text snippet:\n", decoded)


Using device: cuda
Loading IMDb dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Tokenizing IMDb data... this may take a few minutes.


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

✓ IMDb train: 25000 samples
✓ IMDb test:  25000 samples

Sample text snippet:
 [CLS] i rented i am curious - yellow from my video store because of all the controversy that surrounded it when it was first released in 1967. i also heard that at first it was seized by u. s. customs if it ever tried to enter this country, therefore being a fan of films considered "


In [2]:
# ================================================================
# CELL 1 — Metrics & Evaluation Helpers
# ================================================================

import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from torch.utils.data import DataLoader

# ------------------------------------------------
# Helper for HuggingFace Trainer (Path A / LoRA)
# ------------------------------------------------
def compute_metrics_trainer(eval_pred):
    """
    Metrics function used by HuggingFace Trainer.
    Expects (logits, labels) and returns accuracy + weighted F1.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)

    acc = accuracy_score(labels, preds)
    f1  = f1_score(labels, preds, average="weighted")

    return {
        "accuracy": acc,
        "f1_score": f1,
    }

# ------------------------------------------------
# Generic evaluation helper (used for ES + Baseline)
# ------------------------------------------------
def evaluate_model(model, eval_dataset, batch_size=32):
    """
    Evaluate a model on a tokenized eval_dataset.
    Uses the model's logits -> argmax, and returns accuracy + weighted F1.
    """
    model.eval()
    model.to(device)

    eval_loader = DataLoader(eval_dataset, batch_size=batch_size)

    all_labels = []
    all_preds  = []

    with torch.no_grad():
        for batch in eval_loader:
            batch = {
                k: v.to(device)
                for k, v in batch.items()
                if k in ["input_ids", "attention_mask", "labels"]
            }

            outputs = model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"]
            )
            logits = outputs.logits
            preds  = torch.argmax(logits, dim=1)

            all_labels.extend(batch["labels"].cpu().numpy())
            all_preds.extend(preds.cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    f1  = f1_score(all_labels, all_preds, average="weighted")

    print(f"[evaluate_model] Accuracy: {acc:.4f} | F1: {f1:.4f}")

    return {
        "accuracy": acc,
        "f1_score": f1,
    }

In [3]:
# ================================================================
# CELL 2 — Cached Batch Provider (Speeds up ES dramatically)
# ================================================================

from torch.utils.data import DataLoader
import numpy as np

class CachedBatchProvider:
    """
    Preloads N random batches to avoid repeated DataLoader + CPU→GPU overhead.
    ES pulls batches instantly from cache for faster iterations.
    """

    def __init__(self, dataset, batch_size=32, cache_size=50, device="cpu"):
        self.dataset = dataset
        self.batch_size = batch_size
        self.cache_size = cache_size
        self.device = device
        self._refresh_cache()

    def _refresh_cache(self):
        """
        Load 'cache_size' random batches into GPU memory once.
        """
        loader = DataLoader(self.dataset, batch_size=self.batch_size, shuffle=True)
        self.cache = []

        for i, batch in enumerate(loader):
            if i >= self.cache_size:
                break

            batch_gpu = {k: v.to(self.device) for k, v in batch.items()}
            self.cache.append(batch_gpu)

        print(f"[CachedBatchProvider] Cached {len(self.cache)} batches.")

    def get_batch(self):
        """
        Return a random cached batch.
        If empty, refresh cache automatically.
        """
        if len(self.cache) == 0:
            self._refresh_cache()

        idx = np.random.randint(0, len(self.cache))
        return self.cache[idx]

In [4]:
# ================================================================
# CELL 3 — Evolution Strategies: run_es_once_return_model()
# ================================================================

def run_es_once_return_model(
    seed=42,
    num_iterations=400,
    population_size=40,
    learning_rate=1e-4,
    noise_std=0.015,
    reward_batches=6,
    cached_batches=50,
    batch_size=32,
    device=device
):
    """
    Full ES training loop with:
      ✓ cached batch provider for speed
      ✓ classifier-head-only updates
      ✓ reward averaging over multiple batches (stable training)
      ✓ standardized rewards
      ✓ returns: (trained_model, metrics_dict)
    """

    torch.manual_seed(seed)
    np.random.seed(seed)

    # ------------------------------------------------------------
    # Load fresh BERT model
    # ------------------------------------------------------------
    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels=2
    ).to(device)

    optimizer = optim.Adam(model.classifier.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    # ------------------------------------------------------------
    # Cached batch provider for fast batch pulls
    # ------------------------------------------------------------
    batch_provider = CachedBatchProvider(
        tokenized_train_dataset,
        batch_size=batch_size,
        cache_size=cached_batches,
        device=device
    )

    print("\n[ES] Starting Evolution Strategies Training...")
    start_time = time.time()

    # ------------------------------------------------------------
    # Main ES loop
    # ------------------------------------------------------------
    for iteration in range(num_iterations):

        # Save original classifier weights
        original_weights = {
            n: p.detach().clone()
            for n, p in model.classifier.named_parameters()
        }

        perturbations = []
        rewards = []

        # --------------------------------------------------------
        # Evaluate population
        # --------------------------------------------------------
        for _ in range(population_size):

            # ------------------
            # apply noise
            # ------------------
            noise_dict = {}
            with torch.no_grad():
                for name, p in model.classifier.named_parameters():
                    noise = torch.randn_like(p) * noise_std
                    noise_dict[name] = noise
                    p.add_(noise)

            # ------------------
            # compute reward = -(avg loss over reward_batches)
            # ------------------
            total_loss = 0.0
            for _ in range(reward_batches):
                batch = batch_provider.get_batch()
                logits = model(
                    input_ids=batch["input_ids"],
                    attention_mask=batch["attention_mask"]
                ).logits
                loss = criterion(logits, batch["labels"])
                total_loss += loss.item()

            avg_loss = total_loss / reward_batches
            rewards.append(-avg_loss)
            perturbations.append(noise_dict)

            # ------------------
            # reset weights
            # ------------------
            with torch.no_grad():
                for name, p in model.classifier.named_parameters():
                    p.copy_(original_weights[name])

        # --------------------------------------------------------
        # Normalize rewards
        # --------------------------------------------------------
        rewards = np.array(rewards, dtype=np.float32)
        r_mean, r_std = rewards.mean(), rewards.std()

        if r_std < 1e-8:
            rewards = np.zeros_like(rewards)
        else:
            rewards = (rewards - r_mean) / (r_std + 1e-8)

        # --------------------------------------------------------
        # Gradient estimate via ES rule
        # --------------------------------------------------------
        optimizer.zero_grad(set_to_none=True)

        with torch.no_grad():
            for reward, noise_dict in zip(rewards, perturbations):
                coef = float(reward) / (population_size * noise_std)

                for name, p in model.classifier.named_parameters():
                    if p.grad is None:
                        p.grad = torch.zeros_like(p)
                    p.grad.add_(noise_dict[name], alpha=coef)

        optimizer.step()

        # --------------------------------------------------------
        # Logging
        # --------------------------------------------------------
        if iteration % 20 == 0:
            print(f"[ES] Iter {iteration}/{num_iterations} | Reward mean = {r_mean:.4f}")

    # ------------------------------------------------------------
    # End of ES training
    # ------------------------------------------------------------
    total_time = time.time() - start_time
    print(f"\n[ES] Training complete in {total_time:.2f} seconds")

    # ------------------------------------------------------------
    # Final evaluation
    # ------------------------------------------------------------
    final_metrics = evaluate_model(model, tokenized_eval_dataset, batch_size=32)
    final_metrics["compute_time_seconds"] = total_time

    print("\n=== ES Final Metrics ===")
    print(final_metrics)

    return model, final_metrics

In [5]:
# ================================================================
# CELL 4 — Error Analysis via CLS Embeddings
# ================================================================

from sklearn.metrics.pairwise import cosine_similarity

# ------------------------------------------------------------
# Extract CLS embeddings, preds, labels
# ------------------------------------------------------------
def extract_embeddings(model, dataset, max_samples=None):
    """
    Extracts final hidden layer CLS embeddings for error analysis.
    Returns (embeddings, labels, preds).
    """
    model.eval()
    model.to(device)

    loader = DataLoader(dataset, batch_size=16)
    all_emb = []
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for batch in loader:
            batch_gpu = {
                k: v.to(device)
                for k, v in batch.items()
                if k in ["input_ids", "attention_mask", "labels"]
            }

            outputs = model(
                input_ids=batch_gpu["input_ids"],
                attention_mask=batch_gpu["attention_mask"],
                output_hidden_states=True
            )

            # CLS embedding: final layer, token 0 = [CLS]
            cls_emb = outputs.hidden_states[-1][:, 0, :]
            all_emb.append(cls_emb.cpu().numpy())

            preds = torch.argmax(outputs.logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(batch_gpu["labels"].cpu().numpy())

    emb = np.vstack(all_emb)
    labels = np.array(all_labels)
    preds = np.array(all_preds)

    return emb, labels, preds


# ------------------------------------------------------------
# Error Analysis Function
# ------------------------------------------------------------
def error_analysis_embeddings(model, dataset, max_examples=5):
    """
    Prints misclassified samples + nearest neighbors using cosine similarity
    of CLS embeddings.
    """
    print("\n=== ERROR ANALYSIS ===")

    emb, labels, preds = extract_embeddings(model, dataset)
    errors = np.where(labels != preds)[0]

    print(f"Total errors in dataset: {len(errors)}")

    if len(errors) == 0:
        print("No misclassifications found — model perfect on provided samples.")
        return

    for idx in errors[:max_examples]:
        print("\n-------------------------------")
        print(f"❌ Misclassified index: {idx}")
        print(f"True label:     {labels[idx]}")
        print(f"Predicted label:{preds[idx]}")

        # Cosine similarity with all other embeddings
        sims = cosine_similarity([emb[idx]], emb)[0]

        # Get nearest neighbors excluding itself
        nearest = sims.argsort()[::-1][1:6]

        print("Top-5 nearest neighbors (embedding similarity):")
        for n in nearest:
            print(
                f"  → idx {n} | true={labels[n]}, pred={preds[n]} | sim={sims[n]:.4f}"
            )

    print("\n=== END ERROR ANALYSIS ===")

In [6]:
# ================================================================
# CELL 5 — Path C: Baseline BERT (No Fine-Tuning)
# ================================================================

print("\n================================================")
print("PATH C — Baseline bert-base-uncased (No Training)")
print("================================================\n")

# Load a fresh baseline model
baseline_model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2
).to(device)

print("Evaluating baseline model on IMDb test set...")
path_c_metrics = evaluate_model(baseline_model, tokenized_eval_dataset)

print("\n=== Path C Metrics ===")
print(path_c_metrics)


PATH C — Baseline bert-base-uncased (No Training)



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluating baseline model on IMDb test set...
[evaluate_model] Accuracy: 0.4984 | F1: 0.3578

=== Path C Metrics ===
{'accuracy': 0.49836, 'f1_score': 0.3578294194006793}


In [8]:
# ================================================================
# CELL 6 — Path A: LoRA Fine-Tuning on IMDb
# ================================================================

print("\n========================================")
print("PATH A — LoRA Fine-Tuning (bert-base-uncased)")
print("========================================\n")

# Load a fresh model for LoRA
base_model_for_lora = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2
)

# ------------------------------------------------------------
# Configure LoRA
# ------------------------------------------------------------
print("Configuring LoRA modules (query, value)...")

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["query", "value"],  # Standard for BERT attention
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_CLS
)

lora_model = get_peft_model(base_model_for_lora, lora_config)
lora_model.to(device)

print("\n--- Trainable Parameters (LoRA) ---")
lora_model.print_trainable_parameters()
print("-----------------------------------\n")

# ------------------------------------------------------------
# TrainingArguments for HF Trainer
# ------------------------------------------------------------
training_args = TrainingArguments(
    output_dir="./results/bert_lora_imdb",
    num_train_epochs=1,                  # You can bump to 2–3 if GPU is strong
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir="./logs",
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none",                     # no tensorboard logging
)

trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    compute_metrics=compute_metrics_trainer,
)

# ------------------------------------------------------------
# Train LoRA Model
# ------------------------------------------------------------
print("Starting LoRA fine-tuning...")
start_time_lora = time.time()

trainer.train()

lora_total_time = time.time() - start_time_lora

# ------------------------------------------------------------
# Evaluate LoRA Model
# ------------------------------------------------------------
print("\nEvaluating LoRA Model...")
eval_results = trainer.evaluate()

path_a_metrics = {
    "accuracy": eval_results.get("eval_accuracy", float("nan")),
    "f1_score": eval_results.get("eval_f1_score", float("nan")),
    "compute_time_seconds": lora_total_time,
}

print("\n=== Path A (LoRA) Metrics ===")
print(path_a_metrics)


PATH A — LoRA Fine-Tuning (bert-base-uncased)



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Configuring LoRA modules (query, value)...

--- Trainable Parameters (LoRA) ---
trainable params: 591,362 || all params: 110,075,140 || trainable%: 0.5372
-----------------------------------

Starting LoRA fine-tuning...


Epoch,Training Loss,Validation Loss,Accuracy,F1 Score
1,0.3758,0.34065,0.85016,0.85003



Evaluating LoRA Model...



=== Path A (LoRA) Metrics ===
{'accuracy': 0.85016, 'f1_score': 0.8500300189778563, 'compute_time_seconds': 610.8634202480316}


In [9]:
# ================================================================
# CELL 7 — Path B: Evolution Strategies (Cached) on IMDb
# ================================================================

print("\n===========================================")
print("PATH B — Evolution Strategies (Cached ES)")
print("===========================================\n")

es_model_final, es_metrics = run_es_once_return_model(
    seed=123,
    num_iterations=400,      # You can increase to 400–600 for stronger results
    population_size=40,      # ES population size
    learning_rate=1e-4,
    noise_std=0.015,
    reward_batches=6,        # average reward over 6 random batches
    cached_batches=50,       # cache 50 preloaded batches for speed
    batch_size=32,
    device=device
)

print("\n=== Path B (ES) Metrics ===")
print(es_metrics)


PATH B — Evolution Strategies (Cached ES)



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[CachedBatchProvider] Cached 50 batches.

[ES] Starting Evolution Strategies Training...
[ES] Iter 0/400 | Reward mean = -0.7263
[ES] Iter 20/400 | Reward mean = -0.7526
[ES] Iter 40/400 | Reward mean = -0.7815


KeyboardInterrupt: 

In [None]:
# ================================================================
# CELL 8 — Error Analysis on ES Model
# ================================================================

print("\n=================================")
print("ERROR ANALYSIS — ES Final Model")
print("=================================\n")

error_analysis_embeddings(
    es_model_final,
    tokenized_eval_dataset,
    max_examples=5  # show 5 misclassified samples
)

In [None]:
# ================================================================
# CELL 9 — Summary Comparison (Path C vs Path A vs Path B)
# ================================================================

print("\n===============================================")
print("FINAL SUMMARY — BERT-base-uncased on IMDb")
print("===============================================\n")

print(">>> Path C — Baseline (No Fine-Tuning)")
print(f"Accuracy: {path_c_metrics['accuracy']:.4f}")
print(f"F1 Score: {path_c_metrics['f1_score']:.4f}")
print("----------------------------------------\n")

print(">>> Path A — LoRA Fine-Tuning")
print(f"Accuracy: {path_a_metrics['accuracy']:.4f}")
print(f"F1 Score: {path_a_metrics['f1_score']:.4f}")
print(f"Compute Time (s): {path_a_metrics['compute_time_seconds']:.2f}")
print("----------------------------------------\n")

print(">>> Path B — Evolution Strategies (Cached)")
print(f"Accuracy: {es_metrics['accuracy']:.4f}")
print(f"F1 Score: {es_metrics['f1_score']:.4f}")
print(f"Compute Time (s): {es_metrics['compute_time_seconds']:.2f}")
print("----------------------------------------\n")

print("Comparison complete ✔")