# Experiment 3: LoRA + Prompt Tuning — finding the best PEFT architecture for our Contrastive PLL loss

Priority 1: The "PEFT Architecture Showdown" (on CrowS-Pairs)
We must find out which PEFT architecture works best with your Contrastive PLL loss. <br>

Goal: To test if LoRA or Prompt Tuning, when trained with your successful Contrastive PLL loss, can also achieve low intrinsic bias (like your 52.0% score).<br>
How:<br>
Go to your afml-updated-baseline.ipynb notebook.<br>
Keep the data (crows_train, crows_eval) and the trainer (ContrastivePLLTrainer) exactly the same.<br>
Run 1 (Control): Run the AdaptedMLM (your custom adapter). Result: You already have this: 52.0% bias.<br>
Run 2 (New): Create a new model, LoRA_MLM. This will be BertForMaskedLM with LoRA layers added (using the peft library). Train only the LoRA parameters using your ContrastivePLLTrainer.<br>
Run 3 (New): Create a new model, Prompt_MLM. This will be BertForMaskedLM with Prompt Tuning. Train only the prompt parameters using your ContrastivePLLTrainer.<br>
Success Metric: You get a table comparing the CrowS-Pairs bias % for all three. The winner is the one whose bias score is closest to the 50% neutrality target without hurting perplexity.<br>

Your Next Experiment (The "PEFT Showdown") <br>
So, we will run the "Step 9" experiment, but using your winning loss function. <br>
Goal: Find out which PEFT architecture (Adapter vs. LoRA vs. Prompt Tuning) works best with your ContrastivePLLTrainer on the full CrowS-Pairs dataset. <br>
Dataset: Use the full crows_train (1206 pairs) and crows_eval (302 pairs) from your successful notebook. <br>
Loss Function: Use your ContrastivePLLTrainer for all three models. <br>
Models to Test: <br>
Model A (Control): AdaptedMLM (Your DebiasAdapter). You already have this result: 52.0% Bias. <br>
Model B (New): LoRA_MLM (BERT + LoRA layers). <br>
Model C (New): Prompt_MLM (BERT + Soft Prompt). <br>
The winner of this showdown (e.g., LoRA_MLM) will be our new champion. That is the model we will use to re-run the BiasBios downstream experiment.

Block 1: Setup, Imports & Data Loading

In [None]:
# --- 1. Install & Imports ---
# The leading `!` is the IPython/Jupyter shell escape, so this line only runs
# inside a notebook cell; it is not valid syntax in a plain .py script.
# -q keeps pip quiet, -U upgrades the packages if they are already installed.
!pip install -q -U adapters datasets
print("adapters installed")

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, BertForMaskedLM
from adapters import AutoAdapterModel, LoRAConfig, PrefixTuningConfig
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import urllib.request
import io

# Setup Device: prefer the GPU when one is visible to torch, else fall back to CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"‚úÖ Using device: {device}")

# --- 2. Load CrowS-Pairs Data ---
print("\nüì• Loading CrowS-Pairs Dataset...")
url = "https://raw.githubusercontent.com/nyu-mll/crows-pairs/master/data/crows_pairs_anonymized.csv"
with urllib.request.urlopen(url) as response:
    csv_text = response.read().decode('utf-8')
df = pd.read_csv(io.StringIO(csv_text))

# One dict per CSV row: `sent_more` is stored as the stereotype sentence and
# `sent_less` as its anti-stereotype counterpart.
crows_full = [
    {'stereotype': more, 'anti_stereotype': less, 'bias_type': kind}
    for more, less, kind in zip(df['sent_more'], df['sent_less'], df['bias_type'])
]

# Split 80/20 in file order (no shuffling): first 80% train, remainder eval.
split_idx = int(len(crows_full) * 0.8)
crows_train, crows_eval = crows_full[:split_idx], crows_full[split_idx:]
print(f"‚úÖ Loaded {len(crows_train)} training and {len(crows_eval)} evaluation pairs.")

# --- 3. Load Tokenizer ---
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# --- 4. Define Contrastive PLL Trainer ---
class ContrastivePLLTrainer:
    """Train a masked LM so both sentences of a pair get the same PLL.

    PLL (pseudo-log-likelihood) here is the mean log-probability the model
    assigns to each original token when that token alone is replaced by the
    mask token. The training loss for a pair is the squared difference
    between the PLL of its 'stereotype' and 'anti_stereotype' sentences.

    Only parameters that have ``requires_grad=True`` at construction time are
    handed to the optimizer, so freeze/unfreeze the model before creating the
    trainer.
    """

    def __init__(self, model, tokenizer, device, learning_rate=1e-4):
        """Store the collaborators and build an AdamW optimizer.

        Args:
            model: Callable taking a batch of input ids and returning an
                object with a ``.logits`` attribute.
            tokenizer: Object providing ``encode(text, add_special_tokens=True)``
                and ``mask_token_id``.
            device: Torch device on which input tensors are created.
            learning_rate: Learning rate passed to AdamW.
        """
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        # Optimized for PEFT: update only trainable params.
        self.optimizer = torch.optim.AdamW(
            [p for p in model.parameters() if p.requires_grad],
            lr=learning_rate,
        )

    def pll_score_with_grad(self, text):
        """Return the mean per-token PLL of ``text`` as a differentiable scalar.

        Every token except the first and last (the positions the tokenizer
        fills with special tokens) is masked in its own copy of the sentence;
        all copies go through the model in a single batch.

        Returns:
            A 0-dim tensor. When the sentence has no scorable tokens a
            constant ``0.0`` tensor (without a grad graph) is returned.
        """
        tokens = self.tokenizer.encode(text, add_special_tokens=True)
        num_scored = len(tokens) - 2
        if num_scored <= 0:
            return torch.tensor(0.0, device=self.device)

        # Row k of the batch masks position k + 1 of the sentence.
        rows = torch.arange(num_scored, device=self.device)
        positions = torch.arange(1, len(tokens) - 1, device=self.device)

        input_tensor = torch.tensor(tokens, device=self.device).repeat(num_scored, 1)
        target_tensor = input_tensor[rows, positions]  # advanced indexing -> copy
        input_tensor[rows, positions] = self.tokenizer.mask_token_id

        logits = self.model(input_tensor).logits

        # Keep only the logits at each row's masked position: [num_scored, vocab].
        log_probs = F.log_softmax(logits[rows, positions], dim=-1)

        # squeeze(1) rather than squeeze() so the dimension removed is explicit.
        token_plls = log_probs.gather(1, target_tensor.unsqueeze(1)).squeeze(1)
        return token_plls.mean()

    def train_epoch(self, pairs):
        """Run one optimisation pass over ``pairs`` in random order.

        Args:
            pairs: Sequence of dicts with 'stereotype' and 'anti_stereotype'
                text entries. The sequence itself is NOT reordered; a shuffled
                view is built internally so the caller's data stays intact.

        Returns:
            Mean loss over the pairs that contributed an update, or ``nan``
            when no pair did.
        """
        self.model.train()
        epoch_losses = []

        # Shuffle through an index permutation instead of in place, so the
        # caller's list (e.g. a global training set) keeps its order.
        order = np.random.permutation(len(pairs))
        shuffled = [pairs[i] for i in order]

        for pair in tqdm(shuffled, desc="Training", leave=False):
            self.optimizer.zero_grad()

            pll_stereo = self.pll_score_with_grad(pair['stereotype'])
            pll_anti = self.pll_score_with_grad(pair['anti_stereotype'])

            # Contrastive Loss: Minimize squared difference
            loss = (pll_stereo - pll_anti) ** 2

            # Both sentences were degenerate (no scorable tokens): the loss is
            # a constant with no graph, and backward() would raise.
            if not loss.requires_grad:
                continue

            loss.backward()
            self.optimizer.step()
            epoch_losses.append(loss.item())

        if not epoch_losses:
            return float("nan")
        return np.mean(epoch_losses)

# --- 5. Evaluation Helpers ---
def compute_pll(model, text):
    """Return the mean per-token pseudo-log-likelihood of ``text`` (no grad).

    Each position except the first and last is masked in turn, the model is
    run on that single masked sequence, and the log-probability of the
    original token at the masked position is accumulated. Uses the
    module-level ``tokenizer`` and ``device``.

    Returns:
        A Python float; ``0.0`` when the encoded text has no position to score.
    """
    encoded = tokenizer(text, return_tensors="pt").to(device)
    token_ids = encoded.input_ids[0]
    num_scored = len(token_ids) - 2
    if num_scored <= 0:
        return 0.0

    mask_id = tokenizer.mask_token_id
    total = 0.0
    with torch.no_grad():
        for pos in range(1, len(token_ids) - 1):
            masked = token_ids.clone()
            masked[pos] = mask_id
            position_logits = model(masked.unsqueeze(0)).logits[0, pos]
            log_probs = F.log_softmax(position_logits, dim=-1)
            total += log_probs[token_ids[pos]].item()
    return total / num_scored

def evaluate_bias(model, eval_pairs):
    """Return the percentage of pairs where the stereotype sentence scores higher.

    For every pair the PLL of the 'stereotype' and 'anti_stereotype' sentences
    is computed with ``compute_pll``; a pair counts as a stereotype win only
    when the stereotype score is strictly greater (ties do not count).

    Args:
        model: Masked LM passed through to ``compute_pll``; put in eval mode here.
        eval_pairs: Non-empty sized collection of dicts with 'stereotype' and
            'anti_stereotype' entries.

    Returns:
        A float in [0, 100]; 50 means no preference either way.

    Raises:
        ValueError: If ``eval_pairs`` is empty (the percentage is undefined).
    """
    if len(eval_pairs) == 0:
        raise ValueError("eval_pairs must contain at least one sentence pair")

    model.eval()
    stereo_wins = 0
    for pair in tqdm(eval_pairs, desc="Evaluating Bias"):
        stereo_score = compute_pll(model, pair['stereotype'])
        anti_score = compute_pll(model, pair['anti_stereotype'])
        if stereo_score > anti_score:
            stereo_wins += 1
    return (stereo_wins / len(eval_pairs)) * 100

# Small fixed set of bias-neutral sentences used by evaluate_perplexity.
neutral_sents = [
    "The weather is nice today.",
    "I enjoy reading books.",
    "The sun rises in the east.",
    "Technology is changing fast.",
    "She walked to the store.",
    "He cooked dinner for friends.",
]

def evaluate_perplexity(model, sentences=None):
    """Return a pseudo-perplexity of ``model`` over a set of sentences.

    The value is ``exp`` of the average, across sentences, of the negated
    ``compute_pll`` score (itself a per-token mean), so every sentence has
    equal weight regardless of its length.

    Args:
        model: Masked LM passed through to ``compute_pll``; put in eval mode here.
        sentences: Optional iterable of strings to score. Defaults to the
            module-level ``neutral_sents``.

    Returns:
        The pseudo-perplexity as a float (lower is better).

    Raises:
        ValueError: If there are no sentences to score.
    """
    sentences = list(neutral_sents if sentences is None else sentences)
    if not sentences:
        raise ValueError("at least one sentence is required to compute perplexity")

    model.eval()
    total_nll = 0.0
    for sent in sentences:
        total_nll -= compute_pll(model, sent)
    return np.exp(total_nll / len(sentences))

Block 2: Run LoRA Experiment
This builds, trains, and evaluates the LoRA version.

In [None]:
print("\n" + "="*40)
print("ü•ä ROUND 1: LoRA Architecture")
print("="*40)

# 1. Setup Model
lora_model = AutoAdapterModel.from_pretrained("bert-base-uncased")
lora_config = LoRAConfig(r=8, alpha=16)
lora_model.add_adapter("lora_debias", config=lora_config)
lora_model.train_adapter("lora_debias")
# NOTE(review): the MLM head is added *after* train_adapter(). Confirm in the
# adapters docs whether this head starts from the pretrained MLM weights and
# whether its parameters are trainable; a freshly initialised head would
# directly affect the perplexity reported below.
lora_model.add_masked_lm_head("lora_debias")
lora_model.to(device)

print(f"‚úÖ Model Ready. Trainable Params: {sum(p.numel() for p in lora_model.parameters() if p.requires_grad)}")

# 2. Train
print("‚è≥ Training LoRA (5 Epochs)...")
lora_trainer = ContrastivePLLTrainer(lora_model, tokenizer, device, learning_rate=3e-4)
# Keep the per-epoch losses: the visualization cell looks for `lora_losses`
# and skips the loss-curve plot when it is missing.
lora_losses = []
for ep in range(5):
    loss = lora_trainer.train_epoch(crows_train)
    lora_losses.append(loss)
    print(f"   Epoch {ep+1}: Loss = {loss:.4f}")

# 3. Save & Evaluate
lora_model.save_adapter("/kaggle/working/lora_debias", "lora_debias")
lora_bias = evaluate_bias(lora_model, crows_eval)
lora_ppl = evaluate_perplexity(lora_model)

print(f"\nüìä LoRA Results:")
print(f"   Bias Score: {lora_bias:.2f}% (Target: 50%)")
print(f"   Perplexity: {lora_ppl:.2f}")

Block 3: Run Prompt Tuning Experiment
This builds, trains, and evaluates the Prompt Tuning version.

In [None]:
print("\n" + "="*40)
print("ü•ä ROUND 2: Prompt Tuning Architecture")
print("="*40)

# 1. Setup Model
prompt_model = AutoAdapterModel.from_pretrained("bert-base-uncased")
# prefix_length=20 adds 20 virtual tokens
# NOTE(review): PrefixTuningConfig configures *prefix* tuning. If plain
# prompt tuning (soft prompt at the input only) is what the experiment is
# meant to compare, check whether the adapters library offers a dedicated
# prompt-tuning config and use that instead — confirm before reporting results.
prompt_config = PrefixTuningConfig(flat=False, prefix_length=20)
prompt_model.add_adapter("prompt_debias", config=prompt_config)
prompt_model.train_adapter("prompt_debias")
# NOTE(review): the MLM head is added *after* train_adapter(). Confirm whether
# it starts from the pretrained MLM weights and whether it is trainable.
prompt_model.add_masked_lm_head("prompt_debias")
prompt_model.to(device)

print(f"‚úÖ Model Ready. Trainable Params: {sum(p.numel() for p in prompt_model.parameters() if p.requires_grad)}")

# 2. Train (Note Higher Learning Rate for Prompts)
print("‚è≥ Training Prompt Tuning (5 Epochs)...")
prompt_trainer = ContrastivePLLTrainer(prompt_model, tokenizer, device, learning_rate=1e-2)
# Keep the per-epoch losses: the visualization cell looks for `prompt_losses`
# and skips the loss-curve plot when it is missing.
prompt_losses = []
for ep in range(5):
    loss = prompt_trainer.train_epoch(crows_train)
    prompt_losses.append(loss)
    print(f"   Epoch {ep+1}: Loss = {loss:.4f}")

# 3. Save & Evaluate
prompt_model.save_adapter("/kaggle/working/prompt_debias", "prompt_debias")
prompt_bias = evaluate_bias(prompt_model, crows_eval)
prompt_ppl = evaluate_perplexity(prompt_model)

print(f"\nüìä Prompt Tuning Results:")
print(f"   Bias Score: {prompt_bias:.2f}% (Target: 50%)")
print(f"   Perplexity: {prompt_ppl:.2f}")

Block 4: Final Showdown Table

In [None]:
# Fixed-width results table: one reference row plus one row per trained model.
banner = "=" * 50
rule = "-" * 60

print("\n" + banner)
print("üèÜ PEFT ARCHITECTURE SHOWDOWN: FINAL RESULTS")
print(banner)
print(f"{'Architecture':<20} | {'Bias % (Target 50)':<20} | {'Perplexity':<15}")
print(rule)
# The Custom Adapter row is a hard-coded reference from a previous run
# (assumed to be around 52%), not something measured in this notebook.
print(f"{'Custom Adapter':<20} | {'~52.00 (Ref)':<20} | {'~15.5 (Ref)':<15}")
for arch_name, arch_bias, arch_ppl in (
    ("LoRA", lora_bias, lora_ppl),
    ("Prompt Tuning", prompt_bias, prompt_ppl),
):
    print(f"{arch_name:<20} | {arch_bias:<20.2f} | {arch_ppl:<15.2f}")
print(rule)

Block 5: Visualization Code

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# --- 1. Plot Training Loss Curves ---
def plot_training_comparison(lora_losses, prompt_losses,
                             save_path="/kaggle/working/loss_comparison.png"):
    """Plot per-epoch training loss for LoRA and Prompt Tuning on one figure.

    Args:
        lora_losses: Sequence of per-epoch losses from the LoRA run.
        prompt_losses: Sequence of per-epoch losses from the Prompt Tuning run.
            It may have a different length from ``lora_losses``; each curve
            gets its own epoch axis.
        save_path: Where to write the PNG. Pass ``None`` (or an empty string)
            to skip saving and only show the figure.
    """
    plt.figure(figsize=(10, 6))

    # Separate x-axes so runs with different epoch counts can still be
    # compared (a shared axis would make plt.plot raise on a length mismatch).
    lora_epochs = range(1, len(lora_losses) + 1)
    prompt_epochs = range(1, len(prompt_losses) + 1)

    # Plot LoRA
    plt.plot(lora_epochs, lora_losses, 'o-', linewidth=2, label='LoRA (Weights)', color='#1f77b4')

    # Plot Prompt Tuning
    plt.plot(prompt_epochs, prompt_losses, 's--', linewidth=2, label='Prompt Tuning (Activations)', color='#ff7f0e')

    plt.title('PEFT Training Dynamics: LoRA vs. Prompt Tuning', fontsize=14)
    plt.xlabel('Epochs', fontsize=12)
    plt.ylabel('Contrastive PLL Loss', fontsize=12)
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.legend(fontsize=12)
    plt.tight_layout()
    if save_path:
        plt.savefig(save_path)
    plt.show()

# Draw the loss curves only when both training cells have defined their
# per-epoch loss lists in this session; otherwise tell the user what is missing.
have_losses = 'lora_losses' in locals() and 'prompt_losses' in locals()
if not have_losses:
    print("‚ö†Ô∏è Training loss lists not found. Did you run the training blocks?")
else:
    plot_training_comparison(lora_losses, prompt_losses)

# --- 2. Plot The "Pareto Frontier" (Bias vs. Utility) ---
def plot_tradeoff(results, save_path="/kaggle/working/tradeoff_plot.png"):
    """Scatter-plot bias score against perplexity for each model.

    Args:
        results: dict like {'Model Name': (Bias_Score, Perplexity)}. Any
            number of entries is accepted; colours and markers repeat after
            the third entry.
        save_path: Where to write the PNG. Pass ``None`` (or an empty string)
            to skip saving and only show the figure.
    """
    plt.figure(figsize=(10, 8))

    # Define reference lines
    plt.axvline(x=50, color='gray', linestyle='--', alpha=0.5, label='Ideal Neutrality (50%)')

    colors = ['#2ca02c', '#1f77b4', '#ff7f0e']  # Green, Blue, Orange
    markers = ['*', 'o', 's']

    for i, (name, (bias, ppl)) in enumerate(results.items()):
        # Cycle through the palette so more than three models do not raise
        # IndexError.
        color = colors[i % len(colors)]
        marker = markers[i % len(markers)]
        plt.scatter(bias, ppl, s=200, color=color, marker=marker, label=name, edgecolors='black')
        # Annotate
        plt.annotate(f"{name}\n({bias:.1f}%, {ppl:.1f})",
                     (bias, ppl),
                     xytext=(10, 10), textcoords='offset points',
                     fontsize=11)

    plt.title('The Fairness-Utility Tradeoff', fontsize=16)
    plt.xlabel('Stereotype Preference (Closer to 50% is better)', fontsize=12)
    plt.ylabel('Perplexity (Lower is better)', fontsize=12)

    # Default view is the 40-80% band; widen it only when a point would
    # otherwise fall outside the axes and silently disappear.
    biases = [bias for bias, _ in results.values()]
    x_low = min([40] + [b - 2 for b in biases])
    x_high = max([80] + [b + 2 for b in biases])
    plt.xlim(x_low, x_high)

    plt.grid(True, alpha=0.3)
    plt.legend(loc='upper right')

    plt.tight_layout()
    if save_path:
        plt.savefig(save_path)
    plt.show()

# Example Data (Replace these with your actual variables!)
# Baseline numbers usually ~58% bias, ~4.5 perplexity
# NOTE(review): the baseline tuple below is a hard-coded placeholder, not a
# value measured in this notebook, and its perplexity (15.2) disagrees with
# the ~4.5 quoted in the comment above — confirm which figure is correct.
# Each entry maps a display name to (bias score in %, perplexity).
final_results = {
    "Baseline (No Debias)": (58.3, 15.2),
    "LoRA": (lora_bias, lora_ppl),
    "Prompt Tuning": (prompt_bias, prompt_ppl)
}

plot_tradeoff(final_results)

<div class="alert alert-block alert-info" style="font-size:14px; font-family:verdana; line-height: 1.7em;">
    📌 &nbsp; Contrastive PLL reduces intrinsic bias while preserving fluency for adapter-style PEFTs; naive LoRA/prompt tuning collapses LM quality.
</div>