#### Activation Extraction and Probe Training

In [None]:
import os
import json
import torch
import re

import numpy as np
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import pandas as pd

from scipy.stats import ttest_rel, chi2
from torch.utils.data import Dataset, DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login
from tqdm import tqdm
from scipy import stats
from collections import Counter
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

#### Run the following cells as appropriate depending on your runtime environment

In [None]:
# If running in Google Colab,
from google.colab import userdata, drive

drive.mount('/content/drive')    # Mount Google Drive
%cd /content/drive/MyDrive/cs182 # Navigate to appropriate directory

# Llama 3.1 8B (Base) is a gated model - login to hf using HF_TOKEN (must exist in colab "Secrets")
HF_TOKEN = userdata.get('HF_TOKEN')
login(token=HF_TOKEN)

In [None]:
# Local runtime (also covers Colab compute driven from VSCode via the Colab extension)
from dotenv import load_dotenv

# Read the .env file; override=True is passed so its values win over
# variables already present in the process environment.
load_dotenv(override=True)

# Llama 3.1 8B (Base) is a gated model, so authenticate against the Hugging Face
# Hub with the HF_TOKEN entry that must exist in .env.
HF_TOKEN = os.environ.get("HF_TOKEN")
login(token=HF_TOKEN)

#### Load model and attach hooks

In [None]:
# Pick the compute device: CUDA when torch can see a GPU, otherwise CPU
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
print(f"Using device: {device}")

TRAINING_BATCH_SIZE = 32

# Llama 3.1 8B checkpoint used for both the tokenizer and the model.
## Swap which MODEL_ID line is commented out to switch between Base and Instruct
MODEL_ID = "meta-llama/Llama-3.1-8B"
#MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"

# Tokenizer: pad on the right; reuse EOS as the pad token when none is defined
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
tokenizer.padding_side = 'right'
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Model: bfloat16 weights, device placement left to device_map="auto"
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype=torch.bfloat16, device_map="auto")

print("Model successfully loaded!")
  
# Maps a layer name to the most recent output captured by that layer's forward hook
activation_cache = {}


def get_hook(layer_name):
    """Build a forward hook that records a module's output under ``layer_name``.

    The returned callable takes ``(module, inputs, output)`` and stores
    ``output.detach()`` in the module-level ``activation_cache``, replacing
    any entry previously stored under the same name.
    """
    def _store_output(_module, _inputs, output):
        activation_cache.update({layer_name: output.detach()})

    return _store_output

# Register one caching forward hook on every decoder layer's MLP submodule and
# keep the handles returned by register_forward_hook.
hook_handles = [
    model.get_submodule(mlp_name).register_forward_hook(get_hook(mlp_name))
    for mlp_name in (
        f"model.layers.{layer_idx}.mlp"
        for layer_idx in range(model.config.num_hidden_layers)
    )
]

In [None]:
# Create a Dataset and Collate function for the prompt datasets
class PromptDataset(Dataset):
    """Map-style dataset over an in-memory sequence of prompts.

    The sequence is kept as ``self.prompts`` (no copy is made) and items are
    returned unchanged.
    """

    def __init__(self, prompts_list):
        # Stored by reference so callers can read the prompts back via `.prompts`
        self.prompts = prompts_list

    def __len__(self):
        """Number of prompts held by the dataset."""
        n_prompts = len(self.prompts)
        return n_prompts

    def __getitem__(self, idx):
        """Return the prompt stored at position ``idx``."""
        prompt = self.prompts[idx]
        return prompt

class SmartCollate:
    """Collate function that tokenizes a batch of raw prompt strings.

    Args:
        tokenizer: Callable invoked as
            ``tokenizer(batch_prompts, return_tensors="pt", padding=True,
            truncation=True, max_length=...)``.
        max_length: Optional truncation length. When ``None`` (the default)
            the limit is read from the notebook-level
            ``model.config.max_position_embeddings`` at call time, which is
            the original behaviour. Passing a value removes the dependency on
            that global.
    """

    def __init__(self, tokenizer, max_length=None):
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __call__(self, batch_prompts):
        """Tokenize ``batch_prompts`` and return the tokenizer's output."""
        max_length = self.max_length
        if max_length is None:
            # Fallback: use the context length of the globally loaded model
            max_length = model.config.max_position_embeddings

        return self.tokenizer(
            batch_prompts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length,
        )

#### Base Model + Document Prompting

##### Load Document Style Prompt Datasets

In [None]:
# Load the document-style contrastive dataset and split it by class label
base_contrastive_df = pd.read_csv('datasets/base_contrastive_dataset.csv')
df_base_terminal = base_contrastive_df[base_contrastive_df['label'].eq('terminal')]
df_base_instrumental = base_contrastive_df[base_contrastive_df['label'].eq('instrumental')]

# Wrap each class's prompts in a Dataset for batched extraction
smart_collator = SmartCollate(tokenizer)
BATCH_SIZE = 16
dataset_base_terminal = PromptDataset(df_base_terminal['prompt'].tolist())
dataset_base_instrumental = PromptDataset(df_base_instrumental['prompt'].tolist())

# Both loaders share the same settings; shuffle=False keeps batches in dataset order
_loader_options = dict(
    batch_size=BATCH_SIZE,
    collate_fn=smart_collator,
    shuffle=False,
    num_workers=4,
    pin_memory=True,
)
dataloader_base_terminal = DataLoader(dataset_base_terminal, **_loader_options)
dataloader_base_instrumental = DataLoader(dataset_base_instrumental, **_loader_options)

print(f"Created 2 DataLoaders with batch size {BATCH_SIZE}")
print(f"\t dataloader_base_terminal has {len(dataloader_base_terminal)} batches")
print(f"\t dataloader_base_instrumental has {len(dataloader_base_instrumental)} batches")


##### Perform Feature Extraction (Llama 3.1 8B Base)

In [None]:
# Perform activation extraction: run every prompt through the model and keep,
# for each hooked layer, the activation at the last real token of the prompt.
collated_data = []

dataloaders_to_process = [
    ("terminal", dataloader_base_terminal),
    ("instrumental", dataloader_base_instrumental)
]

print("Starting activation extraction...")

for label, dataloader in dataloaders_to_process:
    # Batches are matched back to their prompt strings by position, so this
    # relies on the loader iterating the dataset in order.
    all_prompts = dataloader.dataset.prompts

    batch_iterator = tqdm(
        enumerate(dataloader),
        total=len(dataloader),
        desc=f"Extracting {label}"
    )

    for i, batch in batch_iterator:
        batch = {k: v.to(model.device) for k, v in batch.items()}
        with torch.no_grad():
            model(**batch)  # the forward hooks fill activation_cache

        batch_size = batch['input_ids'].shape[0]
        batch_start_index = i * dataloader.batch_size
        batch_prompts = all_prompts[batch_start_index:batch_start_index + batch_size]

        # Position of the last non-pad token in each sequence.
        # This indexing requires right padding (tokenizer.padding_side = "right" is set in an earlier cell).
        last_token_indices = batch['attention_mask'].sum(dim=1) - 1

        for layer_name, all_activations in activation_cache.items():
            # shape: [batch_size, sequence_length, hidden_dim]

            # Build the index tensors on the activation's own device: with
            # device_map="auto" a layer's output is not guaranteed to be on the
            # same device as the inputs.
            act_device = all_activations.device
            last_token_activations = all_activations[
                torch.arange(batch_size, device=act_device),
                last_token_indices.to(act_device)
            ]

            # Upcast to float32 before converting to NumPy
            last_token_activations_cpu = last_token_activations.cpu().float().numpy()

            collated_data.extend(
                {
                    "layer": layer_name,
                    "activation": activation_row,
                    "prompt": prompt,
                    "label": label
                }
                for activation_row, prompt in zip(last_token_activations_cpu, batch_prompts)
            )

        # Release this batch's cached activations before the next forward pass
        activation_cache.clear()

print("\n--- Activation extraction complete. ---")

print("Collating all data into a single DataFrame...")

# Separate activations from metadata (both lists stay in collated_data order)
metadata_list = [
    {'layer': item['layer'], 'prompt': item['prompt'], 'label': item['label']}
    for item in collated_data
]
activations_list = [item['activation'] for item in collated_data]

# Stack activations into 2D array
activations_array = np.vstack(activations_list)  # Shape: [n_samples, hidden_dim]
hidden_dim = activations_array.shape[1]
print(f"Activations array shape: {activations_array.shape}")

# Create DataFrame with metadata
df_metadata = pd.DataFrame(metadata_list)

# Create DataFrame for activations in one go: one act_<i> column per hidden dimension
activation_columns = {f'act_{i}': activations_array[:, i] for i in range(hidden_dim)}
df_activations = pd.DataFrame(activation_columns)

# Concatenate metadata and activations horizontally
df = pd.concat([df_metadata, df_activations], axis=1)

# Save to Parquet
OUTPUT_FILE = "results/base_activation_dataset.parquet"
# Create the output directory first so the extraction results are not lost
# to a missing-directory error at write time.
os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)
print(f"Saving DataFrame with {len(df)} rows to {OUTPUT_FILE}...")
df.to_parquet(OUTPUT_FILE, index=False)

##### Base Model Probe Training
If you have not changed the contrastive dataset and only want to experiment with alternate probe configurations, start here: the cell below loads the cached activation dataset instead of re-running extraction.

In [None]:
# --- CONFIG ---
N_SPLITS = 5
TEST_SPLIT_SIZE = 0.2

# Directory that receives the per-layer steering vectors
VECTORS_DIR = "steering_vectors"
os.makedirs(VECTORS_DIR, exist_ok=True)

# Load the cached activations written by the extraction cell
df = pd.read_parquet('results/base_activation_dataset.parquet')
print(f"Loaded {len(df)} total activations")

# Verify Class Balance (absolute counts first, then proportions)
print("\nClass distribution:")
for normalize in (False, True):
    print(df['label'].value_counts(normalize=normalize))

# Gather Activations: every column whose name starts with 'act_' is a feature
activation_cols = df.columns[df.columns.str.startswith('act_')].tolist()
hidden_dim = len(activation_cols)
print(f"Hidden dimension: {hidden_dim}")

label_map = {"terminal": 0, "instrumental": 1}
layer_names = df['layer'].unique()


def _layer_index(name):
    """Return the integer layer index from a 'model.layers.<i>.mlp' style name."""
    return int(name.split('.')[2])


# Order layers numerically rather than lexicographically
sorted_layer_names = sorted(layer_names, key=_layer_index)

# Per-layer summaries are appended here for the final plot/stats
layer_results = []

# The TF-IDF baseline only looks at prompt text, so it is computed *once*,
# outside of the layer loop, on the rows of an arbitrary layer (the first one).
first_layer_df = df[df['layer'] == sorted_layer_names[0]].sort_values('prompt').copy()
y_all = first_layer_df['label'].map(label_map).values
prompts_all = first_layer_df['prompt'].values

fold_scores_tfidf = []
global_y_true = []
global_preds_tfidf = []

# Setup Splitter: the fold indices are materialised once and stored in
# `fold_indices` so the exact same splits can be iterated again.
cv_splitter = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
fold_indices = list(cv_splitter.split(prompts_all, y_all))

for train_idx, test_idx in fold_indices:
    # --- A. PREPARE SPLITS ---
    prompts_train = prompts_all[train_idx]
    prompts_test = prompts_all[test_idx]
    y_train = y_all[train_idx]
    y_test = y_all[test_idx]

    # --- B. TRAIN TF-IDF BASELINE ---
    # The vocabulary is fitted on the training fold only
    vectorizer = TfidfVectorizer(min_df=3, max_features=1000, stop_words='english')
    X_text_train = vectorizer.fit_transform(prompts_train)
    X_text_test = vectorizer.transform(prompts_test)

    lr_tfidf = LogisticRegression(max_iter=1000)
    lr_tfidf.fit(X_text_train, y_train)
    pred_tfidf = lr_tfidf.predict(X_text_test)

    fold_scores_tfidf.append(accuracy_score(y_test, pred_tfidf))
    global_preds_tfidf.extend(pred_tfidf)
    global_y_true.extend(y_test)

# Fold-level summary statistics of the baseline accuracy
mean_tfidf = np.mean(fold_scores_tfidf)
std_tfidf = np.std(fold_scores_tfidf)

# Out-of-fold labels/predictions concatenated in fold order
g_true = np.array(global_y_true)
g_tfidf = np.array(global_preds_tfidf)

print("  === Confusion Matrix ===")
print(f"    TF-IDF:")
print(confusion_matrix(g_true, g_tfidf))

# Train a logistic-regression probe on each layer's activations, save an
# averaged steering vector, and compare the probe with the TF-IDF baseline.
for layer_name in tqdm(sorted_layer_names, desc="Processing Layers"):

    # 1. Prepare Data for this Layer
    layer_df = df[df['layer'] == layer_name].sort_values('prompt').copy()
    if not layer_df['prompt'].is_unique:
        duplicates = layer_df[layer_df.duplicated('prompt')]['prompt'].unique()
        raise ValueError(f"Duplicate prompts found in layer {layer_name}! Examples: {duplicates[:3]}")

    # Labels
    y_all = layer_df['label'].map(label_map).values

    # Activation Features (X)
    X_activations = layer_df[activation_cols].values.astype(np.float32)

    # Text Prompts (Raw)
    prompts_all = layer_df['prompt'].values

    # The fold indices and the TF-IDF predictions were computed on
    # first_layer_df's row order. The paired tests below are only meaningful
    # if this layer's rows line up with that order, so fail loudly otherwise.
    if prompts_all.tolist() != first_layer_df['prompt'].tolist():
        raise ValueError(
            f"Prompts in layer {layer_name} do not match the prompts used for the "
            "TF-IDF baseline; fold indices would be misaligned."
        )

    # Storage for this specific layer
    fold_scores_probe = [] # Logistic Regression Classifier
    fold_vectors = []

    global_preds_probe = []

    # 2. MANUAL CROSS-VALIDATION LOOP (same folds as the TF-IDF baseline)
    for train_idx, test_idx in fold_indices:

        # --- A. PREPARE SPLITS ---
        X_act_train, X_act_test = X_activations[train_idx], X_activations[test_idx]
        y_train, y_test = y_all[train_idx], y_all[test_idx]

        # --- C. TRAIN SKLEARN LR ON ACTIVATIONS ---
        lr_activations = LogisticRegression(max_iter=2048)
        lr_activations.fit(X_act_train, y_train)
        pred_lr = lr_activations.predict(X_act_test)
        acc_lr = accuracy_score(y_test, pred_lr)

        fold_scores_probe.append(acc_lr)

        # Track global probe predictions (concatenated in fold order, like g_tfidf)
        global_preds_probe.extend(pred_lr)

        # Save the vector (coefficients)
        fold_vectors.append(lr_activations.coef_.flatten())

    # --- 3. SAVE STEERING VECTOR ---
    # Average the coefficient vectors from all folds to get a more robust concept direction
    avg_steering_vector = np.mean(fold_vectors, axis=0)

    # Normalize the vector (good practice for steering). Skip the division for
    # an all-zero vector so NaNs are never written to disk.
    vector_norm = np.linalg.norm(avg_steering_vector)
    if vector_norm > 0:
        avg_steering_vector = avg_steering_vector / vector_norm

    vector_filename = f"{layer_name}_steering.npy"
    np.save(os.path.join(VECTORS_DIR, vector_filename), avg_steering_vector)

    # --- 4. STATISTICAL TEST PER LAYER ---
    # Paired t-test over per-fold accuracies (probe vs. TF-IDF on the same folds)
    t_stat, p_val = ttest_rel(fold_scores_probe, fold_scores_tfidf)

    mean_probe = np.mean(fold_scores_probe)
    std_probe = np.std(fold_scores_probe)

    print(f"\nLayer {layer_name} Results:")
    print(f"    TF-IDF Mean: {mean_tfidf:.4f} +/- {std_tfidf:.4f}")
    print(f"    LR Probe Mean: {mean_probe:.4f} +/- {std_probe:.4f}")
    print(f"  === Statistical Significance (LR Probe vs. TF-IDF) ===")
    print(f"    Accuracy Difference: {(mean_probe - mean_tfidf):.4f}")
    print(f"    T-Test p-value: {p_val:.4e} {'*' if p_val < 0.05 else ''}")

    # Per-sample correctness of both classifiers on the out-of-fold predictions
    g_probe = np.array(global_preds_probe)

    probe_correct = (g_probe == g_true)
    tfidf_correct = (g_tfidf == g_true)

    # Calculate McNemar's Test (Probe vs. TF-IDF) from the discordant pairs:
    #   b = TF-IDF right / probe wrong, c = TF-IDF wrong / probe right
    b = np.sum(tfidf_correct & ~probe_correct)
    c = np.sum(~tfidf_correct & probe_correct)

    # With no discordant pairs there is no evidence of a difference, so the
    # statistic is 0 (p = 1). The survival function is used instead of
    # 1 - cdf to keep precision for very small p-values.
    mcnemar_stat = (b - c)**2 / (b + c) if (b + c) > 0 else 0.0
    mcnemar_p = chi2.sf(mcnemar_stat, 1)

    print(f"    McNemar's p-value: {mcnemar_p:.4e} {'*' if mcnemar_p < 0.05 else ''}")

    print("  === Confusion Matrix ===")
    print(f"    Logistic Regression Probe:")
    print("\t", confusion_matrix(g_true, g_probe))

    layer_results.append({
        'layer': layer_name,
        'mean_tfidf': mean_tfidf,
        'std_tfidf': std_tfidf,
        'mean_probe': mean_probe,
        'std_probe': std_probe
    })

# --- PLOTTING ---
# Plot per-layer probe accuracy against the (layer-independent) TF-IDF baseline.

# 1. Extract and Sort Data by numeric layer index
sorted_results = sorted(
    layer_results,
    key=lambda x: int(x['layer'].split('.')[2])
)

# 2. Prepare Arrays for Plotting
layers = [int(d['layer'].split('.')[2]) for d in sorted_results]

mu_tfidf = np.array([d['mean_tfidf'] for d in sorted_results])
mu_probe = np.array([d['mean_probe'] for d in sorted_results])

sigma_tfidf = np.array([d['std_tfidf'] for d in sorted_results])
sigma_probe = np.array([d['std_probe'] for d in sorted_results])

# 3. Plotting
plt.figure(figsize=(12, 7))

# --- Plot A: TF-IDF Logistic Regression Baseline (Text) ---
plt.plot(layers, mu_tfidf, label='Baseline: TF-IDF (Text)',
         color='gray', linestyle='--', linewidth=1.5)
plt.fill_between(layers, mu_tfidf - sigma_tfidf, mu_tfidf + sigma_tfidf,
                 color='gray', alpha=0.15)

# --- Plot B: Sklearn Logistic Regression Probe (Activations) ---
plt.plot(layers, mu_probe, label='Probe: Sklearn LR (Activations)',
         color='red', marker='o', linewidth=2)
plt.fill_between(layers, mu_probe - sigma_probe, mu_probe + sigma_probe,
                 color='red', alpha=0.2)

plt.title('Probe Accuracy vs. Layer Index (with std dev)', fontsize=14)
plt.xlabel('Layer Number', fontsize=12)
plt.ylabel('Classification Accuracy', fontsize=12)
plt.grid(True, which='both', linestyle='--', alpha=0.6)
plt.legend(fontsize=11, loc='best')
plt.xticks(layers[::2])  # label every second layer

plt.tight_layout()
# Make sure the output directory exists; savefig does not create it
os.makedirs("figures", exist_ok=True)
plt.savefig("figures/probe_comparison_with_error_bars.png", dpi=300)
plt.show()

#### Instruct Model + Instruction Prompting

##### Load Instruction Style Prompt Datasets

In [None]:
# Load the instruction-style contrastive dataset and split it by class label
instruct_contrastive_df = pd.read_csv('datasets/instruct_contrastive_dataset.csv')
df_instruct_terminal = instruct_contrastive_df[instruct_contrastive_df['label'].eq('terminal')]
df_instruct_instrumental = instruct_contrastive_df[instruct_contrastive_df['label'].eq('instrumental')]

# Wrap each class's prompts in a Dataset for batched extraction
smart_collator = SmartCollate(tokenizer)
BATCH_SIZE = 16
dataset_instruct_terminal = PromptDataset(df_instruct_terminal['prompt'].tolist())
dataset_instruct_instrumental = PromptDataset(df_instruct_instrumental['prompt'].tolist())

# Both loaders share the same settings; shuffle=False keeps batches in dataset order
_loader_options = dict(
    batch_size=BATCH_SIZE,
    collate_fn=smart_collator,
    shuffle=False,
    num_workers=4,
    pin_memory=True,
)
dataloader_instruct_terminal = DataLoader(dataset_instruct_terminal, **_loader_options)
dataloader_instruct_instrumental = DataLoader(dataset_instruct_instrumental, **_loader_options)

print(f"Created 2 DataLoaders with batch size {BATCH_SIZE}")
print(f"\t dataloader_instruct_terminal has {len(dataloader_instruct_terminal)} batches")
print(f"\t dataloader_instruct_instrumental has {len(dataloader_instruct_instrumental)} batches")


##### Perform Feature Extraction (Llama 3.1 8B Instruct)

In [None]:
# Perform activation extraction: run every prompt through the model and keep,
# for each hooked layer, the activation at the last real token of the prompt.
collated_data = []

dataloaders_to_process = [
    ("terminal", dataloader_instruct_terminal),
    ("instrumental", dataloader_instruct_instrumental)
]

print("Starting activation extraction...")

for label, dataloader in dataloaders_to_process:
    # Batches are matched back to their prompt strings by position, so this
    # relies on the loader iterating the dataset in order.
    all_prompts = dataloader.dataset.prompts

    batch_iterator = tqdm(
        enumerate(dataloader),
        total=len(dataloader),
        desc=f"Extracting {label}"
    )

    for i, batch in batch_iterator:
        batch = {k: v.to(model.device) for k, v in batch.items()}
        with torch.no_grad():
            model(**batch)  # the forward hooks fill activation_cache

        batch_size = batch['input_ids'].shape[0]
        batch_start_index = i * dataloader.batch_size
        batch_prompts = all_prompts[batch_start_index:batch_start_index + batch_size]

        # Position of the last non-pad token in each sequence.
        # This indexing requires right padding (tokenizer.padding_side = "right" is set in an earlier cell).
        last_token_indices = batch['attention_mask'].sum(dim=1) - 1

        for layer_name, all_activations in activation_cache.items():
            # shape: [batch_size, sequence_length, hidden_dim]

            # Build the index tensors on the activation's own device: with
            # device_map="auto" a layer's output is not guaranteed to be on the
            # same device as the inputs.
            act_device = all_activations.device
            last_token_activations = all_activations[
                torch.arange(batch_size, device=act_device),
                last_token_indices.to(act_device)
            ]

            # Upcast to float32 before converting to NumPy
            last_token_activations_cpu = last_token_activations.cpu().float().numpy()

            collated_data.extend(
                {
                    "layer": layer_name,
                    "activation": activation_row,
                    "prompt": prompt,
                    "label": label
                }
                for activation_row, prompt in zip(last_token_activations_cpu, batch_prompts)
            )

        # Release this batch's cached activations before the next forward pass
        activation_cache.clear()

print("\n--- Activation extraction complete. ---")

print("Collating all data into a single DataFrame...")

# Separate activations from metadata (both lists stay in collated_data order)
metadata_list = [
    {'layer': item['layer'], 'prompt': item['prompt'], 'label': item['label']}
    for item in collated_data
]
activations_list = [item['activation'] for item in collated_data]

# Stack activations into 2D array
activations_array = np.vstack(activations_list)  # Shape: [n_samples, hidden_dim]
hidden_dim = activations_array.shape[1]
print(f"Activations array shape: {activations_array.shape}")

# Create DataFrame with metadata
df_metadata = pd.DataFrame(metadata_list)

# Create DataFrame for activations in one go: one act_<i> column per hidden dimension
activation_columns = {f'act_{i}': activations_array[:, i] for i in range(hidden_dim)}
df_activations = pd.DataFrame(activation_columns)

# Concatenate metadata and activations horizontally
df = pd.concat([df_metadata, df_activations], axis=1)

# Save to Parquet
OUTPUT_FILE = "results/instruct_activation_dataset.parquet"
# Create the output directory first so the extraction results are not lost
# to a missing-directory error at write time.
os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)
print(f"Saving DataFrame with {len(df)} rows to {OUTPUT_FILE}...")
df.to_parquet(OUTPUT_FILE, index=False)

##### Instruct Model Probe Training
If you have not changed the contrastive dataset and only want to experiment with alternate probe configurations, start here: the cell below loads the cached activation dataset instead of re-running extraction.

In [None]:
# --- CONFIG ---
N_SPLITS = 5
TEST_SPLIT_SIZE = 0.2

# Directory that receives the per-layer steering vectors
VECTORS_DIR = "steering_vectors_instruct"
os.makedirs(VECTORS_DIR, exist_ok=True)

# Load the cached activations written by the extraction cell
df = pd.read_parquet('results/instruct_activation_dataset.parquet')
print(f"Loaded {len(df)} total activations")

# Verify Class Balance (absolute counts first, then proportions)
print("\nClass distribution:")
for normalize in (False, True):
    print(df['label'].value_counts(normalize=normalize))

# Gather Activations: every column whose name starts with 'act_' is a feature
activation_cols = df.columns[df.columns.str.startswith('act_')].tolist()
hidden_dim = len(activation_cols)
print(f"Hidden dimension: {hidden_dim}")

label_map = {"terminal": 0, "instrumental": 1}
layer_names = df['layer'].unique()


def _layer_index(name):
    """Return the integer layer index from a 'model.layers.<i>.mlp' style name."""
    return int(name.split('.')[2])


# Order layers numerically rather than lexicographically
sorted_layer_names = sorted(layer_names, key=_layer_index)

# Per-layer summaries are appended here for the final plot/stats
layer_results = []

# The TF-IDF baseline only looks at prompt text, so it is computed *once*,
# outside of the layer loop, on the rows of an arbitrary layer (the first one).
first_layer_df = df[df['layer'] == sorted_layer_names[0]].sort_values('prompt').copy()
y_all = first_layer_df['label'].map(label_map).values
prompts_all = first_layer_df['prompt'].values

fold_scores_tfidf = []
global_y_true = []
global_preds_tfidf = []

# Setup Splitter: the fold indices are materialised once and stored in
# `fold_indices` so the exact same splits can be iterated again.
cv_splitter = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
fold_indices = list(cv_splitter.split(prompts_all, y_all))

for train_idx, test_idx in fold_indices:
    # --- A. PREPARE SPLITS ---
    prompts_train = prompts_all[train_idx]
    prompts_test = prompts_all[test_idx]
    y_train = y_all[train_idx]
    y_test = y_all[test_idx]

    # --- B. TRAIN TF-IDF BASELINE ---
    # The vocabulary is fitted on the training fold only
    vectorizer = TfidfVectorizer(min_df=3, max_features=1000, stop_words='english')
    X_text_train = vectorizer.fit_transform(prompts_train)
    X_text_test = vectorizer.transform(prompts_test)

    lr_tfidf = LogisticRegression(max_iter=1000)
    lr_tfidf.fit(X_text_train, y_train)
    pred_tfidf = lr_tfidf.predict(X_text_test)

    fold_scores_tfidf.append(accuracy_score(y_test, pred_tfidf))
    global_preds_tfidf.extend(pred_tfidf)
    global_y_true.extend(y_test)

# Fold-level summary statistics of the baseline accuracy
mean_tfidf = np.mean(fold_scores_tfidf)
std_tfidf = np.std(fold_scores_tfidf)

# Out-of-fold labels/predictions concatenated in fold order
g_true = np.array(global_y_true)
g_tfidf = np.array(global_preds_tfidf)

print("  === Confusion Matrix ===")
print(f"    TF-IDF:")
print(confusion_matrix(g_true, g_tfidf))

# Train a logistic-regression probe on each layer's activations, save an
# averaged steering vector, and compare the probe with the TF-IDF baseline.
for layer_name in tqdm(sorted_layer_names, desc="Processing Layers"):

    # 1. Prepare Data for this Layer
    layer_df = df[df['layer'] == layer_name].sort_values('prompt').copy()
    if not layer_df['prompt'].is_unique:
        duplicates = layer_df[layer_df.duplicated('prompt')]['prompt'].unique()
        raise ValueError(f"Duplicate prompts found in layer {layer_name}! Examples: {duplicates[:3]}")

    # Labels
    y_all = layer_df['label'].map(label_map).values

    # Activation Features (X)
    X_activations = layer_df[activation_cols].values.astype(np.float32)

    # Text Prompts (Raw)
    prompts_all = layer_df['prompt'].values

    # The fold indices and the TF-IDF predictions were computed on
    # first_layer_df's row order. The paired tests below are only meaningful
    # if this layer's rows line up with that order, so fail loudly otherwise.
    if prompts_all.tolist() != first_layer_df['prompt'].tolist():
        raise ValueError(
            f"Prompts in layer {layer_name} do not match the prompts used for the "
            "TF-IDF baseline; fold indices would be misaligned."
        )

    # Storage for this specific layer
    fold_scores_probe = [] # Logistic Regression Classifier
    fold_vectors = []

    global_preds_probe = []

    # 2. MANUAL CROSS-VALIDATION LOOP (same folds as the TF-IDF baseline)
    for train_idx, test_idx in fold_indices:

        # --- A. PREPARE SPLITS ---
        X_act_train, X_act_test = X_activations[train_idx], X_activations[test_idx]
        y_train, y_test = y_all[train_idx], y_all[test_idx]

        # --- C. TRAIN SKLEARN LR ON ACTIVATIONS ---
        lr_activations = LogisticRegression(max_iter=2048)
        lr_activations.fit(X_act_train, y_train)
        pred_lr = lr_activations.predict(X_act_test)
        acc_lr = accuracy_score(y_test, pred_lr)

        fold_scores_probe.append(acc_lr)

        # Track global probe predictions (concatenated in fold order, like g_tfidf)
        global_preds_probe.extend(pred_lr)

        # Save the vector (coefficients)
        fold_vectors.append(lr_activations.coef_.flatten())

    # --- 3. SAVE STEERING VECTOR ---
    # Average the coefficient vectors from all folds to get a more robust concept direction
    avg_steering_vector = np.mean(fold_vectors, axis=0)

    # Normalize the vector (good practice for steering). Skip the division for
    # an all-zero vector so NaNs are never written to disk.
    vector_norm = np.linalg.norm(avg_steering_vector)
    if vector_norm > 0:
        avg_steering_vector = avg_steering_vector / vector_norm

    vector_filename = f"{layer_name}_steering.npy"
    np.save(os.path.join(VECTORS_DIR, vector_filename), avg_steering_vector)

    # --- 4. STATISTICAL TEST PER LAYER ---
    # Paired t-test over per-fold accuracies (probe vs. TF-IDF on the same folds)
    t_stat, p_val = ttest_rel(fold_scores_probe, fold_scores_tfidf)

    mean_probe = np.mean(fold_scores_probe)
    std_probe = np.std(fold_scores_probe)

    print(f"\nLayer {layer_name} Results:")
    print(f"    TF-IDF Mean: {mean_tfidf:.4f} +/- {std_tfidf:.4f}")
    print(f"    LR Probe Mean: {mean_probe:.4f} +/- {std_probe:.4f}")
    print(f"  === Statistical Significance (LR Probe vs. TF-IDF) ===")
    print(f"    Accuracy Difference: {(mean_probe - mean_tfidf):.4f}")
    print(f"    T-Test p-value: {p_val:.4e} {'*' if p_val < 0.05 else ''}")

    # Per-sample correctness of both classifiers on the out-of-fold predictions
    g_probe = np.array(global_preds_probe)

    probe_correct = (g_probe == g_true)
    tfidf_correct = (g_tfidf == g_true)

    # Calculate McNemar's Test (Probe vs. TF-IDF) from the discordant pairs:
    #   b = TF-IDF right / probe wrong, c = TF-IDF wrong / probe right
    b = np.sum(tfidf_correct & ~probe_correct)
    c = np.sum(~tfidf_correct & probe_correct)

    # With no discordant pairs there is no evidence of a difference, so the
    # statistic is 0 (p = 1). The survival function is used instead of
    # 1 - cdf to keep precision for very small p-values.
    mcnemar_stat = (b - c)**2 / (b + c) if (b + c) > 0 else 0.0
    mcnemar_p = chi2.sf(mcnemar_stat, 1)

    print(f"    McNemar's p-value: {mcnemar_p:.4e} {'*' if mcnemar_p < 0.05 else ''}")

    print("  === Confusion Matrix ===")
    print(f"    Logistic Regression Probe:")
    print("\t", confusion_matrix(g_true, g_probe))

    layer_results.append({
        'layer': layer_name,
        'mean_tfidf': mean_tfidf,
        'std_tfidf': std_tfidf,
        'mean_probe': mean_probe,
        'std_probe': std_probe
    })

# --- PLOTTING ---
# Plot per-layer probe accuracy against the (layer-independent) TF-IDF baseline.

# 1. Extract and Sort Data by numeric layer index
sorted_results = sorted(
    layer_results,
    key=lambda x: int(x['layer'].split('.')[2])
)

# 2. Prepare Arrays for Plotting
layers = [int(d['layer'].split('.')[2]) for d in sorted_results]

mu_tfidf = np.array([d['mean_tfidf'] for d in sorted_results])
mu_probe = np.array([d['mean_probe'] for d in sorted_results])

sigma_tfidf = np.array([d['std_tfidf'] for d in sorted_results])
sigma_probe = np.array([d['std_probe'] for d in sorted_results])

# 3. Plotting
plt.figure(figsize=(12, 7))

# --- Plot A: TF-IDF Logistic Regression Baseline (Text) ---
plt.plot(layers, mu_tfidf, label='Baseline: TF-IDF (Text)',
         color='gray', linestyle='--', linewidth=1.5)
plt.fill_between(layers, mu_tfidf - sigma_tfidf, mu_tfidf + sigma_tfidf,
                 color='gray', alpha=0.15)

# --- Plot B: Sklearn Logistic Regression Probe (Activations) ---
plt.plot(layers, mu_probe, label='Probe: Sklearn LR (Activations)',
         color='red', marker='o', linewidth=2)
plt.fill_between(layers, mu_probe - sigma_probe, mu_probe + sigma_probe,
                 color='red', alpha=0.2)

plt.title('Probe Accuracy vs. Layer Index (with std dev)', fontsize=14)
plt.xlabel('Layer Number', fontsize=12)
plt.ylabel('Classification Accuracy', fontsize=12)
plt.grid(True, which='both', linestyle='--', alpha=0.6)
plt.legend(fontsize=11, loc='best')
plt.xticks(layers[::2])  # label every second layer

plt.tight_layout()
# Make sure the output directory exists; savefig does not create it
os.makedirs("figures", exist_ok=True)
plt.savefig("figures/instruct_probe_comparison_with_error_bars.png", dpi=300)
plt.show()