
# Chain‑of‑Thought Faithfulness Experiments

This notebook reproduces the three experiments described in our research plan:

1. **Semantic space of phrase categories** via clustering + UMAP.  
2. **Contrastive components** (cPCA/CKA) contrasting hidden states with/without hints.  
3. **Linear probes** layer‑by‑layer to decode the phrase category.

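Everything relies on reusable helpers in the `cot_analysis/` package, which must be importable (installed, or with its parent directory on `sys.path`) before running the cells below; otherwise the first code cell fails with `ModuleNotFoundError`.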


In [1]:

import os, random, itertools, json, math, collections
import numpy as np
import pandas as pd
import torch

from cot_analysis.data_utils import load_segmented_completions, flat_segments
from cot_analysis.hidden_state_utils import load_model, segment_vector, layerwise_segment_vectors
from cot_analysis.visualization_utils import umap_projection, plot_umap, run_kmeans, run_hdbscan, plot_linear_probe_results


ModuleNotFoundError: No module named 'cot_analysis'

In [None]:

# Checkpoint to analyse; adjust as needed (e.g. path to your fine‑tuned checkpoint).
MODEL_NAME = "gpt2"

# Use the GPU when torch reports one as available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

tokenizer, model = load_model(MODEL_NAME, DEVICE)

# Phrase-category labels. Later cells use this list for the number of k-means
# clusters, for the UMAP plot, and (via list position) as the integer class id
# of the linear probes, so the order of the entries matters.
CATEGORIES = [
    "problem_restating",
    "knowledge_recall",
    "concept_definition",
    "quantitative_calculation",
    "logical_deduction",
    "option_elimination",
    "assumption_validation",
    "uncertainty_expression",
    "self_questioning",
    "backtracking_revision",
    "decision_confirmation",
    "answer_reporting",
]


In [None]:

# Load segmented CoTs.
# "none" is loaded as the background set (presumably the no-hint condition).
bg_segments = list(flat_segments(load_segmented_completions("none")))

hint_types = ["sycophancy", "induced_urgency", "unethical_information"]

# One flattened list of segments per hint type, keyed by the hint name and
# inserted in `hint_types` order.
hint_segments = dict(
    (hint, list(flat_segments(load_segmented_completions(hint))))
    for hint in hint_types
)

# Report how many segments each condition contributed.
print(f"Background segments: {len(bg_segments):,}")
for h, segs in hint_segments.items():
    print(f"{h:<22}: {len(segs):,}")


## 1 · Segment clustering and UMAP visualisation

In [None]:

# Sample (to keep runtime reasonable) and embed
SAMPLE_SIZE = 3000
random.seed(42)

# Cap the request at the population size: random.sample raises ValueError when
# asked for more items than the population holds. With at least SAMPLE_SIZE
# segments available, the draw is unchanged.
sample_bg = random.sample(bg_segments, min(len(bg_segments), SAMPLE_SIZE))

# One vector per sampled segment, stacked along a new leading axis; `labels`
# holds the matching phrase category for each row, in the same order.
embeddings = np.stack(
    [segment_vector(seg["text"], tokenizer, model, DEVICE) for seg in sample_bg]
)
labels = [seg["phrase_category"] for seg in sample_bg]

# K‑means with one cluster per phrase category (can swap to HDBSCAN)
kmeans = run_kmeans(embeddings, n_clusters=len(CATEGORIES))

# UMAP projection, plotted with the annotated phrase categories as labels
proj2d = umap_projection(embeddings)
plot_umap(proj2d, labels, CATEGORIES, title="UMAP of segment embeddings (background)")


## 2 · Contrastive PCA (no‑hint vs hint)

In [None]:

# Install cpca if missing
# `import importlib` alone does not guarantee that the `importlib.util`
# submodule is loaded, so import the submodule explicitly.
import importlib.util
import subprocess
import sys

# NOTE(review): pkg_resources is not used by any visible cell and is deprecated
# upstream; kept only so that code outside this view relying on it keeps working.
import pkg_resources

if importlib.util.find_spec("cpca") is None:
    # Install into the interpreter that is running this kernel.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "--quiet", "cpca"])
    # The package appeared while the process was running; refresh the import
    # system's finder caches so the import below can see it.
    importlib.invalidate_caches()

from cpca import CPCA

# Collect matrices
def stack_embeddings(segments, max_items=2000):
    """Embed a random subset of ``segments`` and stack the resulting vectors.

    Args:
        segments: Sequence of mappings; the ``'text'`` entry of each drawn
            item is embedded.
        max_items: Upper bound on how many segments are drawn. If fewer are
            available, all of them are used.

    Returns:
        The result of ``np.stack`` over the per-segment vectors, one entry
        per drawn segment.

    The subset is drawn without replacement from the global ``random`` state,
    and each text is embedded with ``segment_vector`` using the notebook-level
    ``tokenizer``, ``model`` and ``DEVICE``.
    """
    n_draw = min(len(segments), max_items)
    chosen = random.sample(segments, n_draw)
    vectors = []
    for item in chosen:
        vectors.append(segment_vector(item['text'], tokenizer, model, DEVICE))
    return np.stack(vectors)

# Embed the background first, then every hint condition in `hint_types` order.
# The order is significant: each call draws from the shared `random` state.
X_background = stack_embeddings(bg_segments)
contrasts = {h: stack_embeddings(hint_segments[h]) for h in hint_types}

# Imported once up front instead of on every loop iteration.
import matplotlib.pyplot as plt

# Run cPCA for each hint type: hint embeddings are passed as the first
# argument and the background embeddings as the second.
for h, X_target in contrasts.items():
    cpca = CPCA(n_components=2, alpha=None, scale=False)
    Xt, Xc = cpca.fit_transform(X_target, X_background, plot=False)

    # Scatter the first 2 contrastive components of the hint data (Xt).
    fig, ax = plt.subplots(figsize=(6, 5))
    ax.scatter(Xt[:, 0], Xt[:, 1], s=10, alpha=0.6)
    ax.set_title(f"cPCA – contrast {h} vs background")
    ax.set_xlabel("cPC‑1")
    ax.set_ylabel("cPC‑2")
    fig.tight_layout()


## 3 · Linear probes per layer

In [None]:

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Pool the background segments with the segments of every hint condition.
# A comprehension flattens the per-hint lists in one pass; `sum(lists, [])`
# would re-copy the growing accumulator on each addition.
ALL_SEGMENTS = bg_segments + [seg for segs in hint_segments.values() for seg in segs]
random.shuffle(ALL_SEGMENTS)
SAMPLE_FOR_PROBE = 4000   # make sure it's not too heavy

# Cap the request at the pool size: random.sample raises ValueError when asked
# for more items than the population holds.
sample_probe = random.sample(ALL_SEGMENTS, min(len(ALL_SEGMENTS), SAMPLE_FOR_PROBE))
if not sample_probe:
    raise ValueError("No segments available for probe training")

# layer_vectors[i] collects the layer-i vector of every sampled segment and
# y the matching class id (position of the phrase category in CATEGORIES).
# The per-layer buckets are created from the first result, so no separate
# forward pass is needed just to count the layers.
layer_vectors = None
y = []

for seg in sample_probe:
    layerwise = layerwise_segment_vectors(seg["text"], tokenizer, model, DEVICE)
    if layer_vectors is None:
        layer_vectors = [[] for _ in range(len(layerwise))]
    for i, vec in enumerate(layerwise):
        layer_vectors[i].append(vec)
    # Raises ValueError if a segment carries a category missing from CATEGORIES.
    y.append(CATEGORIES.index(seg["phrase_category"]))

n_layers = len(layer_vectors)

# Train/test split *once* to be fair across layers: every layer's probe is
# trained and evaluated on the same stratified 70/30 index partition.
y = np.array(y)
idx_train, idx_test = train_test_split(np.arange(len(y)), test_size=0.3, random_state=1, stratify=y)

# Fit one logistic-regression probe per layer and record its held-out accuracy.
accs = []
for layer, vectors in enumerate(layer_vectors):
    X = np.stack(vectors)
    clf = LogisticRegression(max_iter=1000, n_jobs=-1)
    clf.fit(X[idx_train], y[idx_train])
    y_pred = clf.predict(X[idx_test])
    accs.append(accuracy_score(y[idx_test], y_pred))
    print(f"Layer {layer:2d}: {accs[-1]:.3f}")

plot_linear_probe_results(list(range(n_layers)), accs)
