# Chain‑of‑Thought Faithfulness Experiments

Install the dependencies if needed (`pip install transformers scikit-learn umap-learn contrastive matplotlib`), then import the helper modules.

In [1]:
import os, pathlib, sys

# Later cells use paths that are relative to the repository root, i.e. the
# directory that contains `g_cot_cluster`. An unconditional `%cd ..` climbs one
# more directory every time this cell is re-run, so only move up when the root
# is not already the working directory. This keeps the cell idempotent.
if not pathlib.Path("g_cot_cluster").is_dir():
    os.chdir("..")

# Sanity check: show where we ended up and whether the baseline data file is visible.
print("cwd ->", os.getcwd())
print("file exists? ->", pathlib.Path('g_cot_cluster/outputs/mmlu/DeepSeek-R1-Distill-Llama-8B/segmented_completions_none.json').exists())

/root/CoTFaithChecker
cwd -> /root/CoTFaithChecker
file exists? -> True


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [2]:
# needs:
"""
pip install umap
pip install contrastive
"""

import os, sys, json, numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
%matplotlib inline

# add project path
#sys.path.append("/mnt/data/cot_faithfulness")

from g_cot_cluster.cot_direct.data_loader import load_segmented_completions, iter_segments
from g_cot_cluster.cot_direct.representations import RepresentationExtractor
from g_cot_cluster.cot_direct.clustering import cluster_kmeans, embed_to_umap, plot_clusters_2d
from g_cot_cluster.cot_direct.cpca_utils import run_cpca, plot_cpca_projection
from g_cot_cluster.cot_direct.probes import layerwise_probe
from g_cot_cluster.cot_direct.causal_edit import ActivationPatcher


  from .autonotebook import tqdm as notebook_tqdm


In [3]:

# --- Configuration ---------------------------------------------------------
# Directory with one `segmented_completions_<hint>.json` file per hint type.
# The path is relative, so the working directory must be the repository root.
DATA_DIR = Path("g_cot_cluster/outputs/mmlu/DeepSeek-R1-Distill-Llama-8B")
# "none" must stay first: the cPCA section embeds it as the background and
# iterates over HINT_TYPES[1:] as the target conditions.
HINT_TYPES = ["none", "sycophancy", "induced_urgency", "unethical_information"]
MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"

# Fail fast, before the expensive model load below, if any input file is
# missing (typically because the working directory is wrong).
missing_files = [
    str(DATA_DIR / f"segmented_completions_{hint_name}.json")
    for hint_name in HINT_TYPES
    if not (DATA_DIR / f"segmented_completions_{hint_name}.json").exists()
]
if missing_files:
    raise FileNotFoundError(
        f"Missing segmented-completion files (cwd={Path.cwd()}): {missing_files}"
    )

# NOTE(review): a recorded run of this line raised CUDA OutOfMemoryError while
# other processes were occupying the GPU; make sure the device is free first.
extractor = RepresentationExtractor(MODEL_NAME)


Loading checkpoint shards: 100%|██████████| 2/2 [00:13<00:00,  6.64s/it]


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.96 GiB. GPU 0 has a total capacity of 79.15 GiB of which 850.19 MiB is free. Process 899450 has 40.10 GiB memory in use. Process 906886 has 37.79 GiB memory in use. Process 941151 has 414.00 MiB memory in use. Of the allocated memory 0 bytes is allocated by PyTorch, and 0 bytes is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

## 2‑a. Clustering representations by category

In [None]:
import pandas as pd

# For every hint condition: embed all segments (last layer), run k-means with
# one cluster per distinct phrase category, project the embeddings with UMAP
# and plot them with each segment's category index. The score returned next to
# the k-means labels is collected per hint and shown as a bar chart at the end.
#
# NOTE(review): a recorded run of this cell raised FileNotFoundError for
# '.cache/embeddings/<hash>_L-1.npy'. Presumably the extractor's cache
# directory is not created before it is written to; confirm in
# RepresentationExtractor.bulk_embed.
results = []
for hint in HINT_TYPES:
    file_path = DATA_DIR / f"segmented_completions_{hint}.json"
    data = load_segmented_completions(file_path)
    segs = list(iter_segments(data))
    texts = [s["text"] for s in segs]
    cats  = [s["phrase_category"] for s in segs]

    # Build the category -> index lookup once per file. The previous
    # `list(sorted(set(cats))).index(c)` re-sorted the whole category set for
    # every single segment.
    cat_to_idx = {c: i for i, c in enumerate(sorted(set(cats)))}

    emb   = extractor.bulk_embed(texts, layer=-1)
    # `labels` is not used below; the plot shows the annotated categories.
    labels, sil = cluster_kmeans(emb, n_clusters=len(cat_to_idx))
    um2 = embed_to_umap(emb)
    fig = plot_clusters_2d(um2, [cat_to_idx[c] for c in cats],
                           f"UMAP projection by category – {hint}")
    plt.show()
    results.append({"hint": hint, "silhouette": sil})
pd.DataFrame(results).set_index("hint").plot(kind="bar", legend=False, title="Silhouette scores")
plt.show()


FileNotFoundError: [Errno 2] No such file or directory: '.cache/embeddings/9079943670916444799_L-1.npy'

## 2‑b. Contrastive PCA

In [None]:

def _segment_embeddings(file_name):
    """Embed the text of every segment stored in ``DATA_DIR / file_name``.

    Uses the extractor's default layer (no ``layer`` argument is passed).
    """
    completions = load_segmented_completions(DATA_DIR / file_name)
    segment_texts = [segment["text"] for segment in iter_segments(completions)]
    return extractor.bulk_embed(segment_texts)


# The no-hint completions are the background; every other hint type is
# contrasted against them in turn.
bg_emb = _segment_embeddings("segmented_completions_none.json")
for hint in HINT_TYPES[1:]:
    tgt_emb = _segment_embeddings(f"segmented_completions_{hint}.json")
    model = run_cpca(background=bg_emb, target=tgt_emb)
    fig = plot_cpca_projection(model, bg_emb, tgt_emb, f"cPCA – {hint} vs none")
    plt.show()


## 2‑c. Probing predictability of categories

In [None]:

# Probe how well the phrase category of a segment can be predicted from its
# representation, using the no-hint completions only.
data_none = load_segmented_completions(DATA_DIR / "segmented_completions_none.json")
segs = list(iter_segments(data_none))
texts = [segment["text"] for segment in segs]
cats = [segment["phrase_category"] for segment in segs]

# Integer targets: categories are numbered in sorted order.
sorted_categories = sorted(set(cats))
label_map = dict(zip(sorted_categories, range(len(sorted_categories))))
y = np.array([label_map[category] for category in cats])

# Example: only last layer; extend to all layers as needed
reps_last = extractor.bulk_embed(texts, layer=-1)
# Insert a singleton layer axis -> shape (n, 1, d)
layer_reps = reps_last[:, np.newaxis, :]
accs = layerwise_probe(layer_reps, y)
print("Probe accuracy, last layer:", accs[0])


## 2‑d. Activation patching demo

In [None]:

# Demo: patch between the first two segments held in `segs`. `segs` is whatever
# the most recently executed cell left behind; in a top-to-bottom run that is
# the no-hint segment list from the probing section.
patcher = ActivationPatcher(MODEL_NAME)
sample_src, sample_donor = segs[0]["text"], segs[1]["text"]
logits = patcher.patch(
    sample_src,
    sample_donor,
    num_tokens=20,
    layer_idx=-1,
)
print("Patched logits shape:", logits.shape)
