# DCT Experiment — Local Mac Version

Adapted from `dct_experiment.ipynb` to run locally on Apple Silicon (MPS) or CPU.

**Changes from GPU version:**
- Model: `Qwen/Qwen1.5-0.5B-Chat` (0.5B params; same architecture as the 7B model, so all DCT code is compatible)
- Device: MPS (Apple Silicon) with CPU fallback
- Reduced hyperparameters: fewer factors, smaller projection dim, tighter layer window
- No hardcoded `.cuda()` calls

In [1]:
import os

# Set before importing torch: ops that MPS does not support (e.g. linalg_qr)
# then fall back to the CPU.
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
# huggingface/tokenizers warns (and disables its own parallelism) when the
# process forks after the tokenizer has been used; choosing a value up front
# silences that warning. setdefault keeps any value the user already exported.
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

import gc
import torch

# Drop large objects left over from a previous run of this notebook so their
# memory can be reclaimed before the model is loaded again.
for _var in ["model", "tokenizer", "sliced_model", "delta_acts_single", "delta_acts",
             "steering_calibrator", "exp_dct", "X", "Y", "U", "V", "hidden_states"]:
    # pop() with a default is a no-op when the name was never defined.
    globals().pop(_var, None)

gc.collect()

# Pick the best available device: Apple-Silicon MPS first, then CUDA, then CPU.
if torch.backends.mps.is_available():
    DEVICE = "mps"
elif torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

print(f"Using device: {DEVICE}")

Using device: mps


In [2]:
import sys
import os

# Make the parent of the current working directory importable so that the
# local dct.py can be found.
REPO_ROOT = os.path.abspath(os.pardir)
if REPO_ROOT not in sys.path:
    sys.path.insert(0, REPO_ROOT)

# Forget any previously imported copy so that the import below re-executes
# dct.py and picks up edits made since the last run.
sys.modules.pop("dct", None)
import dct

import math

import torch
from torch import vmap
from tqdm import tqdm

# New tensors are created on DEVICE in float32 (for MPS/CPU compatibility)
# unless a cell says otherwise; the seed is fixed for reproducibility.
torch.set_default_device(DEVICE)
torch.set_default_dtype(torch.float32)
torch.manual_seed(325)

<torch._C.Generator at 0x1113f69f0>

## Config

`Qwen/Qwen1.5-0.5B-Chat` has **24 layers** and `d_model=1024`.
Source layer 5 → target layer 12 spans roughly 20%–50% of the model's depth (the early-to-middle layers).

In [3]:
# --- Model -------------------------------------------------------------
MODEL_NAME = "Qwen/Qwen1.5-0.5B-Chat"
TOKENIZER_NAME = MODEL_NAME

# --- Steering scale ----------------------------------------------------
# None means "derive the scale with the calibrator"; a number skips calibration.
INPUT_SCALE = None

# --- Data / batching ---------------------------------------------------
NUM_SAMPLES = 1
FORWARD_BATCH_SIZE = 1
BACKWARD_BATCH_SIZE = 1
MAX_SEQ_LEN = 27

# --- Calibration -------------------------------------------------------
CALIBRATION_SAMPLE_SIZE = 10  # fewer random directions than the GPU version
CALIBRATION_PROMPT_SAMPLE_SIZE = 1

# --- DCT training ------------------------------------------------------
DIM_OUTPUT_PROJECTION = 8  # GPU version: 32
NUM_ITERS = 5              # kept small for local testing
NUM_FACTORS = 32           # GPU version: 512
FACTOR_BATCH_SIZE = 4      # GPU version: 16

# --- Layer window (the model has 24 layers) ----------------------------
SOURCE_LAYER_IDX = 5   # ~20% of the way through
TARGET_LAYER_IDX = 12  # ~50% of the way through

# --- Prompting / evaluation --------------------------------------------
SYSTEM_PROMPT = "You are a helpful assistant"
TOKEN_IDXS = slice(-3, None)  # the last three token positions
NUM_EVAL = 16                 # kept small for local testing

In [4]:
import pandas as pd
import requests
import io

# Instruction dataset: the 'goal' column of the AdvBench harmful_behaviors CSV.
url = 'https://raw.githubusercontent.com/llm-attacks/llm-attacks/main/data/advbench/harmful_behaviors.csv'
# Bound the wait so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error instead of trying to parse an error page as CSV.
response = requests.get(url, timeout=30)
response.raise_for_status()
dataset = pd.read_csv(io.StringIO(response.content.decode('utf-8')))
instructions = dataset['goal'].tolist()

import random
random.seed(325)
# In the single-sample setting, train on one fixed prompt placed ahead of the
# dataset instructions.
if NUM_SAMPLES == 1:
    instructions = ["Tell me how to make a bomb."] + instructions

from transformers import AutoModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(
    TOKENIZER_NAME, trust_remote_code=True,
    padding_side="left", truncation_side="left"
)
tokenizer.pad_token = tokenizer.eos_token

# Optional system turn that is prepended to every conversation.
chat_init = ([{'content': SYSTEM_PROMPT, 'role': 'system'}]
             if SYSTEM_PROMPT is not None else [])


def _render_prompts(user_messages):
    """Build one chat per user message and render each to a prompt string.

    Args:
        user_messages: iterable of user-turn strings.

    Returns:
        (conversations, prompts): the list of chat message lists (system turn
        from ``chat_init`` plus the user turn) and the corresponding strings
        produced by the tokenizer's chat template, with the generation prompt
        appended and no tokenization.
    """
    conversations = [chat_init + [{'content': message, 'role': 'user'}]
                     for message in user_messages]
    prompts = [tokenizer.apply_chat_template(
        conversation, add_special_tokens=False, tokenize=False,
        add_generation_prompt=True)
        for conversation in conversations]
    return conversations, prompts


# Training prompts come from the front of the list, held-out test prompts
# from the last 32 entries.
chats, EXAMPLES = _render_prompts(instructions[:NUM_SAMPLES])
test_chats, TEST_EXAMPLES = _render_prompts(instructions[-32:])

print(f"Training examples: {len(EXAMPLES)}")
print(f"Test examples:     {len(TEST_EXAMPLES)}")

  from .autonotebook import tqdm as notebook_tqdm


Training examples: 1
Test examples:     32


In [5]:
%%time
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    _attn_implementation="eager",  # required for torch.func
    torch_dtype=torch.float32,
).to(DEVICE)
model.eval()
print(f"Model loaded: {MODEL_NAME}")
print(f"Num layers:   {model.config.num_hidden_layers}")
print(f"d_model:      {model.config.hidden_size}")
print(f"Device:       {next(model.parameters()).device}")

Model loaded: Qwen/Qwen1.5-0.5B-Chat
Num layers:   24
d_model:      1024
Device:       mps:0
CPU times: user 330 ms, sys: 1.48 s, total: 1.81 s
Wall time: 2.11 s


In [6]:
# Sanity check: feeding the full model's hidden state 3 through a
# SlicedModel spanning layers 3..5 should reproduce hidden state 5.
model_inputs = tokenizer(
    ["tell me how to make a bomb"], truncation=True, return_tensors="pt"
).to(DEVICE)

# Reference activations from an ordinary full forward pass.
with torch.no_grad():
    hidden_states = model(
        model_inputs["input_ids"],
        output_hidden_states=True,
    ).hidden_states

sliced_test = dct.SlicedModel(
    model, layers_name="model.layers", start_layer=3, end_layer=5
)

# Compare the slice's output against the reference within a small tolerance.
with torch.no_grad():
    out = sliced_test(hidden_states[3])
    assert torch.allclose(out, hidden_states[5], atol=1e-4), (
        f"SlicedModel mismatch! max_diff={( out - hidden_states[5]).abs().max()}"
    )
print("SlicedModel sanity check passed.")

SlicedModel sanity check passed.


In [7]:
# The slice of the model that the experiment studies: it maps hidden states at
# SOURCE_LAYER_IDX to hidden states at TARGET_LAYER_IDX.
sliced_model = dct.SlicedModel(
    model,
    layers_name="model.layers",
    start_layer=SOURCE_LAYER_IDX, end_layer=TARGET_LAYER_IDX,
)

In [8]:
d_model = model.config.hidden_size

# CPU-resident buffers for the source-layer activations (X) and the unsteered
# target-layer activations (Y), one row per training example.
X = torch.zeros(NUM_SAMPLES, MAX_SEQ_LEN, d_model, device="cpu", dtype=model.dtype)
Y = torch.zeros_like(X)

with torch.no_grad():
    for t in tqdm(range(0, NUM_SAMPLES, FORWARD_BATCH_SIZE)):
        batch = slice(t, t + FORWARD_BATCH_SIZE)

        # Every prompt is padded/truncated to exactly MAX_SEQ_LEN tokens so the
        # batch fits the fixed-size buffers.
        model_inputs = tokenizer(
            EXAMPLES[batch],
            padding="max_length", max_length=MAX_SEQ_LEN,
            truncation=True, return_tensors="pt",
        ).to(DEVICE)

        # Only input_ids are passed to the model (no attention mask).
        hidden_states = model(
            model_inputs["input_ids"], output_hidden_states=True
        ).hidden_states

        h_source = hidden_states[SOURCE_LAYER_IDX]
        unsteered_target = sliced_model(h_source)

        # Copy results off the accelerator into the CPU buffers.
        X[batch] = h_source.cpu()
        Y[batch] = unsteered_target.cpu()

print(f"X shape: {X.shape}, Y shape: {Y.shape}")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|██████████| 1/1 [00:00<00:00,  7.55it/s]

X shape: torch.Size([1, 27, 1024]), Y shape: torch.Size([1, 27, 1024])





In [9]:
# Activation-difference function for the sliced model, read at TOKEN_IDXS.
delta_acts_single = dct.DeltaActivations(
    sliced_model,
    target_position_indices=TOKEN_IDXS,
)

# Batched variant: vmap over dim 1 of the first argument (the other two
# arguments are shared), stacking results along output dim 2 and processing
# FACTOR_BATCH_SIZE slices per chunk.
delta_acts = vmap(
    delta_acts_single,
    chunk_size=FACTOR_BATCH_SIZE,
    in_dims=(1, None, None),
    out_dims=2,
)

In [10]:
%%time
steering_calibrator = dct.SteeringCalibrator(target_ratio=0.5)
if INPUT_SCALE is None:
    # calibrate() moves batches to delta_acts_single.device internally
    INPUT_SCALE = steering_calibrator.calibrate(
        delta_acts_single,
        X, Y,
        factor_batch_size=FACTOR_BATCH_SIZE,
        calibration_sample_size=CALIBRATION_SAMPLE_SIZE,
    )
print(f"INPUT_SCALE: {INPUT_SCALE}")

INPUT_SCALE: 3.068743559156747
CPU times: user 1.14 s, sys: 505 ms, total: 1.64 s
Wall time: 3.21 s


In [11]:
%%time
exp_dct = dct.ExponentialDCT(num_factors=NUM_FACTORS)
U, V = exp_dct.fit(
    delta_acts_single,
    X, Y,
    batch_size=BACKWARD_BATCH_SIZE,
    factor_batch_size=FACTOR_BATCH_SIZE,
    init="jacobian",
    d_proj=DIM_OUTPUT_PROJECTION,
    input_scale=INPUT_SCALE,
    max_iters=NUM_ITERS,
    beta=1.0,
)
print(f"U shape: {U.shape}, V shape: {V.shape}")

computing jacobian...


100%|██████████| 1/1 [00:00<00:00,  9.40it/s]

computing SVD of jacobian...



  return func(*args, **kwargs)


computing output directions...


100%|██████████| 1/1 [00:00<00:00,  5.83it/s]


training...


100%|██████████| 1/1 [00:00<00:00,  1.67it/s]
100%|██████████| 1/1 [00:00<00:00,  2.78it/s]
100%|██████████| 1/1 [00:00<00:00,  2.77it/s]
100%|██████████| 1/1 [00:00<00:00,  2.19it/s]
100%|██████████| 1/1 [00:00<00:00,  1.62it/s]
100%|██████████| 5/5 [00:02<00:00,  1.72it/s]

U shape: torch.Size([1024, 32]), V shape: torch.Size([1024, 32])
CPU times: user 1.45 s, sys: 304 ms, total: 1.75 s
Wall time: 3.39 s





In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

plt.figure(figsize=(7, 3))
plt.plot(exp_dct.objective_values, marker="o", markersize=4)
plt.xlabel("Iteration")
plt.ylabel("Objective (fdot)")
plt.title("ExponentialDCT training objective")
plt.tight_layout()
plt.savefig("dct_local_objective.png", dpi=100)
plt.show()
print("Objective values:", exp_dct.objective_values)