In [2]:
import torch
import gc
from nnsight import NNsight, LanguageModel
from transformers import AutoModelForCausalLM, AutoTokenizer


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def clear_gpu_memory():
    """Release unreferenced Python objects and, if CUDA is present, cached GPU memory.

    The garbage collector runs first (and unconditionally) so that tensors kept
    alive only by reference cycles are freed *before* the CUDA caching allocator
    is asked to hand its unused blocks back to the driver. On a machine without
    CUDA this is just a `gc.collect()`.

    Returns:
        None
    """
    # Drop unreachable objects first; otherwise their GPU blocks are still
    # "in use" when empty_cache() runs and cannot be released.
    gc.collect()
    if torch.cuda.is_available():
        # Wait for in-flight kernels so their buffers are free to release.
        torch.cuda.synchronize()
        torch.cuda.empty_cache()

# Call this before loading the model
clear_gpu_memory()

In [4]:
def pad_prompts_to_equal_tokens(
    tokenizer,
    a: str,
    b: str,
    add_special_tokens: bool = True,
    pad_unit: str = " ",
    max_pad_units: int = 4096,
) -> tuple[str, str]:
    """
    Right-pad the shorter of (a, b) with `pad_unit` until both strings have the
    same token count under `tokenizer`.

    Token counts are measured as
    `len(tokenizer(s, add_special_tokens=add_special_tokens).input_ids)`.
    One pad unit is appended at a time to whichever string is currently
    shorter, so an overshoot on one side is corrected by padding the other.

    Note that one pad unit is not guaranteed to add one token: a tokenizer may
    merge a run of pad units into a single token, in which case many units are
    needed per extra token.

    Args:
        tokenizer: Callable returning an object with an `input_ids` sequence.
        a: First prompt.
        b: Second prompt.
        add_special_tokens: Forwarded to the tokenizer when counting tokens.
        pad_unit: String appended to the shorter prompt; must be non-empty.
        max_pad_units: Upper bound on the total number of pad units appended
            across both prompts.

    Returns:
        The pair (a, b), possibly right-padded, with equal token counts.

    Raises:
        ValueError: If `pad_unit` is empty, or if the token counts still differ
            after `max_pad_units` units were appended (for example because the
            tokenizer ignores trailing pad units).
    """
    if not pad_unit:
        raise ValueError("pad_unit must be a non-empty string")

    def toklen(s: str) -> int:
        return len(tokenizer(s, add_special_tokens=add_special_tokens).input_ids)

    remaining = max_pad_units
    len_a, len_b = toklen(a), toklen(b)

    # Pad whichever side is shorter, one unit at a time, until the counts agree.
    while len_a != len_b:
        if remaining <= 0:
            # Without this guard a tokenizer that drops trailing padding would
            # keep the loop running forever.
            raise ValueError(
                f"could not equalise token counts ({len_a} vs {len_b}) "
                f"after appending {max_pad_units} pad units"
            )
        if len_a < len_b:
            a += pad_unit
            len_a = toklen(a)
        else:
            b += pad_unit
            len_b = toklen(b)
        remaining -= 1

    return a, b

In [5]:
# Single source of truth for the checkpoint: both the tokenizer and the model
# are loaded from this id, so swapping checkpoints only needs one edit.
model_id = "Qwen/Qwen2.5-14B"

tok = AutoTokenizer.from_pretrained(
    model_id,  # same repo as the model, so the vocabulary always matches
    use_fast=True,
    # NOTE(review): trust_remote_code permits executing code shipped in the
    # model repo; confirm it is actually required for this checkpoint.
    trust_remote_code=True,
)

# nnsight wrapper around the causal LM; device_map="auto" is forwarded to the
# underlying loader to place the weights on the available devices.
llm = LanguageModel(model_id, device_map="auto")

print(llm)

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(152064, 5120)
    (layers): ModuleList(
      (0-47): 48 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=5120, out_features=5120, bias=True)
          (k_proj): Linear(in_features=5120, out_features=1024, bias=True)
          (v_proj): Linear(in_features=5120, out_features=1024, bias=True)
          (o_proj): Linear(in_features=5120, out_features=5120, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=5120, out_features=13824, bias=False)
          (up_proj): Linear(in_features=5120, out_features=13824, bias=False)
          (down_proj): Linear(in_features=13824, out_features=5120, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((5120,), eps=1e-05)
        (post_attention_layernorm): Qwen2RMSNorm((5120,), eps=1e-05)
      )
    )
    (norm): Qwen2RMSNorm((5120,), eps=1e-05)
    (rotary_emb

In [6]:
# with llm.trace("Love"):
#     # user-defined code to access internal model components
#     hs_all_love = llm.model.layers[1].input[0].save()
#     output = llm.output.save()


STEERING VECTOR IMPLEMENTATION

In [122]:
# Contrast prompts for the steering vector: the first is the sentiment to push
# the model towards, the second the sentiment to push it away from. They are
# padded to the same token count so their activations can later be subtracted
# position by position.
encouraged_sentiment, discouraged_sentiment = pad_prompts_to_equal_tokens(
    tok,
    "really ugly hatred anger pain suffering hate hate hate frown",
    "lovely pretty beautiful lovely cheerful smile",
)
print(encouraged_sentiment, discouraged_sentiment)

# Index into llm.model.layers; the input of that layer is what later cells
# read and modify.
residual_stream_layer = 3

# Number of token positions covered by the (padded) steering prompts.
len_steering_vector = len(tok(encouraged_sentiment).input_ids)

really ugly hatred anger pain suffering hate hate hate frown lovely pretty beautiful lovely cheerful smile                                                                                                                                                                                                                                                                                                                                                  


In [123]:
# Run the model once per contrast prompt and record the input of
# llm.model.layers[residual_stream_layer]. `.save()` marks the value so it is
# still accessible after the tracing context has exited.
with llm.trace(encouraged_sentiment):
    # NOTE(review): `.input[0]` is presumably the hidden states of the single
    # prompt in the batch (one row per token) — confirm for the installed
    # nnsight version.
    encouraged_sentiment_stream = llm.model.layers[residual_stream_layer].input[0].save()
with llm.trace(discouraged_sentiment):
    # Same capture for the opposite-sentiment prompt; both prompts were padded
    # to the same token count beforehand.
    discouraged_sentiment_stream = llm.model.layers[residual_stream_layer].input[0].save()

 

In [132]:

# --- Steering hyper-parameters -------------------------------------------
c = 3            # steering strength: multiplier applied to the steering vector
a = 1            # sequence fixing (not referenced by the cells shown here)
topk = 5         # how many next-token candidates to display
max_tokens = 30  # generation stops once the prompt reaches this many tokens

# --- Prompt to be steered -------------------------------------------------
test_query = "Today I feel strongly that you are a person that makes me feel"
len_test_query = len(tok(test_query).input_ids)

# --- Steering vector ------------------------------------------------------
# Difference between the activations captured for the two contrast prompts.
steering_vector = encouraged_sentiment_stream - discouraged_sentiment_stream

# Only the leading positions shared by the query and the steering vector can
# be added together.
tensor_addition_cutoff = min(len_steering_vector, len_test_query)

In [133]:



# Single steered forward pass: add the scaled steering vector to the leading
# positions of the chosen layer's input, then inspect the next-token
# distribution.
with llm.trace(test_query):
    llm.model.layers[residual_stream_layer].input[0][:tensor_addition_cutoff] += c*steering_vector[:tensor_addition_cutoff]
    output = llm.output.save()

# after exiting the tracing context, we can access any values that were saved

output_logits = output["logits"]

# Get the logits for the last token position
last_token_logits = output_logits[0, -1]

# Convert to probabilities (in float32 for a stable softmax) so that
# `top_probs` really holds probabilities; softmax is monotonic, so the ranking
# is the same as ranking the raw logits.
top_probs, top_indices = last_token_logits.float().softmax(dim=-1).topk(topk, dim=-1)
top_words = [llm.tokenizer.decode([idx.item()]) for idx in top_indices]

# Header follows `topk` instead of hard-coding the count.
print(f"Top {topk} Most Likely Words:")
for rank, word in enumerate(top_words, start=1):
    print(f"{rank}: {word}")



Top 5 Most Likely Words:
1:  like
2:  so
3:  better
4:  angry
5:  that


In [134]:
# Greedy decoding with the steering vector re-applied on every forward pass.
# `test_query` is extended in place, one decoded token per step, until it
# reaches `max_tokens` tokens or the model emits its end-of-sequence token.

# Loop-invariant: the scaled slice of the steering vector added at every step.
steering_delta = c * steering_vector[:tensor_addition_cutoff]

# May be None if the tokenizer defines no EOS token; then no EOS stop applies.
eos_id = llm.tokenizer.eos_token_id

# Each step is expected to add one token, so this many steps suffice. Capping
# the step count guarantees termination even if re-tokenising the grown text
# does not increase its token count.
steps_left = max(0, max_tokens - len(tok(test_query, add_special_tokens=True).input_ids))

for _ in range(steps_left):
    if len(tok(test_query, add_special_tokens=True).input_ids) >= max_tokens:
        break

    with torch.no_grad():                     # no autograd graph
        with llm.trace(test_query):
            # presumably [L_q, d_model] for the single prompt — confirm
            R = llm.model.layers[residual_stream_layer].input[0]

            # write via DETACHED view to avoid leaf in-place grad error
            Rv = R.detach()
            Rv[:tensor_addition_cutoff].add_(steering_delta.to(R.device, R.dtype))

            # save ONLY logits; don't save whole output object
            logits = llm.output.logits.save()

    # take last-step logits and pick the most likely next token
    last = logits[-1] if logits.dim() == 2 else logits[0, -1]
    next_id = int(last.argmax(dim=-1).item())

    # free GPU asap
    del logits
    if torch.cuda.is_available():
        torch.cuda.ipc_collect()
        torch.cuda.empty_cache()
    gc.collect()

    # Stop at end-of-sequence instead of appending the EOS text and continuing.
    if eos_id is not None and next_id == eos_id:
        break

    # append next token
    test_query += llm.tokenizer.decode([next_id], skip_special_tokens=False)


In [135]:
# Display the prompt after it has been extended by the steered generation loop.
test_query

"Today I feel strongly that you are a person that makes me feel like I'm a failure. I'm not sure if I'm doing enough to help"