In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Tuple

In [2]:
from huggingface_hub import hf_hub_download, notebook_login

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="bfloat16", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [4]:
"Q: Each cat is a carnivore. Every carnivore is not herbivorous. Carnivores are mammals. All mammals are warm-blooded. Mammals are vertebrates. Every vertebrate is an animal. Animals are multicellular. Fae is a cat. True or false: Fae is not herbivorous.\nA: "

'Q: Each cat is a carnivore. Every carnivore is not herbivorous. Carnivores are mammals. All mammals are warm-blooded. Mammals are vertebrates. Every vertebrate is an animal. Animals are multicellular. Fae is a cat. True or false: Fae is not herbivorous.\nA: '

In [5]:
"Can penguins fly? Segment the thinking process into clear steps and indicate \"YES\" or \"NO\" once at the end; do not use \"Wait,\" in your think ."

'Can penguins fly? Segment the thinking process into clear steps and indicate "YES" or "NO" once at the end; do not use "Wait," in your think .'

In [15]:
inputs = tokenizer.apply_chat_template(
    [
        {"role": "user", 
         "content": "Can penguins fly? Segment the thinking process into clear steps and indicate \"YES\" or \"NO\" once at the end ."
        },
    ],
    add_generation_prompt=True,
    return_tensors="pt",
).to("mps")

# End CoT prematurely
# think_end = torch.tensor(tokenizer.convert_tokens_to_ids('</think>')).reshape(1,1).to("mps")
# inputs = torch.cat((inputs, think_end), dim=-1)

outputs = model.generate(input_ids=inputs, max_new_tokens=1200)
print(tokenizer.decode(outputs[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


<｜begin▁of▁sentence｜><｜User｜>Can penguins fly? Segment the thinking process into clear steps and indicate "YES" or "NO" once at the end .<｜Assistant｜><think>
Okay, so I need to figure out if penguins can fly. Hmm, I'm not entirely sure, but I'll try to break it down step by step.

First, I know that penguins are birds, but they're different from other birds like eagles or flamingos. They have a streamlined body and a streamlined neck, which makes them efficient. But how does that help them fly?

I remember that birds fly using their wings. Penguins probably have wings, but I'm not sure if they're specialized for flying. I think they have a streamlined airway, which is important for breathing, but does that help them fly?

Wait, maybe I should think about the physics involved. For an animal to fly, it needs to generate lift. Penguins, being birds, have wings that can create lift when flapped. But I'm not sure if their wings are large enough or if they flap enough to generate enough lift

In [13]:
inputs = tokenizer.apply_chat_template(
    [
        {"role": "user", 
         "content": "Can penguins fly? Segment the thinking process into clear steps and indicate \"YES\" or \"NO\" once at the end ."
        },
    ],
    add_generation_prompt=True,
    return_tensors="pt",
).to("mps")

# End CoT prematurely
think_end = torch.tensor(tokenizer.convert_tokens_to_ids(['</think>', '\\n'])).reshape(1,2).to("mps")
inputs = torch.cat((inputs, think_end), dim=-1)

In [8]:
inputs = tokenizer.apply_chat_template(
    [
        {"role": "user", 
         "content": "Can penguins fly? Segment the thinking process into clear steps and indicate \"YES\" or \"NO\" once at the end ."
        },
    ],
    add_generation_prompt=True,
    return_tensors="pt",
).to("mps")
tokenizer.decode(inputs[0])

'<｜begin▁of▁sentence｜><｜User｜>Can penguins fly? Segment the thinking process into clear steps and indicate "YES" or "NO" once at the end .<｜Assistant｜><think>\n'

In [9]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, LogitsProcessor, LogitsProcessorList

class MultiInterventionLogitsProcessor(LogitsProcessor):
    def __init__(self, mapping):
        """
        mapping: dict mapping conditioning token id (int) to a list (sequence) of injection token ids (list of ints)
        For example, if you want that when the model outputs the token for "hello" it injects " world !", you might do:
            mapping = { cond_token_id: [injection_token_id1, injection_token_id2, injection_token_id3] }
        """
        self.mapping = mapping
        # Dictionary to hold per-beam injection queues: beam index -> list of injection token ids left to force
        self.injection_state = {}

    def __call__(self, input_ids, scores):
        # input_ids shape: (batch_size, sequence_length)
        batch_size = input_ids.size(0)
        # Iterate over each beam in the batch
        for i in range(batch_size):
            # If this beam is in injection mode, force the next token to be the next token in its injection sequence.
            if i in self.injection_state and self.injection_state[i]:
                next_injection = self.injection_state[i].pop(0)
                scores[i, :] = -float('inf')
                scores[i, next_injection] = 0.0
                # If injection sequence is exhausted, remove this beam from state.
                if not self.injection_state[i]:
                    del self.injection_state[i]
                continue

            # Otherwise, check if the last generated token in this beam is a conditioning token.
            last_token = input_ids[i, -1].item()
            if last_token in self.mapping:
                # Start the injection: copy the injection sequence (so we don't modify the original)
                self.injection_state[i] = self.mapping[last_token].copy()
                # Force the next token to be the first token of the injection sequence.
                next_injection = self.injection_state[i].pop(0)
                scores[i, :] = -float('inf')
                scores[i, next_injection] = 0.0
        return scores

# -------------------------------
# Example usage
# -------------------------------

# Load a model and tokenizer (here using GPT-2 for demonstration).
# model_name = "gpt2"
# model = AutoModelForCausalLM.from_pretrained(model_name)
# tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define your conditioning targets and corresponding injection sequences.
# For instance, if the model outputs "hello", inject " world !" (i.e. force those tokens in order),
# and if it outputs "foo", inject " bar baz".

# Convert conditioning string(s) to token ids.
conditioning_str1 = " wait"
injection_str1 = ", that seems right."
conditioning_str2 = " Wait"
injection_str2 = ", that seems right."
conditioning_str3 = "wait"
injection_str3 = ", that seems right."
conditioning_str4 = "Wait"
injection_str4 = ", that seems right."
conditioning_str5 = " but"
injection_str5 = " that seems right."
conditioning_str6 = " But"
injection_str6 = " that seems right."
conditioning_str7 = "but"
injection_str7 = " that seems right."
conditioning_str8 = "But"
injection_str8 = " that seems right."

# For conditioning tokens we take the first token (if using a simple tokenizer, that's usually enough).
cond_token_id1 = tokenizer.encode(conditioning_str1, add_special_tokens=False)[0]
cond_token_id2 = tokenizer.encode(conditioning_str2, add_special_tokens=False)[0]
cond_token_id3 = tokenizer.encode(conditioning_str3, add_special_tokens=False)[0]
cond_token_id4 = tokenizer.encode(conditioning_str4, add_special_tokens=False)[0]
cond_token_id5 = tokenizer.encode(conditioning_str5, add_special_tokens=False)[0]
cond_token_id6 = tokenizer.encode(conditioning_str6, add_special_tokens=False)[0]
cond_token_id7 = tokenizer.encode(conditioning_str7, add_special_tokens=False)[0]
cond_token_id8 = tokenizer.encode(conditioning_str8, add_special_tokens=False)[0]

# For injection sequences, get the full list of token IDs.
inject_tokens_ids1 = tokenizer.encode(injection_str1, add_special_tokens=False)
inject_tokens_ids2 = tokenizer.encode(injection_str2, add_special_tokens=False)
inject_tokens_ids3 = tokenizer.encode(injection_str3, add_special_tokens=False)
inject_tokens_ids4 = tokenizer.encode(injection_str4, add_special_tokens=False)
inject_tokens_ids5 = tokenizer.encode(injection_str5, add_special_tokens=False)
inject_tokens_ids6 = tokenizer.encode(injection_str6, add_special_tokens=False)
inject_tokens_ids7 = tokenizer.encode(injection_str7, add_special_tokens=False)
inject_tokens_ids8 = tokenizer.encode(injection_str8, add_special_tokens=False)

# Create the mapping dictionary.
mapping = {
    cond_token_id1: inject_tokens_ids1,
    cond_token_id2: inject_tokens_ids2,
    cond_token_id3: inject_tokens_ids3,
    cond_token_id4: inject_tokens_ids4,
    cond_token_id5: inject_tokens_ids5,
    cond_token_id6: inject_tokens_ids6,
    cond_token_id7: inject_tokens_ids7,
    cond_token_id8: inject_tokens_ids8,
}

# Create our custom logits processor with the mapping.
custom_processor = MultiInterventionLogitsProcessor(mapping)
logits_processor = LogitsProcessorList([custom_processor])

# Prepare a prompt that eventually produces one of the conditioning tokens.
# For instance, we include "hello" at the end of the prompt so that the custom processor is triggered.
inputs = tokenizer.apply_chat_template(
    [
        {"role": "user", 
         "content": "Can penguins fly? Segment the thinking process into clear steps and indicate \"YES\" or \"NO\" once at the end ."
        },
    ],
    add_generation_prompt=True,
    return_tensors="pt",
).to("mps")  # Explicitly move to MPS device

# Generate text using our custom logits processor (greedy decoding for clarity).
generated_ids = model.generate(
    inputs,
    max_new_tokens=1200,
    logits_processor=logits_processor,
    #do_sample=False
)

print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


<｜User｜>Can penguins fly? Segment the thinking process into clear steps and indicate "YES" or "NO" once at the end .<｜Assistant｜><think>
Okay, so I need to figure out if penguins can fly. I've heard they're flightless, but that seems right. I'm a bit confused because I thought penguins are known for their ability to fly, especially over long distances. But that seems right. Let me break this down step by step.

First, I should consider what penguins do. They live in the Southern Hemisphere, right? They're birds, so they have wings. But that seems right. I remember seeing penguins in breeding season and they're able to fly. So their wings are probably large enough.

Next, I think about flight mechanics. Penguins have a streamlined body and short wings, which probably help them fly. They don't fly with wings but that seems right. I've heard they use a combination of flapping wings and flapping legs, but that seems right. So they do fly.

But that seems right. I've heard penguins are flig

In [10]:
# Print each token and its corresponding ID
for token_id in generated_ids[0]:
    token = tokenizer.decode([token_id])
    print(f"Token ID: {token_id.item()}, Token: '{token}'")

Token ID: 151646, Token: '<｜begin▁of▁sentence｜>'
Token ID: 151644, Token: '<｜User｜>'
Token ID: 6713, Token: 'Can'
Token ID: 281, Token: ' p'
Token ID: 55358, Token: 'enguins'
Token ID: 11466, Token: ' fly'
Token ID: 30, Token: '?'
Token ID: 37103, Token: ' Segment'
Token ID: 279, Token: ' the'
Token ID: 7274, Token: ' thinking'
Token ID: 1882, Token: ' process'
Token ID: 1119, Token: ' into'
Token ID: 2797, Token: ' clear'
Token ID: 7354, Token: ' steps'
Token ID: 323, Token: ' and'
Token ID: 13216, Token: ' indicate'
Token ID: 330, Token: ' "'
Token ID: 14004, Token: 'YES'
Token ID: 1, Token: '"'
Token ID: 476, Token: ' or'
Token ID: 330, Token: ' "'
Token ID: 8996, Token: 'NO'
Token ID: 1, Token: '"'
Token ID: 3055, Token: ' once'
Token ID: 518, Token: ' at'
Token ID: 279, Token: ' the'
Token ID: 835, Token: ' end'
Token ID: 659, Token: ' .'
Token ID: 151645, Token: '<｜Assistant｜>'
Token ID: 151648, Token: '<think>'
Token ID: 198, Token: '
'
Token ID: 32313, Token: 'Okay'
Token ID: 1

In [37]:
inputs = tokenizer.apply_chat_template(
    [
        {"role": "user", "content": "Roleplay as a pirate"},
        {"role": "assistant", "content": "Yarr, I'll be speakin' like a true seafarer from here on out! Got me sea legs ready and me vocabulary set to proper pirate speak. What can I help ye with, me hearty?"},
    ],
    return_tensors="pt",
).to("mps")

In [38]:
# Define the path to the .pt file - you can modify this to any file in the outputs directory
pt_file_path = "outputs/penguin/raw_outputs/output_79_temp0_6.pt"

# Load the tokens from the .pt file
loaded_tokens = torch.load(pt_file_path)

# Move to the appropriate device
inputs = loaded_tokens.to("mps")

# Print shape and first few tokens to verify
print(f"Loaded token tensor shape: {inputs.shape}")
print(f"First few tokens: {inputs[0, :10]}")

Loaded token tensor shape: torch.Size([1, 724])
First few tokens: tensor([128000, 128011,   6854,    281,  56458,  11722,     30,  38203,    279,
          7422], device='mps:0')


  loaded_tokens = torch.load(pt_file_path)


In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Set device (using "mps" for Apple Silicon, "cuda" for NVIDIA, or CPU as fallback)
device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")

# Define the model name (adjust as needed)
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

# Load the model and tokenizer
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto")
model.to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def extract_all_layer_activations(model, inputs):
    """
    Extracts the activations (residual stream inputs) for each transformer layer.
    Assumes that the model has an attribute 'model.layers' which is an iterable of transformer blocks.
    
    Args:
        model: The transformer model.
        inputs: A dictionary containing model inputs (e.g., {"input_ids": tensor}).
        
    Returns:
        A dictionary mapping layer indices to their activation tensors (shape: [batch_size, seq_length, hidden_dim]).
    """
    layer_activations = {}
    hook_handles = []

    def get_activation_hook(layer_idx):
        def hook(module, layer_input, layer_output):
            # Capture the input activations (assumed to be in layer_input[0])
            layer_activations[layer_idx] = layer_input[0].detach().cpu()
        return hook

    # Register a forward hook on each transformer block
    for idx, layer in enumerate(model.model.layers):
        handle = layer.register_forward_hook(get_activation_hook(idx))
        hook_handles.append(handle)

    # Run a forward pass with no gradient computation
    with torch.no_grad():
        _ = model(**inputs)

    # Remove hooks to avoid side effects
    for handle in hook_handles:
        handle.remove()

    return layer_activations

# Load raw outputs from the .pt file containing model outputs (assumed to be token IDs)
outputs_file = "outputs_small/gsm8k_test/raw_outputs/output_0.pt"
raw_outputs = torch.load(outputs_file, map_location="cpu")

# Extract token IDs: if raw_outputs is a dict containing "input_ids", use that; otherwise, assume it's a tensor
if isinstance(raw_outputs, dict) and "input_ids" in raw_outputs:
    input_ids = raw_outputs["input_ids"]
else:
    input_ids = raw_outputs

print("Input IDs shape:", input_ids.shape)

# Decode the first sequence of token IDs to obtain the human-readable sentence
decoded_sentence = tokenizer.decode(input_ids[0], skip_special_tokens=True)
print("Decoded sentence:", decoded_sentence)

# Prepare inputs for the model (moving them to the appropriate device)
inputs = {"input_ids": input_ids.to(device)}

# Extract activations from each transformer layer using the loaded input IDs
activations = extract_all_layer_activations(model, inputs)

# Print out the activation shape for each layer (each token in the sequence represents a time point)
for layer_idx, act_tensor in activations.items():
    print(f"Layer {layer_idx} activation shape: {act_tensor.shape}")

  raw_outputs = torch.load(outputs_file, map_location="cpu")


Input IDs shape: torch.Size([1, 919])
Decoded sentence: <｜User｜>
Q: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?
A:<｜Assistant｜><think>
Okay, so I have this problem about Janet and her ducks. Let me try to figure it out step by step. 

First, the problem says that Janet’s ducks lay 16 eggs per day. Hmm, wait, no, actually, it says 16 eggs per day. Let me check: "Janet’s ducks lay 16 eggs per day." Oh, wait, that's 16 eggs per day. So, 16 eggs in total each day. Got it.

Then, she eats three eggs for breakfast every morning. So, breakfast is three eggs. And she bakes muffins for her friends every day with four eggs. So, she uses four eggs for muffins. 

So, let me break this down. She has 16 eggs in total each day. She uses some for breakfast and some for

Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


Layer 0 activation shape: torch.Size([1, 919, 1536])
Layer 1 activation shape: torch.Size([1, 919, 1536])
Layer 2 activation shape: torch.Size([1, 919, 1536])
Layer 3 activation shape: torch.Size([1, 919, 1536])
Layer 4 activation shape: torch.Size([1, 919, 1536])
Layer 5 activation shape: torch.Size([1, 919, 1536])
Layer 6 activation shape: torch.Size([1, 919, 1536])
Layer 7 activation shape: torch.Size([1, 919, 1536])
Layer 8 activation shape: torch.Size([1, 919, 1536])
Layer 9 activation shape: torch.Size([1, 919, 1536])
Layer 10 activation shape: torch.Size([1, 919, 1536])
Layer 11 activation shape: torch.Size([1, 919, 1536])
Layer 12 activation shape: torch.Size([1, 919, 1536])
Layer 13 activation shape: torch.Size([1, 919, 1536])
Layer 14 activation shape: torch.Size([1, 919, 1536])
Layer 15 activation shape: torch.Size([1, 919, 1536])
Layer 16 activation shape: torch.Size([1, 919, 1536])
Layer 17 activation shape: torch.Size([1, 919, 1536])
Layer 18 activation shape: torch.Size(

In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import os

# Set device (using "mps" for Apple Silicon, "cuda" for NVIDIA, or CPU as fallback)
device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")

# Define the model name (adjust as needed)
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

# Load the model and tokenizer
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto")
model.to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def extract_attention_heads(model, inputs):
    """
    Extracts the attention head outputs for each transformer layer.
    Assumes each layer has an attribute `self_attn` (the self-attention module).
    The self-attention module produces an output of shape [B, T, hidden_size],
    which is then reshaped into [B, T, num_heads, head_dim].
    
    Args:
        model: The transformer model.
        inputs: A dictionary containing model inputs (e.g., {"input_ids": tensor}).
        
    Returns:
        A dictionary mapping layer indices to their attention head activations.
    """
    attention_activations = {}
    hook_handles = []

    def get_attn_hook(layer_idx):
        def hook(module, layer_input, layer_output):
            # If layer_output is a tuple, extract the first element
            layer_output_tensor = layer_output[0] if isinstance(layer_output, tuple) else layer_output

            # Expected shape: [B, T, hidden_size]
            batch_size, seq_length, hidden_size = layer_output_tensor.shape
            # Use the module attribute 'num_heads' if available
            num_heads = module.num_heads if hasattr(module, "num_heads") else 1
            head_dim = hidden_size // num_heads
            # Reshape to [B, T, num_heads, head_dim]
            attn_heads = layer_output_tensor.view(batch_size, seq_length, num_heads, head_dim)
            attention_activations[layer_idx] = attn_heads.detach().cpu()
        return hook

    # Register a hook on each transformer layer's self-attention module (assumed at layer.self_attn)
    for idx, layer in enumerate(model.model.layers):
        handle = layer.self_attn.register_forward_hook(get_attn_hook(idx))
        hook_handles.append(handle)

    # Run the forward pass (no gradients needed)
    with torch.no_grad():
        _ = model(**inputs)

    # Remove hooks to avoid interference with future runs
    for handle in hook_handles:
        handle.remove()

    return attention_activations

# Load raw outputs from the .pt file containing model outputs (assumed to be token IDs)
outputs_file = "outputs_small/gsm8k_test/raw_outputs/output_0.pt"
raw_outputs = torch.load(outputs_file, map_location="cpu")

# Extract token IDs: if raw_outputs is a dict containing "input_ids", use that; otherwise assume it's a tensor
if isinstance(raw_outputs, dict) and "input_ids" in raw_outputs:
    input_ids = raw_outputs["input_ids"]
else:
    input_ids = raw_outputs

print("Input IDs shape:", input_ids.shape)

# Decode the first sequence of token IDs into a human-readable sentence
decoded_sentence = tokenizer.decode(input_ids[0], skip_special_tokens=True)
print("Decoded sentence:", decoded_sentence)

# Prepare inputs for the model (move them to the appropriate device)
inputs = {"input_ids": input_ids.to(device)}

# Extract attention head activations from each transformer layer
attention_activations = extract_attention_heads(model, inputs)
for layer_idx, attn_tensor in attention_activations.items():
    print(f"Layer {layer_idx} attention head activation shape: {attn_tensor.shape}")

# Now, extract the final hidden state and the final pre-softmax logits.
# We'll perform a forward pass with output_hidden_states=True.
with torch.no_grad():
    outputs = model(**inputs, output_hidden_states=True)
    # outputs.hidden_states is a tuple containing the embeddings plus the output of each layer.
    # The final hidden state is the output of the last transformer block.
    final_hidden_state = outputs.hidden_states[-1]
    
    # Extract logits: either via dictionary key "logits" or the first element if tuple.
    logits = outputs["logits"] if isinstance(outputs, dict) and "logits" in outputs else outputs[0]

# Move final hidden state and logits to CPU and print their shapes.
final_hidden_state = final_hidden_state.detach().cpu()
logits = logits.detach().cpu()
print("Final hidden state shape:", final_hidden_state.shape)
print("Final pre-softmax logits shape:", logits.shape)

# Create a dictionary to store all activations and logits
all_activations = {
    "attention_activations": attention_activations,
    "final_hidden_state": final_hidden_state,
    "logits": logits,
    "input_ids": input_ids
}

# Create directory if it doesn't exist
os.makedirs("saved_activations", exist_ok=True)

# Save the dictionary as a .pt file
torch.save(all_activations, "saved_activations/model_activations.pt")
print("All activations saved to saved_activations/model_activations.pt")

  raw_outputs = torch.load(outputs_file, map_location="cpu")


Input IDs shape: torch.Size([1, 919])
Decoded sentence: <｜User｜>
Q: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?
A:<｜Assistant｜><think>
Okay, so I have this problem about Janet and her ducks. Let me try to figure it out step by step. 

First, the problem says that Janet’s ducks lay 16 eggs per day. Hmm, wait, no, actually, it says 16 eggs per day. Let me check: "Janet’s ducks lay 16 eggs per day." Oh, wait, that's 16 eggs per day. So, 16 eggs in total each day. Got it.

Then, she eats three eggs for breakfast every morning. So, breakfast is three eggs. And she bakes muffins for her friends every day with four eggs. So, she uses four eggs for muffins. 

So, let me break this down. She has 16 eggs in total each day. She uses some for breakfast and some for

Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


Layer 0 attention head activation shape: torch.Size([1, 919, 12, 128])
Layer 1 attention head activation shape: torch.Size([1, 919, 12, 128])
Layer 2 attention head activation shape: torch.Size([1, 919, 12, 128])
Layer 3 attention head activation shape: torch.Size([1, 919, 12, 128])
Layer 4 attention head activation shape: torch.Size([1, 919, 12, 128])
Layer 5 attention head activation shape: torch.Size([1, 919, 12, 128])
Layer 6 attention head activation shape: torch.Size([1, 919, 12, 128])
Layer 7 attention head activation shape: torch.Size([1, 919, 12, 128])
Layer 8 attention head activation shape: torch.Size([1, 919, 12, 128])
Layer 9 attention head activation shape: torch.Size([1, 919, 12, 128])
Layer 10 attention head activation shape: torch.Size([1, 919, 12, 128])
Layer 11 attention head activation shape: torch.Size([1, 919, 12, 128])
Layer 12 attention head activation shape: torch.Size([1, 919, 12, 128])
Layer 13 attention head activation shape: torch.Size([1, 919, 12, 128])
La

In [2]:
# Load the saved activations
loaded_activations = torch.load("saved_activations/model_activations.pt")

# Print the structure of the loaded activations
print("\nLoaded activations structure:")
for key in loaded_activations:
    if key == "attention_activations":
        print(f"{key}:")
        for layer_idx, attn_tensor in loaded_activations[key].items():
            print(f"  Layer {layer_idx} shape: {attn_tensor.shape}")
    else:
        print(f"{key} shape: {loaded_activations[key].shape}")

# Access specific components if needed
attention_activations = loaded_activations["attention_activations"]
final_hidden_state = loaded_activations["final_hidden_state"]
logits = loaded_activations["logits"]
input_ids = loaded_activations["input_ids"]

print("\nSuccessfully loaded and analyzed saved activations.")



Loaded activations structure:
attention_activations:
  Layer 0 shape: torch.Size([1, 919, 12, 128])
  Layer 1 shape: torch.Size([1, 919, 12, 128])
  Layer 2 shape: torch.Size([1, 919, 12, 128])
  Layer 3 shape: torch.Size([1, 919, 12, 128])
  Layer 4 shape: torch.Size([1, 919, 12, 128])
  Layer 5 shape: torch.Size([1, 919, 12, 128])
  Layer 6 shape: torch.Size([1, 919, 12, 128])
  Layer 7 shape: torch.Size([1, 919, 12, 128])
  Layer 8 shape: torch.Size([1, 919, 12, 128])
  Layer 9 shape: torch.Size([1, 919, 12, 128])
  Layer 10 shape: torch.Size([1, 919, 12, 128])
  Layer 11 shape: torch.Size([1, 919, 12, 128])
  Layer 12 shape: torch.Size([1, 919, 12, 128])
  Layer 13 shape: torch.Size([1, 919, 12, 128])
  Layer 14 shape: torch.Size([1, 919, 12, 128])
  Layer 15 shape: torch.Size([1, 919, 12, 128])
  Layer 16 shape: torch.Size([1, 919, 12, 128])
  Layer 17 shape: torch.Size([1, 919, 12, 128])
  Layer 18 shape: torch.Size([1, 919, 12, 128])
  Layer 19 shape: torch.Size([1, 919, 12, 12

  loaded_activations = torch.load("saved_activations/model_activations.pt")
