In [1]:
import torch
from transformers import AutoModel, AutoTokenizer

In [2]:
# Load model and tokenizer
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [3]:
model = AutoModel.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
model.embed_tokens.weight.data

tensor([[ 5.6152e-03, -2.1606e-02, -8.9111e-03,  ..., -6.9580e-03,
          3.6621e-02, -1.3245e-02],
        [ 4.4922e-02,  4.7363e-02,  1.6602e-02,  ...,  7.2021e-03,
          2.9564e-04, -1.6235e-02],
        [-1.6724e-02, -6.4392e-03,  1.4400e-04,  ...,  1.0824e-04,
         -2.1973e-02,  2.3804e-03],
        ...,
        [-1.1755e-37,  1.1755e-37,  1.1755e-37,  ..., -1.1755e-37,
          1.1755e-37,  1.1755e-37],
        [ 1.1755e-37, -1.1755e-37,  1.1755e-37,  ..., -1.1755e-37,
          1.1755e-37, -1.1755e-37],
        [ 1.1755e-37, -1.1755e-37, -1.1755e-37,  ...,  1.1755e-37,
          1.1755e-37, -1.1755e-37]])

In [None]:
model = model.to("cuda")

In [3]:
text = "Hello world"
inputs = tokenizer(text, return_tensors="pt")
inputs = inputs["input_ids"].to("cuda")

In [None]:
inputs

tensor([[151646,   9707,   1879]], device='cuda:0')

In [None]:
for i in range(28):
    print(model.layers[i])

: 

In [None]:

# Dictionary to store activations
activations = {}

def get_activation(name):
    def hook(model, input, output):
        #print(name, output, type(output))
        if("layer" in name):
            activations[name] = output[0].detach()  # For transformer layers, output is a tuple (last_hidden_state, past_key_values)
        else:
            activations[name] = output.detach()
    return hook

# Register hooks for all transformer layers
# DeepSeek-R1-Distill-Qwen-7B has 32 layers (layers 0-31)
for i in range(28):
    model.layers[i].register_forward_hook(get_activation(f'layer_{i}'))

"""
# You can also register hooks for specific components within layers
# For example, attention and MLP outputs:
for i in range(28):
    model.layers[i].self_attn.register_forward_hook(get_activation(f'layer_{i}_attention'))
    model.layers[i].mlp.register_forward_hook(get_activation(f'layer_{i}_mlp'))
"""
# Register hook for embeddings
model.embed_tokens.register_forward_hook(get_activation('embeddings'))

# Forward pass
outputs = model(input_ids=inputs)

# Access activations
print(f"Embeddings shape: {activations['embeddings'].shape}")
print(f"Layer 0 shape: {activations['layer_0'][0].shape}")  # Note: output is tuple, take first element
print(f"Layer 15 shape: {activations['layer_15'][0].shape}")
print(f"Layer 27 shape: {activations['layer_27'][0].shape}")

# Print all available activation keys
print("\nAll captured activations:")
for key in activations.keys():
    if isinstance(activations[key], tuple):
        print(f"{key}: {activations[key][0].shape}")
    else:
        print(f"{key}: {activations[key].shape}")

Embeddings shape: torch.Size([1, 3, 3584])
Layer 0 shape: torch.Size([3, 3584])
Layer 15 shape: torch.Size([3, 3584])
Layer 27 shape: torch.Size([3, 3584])

All captured activations:
embeddings: torch.Size([1, 3, 3584])
layer_0: torch.Size([1, 3, 3584])
layer_1: torch.Size([1, 3, 3584])
layer_2: torch.Size([1, 3, 3584])
layer_3: torch.Size([1, 3, 3584])
layer_4: torch.Size([1, 3, 3584])
layer_5: torch.Size([1, 3, 3584])
layer_6: torch.Size([1, 3, 3584])
layer_7: torch.Size([1, 3, 3584])
layer_8: torch.Size([1, 3, 3584])
layer_9: torch.Size([1, 3, 3584])
layer_10: torch.Size([1, 3, 3584])
layer_11: torch.Size([1, 3, 3584])
layer_12: torch.Size([1, 3, 3584])
layer_13: torch.Size([1, 3, 3584])
layer_14: torch.Size([1, 3, 3584])
layer_15: torch.Size([1, 3, 3584])
layer_16: torch.Size([1, 3, 3584])
layer_17: torch.Size([1, 3, 3584])
layer_18: torch.Size([1, 3, 3584])
layer_19: torch.Size([1, 3, 3584])
layer_20: torch.Size([1, 3, 3584])
layer_21: torch.Size([1, 3, 3584])
layer_22: torch.Size

In [None]:
print(activations.keys())

dict_keys(['embeddings', 'layer_0', 'layer_1', 'layer_2', 'layer_3', 'layer_4', 'layer_5', 'layer_6', 'layer_7', 'layer_8', 'layer_9', 'layer_10', 'layer_11', 'layer_12', 'layer_13', 'layer_14', 'layer_15', 'layer_16', 'layer_17', 'layer_18', 'layer_19', 'layer_20', 'layer_21', 'layer_22', 'layer_23', 'layer_24', 'layer_25', 'layer_26', 'layer_27'])


In [None]:
print("Embeddings:", activations['embeddings'], activations['embeddings'].shape)

Embeddings: tensor([[[-0.0006,  0.0005, -0.0049,  ..., -0.0001, -0.0004, -0.0010],
         [-0.0060, -0.0042,  0.0084,  ...,  0.0510,  0.0008, -0.0086],
         [-0.0225, -0.0293,  0.0107,  ..., -0.0100,  0.0104, -0.0571]]],
       device='cuda:0')


In [9]:
print("Layer 0:", activations['layer_0'])

Layer 0: tensor([[[-1.6078,  1.7398,  0.9056,  ...,  2.0966, -0.3973, -0.7354],
         [-0.4516,  1.4271,  0.0635,  ...,  0.9967,  0.3587, -1.0365],
         [ 0.3759,  0.4770,  0.5341,  ...,  0.9871,  0.5478,  0.1407]]],
       device='cuda:0')


In [1]:
import itertools
import sys
import time
from pathlib import Path
from typing import Optional, Tuple, Union

import torch
import torch._dynamo.config
import torch._inductor.config
from torch.nn.attention.flex_attention import BlockMask, create_block_mask

from torch.nn import functional as F

In [2]:
from model import Transformer
from tokenizer import get_tokenizer

In [3]:
checkpoint_path = Path("checkpoints/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/model.pth")
tokenizer_path = checkpoint_path.parent / "tokenizer.json"

In [4]:
tokenizer = get_tokenizer(tokenizer_path, checkpoint_path)

In [5]:
def _load_model(checkpoint_path, device, precision, use_tp):
    use_cuda = 'cuda' in device
    with torch.device('meta'):
        model = Transformer.from_name(checkpoint_path.parent.name)

    if "int8" in str(checkpoint_path):
        print("Using int8 weight-only quantization!")
        from quantize import WeightOnlyInt8QuantHandler
        simple_quantizer = WeightOnlyInt8QuantHandler(model)
        model = simple_quantizer.convert_for_runtime()

    if "int4" in str(checkpoint_path):
        print("Using int4 weight-only quantization!")
        path_comps = checkpoint_path.name.split(".")
        groupsize = int(path_comps[-2][1:])
        from quantize import WeightOnlyInt4QuantHandler
        simple_quantizer = WeightOnlyInt4QuantHandler(model, groupsize)
        model = simple_quantizer.convert_for_runtime()

    checkpoint = torch.load(str(checkpoint_path), mmap=True, weights_only=True)
    if "model" in checkpoint and "stories" in str(checkpoint_path):
        checkpoint = checkpoint["model"]
    model.load_state_dict(checkpoint, assign=True)

    if use_tp:
        from tp import apply_tp
        print("Applying tensor parallel to model ...")
        apply_tp(model)

    model = model.to(device=device, dtype=precision)
    return model.eval()

device = "cuda"
precision = torch.bfloat16
use_tp = False

model = _load_model(checkpoint_path, device, precision, use_tp)
with torch.device(device):
    model.setup_caches(max_batch_size=1, max_seq_length=212)
model.causal_mask.device

device(type='cuda', index=0)

In [6]:
model.tok_embeddings.weight.data

tensor([[ 5.6152e-03, -2.1606e-02, -8.9111e-03,  ..., -6.9580e-03,
          3.6621e-02, -1.3245e-02],
        [ 4.4922e-02,  4.7363e-02,  1.6602e-02,  ...,  7.2021e-03,
          2.9564e-04, -1.6235e-02],
        [-1.6724e-02, -6.4392e-03,  1.4400e-04,  ...,  1.0824e-04,
         -2.1973e-02,  2.3804e-03],
        ...,
        [-1.1755e-37,  1.1755e-37,  1.1755e-37,  ..., -1.1755e-37,
          1.1755e-37,  1.1755e-37],
        [ 1.1755e-37, -1.1755e-37,  1.1755e-37,  ..., -1.1755e-37,
          1.1755e-37, -1.1755e-37],
        [ 1.1755e-37, -1.1755e-37, -1.1755e-37,  ...,  1.1755e-37,
          1.1755e-37, -1.1755e-37]], device='cuda:0', dtype=torch.bfloat16)

In [7]:
def encode_tokens(tokenizer, string, bos=True, device="cuda"):
    tokens = tokenizer.encode(string)
    if bos:
        tokens = [tokenizer.bos_id()] + tokens
    return torch.tensor(tokens, dtype=torch.int, device=device)
encoded = encode_tokens(tokenizer, "Hello World", bos=True, device=device)

In [8]:
prompt = encoded.view(1, -1).repeat(1, 1)

In [9]:
print(prompt)

tensor([[151646,   9707,   4337]], device='cuda:0', dtype=torch.int32)


In [10]:
prompt_length = prompt.size(-1)

In [12]:
input_pos = torch.arange(0, prompt_length, device=device)

In [13]:
input_pos.shape

torch.Size([3])

In [16]:
activations = {}

def get_activation(name):
    def hook(model, input, output):
        if isinstance(output, tuple):
            activations[name] = output[0].detach()
        else:
            activations[name] = output.detach()
    return hook

model.tok_embeddings.register_forward_hook(get_activation('embeddings'))
for i, block in enumerate(model.layers):
    block.register_forward_hook(get_activation(f'layer_{i}_output'))

In [17]:

def multinomial_sample_one_no_sync(probs_sort): # Does multinomial sampling without a cuda synchronization
    q = torch.empty_like(probs_sort).exponential_(1)
    return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int)

def logits_to_probs(logits, temperature: float = 1.0, top_k: Optional[int] = None):
    logits = logits / max(temperature, 1e-5)

    if top_k is not None:
        v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
        pivot = v.select(-1, -1).unsqueeze(-1)
        logits = torch.where(logits < pivot, -float("Inf"), logits)
    probs = torch.nn.functional.softmax(logits, dim=-1)
    return probs
def sample(logits, temperature: float = 1.0, top_k: Optional[int] = None):
    probs = logits_to_probs(logits[:, -1], temperature, top_k)
    idx_next = multinomial_sample_one_no_sync(probs)
    return idx_next, probs
def prefill(model: Transformer, x: torch.Tensor, input_pos: torch.Tensor, **sampling_kwargs) -> torch.Tensor:
    # input_pos: [B, S]
    logits = model(x, input_pos)
    return sample(logits, **sampling_kwargs)[0]
print("shapes", prompt.shape, input_pos.shape)
next_token = prefill(model, prompt, input_pos, temperature=0, top_k=1).clone()

shapes torch.Size([1, 3]) torch.Size([3])


In [18]:
next_token.shape

torch.Size([1, 1])

In [56]:
input_pos = torch.tensor([prompt_length], device=device, dtype=torch.int)

In [59]:
logits = model(next_token, input_pos)

In [41]:
next_token, next_prob = sample(logits, temperature=0, top_k=1)

In [42]:
next_token.shape

torch.Size([1, 1])

In [44]:
print(tokenizer.decode(next_token[0].cpu().numpy()))

2


In [19]:
activations

{'embeddings': tensor([[[-0.0006,  0.0005, -0.0049,  ..., -0.0001, -0.0004, -0.0010],
          [-0.0060, -0.0042,  0.0084,  ...,  0.0510,  0.0008, -0.0086],
          [ 0.0299,  0.0116,  0.0133,  ..., -0.0177, -0.0156, -0.0669]]],
        device='cuda:0', dtype=torch.bfloat16),
 'layer_0_output': tensor([[[-1.3750,  1.4297,  0.7773,  ...,  1.8672, -0.3555, -0.6875],
          [-0.8516,  1.2031,  0.6094,  ...,  1.3359,  0.3320, -0.8398],
          [-0.1143,  0.3535,  0.2148,  ...,  1.0625,  0.6680, -0.4160]]],
        device='cuda:0', dtype=torch.bfloat16),
 'layer_1_output': tensor([[[-1.1641,  1.1641,  0.3398,  ...,  1.5859,  0.0703, -0.2891],
          [-0.6992,  0.6523,  0.2461,  ...,  1.0781,  0.6641, -0.3906],
          [ 0.1206,  0.0645, -0.1699,  ...,  0.5703,  0.9258, -0.0337]]],
        device='cuda:0', dtype=torch.bfloat16),
 'layer_2_output': tensor([[[-1.4375,  1.0938, -0.6953,  ...,  2.4062, -0.7148, -0.8594],
          [-0.8047,  0.6797, -0.6016,  ...,  1.4062,  0.0156, 