# Imports and setup

In [12]:
def save_tensor(tensor, filename):
    """Save the values of a 0-D to 4-D tensor to a text file.

    The file is overwritten. Layout depends on the tensor rank:

    * 0-D: the single value on one line.
    * 1-D: one value per line.
    * 2-D: one row per line, values separated by a single space.
    * 3-D: for each matrix a ``Matrix i:`` header, its rows, then a blank line.
    * 4-D: for each batch a ``Batch b:`` header; for each head inside it a
      ``Head h:`` header, its rows, then a blank line.

    Args:
        tensor: Object exposing ``.shape`` and ``.tolist()`` (e.g. a
            ``torch.Tensor``).
        filename: Path of the text file to write.

    Raises:
        ValueError: If the tensor has more than 4 dimensions. The check runs
            before the file is opened, so no empty file is left behind.
    """
    rank = len(tensor.shape)
    if rank > 4:
        raise ValueError(
            f"save_tensor supports tensors with at most 4 dimensions, got {rank}"
        )

    def write_rows(f, rows):
        # One line per row, values separated by single spaces.
        for row in rows:
            f.write(' '.join(f"{x}" for x in row) + '\n')

    # Convert once to nested Python lists instead of calling .item() per element.
    data = tensor.tolist()

    with open(filename, 'w') as f:
        if rank == 0:
            # Scalar tensor: tolist() returns the bare Python number.
            f.write(f"{data}\n")
        elif rank == 1:
            for value in data:
                f.write(f"{value}\n")
        elif rank == 2:
            write_rows(f, data)
        elif rank == 3:
            for i, matrix in enumerate(data):
                f.write(f"Matrix {i}:\n")
                write_rows(f, matrix)
                f.write('\n')
        else:  # rank == 4
            for b, batch in enumerate(data):
                f.write(f"Batch {b}:\n")
                for h, head in enumerate(batch):
                    f.write(f"Head {h}:\n")
                    write_rows(f, head)
                    f.write('\n')

In [13]:

import math

import pandas as pd
import torch
from safetensors import safe_open
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.utils import cached_file

# Model and tokenizer, both resolved from the Hugging Face hub by name
model_name = "arnir0/Tiny-LLM"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Prompt that the rest of the notebook runs through the network
INPUT_STRING = "Hello from the"
print(f"Input string: {INPUT_STRING}")

# Locate the checkpoint through the Hugging Face cache instead of a hard-coded
# per-user absolute path, so the notebook runs on any machine (the file is
# downloaded on first use if it is not cached yet).
tensors_path = cached_file(model_name, "model.safetensors")

# Read the raw weight tensors needed for the manual forward pass. The context
# manager closes the file handle once everything has been loaded.
with safe_open(tensors_path, framework="pt") as tensors:
    # Embedding weights
    W_embed = tensors.get_tensor("model.embed_tokens.weight")

    # Attention weights (decoder layer 0)
    W_q = tensors.get_tensor("model.layers.0.self_attn.q_proj.weight")
    W_k = tensors.get_tensor("model.layers.0.self_attn.k_proj.weight")
    W_v = tensors.get_tensor("model.layers.0.self_attn.v_proj.weight")
    W_o = tensors.get_tensor("model.layers.0.self_attn.o_proj.weight")

    # Layer Norm weights
    W_ln_in = tensors.get_tensor("model.layers.0.input_layernorm.weight")
    W_ln_post = tensors.get_tensor("model.layers.0.post_attention_layernorm.weight")
    W_ln_final = tensors.get_tensor("model.norm.weight")

    # MLP weights (decoder layer 0)
    W_mlp_up = tensors.get_tensor("model.layers.0.mlp.up_proj.weight")
    W_mlp_gate = tensors.get_tensor("model.layers.0.mlp.gate_proj.weight")
    W_mlp_down = tensors.get_tensor("model.layers.0.mlp.down_proj.weight")

    # Output head
    W_lm_head = tensors.get_tensor("lm_head.weight")




Input string: Hello from the


# Retrieve embeddings


In [14]:
# Tokenize the prompt; input_ids has shape [batch_size, seq_len]
inputs = tokenizer(INPUT_STRING, return_tensors="pt")
batch_size, seq_len = inputs.input_ids.shape[0], inputs.input_ids.shape[1]

# Look up one embedding row per token id in a single indexing operation,
# giving [batch_size, seq_len, hidden_size], then cast to float16 to match
# the model weights.
embeddings = W_embed[inputs.input_ids].to(torch.float16)


print(f"Input IDs shape: {inputs.input_ids.shape}")
print(f"Embeddings shape: {embeddings.shape}")
print(f"Model's embedding size: {model.config.hidden_size}")

Input IDs shape: torch.Size([1, 4])
Embeddings shape: torch.Size([1, 4, 192])
Model's embedding size: 192


# Input embeddings layer norm

In [15]:

# Input RMSNorm, applied directly to the token embeddings
# (no residual is added at this point):
#   RMSNorm(x) = x / RMS(x) * γ
#   RMS(x)     = √(1/n Σ x_i² + ε)
#   γ          = learned per-feature scale (W_ln_in)
#   ε          = model.config.rms_norm_eps

# Mean of squares over the feature (last) dimension, kept for broadcasting
mean_square = embeddings.pow(2).mean(dim=-1, keepdim=True)
rms = (mean_square + model.config.rms_norm_eps).sqrt()
normalized = embeddings / rms
post_ln_embeddings = normalized * W_ln_in
print(f"Post layer norm embeddings shape: {post_ln_embeddings.shape}")

print(post_ln_embeddings)

Post layer norm embeddings shape: torch.Size([1, 4, 192])
tensor([[[-1.8030e-01, -2.7783e-01, -1.2781e-01,  1.1401e-01, -1.0205e-01,
          -2.7441e-01,  3.4106e-01, -9.2896e-02,  3.6060e-01,  2.7417e-01,
           1.0803e-02,  1.7957e-01,  8.5205e-02,  2.7710e-01, -2.1936e-01,
          -2.3108e-01,  2.3132e-01, -7.1106e-02,  9.6802e-02, -2.9150e-01,
          -3.5913e-01,  7.3193e-01, -1.6406e-01, -1.1505e-01, -7.8430e-02,
           2.2791e-01, -4.5996e-01,  1.1304e-01,  7.9773e-02,  9.4421e-02,
           5.2368e-02, -4.6484e-01,  1.9928e-02, -2.2327e-01,  3.3765e-01,
          -3.1543e-01, -3.6841e-01,  5.4004e-01, -8.9783e-02,  5.6543e-01,
          -6.7993e-02,  2.2278e-01,  6.0455e-02,  2.1533e-01, -9.0088e-02,
          -1.9638e-02,  8.8440e-02, -5.8740e-01,  3.6035e-01, -2.0593e-01,
          -4.3604e-01, -2.2327e-01, -1.6321e-01, -1.3831e-01,  4.8126e-02,
          -1.4319e-01,  2.8296e-01,  6.5088e-01, -2.1399e-01, -4.3396e-02,
           1.6785e-01, -3.0347e-01,  4.243

# Prepare attention inputs

In [16]:
num_Q_heads = model.config.num_attention_heads
num_KV_heads = model.config.num_key_value_heads
head_dim = model.config.hidden_size // num_Q_heads  # Size of each attention head

print(f"Number of attention heads: {num_Q_heads}")
print(f"Attention heads dimension: {head_dim}")


# Calculate Q, K, V (weights are stored [out_features, in_features], hence .T)
Q = post_ln_embeddings @ W_q.T
K = post_ln_embeddings @ W_k.T
V = post_ln_embeddings @ W_v.T


# Split the projected feature dimension into heads:
# [batch, seq_len, n_heads, features_per_head]
Q = Q.view(batch_size, seq_len, num_Q_heads, Q.shape[-1] // num_Q_heads)
K = K.view(batch_size, seq_len, num_KV_heads, K.shape[-1] // num_KV_heads)
V = V.view(batch_size, seq_len, num_KV_heads, V.shape[-1] // num_KV_heads)

# Move the heads dimension in front of the sequence dimension
Q = Q.transpose(1, 2)  # [batch, num_Q_heads, seq_len, head_dim]
K = K.transpose(1, 2)  # [batch, num_KV_heads, seq_len, head_dim]
V = V.transpose(1, 2)  # [batch, num_KV_heads, seq_len, head_dim]

# Grouped-query attention: each K/V head is shared by a group of
# num_Q_heads // num_KV_heads query heads. Repeating by the group size (not by
# num_Q_heads) yields exactly num_Q_heads K/V heads for any head configuration,
# and repeat_interleave keeps the copies of one K/V head adjacent so that they
# line up with the query heads of the same group.
kv_group_size = num_Q_heads // num_KV_heads
K = K.repeat_interleave(kv_group_size, dim=1)  # [batch, num_Q_heads, seq_len, head_dim]
V = V.repeat_interleave(kv_group_size, dim=1)  # [batch, num_Q_heads, seq_len, head_dim]

print(f"Q shape: {Q.shape}")
print(f"K shape: {K.shape}") 
print(f"V shape: {V.shape}")

Number of attention heads: 2
Attention heads dimension: 96
Q shape: torch.Size([1, 2, 4, 96])
K shape: torch.Size([1, 2, 4, 96])
V shape: torch.Size([1, 2, 4, 96])


# Apply Rotary Positional Embedding (RoPE)





In [17]:
# NOTE(review): this cell overwrites Q and K with their rotated versions, so
# running it twice without re-running the previous cell applies the rotation twice.

# 1. Get dimensions from the Q tensor ([batch, heads, seq_len, head_dim])
seq_len = Q.shape[2]      # sequence length
head_dim = Q.shape[-1]    # dimension per head
# Base for the frequency calculation: read it from the model config and fall
# back to the conventional 10000.0 when the config does not provide it.
theta = getattr(model.config, "rope_theta", None)
if theta is None:
    theta = 10000.0
theta = float(theta)
# 2. Create position indices: [0, 1, ..., seq_len - 1]
position = torch.arange(seq_len).to(torch.float16)
print(f"Position indices shape: {position.shape}, values: {position}")

# 3. Create dimension indices: [0, 2, 4, ..., head_dim - 2]
dim_indices = torch.arange(0, head_dim, 2).to(torch.float16)
print(f"Dimension indices shape: {dim_indices.shape}, first few values: {dim_indices[:5]}")

# 4. Calculate frequencies
# For each even dimension index d: freq = 1 / (theta^(d / head_dim))
# This gives the largest frequency (1.0) to the first pair and progressively
# smaller frequencies to later pairs.
freqs = 1.0 / (theta ** (dim_indices.float() / head_dim)).to(torch.float16)
print(f"Frequencies shape: {freqs.shape}, first few values: {freqs[:5]}")

# 5. Compute angles for each position-frequency pair
# This is an outer product: each position multiplied by each frequency
angles = torch.outer(position, freqs)  # Shape: [seq_len, head_dim/2]
print(f"Angles shape: {angles.shape}, sample values:\n{angles[:2, :2]}")

# 6. Calculate cos and sin of these angles
cos = torch.cos(angles).to(torch.float16)
sin = torch.sin(angles).to(torch.float16)
print(f"Cos/Sin shapes: {cos.shape}, sample cos values:\n{cos[:2, :2]}")

# 7. Reshape cos/sin for broadcasting with Q/K
# Add two leading dimensions for batch and heads
cos = cos.view(1, 1, cos.shape[0], cos.shape[1])
sin = sin.view(1, 1, sin.shape[0], sin.shape[1])
print(f"Broadcast-ready cos shape: {cos.shape}")

# 8. Split Q and K into even/odd dimensions
# Reshape to separate the last dimension into adjacent pairs
# NOTE(review): this rotates adjacent (even, odd) pairs. Hugging Face's Llama
# implementation pairs dimension i with i + head_dim/2 ("rotate_half") --
# confirm which convention these checkpoint weights expect.
Q_split = Q.reshape(*Q.shape[:-1], -1, 2)
K_split = K.reshape(*K.shape[:-1], -1, 2)
print(f"Split Q shape: {Q_split.shape}")

# Separate even and odd dimensions
Q_even = Q_split[..., 0]  # Take first of each pair
Q_odd = Q_split[..., 1]   # Take second of each pair
K_even = K_split[..., 0]
K_odd = K_split[..., 1]
print(f"Q even/odd shapes: {Q_even.shape}")

# 9. Apply rotation matrix multiplication
# [cos -sin] [x_even] = [x_even*cos - x_odd*sin]
# [sin  cos] [x_odd ] = [x_even*sin + x_odd*cos]
Q_rotated_even = Q_even * cos - Q_odd * sin
Q_rotated_odd = Q_even * sin + Q_odd * cos
K_rotated_even = K_even * cos - K_odd * sin
K_rotated_odd = K_even * sin + K_odd * cos

# 10. Recombine even/odd dimensions (stacking on a new last axis and reshaping
# re-interleaves them in their original order)
Q = torch.stack([Q_rotated_even, Q_rotated_odd], dim=-1).reshape(Q.shape)
K = torch.stack([K_rotated_even, K_rotated_odd], dim=-1).reshape(K.shape)
print(f"Final Q_rope shape: {Q.shape}")

Position indices shape: torch.Size([4]), values: tensor([0., 1., 2., 3.], dtype=torch.float16)
Dimension indices shape: torch.Size([48]), first few values: tensor([0., 2., 4., 6., 8.], dtype=torch.float16)
Frequencies shape: torch.Size([48]), first few values: tensor([1.0000, 0.8252, 0.6812, 0.5625, 0.4641], dtype=torch.float16)
Angles shape: torch.Size([4, 48]), sample values:
tensor([[0.0000, 0.0000],
        [1.0000, 0.8252]], dtype=torch.float16)
Cos/Sin shapes: torch.Size([4, 48]), sample cos values:
tensor([[1.0000, 1.0000],
        [0.5405, 0.6782]], dtype=torch.float16)
Broadcast-ready cos shape: torch.Size([1, 1, 4, 48])
Split Q shape: torch.Size([1, 2, 4, 48, 2])
Q even/odd shapes: torch.Size([1, 2, 4, 48])
Final Q_rope shape: torch.Size([1, 2, 4, 96])


# Calculate self attention


In [18]:
# Self attention equation:
# Attention(Q,K,V) = softmax(QK^T/√d_k)V
# where:
# Q = Query matrix
# K = Key matrix 
# V = Value matrix
# d_k = dimension of key vectors
# √d_k = scaling factor to prevent softmax from having extremely small gradients

print(f"Q shape: {Q.shape}")
print(f"K shape: {K.shape}") 
print(f"V shape: {V.shape}")

# Dump the (rotated) queries to a text file for offline comparison
save_tensor(Q, "q_nb.txt")


# Calculate scale factor using head_dim (not hidden_size)
scale_factor = 1 / math.sqrt(head_dim)
scale_factor = torch.tensor(scale_factor).to(torch.float16)

# Calculate attention scores
# K.transpose(-2, -1) is [batch, heads, head_dim, seq_len]
scores = Q @ K.transpose(-2, -1)  # [batch, heads, seq_len, seq_len]
scores = scores * scale_factor

# Causal mask: entries strictly above the diagonal (key position > query
# position) are set to -inf so a token cannot attend to later tokens.
causal_mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).bool()
scores = scores.masked_fill(causal_mask, float('-inf')).to(torch.float16)

# Apply softmax over the key dimension.
# Subtract each row's maximum before exponentiating: the result is
# mathematically unchanged, but exp() can no longer overflow in float16
# (exp(x) exceeds the float16 range for x > ~11, which would yield inf/inf = NaN).
# The diagonal is never masked, so every row maximum is finite.
row_max = scores.amax(dim=-1, keepdim=True)
exp_scores = torch.exp(scores - row_max)
exp_sum = exp_scores.sum(dim=-1, keepdim=True)
Attention = exp_scores / exp_sum
print(f"Attention shape: {Attention.shape}")

# Multiply attention with V
attn_output = Attention @ V
print(f"output shape: {attn_output.shape}")

# Move heads back next to the per-head features
attn_output = attn_output.transpose(1, 2)  # [batch, seq_len, heads, head_dim]
print(f"output shape after transpose: {attn_output.shape}")

# Merge heads back together and apply the output projection
attn_output = attn_output.contiguous().view(batch_size, seq_len, model.config.hidden_size) 
attn_output = attn_output @ W_o.T
print(f"Final output shape: {attn_output.shape}")

Q shape: torch.Size([1, 2, 4, 96])
K shape: torch.Size([1, 2, 4, 96])
V shape: torch.Size([1, 2, 4, 96])
Attention shape: torch.Size([1, 2, 4, 4])
output shape: torch.Size([1, 2, 4, 96])
output shape after transpose: torch.Size([1, 4, 2, 96])
Final output shape: torch.Size([1, 4, 192])


# Post-attention layer norm



In [19]:
# Residual connection: add the attention output back onto the embeddings
post_attn_residual = embeddings + attn_output

# Post-attention RMSNorm:
#   RMSNorm(x) = x / RMS(x) * γ
#   RMS(x)     = √(1/n Σ x_i² + ε)
#   γ          = learned per-feature scale (W_ln_post)
#   ε          = model.config.rms_norm_eps

# Mean of squares over the feature (last) dimension, kept for broadcasting
mean_square = post_attn_residual.pow(2).mean(dim=-1, keepdim=True)
rms = (mean_square + model.config.rms_norm_eps).sqrt()
normalized = post_attn_residual / rms
post_ln_attn_output = normalized * W_ln_post

print(f"Post layer norm output shape: {post_ln_attn_output.shape}")


Post layer norm output shape: torch.Size([1, 4, 192])


# Multi-layer perceptron



In [20]:
# Gated MLP: down_proj( SiLU(gate_proj(x)) * up_proj(x) )

# Gate and up projections of the normalized attention output
gate_proj = torch.matmul(post_ln_attn_output, W_mlp_gate.T)
print(f"gate_proj shape: {gate_proj.shape}")
up_proj = torch.matmul(post_ln_attn_output, W_mlp_up.T)
print(f"up_proj shape: {up_proj.shape}")

# SwiGLU: SiLU(gate) = gate * sigmoid(gate), used to gate the up projection
silu_gated = gate_proj.sigmoid() * gate_proj
print(f"silu_gated shape: {silu_gated.shape}")
output = up_proj * silu_gated
print(f"output shape: {output.shape}")

# Down projection back to the hidden size
mlp_output = torch.matmul(output, W_mlp_down.T)
print(f"final shape: {mlp_output.shape}")



gate_proj shape: torch.Size([1, 4, 1024])
up_proj shape: torch.Size([1, 4, 1024])
silu_gated shape: torch.Size([1, 4, 1024])
output shape: torch.Size([1, 4, 1024])
final shape: torch.Size([1, 4, 192])


# Post-MLP layer norm

In [21]:
# Second residual connection followed by the final RMSNorm:
#   RMSNorm(x) = x / RMS(x) * γ
#   RMS(x)     = √(1/n Σ x_i² + ε)
#   γ          = learned per-feature scale (W_ln_final, i.e. model.norm.weight)
#   ε          = model.config.rms_norm_eps
# NOTE(review): applying model.norm right after layer 0 assumes the model has
# a single decoder layer -- confirm against model.config.num_hidden_layers.

# Residual connection: add the MLP output onto the post-attention residual
post_mlp_residual = post_attn_residual + mlp_output

# Mean of squares over the feature (last) dimension, kept for broadcasting
mean_square = post_mlp_residual.pow(2).mean(dim=-1, keepdim=True)
rms = (mean_square + model.config.rms_norm_eps).sqrt()
normalized = post_mlp_residual / rms
post_ln_mlp_output = normalized * W_ln_final

print(f"Post layer norm output shape: {post_ln_mlp_output.shape}")

Post layer norm output shape: torch.Size([1, 4, 192])


# Retrieve next predicted word

In [22]:
# Calculate logits: project the normalized hidden states onto the vocabulary
logits = post_ln_mlp_output @ W_lm_head.T  # [batch, seq_len, vocab_size]

# Get logits for last position (the prediction for the next token)
last_logits = logits[:, -1, :]  # [batch, vocab_size]

# Apply temperature scaling
temperature = 1.0  # Start with 1.0 for debugging
last_logits = last_logits / temperature

# Get top k logits and their indices
top_k = 5
values, indices = torch.topk(last_logits, top_k)

# Apply softmax only to top k.
# Subtract the largest logit before exponentiating: the probabilities are
# mathematically unchanged, but exp() can no longer overflow in float16
# (exp(x) exceeds the float16 range for x > ~11, which would yield inf/inf = NaN).
shifted_values = values - values.amax(dim=-1, keepdim=True)
exp_values = torch.exp(shifted_values)
sum_exp_values = torch.sum(exp_values, dim=-1, keepdim=True)
top_probs = exp_values / sum_exp_values

# Greedy pick: take the highest-scoring candidate of the first batch element
# (no random sampling yet, to keep debugging deterministic)
next_token_idx = indices[0, 0]
next_token = next_token_idx.item()  # Convert to integer

# Decode token to text
predicted_word = tokenizer.decode([next_token])

print(f"Input text: {INPUT_STRING}")
print(f"Next token: {predicted_word}")

# For debugging, show the top k predictions with their renormalized probabilities
for i in range(top_k):
    token_id = indices[0, i].item()
    prob = top_probs[0, i].item()
    token_text = tokenizer.decode([token_id])
    print(f"Top {i+1}: '{token_text}' (prob: {prob:.3f})")

Input text: Hello from the
Next token: name
Top 1: 'name' (prob: 0.306)
Top 2: 'old' (prob: 0.279)
Top 3: 'beginning' (prob: 0.148)
Top 4: 'world' (prob: 0.135)
Top 5: 'new' (prob: 0.132)
