<a href="https://colab.research.google.com/github/amanzoni1/MoE-Burst-Upcycling/blob/main/HELLoRA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# 1. Uninstall the conflicting library
!pip uninstall -y torchvision

# 2. Force install the bleeding-edge Transformers (fixes the missing OLMoE class)
!pip install -U git+https://github.com/huggingface/transformers

# 3. Install core dependencies
!pip install -U torch accelerate peft datasets

Found existing installation: torchvision 0.24.0+cu126
Uninstalling torchvision-0.24.0+cu126:
  Successfully uninstalled torchvision-0.24.0+cu126
Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-g6ifbyat
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-g6ifbyat
  Resolved https://github.com/huggingface/transformers to commit a30413b78feed68da5c486746f745db092bfdf9a
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [1]:
import torch
from transformers import OlmoeForCausalLM, AutoTokenizer

model_name = "allenai/OLMoE-1B-7B-0924"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = OlmoeForCausalLM.from_pretrained(
    model_name,
    dtype=torch.bfloat16,
    device_map="auto",
)

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/179 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/120 [00:00<?, ?B/s]

In [2]:
print(f"Model Memory Footprint: {model.get_memory_footprint() / 1e9:.2f} GB")

Model Memory Footprint: 13.84 GB


In [8]:
print(torch.cuda.memory_allocated() / 1e9, "GB allocated")
print(torch.cuda.memory_reserved() / 1e9, "GB reserved")

13.838324736 GB allocated
15.453913088 GB reserved


In [9]:
# Quick sanity checks
print("CUDA available:", torch.cuda.is_available())
print("Device:", model.device)
print("VRAM used (GB):", torch.cuda.memory_allocated() / 1e9 if torch.cuda.is_available() else "N/A")

CUDA available: True
Device: cuda:0
VRAM used (GB): 13.838324736


In [6]:
print("\n=== üîç MODEL ARCHITECTURE DIG ===")
print(f"Model Class: {type(model).__name__}")

# We only care about the decoder layers where the MoE lives
# Usually found in model.model.layers or model.layers
layers = model.model.layers

print(f"Total Layers: {len(layers)}")

# Let's inspect Layer 0 specifically to find the Router and Experts
layer_0 = layers[0]
print("\n=== üî¨ INSIDE LAYER 0 ===")
for name, module in layer_0.named_modules():
    # Only print top-level components of the layer
    if name.count(".") == 0 and name != "":
        print(f" ‚Ä¢ {name}: {module.__class__.__name__}")

        # If this is the MoE block, dig deeper
        if "moe" in name.lower():
            print(f"    ‚îî‚îÄ‚îÄ Found MoE Block! Digging deeper...")
            for sub_name, sub_module in module.named_modules():
                if sub_name == "gate":
                    print(f"        üëâ ROUTER found: '{name}.{sub_name}' -> {sub_module}")
                if "experts" in sub_name and sub_name.count(".") == 0:
                     print(f"        üëâ EXPERTS container: '{name}.{sub_name}' -> Contains {len(sub_module)} experts")


=== üîç MODEL ARCHITECTURE DIG ===
Model Class: OlmoeForCausalLM
Total Layers: 16

=== üî¨ INSIDE LAYER 0 ===
 ‚Ä¢ self_attn: OlmoeAttention
 ‚Ä¢ mlp: OlmoeSparseMoeBlock
 ‚Ä¢ input_layernorm: OlmoeRMSNorm
 ‚Ä¢ post_attention_layernorm: OlmoeRMSNorm


In [5]:
print("\n" + "="*50)
print(f"üèóÔ∏è  DEEP DIVE: {model_name}")
print("="*50)

# Recursive function to print structure with indent
def print_structure(module, indent=0):
    for name, child in module.named_children():
        prefix = "  " * indent
        print(f"{prefix}‚îú‚îÄ‚îÄ \033[1m{name}\033[0m: {child.__class__.__name__}")

        # Dig deeper if it's a container or MoE layer
        if list(child.named_children()):
            print_structure(child, indent + 1)

# Start digging from the main model block
# We usually skip the outer 'model' wrapper to save space
print_structure(model)

print("\n" + "="*50)
print("üßê INSPECTING SPECIFIC PARAMETERS")
print("="*50)

# Let's verify the exact path to the router
found_router = False
for name, module in model.named_modules():
    if "gate" in name or "router" in name:
        print(f"‚úÖ Found potential Router: \033[92m{name}\033[0m")
        print(f"   Type: {module}")
        found_router = True

if not found_router:
    print("‚ùå Could not auto-detect router name. Check the tree above manually.")


üèóÔ∏è  DEEP DIVE: allenai/OLMoE-1B-7B-0924
‚îú‚îÄ‚îÄ [1mmodel[0m: OlmoeModel
  ‚îú‚îÄ‚îÄ [1membed_tokens[0m: Embedding
  ‚îú‚îÄ‚îÄ [1mlayers[0m: ModuleList
    ‚îú‚îÄ‚îÄ [1m0[0m: OlmoeDecoderLayer
      ‚îú‚îÄ‚îÄ [1mself_attn[0m: OlmoeAttention
        ‚îú‚îÄ‚îÄ [1mq_proj[0m: Linear
        ‚îú‚îÄ‚îÄ [1mk_proj[0m: Linear
        ‚îú‚îÄ‚îÄ [1mv_proj[0m: Linear
        ‚îú‚îÄ‚îÄ [1mo_proj[0m: Linear
        ‚îú‚îÄ‚îÄ [1mq_norm[0m: OlmoeRMSNorm
        ‚îú‚îÄ‚îÄ [1mk_norm[0m: OlmoeRMSNorm
      ‚îú‚îÄ‚îÄ [1mmlp[0m: OlmoeSparseMoeBlock
        ‚îú‚îÄ‚îÄ [1mgate[0m: OlmoeTopKRouter
        ‚îú‚îÄ‚îÄ [1mexperts[0m: OlmoeExperts
          ‚îú‚îÄ‚îÄ [1mact_fn[0m: SiLUActivation
      ‚îú‚îÄ‚îÄ [1minput_layernorm[0m: OlmoeRMSNorm
      ‚îú‚îÄ‚îÄ [1mpost_attention_layernorm[0m: OlmoeRMSNorm
    ‚îú‚îÄ‚îÄ [1m1[0m: OlmoeDecoderLayer
      ‚îú‚îÄ‚îÄ [1mself_attn[0m: OlmoeAttention
        ‚îú‚îÄ‚îÄ [1mq_proj[0m: Linear
        ‚îú‚îÄ‚îÄ [1mk_proj[0m: Lin

In [11]:
import torch
from collections import defaultdict

# 1. SETUP: Define the hook mechanism
# We will store how many times each expert is selected
# { layer_idx: { expert_idx: count } }
activation_counts = defaultdict(lambda: torch.zeros(64, device="cuda"))

def get_router_hook(layer_idx):
    def hook(module, input, output):
        # --- PROBE (DEBUGGING) ---
        # The first time this runs, we print what the output looks like
        # so we are 100% sure we are grabbing the right thing.
        if layer_idx == 0 and activation_counts[0].sum() == 0:
            print(f"\nüïµÔ∏è PROBE - Layer {layer_idx} Gate Output Type: {type(output)}")
            if isinstance(output, tuple):
                print(f"   It's a tuple! Length: {len(output)}")
                print(f"   Item 0 shape: {output[0].shape}")
            else:
                print(f"   Shape: {output.shape}")

        # --- THE LOGIC ---
        # OLMoE's router usually outputs 'router_logits' of shape [batch*seq_len, num_experts]
        # We need to calculate which ones are the Top-8 (active ones)

        router_logits = output

        # Handle if it's a tuple (logits usually come first)
        if isinstance(output, tuple):
            router_logits = output[0] # Usually [batch, seq_len, num_experts]

        # Reshape to [total_tokens, num_experts]
        if len(router_logits.shape) == 3:
            router_logits = router_logits.view(-1, router_logits.shape[-1])

        # Find Top-8 Experts (OLMoE standard)
        # We don't care about the values, just the indices
        _, selected_indices = torch.topk(router_logits, k=8, dim=-1)

        # Flatten to a simple list of "all experts selected in this batch"
        indices_flat = selected_indices.flatten()

        # Count them efficiently on GPU
        counts = torch.bincount(indices_flat, minlength=64)
        activation_counts[layer_idx] += counts

    return hook

# 2. ATTACH: Hook every layer
print("ü™ù Attaching hooks to model.layers...mlp.gate")
hooks = []
for i, layer in enumerate(model.model.layers):
    # Use the path YOU found: mlp.gate
    h = layer.mlp.gate.register_forward_hook(get_router_hook(i))
    hooks.append(h)

print(f"‚úÖ Attached {len(hooks)} hooks.")

# 3. RUN: Pass some dummy data (or your real data)
# Since we don't have your dataset yet, I'll use a generic sentence to test.
print("\nüöÄ Running Forward Pass...")
dummy_text = "The quick brown fox jumps over the lazy dog. " * 10
inputs = tokenizer(dummy_text, return_tensors="pt").to("cuda")

with torch.no_grad():
    _ = model(**inputs)

# 4. REPORT: Who are the "Hot" Experts?
print("\n" + "="*40)
print("üî• HOT EXPERTS REPORT (Top 3 per layer)")
print("="*40)

hot_experts_map = {} # We save this for the next step

for layer_idx in range(16): # 16 layers
    counts = activation_counts[layer_idx]

    # Get indices of the experts with highest counts
    top_k_indices = torch.topk(counts, k=3).indices.tolist()

    # Calculate usage % (just for your info)
    total_calls = counts.sum().item()
    top_usage = counts[top_k_indices[0]].item()
    percentage = (top_usage / total_calls) * 100 if total_calls > 0 else 0

    print(f"Layer {layer_idx:02d}: Experts {top_k_indices} (Top 1 used {percentage:.1f}%)")

    # Save for HELLoRA config later
    hot_experts_map[layer_idx] = top_k_indices

# 5. CLEANUP: Remove hooks so they don't slow down future runs
for h in hooks:
    h.remove()
print("\nüßπ Hooks removed.")

ü™ù Attaching hooks to model.layers...mlp.gate
‚úÖ Attached 16 hooks.

üöÄ Running Forward Pass...

üïµÔ∏è PROBE - Layer 0 Gate Output Type: <class 'tuple'>
   It's a tuple! Length: 3
   Item 0 shape: torch.Size([101, 64])

üî• HOT EXPERTS REPORT (Top 3 per layer)
Layer 00: Experts [6, 41, 42] (Top 1 used 7.7%)
Layer 01: Experts [5, 37, 9] (Top 1 used 8.8%)
Layer 02: Experts [5, 38, 48] (Top 1 used 7.9%)
Layer 03: Experts [61, 31, 10] (Top 1 used 9.4%)
Layer 04: Experts [43, 63, 39] (Top 1 used 9.4%)
Layer 05: Experts [0, 37, 60] (Top 1 used 8.8%)
Layer 06: Experts [57, 10, 31] (Top 1 used 11.1%)
Layer 07: Experts [4, 0, 35] (Top 1 used 10.9%)
Layer 08: Experts [27, 51, 6] (Top 1 used 10.9%)
Layer 09: Experts [51, 9, 14] (Top 1 used 10.5%)
Layer 10: Experts [22, 20, 44] (Top 1 used 11.0%)
Layer 11: Experts [47, 7, 14] (Top 1 used 11.0%)
Layer 12: Experts [32, 45, 26] (Top 1 used 8.8%)
Layer 13: Experts [59, 16, 39] (Top 1 used 10.8%)
Layer 14: Experts [4, 17, 35] (Top 1 used 11.3%)