##### 00_examine_existing_models.ipynb - Purpose
This notebook is ONLY about examining existing pretrained models ‚Äî how they‚Äôre stored, what files they contain, and how they‚Äôre structured on disk.

##### Goals
- Download small pretrained models from Hugging Face.
- Inspect the folder layout (tokenizer files, config, model weights).
- Understand what each file does (.bin, .safetensors, config.json, tokenizer.json, etc.).
- Load the model and print shapes of key components (embeddings, attention, MLP).
- Get a practical feel for ‚Äúmodel anatomy‚Äù before building my own tiny version later.

##### Scope (Important)
- **NOT** learning how they're trained.  
- **ONLY** file structure, weights, components, and practical inspection.

##### Expected Outputs
- Folder snapshots.
- Breakdown of model components.
- Parameter counts and shape summaries.

---


# Setup

---


# Download Model

In [None]:
from transformers import AutoModel, AutoTokenizer, AutoConfig
from pathlib import Path
import os
import json

# Get default cache directory
hf_home = os.environ.get('HF_HOME')
default_download_path = Path(hf_home) if hf_home else Path.home() / ".cache" / "huggingface"
print(f"Default download folder: {default_download_path.absolute()}")

# Model to download
# Toy + small model examples (uncomment ONE to use)

# --- TINY MODELS (<100MB weights) ---
# model_name = "sshleifer/tiny-gpt2"                     # ~15M params,  ~70MB  (best tiny GPT-2 for inspection)
# model_name = "roneneldan/TinyStories-1M"               # ~1M params,   ~6MB   (ultra tiny, educational)
# model_name = "roneneldan/TinyStories-10M"              # ~10M params,  ~30MB  (tiny decoder-only transformer)
# model_name = "EleutherAI/pythia-14m"                   # ~14M params,  ~55MB  (tiny pythia baseline)
# model_name = "google/flan-t5-small"                    # ~60M params,  ~240MB (encoder-decoder; small but different arch)

# --- SMALL MODELS (~100M‚Äì200M weights) ---
# model_name = "distilgpt2"                              # ~82M params,  ~320MB (distilled GPT2; great size/quality balance)
# model_name = "EleutherAI/pythia-70m"                   # ~70M params,  ~280MB (tiny GPT-NeoX style arch)
# model_name = "gpt2"                                    # ~124M params, ~500MB (baseline GPT-2 small)
# model_name = "facebook/opt-125m"                       # ~125M params, ~500MB (OPT architecture)

# --- MID-SMALL MODELS (>200M weights) ---
# model_name = "microsoft/DialoGPT-small"                # ~117M params, ~470MB (GPT2 tuned for dialogue)
# model_name = "EleutherAI/pythia-160m"                  # ~160M params, ~640MB (larger Pythia variant)
# model_name = "gpt2-medium"                             # ~355M params, ~1.4GB (too big for deep inspection but realistic)


model_name = "gpt2"

# Check if model already exists
model_cache_subdir = default_download_path / "hub" / f"models--{model_name.replace('/', '--')}"
snapshots_dir = model_cache_subdir / "snapshots"

if snapshots_dir.exists():
    snapshots = list(snapshots_dir.iterdir())
    if snapshots:
        print(f"‚úì Model already downloaded at: {snapshots[0]}")
        os.startfile(snapshots[0])
    else:
        print("‚úó Model directory exists but empty")
else:
    print(f"‚úó Model not downloaded yet - will download in next cell")


Default download folder: C:\Users\Delulu Lemon\.cache\huggingface
‚úì Model already downloaded at: C:\Users\Delulu Lemon\.cache\huggingface\hub\models--gpt2\snapshots\607a30d783dfa663caf39e06633721c8d4cfcd7e


In [2]:
# Download and load the tokenizer, config, and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
config = AutoConfig.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

print("‚úì Model downloaded and loaded successfully!")
print(f"\nModel type: {type(model).__name__}")
print(f"Config type: {type(config).__name__}")
print(f"Tokenizer type: {type(tokenizer).__name__}")

# Open the model folder to see the downloaded files
# (Reconstruct paths in case cell 4 wasn't run - cell 4 should be run first)
try:
    # Try to use variables from cell 4
    _ = default_download_path
    _ = model_name
except NameError:
    # If not available, reconstruct them
    import os
    from pathlib import Path
    hf_home = os.environ.get('HF_HOME')
    default_download_path = Path(hf_home) if hf_home else Path.home() / ".cache" / "huggingface"
    model_name = "gpt2"

model_cache_subdir = default_download_path / "hub" / f"models--{model_name.replace('/', '--')}"
snapshots_dir = model_cache_subdir / "snapshots"
if snapshots_dir.exists():
    snapshots = list(snapshots_dir.iterdir())
    if snapshots:
        model_files_dir = snapshots[0]
        print(f"\nOpening model files folder: {model_files_dir}")
        os.startfile(model_files_dir)


‚úì Model downloaded and loaded successfully!

Model type: GPT2Model
Config type: GPT2Config
Tokenizer type: GPT2TokenizerFast

Opening model files folder: C:\Users\Delulu Lemon\.cache\huggingface\hub\models--gpt2\snapshots\607a30d783dfa663caf39e06633721c8d4cfcd7e


---


# Enumerate Models

In [13]:
# Enumerate all downloaded models
try:
    _ = default_download_path
except NameError:
    import os
    from pathlib import Path
    hf_home = os.environ.get('HF_HOME')
    default_download_path = Path(hf_home) if hf_home else Path.home() / ".cache" / "huggingface"

hub_dir = default_download_path / "hub"

if hub_dir.exists():
    # Find all model directories (they start with "models--")
    model_dirs = [d for d in hub_dir.iterdir() if d.is_dir() and d.name.startswith("models--")]
    
    if model_dirs:
        print(f"Found {len(model_dirs)} downloaded model(s):\n")
        
        for i, model_dir in enumerate(sorted(model_dirs), 1):
            # Extract model name (convert "models--gpt2" back to "gpt2")
            model_name = model_dir.name.replace("models--", "").replace("--", "/")
            
            # Get snapshots
            snapshots_dir = model_dir / "snapshots"
            snapshots = []
            if snapshots_dir.exists():
                snapshots = list(snapshots_dir.iterdir())
            
            # Calculate total size
            total_size = 0
            if snapshots:
                for snapshot in snapshots:
                    for file in snapshot.rglob("*"):
                        if file.is_file():
                            total_size += file.stat().st_size
            
            total_size_mb = total_size / (1024 * 1024)
            
            print(f"{i}. {model_name}")
            print(f"   Snapshots: {len(snapshots)}")
            print(f"   Total size: {total_size_mb:.2f} MB")
            if snapshots:
                print(f"   Latest: {snapshots[0].name[:12]}...")
            print()
    else:
        print("No models found in cache directory")
else:
    print("Cache directory not found or empty")


Found 2 downloaded model(s):

1. gpt2
   Snapshots: 1
   Total size: 525.44 MB
   Latest: 607a30d783df...

2. tiiuae/falcon-7b-instruct
   Snapshots: 1
   Total size: 13768.35 MB
   Latest: 8782b5c5d8c9...



---

# Use Models

In [10]:
# Load the language model (with language modeling head) for text generation
from transformers import AutoModelForCausalLM, AutoTokenizer

# Always load the correct model type for text generation (AutoModelForCausalLM)
# Cell 5 loads AutoModel (base model), but we need AutoModelForCausalLM for generation
try:
    _ = tokenizer
except NameError:
    model_name = "gpt2"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the language model (with LM head) for text generation
model_name = "gpt2"
lm_model = AutoModelForCausalLM.from_pretrained(model_name)

# Ask a question
question = "What is capital of kansas?"

print(f"Question: {question}\n")
print("Answer:")

# Tokenize the input
inputs = tokenizer.encode(question, return_tensors="pt")

# Generate response
outputs = lm_model.generate(
    inputs,
    max_length=100,
    num_return_sequences=1,
    temperature=0.7,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id
)

# Decode and print the response
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)


Question: What is capital of kansas?

Answer:
What is capital of kansas?

Capital of kansas is defined as a capital amount in the local currency. Capital of kansas is calculated based on the number of people under the age of 18 years and the amount of cash in circulation.

Where is the capital of kansas located?

Capital of kansas is located in the town or region where the property is located.

How much money has been deposited into the bank accounts of the bank?

The total


In [12]:
##### Understand which neurons got triggered

---


# Examine Model Files


In [7]:
try:
    _ = default_download_path
    _ = model_name
except NameError:
    import os
    from pathlib import Path
    hf_home = os.environ.get('HF_HOME')
    default_download_path = Path(hf_home) if hf_home else Path.home() / ".cache" / "huggingface"
    model_name = "gpt2"

# Descriptions for files and folders
descriptions = {
    # Folders
    "models--gpt2": "Model cache folder - contains all files for this model",
    "snapshots": "Snapshots folder - contains versioned model files (identified by commit hash)",
    "blobs": "Blobs folder - stores deduplicated file content (Hugging Face cache optimization)",
    "refs": "References folder - contains pointers to specific model versions (like git refs)",
    ".no_exist": "Symlink fallback folder - used when symlinks aren't supported on Windows",
    
    # Files
    "config.json": "Model configuration - architecture, hyperparameters, and model settings",
    "model.safetensors": "Model weights - the actual neural network parameters (SafeTensors format)",
    "pytorch_model.bin": "Model weights - alternative format (PyTorch binary, older format)",
    "tokenizer.json": "Tokenizer data - complete tokenizer configuration and vocabulary",
    "tokenizer_config.json": "Tokenizer config - tokenizer settings and special tokens",
    "vocab.json": "Vocabulary mapping - word/token to ID mappings",
    "merges.txt": "BPE merges - Byte Pair Encoding merge rules for tokenization",
    "special_tokens_map.json": "Special tokens - mappings for special tokens (BOS, EOS, PAD, etc.)",
    "added_tokens.json": "Added tokens - custom tokens added to the tokenizer",
    "chat_template.jinja": "Chat template - template for formatting chat conversations",
    "main": "Reference file - points to the main/default model version",
}

def get_description(name):
    """Get description for a file or folder"""
    # Check exact match first
    if name in descriptions:
        return f" <-- {descriptions[name]}"
    # Check if it's a commit hash (long hex string)
    if len(name) == 40 and all(c in '0123456789abcdef' for c in name.lower()):
        return " <-- Model snapshot (identified by commit hash)"
    return ""

def collect_tree_items(path, prefix="", is_last=True, items=None):
    """Collect all tree items with their display info"""
    if items is None:
        items = []
    
    name = path.name if path.name else str(path)
    
    if path.is_file():
        size = path.stat().st_size
        size_mb = size / (1024 * 1024)
        size_str = f"({size_mb:.2f} MB)"
        icon = "üìÑ"
    else:
        size_str = ""
        icon = "üìÅ"
    
    description = get_description(name)
    connector = "‚îî‚îÄ‚îÄ " if is_last else "‚îú‚îÄ‚îÄ "
    
    # Calculate the base display string (without description)
    base_str = f"{prefix}{connector}{icon} {name} {size_str}"
    items.append((base_str, description, path))
    
    if path.is_dir():
        children = sorted(path.iterdir(), key=lambda x: (x.is_file(), x.name))
        extension = "    " if is_last else "‚îÇ   "
        new_prefix = prefix + extension
        
        for i, child in enumerate(children):
            is_last_child = (i == len(children) - 1)
            collect_tree_items(child, new_prefix, is_last_child, items)
    
    return items

def print_tree(path, prefix="", is_last=True):
    """Print directory tree with file sizes and descriptions (aligned arrows)"""
    # First pass: collect all items
    items = collect_tree_items(path, prefix, is_last)
    
    # Find maximum width for alignment
    max_width = max(len(base_str) for base_str, _, _ in items)
    
    # Second pass: print with aligned arrows
    for base_str, description, _ in items:
        padded_base = base_str.ljust(max_width)
        print(f"{padded_base}{description}")

model_cache_path = default_download_path / "hub" / f"models--{model_name.replace('/', '--')}"
snapshots_dir = model_cache_path / "snapshots"

if snapshots_dir.exists():
    snapshots = list(snapshots_dir.iterdir())
    if snapshots:
        print(f"Model folder hierarchy:\n")
        print_tree(model_cache_path)
    else:
        print("‚úó No snapshots found")
else:
    print("‚úó Model not downloaded yet")


Model folder hierarchy:

‚îî‚îÄ‚îÄ üìÅ models--gpt2                                      <-- Model cache folder - contains all files for this model
    ‚îú‚îÄ‚îÄ üìÅ .no_exist                                     <-- Symlink fallback folder - used when symlinks aren't supported on Windows
    ‚îÇ   ‚îî‚îÄ‚îÄ üìÅ 607a30d783dfa663caf39e06633721c8d4cfcd7e  <-- Model snapshot (identified by commit hash)
    ‚îÇ       ‚îú‚îÄ‚îÄ üìÑ added_tokens.json (0.00 MB)           <-- Added tokens - custom tokens added to the tokenizer
    ‚îÇ       ‚îú‚îÄ‚îÄ üìÑ chat_template.jinja (0.00 MB)         <-- Chat template - template for formatting chat conversations
    ‚îÇ       ‚îî‚îÄ‚îÄ üìÑ special_tokens_map.json (0.00 MB)     <-- Special tokens - mappings for special tokens (BOS, EOS, PAD, etc.)
    ‚îú‚îÄ‚îÄ üìÅ blobs                                         <-- Blobs folder - stores deduplicated file content (Hugging Face cache optimization)
    ‚îú‚îÄ‚îÄ üìÅ refs                              

---


# Example Model Object

In [4]:
# Display model configuration
print("=" * 60)
print("MODEL CONFIGURATION")
print("=" * 60)
print(json.dumps(config.to_dict(), indent=2))

print("\n" + "=" * 60)
print("KEY MODEL PARAMETERS")
print("=" * 60)
if hasattr(config, 'vocab_size'):
    print(f"Vocabulary size: {config.vocab_size:,}")
if hasattr(config, 'hidden_size'):
    print(f"Hidden size: {config.hidden_size:,}")
if hasattr(config, 'num_attention_heads'):
    print(f"Number of attention heads: {config.num_attention_heads}")
if hasattr(config, 'num_hidden_layers'):
    print(f"Number of layers: {config.num_hidden_layers}")
if hasattr(config, 'max_position_embeddings'):
    print(f"Max position embeddings: {config.max_position_embeddings:,}")

# Count total parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"\nTotal parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print(f"Model size (approx): {total_params * 4 / (1024**2):.2f} MB (assuming float32)")


MODEL CONFIGURATION
{
  "vocab_size": 50257,
  "n_positions": 1024,
  "n_embd": 768,
  "n_layer": 12,
  "n_head": 12,
  "n_inner": null,
  "activation_function": "gelu_new",
  "resid_pdrop": 0.1,
  "embd_pdrop": 0.1,
  "attn_pdrop": 0.1,
  "layer_norm_epsilon": 1e-05,
  "initializer_range": 0.02,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "scale_attn_weights": true,
  "use_cache": true,
  "scale_attn_by_inverse_layer_idx": false,
  "reorder_and_upcast_attn": false,
  "bos_token_id": 50256,
  "eos_token_id": 50256,
  "return_dict": true,
  "output_hidden_states": false,
  "torchscript": false,
  "dtype": null,
  "pruned_heads": {},
  "tie_word_embeddings": true,
  "chunk_size_feed_forward": 0,
  "is_encoder_decoder": false,
  "is_decoder": false,
  "cross_attention_hidden_size": null,
  "add_cross_attention": false,
  "tie_encoder_decoder": false,
  "architectures": [
    "G

---
