##### 00_examine_existing_models.ipynb - Purpose
This notebook is ONLY about examining existing pretrained models ‚Äî how they‚Äôre stored, what files they contain, and how they‚Äôre structured on disk.

##### Goals
- Download small pretrained models from Hugging Face.
- Inspect the folder layout (tokenizer files, config, model weights).
- Understand what each file does (.bin, .safetensors, config.json, tokenizer.json, etc.).
- Load the model and print shapes of key components (embeddings, attention, MLP).
- Get a practical feel for ‚Äúmodel anatomy‚Äù before building my own tiny version later.

##### Scope (Important)
- **NOT** learning how they're trained.  
- **ONLY** file structure, weights, components, and practical inspection.

##### Expected Outputs
- Folder snapshots.
- Breakdown of model components.
- Parameter counts and shape summaries.

---


# Setup

---


# Download Model

In [1]:
from transformers import AutoModel, AutoTokenizer, AutoConfig
from pathlib import Path
import os
import json

# Get default cache directory
hf_home = os.environ.get('HF_HOME')
default_download_path = Path(hf_home) if hf_home else Path.home() / ".cache" / "huggingface"
print(f"Default download folder: {default_download_path.absolute()}")

# Model to download
model_name = "gpt2"

# Check if model already exists
model_cache_subdir = default_download_path / "hub" / f"models--{model_name.replace('/', '--')}"
snapshots_dir = model_cache_subdir / "snapshots"

if snapshots_dir.exists():
    snapshots = list(snapshots_dir.iterdir())
    if snapshots:
        print(f"‚úì Model already downloaded at: {snapshots[0]}")
        os.startfile(snapshots[0])
    else:
        print("‚úó Model directory exists but empty")
else:
    print(f"‚úó Model not downloaded yet - will download in next cell")


Default download folder: C:\Users\Delulu Lemon\.cache\huggingface
‚úì Model already downloaded at: C:\Users\Delulu Lemon\.cache\huggingface\hub\models--gpt2\snapshots\607a30d783dfa663caf39e06633721c8d4cfcd7e


In [2]:
# Download and load the tokenizer, config, and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
config = AutoConfig.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

print("‚úì Model downloaded and loaded successfully!")
print(f"\nModel type: {type(model).__name__}")
print(f"Config type: {type(config).__name__}")
print(f"Tokenizer type: {type(tokenizer).__name__}")

# Open the model folder to see the downloaded files
# (Reconstruct paths in case cell 4 wasn't run - cell 4 should be run first)
try:
    # Try to use variables from cell 4
    _ = default_download_path
    _ = model_name
except NameError:
    # If not available, reconstruct them
    import os
    from pathlib import Path
    hf_home = os.environ.get('HF_HOME')
    default_download_path = Path(hf_home) if hf_home else Path.home() / ".cache" / "huggingface"
    model_name = "gpt2"

model_cache_subdir = default_download_path / "hub" / f"models--{model_name.replace('/', '--')}"
snapshots_dir = model_cache_subdir / "snapshots"
if snapshots_dir.exists():
    snapshots = list(snapshots_dir.iterdir())
    if snapshots:
        model_files_dir = snapshots[0]
        print(f"\nOpening model files folder: {model_files_dir}")
        os.startfile(model_files_dir)


‚úì Model downloaded and loaded successfully!

Model type: GPT2Model
Config type: GPT2Config
Tokenizer type: GPT2TokenizerFast

Opening model files folder: C:\Users\Delulu Lemon\.cache\huggingface\hub\models--gpt2\snapshots\607a30d783dfa663caf39e06633721c8d4cfcd7e


---


# Use Models

---

# Enumerate Models

---


# Examine Model Files


In [5]:
try:
    _ = default_download_path
    _ = model_name
except NameError:
    import os
    from pathlib import Path
    hf_home = os.environ.get('HF_HOME')
    default_download_path = Path(hf_home) if hf_home else Path.home() / ".cache" / "huggingface"
    model_name = "gpt2"

def print_tree(path, prefix="", is_last=True):
    """Print directory tree with file sizes"""
    name = path.name if path.name else str(path)
    
    if path.is_file():
        size = path.stat().st_size
        size_mb = size / (1024 * 1024)
        size_str = f"({size_mb:.2f} MB)"
        icon = "üìÑ"
    else:
        size_str = ""
        icon = "üìÅ"
    
    connector = "‚îî‚îÄ‚îÄ " if is_last else "‚îú‚îÄ‚îÄ "
    print(f"{prefix}{connector}{icon} {name} {size_str}")
    
    if path.is_dir():
        children = sorted(path.iterdir(), key=lambda x: (x.is_file(), x.name))
        extension = "    " if is_last else "‚îÇ   "
        new_prefix = prefix + extension
        
        for i, child in enumerate(children):
            is_last_child = (i == len(children) - 1)
            print_tree(child, new_prefix, is_last_child)

model_cache_path = default_download_path / "hub" / f"models--{model_name.replace('/', '--')}"
snapshots_dir = model_cache_path / "snapshots"

if snapshots_dir.exists():
    snapshots = list(snapshots_dir.iterdir())
    if snapshots:
        print(f"Model folder hierarchy:\n")
        print_tree(model_cache_path)
    else:
        print("‚úó No snapshots found")
else:
    print("‚úó Model not downloaded yet")


Model folder hierarchy:

‚îî‚îÄ‚îÄ üìÅ models--gpt2 
    ‚îú‚îÄ‚îÄ üìÅ .no_exist 
    ‚îÇ   ‚îî‚îÄ‚îÄ üìÅ 607a30d783dfa663caf39e06633721c8d4cfcd7e 
    ‚îÇ       ‚îú‚îÄ‚îÄ üìÑ added_tokens.json (0.00 MB)
    ‚îÇ       ‚îú‚îÄ‚îÄ üìÑ chat_template.jinja (0.00 MB)
    ‚îÇ       ‚îî‚îÄ‚îÄ üìÑ special_tokens_map.json (0.00 MB)
    ‚îú‚îÄ‚îÄ üìÅ blobs 
    ‚îú‚îÄ‚îÄ üìÅ refs 
    ‚îÇ   ‚îî‚îÄ‚îÄ üìÑ main (0.00 MB)
    ‚îî‚îÄ‚îÄ üìÅ snapshots 
        ‚îî‚îÄ‚îÄ üìÅ 607a30d783dfa663caf39e06633721c8d4cfcd7e 
            ‚îú‚îÄ‚îÄ üìÑ config.json (0.00 MB)
            ‚îú‚îÄ‚îÄ üìÑ merges.txt (0.44 MB)
            ‚îú‚îÄ‚îÄ üìÑ model.safetensors (522.71 MB)
            ‚îú‚îÄ‚îÄ üìÑ tokenizer.json (1.29 MB)
            ‚îú‚îÄ‚îÄ üìÑ tokenizer_config.json (0.00 MB)
            ‚îî‚îÄ‚îÄ üìÑ vocab.json (0.99 MB)


---


# Example Model

In [4]:
# Display model configuration
print("=" * 60)
print("MODEL CONFIGURATION")
print("=" * 60)
print(json.dumps(config.to_dict(), indent=2))

print("\n" + "=" * 60)
print("KEY MODEL PARAMETERS")
print("=" * 60)
if hasattr(config, 'vocab_size'):
    print(f"Vocabulary size: {config.vocab_size:,}")
if hasattr(config, 'hidden_size'):
    print(f"Hidden size: {config.hidden_size:,}")
if hasattr(config, 'num_attention_heads'):
    print(f"Number of attention heads: {config.num_attention_heads}")
if hasattr(config, 'num_hidden_layers'):
    print(f"Number of layers: {config.num_hidden_layers}")
if hasattr(config, 'max_position_embeddings'):
    print(f"Max position embeddings: {config.max_position_embeddings:,}")

# Count total parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"\nTotal parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print(f"Model size (approx): {total_params * 4 / (1024**2):.2f} MB (assuming float32)")


MODEL CONFIGURATION
{
  "vocab_size": 50257,
  "n_positions": 1024,
  "n_embd": 768,
  "n_layer": 12,
  "n_head": 12,
  "n_inner": null,
  "activation_function": "gelu_new",
  "resid_pdrop": 0.1,
  "embd_pdrop": 0.1,
  "attn_pdrop": 0.1,
  "layer_norm_epsilon": 1e-05,
  "initializer_range": 0.02,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "scale_attn_weights": true,
  "use_cache": true,
  "scale_attn_by_inverse_layer_idx": false,
  "reorder_and_upcast_attn": false,
  "bos_token_id": 50256,
  "eos_token_id": 50256,
  "return_dict": true,
  "output_hidden_states": false,
  "torchscript": false,
  "dtype": null,
  "pruned_heads": {},
  "tie_word_embeddings": true,
  "chunk_size_feed_forward": 0,
  "is_encoder_decoder": false,
  "is_decoder": false,
  "cross_attention_hidden_size": null,
  "add_cross_attention": false,
  "tie_encoder_decoder": false,
  "architectures": [
    "G

---
