# SEC 10-K HTML File Structure: An Exploratory Walkthrough

**Purpose:** Hands-on exploration of the raw EDGAR full-submission text files in `data/raw/`.
Each `.html` file is **not** a standard HTML file — it is an SGML container embedding
88–262 separate documents (HTML, XML, JSON, images, ZIP) concatenated inside a single file.

**Reference files (from research doc `2026-02-22_12-00-00_sec_html_structure_and_extraction.md`):**
- `AAPL_10K_2021.html` — 10.0 MB, 88 embedded docs
- `ADI_10K_2025.html` — 13.1 MB, 117 embedded docs
- `ALL_10K_2025.html` — 44.3 MB, 262 embedded docs

---
**Sections:**
1. Setup & Load
2. Layer 1 — SGML Container & Document Index
3. Layer 2a — SGML Header Metadata
4. Layer 2b — DEI iXBRL Tags
5. Layer 3 — Document Index Distributions (cross-file)
6. Layer 4 — High-Value Sub-Documents (MetaLinks, FilingSummary, R*.htm, XBRL instance)
7. Layer 5 — Universal Corpus Patterns
8. Extraction Coverage: What We Get vs. What's Available
9. Distribution Analysis & Pattern Discovery

## 1. Setup & Load

In [None]:
import json
import re
import sys
from collections import Counter, defaultdict
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
from xml.etree import ElementTree as ET

# Add repo root to path so we can import project modules if needed.
# NOTE: "../" is resolved against the current working directory, so this
# assumes the notebook runs from a directory one level below the repo root.
REPO_ROOT = Path("../").resolve()
sys.path.insert(0, str(REPO_ROOT))

RAW_DATA_DIR = REPO_ROOT / "data" / "raw"

# Reference files from research document
REF_FILES = {
    "AAPL_2021": RAW_DATA_DIR / "AAPL_10K_2021.html",
    "ADI_2025":  RAW_DATA_DIR / "ADI_10K_2025.html",
    "ALL_2025":  RAW_DATA_DIR / "ALL_10K_2025.html",
}

# Verify files exist. Existence is tested explicitly so that an empty
# (zero-byte) file is reported as "0.0 MB" rather than as "MISSING".
for name, path in REF_FILES.items():
    if path.exists():
        status = f"{path.stat().st_size / 1e6:.1f} MB"
    else:
        status = "MISSING"
    print(f"  {name:<12} {path.name:<25} {status}")

In [None]:
# Load the smallest reference file into memory for interactive exploration.
# The whole filing is held as one string (`aapl_text`) that later cells slice;
# undecodable bytes are substituted instead of raising (errors="replace").
aapl_path = REF_FILES["AAPL_2021"]
print(f"Loading {aapl_path.name} ...")
aapl_text = aapl_path.read_text(encoding="utf-8", errors="replace")
# NOTE: the "MB" figure below is characters / 1e6, not bytes on disk.
print(f"  Loaded: {len(aapl_text):,} characters  ({len(aapl_text)/1e6:.1f} MB)")

## 2. Layer 1 — SGML Container & Document Index

Every file begins with an `<SEC-HEADER>` block followed by a flat sequence of
`<DOCUMENT>` entries, each with a `<TYPE>`, `<SEQUENCE>`, `<FILENAME>`, optional
`<DESCRIPTION>`, and `<TEXT>` content block.

In [None]:
# Show the first 80 lines of the raw file to see the SGML structure.
# `lines` holds the whole file split on line boundaries; numbering is 1-based.
lines = aapl_text.splitlines()
print(f"Total lines: {len(lines):,}")
print("\n--- First 80 lines (SGML header + first document boundary) ---")
for i, line in enumerate(lines[:80], 1):
    print(f"{i:>4}  {line}")

In [None]:
# -----------------------------------------------------------------------
# build_doc_index: single-pass parse of SGML <DOCUMENT> boundaries
# -----------------------------------------------------------------------

_DOC_BOUNDARY = re.compile(
    r"<DOCUMENT>\s*"
    r"<TYPE>([^\n]+)\n"
    r"<SEQUENCE>([^\n]+)\n"
    r"<FILENAME>([^\n]+)\n"
    r"(?:<DESCRIPTION>([^\n]*)\n)?"  # optional
    r"<TEXT>",
    re.IGNORECASE,
)
_TEXT_END = re.compile(r"</TEXT>", re.IGNORECASE)
_INNER_WRAPPER = re.compile(r"^<(?:XBRL|XML|JSON)>\s*", re.IGNORECASE)
_INNER_WRAPPER_CLOSE = re.compile(r"\s*</(?:XBRL|XML|JSON)>$", re.IGNORECASE)


def build_doc_index(text: str) -> list[dict]:
    """Parse SGML <DOCUMENT> boundaries. Single pass; O(1) subsequent access."""
    index = []
    for m in _DOC_BOUNDARY.finditer(text):
        text_start = m.end()
        end_m = _TEXT_END.search(text, text_start)
        text_end = end_m.start() if end_m else len(text)
        index.append({
            "seq":        m.group(2).strip(),
            "type":       m.group(1).strip(),
            "filename":   m.group(3).strip(),
            "desc":       (m.group(4) or "").strip(),
            "text_start": text_start,
            "text_end":   text_end,
            "size_bytes": text_end - text_start,
        })
    return index


def get_doc(text: str, index: list[dict], filename: str) -> tuple[str | None, dict | None]:
    """Extract a named document from the pre-built index, stripping inner wrappers."""
    entry = next((d for d in index if d["filename"].lower() == filename.lower()), None)
    if not entry:
        return None, None
    raw = text[entry["text_start"]:entry["text_end"]].strip()
    raw = _INNER_WRAPPER.sub("", raw)
    raw = _INNER_WRAPPER_CLOSE.sub("", raw)
    return raw.strip(), entry


# Build the AAPL index once; later cells reuse `aapl_index` together with
# `aapl_text` to look documents up by filename.
aapl_index = build_doc_index(aapl_text)
print(f"AAPL 2021 — {len(aapl_index)} embedded documents")

In [None]:
# Tabulate the AAPL document index: one row per embedded document, with the
# description clipped to 60 characters and the size expressed in KB.
def _index_row(doc):
    return {
        "seq": doc["seq"],
        "type": doc["type"],
        "filename": doc["filename"],
        "desc": doc["desc"][:60],
        "size_KB": round(doc["size_bytes"] / 1024, 1),
    }


df_index = pd.DataFrame([_index_row(doc) for doc in aapl_index])
print(f"Shape: {df_index.shape}")
df_index.head(20)

In [None]:
# Per-type document count and cumulative content size for the AAPL filing,
# gathered in a single pass over the index.
type_counts = Counter()
type_sizes = defaultdict(int)
for d in aapl_index:
    type_counts[d["type"]] += 1
    type_sizes[d["type"]] += d["size_bytes"]

# Rows are ordered from the most to the least frequent document type.
type_rows = []
for doc_type, n_docs in type_counts.most_common():
    type_rows.append({
        "type": doc_type,
        "count": n_docs,
        "total_size_KB": round(type_sizes[doc_type] / 1024, 1),
    })
df_types = pd.DataFrame(type_rows)
print("AAPL 2021 — Document type distribution:")
df_types

In [None]:
# Show the document boundary pattern in raw text for the first two documents:
# locate every "<DOCUMENT>" tag, then print the 400 characters following the
# first one and the 200 characters following the second.
# NOTE: indexing doc_starts[0] / doc_starts[1] assumes the file contains at
# least two <DOCUMENT> tags.
doc_starts = [m.start() for m in re.finditer(r"<DOCUMENT>", aapl_text, re.IGNORECASE)]
print(f"Number of <DOCUMENT> tags: {len(doc_starts)}")
print(f"\n--- First document boundary (chars {doc_starts[0]}–{doc_starts[0]+400}) ---")
print(aapl_text[doc_starts[0]:doc_starts[0]+400])
print(f"\n--- Second document boundary (chars {doc_starts[1]}–{doc_starts[1]+200}) ---")
print(aapl_text[doc_starts[1]:doc_starts[1]+200])

## 3. Layer 2a — SGML Header Metadata

The `<SEC-HEADER>` block (roughly the first 80 lines of the file) contains plain-text
key-value pairs for company identity, filing dates, and SIC classification.

In [None]:
# Show the raw SGML header: everything between <SEC-HEADER> and </SEC-HEADER>.
# re.DOTALL lets ".*?" span lines; nothing is printed if the block is absent.
header_m = re.search(r"<SEC-HEADER>(.*?)</SEC-HEADER>", aapl_text, re.DOTALL | re.IGNORECASE)
if header_m:
    print("--- Raw SEC-HEADER block ---")
    print(header_m.group(1))

In [None]:
# SGML header field parser
_SIC_CODE = re.compile(r"\[(\d+)\]")

# Value part shared by every header field: optional horizontal whitespace after
# the colon, then a value that must begin with a non-whitespace character ON
# THE SAME LINE. "[^\S\r\n]" is "whitespace other than a line break"; a plain
# "\s*" would step over the newline of an empty field and capture the next
# header line as this field's value.
_HEADER_VALUE = r":[^\S\r\n]*(\S.*)"

# (regex, result key) pairs; each regex has exactly one capture group (the value).
_HEADER_FIELDS = [
    ("CONFORMED SUBMISSION TYPE" + _HEADER_VALUE,          "submission_type"),
    ("PUBLIC DOCUMENT COUNT" + _HEADER_VALUE,              "document_count"),
    ("CONFORMED PERIOD OF REPORT" + _HEADER_VALUE,         "period_of_report"),
    ("FILED AS OF DATE" + _HEADER_VALUE,                   "filed_as_of_date"),
    ("ACCESSION NUMBER" + _HEADER_VALUE,                   "accession_number"),
    ("COMPANY CONFORMED NAME" + _HEADER_VALUE,             "company_name"),
    ("CENTRAL INDEX KEY" + _HEADER_VALUE,                  "cik"),
    ("STANDARD INDUSTRIAL CLASSIFICATION" + _HEADER_VALUE, "sic_full"),
    ("EIN" + _HEADER_VALUE,                                "ein"),
    ("STATE OF INCORPORATION" + _HEADER_VALUE,             "state_of_incorporation"),
    ("FISCAL YEAR END" + _HEADER_VALUE,                    "fiscal_year_end"),
    ("SEC FILE NUMBER" + _HEADER_VALUE,                    "sec_file_number"),
    ("FORM TYPE" + _HEADER_VALUE,                          "form_type"),
]

def parse_sgml_header(text: str) -> dict:
    """Extract key/value metadata from the <SEC-HEADER> block of a filing.

    Returns a dict keyed by the names in ``_HEADER_FIELDS``; a field is present
    only when its label occurs with a non-empty value on the same line (the
    first such occurrence wins). Two kinds of derived keys are added:

    * ``sic_code`` / ``sic_name`` — split out of ``sic_full`` when it contains
      a bracketed numeric code such as ``"ELECTRONIC COMPUTERS [3571]"``.
    * ``fiscal_year`` — the first four characters of ``period_of_report`` when
      that value is exactly eight characters long.

    Returns an empty dict when ``text`` has no <SEC-HEADER> block.
    """
    hm = re.search(r"<SEC-HEADER>(.*?)</SEC-HEADER>", text, re.DOTALL | re.IGNORECASE)
    if not hm:
        return {}
    header = hm.group(1)

    result = {}
    for pattern, key in _HEADER_FIELDS:
        m = re.search(pattern, header, re.IGNORECASE)
        if m:
            # strip() also removes a trailing "\r" from CRLF-terminated lines
            result[key] = m.group(1).strip()

    sic_full = result.get("sic_full")
    if sic_full is not None:
        sic_m = _SIC_CODE.search(sic_full)
        if sic_m:
            result["sic_code"] = sic_m.group(1)
            result["sic_name"] = sic_full[:sic_m.start()].strip()

    period = result.get("period_of_report", "")
    if len(period) == 8:
        result["fiscal_year"] = period[:4]
    return result


# Parse the AAPL header and display it as a two-column (field, value) table.
# `aapl_header` is reused by the SGML-vs-DEI comparison cell further down.
aapl_header = parse_sgml_header(aapl_text)
pd.DataFrame(list(aapl_header.items()), columns=["field", "value"])

## 4. Layer 2b — DEI iXBRL Tags

The main 10-K document contains DEI (Document and Entity Information) iXBRL tags
in its `<ix:hidden>` block. These provide machine-readable company identity data
richer than the SGML header — including the **ticker symbol** (not in SGML header).

In [None]:
# Find the main 10-K document: the first index entry whose <TYPE> is "10-K"
# (case-insensitive), then pull its content into `main_html`.
# NOTE: if no such entry exists `main_entry` is None and the print below raises
# TypeError; the reference filings are expected to contain one.
main_entry = next((d for d in aapl_index if d["type"].upper() == "10-K"), None)
print(f"Main 10-K document: {main_entry['filename']}  ({main_entry['size_bytes']/1e6:.2f} MB)")
main_html, _ = get_doc(aapl_text, aapl_index, main_entry["filename"])

In [None]:
# Show a snippet of the ix:hidden block where DEI tags live.
# Only the first <ix:hidden>…</ix:hidden> span is inspected; nothing is printed
# when the document has none.
hidden_m = re.search(r"<ix:hidden>(.*?)</ix:hidden>", main_html, re.DOTALL | re.IGNORECASE)
if hidden_m:
    hidden = hidden_m.group(1)
    print(f"<ix:hidden> block: {len(hidden):,} chars")
    # Show first 2000 chars
    print("\n--- First 2000 chars of ix:hidden ---")
    print(hidden[:2000])

In [None]:
# DEI tag extractor
# Qualified iXBRL fact names to look for, in display order.
_DEI_TAGS = [
    "dei:EntityCentralIndexKey",
    "dei:TradingSymbol",
    "dei:EntityRegistrantName",
    "dei:DocumentFiscalYearFocus",
    "dei:DocumentFiscalPeriodFocus",
    "dei:DocumentType",
    "dei:DocumentPeriodEndDate",
    "dei:EntityIncorporationStateCountryCode",
    "dei:EntityTaxIdentificationNumber",
    "dei:EntityAddressAddressLine1",
    "dei:EntityAddressCityOrTown",
    "dei:EntityAddressStateOrProvince",
    "dei:EntityAddressPostalZipCode",
    "dei:CityAreaCode",
    "dei:LocalPhoneNumber",
    "dei:Security12bTitle",
    "dei:SecurityExchangeName",
    "dei:EntityWellKnownSeasonedIssuer",
    "dei:EntityFilerCategory",
    "dei:EntityPublicFloat",
    "dei:EntityCommonStockSharesOutstanding",
    "dei:AmendmentFlag",
    "dei:IcfrAuditorAttestationFlag",
]

# One inline-XBRL fact element: group 1 is its name= attribute, group 2 the
# (possibly marked-up) content up to the nearest matching close tag.
_IX_FACT = re.compile(
    r'<ix:(?:non(?:Numeric|Fraction)|numeric)\b[^>]*\bname=["\']([^"\']+)["\'][^>]*>'
    r"(.*?)"
    r"</ix:(?:non(?:Numeric|Fraction)|numeric)>",
    re.DOTALL | re.IGNORECASE,
)
_HTML_TAG = re.compile(r"<[^>]+>")


def extract_dei_tags(html: str) -> dict[str, str]:
    """Collect the text value of each ``_DEI_TAGS`` fact found in ``html``.

    For every listed name the first occurrence with non-empty text wins. The
    value has nested markup removed, HTML character references decoded
    (``&amp;`` -> ``&``, ``&#160;`` -> no-break space) and surrounding
    whitespace stripped. Names that never occur, or whose every occurrence is
    empty after cleaning, are absent from the result.
    """
    # Function-scope import: the `html` parameter shadows the stdlib module
    # name inside this function, so the helper is imported by name.
    from html import unescape

    wanted = set(_DEI_TAGS)
    result: dict[str, str] = {}
    for m in _IX_FACT.finditer(html):
        name = m.group(1).strip()
        if name not in wanted or name in result:
            continue
        # Strip tags BEFORE decoding so an escaped "&lt;b&gt;" in the value is
        # kept as literal text rather than being removed as markup.
        value = unescape(_HTML_TAG.sub("", m.group(2))).strip()
        if value:
            result[name] = value
    return result


# Extract the DEI facts from the main AAPL document and list every tracked tag
# alongside its value (or a "<not found>" placeholder).
aapl_dei = extract_dei_tags(main_html)
print(f"Found {len(aapl_dei)} / {len(_DEI_TAGS)} DEI tags")
dei_rows = []
for dei_name in _DEI_TAGS:
    dei_rows.append({"tag": dei_name, "value": aapl_dei.get(dei_name, "<not found>")})
pd.DataFrame(dei_rows)

In [None]:
# Compare SGML header vs DEI for the same fields.
# Each entry: (row label, SGML header key or None when the header has no such
# field, DEI tag name).
_comparison_spec = [
    ("Company Name",       "company_name",           "dei:EntityRegistrantName"),
    ("CIK",                "cik",                    "dei:EntityCentralIndexKey"),
    ("EIN",                "ein",                    "dei:EntityTaxIdentificationNumber"),
    ("State",              "state_of_incorporation", "dei:EntityIncorporationStateCountryCode"),
    ("Ticker",             None,                     "dei:TradingSymbol"),
    ("Exchange",           None,                     "dei:SecurityExchangeName"),
    ("Fiscal Year",        "fiscal_year",            "dei:DocumentFiscalYearFocus"),
    ("Filer Category",     None,                     "dei:EntityFilerCategory"),
    ("Shares Outstanding", None,                     "dei:EntityCommonStockSharesOutstanding"),
]

comparison = []
for label, header_key, dei_name in _comparison_spec:
    if header_key is None:
        sgml_value = "<NOT IN SGML>"
    else:
        sgml_value = aapl_header.get(header_key)
    comparison.append({
        "field": label,
        "SGML Header": sgml_value,
        "DEI iXBRL": aapl_dei.get(dei_name),
    })
pd.DataFrame(comparison)

## 5. Layer 3 — Document Index Distributions Across All Three Reference Files

Compare the three reference files to see how document counts and types vary.

In [None]:
# Read each reference filing, index its embedded documents and parse its SGML
# header. `ref_data` maps the REF_FILES key to {"text", "index", "header"}.
def _load_reference(path):
    text = path.read_text(encoding="utf-8", errors="replace")
    index = build_doc_index(text)
    header = parse_sgml_header(text)
    return {"text": text, "index": index, "header": header}


print("Building document indexes for all three reference files...")
ref_data = {}
for name, path in REF_FILES.items():
    print(f"  Loading {name} ({path.stat().st_size / 1e6:.1f} MB)...")
    loaded = _load_reference(path)
    ref_data[name] = loaded
    print(f"    {len(loaded['index'])} embedded documents")

print("\nDone.")

In [None]:
# Cross-file document type distribution table: one row per document type, one
# column per reference file, plus a row total.
_counts_by_file = {
    name: Counter(d["type"] for d in ref_data[name]["index"])
    for name in ref_data
}
all_types = sorted({doc_type for counts in _counts_by_file.values() for doc_type in counts})

# A Counter yields 0 for types that a given file does not contain.
rows = [
    {"type": doc_type, **{name: _counts_by_file[name][doc_type] for name in ref_data}}
    for doc_type in all_types
]

df_cross = pd.DataFrame(rows).set_index("type")
df_cross["total"] = df_cross.sum(axis=1)
df_cross.sort_values("total", ascending=False)

In [None]:
# Bar chart: document counts by file and type (top 8 types by row total).
# The helper "total" column is dropped before plotting; transposing puts the
# filings on the x-axis with one bar per document type.
top_types = df_cross.nlargest(8, "total").drop(columns="total")

fig, ax = plt.subplots(figsize=(12, 5))
top_types.T.plot(kind="bar", ax=ax)
ax.set_title("Embedded Document Count by Type — Three Reference Files", fontsize=13)
ax.set_xlabel("Filing")
ax.set_ylabel("Document Count")
ax.tick_params(axis="x", rotation=0)
# Legend is placed outside the axes, to the right of the plot area
ax.legend(title="Document Type", bbox_to_anchor=(1.02, 1), loc="upper left")
plt.tight_layout()
plt.show()

In [None]:
# Size breakdown by document type: total content size per (file, type) in MB.
# NOTE: `type_sizes` is rebound here for each file, replacing the AAPL-only
# mapping built in the earlier type-distribution cell.
size_rows = []
for name in ref_data:
    type_sizes = defaultdict(int)
    for d in ref_data[name]["index"]:
        type_sizes[d["type"]] += d["size_bytes"]
    for doc_type, sz in type_sizes.items():
        size_rows.append({"file": name, "type": doc_type, "size_MB": round(sz / 1e6, 2)})

df_sizes = pd.DataFrame(size_rows)

# Pivot to type x file (missing combinations become 0) and plot the 8 largest
# types by combined size across the files
pivot = df_sizes.pivot_table(index="type", columns="file", values="size_MB", aggfunc="sum").fillna(0)
pivot["total"] = pivot.sum(axis=1)
top_size_types = pivot.nlargest(8, "total").drop(columns="total")

fig, ax = plt.subplots(figsize=(12, 5))
top_size_types.plot(kind="bar", ax=ax)
ax.set_title("Content Size by Document Type (MB) — Three Reference Files", fontsize=13)
ax.set_xlabel("Document Type")
ax.set_ylabel("Total Size (MB)")
ax.tick_params(axis="x", rotation=30)
ax.legend(title="Filing", bbox_to_anchor=(1.02, 1), loc="upper left")
plt.tight_layout()
plt.show()

## 6. Layer 4 — High-Value Sub-Documents

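Four kinds of sub-document contain most of the structured financial data:
`MetaLinks.json`, `FilingSummary.xml`, the `R*.htm` sheets, and the XBRL instance
document. All are extractable by filename using the index built in Layer 1.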

### 6.1 MetaLinks.json — XBRL Element Catalogue

In [None]:
# Pull MetaLinks.json out of the AAPL filing and print the summary counters of
# its first instance. `meta` and `inst` are reused by the following cells and
# are only defined when the document was found.
_INSTANCE_STATS = (
    "axisCustom", "axisStandard", "contextCount", "elementCount",
    "keyCustom", "keyStandard", "memberCustom", "memberStandard",
    "segmentCount", "unitCount",
)

meta_raw, meta_entry = get_doc(aapl_text, aapl_index, "MetaLinks.json")
if meta_raw:
    meta = json.loads(meta_raw)
    print(f"MetaLinks.json size: {meta_entry['size_bytes'] / 1024:.0f} KB")
    print(f"Top-level keys: {list(meta.keys())}")

    # Instance-level statistics (first instance in document order)
    instances = meta["instance"]
    instance_key = list(instances)[0]
    inst = instances[instance_key]
    print(f"\nInstance: {instance_key}")
    for stat_name in _INSTANCE_STATS:
        print(f"  {stat_name:<22} {inst.get(stat_name, 'N/A')}")

In [None]:
# Explore the tag dictionary of the MetaLinks instance loaded above
# (`inst`); an absent "tag" entry is treated as empty.
tags = inst.get("tag", {})
print(f"Total XBRL elements (tags): {len(tags)}")

# Count by namespace prefix: the part of each key before its first underscore,
# e.g. "us-gaap" for keys starting with "us-gaap_"
prefix_counts = Counter(k.split("_")[0] for k in tags.keys())
print("\nNamespace prefix distribution:")
for prefix, count in prefix_counts.most_common():
    print(f"  {prefix:<20} {count}")

In [None]:
# Show a sample element entry (the canonical MetaLinks structure): the first
# key that starts with "us-gaap_".
# NOTE: next() without a default raises StopIteration if no such key exists.
sample_key = next(k for k in tags if k.startswith("us-gaap_"))
print(f"Sample element: {sample_key}")
print(json.dumps(tags[sample_key], indent=2))

In [None]:
# xbrltype distribution across all tag entries; entries without an "xbrltype"
# key are counted under "unknown".
xbrl_types = Counter(v.get("xbrltype", "unknown") for v in tags.values())
print("xbrltype distribution:")
for t, c in xbrl_types.most_common():
    print(f"  {t:<30} {c}")

In [None]:
# Financial report index from MetaLinks (matches R*.htm filenames).
# Only the first 20 entries are tabulated; missing longName/menuCat become "".
reports = inst.get("report", {})
print(f"Financial reports mapped: {len(reports)}")
report_rows = [
    {"key": k, "longName": v.get("longName", ""), "menuCat": v.get("menuCat", "")}
    for k, v in list(reports.items())[:20]
]
pd.DataFrame(report_rows)

### 6.2 FilingSummary.xml — Report Index

In [None]:
# Extract FilingSummary.xml and tabulate its <Report> entries.
# `df_reports` (columns HtmlFileName, LongName, MenuCategory) is reused by the
# next cell and is only defined when the document was found.
fs_raw, fs_entry = get_doc(aapl_text, aapl_index, "FilingSummary.xml")
if fs_raw:
    print(f"FilingSummary.xml size: {fs_entry['size_bytes'] / 1024:.0f} KB")
    root = ET.fromstring(fs_raw)

    # Summary statistics from header elements (skipped when absent)
    for tag in ["Version", "ContextCount", "ElementCount", "SegmentCount"]:
        el = root.find(tag)
        if el is not None:
            print(f"  {tag}: {el.text}")

    # Extract report list; a missing <MyReports> element yields an empty table
    # instead of an AttributeError on None.
    reports_el = root.find("MyReports")
    report_nodes = reports_el.findall("Report") if reports_el is not None else []
    report_rows = [
        {
            "HtmlFileName": (rpt.findtext("HtmlFileName") or "").strip(),
            "LongName": (rpt.findtext("LongName") or "").strip(),
            "MenuCategory": (rpt.findtext("MenuCategory") or "").strip(),
        }
        for rpt in report_nodes
    ]
    # Explicit columns keep the frame's shape stable even with zero reports
    df_reports = pd.DataFrame(
        report_rows, columns=["HtmlFileName", "LongName", "MenuCategory"]
    )
    print(f"\nTotal reports: {len(df_reports)}")
    # A bare expression inside an `if` body is not rendered as cell output,
    # so the preview is printed explicitly.
    print(df_reports.head(15).to_string())

In [None]:
# MenuCategory distribution of the FilingSummary reports (`df_reports` comes
# from the previous cell), printed as text and drawn as a bar chart.
cat_counts = df_reports["MenuCategory"].value_counts()
print("MenuCategory distribution:")
print(cat_counts.to_string())

fig, ax = plt.subplots(figsize=(7, 4))
cat_counts.plot(kind="bar", ax=ax, color="steelblue")
ax.set_title("FilingSummary.xml — Reports by MenuCategory (AAPL 2021)", fontsize=12)
ax.set_xlabel("MenuCategory")
ax.set_ylabel("Count")
ax.tick_params(axis="x", rotation=30)
plt.tight_layout()
plt.show()

### 6.3 R*.htm — XBRL Financial Statement Sheets

In [None]:
# Extract and show R2.htm (Consolidated Statements of Operations).
# Prints the document size and its first 1500 characters; nothing is printed
# when the filing has no R2.htm.
r2_raw, r2_entry = get_doc(aapl_text, aapl_index, "R2.htm")
if r2_raw:
    print(f"R2.htm size: {r2_entry['size_bytes'] / 1024:.0f} KB")
    print("\n--- First 1500 chars of R2.htm ---")
    print(r2_raw[:1500])

In [None]:
# Size distribution of all R*.htm sheets.
# re.match anchors at the start only, so any filename beginning with
# "R<digits>.htm" (case-insensitive) is included.
r_sheets = [d for d in aapl_index if re.match(r"R\d+\.htm", d["filename"], re.IGNORECASE)]
r_sizes_kb = [d["size_bytes"] / 1024 for d in r_sheets]

fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: plain histogram of sheet sizes
axes[0].hist(r_sizes_kb, bins=30, color="steelblue", edgecolor="white")
axes[0].set_title(f"R*.htm Sheet Sizes (AAPL 2021, n={len(r_sheets)})", fontsize=12)
axes[0].set_xlabel("Size (KB)")
axes[0].set_ylabel("Count")

# Right panel: the same data as a normalised cumulative histogram, with the
# y-axis formatted as percentages
axes[1].hist(r_sizes_kb, bins=30, color="steelblue", edgecolor="white", cumulative=True, density=True)
axes[1].set_title("Cumulative Distribution of R*.htm Sizes", fontsize=12)
axes[1].set_xlabel("Size (KB)")
axes[1].set_ylabel("Cumulative Fraction")
axes[1].yaxis.set_major_formatter(mticker.PercentFormatter(xmax=1))

plt.tight_layout()
plt.show()

# NOTE: min()/max() and the mean below assume at least one R sheet was found.
print(f"R*.htm sheet count: {len(r_sheets)}")
print(f"Sizes: min={min(r_sizes_kb):.1f} KB  max={max(r_sizes_kb):.1f} KB  mean={sum(r_sizes_kb)/len(r_sizes_kb):.1f} KB")

### 6.4 XBRL Instance Document

In [None]:

# Find the XBRL instance document (*_htm.xml): the first index entry whose
# filename ends with "_htm.xml" (case-sensitive), then parse it.
# NOTE: if no entry matches, `xbrl_entry` is None and the print below raises
# TypeError.
xbrl_entry = next(
    (d for d in aapl_index if d["filename"].endswith("_htm.xml")),
    None
)
print(f"XBRL instance: {xbrl_entry['filename']}  ({xbrl_entry['size_bytes']/1e6:.2f} MB)")
xbrl_raw, _ = get_doc(aapl_text, aapl_index, xbrl_entry["filename"])
xbrl_root = ET.fromstring(xbrl_raw)

# Namespace URI used to build the "{uri}localname" tags ElementTree expects
XBRLI = "http://www.xbrl.org/2003/instance"

# -----------------------------------------------------------------------
# Context period types — three types per XBRL 2003 schema:
#   instant  : single point-in-time (balance sheet dates)
#   duration : start/end date range (income statement periods)
#   forever  : no time dimension (entity-level facts, e.g. CIK)
# A context with no <period> child is tallied as "unknown" and not kept;
# a period that is neither forever nor instant is treated as a duration.
# -----------------------------------------------------------------------
context_types = Counter()
contexts_by_type = {"instant": [], "duration": [], "forever": []}

_PERIOD_TAG = f"{{{XBRLI}}}period"
_FOREVER_TAG = f"{{{XBRLI}}}forever"
_INSTANT_TAG = f"{{{XBRLI}}}instant"

for ctx in xbrl_root.iter(f"{{{XBRLI}}}context"):
    period = ctx.find(_PERIOD_TAG)
    if period is None:
        kind = "unknown"
    elif period.find(_FOREVER_TAG) is not None:
        kind = "forever"
    elif period.find(_INSTANT_TAG) is not None:
        kind = "instant"
    else:
        kind = "duration"

    context_types[kind] += 1
    if kind in contexts_by_type:
        contexts_by_type[kind].append(ctx)

# -----------------------------------------------------------------------
# Unit types — two structures per XBRL 2003 schema:
#   measure : simple unit  e.g. <measure>iso4217:USD</measure>
#   divide  : ratio unit   e.g. USD/share for EPS facts
#             <divide><unitNumerator>...</unitNumerator>
#                    <unitDenominator>...</unitDenominator></divide>
# Every unit without a <divide> child is tallied as "measure", with an
# empty value when it has no direct <measure> child.
# -----------------------------------------------------------------------
unit_types = Counter()
unit_details = []


def _measure_text(node):
    """Stripped text of a <measure> element, or "" when the element is absent."""
    return node.text.strip() if node is not None else ""


for unit in xbrl_root.iter(f"{{{XBRLI}}}unit"):
    ratio = unit.find(f"{{{XBRLI}}}divide")
    if ratio is None:
        kind = "measure"
        value = _measure_text(unit.find(f"{{{XBRLI}}}measure"))
    else:
        kind = "divide"
        numerator = _measure_text(ratio.find(f".//{{{XBRLI}}}unitNumerator/{{{XBRLI}}}measure"))
        denominator = _measure_text(ratio.find(f".//{{{XBRLI}}}unitDenominator/{{{XBRLI}}}measure"))
        value = f"{numerator} / {denominator}"

    unit_types[kind] += 1
    unit_details.append({"id": unit.attrib.get("id", ""), "type": kind, "value": value})

# -----------------------------------------------------------------------
# Facts — every top-level child of the instance root that is not structural.
# Besides context/unit/schemaRef, the XBRL 2.1 instance schema also allows
# linkbaseRef, roleRef, arcroleRef and footnoteLink children of <xbrl>;
# none of them are facts, so all are excluded from the count.
# precision vs decimals: mutually exclusive per schema; check both
# decimals="INF" means exact value (integer counts, flags)
# decimals="-6"  means accurate to nearest 10^6 (precision indicator,
#                NOT a scale factor — raw value is always in base units)
# -----------------------------------------------------------------------
def _local(tag):
    """Return the local part of an ElementTree tag, dropping any "{namespace}" prefix."""
    return tag.split("}")[-1] if "}" in tag else tag

# Local names of the root's non-fact children
_NON_FACT_LOCAL_NAMES = frozenset({
    "context", "unit",
    "schemaRef", "linkbaseRef", "roleRef", "arcroleRef", "footnoteLink",
})

precision_count = 0
decimals_count = 0
decimals_inf_count = 0
fact_count = 0

for el in xbrl_root:
    if _local(el.tag) in _NON_FACT_LOCAL_NAMES:
        continue
    fact_count += 1
    if "precision" in el.attrib:
        precision_count += 1
    if "decimals" in el.attrib:
        decimals_count += 1
        if el.attrib["decimals"].upper() == "INF":
            decimals_inf_count += 1

print(f"\nContext period types : {dict(context_types)}")
print(f"Unit types           : {dict(unit_types)}")
print(f"Total facts          : {fact_count}")
print(f"  decimals attr      : {decimals_count}  ({decimals_inf_count} INF)")
print(f"  precision attr     : {precision_count}")


In [None]:

# Show one example of each context period type: the first context collected
# for that type, serialised back to XML text, or a notice when none exists.
for period_type in ("instant", "duration", "forever"):
    examples = contexts_by_type[period_type]
    if examples:
        ctx = examples[0]
        print(f"--- {period_type.upper()} context (1 of {len(examples)}) ---")
        print(ET.tostring(ctx, encoding="unicode"))
        print()
    else:
        print(f"--- {period_type.upper()} : none found in this filing ---\n")


In [None]:

# Show the first 10 integer-valued us-gaap facts with their unit and the
# accuracy attribute they carry.
#
# decimals="-6" means the raw value is accurate to the nearest 10^6.
# It is a PRECISION INDICATOR, not a scale factor.
# The XML value is always in base units (USD), never pre-scaled.
# Example: 54763000000 with decimals=-6 → $54,763,000,000 ± $1,000,000
#
# precision vs decimals: mutually exclusive per XBRL 2003 schema.
# Modern filings use decimals. Check both to be safe.

# Splits an ElementTree tag "{namespace}local" into (namespace, local)
prefix_re = re.compile(r"\{([^}]+)\}(.+)")


def _is_integer_text(text):
    """True when ``text`` is non-empty and, once stripped, is digits with optional leading '-'."""
    return bool(text) and text.strip().lstrip("-").isdigit()


us_gaap_facts = [
    el for el in xbrl_root
    if "fasb.org/us-gaap" in el.tag and _is_integer_text(el.text)
]
print(f"Monetary us-gaap facts: {len(us_gaap_facts)}")
print(f"{'Element':<50} {'Value (USD)':>22}  {'unitRef':<15} {'precision_attr'}")
print("-" * 105)
for fact in us_gaap_facts[:10]:
    tag_match = prefix_re.match(fact.tag)
    localname = tag_match.group(2) if tag_match else fact.tag
    val = int(fact.text.strip())
    unit_ref = fact.attrib.get("unitRef", "")
    # Prefer decimals, fall back to precision (older filings)
    if "decimals" in fact.attrib:
        attr_name = "decimals"
    elif "precision" in fact.attrib:
        attr_name = "precision"
    else:
        attr_name = "—"
    prec_attr = fact.attrib.get("decimals") or fact.attrib.get("precision") or ""
    print(f"  {localname:<48} {val:>22,}  {unit_ref:<15} {attr_name}={prec_attr}")


In [None]:
# Show the first 8 integer-valued us-gaap facts with their unit and decimals.
# Facts are the top-level children of the instance root, so `xbrl_root` is
# iterated directly (no separate fact list is built anywhere in the notebook).
# `prefix_re` comes from the previous cell.
us_gaap_facts = [
    f for f in xbrl_root
    if "fasb.org/us-gaap" in f.tag and f.text and f.text.strip().lstrip("-").isdigit()
]
print(f"Monetary us-gaap facts: {len(us_gaap_facts)}")
for f in us_gaap_facts[:8]:
    m = prefix_re.match(f.tag)
    localname = m.group(2) if m else f.tag
    val = int(f.text.strip())
    unit_ref = f.attrib.get("unitRef", "")
    decimals = f.attrib.get("decimals", "")
    # Values are in base units. decimals="-6" only states that the value is
    # accurate to the nearest 10^6; it is not a scale factor.
    print(f"  {localname:<50} {val:>20,}  [{unit_ref}, decimals={decimals}]")

## 7. Layer 5 — Universal Corpus Patterns

Compare SGML header metadata across all three reference files to confirm universal patterns.

In [None]:
# Cross-file SGML header comparison: one column per reference file after the
# final transpose, one row per header field or derived count.
header_rows = []
for name, data in ref_data.items():
    h = data["header"]
    idx = data["index"]
    header_rows.append({
        "file": name,
        "company_name": h.get("company_name", ""),
        "sic_code": h.get("sic_code", ""),
        "sic_name": h.get("sic_name", ""),
        "fiscal_year": h.get("fiscal_year", ""),
        "period_of_report": h.get("period_of_report", ""),
        "filed_as_of_date": h.get("filed_as_of_date", ""),
        "state_of_incorporation": h.get("state_of_incorporation", ""),
        "fiscal_year_end": h.get("fiscal_year_end", ""),
        "document_count": h.get("document_count", ""),
        # Case-insensitive, matching the R*.htm filter used for the AAPL sheets
        "r_sheet_count": sum(
            1 for d in idx if re.match(r"R\d+\.htm", d["filename"], re.IGNORECASE)
        ),
        # Size on disk. The in-memory size of the decoded string is not a
        # usable proxy: CPython stores 1, 2 or 4 bytes per character depending
        # on the widest code point present.
        "file_size_MB": round(REF_FILES[name].stat().st_size / 1e6, 1),
    })

pd.DataFrame(header_rows).set_index("file").T

In [None]:
# DEI tag coverage across all three reference files: for every file that has
# a main 10-K document, record whether each tracked tag was found and the
# first 50 characters of its value.
dei_coverage_rows = []
for name, data in ref_data.items():
    main_fn = next((d["filename"] for d in data["index"] if d["type"].upper() == "10-K"), None)
    if not main_fn:
        continue  # no main document in this filing, so it contributes no rows
    main_html_local, _ = get_doc(data["text"], data["index"], main_fn)
    dei_local = extract_dei_tags(main_html_local) if main_html_local else {}
    dei_coverage_rows.extend(
        {
            "file": name,
            "tag": tag.split(":")[-1],
            "found": tag in dei_local,
            "value": dei_local.get(tag, "")[:50],
        }
        for tag in _DEI_TAGS
    )

df_dei_cov = pd.DataFrame(dei_coverage_rows)
pivot_dei = df_dei_cov.pivot_table(index="tag", columns="file", values="found", aggfunc="first")
print("DEI tag coverage (True = found):")
pivot_dei

## 8. Extraction Coverage: What We Get vs. What's Available

Summary of what the current pipeline extracts vs. what the raw files contain.

In [None]:
# Each tuple: (data field, available in the raw file, currently extracted, source)
coverage = [
    ("company_name",          True,  True,  "SGML header"),
    ("cik",                   True,  True,  "SGML header"),
    ("sic_code",              True,  True,  "SGML header"),
    ("sic_name",              True,  True,  "SGML header"),
    ("ticker",                True,  True,  "DEI iXBRL"),
    ("fiscal_year",           True,  True,  "SGML header"),
    ("period_of_report",      True,  True,  "SGML header"),
    ("ein",                   True,  False, "SGML header"),
    ("state_of_incorporation",True,  False, "SGML header"),
    ("fiscal_year_end (MMDD)",True,  False, "SGML header"),
    ("accession_number",      True,  False, "SGML header"),
    ("sec_file_number",       True,  False, "SGML header"),
    ("exchange (Nasdaq/NYSE)",True,  False, "DEI iXBRL"),
    ("shares_outstanding",    True,  False, "DEI iXBRL"),
    ("public_float",          True,  False, "DEI iXBRL"),
    ("filer_category",        True,  False, "DEI iXBRL"),
    ("amendment_flag",        True,  False, "DEI iXBRL"),
    ("FASB element definitions",True, False,"MetaLinks.json"),
    ("all financial facts",   True,  False, "XBRL instance XML"),
    ("calculation tree",      True,  False, "EX-101.CAL / MetaLinks"),
    ("named financial stmts", True,  False, "FilingSummary.xml"),
    ("company logo / charts", True,  False, "GRAPHIC documents"),
]

df_cov = pd.DataFrame(coverage, columns=["Data Field", "Available", "Extracted", "Source"])
# Human-readable label derived from the boolean "Extracted" column
_STATUS_LABELS = {True: "✓ Extracted", False: "✗ Not yet"}
df_cov["Status"] = df_cov["Extracted"].map(_STATUS_LABELS)

# Style the output
def highlight_status(row):
    """Return one CSS background rule per cell of ``row``.

    The colour is green when ``row["Extracted"]`` is truthy and red otherwise;
    the returned list has ``len(row)`` identical entries.
    """
    colour = "#d4edda" if row["Extracted"] else "#f8d7da"
    return [f"background-color: {colour}"] * len(row)

# Render only the display columns, colouring each row by its "Extracted" flag.
# highlight_status is evaluated on the full df_cov row (it needs "Extracted",
# which is not among the displayed columns), so its result is trimmed to the
# number of displayed cells: Styler.apply requires exactly one CSS string per
# cell of the row it was given.
df_cov[["Data Field", "Status", "Source"]].style.apply(
    lambda row: highlight_status(df_cov.loc[row.name])[: len(row)],
    axis=1
)

## 9. Distribution Analysis & Pattern Discovery

Load the JSON output from the most recent script run and explore distributions
across the full sample. Run the script with `--sample 50` or `--all` first to
get statistically meaningful counts.

```bash
python scripts/eda/sec_html_structure_explorer.py --sample 50 --no-file-detail
```


In [None]:

# Load JSON results from last script run, build analysis DataFrame
import numpy as np
from datetime import date as _date

# Results written by the explorer script; decoded explicitly as UTF-8 so the
# load does not depend on the platform's default text encoding.
results_path = REPO_ROOT / "reports" / "sec_html_structure" / "results.json"
with open(results_path, encoding="utf-8") as f:
    corpus_results = json.load(f)
print(f"Loaded {len(corpus_results)} filing results from {results_path.name}")

def _parse_yyyymmdd(s):
    """Parse the first eight characters of *s* as a ``YYYYMMDD`` date.

    Returns a ``datetime.date``, or ``None`` when *s* is missing, too short,
    non-numeric, or not a valid calendar date. Anything after the eighth
    character is ignored.
    """
    try:
        return _date(int(s[:4]), int(s[4:6]), int(s[6:8]))
    except (TypeError, ValueError):
        # TypeError: s is None or not sliceable; ValueError: bad digits or bad date.
        return None

# Flatten each filing's nested result dict into one flat row for the analysis frame.
rows = []
for r in corpus_results:
    h = r["sgml_header"]
    dei = r["dei_tags"]
    # XBRL stats are optional; `or {}` also tolerates an explicit JSON null.
    xbrl = r.get("xbrl") or {}
    unit_types = xbrl.get("unit_types") or {}

    period  = _parse_yyyymmdd(h.get("period_of_report", ""))
    filed   = _parse_yyyymmdd(h.get("filed_as_of_date", ""))
    lag     = (filed - period).days if period and filed else None

    # Public float: strip "$" and thousands separators, then divide by 1e9.
    # NOTE(review): the division is unconditional and no "B" suffix is handled,
    # so a value stored already scaled (e.g. "41.91") comes out ~1e9 too small —
    # confirm how the explorer script normalises such filings.
    raw_float = dei.get("dei:EntityPublicFloat", "").replace(",", "").replace("$", "")
    try:
        public_float_b = float(raw_float) / 1e9
    except ValueError:
        # Empty or non-numeric text: leave the float unknown.
        public_float_b = None

    rows.append({
        "stem":              r["stem"],
        # Ticker is the stem text before the first underscore.
        "ticker":            r["stem"].split("_")[0],
        "fiscal_year":       int(h["fiscal_year"]) if h.get("fiscal_year", "").isdigit() else None,
        "sic_code":          h.get("sic_code", ""),
        "sic_name":          h.get("sic_name", ""),
        "state_of_inc":      h.get("state_of_incorporation", ""),
        "period_of_report":  h.get("period_of_report", ""),
        "filed_as_of_date":  h.get("filed_as_of_date", ""),
        "filing_lag_days":   lag,
        "file_size_mb":      r["file_size_bytes"] / 1e6,
        "main_10k_mb":       r["main_10k_size_bytes"] / 1e6,
        "total_documents":   r["total_documents"],
        "r_sheet_count":     r["r_sheet_count"],
        "filer_category":    dei.get("dei:EntityFilerCategory", "").strip(),
        "exchange":          dei.get("dei:SecurityExchangeName", "").strip(),
        "wksi":              dei.get("dei:EntityWellKnownSeasonedIssuer", "").strip(),
        "inc_state":         dei.get("dei:EntityIncorporationStateCountryCode", "").strip(),
        "hq_state":          dei.get("dei:EntityAddressStateOrProvince", "").strip(),
        "public_float_b":    public_float_b,
        "dei_tag_count":     r["dei_tag_count"],
        # XBRL stats
        "xbrl_facts":        xbrl.get("fact_count"),
        "xbrl_contexts":     xbrl.get("context_count"),
        "xbrl_divide_units": unit_types.get("divide", 0),
    })

df = pd.DataFrame(rows)
print(f"DataFrame: {df.shape[0]} rows × {df.shape[1]} columns")
df.head(3)


### 9.1 Filing Lag Distribution

Days between fiscal period end and SEC filing date. Large accelerated filers must
file within **60 days**, accelerated within **75 days**, others within **90 days**.


In [None]:

# Filing lag = days from fiscal period end to the SEC filing date.
lag = df["filing_lag_days"].dropna()
print(f"Filing lag stats (n={len(lag)}):")
print(lag.describe().round(1).to_string())

fig, axes = plt.subplots(1, 2, figsize=(13, 4))

# Histogram, with the three filing deadlines drawn as reference lines
axes[0].hist(lag, bins=20, color="steelblue", edgecolor="white")
for x, label, color in [(60, "60d LAF", "red"), (75, "75d AF", "orange"), (90, "90d other", "green")]:
    axes[0].axvline(x, color=color, linestyle="--", linewidth=1.2, label=label)
axes[0].set_title("Filing Lag Distribution", fontsize=12)
axes[0].set_xlabel("Days (period end → filed date)")
axes[0].set_ylabel("Count")
axes[0].legend(fontsize=9)

# Boxplot per filer category, ordered by median lag
cats = df[df["filing_lag_days"].notna()].copy()
# A blank category would otherwise produce an unlabeled box.
cats["filer_category"] = cats["filer_category"].replace("", "Unknown")
cat_order = cats.groupby("filer_category")["filing_lag_days"].median().sort_values().index.tolist()
cat_data  = [cats[cats["filer_category"] == c]["filing_lag_days"].values for c in cat_order]
if cat_data:
    axes[1].boxplot(cat_data)
    # Set tick labels on the axis instead of via boxplot's `labels=` keyword,
    # which Matplotlib 3.9 renamed to `tick_labels=`; this works on both sides
    # of the rename.
    axes[1].set_xticklabels([c[:25] for c in cat_order])
else:
    # Nothing to plot (no filing has both dates); avoid a label/data mismatch error.
    axes[1].text(0.5, 0.5, "No filing-lag data", ha="center", va="center",
                 transform=axes[1].transAxes)
axes[1].set_title("Filing Lag by Filer Category", fontsize=12)
axes[1].set_ylabel("Days")
axes[1].tick_params(axis="x", rotation=20)

plt.tight_layout()
plt.show()

# Top fastest / slowest
print("\nFastest filers:")
print(df[["stem", "filer_category", "filing_lag_days"]].nsmallest(5, "filing_lag_days").to_string(index=False))
print("\nSlowest filers:")
print(df[["stem", "filer_category", "filing_lag_days"]].nlargest(5, "filing_lag_days").to_string(index=False))


### 9.2 Filer Category, Exchange & WKSI Distribution


In [None]:

# One bar chart per categorical DEI field, each bar annotated with its count.
chart_specs = [
    ("filer_category", "Filer Category"),
    ("exchange",       "Exchange Listed On"),
    ("wksi",           "Well-Known Seasoned Issuer"),
]
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for ax, (col, title) in zip(axes, chart_specs):
    # Blank values are shown as an explicit "Unknown" bar.
    counts = df[col].replace("", "Unknown").value_counts()
    counts.plot(kind="bar", ax=ax, color="steelblue", edgecolor="white")
    ax.set_title(title, fontsize=11)
    ax.set_xlabel("")
    ax.set_ylabel("Count")
    ax.tick_params(axis="x", rotation=30)
    for bar in ax.patches:
        height = bar.get_height()
        x_center = bar.get_x() + bar.get_width() / 2
        # Count label sits just above the top of the bar.
        ax.text(x_center, height + 0.1, str(int(height)),
                ha="center", va="bottom", fontsize=9)

plt.tight_layout()
plt.show()

# Summary tables
table_specs = [("filer_category", "Filer Category"), ("exchange", "Exchange"), ("wksi", "WKSI")]
for col, label in table_specs:
    table = df[col].replace("", "—").value_counts()
    print(f"\n{label}:")
    print(table.to_string())


### 9.3 State of Incorporation & HQ State


In [None]:

# Top-12 bar charts for the two DEI geography fields, then top-10 text tables.
geo_charts = [
    ("inc_state", "State of Incorporation (DEI)"),
    ("hq_state",  "HQ State (DEI address)"),
]
fig, axes = plt.subplots(1, 2, figsize=(14, 4))

for ax, (col, title) in zip(axes, geo_charts):
    counts = df[col].replace("", "Unknown").value_counts().head(12)
    counts.plot(kind="bar", ax=ax, color="steelblue", edgecolor="white")
    ax.set_title(title, fontsize=11)
    ax.set_xlabel("")
    ax.set_ylabel("Count")
    ax.tick_params(axis="x", rotation=35)

plt.suptitle("Geographic Distribution", fontsize=13, y=1.02)
plt.tight_layout()
plt.show()

geo_tables = [
    ("State of incorporation (top 10):", "inc_state"),
    ("\nHQ state (top 10):",             "hq_state"),
]
for heading, col in geo_tables:
    top_ten = df[col].replace("", "—").value_counts().head(10)
    print(heading)
    print(top_ten.to_string())


### 9.4 File Size vs. Complexity Correlations

How strongly do file size, R*.htm sheet count, total documents, and XBRL fact count
co-vary? A Pearson r closer to ±1 indicates a stronger linear relationship; values near 0 indicate little or no linear relationship.


In [None]:

# Pairwise Pearson correlations between the size / complexity metrics.
metrics = ["file_size_mb", "main_10k_mb", "r_sheet_count", "total_documents", "xbrl_facts", "xbrl_contexts"]
corr = df[metrics].corr(method="pearson")

fig, axes = plt.subplots(1, 2, figsize=(15, 5))
heat_ax, scatter_ax = axes

# Correlation heatmap
tick_positions = range(len(metrics))
im = heat_ax.imshow(corr, vmin=-1, vmax=1, cmap="RdBu_r", aspect="auto")
heat_ax.set_xticks(tick_positions)
heat_ax.set_yticks(tick_positions)
heat_ax.set_xticklabels(metrics, rotation=35, ha="right", fontsize=9)
heat_ax.set_yticklabels(metrics, fontsize=9)
for i, corr_row in enumerate(corr.to_numpy()):
    for j, value in enumerate(corr_row):
        # White text on the saturated cells, black on the pale ones.
        text_color = "white" if abs(value) > 0.6 else "black"
        heat_ax.text(j, i, f"{value:.2f}", ha="center", va="center",
                     fontsize=8, color=text_color)
plt.colorbar(im, ax=heat_ax, fraction=0.046)
heat_ax.set_title("Pearson Correlation Heatmap", fontsize=12)

# Scatter: file_size_mb vs r_sheet_count (strongest expected pair)
valid = df[["file_size_mb", "r_sheet_count", "sic_name"]].dropna()
sizes = valid["file_size_mb"]
sheets = valid["r_sheet_count"]
scatter_ax.scatter(sizes, sheets,
                   alpha=0.7, s=60, color="steelblue", edgecolors="white", linewidths=0.5)
# Tag every point with the first 15 characters of its SIC industry name.
for size_mb, sheet_count, sic_name in zip(sizes, sheets, valid["sic_name"]):
    scatter_ax.annotate(sic_name[:15], (size_mb, sheet_count),
                        fontsize=7, alpha=0.6, xytext=(3, 3), textcoords="offset points")
r_val = sizes.corr(sheets)
scatter_ax.set_title(f"File Size vs R*.htm Count  (r={r_val:.2f})", fontsize=12)
scatter_ax.set_xlabel("Total File Size (MB)")
scatter_ax.set_ylabel("R*.htm Sheet Count")

plt.tight_layout()
plt.show()

print("Correlation matrix:")
print(corr.round(2).to_string())


### 9.5 Industry Complexity: SIC Code vs Filing Size


In [None]:

# Per-industry averages, restricted to SIC names that appear at least twice.
per_industry = df.groupby("sic_name").agg(
    n=("stem", "count"),
    avg_size_mb=("file_size_mb", "mean"),
    avg_r_sheets=("r_sheet_count", "mean"),
    avg_facts=("xbrl_facts", "mean"),
)
sic_stats = per_industry[per_industry["n"] >= 2].sort_values("avg_size_mb", ascending=False)

print("Industries with ≥2 filings — sorted by avg file size:")
print(sic_stats.round(1).to_string())

if len(sic_stats) < 2:
    print("\nNeed more filings per SIC for chart — run --sample 50 or --all.")
else:
    fig, axes = plt.subplots(1, 2, figsize=(14, 4))
    # Industry names are truncated to 30 characters for the tick labels.
    labels = [s[:30] for s in sic_stats.index]
    x = range(len(sic_stats))

    bar_specs = [
        ("avg_size_mb",  "steelblue", "Avg File Size (MB) by Industry",     "MB"),
        ("avg_r_sheets", "teal",      "Avg R*.htm Sheet Count by Industry", "Sheets"),
    ]
    for ax, (col, color, title, ylabel) in zip(axes, bar_specs):
        ax.bar(x, sic_stats[col], color=color, edgecolor="white")
        ax.set_xticks(x)
        ax.set_xticklabels(labels, rotation=35, ha="right", fontsize=8)
        ax.set_title(title, fontsize=11)
        ax.set_ylabel(ylabel)

    plt.tight_layout()
    plt.show()


### 9.6 Fiscal Year Trend: Does Filing Complexity Grow Over Time?


In [None]:

# Average complexity metrics per fiscal year.
year_stats = (
    df.dropna(subset=["fiscal_year"])
    # A single missing fiscal_year turns the column into floats (2021.0);
    # cast back to int so the table index and x-axis ticks are whole years.
    .astype({"fiscal_year": int})
    .groupby("fiscal_year")
    .agg(
        n=("stem", "count"),
        avg_size_mb=("file_size_mb", "mean"),
        avg_r_sheets=("r_sheet_count", "mean"),
        avg_docs=("total_documents", "mean"),
        avg_facts=("xbrl_facts", "mean"),
    )
    .sort_index()
)
print("Complexity by fiscal year:")
print(year_stats.round(1).to_string())

# A trend line needs at least three distinct years to be worth drawing.
if len(year_stats) >= 3:
    fig, axes = plt.subplots(1, 3, figsize=(15, 4))
    yrs = year_stats.index

    for ax, col, label, color in [
        (axes[0], "avg_r_sheets",  "Avg R*.htm Sheets",  "steelblue"),
        (axes[1], "avg_size_mb",   "Avg File Size (MB)", "teal"),
        (axes[2], "avg_facts",     "Avg XBRL Facts",     "coral"),
    ]:
        ax.plot(yrs, year_stats[col], marker="o", color=color, linewidth=2)
        ax.set_title(label, fontsize=11)
        ax.set_xlabel("Fiscal Year")
        # One tick per fiscal year present in the sample.
        ax.set_xticks(yrs)
        ax.tick_params(axis="x", rotation=30)

    plt.suptitle("Filing Complexity Trend Over Time", fontsize=13, y=1.02)
    plt.tight_layout()
    plt.show()
else:
    print("\nNeed ≥3 fiscal years for trend chart — run with a larger or more diverse sample.")


In [None]:
# Summary counts: share of catalogued fields already extracted, then the
# outstanding fields grouped by the source they would come from.
total = len(df_cov)
extracted = df_cov["Extracted"].sum()
pct_extracted = 100 * extracted / total
print(f"Currently extracted: {extracted} / {total} available fields ({pct_extracted:.0f}%)")
print("\nFields not yet extracted by source:")
not_extracted = df_cov.loc[~df_cov["Extracted"]]
for source, group in not_extracted.groupby("Source"):
    pending_fields = list(group["Data Field"])
    print(f"  {source:<25} {pending_fields}")