In [2]:
# Install required packages
%pip install requests


Collecting requests
  Using cached requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting charset_normalizer<4,>=2 (from requests)
  Using cached charset_normalizer-3.4.4-cp311-cp311-win_amd64.whl.metadata (38 kB)
Collecting idna<4,>=2.5 (from requests)
  Using cached idna-3.11-py3-none-any.whl.metadata (8.4 kB)
Collecting urllib3<3,>=1.21.1 (from requests)
  Downloading urllib3-2.6.3-py3-none-any.whl.metadata (6.9 kB)
Collecting certifi>=2017.4.17 (from requests)
  Downloading certifi-2026.1.4-py3-none-any.whl.metadata (2.5 kB)
Using cached requests-2.32.5-py3-none-any.whl (64 kB)
Downloading certifi-2026.1.4-py3-none-any.whl (152 kB)
   ---------------------------------------- 0.0/152.9 kB ? eta -:--:--
   ---------------------------------------- 0.0/152.9 kB ? eta -:--:--
   -------- ------------------------------- 30.7/152.9 kB 1.4 MB/s eta 0:00:01
   ---------------------------------------- 152.9/152.9 kB 2.3 MB/s eta 0:00:00
Using cached charset_normalizer-3.4.4-cp311-cp31


[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import os
from pathlib import Path

# SEC compliance: requests to EDGAR must carry a User-Agent with contact info.
# setdefault() keeps a SEC_USER_AGENT that is already configured in the environment
# and only falls back to the placeholder below -- replace the e-mail with a real
# contact address before running this against sec.gov.
os.environ.setdefault("SEC_USER_AGENT", "RevenueSegBot/0.1 (your.email@domain.com)")

# NOTE(review): imported after the variable is set, presumably because revseg may
# read SEC_USER_AGENT at import time -- confirm before moving this to the top cell.
from revseg.sec_edgar import download_many_latest_10k

# Companies whose most recent 10-K should be fetched.
tickers = ["AAPL", "MSFT", "GOOGL", "AMZN", "NVDA", "META", "AVGO"]

# Amendments are excluded; min_interval_s is passed through to the downloader
# (presumably the minimum spacing between requests -- verify in revseg.sec_edgar).
results = download_many_latest_10k(
    tickers=tickers,
    out_dir=Path("data/10k"),
    cache_dir=Path(".cache/sec"),
    include_amendments=False,
    min_interval_s=0.2,
)

# One status line per ticker: results maps ticker -> (ok, msg).
for symbol, (ok, msg) in results.items():
    print(f"{symbol}: {'OK' if ok else 'FAIL'} - {msg}")


[1/7] Processing AAPL...
  AAPL: Fetching ticker->CIK map...
  AAPL: Fetching submissions for CIK 320193...
  AAPL: Found 10-K filed on 2025-10-31
  AAPL: Downloading filing index...
  AAPL: Downloading primary document (aapl-20250927.htm)...
[1/7] ✓ AAPL completed: C:\Users\yehud\Business Classification\data\10k\AAPL\2025-10-31_000032019325000079
[2/7] Processing MSFT...
  MSFT: Fetching ticker->CIK map...
  MSFT: Fetching submissions for CIK 789019...
  MSFT: Found 10-K filed on 2025-07-30
  MSFT: Downloading filing index...
  MSFT: Downloading primary document (msft-20250630.htm)...
[2/7] ✓ MSFT completed: C:\Users\yehud\Business Classification\data\10k\MSFT\2025-07-30_000095017025100235
[3/7] Processing GOOGL...
  GOOGL: Fetching ticker->CIK map...
  GOOGL: Fetching submissions for CIK 1652044...
  GOOGL: Found 10-K filed on 2025-02-05
  GOOGL: Downloading filing index...
  GOOGL: Downloading primary document (goog-20241231.htm)...
[3/7] ✓ GOOGL completed: C:\Users\yehud\Business C

In [4]:
# Install required packages for table extraction
%pip install beautifulsoup4 lxml


Collecting beautifulsoup4
  Downloading beautifulsoup4-4.14.3-py3-none-any.whl.metadata (3.8 kB)
Collecting lxml
  Using cached lxml-6.0.2-cp311-cp311-win_amd64.whl.metadata (3.7 kB)
Collecting soupsieve>=1.6.1 (from beautifulsoup4)
  Downloading soupsieve-2.8.1-py3-none-any.whl.metadata (4.6 kB)
Downloading beautifulsoup4-4.14.3-py3-none-any.whl (107 kB)
   ---------------------------------------- 0.0/107.7 kB ? eta -:--:--
   ---------------------------------------- 0.0/107.7 kB ? eta -:--:--
   -------------- ------------------------ 41.0/107.7 kB 960.0 kB/s eta 0:00:01
   ---------------------------------------- 107.7/107.7 kB 1.5 MB/s eta 0:00:00
Using cached lxml-6.0.2-cp311-cp311-win_amd64.whl (4.0 MB)
Downloading soupsieve-2.8.1-py3-none-any.whl (36 kB)
Installing collected packages: soupsieve, lxml, beautifulsoup4
Successfully installed beautifulsoup4-4.14.3 lxml-6.0.2 soupsieve-2.8.1
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
from pathlib import Path
import os

# If your SEC_USER_AGENT is already set in your environment, you can omit this.
# os.environ["SEC_USER_AGENT"] = "RevenueSegBot/0.1 (your.email@domain.com)"

from revseg.table_candidates import (
    find_latest_downloaded_filing_dir,
    find_primary_document_html,
    extract_table_candidates_from_html,
    write_candidates_json,
)

# Which company to analyse, and the directory the 10-K downloads were written to.
ticker = "MSFT"
base_dir = Path("data/10k")

# Locate the filing directory for the ticker, then the primary HTML document inside it.
filing_dir = find_latest_downloaded_filing_dir(base_dir, ticker)
html_path = find_primary_document_html(filing_dir)

# Extract table candidates from the document, requesting a 15 x 8 preview for each.
candidates = extract_table_candidates_from_html(html_path, preview_rows=15, preview_cols=8)

# Write the candidates to a per-ticker JSON file; the helper's return value is kept
# in out_json and shown in the cell result below.
candidates_json_target = Path(f"data/table_candidates/{ticker}_table_candidates.json")
out_json = write_candidates_json(candidates, candidates_json_target)

# Cell result: (number of candidates, written JSON path, source HTML path).
(len(candidates), str(out_json), str(html_path))



Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor.




  soup = BeautifulSoup(html_path.read_text(encoding="utf-8", errors="ignore"), "lxml")


(85,
 'C:\\Users\\yehud\\Business Classification\\data\\table_candidates\\MSFT_table_candidates.json',
 'data\\10k\\MSFT\\2025-07-30_000095017025100235\\primary_document.html')

In [6]:
# Simple heuristic: candidates with more keyword hits rank first, then those with
# more detected years, then the larger tables (rows * cols) as the final tie-breaker.
def _candidate_rank_key(candidate):
    """Return the (keyword hits, detected years, cell count) sort key for a candidate."""
    cell_count = candidate.n_rows * candidate.n_cols
    return (len(candidate.keyword_hits), len(candidate.detected_years), cell_count)


ranked = sorted(candidates, key=_candidate_rank_key, reverse=True)

# Show the five best-ranked candidates with enough context to judge them by eye.
top_candidates = ranked[:5]
for candidate in top_candidates:
    heading_snippet = candidate.heading_context[:200]
    # Flatten newlines so the nearby text stays on one output line.
    nearby_snippet = candidate.nearby_text_context[:300].replace("\n", " ")

    print("----", candidate.table_id, "----")
    print("rows x cols:", candidate.n_rows, "x", candidate.n_cols)
    print("years:", candidate.detected_years)
    print("hits:", candidate.keyword_hits)
    print("heading:", heading_snippet)
    print("nearby:", nearby_snippet)
    print("preview:")
    # Only the first eight preview rows are printed.
    for preview_row in candidate.preview[:8]:
        print("  ", preview_row)
    print()


---- t0012 ----
rows x cols: 6 x 3
years: []
hits: ['revenue', 'sales', 'segment', 'segments', 'cloud', 'advertising', 'subscription', 'subscriptions', 'services']
heading: 
nearby: n or 19% driven by growth in Microsoft Cloud. Gross margin increased $22.9 billion or 13% with growth across each of our segments. • Gross margin percentage decreased slightly driven by Intelligent Cloud, offset in part by More Personal Computing. • Microsoft Cloud gross margin percentage decreased 
preview:
   ['', '', '']
   ['Windows OEM and Devices revenue growth', '', 'Revenue from sales of Windows Pro and non-Pro licenses sold through the OEM channel and sales of first-party Devices, including Surface and PC accessories']
   ['', '', '']
   ['Xbox content and services revenue growth', '', 'Revenue from Xbox content and services, comprising first- and third-party content (including games and in-game content), Xbox Game Pass and other subscriptions, Xbox Cloud Gaming, advertising, and other cloud servic