In [2]:
# Install required packages
%pip install requests


Collecting requests
  Using cached requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting charset_normalizer<4,>=2 (from requests)
  Using cached charset_normalizer-3.4.4-cp311-cp311-win_amd64.whl.metadata (38 kB)
Collecting idna<4,>=2.5 (from requests)
  Using cached idna-3.11-py3-none-any.whl.metadata (8.4 kB)
Collecting urllib3<3,>=1.21.1 (from requests)
  Downloading urllib3-2.6.3-py3-none-any.whl.metadata (6.9 kB)
Collecting certifi>=2017.4.17 (from requests)
  Downloading certifi-2026.1.4-py3-none-any.whl.metadata (2.5 kB)
Using cached requests-2.32.5-py3-none-any.whl (64 kB)
Downloading certifi-2026.1.4-py3-none-any.whl (152 kB)
   ---------------------------------------- 0.0/152.9 kB ? eta -:--:--
   ---------------------------------------- 0.0/152.9 kB ? eta -:--:--
   -------- ------------------------------- 30.7/152.9 kB 1.4 MB/s eta 0:00:01
   ---------------------------------------- 152.9/152.9 kB 2.3 MB/s eta 0:00:00
Using cached charset_normalizer-3.4.4-cp311-cp31


[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import os
from pathlib import Path

# SEC compliance: include contact info
os.environ["SEC_USER_AGENT"] = "RevenueSegBot/0.1 (your.email@domain.com)"

from revseg.sec_edgar import download_many_latest_10k

tickers = ["AAPL", "MSFT", "GOOGL", "AMZN", "NVDA", "META", "AVGO"]

results = download_many_latest_10k(
    tickers=tickers,
    out_dir=Path("data/10k"),
    cache_dir=Path(".cache/sec"),
    include_amendments=False,
    min_interval_s=0.2,
)

for t, (ok, msg) in results.items():
    print(f"{t}: {'OK' if ok else 'FAIL'} - {msg}")


[1/7] Processing AAPL...
  AAPL: Fetching ticker->CIK map...
  AAPL: Fetching submissions for CIK 320193...
  AAPL: Found 10-K filed on 2025-10-31
  AAPL: Downloading filing index...
  AAPL: Downloading primary document (aapl-20250927.htm)...
[1/7] ✓ AAPL completed: C:\Users\yehud\Business Classification\data\10k\AAPL\2025-10-31_000032019325000079
[2/7] Processing MSFT...
  MSFT: Fetching ticker->CIK map...
  MSFT: Fetching submissions for CIK 789019...
  MSFT: Found 10-K filed on 2025-07-30
  MSFT: Downloading filing index...
  MSFT: Downloading primary document (msft-20250630.htm)...
[2/7] ✓ MSFT completed: C:\Users\yehud\Business Classification\data\10k\MSFT\2025-07-30_000095017025100235
[3/7] Processing GOOGL...
  GOOGL: Fetching ticker->CIK map...
  GOOGL: Fetching submissions for CIK 1652044...
  GOOGL: Found 10-K filed on 2025-02-05
  GOOGL: Downloading filing index...
  GOOGL: Downloading primary document (goog-20241231.htm)...
[3/7] ✓ GOOGL completed: C:\Users\yehud\Business C

In [4]:
# Install required packages for table extraction
%pip install beautifulsoup4 lxml


Collecting beautifulsoup4
  Downloading beautifulsoup4-4.14.3-py3-none-any.whl.metadata (3.8 kB)
Collecting lxml
  Using cached lxml-6.0.2-cp311-cp311-win_amd64.whl.metadata (3.7 kB)
Collecting soupsieve>=1.6.1 (from beautifulsoup4)
  Downloading soupsieve-2.8.1-py3-none-any.whl.metadata (4.6 kB)
Downloading beautifulsoup4-4.14.3-py3-none-any.whl (107 kB)
   ---------------------------------------- 0.0/107.7 kB ? eta -:--:--
   ---------------------------------------- 0.0/107.7 kB ? eta -:--:--
   -------------- ------------------------ 41.0/107.7 kB 960.0 kB/s eta 0:00:01
   ---------------------------------------- 107.7/107.7 kB 1.5 MB/s eta 0:00:00
Using cached lxml-6.0.2-cp311-cp311-win_amd64.whl (4.0 MB)
Downloading soupsieve-2.8.1-py3-none-any.whl (36 kB)
Installing collected packages: soupsieve, lxml, beautifulsoup4
Successfully installed beautifulsoup4-4.14.3 lxml-6.0.2 soupsieve-2.8.1
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [27]:
from pathlib import Path
import os
import importlib

# If your SEC_USER_AGENT is already set in your environment, you can omit this.
# os.environ["SEC_USER_AGENT"] = "RevenueSegBot/0.1 (your.email@domain.com)"

# Reload the module to pick up any changes
import revseg.table_candidates
importlib.reload(revseg.table_candidates)

from revseg.table_candidates import (
    find_latest_downloaded_filing_dir,
    find_primary_document_html,
    extract_table_candidates_from_html,
    write_candidates_json,
)

ticker = "MSFT"
base_dir = Path("data/10k")

filing_dir = find_latest_downloaded_filing_dir(base_dir, ticker)
html_path = find_primary_document_html(filing_dir)

candidates = extract_table_candidates_from_html(
    html_path,
    preview_rows=15,
    preview_cols=8,
)

out_json = write_candidates_json(
    candidates,
    Path(f"data/table_candidates/{ticker}_table_candidates.json"),
)

(len(candidates), str(out_json), str(html_path))


(85,
 'C:\\Users\\yehud\\Business Classification\\data\\table_candidates\\MSFT_table_candidates.json',
 'data\\10k\\MSFT\\2025-07-30_000095017025100235\\primary_document.html')

In [28]:
# Deterministic "financial table" ranking - prioritizes tables with year headers, units, and money cells
ranked = sorted(
    candidates,
    key=lambda c: (
        c.has_year_header,
        c.has_units_marker,
        c.money_cell_ratio,
        c.numeric_cell_ratio,
        len(c.keyword_hits),
        c.n_rows * c.n_cols,
    ),
    reverse=True,
)

print("Top 10 financial table candidates (ranked by financial table signals):\n")
for c in ranked[:10]:
    print(
        c.table_id,
        "year_header=", c.has_year_header,
        "units=", c.has_units_marker,
        "money=", round(c.money_cell_ratio, 3),
        "numeric=", round(c.numeric_cell_ratio, 3),
        "years=", c.detected_years,
        "labels=", c.row_label_preview[:3],
    )


Top 10 financial table candidates (ranked by financial table signals):

t0073 year_header= True units= True money= 0.194 numeric= 0.194 years= [2023, 2024, 2025] labels= ['(In millions)', 'Year Ended June 30,', 'Server products and cloud services']
t0042 year_header= True units= True money= 0.17 numeric= 0.149 years= [2023, 2024, 2025] labels= ['(In millions)', 'June 30,', 'Land']
t0029 year_header= True units= True money= 0.154 numeric= 0.096 years= [2023, 2024, 2025] labels= ['(In millions)', 'Year Ended June 30,', 'Interest and dividends income']
t0055 year_header= True units= True money= 0.151 numeric= 0.151 years= [2024, 2025] labels= ['(In millions)', 'June 30,', 'Deferred Income Tax Assets']
t0018 year_header= True units= True money= 0.148 numeric= 0.148 years= [2024, 2025] labels= ['(In millions)', 'Year Ended June 30,', 'Interest and dividends income']
t0050 year_header= True units= True money= 0.142 numeric= 0.142 years= [2009, 2010, 2021, 2023, 2024, 2026, 2030, 2039, 2040, 

In [29]:
# Find and print diagnostics for specific table_ids (the actual financial tables)
def show(table_id: str):
    c = next(x for x in candidates if x.table_id == table_id)
    print("table_id:", c.table_id)
    print("rows x cols:", c.n_rows, "x", c.n_cols)
    print("has_year_header:", c.has_year_header)
    print("detected_years:", c.detected_years)
    print("has_units_marker:", c.has_units_marker)
    print("units_hint:", c.units_hint)
    print("numeric_cell_ratio:", c.numeric_cell_ratio)
    print("money_cell_ratio:", c.money_cell_ratio)
    print("row_label_preview (first 10):", c.row_label_preview[:10])
    print("heading:", c.heading_context[:200])
    print("caption:", c.caption_text[:200])
    print()

show("t0071")
show("t0073")


table_id: t0071
rows x cols: 45 x 13
has_year_header: True
detected_years: [2024, 2025]
has_units_marker: True
units_hint: (In millions)
numeric_cell_ratio: 0.09259259259259259
money_cell_ratio: 0.09259259259259259
row_label_preview (first 10): ['(In millions)', 'Year Ended June 30,', 'Productivity and Business Processes', 'Revenue', 'Cost of revenue', 'Operating expenses', 'Operating Income']
heading: 
caption: 

table_id: t0073
rows x cols: 20 x 13
has_year_header: True
detected_years: [2023, 2024, 2025]
has_units_marker: True
units_hint: (In millions)
numeric_cell_ratio: 0.1941747572815534
money_cell_ratio: 0.1941747572815534
row_label_preview (first 10): ['(In millions)', 'Year Ended June 30,', 'Server products and cloud services', 'Microsoft 365 Commercial products and cloud services', 'Gaming', 'LinkedIn', 'Windows and Devices', 'Search and news advertising', 'Dynamics products and cloud services', 'Enterprise and partner services']
heading: 
caption: 



In [31]:
from bs4 import BeautifulSoup
from pathlib import Path
import revseg.table_candidates as tc

ticker = "MSFT"
base_dir = Path("data/10k")

filing_dir = tc.find_latest_downloaded_filing_dir(base_dir, ticker)
html_path = tc.find_primary_document_html(filing_dir)

soup = BeautifulSoup(html_path.read_text(encoding="utf-8", errors="ignore"), "lxml")
tables = soup.find_all("table")

table_id = "t0073"
idx = int(table_id[1:])  # 73
tbl = tables[idx]

print("Nearest heading:", tc._nearest_heading_text(tbl))
print("Nearby context:", tc._collect_nearby_text(tbl)[:1200])

print("\n--- Table text (first ~40 lines) ---")
lines = tc._clean_text(tbl.get_text("\n", strip=True)).split("\n")
for line in lines[:40]:
    print(line)



Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor.




  soup = BeautifulSoup(html_path.read_text(encoding="utf-8", errors="ignore"), "lxml")


Nearest heading: 
Nearby context: ertising 13,878 12,306 12,125 Dynamics products and cloud services 7,827 6,831 5,796 Enterprise and partner services 7,760 7,594 7,900 Microsoft 365 Consumer products and cloud services 7,404 6,648 6,417 Other 72 45 119 Total $ 281,724 $ 245,122 $ 211,915 Our Microsoft Cloud revenue, which includes Microsoft 365 Commercial cloud, Azure and other cloud services, the commercial portion of LinkedIn, and Dynamics 365, was $ 168.9 billion, $ 137.7 billion, and $ 111.6 billion in fiscal years 2025, 2024, and 2023, respectively. These amounts are included in Microsoft 365 Commercial products and cloud services, Server products and cloud services, LinkedIn, and Dynamics products and cloud services in the table above. Revenue, classified by significant product and service offerings, was as follows:

--- Table text (first ~40 lines) ---
(In millions) Year Ended June 30, 2025 2024 2023 Server products and cloud services $ 98,435 $ 79,828 $ 65,007 Microsoft 365 Co