In [1]:
import jupyter_black

# Load the jupyter_black extension for this kernel session
# (presumably enables Black formatting of executed cells; see its docs).
jupyter_black.load()

In [2]:
from pathlib import Path
import csv
import time
from collections import namedtuple
from sec_edgar_downloader import Downloader as SecEdgarDownloader
from sec_downloader import DownloadStorage, Downloader
from dotenv import load_dotenv
import os
import glob
import os
import json
import re
from sec_api_io.secapio_data_retriever import SecapioDataRetriever
from tqdm.notebook import tqdm

# Read a local .env file (if present) so that settings such as
# SECAPIO_API_KEY, checked via os.environ later on, can be supplied there.
load_dotenv()
# Shared sec_downloader client, used by download_and_save() for
# accession-number queries. The two arguments are the same company name and
# e-mail that download_and_save() hands to SecEdgarDownloader.
sec_dl = Downloader("Alphanome.AI", "info@alphanome.ai")

In [3]:
# Root directory for downloaded filings. download_and_save() writes to
# DOWNLOAD_PATH / <filing type> / <ticker> / <accession number> / primary-document.html
DOWNLOAD_PATH = Path("..")
# Form types that the download and Secapio loops iterate over, in this order.
FILING_TYPES = ["10-Q", "10-K"]

In [4]:
# One data row of the report-list CSV: a `comment` column and the `query`
# string that is later passed to download_and_save() as its `ticker` argument.
Report = namedtuple("Report", ["comment", "query"])

In [5]:
def get_report_list(path_str: str):
    """Load the report-list CSV into a list of ``Report`` tuples.

    The first line is treated as a header and skipped. Every remaining
    non-blank row must have exactly two columns, which are passed
    positionally to ``Report`` (``comment``, ``query``).

    Args:
        path_str: Path of the CSV file to read.

    Returns:
        A list of ``Report`` instances in file order; empty when the file
        contains no data rows (or is completely empty).

    Raises:
        AssertionError: If a non-blank row does not have exactly two columns.
    """
    company_list = []
    # The csv module asks for files to be opened with newline="" so that the
    # reader, not the file object, decides how line endings are interpreted.
    with Path(path_str).open("r", newline="") as f:
        reader = csv.reader(f)
        next(reader, None)  # Skip the header; tolerate a completely empty file
        for row in reader:
            if not row:  # csv.reader yields [] for blank lines
                continue
            assert (
                len(row) == 2
            ), f"Expected 2 columns, got {len(row)} instead, row: {row}"
            company_list.append(Report(*row))
    return company_list

In [6]:
# Load the (comment, query) rows that drive the bulk download loop.
report_list = get_report_list("00_report-list.csv")

In [7]:
def download_and_save(document_type, ticker) -> bool:
    """Download one filing's primary document unless it is already on disk.

    ``ticker`` is either a bare ticker symbol, in which case a single filing
    (``limit=1``) of ``document_type`` is fetched through
    ``SecEdgarDownloader``, or a query of the form ``TICKER/ACCESSION`` or
    ``TICKER/ACCESSION/TYPE``, in which case that specific filing is fetched
    through the shared ``sec_dl`` client.

    Args:
        document_type: Form type such as "10-K"; case and surrounding
            whitespace are ignored.
        ticker: Ticker symbol or query string as described above.

    Returns:
        True if a document was downloaded and written. False if nothing was
        done, either because the target directory already exists or because
        the query names a form type other than ``document_type``.

    Raises:
        ValueError: If a query has more than three ``/``-separated parts.
        AssertionError: If the accession number is malformed, the ticker is
            empty, or a download does not yield exactly one result.
    """
    document_type = document_type.upper().strip()

    accession_number = None
    if "/" in ticker:
        parts = [part.strip() for part in ticker.split("/")]
        if len(parts) == 3:  # ticker/accession/type format
            ticker, accession_number, specified_type = parts
            # Skip if the specified type doesn't match requested type
            if specified_type.upper() != document_type:
                return False
        elif len(parts) == 2:  # ticker/accession format
            ticker, accession_number = parts
        else:
            raise ValueError(
                "Expected query in the format of TICKER/ACCESSION or "
                f"TICKER/ACCESSION/TYPE, got {ticker} instead"
            )

        # fullmatch rather than match: match only anchors at the start, so
        # trailing garbage after a valid accession number would be accepted.
        assert re.fullmatch(
            r"\d{10}-\d{2}-\d{6}", accession_number
        ), f"Expected accession number to be in the format of XXXXXXXXXX-XX-XXXXXX, got {accession_number} instead"
    ticker = ticker.upper().strip()

    if accession_number is None:
        # Bare ticker: the existence of the ticker directory is the cache marker.
        save_dir = DOWNLOAD_PATH / document_type / ticker
        if save_dir.exists():
            return False

        storage = DownloadStorage(filter_pattern="**/*.htm*")
        with storage as storage_dir:
            dl = SecEdgarDownloader("Alphanome.AI", "info@alphanome.ai", storage_dir)
            dl.get(document_type, ticker, limit=1, download_details=True)

        files = storage.get_file_contents()
        assert (
            len(files) == 1
        ), f"Expected 1 file, got {len(files)} instead. Ticker: {ticker}"
        file_path, content = files[0]

        # NOTE(review): index 3 is presumably the accession-number directory
        # in the layout produced by SecEdgarDownloader; confirm against the
        # library before changing.
        accession_number = file_path.parts[3]
        save_path = save_dir / accession_number / "primary-document.html"
        save_path.parent.mkdir(parents=True, exist_ok=True)
        with save_path.open("w") as f:
            f.write(content)
    else:
        assert ticker and isinstance(
            ticker, str
        ), f"Expected ticker to be a string, got {ticker} instead"
        assert accession_number and isinstance(
            accession_number, str
        ), f"Expected accession_number to be a string, got {accession_number} instead"
        save_dir = DOWNLOAD_PATH / document_type / ticker / accession_number
        if save_dir.exists():
            return False
        # The optional "/TYPE" suffix is this notebook's own convention, so it
        # is dropped and a normalised TICKER/ACCESSION query is sent instead
        # of the raw input string.
        query = f"{ticker}/{accession_number}"
        metadatas = sec_dl.get_filing_metadatas(query)
        assert (
            len(metadatas) == 1
        ), f"Expected 1 metadata, got {len(metadatas)} instead. Ticker: {ticker}, accession_number: {accession_number}"
        metadata = metadatas[0]
        file_contents = sec_dl.download_filing(url=metadata.primary_doc_url)
        save_path = save_dir / "primary-document.html"
        save_path.parent.mkdir(parents=True, exist_ok=True)
        # Opened in binary mode, matching the original handling of download_filing's result.
        with save_path.open("wb") as f:
            f.write(file_contents)

    return True

In [8]:
# Smoke test on a single ticker before looping over the whole report list.
for filing_type in FILING_TYPES:
    download_and_save(document_type=filing_type, ticker="MSFT")

In [9]:
# Bulk download: try every report-list entry once per filing type. A failure
# is reported and the loop moves on to the next entry.
for filing_type in FILING_TYPES:
    print(f"\nDownloading {filing_type} filings...")
    for report in tqdm(report_list):
        assert isinstance(report, Report)
        try:
            # download_and_save() returns False when it skipped the download,
            # so the pause is only taken after an actual fetch.
            if download_and_save(filing_type, report.query):
                time.sleep(0.1)  # SEC EDGAR limits to 10 requests per second
        except Exception as e:
            print(f"Error downloading {filing_type} for {report.query}: {str(e)}")


Downloading 10-Q filings...


  0%|          | 0/110 [00:00<?, ?it/s]


Downloading 10-K filings...


  0%|          | 0/110 [00:00<?, ?it/s]

In [10]:
def download_from_secapio(document_type, accession_number):
    """Fetch a filing's HTML through a fresh ``SecapioDataRetriever``.

    Retrieves the report metadata for ``accession_number``, reads its
    ``linkToFilingDetails`` URL and returns the result of
    ``get_report_html`` for that URL.

    Args:
        document_type: Form type forwarded to both retriever calls.
        accession_number: Accession number forwarded to the metadata lookup.

    Returns:
        Whatever ``SecapioDataRetriever.get_report_html`` returns.

    Raises:
        AssertionError: If the filing URL is not under the EDGAR archive prefix.
    """
    client = SecapioDataRetriever()
    report_metadata = client.retrieve_report_metadata(
        document_type, accession_number=accession_number
    )
    filing_url = report_metadata["linkToFilingDetails"]
    # Sanity check: only URLs inside the SEC EDGAR archive are accepted.
    assert filing_url.startswith("https://www.sec.gov/Archives/edgar/data/")
    return client.get_report_html(document_type, filing_url)

In [11]:
def get_section_text(section_ids, content):
    """Measure the text that follows each top-level section marker.

    For every id, the first ``<top-level-section-start-marker id="...">``
    element with that id is located. The section text runs from the end of
    that element up to the next marker, or up to the end of ``content`` for
    the last id in ``section_ids``.

    Args:
        section_ids: Marker ids, in document order.
        content: The HTML text containing the markers.

    Returns:
        A list with one dict per id, in the same order, holding
        ``identifier`` (the id) and ``character_count`` (length of the
        stripped section text; 0 when the marker is not found).
    """
    sections = []
    last_index = len(section_ids) - 1
    for index, section_id in enumerate(section_ids):
        section_end = "<top-level-section-start-marker" if index < last_index else "$"
        # re.escape: ids are data, not patterns. Without it an id containing
        # regex metacharacters would mis-match or raise re.error.
        # re.search: only the first match is used, so there is no need to
        # scan the whole document for every occurrence.
        match = re.search(
            r'<top-level-section-start-marker id="'
            + re.escape(section_id)
            + '".*?</top-level-section-start-marker>(.*?)'
            + section_end,
            content,
            re.DOTALL,
        )
        section_text = match.group(1) if match else ""
        sections.append(
            {
                "identifier": section_id,
                "character_count": len(section_text.strip()),
            }
        )
    return sections

In [12]:
def write_sections_to_file(sections, anchor_file):
    """Dump ``sections`` as JSON next to ``anchor_file``.

    The output is written to ``expected-top-level-sections.json`` in the
    directory containing ``anchor_file``, indented by four spaces. An
    existing file of that name is overwritten.

    Args:
        sections: JSON-serialisable data to write.
        anchor_file: Path whose parent directory receives the JSON file.
    """
    target_path = anchor_file.parent.joinpath("expected-top-level-sections.json")
    with target_path.open("w") as handle:
        json.dump(sections, handle, indent=4)

In [17]:
# Build expected-top-level-sections.json next to every downloaded filing,
# using the Secapio rendering of the same filing as the reference. The Secapio
# HTML is cached on disk so the API is called at most once per filing.
if "SECAPIO_API_KEY" in os.environ:
    # Process each filing type
    for filing_type in FILING_TYPES:
        print(f"\nProcessing {filing_type} files with Secapio...")
        # Only search in the specific filing type directory
        anchor_files = list(
            (DOWNLOAD_PATH / filing_type).rglob("primary-document.html")
        )

        for anchor_file in tqdm(anchor_files):
            # The parent directory is named after the accession number;
            # the dashes are removed before the lookup.
            accession_number = anchor_file.parent.name.replace("-", "")
            secapio_file_path = anchor_file.parent / "primary-document-secapio.html"

            if not secapio_file_path.exists():
                try:
                    content = download_from_secapio(filing_type, accession_number)
                    with secapio_file_path.open("w") as f:
                        f.write(content)
                except Exception as e:
                    print(
                        f"Error downloading {filing_type} for {accession_number}: {str(e)}"
                    )
                    continue
                finally:
                    # Respect rate limits: pause after every API attempt,
                    # successful or not, and never for cached files.
                    time.sleep(0.1)

            with secapio_file_path.open("r") as f:
                content = f.read()

            section_ids = re.findall(
                r'<top-level-section-start-marker id="(.*?)"', content
            )
            sections = get_section_text(section_ids, content)
            write_sections_to_file(sections, anchor_file)
else:
    print("3rd party data will not be downloaded due to missing key.")
    print("Please create a .env file with your SECAPIO_API_KEY to download the data.")


Processing 10-Q files with Secapio...


  0%|          | 0/75 [00:00<?, ?it/s]


Processing 10-K files with Secapio...


  0%|          | 0/74 [00:00<?, ?it/s]

Error downloading 10-K for 000165204423000094: no 10-K found for accessionNo="0001652044-23-000094"
Error downloading 10-K for 000119312518236766: no 10-K found for accessionNo="0001193125-18-236766"
Error downloading 10-K for 000132680119000037: no 10-K found for accessionNo="0001326801-19-000037"
