In [1]:
import jupyter_black

# Activate the jupyter_black extension for this kernel session
# (presumably auto-formats code cells with Black — confirm against the package docs).
jupyter_black.load()

In [2]:
from pathlib import Path
import csv
import time
from collections import namedtuple
from sec_edgar_downloader import Downloader as SecEdgarDownloader
from sec_downloader import DownloadStorage
from dotenv import load_dotenv
import os
import glob
import os
import json
import re
from sec_api_io.secapio_data_retriever import SecapioDataRetriever
from tqdm.notebook import tqdm

# Pull settings from a .env file; the sec-api.io cell further down only runs
# when SECAPIO_API_KEY is present in os.environ and tells the user to put it
# in a .env file otherwise.
load_dotenv()

True

In [3]:
# Root directory for downloaded filings. Documents are stored below it as
# <document type>/<ticker>/<report id>/primary-document.html, and the
# sec-api.io cell searches it recursively for those files.
DOWNLOAD_PATH = Path("..")

In [4]:
# One row of the company CSV. The typename now matches the name the class is
# bound to, so reprs read ``Company(name=..., ticker=...)``.
Company = namedtuple("Company", ["name", "ticker"])


def get_company_list(path_str):
    """Read the company CSV and return its rows as ``Company`` tuples.

    The first line of the file is treated as a header and skipped. Blank lines
    are ignored. Every remaining row must have exactly two columns, which are
    stored as ``name`` and ``ticker`` in that order.

    Args:
        path_str: Path (string or path-like) of the CSV file to read.

    Returns:
        A list of ``Company`` tuples in file order; an empty list when the
        file is empty or contains only a header.

    Raises:
        AssertionError: If a non-blank data row does not have two columns.
    """
    company_list = []
    # newline="" hands newline handling to the csv module, as its docs require;
    # otherwise line breaks inside quoted fields can be altered.
    with Path(path_str).open("r", newline="") as f:
        reader = csv.reader(f)
        next(reader, None)  # Skip the header; the default avoids StopIteration on an empty file
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines (e.g. trailing newlines).
                continue
            assert (
                len(row) == 2
            ), f"Expected 2 columns, got {len(row)} instead, row: {row}"
            company_list.append(Company(*row))
    return company_list


# Load the company rows used by the download loop below. The path is relative,
# so it is resolved against the kernel's working directory.
company_list = get_company_list("00_company-list.csv")

In [5]:
def download_and_save(document_type, ticker) -> bool:
    """Download the most recent filing for a ticker and store its primary document.

    The document is written to
    ``DOWNLOAD_PATH / <DOCUMENT_TYPE> / <TICKER> / <report id> / primary-document.html``.

    Args:
        document_type: Filing type such as "10-Q"; upper-cased and stripped
            before use.
        ticker: Ticker symbol; upper-cased and stripped before use.

    Returns:
        False when the ticker's directory already exists (nothing is
        downloaded), True after a new document has been written.

    Raises:
        AssertionError: If the download does not produce exactly one file
            matching the ``**/*.htm*`` filter.
    """
    document_type = document_type.upper().strip()
    ticker = ticker.upper().strip()

    save_dir = DOWNLOAD_PATH / document_type / ticker
    if save_dir.exists():
        # An existing directory is taken to mean the filing was fetched earlier.
        return False

    storage = DownloadStorage(filter_pattern="**/*.htm*")
    with storage as download_dir:
        dl = SecEdgarDownloader("Alphanome.AI", "info@alphanome.ai", download_dir)
        dl.get(document_type, ticker, limit=1, download_details=True)

    files = storage.get_file_contents()
    assert (
        len(files) == 1
    ), f"Expected 1 file, got {len(files)} instead. Ticker: {ticker}"
    file_path, content = files[0]

    # NOTE(review): index 3 depends on the directory layout the downloader
    # produces; the sec-api.io cell later treats this directory name as the
    # dashed accession number — confirm the layout if the downloader is upgraded.
    report_id = file_path.parts[3]
    save_path = save_dir / report_id / "primary-document.html"
    save_path.parent.mkdir(parents=True, exist_ok=True)
    # Explicit UTF-8: with the platform default encoding, filings containing
    # characters outside the locale's code page fail with UnicodeEncodeError.
    with save_path.open("w", encoding="utf-8") as f:
        f.write(content)
    return True


# Single-ticker trial run. The call returns False when ../10-Q/MSFT already
# exists, which is the case in the recorded output.
download_and_save("10-Q", "MSFT")

False

In [6]:
# NOTE(review): this re-import rebinds `tqdm` from the `tqdm.notebook` variant
# imported in the import cell at the top to the plain console variant, for this
# cell and every cell executed after it. Confirm that is intended.
from tqdm import tqdm

# Fetch the latest 10-Q for every company in the list. Tickers whose directory
# already exists make download_and_save return False, so no pause is needed
# for them.
for company in tqdm(company_list):
    downloaded = download_and_save("10-Q", company.ticker)
    if downloaded:
        time.sleep(1 / 10)  # SEC EDGAR limits to 10 requests per second

100%|██████████| 108/108 [00:00<00:00, 91753.05it/s]


In [7]:
def download_from_secapio(document_type, accession_number):
    """Fetch a report's HTML via ``SecapioDataRetriever``, looked up by accession number.

    Args:
        document_type: Filing type passed through to the retriever, e.g. "10-Q".
        accession_number: Accession number identifying the report.

    Returns:
        Whatever ``SecapioDataRetriever.get_report_html`` returns for the
        report's filing-details URL.

    Raises:
        AssertionError: If the filing-details URL in the metadata does not
            start with ``https://www.sec.gov/Archives/edgar/data/``.
    """
    client = SecapioDataRetriever()
    report_metadata = client.retrieve_report_metadata(
        document_type, accession_number=accession_number
    )
    filing_url = report_metadata["linkToFilingDetails"]
    # Refuse anything that is not a link into the EDGAR archive.
    assert filing_url.startswith("https://www.sec.gov/Archives/edgar/data/")
    report_html = client.get_report_html(document_type, filing_url)
    return report_html


def get_section_text(section_ids, content):
    """Measure the text length of each marked top-level section in ``content``.

    A section starts after the closing ``</top-level-section-start-marker>`` of
    the marker whose ``id`` equals the section id. For every id except the last
    one in ``section_ids`` it ends at the next ``<top-level-section-start-marker``;
    for the last id it runs to the end of ``content``.

    Args:
        section_ids: Sequence of marker ids, in the order the sections appear.
        content: The marked-up document text.

    Returns:
        A list with one dict per id, in input order, holding ``identifier``
        (the id) and ``character_count`` (length of the section text after
        stripping surrounding whitespace; 0 when the section is not found).
    """
    last_index = len(section_ids) - 1
    sections = []
    for index, section_id in enumerate(section_ids):
        section_end = "<top-level-section-start-marker" if index < last_index else "$"
        # re.escape keeps ids containing regex metacharacters (".", "+", "(" ...)
        # from matching the wrong marker, matching nothing, or raising re.error.
        # re.search stops at the first match, which is the only one used.
        match = re.search(
            r'<top-level-section-start-marker id="'
            + re.escape(section_id)
            + '".*?</top-level-section-start-marker>(.*?)'
            + section_end,
            content,
            re.DOTALL,
        )
        section_text = match.group(1) if match else ""
        sections.append(
            {
                "identifier": section_id,
                "character_count": len(section_text.strip()),
            }
        )
    return sections


def write_sections_to_file(sections, anchor_file):
    """Save ``sections`` as JSON in the directory that contains ``anchor_file``.

    Args:
        sections: JSON-serialisable section summary to store.
        anchor_file: Path of a file in the target directory; only its parent
            directory is used.

    The output file is named ``expected-top-level-sections.json`` and is
    written with a 4-space indent.
    """
    target = anchor_file.parent / "expected-top-level-sections.json"
    with target.open(mode="w") as handle:
        json.dump(sections, handle, indent=4)


# For every downloaded primary document, fetch the matching sec-api.io metadata
# and HTML (unless already on disk) and record the size of each top-level section.
if "SECAPIO_API_KEY" in os.environ:
    retriever = SecapioDataRetriever()
    # NOTE(review): this AAPL lookup is not used by the loop below. It is kept
    # because other cells may rely on `metadata` — confirm, and remove the call
    # if nothing does.
    metadata = retriever.retrieve_report_metadata("10-Q", latest_from_ticker="AAPL")

    anchor_files = list(DOWNLOAD_PATH.rglob("primary-document.html"))
    for anchor_file in tqdm(anchor_files):
        # The report directory is named after the dashed accession number.
        accession_number = anchor_file.parent.name.replace("-", "")

        metadata_file_path = anchor_file.parent / "metadata.json"
        if not metadata_file_path.exists():
            input_metadata = retriever.retrieve_report_metadata(
                "10-Q", accession_number=accession_number
            )
            secapio_metadata_file_path = anchor_file.parent / "metadata-secapio.json"
            with secapio_metadata_file_path.open("w") as json_file:
                json.dump(input_metadata, json_file, indent=4)

            # Build the summary from THIS filing's metadata. Reading from
            # `metadata` here stamped the AAPL values into every metadata.json;
            # files written that way are not regenerated because of the
            # exists() check above, so delete them to refresh.
            output_metadata = {
                "filing_details_url": input_metadata["linkToFilingDetails"],
                "filed_at": input_metadata["filedAt"],
                "period_of_report": input_metadata["periodOfReport"],
            }
            with metadata_file_path.open("w") as json_file:
                json.dump(output_metadata, json_file, indent=4)

        secapio_file_path = anchor_file.parent / "primary-document-secapio.html"
        if not secapio_file_path.exists():
            content = download_from_secapio("10-Q", accession_number)
            with secapio_file_path.open("w") as f:
                f.write(content)
        # Always re-read from disk so cached and fresh downloads take the same path.
        with secapio_file_path.open("r") as f:
            content = f.read()
        section_ids = re.findall(r'<top-level-section-start-marker id="(.*?)"', content)
        sections = get_section_text(section_ids, content)
        write_sections_to_file(sections, anchor_file)
else:
    print("3rd party data will not be downloaded due to missing key.")
    print("Please create a .env file with your SECAPIO_API_KEY to download the data.")

100%|██████████| 72/72 [00:02<00:00, 35.16it/s]
