In [63]:
import os
import re
import json
import requests
import time
from bs4 import BeautifulSoup
from requests.exceptions import RequestException, ConnectionError, HTTPError

# Request headers passed to every requests.get call in this notebook.
# SEC requires a valid User-Agent header; the value below carries an
# application name/version followed by a contact e-mail address.
HEADERS = {
    'User-Agent': 'UC Davis Analytics/1.0 (wenjunsong2002@outlook.com)',
    'Accept': 'application/json, text/html'
}

In [65]:
def download_file(url, dest_path, retries=3, delay=5):
    """
    Download a file from a URL to a local path with retries on failure.

    Args:
        url: Address to fetch (sent with the module-level HEADERS).
        dest_path: Local path the response body is written to, in binary mode.
        retries: Maximum number of attempts.
        delay: Seconds to sleep between failed attempts.

    Raises:
        RuntimeError: If every attempt failed. The last requests exception is
            attached as ``__cause__`` so callers can see why.
    """
    last_error = None
    for attempt in range(1, retries + 1):
        try:
            resp = requests.get(url, headers=HEADERS, timeout=15)
            resp.raise_for_status()
            # Only written after a successful response, so a failed attempt
            # never leaves a partial file behind.
            with open(dest_path, 'wb') as f:
                f.write(resp.content)
            print(f"Downloaded: {url} -> {dest_path}")
            return
        except (ConnectionError, HTTPError, RequestException) as e:
            last_error = e
            print(f"Attempt {attempt} failed for {url}: {e}")
            if attempt < retries:
                time.sleep(delay)
    # Chain the root cause; without this the final error hides what actually
    # went wrong (the except block has already been exited at this point).
    raise RuntimeError(f"Failed to download file after {retries} attempts: {url}") from last_error

In [66]:
def parse_index_for_xml(index_url):
    """
    Load an EDGAR filing index page and locate the linked file whose name
    ends with ``_htm.xml``.

    Returns:
        A ``(xml_url, xml_filename)`` tuple, where ``xml_url`` is built from
        the directory part of ``index_url`` plus the file name.

    Raises:
        RuntimeError: If the index page cannot be fetched.
        ValueError: If no matching link is present on the page.
    """
    try:
        response = requests.get(index_url, headers=HEADERS, timeout=15)
        response.raise_for_status()
    except (ConnectionError, HTTPError, RequestException) as e:
        raise RuntimeError(f"Failed to load index page: {e}")

    xml_href = re.compile(r'.+_htm\.xml$')
    page = BeautifulSoup(response.text, 'html.parser')
    anchor = page.find('a', href=xml_href)
    if not anchor:
        raise ValueError('Could not find XML file link on index page')

    # Keep only the file name from the href and re-root it on the index
    # page's own directory.
    xml_filename = os.path.basename(anchor['href'])
    directory_url = index_url.rsplit('/', 1)[0]
    return f"{directory_url}/{xml_filename}", xml_filename

In [67]:
def clean_text(content):
    """
    Collapse every run of whitespace into a single space and trim both ends.
    """
    whitespace_run = re.compile(r"\s+")
    collapsed = whitespace_run.sub(" ", content)
    return collapsed.strip()

In [68]:
def extract_item_robust(text, start_label, end_label):
    """
    Return the stripped text running from a start marker up to the next end marker.

    Both labels are regular expressions matched case-insensitively. When the
    start marker occurs more than once, the second occurrence is used (the
    first is assumed to be the table of contents). If no start marker is
    found the result is ''; if no end marker follows, everything from the
    start marker to the end of ``text`` is returned.
    """
    start_re = re.compile(start_label, re.IGNORECASE)
    end_re = re.compile(end_label, re.IGNORECASE)

    offsets = [hit.start() for hit in start_re.finditer(text)]
    if len(offsets) == 0:
        return ''

    # Single hit: use it. Several hits: skip the first one.
    begin = offsets[0] if len(offsets) == 1 else offsets[1]
    remainder = text[begin:]

    stop = end_re.search(remainder)
    section = remainder[:stop.start()] if stop else remainder
    return section.strip()

In [69]:
def extract_sections(text):
    """
    Pull the Item 1, 1A, 7 and 7A sections out of 10-K text.

    Returns a dict keyed 'item1', 'item1a', 'item7', 'item7a'; each value is
    whatever extract_item_robust returns for that section's heading patterns.
    """
    business = r'Item 1\.?\s+Business'
    risk_factors = r'Item 1A\.?\s+Risk Factors'
    unresolved = r'Item 1B\.?\s+Unresolved Staff Comments'
    mdna = r'Item 7\.?\s+Management.*?Discussion.*?Financial.*?Condition'
    market_risk = r'Item 7A\.?\s+Quantitative and Qualitative'
    financials = r'Item 8\.?\s+Financial Statements'

    # (output key, start heading, heading that ends the section)
    boundaries = (
        ('item1', business, risk_factors),
        ('item1a', risk_factors, unresolved),
        ('item7', mdna, market_risk),
        ('item7a', market_risk, financials),
    )
    return {
        key: extract_item_robust(text, start, end)
        for key, start, end in boundaries
    }

In [70]:
def fetch_cusip_from_submissions(cik):
    """
    Look for a CUSIP under filings -> recent -> cusip in the submissions JSON
    for ``cik`` and return its first six characters.

    Returns '' when the list is missing or empty, or when the request or
    parsing fails (the failure is printed, not raised).
    """
    submissions_url = f"https://data.sec.gov/submissions/CIK{int(cik):010d}.json"
    try:
        response = requests.get(submissions_url, headers=HEADERS, timeout=15)
        response.raise_for_status()
        recent = response.json().get('filings', {}).get('recent', {})
        cusip_list = recent.get('cusip', [])
        if cusip_list:
            return cusip_list[0][:6]
    except Exception as e:
        print(f"Failed to fetch CUSIP from submissions JSON: {e}")
    return ''

In [71]:
## Old (legacy fallback; still called by extract_metadata_from_xml)
def fetch_cusip_from_sc13ga(cik):
    """
    Find CUSIP6 by parsing the latest SC 13G/A beneficial ownership report.
    The CUSIP number often appears before the phrase '(CUSIP Number)'.

    Args:
        cik: Central Index Key as an int or a (possibly zero-padded) string.

    Returns:
        The first six characters of the first CUSIP-looking token found
        (dashes removed), or '' if nothing matched or any step failed
        (the failure is printed, not raised).
    """
    try:
        # Normalise once so both int and str CIKs work; the archive path
        # uses the CIK without leading zeros, the submissions API pads to 10.
        cik_int = int(cik)
        cik_padded = f"{cik_int:010d}"
        cik_no_zero = str(cik_int)
    except (TypeError, ValueError) as e:
        print(f"Failed to fetch CUSIP from SC 13G/A: {e}")
        return ''
    subs_url = f"https://data.sec.gov/submissions/CIK{cik_padded}.json"
    try:
        resp = requests.get(subs_url, headers=HEADERS, timeout=15)
        resp.raise_for_status()
        data = resp.json()
        forms = data['filings']['recent']['form']
        accs = data['filings']['recent']['accessionNumber']
        docs = data['filings']['recent']['primaryDocument']
        for form, acc, doc in zip(forms, accs, docs):
            if form.startswith('SC 13G'):
                acc_no_dash = acc.replace('-', '')
                url = f"https://www.sec.gov/Archives/edgar/data/{cik_no_zero}/{acc_no_dash}/{doc}"
                r = requests.get(url, headers=HEADERS, timeout=15)
                r.raise_for_status()
                text = r.text
                # first attempt: number before '(CUSIP Number)'
                m = re.search(r'([0-9A-Za-z-]{6,11})\s*\(\s*CUSIP\s*Number', text, re.IGNORECASE)
                if m:
                    return m.group(1).replace('-', '')[:6]
                # fallback: CUSIP NO. after label
                m2 = re.search(r'CUSIP\s*NO\.?\s*([0-9A-Za-z-]{6,11})', text, re.IGNORECASE)
                if m2:
                    return m2.group(1).replace('-', '')[:6]
    except Exception as e:
        print(f"Failed to fetch CUSIP from SC 13G/A: {e}")
    return ''


In [None]:
def extract_metadata_from_xml(xml_path):
    """
    Read registrant name, CIK and CUSIP6 from an XBRL instance file.

    When the file carries no CUSIP but does carry a CIK, the submissions
    JSON lookup is tried first and the SC 13G/A lookup second.

    Returns:
        A dict with keys 'names', 'cik' and 'cusip6'. All three are '' if
        anything goes wrong (the error is printed, not raised).
    """
    try:
        with open(xml_path, 'r', encoding='utf-8') as f:
            xml_soup = BeautifulSoup(f, 'lxml-xml')

        def _find_tag(local_name):
            # Try the dei:-prefixed element first, then the bare name.
            return xml_soup.find(f'dei:{local_name}') or xml_soup.find(local_name)

        name_tag = _find_tag('EntityRegistrantName')
        cik_tag = _find_tag('EntityCentralIndexKey')
        cusip_tag = _find_tag('CusipNumber')

        names = ''
        if name_tag:
            names = name_tag.text.strip()
        cik = ''
        if cik_tag:
            cik = cik_tag.text.strip()
        cusip6 = ''
        if cusip_tag:
            cusip6 = cusip_tag.text.strip()[:6]

        if cik and not cusip6:
            # `or` keeps the second lookup lazy: it only runs when the first
            # one came back empty.
            cusip6 = fetch_cusip_from_submissions(cik) or fetch_cusip_from_sc13ga(cik)

        return { 'names': names, 'cik': cik, 'cusip6': cusip6 }
    except Exception as e:
        print(f"Failed to parse XML metadata: {e}")
        return { 'names': '', 'cik': '', 'cusip6': '' }

In [74]:
def extract_and_clean(index_url, output_json='ntap_10k_cleaned.json'):
    """
    Download one filing's XBRL instance and main HTML document, extract the
    Item 1 / 1A / 7 / 7A sections plus metadata, and save them as JSON.

    Files are downloaded into a local 'download' directory. Every failure is
    reported with print() and ends processing early; nothing is raised.

    Args:
        index_url: URL of the filing's EDGAR index page.
        output_json: Path of the JSON file to write.

    Returns:
        None.
    """
    try:
        xml_url, xml_filename = parse_index_for_xml(index_url)
    except Exception as e:
        print(f"Error parsing index page: {e}")
        return
    os.makedirs('download', exist_ok=True)
    xml_path = os.path.join('download', xml_filename)
    try:
        download_file(xml_url, xml_path)
    except Exception as e:
        print(f"Error downloading XML file: {e}")
        return
    # The main document shares the instance file's name: foo_htm.xml -> foo.htm
    htm_filename = xml_filename.replace('_htm.xml', '.htm')
    htm_url = xml_url.replace(xml_filename, htm_filename)
    htm_path = os.path.join('download', htm_filename)
    try:
        download_file(htm_url, htm_path)
    except Exception as e:
        print(f"Error downloading HTML file: {e}")
        return
    try:
        # errors='replace': the file holds raw response bytes, and a single
        # byte that is not valid UTF-8 should not discard the whole filing.
        with open(htm_path, 'r', encoding='utf-8', errors='replace') as f:
            html = f.read()
        soup = BeautifulSoup(html, 'html.parser')
        full_text = soup.get_text(separator=' ')
        cleaned = clean_text(full_text)
    except Exception as e:
        print(f"Error parsing HTML text: {e}")
        return
    sections = extract_sections(cleaned)
    meta = extract_metadata_from_xml(xml_path)
    # Build the iXBRL viewer URL recorded as the source:
    # xml_url looks like https://www.sec.gov/Archives/.../filename_htm.xml, so
    # strip the host and the file name to get the archive directory path.
    archive_path = xml_url.split('https://www.sec.gov')[-1].rsplit('/', 1)[0]
    meta['source'] = f"https://www.sec.gov/ix?doc={archive_path}/{htm_filename}"
    result = { **meta, **sections }
    try:
        with open(output_json, 'w', encoding='utf-8') as f:
            json.dump(result, f, ensure_ascii=False, indent=2)
        print(f"Saved cleaned data to {output_json}")
    except Exception as e:
        print(f"Error saving JSON: {e}")


In [76]:
def get_cik_mapping():
    """
    Fetch SEC's company_tickers.json and return a dict mapping each entry's
    'ticker' value to its 'cik_str' value.

    Request and HTTP errors propagate to the caller.
    """
    response = requests.get(
        'https://www.sec.gov/files/company_tickers.json',
        headers=HEADERS,
        timeout=15,
    )
    response.raise_for_status()
    ticker_to_cik = {}
    for entry in response.json().values():
        ticker_to_cik[entry['ticker']] = entry['cik_str']
    return ticker_to_cik

def find_10k_accessions_by_year(cik):
    """
    Group a company's 10-K accession numbers by filing year.

    Reads the 'recent' filings block of the submissions JSON for ``cik`` and
    returns a dict mapping the first four characters of each filing date to
    the list of accession numbers whose form is exactly '10-K'.

    Request, HTTP and key errors propagate to the caller.
    """
    submissions_url = f"https://data.sec.gov/submissions/CIK{int(cik):010d}.json"
    response = requests.get(submissions_url, headers=HEADERS, timeout=15)
    response.raise_for_status()

    recent = response.json()['filings']['recent']
    form_types = recent['form']
    accession_numbers = recent['accessionNumber']
    filing_dates = recent['filingDate']

    accessions_by_year = {}
    for form_type, accession, filed_on in zip(form_types, accession_numbers, filing_dates):
        if form_type != '10-K':
            continue
        accessions_by_year.setdefault(filed_on[:4], []).append(accession)
    return accessions_by_year

In [77]:
if __name__ == '__main__':
    # Driver: for each ticker, process every 10-K found in its recent filings
    # and write one JSON file per filing under output/<ticker>/.
    tickers = [
        'AAPL', 'JPM', 'JNJ', 'XOM', 'WMT',
        'TSLA', 'PLD', 'BA', 'NFLX', 'NVDA'
    ]
    cik_map = get_cik_mapping()

    for ticker in tickers:
        cik = cik_map.get(ticker)
        if not cik:
            print(f"CIK not found for ticker {ticker}, skipping.")
            continue
        try:
            year_to_accessions = find_10k_accessions_by_year(cik)
            out_dir = f"output/{ticker}"
            os.makedirs(out_dir, exist_ok=True)
            for year, accessions in sorted(year_to_accessions.items()):
                for acc in accessions:
                    acc_nodash = acc.replace('-', '')
                    index_filename = f"{acc}-index.html"
                    index_url = f"https://www.sec.gov/Archives/edgar/data/{int(cik)}/{acc_nodash}/{index_filename}"
                    if len(accessions) == 1:
                        out_json = f"{out_dir}/{ticker}_10k_{year}.json"
                    else:
                        # Several 10-Ks filed in the same year: add the
                        # accession number so they do not overwrite each other.
                        out_json = f"{out_dir}/{ticker}_10k_{year}_{acc}.json"
                    print(f"\nProcessing {ticker} {year}: {index_url}")
                    extract_and_clean(index_url, output_json=out_json)
        except Exception as e:
            print(f"Failed processing {ticker}: {e}")


Processing AAPL 2014: https://www.sec.gov/Archives/edgar/data/320193/000119312514383437/0001193125-14-383437-index.html
Error parsing index page: Could not find XML file link on index page

Processing AAPL 2015: https://www.sec.gov/Archives/edgar/data/320193/000119312515356351/0001193125-15-356351-index.html
Error parsing index page: Could not find XML file link on index page

Processing AAPL 2016: https://www.sec.gov/Archives/edgar/data/320193/000162828016020309/0001628280-16-020309-index.html
Error parsing index page: Could not find XML file link on index page

Processing AAPL 2017: https://www.sec.gov/Archives/edgar/data/320193/000032019317000070/0000320193-17-000070-index.html
Error parsing index page: Could not find XML file link on index page

Processing AAPL 2018: https://www.sec.gov/Archives/edgar/data/320193/000032019318000145/0000320193-18-000145-index.html
Error parsing index page: Could not find XML file link on index page

Processing AAPL 2019: https://www.sec.gov/Archive

KeyboardInterrupt: 