In [1]:
import os
import re
import warnings
import unicodedata
from edgar import *
import pandas as pd
from tqdm import tqdm

# Suppress every warning category for the rest of the session.
warnings.simplefilter("ignore")
# Register the requester identity; `set_identity` is presumably provided by the
# `edgar` star import above — TODO confirm.
set_identity("ansa ansa1019@gmail.com")
# HTTP request headers targeting www.sec.gov. NOTE(review): not referenced by
# any of the cells visible here; confirm whether it is still needed.
headers = {
    "User-Agent": "NPUST@wp.npust.edu.tw",
    "Accept-Encoding": "gzip",
    "Host": "www.sec.gov",
}
# Company table read from a CSV in the working directory (presumably the
# S&P 500 constituents, judging by the file name).
df = pd.read_csv("sp500_companies.csv")
# Values of the "Symbol" column; the download cell passes each one to `Company(...)`.
companies = df["Symbol"].values


def write_content(content, output_path):
    """Write ``content`` to ``output_path`` as UTF-8 text.

    The file is opened in "w" mode, so an existing file at that path is
    replaced.

    Args:
        content: The string to store.
        output_path: Destination file path.
    """
    with open(output_path, mode="w", encoding="utf-8") as handle:
        handle.write(content)


def normalize(text):
    """Normalize raw filing text and re-join sentences wrapped across lines.

    Steps, in order:

    1. Apply Unicode NFKD normalization.
    2. Rewrite every line terminator recognised by ``str.splitlines`` as a
       single newline (a trailing terminator is dropped).
    3. Delete one leading whitespace character from each line. On an empty
       line that character is the newline itself, so empty lines disappear.
       Then delete non-breaking spaces and literal ``&nbsp;`` entities
       (matched case-insensitively).
    4. Replace every whitespace run that contains a line break and sits
       directly between two characters from ``[a-z,]`` (case-insensitive)
       with one space, which merges lines that were wrapped mid-sentence.

    Args:
        text: The raw text to normalize.

    Returns:
        The normalized text.
    """
    flags = re.I | re.M
    text = unicodedata.normalize("NFKD", text)
    text = "\n".join(text.splitlines())
    # Raw strings avoid the invalid "\s" escape in ordinary string literals.
    # The previous list also held an empty pattern, which substitutes nothing
    # and is therefore omitted.
    for pattern in (r"^\s", "\xa0", "&nbsp;"):
        text = re.sub(pattern, "", text, flags=flags)
    # The lookbehind/lookahead leave the neighbouring characters unconsumed, so
    # consecutive wrapped lines ("a\nb\nc") are all joined in a single pass.
    # This replaces a loop that rescanned the whole text once per joined line.
    return re.sub(r"(?<=[a-z,])\s*$\s*(?=[a-z,])", " ", text, flags=flags)


def clear(text):
    """Strip boilerplate lines from a filing section and lower-case it.

    Processing order:

    1. If the first line contains "item 7" (case-insensitive), drop that line.
    2. Collapse each run of trailing whitespace plus line breaks into a
       single newline.
    3. Blank out lines that match one of the boilerplate patterns: a
       "Table of Contents" line, a digits-only line, a dashes-only line,
       a "<year> Form 10-K" style line, and a "Bank of America" line with
       an optional number. The matched text is replaced by an empty string,
       so a line break can remain where the line used to be.
    4. Lower-case the result. Line breaks are kept.

    Args:
        text: Section text, typically the output of ``normalize``.

    Returns:
        The cleaned, lower-cased text. A text consisting only of an
        "item 7" heading line yields an empty string.
    """
    flags = re.I | re.M
    # partition() always returns three parts, so a heading-only text with no
    # newline yields an empty remainder instead of raising IndexError.
    first_line, _, remainder = text.partition("\n")
    if re.search("item 7", first_line, flags):
        text = remainder
    text = re.sub(r"\s*\n+", "\n", text, flags=flags)
    # Raw strings avoid invalid escape sequences such as "\s" and "\d".
    boilerplate_patterns = [
        r"^\s*Table\s*of\s*Contents?\s*$",
        r"^\s*\d+\s*$",
        r"^\s*[-]+\s*$",
        r"^\s*[^\n]*\|?[^\S\n]*\d{4}[^\S\n]*Form[^\S\n]*10-K[^\S\n]*\|?[^\S\n]*\d*\s*$",
        r"^\s*Bank\s*of\s*America\s*[^\S\n]*\d*\s*$",
    ]
    for pattern in boilerplate_patterns:
        text = re.sub(pattern, "", text, flags=flags)
    return text.lower()

In [1]:
# Download the "Item 7" section of every 10-K filed between 2015-01-01 and
# 2024-12-31 for each symbol in `companies`, clean it, and store it as
# mda/<report year>/<symbol>.txt. Progress and failures are collected in `log`
# and printed once at the end.
log = ""
root = "mda/"

# One output folder per report year; exist_ok keeps the cell safe to re-run
# and also creates `root` itself when it is missing.
for y in range(2014, 2025):
    os.makedirs(root + str(y) + "/", exist_ok=True)

loop = tqdm(companies)
for symbol in loop:
    company = Company(symbol)
    filings = company.get_filings(form="10-K").filter(date="2015-01-01:2024-12-31")
    if filings.empty:
        loop.set_description(f"{symbol}")
        log += f"❌ {symbol} 無10-K可供下載\n"
        continue
    for filing in filings:
        # Reset per filing so the except branch never reports an undefined
        # name or the path left over from the previous filing.
        path = None
        try:
            path = root + filing.report_date[:4] + "/" + symbol + ".txt"
            loop.set_description(path)
            # Covers report years outside the pre-created 2014-2024 folders.
            os.makedirs(os.path.dirname(path), exist_ok=True)
            tenk = filing.obj()
            text = normalize(tenk["Item 7"])
            text = clear(text)
            write_content(text, path)
        except Exception as e:
            # Best effort: record the failure and move on to the next filing.
            target = path if path is not None else symbol
            # The trailing newline keeps the next log entry on its own line.
            log += f"❌ {target} 下載失敗\n{e}\n"
    # NOTE(review): this entry is appended even when every filing above
    # failed; confirm whether that is intended.
    log += f"✅ {symbol} 下載完成\n"
print(log)

NameError: name 'os' is not defined