In [1]:
import json
import logging
import warnings
import requests
import pandas as pd
from edgar import *
from tqdm import tqdm
from urllib3 import Retry
from edgar.xbrl import XBRLS
from datetime import datetime
from requests.adapters import HTTPAdapter
from bs4 import BeautifulSoup


warnings.simplefilter("ignore")
logging.getLogger().setLevel(logging.WARNING)
logging.getLogger("openai").setLevel(logging.ERROR)
logging.getLogger("urllib3").setLevel(logging.ERROR)
request = requests.Session()
retries = Retry(
    total=3,  # 最多重試 5 次
    backoff_factor=1,  # 每次重試的延遲時間指數增長（1s, 2s, 4s, 8s...）
    status_forcelist=[403, 429, 500, 502, 503, 504],  # 針對這些 HTTP 狀態碼進行重試
)
request.mount("http://", HTTPAdapter(max_retries=retries))
request.mount("https://", HTTPAdapter(max_retries=retries))
headers = {"User-Agent": "ansa ansa1019@gmail.com"}


# 公司名單
companies_range = 100
df_companies = pd.read_csv("sp500_companies.csv")
companies = df_companies.drop_duplicates(subset=["Shortname"], keep="first")[
    "Symbol"
].values[:companies_range]
with open("companies.json", "r", encoding="utf-8") as f:
    data = json.load(f)
missing_companies = [entry["ticker"] for entry in data["missing_companies"]]
find_cik = {entry["ticker"]: entry["cik"] for entry in data["find_cik"]}
companies = [c for c in companies if c not in missing_companies]

# 參數設定
set_identity("ansa ansa1019@gmail.com")
search_queries = ["IT capability", "organizational resilience"]
keywords = {q: [q] for q in search_queries}
report_item = {"paper_7": ["7"], "paper_17": ["1A", "7"]}
report_papers = "papers.json"
financials_file = "financials.csv"
report_year = [2014, 2023]
minlen = 1500
keyword_num = 30

In [2]:
filings = Company("AAPL").get_filings(form=["10-K"], date=f"2014-01-01:2015-12-31")
XBRLS.from_filings(filings).statements.income_statement().to_dataframe()

Unnamed: 0,label,concept,2015-09-26,2014-09-27
0,Cash dividends declared per common share,us-gaap_CommonStockDividendsPerShareDeclared,,1.82
1,Cash dividends declared per share,us-gaap_CommonStockDividendsPerShareDeclared,1.98,
2,Cost of sales,us-gaap_CostOfGoodsAndServicesSold,-140089000000.0,-112258000000.0
3,Gross margin,us-gaap_GrossProfit,93626000000.0,70537000000.0
4,Income before provision for income taxes,us-gaap_IncomeLossFromContinuingOperationsBefo...,72515000000.0,53483000000.0
5,Net Income,us-gaap_NetIncomeLoss,53394000000.0,39510000000.0
6,Operating Income,us-gaap_OperatingIncomeLoss,71230000000.0,52503000000.0
7,"Other income/(expense), net",us-gaap_NonoperatingIncomeExpense,1285000000.0,980000000.0
8,Provision for income taxes,us-gaap_IncomeTaxExpenseBenefit,19121000000.0,13973000000.0
9,Revenue,us-gaap_SalesRevenueNet,233715000000.0,182795000000.0


In [49]:
# def
def extract_financials(cik, year, extract_type=None):
    company = Company(cik)
    df = company.get_facts().to_pandas()
    df = df[(df["form"] == "10-K") & (df["namespace"] == "us-gaap")]
    filings = company.get_filings(form=["10-K"], date=f"{year}-01-01:{year+1}-12-31")
    if not filings:
        return None

    # 選取符合該年度報告的 filing
    filing = None
    for f in filings:
        report_date = datetime.fromisoformat(f.report_date)
        report_year = (
            report_date.year - 1 if report_date.month <= 3 else report_date.year
        )
        if report_year == year:
            filing = f
            break
    if not filing:
        return None

    try:
        income_statement = (
            XBRLS.from_filings(filings).statements.income_statement().to_dataframe()
        )
    except:
        income_statement = None
    url = f"https://www.sec.gov/Archives/edgar/data/{filing.cik}/{filing.accession_no.replace('-', '')}/{filing.primary_document}"

    facts = {
        "Revenues": {
            # 高優先：標準與 ASC 606 定義
            "High": [
                "Revenues",
                "RevenueFromContractWithCustomerExcludingAssessedTax",
                "RevenueFromContractWithCustomerIncludingAssessedTax",
            ],
            # 中優先：銷售細分類（商品、服務）
            "Medium": [
                "SalesRevenueNet",
                "SalesRevenueGoodsNet",
                "SalesRevenueServicesNet",
            ],
            # 特定產業
            "Low": [
                "RevenuesNetOfInterestExpense",
                "RealEstateRevenueNet",
                "RegulatedAndUnregulatedOperatingRevenue",
                "InvestmentAdvisoryFees",
            ],
            # 最後備援：模擬資料
            # "BusinessAcquisitionsProFormaRevenue",
        },
        "Income": {
            # 高優先：標準最終淨利
            "High": ["NetIncomeLoss", "ProfitLoss"],
            # 中優先：普通股基本淨利
            "Medium": ["NetIncomeLossAvailableToCommonStockholdersBasic"],
        },
    }

    def safe_extract(target):
        def parse_val(val):
            return float(re.sub("[—−]", "-", re.sub(r"[\s,]+", "", str(val))))

        if target == "Revenues":
            levels = ["High", "Medium", "Low"]
        else:
            levels = ["High", "Medium"]

        for level in levels:
            total = 0
            found = []
            fact_list = facts[target][level]

            # Income statement
            for fact in fact_list:
                if (
                    not income_statement.empty
                    and "us-gaap_" + fact in income_statement["concept"]
                ):
                    row = income_statement[
                        income_statement["concept"] == "us-gaap_" + fact
                    ]
                    for col in income_statement.columns[2:]:
                        col_date = datetime.fromisoformat(col)
                        report_year = (
                            col_date.year if col_date.month > 3 else col_date.year - 1
                        )
                        if report_year == year:
                            val = row[col].values[0]
                            if pd.notna(val):
                                total += float(val)
                                found.append(fact)
                            break
            if found:
                return total if len(found) > 1 else float(total)

            # df["frame"]
            for fact in fact_list:
                col = df[(df["fact"] == fact) & (df["frame"] == f"CY{year}")]
                if not col.empty:
                    val = col.sort_values("filed", ascending=False)["val"].iloc[0]
                    if val:
                        total += parse_val(val)
                        found.append(fact)
            if found:
                return total if len(found) > 1 else float(total)

            # df["end"]
            for fact in fact_list:
                col = df[(df["fact"] == fact) & (df["end"].str.startswith(str(year)))]
                if not col.empty:
                    val = col.sort_values("filed", ascending=False)["val"].iloc[0]
                    if val:
                        total += parse_val(val)
                        found.append(fact)
            if found:
                return total if len(found) > 1 else float(total)

            # Inline XBRL
            try:
                response = requests.get(
                    url, headers={"User-Agent": "ansa1019@gmail.com"}
                )
                soup = BeautifulSoup(response.content, "lxml")
                for fact in fact_list:
                    for tag in soup.find_all(
                        "ix:nonfraction", {"name": f"us-gaap:{fact}"}
                    ):
                        contextref = tag.get("contextref")
                        context = soup.find(id=contextref)
                        end_tag = context.find("xbrli:enddate")
                        report_date = datetime.fromisoformat(end_tag.text.strip())
                        report_year = (
                            report_date.year - 1
                            if report_date.month <= 3
                            else report_date.year
                        )
                        if report_year == year and context.find("xbrli:segment"):
                            val = tag.text
                            if val:
                                total += parse_val(val)
                                found.append(fact)
                if found:
                    return total if len(found) > 1 else float(total)
            except Exception as e:
                print(f"❌ Inline XBRL 抓取失敗：{e}")

        return None

    revenues = income = None
    revenues = safe_extract("Revenues")
    income = safe_extract("Income")
    return (revenues, income)

In [34]:
# 財務資料
log = ""
financial_data = {}
loop = tqdm(companies)
for ticker in loop:
    searchs = [ticker] + find_cik[ticker] if ticker in find_cik else [ticker]
    for search in searchs:
        try:
            for year in range(report_year[0], report_year[1] + 1):
                loop.set_description(f"正在抓取 {ticker} {year} 年的財務資料...")
                financial_data.setdefault(ticker, {}).setdefault(
                    year, {"Revenues": None, "Net Income": None}
                )
                if (
                    financial_data[ticker][year]["Revenues"] is not None
                    and financial_data[ticker][year]["Net Income"] is not None
                ):
                    continue
                result = extract_financials(search, year)
                if result is not None:
                    revenues, income = result
                    if revenues is not None:
                        financial_data[ticker][year]["Revenues"] = revenues
                    if income is not None:
                        financial_data[ticker][year]["Net Income"] = income
        except Exception as e:
            log += f"❌ 無法取得 {ticker} {year} 年的數據: {e}\n"

# 轉為 DataFrame 並輸出
flat_data = []
for ticker, years in financial_data.items():
    for year, values in years.items():
        row = {
            "ticker": ticker,
            "year": year,
            "Revenues": values.get("Revenues"),
            "Net Income": values.get("Net Income"),
        }
        flat_data.append(row)
df = pd.DataFrame(flat_data)
df.to_csv(financials_file, index=False)
print(f"📁 財務資料已保存到: {financials_file}")

if log:
    print("⚠️ 錯誤紀錄：\n" + log)

正在抓取 NEE 2014 年的財務資料...:  59%|█████▊    | 41/70 [12:01<12:58, 26.85s/it] stamina.retry_scheduled
正在抓取 LRCX 2023 年的財務資料...: 100%|██████████| 70/70 [23:02<00:00, 19.76s/it] 

📁 財務資料已保存到: financials.csv





In [4]:
# 檢查缺值
df = pd.read_csv(financials_file)
df[df.isnull().any(axis=1)]

Unnamed: 0,ticker,year,Revenues,Net Income
280,TMO,2014,16889600000.0,
281,TMO,2015,16965000000.0,
282,TMO,2016,36548000000.0,
283,TMO,2017,41836000000.0,
284,TMO,2018,24358000000.0,
382,BKNG,2016,10743000000.0,
383,BKNG,2017,12681000000.0,
384,BKNG,2018,14527000000.0,
385,BKNG,2019,15066000000.0,
386,BKNG,2020,6796000000.0,


In [43]:
extract_financials(cik, year)

(16889600000.0, None)

In [45]:
df = pd.read_csv("financials.csv")  # 替換成你的檔名
missing_df = df[df.isnull().any(axis=1)]

results = []

for _, row in tqdm(missing_df.iterrows(), total=len(missing_df)):
    ticker = row["ticker"]
    year = int(row["year"])

    results.append([ticker, year, extract_financials(ticker, year)])
for r in results:
    print(r)

  0%|          | 0/13 [00:00<?, ?it/s]

100%|██████████| 13/13 [00:13<00:00,  1.07s/it]

['TMO', 2014, (16889600000.0, 1894400000.0)]
['TMO', 2015, (16965000000.0, 1975000000.0)]
['TMO', 2016, (36548000000.0, 2022000000.0)]
['TMO', 2017, (41836000000.0, 2225000000.0)]
['TMO', 2018, (24358000000.0, 2938000000.0)]
['BKNG', 2016, (10743000000.0, 2135000000.0)]
['BKNG', 2017, (12681000000.0, 2341000000.0)]
['BKNG', 2018, (14527000000.0, 3998000000.0)]
['BKNG', 2019, (15066000000.0, 4865000000.0)]
['BKNG', 2020, (6796000000.0, 59000000.0)]
['BKNG', 2021, (10958000000.0, 1165000000.0)]
['BKNG', 2022, (17090000000.0, 3058000000.0)]
['BKNG', 2023, (21365000000.0, 4289000000.0)]





In [50]:
year = 2015
cik = "KKR"
company = Company(cik)
filing = company.get_filings(form="10-K", date=f"{year}-04-01:{year+1}-12-31")[-1]
df = company.get_facts().to_pandas()
df = df[df["namespace"] == "us-gaap"]
url = f"https://www.sec.gov/Archives/edgar/data/{filing.cik}/{filing.accession_no.replace('-', '')}/{filing.primary_document}"

result = extract_financials(cik, year)
result

(1043768000.0, 5763514000.0)

In [48]:
income_statement = (
    XBRLS.from_filings(
        Company(cik).get_filings(form="10-K", date=f"{year}-04-01:{year+1}-12-31")
    )
    .statements.income_statement()
    .to_dataframe()
)
income_statement

Unnamed: 0,label,concept,2015-12-31
0,Income (Loss) Before Taxes,us-gaap_IncomeLossFromContinuingOperationsBefo...,5341668000.0
1,Income Tax / (Benefit),us-gaap_IncomeTaxExpenseBenefit,66636000.0
2,Net Income,us-gaap_ProfitLoss,488482000.0
3,,,
4,Basic (in dollars per unit),us-gaap_NetIncomeLossPerOutstandingLimitedPart...,1.09
5,Basic (in units),us-gaap_WeightedAverageLimitedPartnershipUnits...,448884200.0
6,Compensation and Benefits,us-gaap_LaborAndRelatedExpense,1180591000.0
7,Diluted (in dollars per unit),us-gaap_NetIncomeLossNetOfTaxPerOutstandingLim...,1.01
8,Diluted (in units),us-gaap_WeightedAverageLimitedPartnershipUnits...,482699200.0
9,Dividend Income,us-gaap_InvestmentIncomeDividend,850527000.0


In [39]:
[f for f in df["fact"].unique() if "Sale" in f]

['AccumulatedOtherComprehensiveIncomeLossAvailableForSaleSecuritiesAdjustmentNetOfTax',
 'AvailableForSaleDebtSecuritiesAmortizedCostBasis',
 'AvailableForSaleSecurities',
 'AvailableForSaleSecuritiesAccumulatedGrossUnrealizedGainBeforeTax',
 'AvailableForSaleSecuritiesAccumulatedGrossUnrealizedLossBeforeTax',
 'AvailableForSaleSecuritiesAmortizedCost',
 'AvailableForSaleSecuritiesDebtSecuritiesCurrent',
 'AvailableForSaleSecuritiesGrossRealizedGains',
 'AvailableForSaleSecuritiesGrossUnrealizedLosses1',
 'DeferredTaxAssetsUnrealizedLossesOnAvailableforSaleSecuritiesGross',
 'FairValueMeasurementWithUnobservableInputsReconciliationRecurringBasisAssetPurchasesSalesIssuancesSettlements',
 'GainLossOnSaleOfPropertyPlantEquipment',
 'GoodwillWrittenOffRelatedToSaleOfBusinessUnit',
 'OtherComprehensiveIncomeAvailableForSaleSecuritiesAdjustmentNetOfTaxPeriodIncreaseDecrease',
 'OtherComprehensiveIncomeAvailableForSaleSecuritiesTax',
 'OtherComprehensiveIncomeLossForeignCurrencyTransactionAnd