In [1]:
import json
import logging
import warnings
import requests
import pandas as pd
from edgar import *
from tqdm import tqdm
from urllib3 import Retry
from edgar.xbrl import XBRLS
from datetime import datetime
from requests.adapters import HTTPAdapter
from bs4 import BeautifulSoup


warnings.simplefilter("ignore")
logging.getLogger().setLevel(logging.WARNING)
logging.getLogger("openai").setLevel(logging.ERROR)
logging.getLogger("urllib3").setLevel(logging.ERROR)
request = requests.Session()
retries = Retry(
    total=3,  # 最多重試 5 次
    backoff_factor=1,  # 每次重試的延遲時間指數增長（1s, 2s, 4s, 8s...）
    status_forcelist=[403, 429, 500, 502, 503, 504],  # 針對這些 HTTP 狀態碼進行重試
)
request.mount("http://", HTTPAdapter(max_retries=retries))
request.mount("https://", HTTPAdapter(max_retries=retries))
headers = {"User-Agent": "ansa ansa1019@gmail.com"}


# 公司名單
companies_range = 100
df_companies = pd.read_csv("sp500_companies.csv")
companies = df_companies.drop_duplicates(subset=["Shortname"], keep="first")[
    "Symbol"
].values[:companies_range]
with open("companies.json", "r", encoding="utf-8") as f:
    data = json.load(f)
missing_companies = [entry["ticker"] for entry in data["missing_companies"]]
find_cik = {entry["ticker"]: entry["cik"] for entry in data["find_cik"]}
companies = [c for c in companies if c not in missing_companies]

# 參數設定
set_identity("ansa ansa1019@gmail.com")
search_queries = ["IT capability", "organizational resilience"]
keywords = {q: [q] for q in search_queries}
report_item = {"paper_7": ["7"], "paper_17": ["1A", "7"]}
report_papers = "papers.json"
financials_file = "financials.csv"
report_year = [2014, 2023]
minlen = 1500
keyword_num = 30

In [None]:
# def
fact_list = []


def extract_financials(cik, year):
    company = Company(cik)
    df = company.get_facts().to_pandas()
    df = df[(df["form"] == "10-K") & (df["namespace"] == "us-gaap")]
    filings = company.get_filings(form=["10-K"], date=f"{year}-01-01:{year+1}-12-31")
    if not filings:
        return None
    filing = None
    for f in filings:
        report_date = datetime.fromisoformat(f.report_date)
        report_year = (
            report_date.year - 1 if report_date.month <= 3 else report_date.year
        )
        if report_year == year:
            filing = f
    if not filing:
        return None
    try:
        income = (
            XBRLS.from_filings(filings).statements.income_statement().to_dataframe()
        )
    except:
        income = None
    url = f"https://www.sec.gov/Archives/edgar/data/{filing.cik}/{filing.accession_no.replace('-', '')}/{filing.primary_document}"

    def safe_extract(target):
        # 定義優先順序的 fact 欄位
        facts = {
            "Revenues": [
                # 高優先：標準與 ASC 606 定義
                "Revenues",
                "RevenueFromContractWithCustomerExcludingAssessedTax",
                "RevenueFromContractWithCustomerIncludingAssessedTax",
                # 中優先：銷售細分類（商品、服務）
                "SalesRevenueNet",
                "SalesRevenueGoodsNet",
                "SalesRevenueServicesNet",
                # 特定產業
                # "RevenuesNetOfInterestExpense",
                # "RealEstateRevenueNet",
                # "RegulatedAndUnregulatedOperatingRevenue",
                # "OperatingRevenue",
                # 最後備援：模擬資料
                # "BusinessAcquisitionsProFormaRevenue",
            ],
            "Income": [
                # 高優先：標準最終淨利
                "NetIncomeLoss",
                "ProfitLoss",
                # 中優先：普通股基本淨利
                # "NetIncomeLossAvailableToCommonStockholdersBasic",
            ],
        }

        for fact in facts[target]:
            # 優先從 income_statement 抓取
            if fact in income.columns:
                row = income[income["concept"] == fact]
                for col in income.columns[2:]:
                    col_date = datetime.fromisoformat(col)
                    report_year = (
                        col_date.year if col_date.month > 3 else col_date.year - 1
                    )
                    if report_year == year:
                        val = row[col].values[0]
                        if pd.notna(val):
                            return float(val), fact

        for fact in facts[target]:
            # fact frame
            col = df[(df["fact"] == fact) & (df["frame"] == f"CY{year}")]
            if not col.empty:
                val = col.sort_values("filed", ascending=False)["val"].iloc[0]
                val = re.sub("[—−]", "-", re.sub(r"[\s,]+", "", str(val)))
                if val:
                    return float(val), fact

            # fact end 以年度開頭
            col = df[(df["fact"] == fact) & (df["end"].str.startswith(str(year)))]
            if not col.empty:
                val = col.sort_values("filed", ascending=False)["val"].iloc[0]
                val = re.sub("[—−]", "-", re.sub(r"[\s,]+", "", str(val)))
                if val:
                    return float(val), fact

        # 解析 XBRL
        try:
            response = request.get(url, headers={"User-Agent": "ansa1019@gmail.com"})
            soup = BeautifulSoup(response.content, "lxml")
            for fact in facts[target]:
                result = []
                for tag in soup.find_all("ix:nonfraction", {"name": f"us-gaap:{fact}"}):
                    contextref = tag.get("contextref")
                    context = soup.find(id=contextref)
                    end_tag = context.find("xbrli:enddate")
                    report_date = datetime.fromisoformat(end_tag.text.strip())
                    report_year = (
                        report_date.year - 1
                        if report_date.month <= 3
                        else report_date.year
                    )
                    if report_year == year:
                        if context.find("xbrli:segment") is not None:
                            val = re.sub("[—−]", "-", re.sub(r"[\s,]+", "", tag.text))
                            if val:
                                result.append(float(val))
                if result:
                    return max(result), "XBRL"
        except Exception as e:
            print(f"❌ Inline XBRL 抓取失敗：{e}")
        return None

    revenues, fact = safe_extract("Revenues")
    fact_list.append(fact)
    income, fact = safe_extract("Income")
    fact_list.append(fact)
    return (revenues, income)

In [27]:
!lsblk

NAME                     MAJ:MIN RM   SIZE RO TYPE  MOUNTPOINTS
loop0                      7:0    0  73.9M  1 loop  /snap/core22/1908
loop1                      7:1    0  63.7M  1 loop  
loop3                      7:3    0    87M  1 loop  /snap/lxd/29351
loop4                      7:4    0    51M  1 loop  /snap/cmake/1461
loop5                      7:5    0  63.8M  1 loop  /snap/core20/2501
loop6                      7:6    0  89.4M  1 loop  /snap/lxd/31333
loop7                      7:7    0     4K  1 loop  /snap/bare/5
loop8                      7:8    0 177.5M  1 loop  /snap/chromium/3117
loop9                      7:9    0  91.7M  1 loop  /snap/gtk-common-themes/1535
loop10                     7:10   0   516M  1 loop  /snap/gnome-42-2204/202
loop11                     7:11   0 505.1M  1 loop  /snap/gnome-42-2204/176
loop12                     7:12   0  67.2M  1 loop  /snap/cups/1100
loop13                     7:13   0  66.6M  1 loop  
loop14                     7:14   0  50.9M  1 l

In [None]:
!sudo apt install testdisk -dnlabansa
!sudo photorec

[sudo] password for ansa1019: 
sudo: a password is required
^C
[sudo] password for ansa1019: 

In [12]:
# 財務資料
log = ""
financial_data = {}
loop = tqdm(companies)
for ticker in loop:
    searchs = [ticker] + find_cik[ticker] if ticker in find_cik else [ticker]
    for search in searchs:
        try:
            for year in range(report_year[0], report_year[1] + 1):
                loop.set_description(f"正在抓取 {ticker} {year} 年的財務資料...")
                financial_data.setdefault(ticker, {}).setdefault(
                    year, {"Revenues": None, "Net Income": None}
                )
                if (
                    financial_data[ticker][year]["Revenues"] is not None
                    and financial_data[ticker][year]["Net Income"] is not None
                ):
                    continue
                result = extract_financials(search, year)
                if result is not None:
                    revenues, income = result
                    if revenues is not None:
                        financial_data[ticker][year]["Revenues"] = revenues
                    if income is not None:
                        financial_data[ticker][year]["Net Income"] = income
        except Exception as e:
            log += f"❌ 無法取得 {ticker} {year} 年的數據: {e}\n"

# 轉為 DataFrame 並輸出
flat_data = []
for ticker, years in financial_data.items():
    for year, values in years.items():
        row = {
            "ticker": ticker,
            "year": year,
            "Revenues": values.get("Revenues"),
            "Net Income": values.get("Net Income"),
        }
        flat_data.append(row)
df = pd.DataFrame(flat_data)
df.to_csv(financials_file, index=False)
print(f"📁 財務資料已保存到: {financials_file}")

if log:
    print("⚠️ 錯誤紀錄：\n" + log)

正在抓取 LRCX 2023 年的財務資料...: 100%|██████████| 70/70 [09:44<00:00,  8.35s/it] 

📁 財務資料已保存到: financials.csv
⚠️ 錯誤紀錄：
❌ 無法取得 TMO 2014 年的數據: cannot unpack non-iterable NoneType object
❌ 無法取得 GS 2014 年的數據: cannot unpack non-iterable NoneType object
❌ 無法取得 BKNG 2016 年的數據: cannot unpack non-iterable NoneType object
❌ 無法取得 NEE 2014 年的數據: cannot unpack non-iterable NoneType object
❌ 無法取得 KKR 2014 年的數據: cannot unpack non-iterable NoneType object
❌ 無法取得 PLD 2014 年的數據: cannot unpack non-iterable NoneType object






In [11]:
set(fact_list)

{'BusinessAcquisitionsProFormaRevenue',
 'NetIncomeLoss',
 'NetIncomeLossAvailableToCommonStockholdersBasic',
 'ProfitLoss',
 'RealEstateRevenueNet',
 'RegulatedAndUnregulatedOperatingRevenue',
 'RevenueFromContractWithCustomerExcludingAssessedTax',
 'RevenueFromContractWithCustomerIncludingAssessedTax',
 'Revenues',
 'RevenuesNetOfInterestExpense',
 'SalesRevenueGoodsNet',
 'SalesRevenueNet',
 'SalesRevenueServicesNet'}

In [17]:
# 檢查缺值
df = pd.read_csv(financials_file)
df[df.isnull().any(axis=1)]

Unnamed: 0,ticker,year,Revenues,Net Income
280,TMO,2014,,
341,GS,2014,,
364,BKNG,2016,,
385,NEE,2014,,
456,KKR,2014,,
637,PLD,2014,,


In [24]:
year = 2014
cik = "TMO"
company = Company(cik)
filing = company.get_filings(form="10-K", date=f"{year}-04-01:{year+1}-12-31")[-1]
df = company.get_facts().to_pandas()
df = df[df["namespace"] == "us-gaap"]
# url = f"https://www.sec.gov/Archives/edgar/data/{filing.cik}/{filing.accession_no.replace('-', '')}/{filing.primary_document}"

# result = extract_financials(cik, year)
# result

In [None]:
[f for f in df["fact"].unique() if "Sale" in f]

['AccumulatedOtherComprehensiveIncomeLossAvailableForSaleSecuritiesAdjustmentNetOfTax',
 'AvailableForSaleDebtSecuritiesAmortizedCostBasis',
 'AvailableForSaleSecurities',
 'AvailableForSaleSecuritiesAccumulatedGrossUnrealizedGainBeforeTax',
 'AvailableForSaleSecuritiesAccumulatedGrossUnrealizedLossBeforeTax',
 'AvailableForSaleSecuritiesAmortizedCost',
 'AvailableForSaleSecuritiesDebtSecuritiesCurrent',
 'AvailableForSaleSecuritiesGrossRealizedGains',
 'AvailableForSaleSecuritiesGrossUnrealizedLosses1',
 'DeferredTaxAssetsUnrealizedLossesOnAvailableforSaleSecuritiesGross',
 'FairValueMeasurementWithUnobservableInputsReconciliationRecurringBasisAssetPurchasesSalesIssuancesSettlements',
 'GainLossOnSaleOfPropertyPlantEquipment',
 'GoodwillWrittenOffRelatedToSaleOfBusinessUnit',
 'OtherComprehensiveIncomeAvailableForSaleSecuritiesAdjustmentNetOfTaxPeriodIncreaseDecrease',
 'OtherComprehensiveIncomeAvailableForSaleSecuritiesTax',
 'OtherComprehensiveIncomeLossForeignCurrencyTransactionAnd