## Filings


In [1]:
"""
複数企業・複数期間でSEC Filingsを取得するコード
対応フォーム: 10-K, 10-Q, 8-K
"""

%load_ext autoreload
%autoreload 2


from edgar import *
from edgar.files.html import Document
from edgar.files.docs.filing_document import Table
from edgar.documents import HTMLParser, ParserConfig
import pandas as pd
from datetime import datetime
from pathlib import Path
import os, sys
from dotenv import load_dotenv

load_dotenv()
ROOT_DIR = Path(os.environ.get('ROOT_DIR'))
sys.path.append(str(ROOT_DIR))

import src.edgar_utils as edgar_utils

set_identity("youxitiankaggle.com")

# === 設定パラメータ ===
# COMPANIES = ["AAPL", "MSFT", "NVDA", "GOOGL", "TSLA"]
COMPANIES = ["AAPL"]
FORM_TYPES = ["10-K", "10-Q", "8-K"]
START_DATE = "2022-01-01"
END_DATE = "2024-12-31"
LIMIT_PER_FORM = 5  # フォームタイプごとの最大取得件数


In [None]:
# === Filingsの一括取得 ===
all_filings_data = []

print(f"=== {len(COMPANIES)}社の SEC Filings を取得開始 ===")
print(f"期間: {START_DATE} ~ {END_DATE}")
print(f"対象フォーム: {', '.join(FORM_TYPES)}\n")

for ticker in COMPANIES:
    print(f"[{ticker}] のFilingsを取得中...")

    try:
        company = Company(ticker)

        for form_type in FORM_TYPES:
            filings = company.get_filings(form=form_type).filter(
                date=f"{START_DATE}:{END_DATE}"
            )

            count = 0
            for filing in filings:
                if count >= LIMIT_PER_FORM:
                    break

                filing_info = {
                    "ticker": ticker,
                    "company_name": company.name,
                    "form_type": filing.form,
                    "filing_date": filing.filing_date,
                    "accession_number": filing.accession_no,
                    "filing_url": filing.homepage_url,  # 修正: homepage_url を使用
                }

                all_filings_data.append(filing_info)
                count += 1

            print(f"  {form_type}: {count}件取得")

    except Exception as e:
        print(f"  ❌ エラー: {e}")
        continue

print(f"\n=== 取得完了: 合計 {len(all_filings_data)}件 ===\n")

# === 結果をDataFrameに変換 ===
filings_df = pd.DataFrame(all_filings_data)

if not filings_df.empty:
    filings_df = filings_df.sort_values(
        ["ticker", "filing_date"], ascending=[True, False]
    )

    print("=== 取得データのサマリー ===")
    print(f"\n企業別の取得件数:")
    print(filings_df.groupby("ticker").size())

    print(f"\nフォームタイプ別の取得件数:")
    print(filings_df.groupby("form_type").size())

    print(f"\n最初の10件:")
    display(filings_df.head(10))

    # CSVに保存（オプション）
    # output_file = f"sec_filings_{datetime.now().strftime('%Y%m%d')}.csv"
    # filings_df.to_csv(output_file, index=False)
    # print(f"\n✅ データを {output_file} に保存しました")
else:
    print("⚠️ データが取得できませんでした")


=== 1社の SEC Filings を取得開始 ===
期間: 2022-01-01 ~ 2024-12-31
対象フォーム: 10-K, 10-Q, 8-K

[AAPL] のFilingsを取得中...
  10-K: 3件取得
  10-Q: 5件取得
  8-K: 5件取得

=== 取得完了: 合計 13件 ===

=== 取得データのサマリー ===

企業別の取得件数:
ticker
AAPL    13
dtype: int64

フォームタイプ別の取得件数:
form_type
10-K    3
10-Q    5
8-K     5
dtype: int64

最初の10件:


Unnamed: 0,ticker,company_name,form_type,filing_date,accession_number,filing_url
0,AAPL,Apple Inc.,10-K,2024-11-01,0000320193-24-000123,https://www.sec.gov/Archives/edgar/data/320193...
8,AAPL,Apple Inc.,8-K,2024-10-31,0000320193-24-000120,https://www.sec.gov/Archives/edgar/data/320193...
9,AAPL,Apple Inc.,8-K,2024-09-10,0001140361-24-040659,https://www.sec.gov/Archives/edgar/data/320193...
10,AAPL,Apple Inc.,8-K,2024-08-26,0001140361-24-038601,https://www.sec.gov/Archives/edgar/data/320193...
11,AAPL,Apple Inc.,8-K,2024-08-23,0001140361-24-038403,https://www.sec.gov/Archives/edgar/data/320193...
3,AAPL,Apple Inc.,10-Q,2024-08-02,0000320193-24-000081,https://www.sec.gov/Archives/edgar/data/320193...
12,AAPL,Apple Inc.,8-K,2024-08-01,0000320193-24-000080,https://www.sec.gov/Archives/edgar/data/320193...
4,AAPL,Apple Inc.,10-Q,2024-05-03,0000320193-24-000069,https://www.sec.gov/Archives/edgar/data/320193...
5,AAPL,Apple Inc.,10-Q,2024-02-02,0000320193-24-000006,https://www.sec.gov/Archives/edgar/data/320193...
1,AAPL,Apple Inc.,10-K,2023-11-03,0000320193-23-000106,https://www.sec.gov/Archives/edgar/data/320193...


In [None]:
filings = edgar_utils.get_filings_single_ticker(
    "AAPL", "10-K", "2024-01-01", "2024-12-31"
)

for filing in filings:
    tenk = filing.obj()

    item_1a = tenk["Item 1A"]  # type: ignore
    tenk_business = tenk.business  # type: ignore
    tenk_risk_factors = tenk.risk_factors  # type: ignore

    html_content = tenk._filing.html()  # type: ignore
    # フォームタイプを明示的に指定する
    # 新しいパーサーを使用
    # config = ParserConfig(form="10-K")
    # parser = HTMLParser(config)
    # document = parser.parse(html_content)
    # print(document.sections)

    # for section in document.sections:
    #     print(f"{'='*20} Section: {section} {'='*20}")
    #     item_section = document.sections.get(section)
    #     if item_section:
    #         text = item_section.text()
    #         print(text)
    #         tables = item_section.tables()

    #         for table_node in tables:
    #             try:
    #                 df = table_node.to_dataframe()
    #                 if not df.empty:
    #                     display(df.head())
    #             except AttributeError:
    #                 print(f"TableNode does not have to_dataframe() method")

    # 古いパーサーを使用
    document = Document.parse(html_content)
    print(document.sections)
    tables = document.tables
    for table_node in tables:
        table = Table(table_node=table_node)
        df = table.to_dataframe()
        # if not df.empty:
        #     display(df.head())

    # text = item_section.text()  # type: ignore
    # tables = item_section.tables()  # type: ignore
    # for table_node in tables:
    #     df = Table(table_node).to_dataframe()
    #     if not df.empty:
    #         display(df.head())
    #         print(text)

    # management_discussion = tenk.management_discussion
    # # print(management_discussion)
    # # print(type(management_discussion))

    # all_tables = tenk.chunked_document.tables()
    # # for table in all_tables:
    # #     print(table)

    # html_content = tenk._filing.html()
    # document = Document.parse(html_content)
    # tables = document.tables

    # for table_node in tables:
    #     table = Table(table_node=table_node)
    #     df = table.to_dataframe()
    #     display(df)


AttributeError: 'Document' object has no attribute 'sections'

In [None]:
"""
複数企業の10-Kから特定のItem（セクション）を一括取得
"""

COMPANIES_BATCH = ["AAPL", "MSFT", "NVDA", "GOOGL"]
items_results = []

print("=== 複数企業の10-K Itemsを一括取得 ===\n")

for ticker in COMPANIES_BATCH:
    print(f"[{ticker}] を処理中...")

    try:
        company = Company(ticker)
        latest_10k = company.get_filings(form="10-K").latest()

        print(f"  提出日: {latest_10k.filing_date}")

        # 10-Kオブジェクトを取得
        tenk = latest_10k.obj()

        print("10-Kセクション")

        # Item 1: Business
        business_text = tenk.business if hasattr(tenk, "business") else None

        # Item 1A: Risk Factors
        risk_text = tenk.risk_factors if hasattr(tenk, "risk_factors") else None

        # Item 7: MD&A
        mda_text = (
            tenk.management_discussion
            if hasattr(tenk, "management_discussion")
            else None
        )

        result = {
            "ticker": ticker,
            "company_name": company.name,
            "filing_date": latest_10k.filing_date,
            "business_length": len(business_text) if business_text else 0,
            "risk_factors_length": len(risk_text) if risk_text else 0,
            "mda_length": len(mda_text) if mda_text else 0,
            "business_text": business_text,
            "risk_factors_text": risk_text,
            "mda_text": mda_text,
            "url": latest_10k.homepage_url,
        }

        items_results.append(result)
        print(
            f"  ✓ 完了 (Business: {result['business_length']:,}文字, Risk: {result['risk_factors_length']:,}文字, MD&A: {result['mda_length']:,}文字)\n"
        )

    except Exception as e:
        print(f"  ❌ エラー: {e}\n")
        continue

# 結果をDataFrameに変換
items_batch_df = pd.DataFrame(items_results)

print("\n=== 取得結果のサマリー ===")
display(
    items_batch_df[
        [
            "ticker",
            "company_name",
            "filing_date",
            "business_length",
            "risk_factors_length",
            "mda_length",
        ]
    ]
)

# 特定企業のテキストを確認（例：AAPL）
if not items_batch_df.empty:
    print("\n=== AAPL のBusiness セクションのプレビュー ===")
    aapl_row = items_batch_df[items_batch_df["ticker"] == "AAPL"]
    if not aapl_row.empty and aapl_row.iloc[0]["business_text"]:
        business_preview = aapl_row.iloc[0]["business_text"][:500]
        print(business_preview + "...")

# CSVに保存する場合（テキスト列を除外してサマリーのみ保存）
# summary_df = items_batch_df[['ticker', 'company_name', 'filing_date', 'business_length', 'risk_factors_length', 'mda_length', 'url']]
# summary_df.to_csv(f"10k_items_summary_{datetime.now().strftime('%Y%m%d')}.csv", index=False)
# print(f"\n✅ サマリーをCSVに保存しました")


=== 複数企業の10-K Itemsを一括取得 ===

[AAPL] を処理中...
  提出日: 2025-10-31
10-Kセクション
  ❌ エラー: 'TenK' object has no attribute 'sections'

[MSFT] を処理中...
  提出日: 2025-07-30
10-Kセクション
  ❌ エラー: 'TenK' object has no attribute 'sections'

[NVDA] を処理中...
  提出日: 2025-02-26
10-Kセクション
  ❌ エラー: 'TenK' object has no attribute 'sections'

[GOOGL] を処理中...
  提出日: 2025-02-05
10-Kセクション
  ❌ エラー: 'TenK' object has no attribute 'sections'


=== 取得結果のサマリー ===


KeyError: "None of [Index(['ticker', 'company_name', 'filing_date', 'business_length',\n       'risk_factors_length', 'mda_length'],\n      dtype='object')] are in the [columns]"

## 複数企業の 10-K から特定 Item を一括取得

複数企業の 10-K から Item 1 (Business)と Item 1A (Risk Factors)を一括で取得する。


In [None]:
"""
AAPLの最新10-Kから各Item（セクション）のテキストを取得する
"""

from edgar import Company

# AAPLのCompanyオブジェクトを取得
apple = Company("AAPL")

# 最新の10-Kを取得
latest_10k_filing = apple.get_filings(form="10-K").latest()

print(f"企業名: {apple.name}")
print(f"フォーム: {latest_10k_filing.form}")
print(f"提出日: {latest_10k_filing.filing_date}")
print(f"Accession Number: {latest_10k_filing.accession_no}")
print(f"URL: {latest_10k_filing.homepage_url}\n")

# 10-Kオブジェクトを取得（パース処理のため時間がかかる）
print("10-Kを解析中...")
tenk = latest_10k_filing.obj()

print("\n=== 利用可能な主要セクション（Items） ===\n")

# 各Itemのテキストを取得して表示
items_data = []

# Item 1: Business
if hasattr(tenk, "business"):
    business_text = tenk.business
    print(f"✓ Item 1 - Business")
    print(f"  文字数: {len(business_text):,}")
    print(f"  プレビュー: {business_text[:150]}...\n")

    items_data.append(
        {
            "item": "Item 1 - Business",
            "text_length": len(business_text),
            "text": business_text,
        }
    )

# Item 1A: Risk Factors
if hasattr(tenk, "risk_factors"):
    risk_text = tenk.risk_factors
    print(f"✓ Item 1A - Risk Factors")
    print(f"  文字数: {len(risk_text):,}")
    print(f"  プレビュー: {risk_text[:150]}...\n")

    items_data.append(
        {
            "item": "Item 1A - Risk Factors",
            "text_length": len(risk_text),
            "text": risk_text,
        }
    )

# Item 7: Management's Discussion and Analysis (MD&A)
if hasattr(tenk, "management_discussion"):
    mda_text = tenk.management_discussion
    print(f"✓ Item 7 - Management's Discussion and Analysis")
    print(f"  文字数: {len(mda_text):,}")
    print(f"  プレビュー: {mda_text[:150]}...\n")

    items_data.append(
        {"item": "Item 7 - MD&A", "text_length": len(mda_text), "text": mda_text}
    )

# すべての利用可能な属性を確認（デバッグ用）
print("\n=== 10-Kオブジェクトの全属性 ===")
available_attrs = [attr for attr in dir(tenk) if not attr.startswith("_")]
print(f"利用可能な属性数: {len(available_attrs)}")
print(f"主な属性: {available_attrs[:20]}")

# 結果をDataFrameで整理
items_df = pd.DataFrame(items_data)
print("\n=== 取得したItemsのサマリー ===")
display(items_df[["item", "text_length"]])


企業名: Apple Inc.
フォーム: 10-K
提出日: 2025-10-31
Accession Number: 0000320193-25-000079
URL: https://www.sec.gov/Archives/edgar/data/320193/0000320193-25-000079-index.html

10-Kを解析中...

=== 利用可能な主要セクション（Items） ===

✓ Item 1 - Business
  文字数: 16,004
  プレビュー: Item 1.    Business
Company Background
The Company designs, manufactures and markets smartphones, personal computers, tablets, wearables and accessori...

✓ Item 1A - Risk Factors
  文字数: 68,069
  プレビュー: Item 1A.    Risk Factors
The following summarizes factors that could have a material adverse effect on the Company’s business, reputation, results of ...

✓ Item 7 - Management's Discussion and Analysis
  文字数: 21,009
  プレビュー: Item 7.    Management’s Discussion and Analysis of Financial Condition and Results of Operations
The following discussion should be read in conjunctio...


=== 10-Kオブジェクトの全属性 ===
利用可能な属性数: 20
主な属性: ['balance_sheet', 'business', 'cash_flow_statement', 'chunked_document', 'company', 'directors_officers_and_governance', 

Unnamed: 0,item,text_length
0,Item 1 - Business,16004
1,Item 1A - Risk Factors,68069
2,Item 7 - MD&A,21009


## 10-K の各 Item（セクション）のテキストを取得

10-K は以下の主要な Item で構成される：

-   Item 1: Business（事業内容）
-   Item 1A: Risk Factors（リスク要因）
-   Item 7: Management's Discussion and Analysis (MD&A)
-   Item 8: Financial Statements（財務諸表）
-   その他の Item
