In [1]:
from datasets import load_dataset

# Stream the dataset to avoid loading everything in memory
ds = load_dataset("eloukas/edgar-corpus", "full", split="train")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# it never holds the whole dataset in memory, and the gzip keeps the file small.
import json, gzip, os

AIG_CIK = "0000005272"   # AIG
OUT_PATH = "aig_edgar.jsonl.gz"  # compact on-disk buffer for Spark

# # Stream the full split; no huge RAM spikes
# ds = load_dataset("eloukas/edgar-corpus", "full", split="train", streaming=True)

# Write only matching rows to newline-delimited JSON (gzipped)
count = 0
with gzip.open(OUT_PATH, "wt", encoding="utf-8") as f:
    for row in ds:
        # rows have keys like: filename, cik, year, section_1, section_1A, ...
        if str(row.get("cik", "")).zfill(10) == AIG_CIK:
            f.write(json.dumps(row, ensure_ascii=False) + "\n")
            count += 1

print(f"Wrote {count} AIG rows to {OUT_PATH}")

Wrote 22 AIG rows to aig_edgar.jsonl.gz


In [3]:
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("AIG-EDGAR")
    # tweak as you like; driver mem helps if you inspect a lot at once
    .config("spark.driver.memory", "6g")
    .getOrCreate()
)

# Read the gzipped JSONL directly
aig_df = spark.read.json(OUT_PATH)

# (Optional) normalize CIK to 10-digit string for consistency
from pyspark.sql.functions import lpad, col
aig_df = aig_df.withColumn("cik", lpad(col("cik").cast("string"), 10, "0"))

# Inspect a few rows
aig_df.select("filename", "cik", "year").show(10, truncate=False)

# Persist to Parquet (columnar, splittable, great for Spark)
PARQUET_DIR = "parquet_aig_edgar"
aig_df.write.mode("overwrite").parquet(PARQUET_DIR)

print(f"Saved AIG subset to {PARQUET_DIR}")


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/08 21:57:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


+-------------+----------+----+
|filename     |cik       |year|
+-------------+----------+----+
|5272_1994.txt|0000005272|1994|
|5272_1995.txt|0000005272|1995|
|5272_1998.txt|0000005272|1998|
|5272_1999.txt|0000005272|1999|
|5272_2000.txt|0000005272|2000|
|5272_2001.txt|0000005272|2001|
|5272_2003.htm|0000005272|2003|
|5272_2004.htm|0000005272|2004|
|5272_2005.htm|0000005272|2005|
|5272_2006.htm|0000005272|2006|
+-------------+----------+----+
only showing top 10 rows
Saved AIG subset to parquet_aig_edgar


                                                                                

In [4]:
aig_df.toPandas().head(2)

Unnamed: 0,cik,filename,section_1,section_10,section_11,section_12,section_13,section_14,section_15,section_1A,...,section_4,section_5,section_6,section_7,section_7A,section_8,section_9,section_9A,section_9B,year
0,5272,5272_1994.txt,ITEM 1. BUSINESS\nAmerican International Group...,ITEM 10. DIRECTORS AND EXECUTIVE OFFICERS OF T...,ITEM 11. EXECUTIVE COMPENSATION\nThis item is ...,ITEM 12. SECURITY OWNERSHIP OF CERTAIN BENEFIC...,ITEM 13. CERTAIN RELATIONSHIPS AND RELATED TRA...,"ITEM 14. EXHIBITS, FINANCIAL STATEMENT SCHEDUL...",,,...,ITEM 4. SUBMISSION OF MATTERS TO A VOTE OF SEC...,ITEM 5. MARKET FOR THE REGISTRANT'S COMMON STO...,ITEM 6. SELECTED FINANCIAL DATA\nAMERICAN INTE...,ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS O...,,ITEM 8. FINANCIAL STATEMENTS AND SUPPLEMENTARY...,ITEM 9. CHANGES IN AND DISAGREEMENTS WITH ACCO...,,,1994
1,5272,5272_1995.txt,ITEM 1. BUSINESS\nAmerican International Group...,ITEM 10. DIRECTORS AND EXECUTIVE OFFICERS OF T...,ITEM 11. EXECUTIVE COMPENSATION\nThis item is ...,ITEM 12. SECURITY OWNERSHIP OF CERTAIN BENEFIC...,ITEM 13. CERTAIN RELATIONSHIPS AND RELATED TRA...,"ITEM 14. EXHIBITS, FINANCIAL STATEMENT SCHEDUL...",,,...,ITEM 4. SUBMISSION OF MATTERS TO A VOTE OF SEC...,ITEM 5. MARKET FOR THE REGISTRANT'S COMMON STO...,ITEM 6. SELECTED FINANCIAL DATA AMERICAN INTER...,ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS O...,,ITEM 8. Financial Statements and Supplementary...,ITEM 9. CHANGES IN AND DISAGREEMENTS WITH ACCO...,,,1995


In [5]:
import os
import google.generativeai as genai
api_key = "AIzaSyDz3kL0XL7QogHsDPh_g596Raj2CbpyMmQ"
os.environ["GOOGLE_API_KEY"] = api_key # or set in your shell
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

In [6]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
dim = len(embeddings.embed_query("dimension probe"))
print("Embedding dimension:", dim)

Embedding dimension: 768


In [14]:
from langchain_community.embeddings import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
dim = len(embeddings.embed_query("dimension probe"))
print("Embedding dimension:", dim)

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


Embedding dimension: 384


In [15]:
df = aig_df.toPandas()

In [16]:
df.head(2)

Unnamed: 0,cik,filename,section_1,section_10,section_11,section_12,section_13,section_14,section_15,section_1A,...,section_4,section_5,section_6,section_7,section_7A,section_8,section_9,section_9A,section_9B,year
0,5272,5272_1994.txt,ITEM 1. BUSINESS\nAmerican International Group...,ITEM 10. DIRECTORS AND EXECUTIVE OFFICERS OF T...,ITEM 11. EXECUTIVE COMPENSATION\nThis item is ...,ITEM 12. SECURITY OWNERSHIP OF CERTAIN BENEFIC...,ITEM 13. CERTAIN RELATIONSHIPS AND RELATED TRA...,"ITEM 14. EXHIBITS, FINANCIAL STATEMENT SCHEDUL...",,,...,ITEM 4. SUBMISSION OF MATTERS TO A VOTE OF SEC...,ITEM 5. MARKET FOR THE REGISTRANT'S COMMON STO...,ITEM 6. SELECTED FINANCIAL DATA\nAMERICAN INTE...,ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS O...,,ITEM 8. FINANCIAL STATEMENTS AND SUPPLEMENTARY...,ITEM 9. CHANGES IN AND DISAGREEMENTS WITH ACCO...,,,1994
1,5272,5272_1995.txt,ITEM 1. BUSINESS\nAmerican International Group...,ITEM 10. DIRECTORS AND EXECUTIVE OFFICERS OF T...,ITEM 11. EXECUTIVE COMPENSATION\nThis item is ...,ITEM 12. SECURITY OWNERSHIP OF CERTAIN BENEFIC...,ITEM 13. CERTAIN RELATIONSHIPS AND RELATED TRA...,"ITEM 14. EXHIBITS, FINANCIAL STATEMENT SCHEDUL...",,,...,ITEM 4. SUBMISSION OF MATTERS TO A VOTE OF SEC...,ITEM 5. MARKET FOR THE REGISTRANT'S COMMON STO...,ITEM 6. SELECTED FINANCIAL DATA AMERICAN INTER...,ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS O...,,ITEM 8. Financial Statements and Supplementary...,ITEM 9. CHANGES IN AND DISAGREEMENTS WITH ACCO...,,,1995


In [17]:
# pip install -U langchain langchain-community langchain-openai langchain-experimental faiss-cpu

import math
import pandas as pd
from typing import List, Iterable
from uuid import uuid4

from langchain.docstore.document import Document
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

# Prefer semantic chunking; fall back to character splitter if not available.
try:
    from langchain_experimental.text_splitter import SemanticChunker
    _HAS_SEM = True
except Exception:
    from langchain_text_splitters import RecursiveCharacterTextSplitter
    _HAS_SEM = False

In [18]:
def _make_splitter(emb):
    """
    Semantic splitter that finds natural breakpoints; falls back to
    a high-quality character-based splitter.
    """
    if _HAS_SEM:
        # Breakpoints chosen via embedding similarity changes.
        # 95th percentile is a good default for long 10-K sections.
        return SemanticChunker(
            emb,
            breakpoint_threshold_type="percentile",
            breakpoint_threshold_amount=95,
        )
    else:
        # Safe default if semantic chunker isn't available.
        return RecursiveCharacterTextSplitter(
            chunk_size=1200,
            chunk_overlap=150,
            add_start_index=True,
        )


def _is_nonempty_text(x) -> bool:
    if x is None:
        return False
    if isinstance(x, float) and math.isnan(x):
        return False
    return bool(str(x).strip())


def _iter_section_chunks(
    df: pd.DataFrame,
    splitter,
) -> Iterable[Document]:
    """Yield chunked Documents with rich metadata from a wide SEC sections DF."""
    # discover section columns dynamically
    section_cols: List[str] = [c for c in df.columns if c.startswith("section_")]

    for row in df.itertuples(index=False):
        filename = str(getattr(row, "filename"))
        cik = str(getattr(row, "cik"))
        year = int(getattr(row, "year"))

        for sec in section_cols:
            raw_text = getattr(row, sec)
            if not _is_nonempty_text(raw_text):
                continue

            text = str(raw_text).strip()
            # parent (section) identity
            section_id = f"{filename}#{sec}"
            section_title = text.splitlines()[0][:160] if text else ""

            # split into semantic chunks
            chunks: List[str] = splitter.split_text(text)

            for idx, chunk in enumerate(chunks):
                # stable per-chunk id (handy if you later want parent-child mapping)
                doc_id = f"{section_id}::chunk{idx}"
                meta = {
                    "doc_id": doc_id,            # unique id for this chunk
                    "section_id": section_id,    # parent section id
                    "section": sec,              # e.g., "section_10"
                    "section_title": section_title,
                    "filename": filename,
                    "cik": cik,
                    "year": year,
                    "chunk_index": idx,
                }
                yield Document(page_content=chunk, metadata=meta)


def build_faiss_from_sections(
    df: pd.DataFrame,
    embedding_model: str = "text-embedding-3-small",
) -> FAISS:
    """
    Build a FAISS vector store from a SEC sections dataframe.
    - One document per semantic chunk.
    - Rich metadata for filtering (section, file, cik, year, ids).
    """
    # embeddings = OpenAIEmbeddings(model=embedding_model)
    splitter = _make_splitter(embeddings)

    docs = list(_iter_section_chunks(df, splitter))
    if not docs:
        raise ValueError("No non-empty section text found to index.")

    vstore = FAISS.from_documents(docs, embeddings)
    return vstore

In [19]:
# ---------- usage example ----------
# df = ...  # your dataframe with columns: filename, cik, year, section_*
vectorstore = build_faiss_from_sections(df)

In [20]:
# Optional: persist to disk
vectorstore.save_local("faiss_edgar_sections_v1")

In [21]:
# ---------- querying examples ----------
# Plain similarity search
results = vectorstore.similarity_search("what is the declared a cash dividend?", k=5)

In [25]:
# Plain similarity search
results = vectorstore.similarity_search("what is the total revenue of aig in year 2016?", k=5)

In [26]:
results

[Document(id='f0d5be26-0bda-4745-b492-60f304c4ad36', metadata={'doc_id': '5272_2020.htm#section_8::chunk22', 'section_id': '5272_2020.htm#section_8', 'section': 'section_8', 'section_title': 'ITEM 8 | Financial Statements and Supplementary Data', 'filename': '5272_2020.htm', 'cik': '0000005272', 'year': 2020, 'chunk_index': 22}, page_content='AIG | 2020 Form 10-K 201\nITEM 8 | Notes to Consolidated Financial Statements | 3. Segment Information\nThe following table presents AIG’s year-end identifiable assets and capital expenditures by segment:\nThe following table presents AIG’s consolidated total revenues and real estate and other fixed assets, net of accumulated depreciation, by major geographic area:\n*Revenues are generally reported according to the geographic location of the segment. International revenues consists of revenues from our General Insurance International operating segment.'),
 Document(id='1dc9e95b-37a9-45df-b540-02cfbd9fd477', metadata={'doc_id': '5272_2003.htm#secti

In [None]:
# Metadata filtering (post-filter on retrieved docs)
# Only search results from section_10 for a specific file/year
results = vectorstore.similarity_search(
    "Describe the properties the company owns.",
    k=8,
    filter={"section": "section_2", "filename": "1047643_1997.txt", "year": 1997},
)

# See the metadata you can route on:
for d in results:
    print(d.metadata["section"], d.metadata["filename"], d.metadata["year"], d.metadata["doc_id"])


In [27]:
from langchain_community.retrievers import BM25Retriever

# Create a BM25Retriever for keyword search
bm25_retriever = BM25Retriever.from_documents(docs, k=2)

NameError: name 'docs' is not defined