In [1]:
# from pyspark.sql import SparkSession

# spark = (
#     SparkSession.builder
#     .appName("edgar-aig-sec10")
#     .master("local[*]")
#     # more headroom on the driver for local processing
#     .config("spark.driver.memory", "8g")
#     .config("spark.executor.memory", "4g")
#     .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
#     .config("spark.kryoserializer.buffer", "64m")
#     .config("spark.kryoserializer.buffer.max", "512m")
#     .config("spark.sql.shuffle.partitions", "64")  # avoid tiny default=200 on a laptop
#     .getOrCreate()
# )

In [2]:
from datasets import load_dataset
from pyspark.sql import Row

# Stream the dataset to avoid loading everything in memory
ds = load_dataset("eloukas/edgar-corpus", "year_1997", split="train")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# CIK = "718413"
# OUT_DIR = "/tmp/edgar_aig_sec10_parquet"  # change if needed

In [4]:
# Convert to Pandas DataFrame
df = ds.to_pandas()
df.head(2)

Unnamed: 0,filename,cik,year,section_1,section_1A,section_1B,section_2,section_3,section_4,section_5,...,section_8,section_9,section_9A,section_9B,section_10,section_11,section_12,section_13,section_14,section_15
0,820736_1997.txt,820736,1997,ITEM 1. BUSINESS\nBACKGROUND\nOrbital Sciences...,,,ITEM 2. PROPERTIES\nOrbital owns or leases ove...,"ITEM 3. LEGAL PROCEEDINGS\nOn October 10, 1996...",ITEM 4. SUBMISSION OF MATTERS TO A VOTE OF SEC...,ITEM 5. MARKET FOR REGISTRANT'S COMMON EQUITY ...,...,ITEM 8. FINANCIAL STATEMENTS AND SUPPLEMENTARY...,ITEM 9. CHANGES IN AND DISAGREEMENTS WITH ACCO...,,,ITEM 10. DIRECTORS AND EXECUTIVE OFFICERS OF T...,ITEM 11. EXECUTIVE COMPENSATION\nThe informati...,ITEM 12. SECURITY OWNERSHIP OF CERTAIN BENEFIC...,ITEM 13. CERTAIN RELATIONSHIPS AND RELATED TRA...,"ITEM 14. EXHIBITS, FINANCIAL STATEMENT SCHEDUL...",
1,1047643_1997.txt,1047643,1997,ITEM 1. BUSINESS\nAdvanced Communications Grou...,,,ITEM 2. PROPERTIES\nThe information appearing ...,ITEM 3. LEGAL PROCEEDINGS\nThe information app...,ITEM 4. SUBMISSION OF MATTERS TO A VOTE OF SEC...,ITEM 5. MARKET FOR THE REGISTRANT'S COMMON EQU...,...,ITEM 8. FINANCIAL STATEMENTS AND OTHER SUPPLEM...,ITEM 9. CHANGES AND DISAGREEMENTS WITH ACCOUNT...,,,ITEM 10. DIRECTORS AND EXECUTIVE OFFICERS OF T...,ITEM 11. EXECUTIVE COMPENSATION\nThe informati...,ITEM 12. SECURITY OWNERSHIP OF CERTAIN BENEFIC...,ITEM 13. CERTAIN RELATIONSHIPS AND RELATED TRA...,"ITEM 14. EXHIBITS, FINANCIAL STATEMENTS SCHEDU...",


In [5]:
df_1047643 = df[df.cik == '1047643']

In [6]:
df_1047643.columns

Index(['filename', 'cik', 'year', 'section_1', 'section_1A', 'section_1B',
       'section_2', 'section_3', 'section_4', 'section_5', 'section_6',
       'section_7', 'section_7A', 'section_8', 'section_9', 'section_9A',
       'section_9B', 'section_10', 'section_11', 'section_12', 'section_13',
       'section_14', 'section_15'],
      dtype='object')

In [12]:
import os
from typing import List
import pandas as pd
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.retrievers import ParentDocumentRetriever
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain.storage import InMemoryStore
import faiss

In [42]:
api_key = "AIzaSyDz3kL0XL7QogHsDPh_g596Raj2CbpyMmQ"

In [46]:
import os
import google.generativeai as genai

os.environ["GOOGLE_API_KEY"] = api_key # or set in your shell
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

In [47]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Recommended Gemini embedding model
embeddings = GoogleGenerativeAIEmbeddings(model="text-embedding-004")  

In [60]:
def get_section_columns(df: pd.DataFrame) -> List[str]:
    return [c for c in df.columns if c.lower().startswith("section_")]

section_cols = get_section_columns(df_1047643)

In [98]:
docs = []
for sec_col in section_cols:
    # Skip empty / NaN sections safely
    if sec_col not in df_1047643.columns:
        continue
    cell = df_1047643[sec_col].values[0]
    if cell is None or (isinstance(cell, float) and math.isnan(cell)):
        continue

    metadata = {
        "filename": "1047643",
        "cik": "1047643",
        "year": "1997",
        "sections_present": f"1047643#{sec_col}",
        "section": sec_col,   # add section explicitly for filtering
    }
    # Use the *value* of sec_col in the header
    content = str(cell).strip()
    content_with_header = f"## {sec_col}\n{content}"
    doc = Document(page_content=content_with_header, metadata=metadata)
    docs.append(doc)

# Optional: give stable parent IDs (recommended)
parent_ids = [f"{d.metadata['cik']}#{d.metadata['year']}#{d.metadata['section']}" for d in docs]


In [100]:
# Determine dimension dynamically to avoid mismatch
dim = len(embeddings.embed_query("dimension probe"))
index = faiss.IndexFlatIP(dim)  # inner-product (use normalize_L2=True below for cosine)
child_docstore = InMemoryDocstore()  # FAISS uses its own docstore for child chunks

faiss_child_docstore = InMemoryDocstore()
vectorstore = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=faiss_child_docstore,
    index_to_docstore_id={},
    normalize_L2=True,  # cosine-like similarity
)


In [101]:
# Parent store must be a BaseStore
parent_store = InMemoryStore()
child_splitter = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=150)
retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=parent_store,     # <-- the fix
    child_splitter=child_splitter,
)

In [102]:
# 3) Index in batches
BATCH = 500
for i in range(0, len(parents), BATCH):
    retriever.add_documents(parents[i:i+BATCH])

In [103]:
# 4) Persist FAISS to disk
SAVE_DIR = "./faiss_edgar_sections"
os.makedirs(SAVE_DIR, exist_ok=True)
vectorstore.save_local(SAVE_DIR)

print(f"Indexed {len(parents)} parent filings; child chunks stored in FAISS at {SAVE_DIR}.")


Indexed 1 parent filings; child chunks stored in FAISS at ./faiss_edgar_sections.


In [104]:
# query
results = retriever.get_relevant_documents("Section 10 disclosures about directors")
for d in results:
    print(d.metadata.get("filename"), d.metadata.get("year"))
    print(d.page_content, "...\n")

1047643_1997.txt 1997
# Filing: 1047643_1997.txt | CIK: 1047643 | Year: 1997

## SECTION 1
ITEM 1. BUSINESS
Advanced Communications Group, Inc. ("ACG") was incorporated in Delaware in September 1997 as a subsidiary of a predecessor company that was ultimately named Advanced Communications Corp. ("ACC"). ACC was formed in June 1996 and had previously been named Advanced Communications Group, Inc. By September 1997, ACC had entered into acquisition agreements to acquire a number of companies in various aspects of the telecommunications business. In October 1997, in order to facilitate these acquisitions, ACG entered into new definitive agreements to acquire the stock or assets of six telecommunications service providers, one yellow page publisher, two telephone equipment sales and maintenance companies, ACC, and a 49% interest in a company owning a fiber optic network (collectively, the "Acquisitions"). Shortly thereafter, ACG filed a Registration Statement on Form S-1 with the Securitie

In [105]:
results



In [None]:
# 6) Reload later
# vectorstore = FAISS.load_local(
#     SAVE_DIR,
#     embeddings,
#     allow_dangerous_deserialization=True  # required by LangChain FAISS loader
# )
# parent_docstore = InMemoryDocstore()  # rehydrate or persist parents separately if needed
# retriever = ParentDocumentRetriever(
#     vectorstore=vectorstore,
#     docstore=parent_docstore,
#     child_splitter=child_splitter,
# )

In [26]:
df['section_1'][1]

'ITEM 1. BUSINESS\nAdvanced Communications Group, Inc. ("ACG") was incorporated in Delaware in September 1997 as a subsidiary of a predecessor company that was ultimately named Advanced Communications Corp. ("ACC"). ACC was formed in June 1996 and had previously been named Advanced Communications Group, Inc. By September 1997, ACC had entered into acquisition agreements to acquire a number of companies in various aspects of the telecommunications business. In October 1997, in order to facilitate these acquisitions, ACG entered into new definitive agreements to acquire the stock or assets of six telecommunications service providers, one yellow page publisher, two telephone equipment sales and maintenance companies, ACC, and a 49% interest in a company owning a fiber optic network (collectively, the "Acquisitions"). Shortly thereafter, ACG filed a Registration Statement on Form S-1 with the Securities and Exchange Commission (the "Commission") with respect to an initial public offering (