In [25]:
import os
import faiss
import pandas as pd
from typing import List

from langchain.docstore.document import Document
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore

In [2]:
from datasets import load_dataset

# Load the "train" split of the "year_1997" configuration of eloukas/edgar-corpus.
# NOTE: streaming=True is NOT passed here, so the split is not streamed.
# A later cell calls ds.to_pandas() on this object.
ds = load_dataset("eloukas/edgar-corpus", "year_1997", split="train")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Convert the loaded split into a DataFrame and keep only the rows whose
# `cik` matches the single company studied in this notebook (string compare).
TARGET_CIK = '1047643'
df = ds.to_pandas()
df = df.loc[df["cik"] == TARGET_CIK]
df.head(2)

Unnamed: 0,filename,cik,year,section_1,section_1A,section_1B,section_2,section_3,section_4,section_5,...,section_8,section_9,section_9A,section_9B,section_10,section_11,section_12,section_13,section_14,section_15
1,1047643_1997.txt,1047643,1997,ITEM 1. BUSINESS\nAdvanced Communications Grou...,,,ITEM 2. PROPERTIES\nThe information appearing ...,ITEM 3. LEGAL PROCEEDINGS\nThe information app...,ITEM 4. SUBMISSION OF MATTERS TO A VOTE OF SEC...,ITEM 5. MARKET FOR THE REGISTRANT'S COMMON EQU...,...,ITEM 8. FINANCIAL STATEMENTS AND OTHER SUPPLEM...,ITEM 9. CHANGES AND DISAGREEMENTS WITH ACCOUNT...,,,ITEM 10. DIRECTORS AND EXECUTIVE OFFICERS OF T...,ITEM 11. EXECUTIVE COMPENSATION\nThe informati...,ITEM 12. SECURITY OWNERSHIP OF CERTAIN BENEFIC...,ITEM 13. CERTAIN RELATIONSHIPS AND RELATED TRA...,"ITEM 14. EXHIBITS, FINANCIAL STATEMENTS SCHEDU...",


In [14]:
import os
from getpass import getpass

import google.generativeai as genai

# SECURITY: this cell used to contain a hard-coded API key. Any key that was
# ever saved in the notebook must be treated as leaked and revoked.
#
# The key is now read from the GOOGLE_API_KEY environment variable. When the
# variable is unset or empty, the user is prompted once with hidden input so
# the secret never appears in the notebook source or its outputs.
api_key = os.environ.get("GOOGLE_API_KEY") or getpass("GOOGLE_API_KEY: ")
if not api_key.strip():
    raise RuntimeError(
        "No Google API key provided: set GOOGLE_API_KEY in the environment "
        "or enter it at the prompt."
    )

# As in the original cell, the key is also exported to the process environment
# before the client library is configured with it.
os.environ["GOOGLE_API_KEY"] = api_key
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

In [15]:
# try:
#     from langchain_google_genai import GoogleGenerativeAIEmbeddings
#     # Recommended Gemini embedding model
#     embeddings = GoogleGenerativeAIEmbeddings(model="text-embedding-004")  
# except Exception:
#     # Fallback: small local HF model (requires sentence-transformers installed)
#     from langchain_community.embeddings import HuggingFaceEmbeddings
#     embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [19]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Embedding client shared by the cells below; the model id is kept in one
# named constant so it is easy to swap.
EMBEDDING_MODEL = "models/embedding-001"
embeddings = GoogleGenerativeAIEmbeddings(model=EMBEDDING_MODEL)

In [20]:
# from langchain_google_genai import GoogleGenerativeAIEmbeddings

# # Correct model name (no "models/")
# embeddings = GoogleGenerativeAIEmbeddings(model="gemini-embedding-001")

# Embed one throwaway query and measure the vector length; `dim` is later
# used to size the FAISS index.
probe_vector = embeddings.embed_query("dimension probe")
dim = len(probe_vector)
print("Embedding dimension:", dim)

Embedding dimension: 768


In [7]:
# Identify section columns dynamically
section_cols: List[str] = [c for c in df.columns if c.startswith("section_")]


def _is_blank(value) -> bool:
    """Return True when a section cell carries no usable text.

    Missing-value markers (None, NaN, pd.NA, NaT) are treated as blank. Without
    this check a NaN cell is truthy and stringifies to "nan", so it would be
    indexed as a document, and pd.NA would raise TypeError when evaluated in a
    boolean context. Every other falsy value, and any value whose string form
    is empty or whitespace-only, is blank as well.
    """
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return True
    return not value or not str(value).strip()


# Build parent docs (one per non-empty section cell)
parents: List[Document] = []
parent_ids: List[str] = []

for row in df.itertuples(index=False):
    # Metadata shared by every section of the same filing row.
    base_meta = {
        "filename": getattr(row, "filename"),
        "cik": getattr(row, "cik"),
        "year": getattr(row, "year"),
    }
    for sec in section_cols:
        content = getattr(row, sec)
        if _is_blank(content):
            continue

        # Stable per (file, section); also passed as the id when the parents
        # are added to the retriever.
        parent_id = f"{base_meta['filename']}#{sec}"
        parents.append(
            Document(
                page_content=str(content),
                metadata={
                    **base_meta,
                    "section": sec,
                    "parent_id": parent_id,
                },
            )
        )
        parent_ids.append(parent_id)

print(f"Prepared {len(parents)} parent docs across {len(section_cols)} sections.")


# Inner-product index. Inner product only equals cosine similarity when the
# vectors are unit length, and the embeddings are not normalised anywhere
# else in this notebook, so normalize_L2=True is passed to the FAISS wrapper
# below to make the "cosine" intent actually hold.
index = faiss.IndexFlatIP(dim)
child_docstore = InMemoryDocstore()  # FAISS keeps its own docstore for children
vectorstore = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=child_docstore,
    index_to_docstore_id={},  # filled as we add children
    normalize_L2=True,        # L2-normalise stored and query vectors
)

# Whole parent documents are held in their own key-value store, separate from
# the FAISS docstore that holds the child chunks.
parent_docstore = InMemoryStore()

# Child chunking parameters (tune sizes for your data).
CHILD_CHUNK_SIZE = 1200
CHILD_CHUNK_OVERLAP = 150
CHILD_SEPARATORS = ["\n\n", "\n", " ", ""]

child_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHILD_CHUNK_SIZE,
    chunk_overlap=CHILD_CHUNK_OVERLAP,
    separators=CHILD_SEPARATORS,
)

# Number of child chunks requested per query (tune recall).
TOP_K_CHILDREN = 6

# Wire the pieces together: children are searched in `vectorstore`, parents
# are read from `parent_docstore`, and `child_splitter` produces the children.
retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=parent_docstore,
    child_splitter=child_splitter,
    search_kwargs={"k": TOP_K_CHILDREN},
)

# Ingest the parents under their precomputed ids; the retriever chunks them
# into children and indexes those.
retriever.add_documents(parents, ids=parent_ids)
print("Loaded parents and auto-chunked children into FAISS.")


# Peek at the first five child chunks stored in the FAISS docstore and show
# which parent / section / file each one is linked to.
child_store_dict = vectorstore.docstore._dict  # {child_id: Document}
example_items = list(child_store_dict.items())[:5]
for child_id, child_doc in example_items:
    meta = child_doc.metadata
    summary = {
        "child_id": child_id,
        # the retriever records the parent's id under "doc_id"
        "parent_id": meta.get("doc_id"),
        # section / filename are often propagated from the parent, but not guaranteed
        **{key: meta.get(key) for key in ("section", "filename")},
    }
    print(summary)


# Backfill parent metadata onto any child chunk that lacks "filename" or
# "section". Values already present on the child are never overwritten.
for cid, cdoc in child_store_dict.items():
    if "filename" in cdoc.metadata and "section" in cdoc.metadata:
        continue

    parent_id = cdoc.metadata.get("doc_id")
    if parent_id is None:
        # Nothing links this child to a parent, so there is nothing to copy.
        continue

    # parent_docstore is an InMemoryStore (a key-value store). It has no
    # .search() method; lookups go through mget(), which returns one entry
    # per requested key and None for keys that are absent.
    (parent_doc,) = parent_docstore.mget([parent_id])
    if parent_doc is None:
        continue

    for key in ("filename", "section", "cik", "year"):
        cdoc.metadata.setdefault(key, parent_doc.metadata.get(key))


# -------------------------------------------------------------------
# Retrieval example (returns full parent docs for the matching child chunks)
query = "what is administrative expense"
# invoke() is the supported retriever entry point; get_relevant_documents()
# is deprecated in current LangChain releases.
results = retriever.invoke(query)
print(f"\nTop {len(results)} parent results for: {query!r}")
for i, d in enumerate(results, 1):
    # One header line per parent, followed by the first 400 characters of its text.
    print(f"\n[{i}] parent_id={d.metadata.get('parent_id')} | file={d.metadata.get('filename')} | section={d.metadata.get('section')}")
    print(d.page_content[:400], "...")


Prepared 14 parent docs across 20 sections.


Loaded parents and auto-chunked children into FAISS.


{'child_id': 'a0d2dcb8-3274-43e1-b28c-195ef6562bc5', 'parent_id': '1047643_1997.txt#section_1', 'section': 'section_1', 'filename': '1047643_1997.txt'}
{'child_id': '6198407d-0b3b-476e-a48f-69cacfe264b8', 'parent_id': '1047643_1997.txt#section_1', 'section': 'section_1', 'filename': '1047643_1997.txt'}
{'child_id': '427008c9-1861-4a7b-9b53-c964974e3e19', 'parent_id': '1047643_1997.txt#section_1', 'section': 'section_1', 'filename': '1047643_1997.txt'}
{'child_id': '4c13697e-3413-4ba6-9919-91bf47dd2db5', 'parent_id': '1047643_1997.txt#section_1', 'section': 'section_1', 'filename': '1047643_1997.txt'}
{'child_id': '78cf2786-ec71-43d0-8e28-c7511f48a549', 'parent_id': '1047643_1997.txt#section_1', 'section': 'section_1', 'filename': '1047643_1997.txt'}



Top 2 parent results for: 'what is administrative expense'

[1] parent_id=1047643_1997.txt#section_7 | file=1047643_1997.txt | section=section_7
ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS
RESULTS OF OPERATIONS FOR THE PERIOD FROM INCEPTION (JUNE 6, 1996) TO DECEMBER 31, 1996 COMPARED WITH THE TWELVE MONTHS ENDED DECEMBER 31, 1997.
General and administrative expense. General and administrative expense increased by $2,291,539 from $648,930 during the period from inception (June 6, 1996) to D ...

[2] parent_id=1047643_1997.txt#section_8 | file=1047643_1997.txt | section=section_8
ITEM 8. FINANCIAL STATEMENTS AND OTHER SUPPLEMENTARY DATA
INDEPENDENT AUDITORS' REPORT
The Board of Directors Advanced Communications Group, Inc.
We have audited the accompanying consolidated balance sheets of Advanced Communications Group, Inc. as of December 31, 1996 and 1997, and the related consolidated statements of operations, changes in stockholders' def