In [2]:
try:
    from langchain_experimental.text_splitter import SemanticChunker
    _HAS_SEM = True
    print("iam here")
except Exception:
    from langchain_text_splitters import RecursiveCharacterTextSplitter
    _HAS_SEM = False

iam here


In [3]:
# --- LangChain retriever: stream from Spark to FAISS ---
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
import faiss

### 1. Loading Document

In [4]:
from datasets import load_dataset

# Stream the dataset to avoid loading everything in memory
ds = load_dataset("eloukas/edgar-corpus", "full", split="train")

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# it never holds the whole dataset in memory, and the gzip keeps the file small.
import json, gzip, os

AIG_CIK = "0000005272"   # AIG
OUT_PATH = "aig_edgar.jsonl.gz"  # compact on-disk buffer for Spark

# # Stream the full split; no huge RAM spikes
# ds = load_dataset("eloukas/edgar-corpus", "full", split="train", streaming=True)

# Write only matching rows to newline-delimited JSON (gzipped)
count = 0
with gzip.open(OUT_PATH, "wt", encoding="utf-8") as f:
    for row in ds:
        # rows have keys like: filename, cik, year, section_1, section_1A, ...
        if str(row.get("cik", "")).zfill(10) == AIG_CIK:
            f.write(json.dumps(row, ensure_ascii=False) + "\n")
            count += 1

print(f"Wrote {count} AIG rows to {OUT_PATH}")

Wrote 22 AIG rows to aig_edgar.jsonl.gz


In [6]:
from pyspark.sql import SparkSession
AIG_CIK = "0000005272"
OUT_PATH = "aig_edgar.jsonl.gz"
spark = (
    SparkSession.builder
    .appName("AIG-EDGAR")
    # tweak as you like; driver mem helps if you inspect a lot at once
    .config("spark.driver.memory", "6g")
    .getOrCreate()
)

# Read the gzipped JSONL directly
aig_df = spark.read.json(OUT_PATH)

# (Optional) normalize CIK to 10-digit string for consistency
from pyspark.sql.functions import lpad, col
aig_df = aig_df.withColumn("cik", lpad(col("cik").cast("string"), 10, "0"))

# Inspect a few rows
aig_df.select("filename", "cik", "year").show(10, truncate=False)

# Persist to Parquet (columnar, splittable, great for Spark)
PARQUET_DIR = "parquet_aig_edgar"
aig_df.write.mode("overwrite").parquet(PARQUET_DIR)

print(f"Saved AIG subset to {PARQUET_DIR}")


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/09/10 23:22:39 WARN Utils: Your hostname, Anants-MacBook-Air.local, resolves to a loopback address: 127.0.0.1; using 192.168.1.163 instead (on interface en0)
25/09/10 23:22:39 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/10 23:22:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/09/10 23:22:40 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


+-------------+----------+----+
|filename     |cik       |year|
+-------------+----------+----+
|5272_1994.txt|0000005272|1994|
|5272_1995.txt|0000005272|1995|
|5272_1998.txt|0000005272|1998|
|5272_1999.txt|0000005272|1999|
|5272_2000.txt|0000005272|2000|
|5272_2001.txt|0000005272|2001|
|5272_2003.htm|0000005272|2003|
|5272_2004.htm|0000005272|2004|
|5272_2005.htm|0000005272|2005|
|5272_2006.htm|0000005272|2006|
+-------------+----------+----+
only showing top 10 rows
Saved AIG subset to parquet_aig_edgar


In [7]:
aig_df.toPandas().head(2)

Unnamed: 0,cik,filename,section_1,section_10,section_11,section_12,section_13,section_14,section_15,section_1A,...,section_4,section_5,section_6,section_7,section_7A,section_8,section_9,section_9A,section_9B,year
0,5272,5272_1994.txt,ITEM 1. BUSINESS\nAmerican International Group...,ITEM 10. DIRECTORS AND EXECUTIVE OFFICERS OF T...,ITEM 11. EXECUTIVE COMPENSATION\nThis item is ...,ITEM 12. SECURITY OWNERSHIP OF CERTAIN BENEFIC...,ITEM 13. CERTAIN RELATIONSHIPS AND RELATED TRA...,"ITEM 14. EXHIBITS, FINANCIAL STATEMENT SCHEDUL...",,,...,ITEM 4. SUBMISSION OF MATTERS TO A VOTE OF SEC...,ITEM 5. MARKET FOR THE REGISTRANT'S COMMON STO...,ITEM 6. SELECTED FINANCIAL DATA\nAMERICAN INTE...,ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS O...,,ITEM 8. FINANCIAL STATEMENTS AND SUPPLEMENTARY...,ITEM 9. CHANGES IN AND DISAGREEMENTS WITH ACCO...,,,1994
1,5272,5272_1995.txt,ITEM 1. BUSINESS\nAmerican International Group...,ITEM 10. DIRECTORS AND EXECUTIVE OFFICERS OF T...,ITEM 11. EXECUTIVE COMPENSATION\nThis item is ...,ITEM 12. SECURITY OWNERSHIP OF CERTAIN BENEFIC...,ITEM 13. CERTAIN RELATIONSHIPS AND RELATED TRA...,"ITEM 14. EXHIBITS, FINANCIAL STATEMENT SCHEDUL...",,,...,ITEM 4. SUBMISSION OF MATTERS TO A VOTE OF SEC...,ITEM 5. MARKET FOR THE REGISTRANT'S COMMON STO...,ITEM 6. SELECTED FINANCIAL DATA AMERICAN INTER...,ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS O...,,ITEM 8. Financial Statements and Supplementary...,ITEM 9. CHANGES IN AND DISAGREEMENTS WITH ACCO...,,,1995


In [8]:
# import os
# import google.generativeai as genai
# api_key = "AIzaSyDz3kL0XL7QogHsDPh_g596Raj2CbpyMmQ"
# os.environ["GOOGLE_API_KEY"] = api_key # or set in your shell
# genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

In [9]:
# from langchain_google_genai import GoogleGenerativeAIEmbeddings
# embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
# dim = len(embeddings.embed_query("dimension probe"))
# print("Embedding dimension:", dim)

### 2. Preparing Parent Document retriever

In [10]:
from langchain_google_vertexai import VertexAIEmbeddings

embeddings = VertexAIEmbeddings(
    model_name="text-embedding-004",  # Gemini family embedding model
    project="drift-sense",
    location="us-central1",
)

dim = len(embeddings.embed_query("dimension probe"))
print("Embedding dimension:", dim)



Embedding dimension: 768


In [11]:
df = aig_df.toPandas()
df.shape

(22, 23)

In [12]:
from typing import List
from langchain.docstore.document import Document

# Identify section columns dynamically
section_cols: List[str] = [c for c in df.columns if c.startswith("section_")]

# Build parent docs (one per non-empty section cell)
parents: List[Document] = []
parent_ids: List[str] = []

for row in df.itertuples(index=False):
    base_meta = {
        "filename": getattr(row, "filename"),
        "cik": getattr(row, "cik"),
        "year": getattr(row, "year"),
    }
    for sec in section_cols:
        content = getattr(row, sec)
        if not content or not str(content).strip():
            continue

        parent_id = f"{base_meta['filename']}#{sec}"   # stable per (file, section)
        doc = Document(
            page_content=str(content),
            metadata={
                **base_meta,
                "section": sec,
                "parent_id": parent_id,
            },
        )
        parents.append(doc)
        parent_ids.append(parent_id)

print(f"Prepared {len(parents)} parent docs across {len(section_cols)} sections.")

Prepared 378 parent docs across 20 sections.


In [13]:
index = faiss.IndexFlatIP(dim)  # inner product (works as cosine with normalized vectors)
child_docstore = InMemoryDocstore()  # FAISS keeps its own docstore for children
vectorstore = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=child_docstore,
    index_to_docstore_id={},  # filled as we add children
)

In [14]:
# Docstore for PARENTS
parent_docstore = InMemoryStore()

# # Splitter for children (tune sizes for your data)
child_splitter = RecursiveCharacterTextSplitter(
    chunk_size=4000,
    separators=["\n\n", "\n", " ", ""],
)

# ---- SEMANTIC child splitter ----
# breakpoint_threshold_type: "percentile" (95 = fewer, larger chunks) or "standard_deviation"
# child_splitter = SemanticChunker(
#     embeddings,
#     breakpoint_threshold_type="percentile",
#     breakpoint_threshold_amount=95,
# )

# Build the ParentDocumentRetriever
retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,       # children live here
    docstore=parent_docstore,      # parents live here
    child_splitter=child_splitter, # how to chunk parents into children
    # parent_splitter=parent_splitter,        # tune recall
)

In [15]:
len(parent_ids), len(parents)

(378, 378)

In [16]:
# Add parent docs; c`d and indexed
retriever.add_documents(parents, ids=parent_ids)
print("Loaded parents and auto-chunked children into FAISS.")

Loaded parents and auto-chunked children into FAISS.


In [17]:
import os
ARTIFACT_DIR = "./artifacts"
FAISS_DIR  = os.path.join(ARTIFACT_DIR, "faiss_child_store")
# Save FAISS (child chunks)
vectorstore.save_local(FAISS_DIR)

# Parents are already persisted automatically to PARENT_DIR via LocalFileStore
print("Saved FAISS to:", FAISS_DIR)

Saved FAISS to: ./artifacts/faiss_child_store


In [18]:
# -------------------------------------------------------------------
# Retrieval example (returns full parent docs for the matching child chunks)
query = "what is the total revenue as reported in Financial report"
results = retriever.get_relevant_documents(query)

  results = retriever.get_relevant_documents(query)


### Retreive and Extract Document

In [19]:
Extract_Details = """
## Role
You are an extraction analyst. Read the provided document content and metadata to extract AIG facts.

## Targets (extract EXACT text as written in the document body)
- Total Revenue
- Net income (loss) attributable to AIG
- Auditor firm (e.g., “PricewaterhouseCoopers LLP”, “KPMG LLP”, “Deloitte & Touche LLP”)

## Metadata Rules (authoritative)
- year: use {year} if provided in metadata; do not infer from text if metadata exists.
- section/source:
  - Prefer {parent_id} (e.g., "5272_2020.htm#section_9B") if present.
  - Else use {section} (e.g., "section_9B").
  - If neither present, use the clearest section header found in the text (e.g., "Item 7", "Item 8").

## Hints (don’t guess)
- “Total Revenue” may appear as “Total revenues”, “Consolidated total revenues”.
- “Net income (loss) attributable to AIG” might appear as “Net income attributable to AIG/common shareholders”.
- For the auditor, return the firm NAME only (not the report title).
- If any one of the three target fields (Total Revenue, Net income..., Auditor) is missing, return exactly: None

## Output (STRICT)
- Return EXACTLY one line with 5 fields separated by " || "
  1) Total Revenue
  2) Net income (loss) attributable to AIG
  3) Auditor firm
  4) year
  5) section/source (prefer parent_id; else section; else header text)
- No extra text, labels, or quotes.
- Preserve numbers/formatting as written (keep $, commas, parentheses, “million/billion”).

## Edge Rules
- If both “Net income” and “Net loss” variants appear, choose the one explicitly “attributable to AIG”.
- Prefer first unambiguous occurrence in MD&A/Financial Statements (Items 7/8) when multiple appear.
- Never infer the auditor from signatures without the firm’s name.

## Tiny Examples

[Example A — all present]
Meta: year=2019, section=section_7, parent_id=5272_2019.htm#section_7
Text: “Total revenues were $52.1 billion… Net income (loss) attributable to AIG was $(6.7) billion… audited by PricewaterhouseCoopers LLP…”
Output:
$52.1 billion || $(6.7) billion || PricewaterhouseCoopers LLP || 2019 || 5272_2019.htm#section_7

[Example B — missing a target → None]
Meta: year=2016, section=section_7A, parent_id=5272_2016.htm#section_7A
Text: “Total revenues were $39.8 billion… [no ‘net income attributable to AIG’]…”
Output:
None

## Document (body text):
{document}

## Metadata:
filename={filename}
year={year}
section={section}
parent_id={parent_id}
"""


In [36]:
from google.cloud import aiplatform
from langchain_google_vertexai import ChatVertexAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_google_genai import ChatGoogleGenerativeAI # Import the Google Generative AI class
import os

# Optional: Set your API key if it's not already in your environment variables
# from google.colab import userdata # Use this if you are in a Colab notebook
# os.environ["GOOGLE_API_KEY"] = userdata.get("GOOGLE_API_KEY")
PROJECT_ID = "hd-datascience-np"
LOCATION = "us-central1"

aiplatform.init(project=PROJECT_ID, location=LOCATION)

prompt_template_v1 = ChatPromptTemplate.from_messages([
    ("human", Extract_Details)
])

# # Choose model: "gemini-1.5-flash" (fast/cheap) or "gemini-1.5-pro" (higher quality)
# llm_1 = ChatVertexAI(
#     model_name="gemini-2.5-flash",
#     temperature=0,
#     max_output_tokens=1024,
#     project=PROJECT_ID,
#     location=LOCATION,
#     api_transport="grpc",   # good perf
# )

import os
import google.generativeai as genai
api_key = "AIzaSyDz3kL0XL7QogHsDPh_g596Raj2CbpyMmQ"
os.environ["GOOGLE_API_KEY"] = api_key # or set in your shell
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

from langchain_google_genai import ChatGoogleGenerativeAI

llm_1 = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",   # 2.5 Flash
    # api_key can be omitted if GOOGLE_API_KEY is set
    api_key=api_key,
    temperature=0.0,
    max_output_tokens=1024,
)

docs = retriever.get_relevant_documents("what is the total revenue as reported in 2016 Financial report")

# docs = ensemble_retriever.get_relevant_documents("what was the Total revenue of aig in 2016")

rag_chain = prompt_template_v1 | llm_1

In [37]:
extracted_details_rag = []

In [34]:
from tqdm import tqdm
for doc in tqdm(docs):
    out = rag_chain.invoke({
            "document": doc.page_content,
            "filename": doc.metadata.get("filename"),
            "year": doc.metadata.get("year"),
            "section": doc.metadata.get("section"),
            "parent_id": doc.metadata.get("parent_id"),
        })

    content = getattr(out, "content", out)
    if content is None:
        continue

    text = str(content).strip()
    
    # Skip empty/placeholder outputs
    if not text or text.lower() in {"none", "null", "{}", "[]"}:
        continue
    
    # Invoke the chain with a query
    extracted_details_rag.append({
        "filename": doc.metadata.get("filename"),
        "year": doc.metadata.get("year"),
        "section": doc.metadata.get("section"),
        "parent_id": doc.metadata.get("parent_id"),
        # "chunk_id": doc.metadata.get("chunk_id"),
        "extracted": text,
    })

100%|██████████| 3/3 [00:27<00:00,  9.11s/it]


In [35]:
extracted_details_rag

[{'filename': '5272_2016.htm',
  'year': '2016',
  'section': 'section_7',
  'parent_id': '5272_2016.htm#section_7',
  'extracted': '$52,364 million || $(8,007) million || None || 2016 || 5272_2016.htm#section_7'}]

In [None]:
docs = retriever.get_relevant_documents("what is the total revenue as reported in 2020 Financial report")

In [38]:
from tqdm import tqdm
extracted_details_rag = []
for doc in tqdm(docs):
    out = rag_chain.invoke({
            "document": doc.page_content,
            "filename": doc.metadata.get("filename"),
            "year": doc.metadata.get("year"),
            "section": doc.metadata.get("section"),
            "parent_id": doc.metadata.get("parent_id"),
        })

    content = getattr(out, "content", out)
    if content is None:
        continue

    text = str(content).strip()
    
    # Skip empty/placeholder outputs
    if not text or text.lower() in {"none", "null", "{}", "[]"}:
        continue
    
    # Invoke the chain with a query
    extracted_details_rag.append({
        "filename": doc.metadata.get("filename"),
        "year": doc.metadata.get("year"),
        "section": doc.metadata.get("section"),
        "parent_id": doc.metadata.get("parent_id"),
        # "chunk_id": doc.metadata.get("chunk_id"),
        "extracted": text,
    })

100%|██████████| 3/3 [00:22<00:00,  7.64s/it]


In [39]:
extracted_details_rag

[{'filename': '5272_2020.htm',
  'year': '2020',
  'section': 'section_8',
  'parent_id': '5272_2020.htm#section_8',
  'extracted': '$47,997 || $(5,973) || PricewaterhouseCoopers LLP || 2020 || 5272_2020.htm#section_8'}]

In [40]:
docs = retriever.get_relevant_documents("what is the total revenue as reported in 2012 Financial report")
from tqdm import tqdm
extracted_details_rag = []
for doc in tqdm(docs):
    out = rag_chain.invoke({
            "document": doc.page_content,
            "filename": doc.metadata.get("filename"),
            "year": doc.metadata.get("year"),
            "section": doc.metadata.get("section"),
            "parent_id": doc.metadata.get("parent_id"),
        })

    content = getattr(out, "content", out)
    if content is None:
        continue

    text = str(content).strip()
    
    # Skip empty/placeholder outputs
    if not text or text.lower() in {"none", "null", "{}", "[]"}:
        continue
    
    # Invoke the chain with a query
    extracted_details_rag.append({
        "filename": doc.metadata.get("filename"),
        "year": doc.metadata.get("year"),
        "section": doc.metadata.get("section"),
        "parent_id": doc.metadata.get("parent_id"),
        # "chunk_id": doc.metadata.get("chunk_id"),
        "extracted": text,
    })

100%|██████████| 2/2 [00:16<00:00,  8.45s/it]


In [41]:
extracted_details_rag

[]

In [42]:
docs = retriever.get_relevant_documents("what is the total revenue as reported in 2004 Financial report")
from tqdm import tqdm
extracted_details_rag = []
for doc in tqdm(docs):
    out = rag_chain.invoke({
            "document": doc.page_content,
            "filename": doc.metadata.get("filename"),
            "year": doc.metadata.get("year"),
            "section": doc.metadata.get("section"),
            "parent_id": doc.metadata.get("parent_id"),
        })

    content = getattr(out, "content", out)
    if content is None:
        continue

    text = str(content).strip()
    
    # Skip empty/placeholder outputs
    if not text or text.lower() in {"none", "null", "{}", "[]"}:
        continue
    
    # Invoke the chain with a query
    extracted_details_rag.append({
        "filename": doc.metadata.get("filename"),
        "year": doc.metadata.get("year"),
        "section": doc.metadata.get("section"),
        "parent_id": doc.metadata.get("parent_id"),
        # "chunk_id": doc.metadata.get("chunk_id"),
        "extracted": text,
    })
extracted_details_rag

100%|██████████| 3/3 [00:21<00:00,  7.32s/it]


[]

In [43]:
docs = retriever.get_relevant_documents("what is the total revenue as reported in 2008 Financial report")
from tqdm import tqdm
extracted_details_rag = []
for doc in tqdm(docs):
    out = rag_chain.invoke({
            "document": doc.page_content,
            "filename": doc.metadata.get("filename"),
            "year": doc.metadata.get("year"),
            "section": doc.metadata.get("section"),
            "parent_id": doc.metadata.get("parent_id"),
        })

    content = getattr(out, "content", out)
    if content is None:
        continue

    text = str(content).strip()
    
    # Skip empty/placeholder outputs
    if not text or text.lower() in {"none", "null", "{}", "[]"}:
        continue
    
    # Invoke the chain with a query
    extracted_details_rag.append({
        "filename": doc.metadata.get("filename"),
        "year": doc.metadata.get("year"),
        "section": doc.metadata.get("section"),
        "parent_id": doc.metadata.get("parent_id"),
        # "chunk_id": doc.metadata.get("chunk_id"),
        "extracted": text,
    })
extracted_details_rag

100%|██████████| 3/3 [00:20<00:00,  6.76s/it]


[{'filename': '5272_2006.htm',
  'year': '2006',
  'section': 'section_8',
  'parent_id': '5272_2006.htm#section_8',
  'extracted': '$113,195 million || $14,049 million || PricewaterhouseCoopers LLP || 2006 || 5272_2006.htm#section_8'}]