## 1. Load the Dataset

In [197]:
from datasets import load_dataset

# Stream the dataset to avoid loading everything in memory
ds = load_dataset("eloukas/edgar-corpus", "full", split="train")

### 1.1 Filter the dataset for AIG and save it

In [2]:
# it never holds the whole dataset in memory, and the gzip keeps the file small.
import json, gzip, os

AIG_CIK = "0000005272"   # AIG
OUT_PATH = "aig_edgar.jsonl.gz"  # compact on-disk buffer for Spark

# Write only matching rows to newline-delimited JSON (gzipped)
count = 0
with gzip.open(OUT_PATH, "wt", encoding="utf-8") as f:
    for row in ds:
        # rows have keys like: filename, cik, year, section_1, section_1A, ...
        if str(row.get("cik", "")).zfill(10) == AIG_CIK:
            f.write(json.dumps(row, ensure_ascii=False) + "\n")
            count += 1

print(f"Wrote {count} AIG rows to {OUT_PATH}")

Wrote 22 AIG rows to aig_edgar.jsonl.gz


### 1.2 Using Spark to create a dataframe

In [1]:
from pyspark.sql import SparkSession
OUT_PATH = "aig_edgar.jsonl.gz"

spark = (
    SparkSession.builder
    .appName("AIG-EDGAR")
    # tweak as you like; driver mem helps if you inspect a lot at once
    .config("spark.driver.memory", "6g")
    .getOrCreate()
)

# Read the gzipped JSONL directly
aig_df = spark.read.json(OUT_PATH)

# (Optional) normalize CIK to 10-digit string for consistency
from pyspark.sql.functions import lpad, col
aig_df = aig_df.withColumn("cik", lpad(col("cik").cast("string"), 10, "0"))

# Inspect a few rows
aig_df.select("filename", "cik", "year").show(10, truncate=False)

# Persist to Parquet (columnar, splittable, great for Spark)
PARQUET_DIR = "parquet_aig_edgar"
aig_df.write.mode("overwrite").parquet(PARQUET_DIR)

print(f"Saved AIG subset to {PARQUET_DIR}")


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/09/10 01:07:57 WARN Utils: Your hostname, Anants-MacBook-Air.local, resolves to a loopback address: 127.0.0.1; using 192.168.1.163 instead (on interface en0)
25/09/10 01:07:57 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/10 01:08:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


+-------------+----------+----+
|filename     |cik       |year|
+-------------+----------+----+
|5272_1994.txt|0000005272|1994|
|5272_1995.txt|0000005272|1995|
|5272_1998.txt|0000005272|1998|
|5272_1999.txt|0000005272|1999|
|5272_2000.txt|0000005272|2000|
|5272_2001.txt|0000005272|2001|
|5272_2003.htm|0000005272|2003|
|5272_2004.htm|0000005272|2004|
|5272_2005.htm|0000005272|2005|
|5272_2006.htm|0000005272|2006|
+-------------+----------+----+
only showing top 10 rows
Saved AIG subset to parquet_aig_edgar


                                                                                

In [2]:
aig_df.toPandas().head(2)

Unnamed: 0,cik,filename,section_1,section_10,section_11,section_12,section_13,section_14,section_15,section_1A,...,section_4,section_5,section_6,section_7,section_7A,section_8,section_9,section_9A,section_9B,year
0,5272,5272_1994.txt,ITEM 1. BUSINESS\nAmerican International Group...,ITEM 10. DIRECTORS AND EXECUTIVE OFFICERS OF T...,ITEM 11. EXECUTIVE COMPENSATION\nThis item is ...,ITEM 12. SECURITY OWNERSHIP OF CERTAIN BENEFIC...,ITEM 13. CERTAIN RELATIONSHIPS AND RELATED TRA...,"ITEM 14. EXHIBITS, FINANCIAL STATEMENT SCHEDUL...",,,...,ITEM 4. SUBMISSION OF MATTERS TO A VOTE OF SEC...,ITEM 5. MARKET FOR THE REGISTRANT'S COMMON STO...,ITEM 6. SELECTED FINANCIAL DATA\nAMERICAN INTE...,ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS O...,,ITEM 8. FINANCIAL STATEMENTS AND SUPPLEMENTARY...,ITEM 9. CHANGES IN AND DISAGREEMENTS WITH ACCO...,,,1994
1,5272,5272_1995.txt,ITEM 1. BUSINESS\nAmerican International Group...,ITEM 10. DIRECTORS AND EXECUTIVE OFFICERS OF T...,ITEM 11. EXECUTIVE COMPENSATION\nThis item is ...,ITEM 12. SECURITY OWNERSHIP OF CERTAIN BENEFIC...,ITEM 13. CERTAIN RELATIONSHIPS AND RELATED TRA...,"ITEM 14. EXHIBITS, FINANCIAL STATEMENT SCHEDUL...",,,...,ITEM 4. SUBMISSION OF MATTERS TO A VOTE OF SEC...,ITEM 5. MARKET FOR THE REGISTRANT'S COMMON STO...,ITEM 6. SELECTED FINANCIAL DATA AMERICAN INTER...,ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS O...,,ITEM 8. Financial Statements and Supplementary...,ITEM 9. CHANGES IN AND DISAGREEMENTS WITH ACCO...,,,1995


## Solution 1: 
    - Using the Hybrid vector search/store based approach 
    - Passing Question to Hybrid retreiver to get relevant document 
    - Passing the fetched Document into LLM to extract the detail

### 1.1. Loading Embedding Model

In [5]:
# import os
# import google.generativeai as genai
# api_key = "AIzaSyDz3kL0XL7QogHsDPh_g596Raj2CbpyMmQ"
# os.environ["GOOGLE_API_KEY"] = api_key # or set in your shell
# genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

In [14]:
# from langchain_google_genai import GoogleGenerativeAIEmbeddings
# embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
# dim = len(embeddings.embed_query("dimension probe"))
# print("Embedding dimension:", dim)

In [4]:
from langchain_community.embeddings import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
dim = len(embeddings.embed_query("dimension probe"))
print("Embedding dimension:", dim)

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


Embedding dimension: 384


In [120]:
df = aig_df.toPandas()

In [121]:
df = df[df.year.isin(['2004', '2008', '2012', '2016', '2020'])]

In [122]:
import math
import pandas as pd
from typing import List, Iterable
from uuid import uuid4
from langchain.docstore.document import Document
from langchain_community.vectorstores import FAISS

try:
    from langchain_experimental.text_splitter import SemanticChunker
    _HAS_SEM = True
except Exception:
    from langchain_text_splitters import RecursiveCharacterTextSplitter
    _HAS_SEM = False

### 1.2 Preparing Document for Injestion

In [62]:
def _make_splitter(emb):
    """
    Semantic splitter that finds natural breakpoints; falls back to
    a high-quality character-based splitter.
    """
    if _HAS_SEM:
        # Breakpoints chosen via embedding similarity changes.
        # 95th percentile is a good default for long 10-K sections.
        return SemanticChunker(
            emb,
            breakpoint_threshold_type="percentile",
            breakpoint_threshold_amount=95,
        )
    else:
        # Safe default if semantic chunker isn't available.
        return RecursiveCharacterTextSplitter(
            chunk_size=1200,
            chunk_overlap=150,
            add_start_index=True,
        )


def _is_nonempty_text(x) -> bool:
    if x is None:
        return False
    if isinstance(x, float) and math.isnan(x):
        return False
    return bool(str(x).strip())


def _iter_section_chunks(
    df: pd.DataFrame,
    splitter,
):
    """Yield chunked Documents with rich metadata from a wide SEC sections DF."""
    # discover section columns dynamically
    section_cols: List[str] = [c for c in df.columns if c.startswith("section_")]

    for row in df.itertuples(index=False):
        filename = str(getattr(row, "filename"))
        cik = str(getattr(row, "cik"))
        year = int(getattr(row, "year"))

        for sec in section_cols:
            raw_text = getattr(row, sec)
            if not _is_nonempty_text(raw_text):
                continue

            text = str(raw_text).strip()
            # parent (section) identity
            section_id = f"{filename}#{sec}"
            section_title = text.splitlines()[0][:160] if text else ""

            # split into semantic chunks
            chunks: List[str] = splitter.split_text(text)

            for idx, chunk in enumerate(chunks):
                # stable per-chunk id (handy if you later want parent-child mapping)
                doc_id = f"{section_id}::chunk{idx}"
                meta = {
                    "doc_id": doc_id,            # unique id for this chunk
                    "section_id": section_id,    # parent section id
                    "section": sec,              # e.g., "section_10"
                    "section_title": section_title,
                    "filename": filename,
                    "cik": cik,
                    "year": year,
                    "chunk_index": idx,
                }
                yield Document(page_content=chunk, metadata=meta)
                
splitter = _make_splitter(embeddings)
docs = list(_iter_section_chunks(df, splitter))

### 1.3 Creating Hybrid Search

In [63]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever

if not docs:
    raise ValueError("No non-empty section text found to index.")

vstore = FAISS.from_documents(docs, embeddings)

# Optional: persist to disk
vstore.save_local("faiss_edgar_sections_v2")

vector_retriever = vstore.as_retriever(search_kwargs={"k": 6})

# Create a BM25Retriever for keyword search
bm25_retriever = BM25Retriever.from_documents(docs, k=6)

# --- combine them ---
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, vector_retriever],
    weights=[0.5, 0.5],
)

In [70]:
# extraction_prompt = """
# ## Persona
# - Extract the exact data and present to the user

# ## Given context:
# {context}

# ## Question: 
# {question} 
# """ 

### 1.4 Prompt to extract details

In [159]:
extract_detail_v3 = """
## Role
You are an extraction analyst. Read the provided document content and metadata to extract AIG facts.

## Targets (extract EXACT text as written in the document body)
- Total Revenue
- Net income (loss) attributable to AIG
- Auditor firm (e.g., “PricewaterhouseCoopers LLP”, “KPMG LLP”, “Deloitte & Touche LLP”)

## Metadata Rules (authoritative)
- year: use year if provided in metadata; do not infer from text if metadata exists.
- section/source:
  - Prefer parent_id (e.g., "5272_2020.htm#section_9B") if present.
  - Else use section (e.g., "section_9B").
  - If neither present, use the clearest section header found in the text (e.g., "Item 7", "Item 8").

## Hints (don’t guess)
- “Total Revenue” may appear as “Total revenues”, “Consolidated total revenues”.
- “Net income (loss) attributable to AIG” might appear as “Net income attributable to AIG/common shareholders”.
- For the auditor, return the firm NAME only (not the report title).
- If any one of the three target fields (Total Revenue, Net income..., Auditor) is missing, return exactly: None

## Output (STRICT)
- Return EXACTLY one line with 5 fields separated by " || "
  1) Total Revenue
  2) Net income (loss) attributable to AIG
  3) Auditor firm
  4) year
  5) section/source (prefer parent_id; else section; else header text)
- No extra text, labels, or quotes.
- Preserve numbers/formatting as written (keep $, commas, parentheses, “million/billion”).

## Edge Rules
- If both “Net income” and “Net loss” variants appear, choose the one explicitly “attributable to AIG”.
- Prefer first unambiguous occurrence in MD&A/Financial Statements (Items 7/8) when multiple appear.
- Never infer the auditor from signatures without the firm’s name.

## Tiny Examples

[Example A — all present]
Meta: year=2019, section=section_7, parent_id=5272_2019.htm#section_7
Text: “Total revenues were $52.1 billion… Net income (loss) attributable to AIG was $(6.7) billion… audited by PricewaterhouseCoopers LLP…”
Output:
$52.1 billion || $(6.7) billion || PricewaterhouseCoopers LLP || 2019 || 5272_2019.htm#section_7

[Example B — missing a target → None]
Meta: year=2016, section=section_7A, parent_id=5272_2016.htm#section_7A
Text: “Total revenues were $39.8 billion… [no ‘net income attributable to AIG’]…”
Output:
None

## Document (body text):
{context}

## Question:
{question}
"""

In [162]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_google_genai import ChatGoogleGenerativeAI # Import the Google Generative AI class
import os

# Optional: Set your API key if it's not already in your environment variables
# from google.colab import userdata # Use this if you are in a Colab notebook
# os.environ["GOOGLE_API_KEY"] = userdata.get("GOOGLE_API_KEY")

aiplatform.init(project=PROJECT_ID, location=LOCATION)

prompt_template_v1 = ChatPromptTemplate.from_messages([
    ("human", extract_detail_v3)
])

# Choose model: "gemini-1.5-flash" (fast/cheap) or "gemini-1.5-pro" (higher quality)
llm_1 = ChatVertexAI(
    model_name="gemini-2.5-flash",
    temperature=0,
    max_output_tokens=1024,
    project=PROJECT_ID,
    location=LOCATION,
    api_transport="grpc",   # good perf
)

# # Create the RAG chain
rag_chain_v1 = (
    {"context": ensemble_retriever, "question": RunnablePassthrough()}
    | prompt_template_v1
    | llm_1
)

# Invoke the chain with a query
response = rag_chain_v1.invoke("Total revenue of aig in 2016")


In [163]:
response.content

'None'

In [165]:
# Invoke the chain with a query
response = rag_chain_v1.invoke("who is the auditor since 2003 to 2015.")
response.content

'None'

## Solution 2: 
 - Passing the document directly to LLM and extracting the data

In [7]:
# df_data = df[df.year.isin(['2016', '2017', '2018', '2019', '2020'])]

In [15]:
from langchain_community.embeddings import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
dim = len(embeddings.embed_query("dimension probe"))
print("Embedding dimension:", dim)

Embedding dimension: 384


### 2.1 Preparing Parent Document

In [123]:
from typing import List

# Identify section columns dynamically
section_cols: List[str] = [c for c in df.columns if c.startswith("section_")]

# Build parent docs (one per non-empty section cell)
parents: List[Document] = []
parent_ids: List[str] = []

for row in df.itertuples(index=False):
    base_meta = {
        "filename": getattr(row, "filename"),
        "cik": getattr(row, "cik"),
        "year": getattr(row, "year"),
    }
    for sec in section_cols:
        content = getattr(row, sec)
        if not content or not str(content).strip():
            continue

        parent_id = f"{base_meta['filename']}#{sec}"   # stable per (file, section)
        doc = Document(
            page_content=str(content),
            metadata={
                **base_meta,
                "section": sec,
                "parent_id": parent_id,
            },
        )
        parents.append(doc)
        parent_ids.append(parent_id)

print(f"Prepared {len(parents)} parent docs across {len(section_cols)} sections.")

Prepared 76 parent docs across 20 sections.


### 2.2 (optional) chunking the parent document semantically 

In [17]:
from typing import List, Dict, Tuple
from langchain.docstore.document import Document

# If you use OpenAI:
# from langchain_openai import OpenAIEmbeddings
# embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# If you use Google Gemini:
# from langchain_google_genai import GoogleGenerativeAIEmbeddings
# embeddings = GoogleGenerativeAIEmbeddings(model="text-embedding-004")

def chunk_parents_semantic(
    parents: List[Document],
    embeddings,  # any LangChain Embeddings implementation
    breakpoint_threshold_type: str = "percentile",   # "percentile" | "standard_deviation" | "interquartile"
    breakpoint_threshold_amount: float = 95,         # used when type="percentile"
    overlap_chars: int = 200,                         # optional context overlap between adjacent chunks
) -> Tuple[List[Document], List[str], Dict[str, List[str]]]:
    """
    Split each parent Document into semantically coherent child Documents.
    Returns:
      - children: list of child chunk Documents
      - children_ids: list of chunk_ids
      - children_by_parent: mapping from parent_id to list of chunk_ids
    """
    # Try to import SemanticChunker; if not available, fall back to RecursiveCharacterTextSplitter
    try:
        from langchain_text_splitters import SemanticChunker
        semantic_splitter = SemanticChunker(
            embeddings=embeddings,
            breakpoint_threshold_type=breakpoint_threshold_type,
            breakpoint_threshold_amount=breakpoint_threshold_amount,
        )
        use_semantic = True
    except Exception:
        # Fallback: character-based splitter
        from langchain_text_splitters import RecursiveCharacterTextSplitter
        semantic_splitter = RecursiveCharacterTextSplitter(
            chunk_size=8000,
            chunk_overlap=overlap_chars,
            length_function=len,
            separators=["\n\n", "\n", ". ", " ", ""],
        )
        use_semantic = False

    children: List[Document] = []
    children_ids: List[str] = []
    children_by_parent: Dict[str, List[str]] = {}

    for pdoc in parents:
        parent_id = pdoc.metadata["parent_id"]

        # Create chunk docs; prefer create_documents if supported
        try:
            chunks: List[Document] = semantic_splitter.create_documents(
                texts=[pdoc.page_content],
                metadatas=[pdoc.metadata],
            )
        except Exception:
            # Some splitters only expose split_text
            texts = semantic_splitter.split_text(pdoc.page_content)
            chunks = [Document(page_content=t, metadata=dict(pdoc.metadata)) for t in texts]

        # Optional character overlap (prefix the current chunk with the tail of the previous one)
        if overlap_chars and len(chunks) > 1:
            prev_text = ""
            for i, ch in enumerate(chunks):
                if i > 0 and prev_text:
                    prefix = prev_text[-overlap_chars:]
                    ch.page_content = prefix + ch.page_content
                prev_text = ch.page_content

        # Add stable IDs + extra metadata
        for i, ch in enumerate(chunks):
            chunk_id = f"{parent_id}::chunk-{i:04d}"
            ch.metadata.update({
                "parent_id": parent_id,
                "chunk_index": i,
                "chunk_id": chunk_id,
                "chunk_size": len(ch.page_content),
                "chunking": "semantic" if use_semantic else "recursive_fallback",
                "breakpoint_threshold_type": breakpoint_threshold_type if use_semantic else None,
                "breakpoint_threshold_amount": breakpoint_threshold_amount if use_semantic else None,
            })

            children.append(ch)
            children_ids.append(chunk_id)
            children_by_parent.setdefault(parent_id, []).append(chunk_id)

    print(
        f"Built {len(children)} child chunks from {len(parents)} parents "
        f"(avg {len(children)/max(1,len(parents)):.2f} chunks/parent). "
        f"Mode: {'semantic' if use_semantic else 'fallback (recursive)'}."
    )
    return children, children_ids, children_by_parent

children, children_ids, children_by_parent = chunk_parents_semantic(
    parents,
    embeddings=embeddings,
    breakpoint_threshold_type="percentile",
    breakpoint_threshold_amount=95,   # higher => fewer, larger chunks
    overlap_chars=200,
)

# Peek at a couple of chunks
for d in children[:3]:
    print(d.metadata["chunk_id"], d.metadata.get("section"), len(d.page_content))


Built 703 child chunks from 96 parents (avg 7.32 chunks/parent). Mode: fallback (recursive).
5272_2016.htm#section_1::chunk-0000 section_1 7994
5272_2016.htm#section_1::chunk-0001 section_1 8157
5272_2016.htm#section_1::chunk-0002 section_1 7895


### 2.3 Prompt to extract details

In [124]:
Extract_Details = """
## Role
You are an extraction analyst. Read the provided document content and metadata to extract AIG facts.

## Targets (extract EXACT text as written in the document body)
- Total Revenue
- Net income (loss) attributable to AIG
- Auditor firm (e.g., “PricewaterhouseCoopers LLP”, “KPMG LLP”, “Deloitte & Touche LLP”)

## Metadata Rules (authoritative)
- year: use {year} if provided in metadata; do not infer from text if metadata exists.
- section/source:
  - Prefer {parent_id} (e.g., "5272_2020.htm#section_9B") if present.
  - Else use {section} (e.g., "section_9B").
  - If neither present, use the clearest section header found in the text (e.g., "Item 7", "Item 8").

## Hints (don’t guess)
- “Total Revenue” may appear as “Total revenues”, “Consolidated total revenues”.
- “Net income (loss) attributable to AIG” might appear as “Net income attributable to AIG/common shareholders”.
- For the auditor, return the firm NAME only (not the report title).
- If any one of the three target fields (Total Revenue, Net income..., Auditor) is missing, return exactly: None

## Output (STRICT)
- Return EXACTLY one line with 5 fields separated by " || "
  1) Total Revenue
  2) Net income (loss) attributable to AIG
  3) Auditor firm
  4) year
  5) section/source (prefer parent_id; else section; else header text)
- No extra text, labels, or quotes.
- Preserve numbers/formatting as written (keep $, commas, parentheses, “million/billion”).

## Edge Rules
- If both “Net income” and “Net loss” variants appear, choose the one explicitly “attributable to AIG”.
- Prefer first unambiguous occurrence in MD&A/Financial Statements (Items 7/8) when multiple appear.
- Never infer the auditor from signatures without the firm’s name.

## Tiny Examples

[Example A — all present]
Meta: year=2019, section=section_7, parent_id=5272_2019.htm#section_7
Text: “Total revenues were $52.1 billion… Net income (loss) attributable to AIG was $(6.7) billion… audited by PricewaterhouseCoopers LLP…”
Output:
$52.1 billion || $(6.7) billion || PricewaterhouseCoopers LLP || 2019 || 5272_2019.htm#section_7

[Example B — missing a target → None]
Meta: year=2016, section=section_7A, parent_id=5272_2016.htm#section_7A
Text: “Total revenues were $39.8 billion… [no ‘net income attributable to AIG’]…”
Output:
None

## Document (body text):
{document}

## Metadata:
filename={filename}
year={year}
section={section}
parent_id={parent_id}
"""


### 2.4 Setting up the LLM

In [125]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_openai import ChatOpenAI
import os
from langchain_google_vertexai import ChatVertexAI
from google.cloud import aiplatform

PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT", "drift-sense")
LOCATION  = os.environ.get("GOOGLE_CLOUD_REGION", "us-central1")


In [126]:
aiplatform.init(project=PROJECT_ID, location=LOCATION)

prompt_template = ChatPromptTemplate.from_messages([
    ("human", Extract_Details)
])

# Choose model: "gemini-1.5-flash" (fast/cheap) or "gemini-1.5-pro" (higher quality)
llm = ChatVertexAI(
    model_name="gemini-2.5-flash",
    temperature=0,
    max_output_tokens=1024,
    project=PROJECT_ID,
    location=LOCATION,
    api_transport="grpc",   # good perf
)

rag_chain = prompt_template | llm

### 2.5 Running the rag model

In [127]:
extracted_details = []

In [128]:
from tqdm import tqdm
for doc in tqdm(parents):
    # Run the chain
    out = rag_chain.invoke({
        "document": doc.page_content,
        "filename": doc.metadata.get("filename"),
        "year": doc.metadata.get("year"),
        "section": doc.metadata.get("section"),
        "parent_id": doc.metadata.get("parent_id"),
    })

    # LangChain chat models usually return an AIMessage; fall back to str if needed
    content = getattr(out, "content", out)
    if content is None:
        continue

    text = str(content).strip()
    
    # Skip empty/placeholder outputs
    if not text or text.lower() in {"none", "null", "{}", "[]"}:
        continue

    # Save either the raw text or a richer record
    extracted_details.append({
        "filename": doc.metadata.get("filename"),
        "year": doc.metadata.get("year"),
        "section": doc.metadata.get("section"),
        "parent_id": doc.metadata.get("parent_id"),
        # "chunk_id": doc.metadata.get("chunk_id"),
        "extracted": text,
    })

100%|██████████| 76/76 [03:40<00:00,  2.90s/it]


In [129]:
print(f"Saved {len(extracted_details)} items")

Saved 4 items


In [135]:
extracted_details

[{'filename': '5272_2004.htm',
  'year': '2004',
  'section': 'section_7',
  'parent_id': '5272_2004.htm#section_7',
  'extracted': '$98.69 billion || $9.77 billion || PricewaterhouseCoopers LLP || 2004 || 5272_2004.htm#section_7'},
 {'filename': '5272_2012.htm',
  'year': '2012',
  'section': 'section_8',
  'parent_id': '5272_2012.htm#section_8',
  'extracted': '$68,790 million || $'},
 {'filename': '5272_2016.htm',
  'year': '2016',
  'section': 'section_8',
  'parent_id': '5272_2016.htm#section_8',
  'extracted': '$52,330 million || $(849) million || PricewaterhouseCoopers LLP || 2016 || 5272_2016.htm#section_8'},
 {'filename': '5272_2020.htm',
  'year': '2020',
  'section': 'section_8',
  'parent_id': '5272_2020.htm#section_8',
  'extracted': '$47,997 million || $(5,973) million || PricewaterhouseCoopers LLP || 2020 || 5272_2020.htm#section_8'}]

### 2.6 Extracting the details and Saving it

In [141]:
import pandas as pd

def build_df_from_extracted(recs):
    rows = []
    for r in recs:
        print(r)
        parent_id = r.get("parent_id")
        extract_details = r.get("extracted").split("||")
        if len(extract_details) == 5:
            rev_raw, net_raw, _auditor, year_str, parent = extract_details
        else:
            continue

        # derive section from the tail of parent (e.g., "...#section_8")
        section = None
        if parent and "#" in parent:
            section = parent.split("#", 1)[1]

        rows.append({
            "Total revenues": rev_raw,  # in millions
            "Net income (loss) attributable to AIG": net_raw,      # in millions
            "year": year_str,
            "auditor": _auditor,
            "filename#section": parent_id
        })
    return pd.DataFrame(rows, columns=["Total revenues", "Net income (loss) attributable to AIG", "auditor", "year", "filename#section"])

df_final = build_df_from_extracted(extracted_details)

{'filename': '5272_2004.htm', 'year': '2004', 'section': 'section_7', 'parent_id': '5272_2004.htm#section_7', 'extracted': '$98.69 billion || $9.77 billion || PricewaterhouseCoopers LLP || 2004 || 5272_2004.htm#section_7'}
{'filename': '5272_2012.htm', 'year': '2012', 'section': 'section_8', 'parent_id': '5272_2012.htm#section_8', 'extracted': '$68,790 million || $'}
{'filename': '5272_2016.htm', 'year': '2016', 'section': 'section_8', 'parent_id': '5272_2016.htm#section_8', 'extracted': '$52,330 million || $(849) million || PricewaterhouseCoopers LLP || 2016 || 5272_2016.htm#section_8'}
{'filename': '5272_2020.htm', 'year': '2020', 'section': 'section_8', 'parent_id': '5272_2020.htm#section_8', 'extracted': '$47,997 million || $(5,973) million || PricewaterhouseCoopers LLP || 2020 || 5272_2020.htm#section_8'}


In [142]:
df_final

Unnamed: 0,Total revenues,Net income (loss) attributable to AIG,auditor,year,filename#section
0,$98.69 billion,$9.77 billion,PricewaterhouseCoopers LLP,2004,5272_2004.htm#section_7
1,"$52,330 million",$(849) million,PricewaterhouseCoopers LLP,2016,5272_2016.htm#section_8
2,"$47,997 million","$(5,973) million",PricewaterhouseCoopers LLP,2020,5272_2020.htm#section_8


In [143]:
df_ground_truth = pd.read_csv("GroundTruthData_v1.csv")
df_ground_truth

Unnamed: 0,company,cik,year,variable,value_usd,unit,category_value,section
0,AIG,5272,2020,Total revenues,43736.0,USD millions,,section_8
1,AIG,5272,2016,Total revenues,52367.0,USD millions,,section_8
2,AIG,5272,2012,Total revenues,65656.0,USD millions,,section_8
3,AIG,5272,2008,Total revenues,11777.0,USD millions,,section_8
4,AIG,5272,2004,Total revenues,98615.0,USD millions,,section_8
5,AIG,5272,2020,Net income (loss) attributable to AIG,-5944.0,USD millions,,section_8
6,AIG,5272,2016,Net income (loss) attributable to AIG,3348.0,USD millions,,section_8
7,AIG,5272,2012,Net income (loss) attributable to AIG,3438.0,USD millions,,section_8
8,AIG,5272,2008,Net income (loss) attributable to AIG,-101784.0,USD millions,,section_8
9,AIG,5272,2004,Net income (loss) attributable to AIG,9731.0,USD millions,,section_8
