In [None]:
# -------- Clean Colab & set correct versions --------
# Delete the whole Hugging Face hub cache directory before installing anything.
!rm -rf /root/.cache/huggingface/hub

# upgrade build tools
!pip install -q --upgrade pip setuptools wheel

# install versions Docling likes (numpy <2, scipy >=1.14,<2)
# Both packages are pinned to exact versions that fall inside those ranges.
!pip install -q "numpy==1.26.4" "scipy==1.14.1"

# finally install docling + langextract
# NOTE(review): docling is pinned here, but a later cell runs
# `pip install -U ... docling`, which may replace this version — confirm which
# docling release the notebook is meant to run against.
!pip install -q "docling==1.17.0" langextract


In [None]:
# wipe Docling's cache so it pulls a fresh copy
# This path lives inside /root/.cache/huggingface/hub, which the setup cell
# already deletes; this cell only matters when that cell was not run.
!rm -rf /root/.cache/huggingface/hub/models--ds4sd--docling-models


In [None]:
# Install LangExtract together with its `visualize` extra. The requirement is
# quoted so the shell passes the square brackets to pip unchanged.
!pip install "langextract[visualize]"




In [6]:
#  Mount Drive & Install Packages
from google.colab import drive
drive.mount("/content/drive")

!pip install -U langextract[visualize] docling pdfplumber --quiet

import os, textwrap, time, random
import langextract as lx
from docling.document_converter import DocumentConverter

#  Read File with Docling (PDF/DOCX/PPTX)
# Input document on the mounted Google Drive; change this path to process a different file.
file_path = "/content/drive/MyDrive/Financial_Statements.pdf"
# One converter instance, created once and used by read_with_docling().
converter = DocumentConverter()

def read_with_docling(path):
    """Convert the file at ``path`` with the module-level ``converter``.

    Returns whatever ``export_to_text()`` yields for the converted document.
    """
    conversion = converter.convert(path)
    document = conversion.document
    return document.export_to_text()

full_text = read_with_docling(file_path)

#  Split PDF Text into Manageable Chunks
def split_text(text, chunk_size=5000):
    """Split ``text`` into space-joined chunks of roughly ``chunk_size`` characters.

    The text is tokenised on any whitespace, so runs of spaces, tabs and
    newlines collapse into single spaces in the output. A chunk is closed as
    soon as its running length (each word plus one separator) reaches
    ``chunk_size``; the word that crosses the threshold stays in that chunk,
    so a chunk can be slightly longer than ``chunk_size``.

    Returns a list of strings; an empty or whitespace-only input gives ``[]``.
    """
    pieces = []
    pending = []
    running_len = 0
    for token in text.split():
        pending.append(token)
        running_len += len(token) + 1  # +1 accounts for the joining space
        if running_len < chunk_size:
            continue
        # Threshold reached: emit the chunk and start a fresh one.
        pieces.append(" ".join(pending))
        pending = []
        running_len = 0
    # Whatever is left over forms the final, shorter chunk.
    if pending:
        pieces.append(" ".join(pending))
    return pieces

# Chunk the full document text; each chunk is later sent to the model as one request.
chunks = split_text(full_text, chunk_size=5000)
print(f" Total Chunks Created: {len(chunks)}")
# `batches` is simply another name for `chunks`; the extraction loop iterates over it.
batches = chunks
print(f"Total Batches: {len(batches)}")

#  Prompt & Example
# SECURITY: never store an API key in the notebook source. The key is taken
# from the LANGEXTRACT_API_KEY environment variable when it is already set;
# otherwise it is requested interactively without echoing it to the output.
# NOTE: the key that used to be hard-coded here has been exposed and should be
# revoked and replaced.
from getpass import getpass

if not os.environ.get("LANGEXTRACT_API_KEY"):
    os.environ["LANGEXTRACT_API_KEY"] = getpass("Enter your LangExtract (Gemini) API key: ")

# Task description passed to lx.extract() as `prompt_description`. It lists the
# three groups of fields to extract (parties/identifiers, monetary values,
# dates/periods) and asks for exact text spans with attributes.
prompt = textwrap.dedent("""
Extract structured financial information with the following categories:

A. Party & Identification Information
- Extract names: Company, institution, counterparty, regulator.
- Extract identifiers: Tax ID, GST/VAT, PAN, CIN, registration numbers.
- Extract account numbers: Bank, loan, investment accounts.

B. Monetary Values
- Principal amounts: Loan value, invoice total, transaction amounts.
- Fees & charges: Service fees, late fees, management charges.
- Interest rates: Fixed/floating %, APR, benchmark reference (LIBOR, SOFR).
- Taxes: VAT, GST, withholding tax.
- Penalties/fines: Early termination fee, default charges.

C. Dates & Time Periods
- Effective date: Agreement or transaction start.
- Maturity date: Loan/investment closing.
- Due dates: Payment or installment schedule.
- Tenure/duration: Loan term, lock-in, ramp-up.
- Historical dates: Transaction, invoice, settlement dates.

Use exact text spans for extraction. Each extraction must include attributes to give context. Do not paraphrase.
""")

# Few-shot example for LangExtract.
# Every `extraction_text` below occurs verbatim in the example `text`, and the
# extractions are listed in their order of appearance. Previously the text
# ended after the GSTIN, so most spans (bank, taxes, account, dates, rate) were
# absent from it, which contradicts the "exact text spans" rule in the prompt.
# The sentences after the GSTIN are illustrative wording written only to give
# those spans a source context.
examples = [
    lx.data.ExampleData(
        text=(
            "Route Mobile Limited (CIN: U72900MH2004PLC146323, PAN: AACCR7740M, GSTIN: 27AAACJ5977A1ZL) "
            "raised an invoice dated 31/10/2024. The service value is taxed at CGST @ 9% and SGST @ 9%. "
            "Payment is due on 31/10/2024 to ICICI Bank account 038805001508; "
            "late payments bear interest at 12% per annum."
        ),
        extractions=[
            lx.data.Extraction(extraction_class="party", extraction_text="Route Mobile Limited", attributes={"type":"company"}),
            lx.data.Extraction(extraction_class="identifier", extraction_text="CIN: U72900MH2004PLC146323", attributes={"id_type":"CIN"}),
            lx.data.Extraction(extraction_class="identifier", extraction_text="PAN: AACCR7740M", attributes={"id_type":"PAN"}),
            lx.data.Extraction(extraction_class="identifier", extraction_text="GSTIN: 27AAACJ5977A1ZL", attributes={"id_type":"GSTIN"}),
            # First occurrence of the date: the invoice date.
            lx.data.Extraction(extraction_class="date", extraction_text="31/10/2024", attributes={"date_type":"invoice_date"}),
            lx.data.Extraction(extraction_class="monetary", extraction_text="service value", attributes={"value_type":"service_value"}),
            lx.data.Extraction(extraction_class="tax", extraction_text="CGST @ 9%", attributes={"tax_type":"GST"}),
            lx.data.Extraction(extraction_class="tax", extraction_text="SGST @ 9%", attributes={"tax_type":"GST"}),
            # Second occurrence of the same date string: the payment due date.
            lx.data.Extraction(extraction_class="date", extraction_text="31/10/2024", attributes={"date_type":"due_date"}),
            lx.data.Extraction(extraction_class="party", extraction_text="ICICI Bank", attributes={"type":"bank"}),
            lx.data.Extraction(extraction_class="account", extraction_text="038805001508", attributes={"account_type":"bank"}),
            lx.data.Extraction(extraction_class="interest_rate", extraction_text="12% per annum", attributes={"type":"late_fee"}),
        ]
    )
]

# Retry helper with exponential backoff
def wait_with_backoff(attempt):
    """Sleep for an exponentially growing, jittered delay before a retry.

    The delay is ``2 ** attempt`` seconds plus a random 0-5 second jitter,
    capped at 300 seconds. The whole-second part of the delay is printed
    before sleeping. Returns ``None``.
    """
    base_delay = 2 ** attempt
    jitter = random.uniform(0, 5)
    delay = min(300, base_delay + jitter)
    print(f"  Waiting {int(delay)}s before retry...")
    time.sleep(delay)

# Extraction with robust retry & fallback
# Each chunk is tried up to MAX_ATTEMPTS times with the primary model.
#   * 503 / "unavailable" errors  -> back off and retry the primary model.
#   * throttling errors           -> back off longer and retry the primary model.
#   * any other error             -> try the fallback model once, then back off.
# No backoff sleep is taken after the final attempt, because nothing follows it.
MAX_ATTEMPTS = 5
# Lower-cased fragments that mark a throttling response. Gemini reports these
# as HTTP 429 / RESOURCE_EXHAUSTED, which the plain "rate limit" check missed.
RATE_LIMIT_MARKERS = ("rate limit", "429", "resource_exhausted", "quota")


def _extract_batch(batch, model_id):
    """Run a single LangExtract call on one text chunk with the given model id."""
    return lx.extract(
        text_or_documents=batch,
        prompt_description=prompt,
        examples=examples,
        model_id=model_id,
        api_key=os.environ["LANGEXTRACT_API_KEY"],
        batch_length=1,
        max_workers=1
    )


results = []
for i, batch in enumerate(batches, 1):
    success = False
    for attempt in range(MAX_ATTEMPTS):
        try:
            print(f"\n Processing batch {i}/{len(batches)} (attempt {attempt+1})...")
            result = _extract_batch(batch, "gemini-2.5-flash")
            results.append(result)
            success = True
            # Short pause between successful calls to stay gentle on the API.
            time.sleep(5)
            break
        except Exception as e:
            error_msg = str(e).lower()
            print(f"  Batch {i} failed: {e}")
            if "503" in error_msg or "unavailable" in error_msg:
                print(" Gemini overloaded, retrying...")
                backoff_step = attempt
            elif any(marker in error_msg for marker in RATE_LIMIT_MARKERS):
                print("  Rate limit hit. Waiting longer...")
                # Two extra doublings of the delay for throttling errors.
                backoff_step = attempt + 2
            else:
                print("  Unknown error, trying fallback model...")
                try:
                    result = _extract_batch(batch, "gemini-1.5-flash")
                    results.append(result)
                    success = True
                    break
                except Exception as e2:
                    print(f"  Fallback also failed: {e2}")
                    backoff_step = attempt
            # Only wait when another attempt will actually follow.
            if attempt < MAX_ATTEMPTS - 1:
                wait_with_backoff(backoff_step)
    if not success:
        print(f"  Skipping batch {i} after all retries")

# Save JSONL & Visualization
# Writes the annotated documents to financial_data.jsonl in the current
# directory, renders them with lx.visualize, and stores the HTML next to it.
if results:
    lx.io.save_annotated_documents(results, output_name="financial_data.jsonl", output_dir=".")

    html_content = lx.visualize("financial_data.jsonl")
    # lx.visualize may hand back a plain string or an object that carries the
    # markup in `.data`; accept both.
    html_markup = html_content.data if hasattr(html_content, 'data') else html_content
    # Explicit UTF-8 so non-ASCII characters in the extracted text are written
    # correctly regardless of the platform's default encoding.
    with open("financial_data_visualization.html", "w", encoding="utf-8") as f:
        f.write(html_markup)

    print("\n Completed! Output saved as financial_data.jsonl & financial_data_visualization.html")
else:
    print("\n No results saved (all batches failed).")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[0m



 Total Chunks Created: 9
Total Batches: 9

🚀 Processing batch 1/9 (attempt 1)...





🚀 Processing batch 2/9 (attempt 1)...





🚀 Processing batch 3/9 (attempt 1)...





🚀 Processing batch 4/9 (attempt 1)...





🚀 Processing batch 5/9 (attempt 1)...





🚀 Processing batch 6/9 (attempt 1)...
 ❌ Batch 6 failed: Gemini API error: 503 UNAVAILABLE. {'error': {'code': 503, 'message': 'The model is overloaded. Please try again later.', 'status': 'UNAVAILABLE'}}
 ⚠️ Gemini overloaded, retrying...
 ⏳ Waiting 4s before retry...





🚀 Processing batch 6/9 (attempt 2)...





🚀 Processing batch 7/9 (attempt 1)...





🚀 Processing batch 8/9 (attempt 1)...





🚀 Processing batch 9/9 (attempt 1)...


[94m[1mLangExtract[0m: Saving to [92mfinancial_data.jsonl[0m: 9 docs [00:00, 388.93 docs/s]

[92m✓[0m Saved [1m9[0m documents to [92mfinancial_data.jsonl[0m



[94m[1mLangExtract[0m: Loading [92mfinancial_data.jsonl[0m: 100%|██████████| 221k/221k [00:00<00:00, 27.5MB/s]

[92m✓[0m Loaded [1m9[0m documents from [92mfinancial_data.jsonl[0m

✅ Completed! Output saved as financial_data.jsonl & financial_data_visualization.html





In [7]:
from google.colab import files

# Download both generated artifacts: the JSONL first, then the HTML visualization.
for artifact_name in ("financial_data.jsonl", "financial_data_visualization.html"):
    files.download(artifact_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>