In [1]:
!pip install langextract docling pymupdf

Collecting langextract
  Downloading langextract-1.0.9-py3-none-any.whl.metadata (19 kB)
Collecting docling
  Downloading docling-2.51.0-py3-none-any.whl.metadata (10 kB)
Collecting pymupdf
  Downloading pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting async_timeout>=4.0.0 (from langextract)
  Downloading async_timeout-5.0.1-py3-none-any.whl.metadata (5.1 kB)
Collecting exceptiongroup>=1.1.0 (from langextract)
  Downloading exceptiongroup-1.3.0-py3-none-any.whl.metadata (6.7 kB)
Collecting ml-collections>=0.1.0 (from langextract)
  Downloading ml_collections-1.1.0-py3-none-any.whl.metadata (22 kB)
Collecting docling-core<3.0.0,>=2.42.0 (from docling-core[chunking]<3.0.0,>=2.42.0->docling)
  Downloading docling_core-2.47.0-py3-none-any.whl.metadata (6.5 kB)
Collecting docling-parse<5.0.0,>=4.4.0 (from docling)
  Downloading docling_parse-4.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.5 kB)
Collecting docling-ibm-models<4,>=3.9

In [3]:
import getpass
import os
import textwrap

import langextract as lx
from docling.document_converter import DocumentConverter

In [5]:
# Source document to analyse; change this one constant to process a different PDF.
PDF_PATH = "Alpha Technologies Pvt. Ltd.pdf"

# Fail fast with a clear message instead of a converter-internal error.
if not os.path.isfile(PDF_PATH):
    raise FileNotFoundError(
        f"Input PDF not found: {PDF_PATH!r}. Upload it to the working directory first."
    )

converter = DocumentConverter()
result = converter.convert(PDF_PATH)

# Markdown rendering of the converted document; this is the text handed to lx.extract.
full_text = result.document.export_to_markdown()

# An empty export would make the extraction run pointless, so stop here.
if not full_text.strip():
    raise ValueError(f"No text could be extracted from {PDF_PATH!r}.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
# Instructions sent to the extraction model: the entity categories to look for,
# grouped into three families, plus the rule that spans must be copied verbatim.
# The literal is indented for readability; textwrap.dedent strips that indent again.
prompt = textwrap.dedent(
    """
    Extract structured financial entities from the document.

    1. Party & Identification Information
    - Entity Names: Company, institution, counterparty, regulator
    - Identifiers: Tax ID, GST/VAT, PAN, CIN, registration numbers
    - Account Numbers: Bank account, loan account, investment account IDs

    2. Monetary Values
    - Principal Amounts: Loan value, invoice total, transaction amount
    - Fees & Charges: Service fees, late fees, management charges
    - Interest Rates: Fixed/floating, annual %, APR, benchmark reference (e.g., LIBOR, SOFR)
    - Taxes: VAT, GST, withholding tax
    - Penalties / Fines: Early termination fee, default charges

    3. Dates & Time Periods
    - Effective Date: Start of agreement or transaction
    - Maturity Date: Loan/investment closing
    - Due Dates: Payment due, installment schedule
    - Tenure/Duration: Loan term, lock-in period, ramp-up period
    - Historical Dates: Transaction date, invoice date, settlement date

    Use exact text spans. Do not paraphrase.
    Add attributes like currency, time_period, benchmark, account_type, or context if available.
    """
)

In [9]:
# Few-shot examples that show the model which classes to emit and what a span looks like.
# Two rules are kept throughout, matching the prompt's "Use exact text spans" instruction:
#   * every extraction_text is copied verbatim from the example's text, and
#   * extractions are listed in the order in which they appear in that text.
examples = [
    # Loan agreement: parties, amount, rate, dates, tenure and company identifiers.
    lx.data.ExampleData(
        text="On Jan 1, 2023, XYZ Bank issued a loan of ₹5,00,000 to Alpha Technologies Pvt. Ltd. with an annual interest rate of 8% fixed, payable in monthly installments. The loan has a tenure of 5 years and will mature on Dec 31, 2027. The company's CIN is U12345TN2020PTC111111 and PAN is AABCT1234F.",
        extractions=[
            lx.data.Extraction(extraction_class="effective_date", extraction_text="Jan 1, 2023"),
            lx.data.Extraction(extraction_class="entity_name", extraction_text="XYZ Bank"),
            lx.data.Extraction(extraction_class="principal_amount", extraction_text="₹5,00,000"),
            lx.data.Extraction(extraction_class="entity_name", extraction_text="Alpha Technologies Pvt. Ltd."),
            lx.data.Extraction(extraction_class="interest_rate", extraction_text="8%"),
            lx.data.Extraction(extraction_class="tenure", extraction_text="5 years"),
            lx.data.Extraction(extraction_class="maturity_date", extraction_text="Dec 31, 2027"),
            # The text reads "CIN is ..." / "PAN is ...", so the spans use that exact wording.
            lx.data.Extraction(extraction_class="identifier", extraction_text="CIN is U12345TN2020PTC111111"),
            lx.data.Extraction(extraction_class="identifier", extraction_text="PAN is AABCT1234F"),
        ],
    ),
    # Invoice: document number, invoice date, payer, total, tax and due date.
    lx.data.ExampleData(
        text="Invoice No: INV-9087 dated Feb 15, 2024 shows that ABC Corporation paid a total amount of $12,500 including a GST of $1,500. The due date for payment was Mar 15, 2024.",
        extractions=[
            lx.data.Extraction(extraction_class="identifier", extraction_text="Invoice No: INV-9087"),
            lx.data.Extraction(extraction_class="historical_date", extraction_text="Feb 15, 2024"),
            lx.data.Extraction(extraction_class="entity_name", extraction_text="ABC Corporation"),
            lx.data.Extraction(extraction_class="principal_amount", extraction_text="$12,500"),
            lx.data.Extraction(extraction_class="tax", extraction_text="GST of $1,500"),
            lx.data.Extraction(extraction_class="due_date", extraction_text="Mar 15, 2024"),
        ],
    ),
    # Floating-rate investment: benchmark-linked rate and maturity.
    lx.data.ExampleData(
        text="On Apr 1, 2024, DEF Investments Ltd. invested €2,000,000 in a floating rate bond linked to SOFR + 2% with a maturity on Apr 1, 2029.",
        extractions=[
            lx.data.Extraction(extraction_class="effective_date", extraction_text="Apr 1, 2024"),
            lx.data.Extraction(extraction_class="entity_name", extraction_text="DEF Investments Ltd."),
            lx.data.Extraction(extraction_class="principal_amount", extraction_text="€2,000,000"),
            lx.data.Extraction(extraction_class="interest_rate", extraction_text="SOFR + 2%"),
            lx.data.Extraction(extraction_class="maturity_date", extraction_text="Apr 1, 2029"),
        ],
    ),
    # Transfer between two parties with a tax identifier and withholding tax.
    lx.data.ExampleData(
        text="On Jul 10, 2023, GHI Pvt. Ltd. (Tax ID: TIN987654) transferred $50,000 to JKL Ltd., subject to a withholding tax of 10%.",
        extractions=[
            lx.data.Extraction(extraction_class="historical_date", extraction_text="Jul 10, 2023"),
            lx.data.Extraction(extraction_class="entity_name", extraction_text="GHI Pvt. Ltd."),
            lx.data.Extraction(extraction_class="identifier", extraction_text="Tax ID: TIN987654"),
            lx.data.Extraction(extraction_class="principal_amount", extraction_text="$50,000"),
            lx.data.Extraction(extraction_class="entity_name", extraction_text="JKL Ltd."),
            lx.data.Extraction(extraction_class="tax", extraction_text="withholding tax of 10%"),
        ],
    ),
    # Late-payment penalty: charge date versus the missed due date.
    lx.data.ExampleData(
        text="MNO Corp. was charged a late payment penalty of ₹10,000 on Aug 5, 2024 for not clearing the installment due on Jul 31, 2024.",
        extractions=[
            lx.data.Extraction(extraction_class="entity_name", extraction_text="MNO Corp."),
            lx.data.Extraction(extraction_class="penalty_fine", extraction_text="₹10,000"),
            lx.data.Extraction(extraction_class="historical_date", extraction_text="Aug 5, 2024"),
            lx.data.Extraction(extraction_class="due_date", extraction_text="Jul 31, 2024"),
        ],
    ),
]

In [10]:
# Never hard-code credentials in a notebook: they leak through version control and
# shared copies. A key that was previously embedded here must be treated as
# compromised and revoked.
# Reuse LANGEXTRACT_API_KEY when the environment already provides it; otherwise
# prompt for it without echoing the value into the notebook.
if not os.environ.get("LANGEXTRACT_API_KEY"):
    os.environ["LANGEXTRACT_API_KEY"] = getpass.getpass(
        "Enter your LangExtract (Gemini) API key: "
    )

In [11]:
# Tuning knobs for the extraction run, collected in one place so they are easy to adjust.
extract_options = dict(
    model_id="gemini-2.5-flash",
    extraction_passes=3,
    max_workers=20,
    max_char_buffer=1500,
)

# Run the extraction over the converted document, guided by the prompt and the examples.
extraction_result = lx.extract(
    text_or_documents=full_text,
    prompt_description=prompt,
    examples=examples,
    **extract_options,
)



In [13]:
# Persist the single extraction result as a JSONL file in the current directory.
annotated_docs = [extraction_result]
lx.io.save_annotated_documents(annotated_docs, output_name="financial_extractions.jsonl", output_dir=".")

[94m[1mLangExtract[0m: Saving to [92mfinancial_extractions.jsonl[0m: 1 docs [00:00, 405.64 docs/s]

[92m✓[0m Saved [1m1[0m documents to [92mfinancial_extractions.jsonl[0m





In [15]:
# Build the visualization from the saved JSONL file.
html_content = lx.visualize("financial_extractions.jsonl")

with open("financial_visualization.html", "w", encoding="utf-8") as out_file:
    # The returned object may carry its markup in a `.data` attribute;
    # when it does not, the object itself is stringified instead.
    markup = getattr(html_content, "data", html_content)
    out_file.write(str(markup))

[94m[1mLangExtract[0m: Loading [92mfinancial_extractions.jsonl[0m: 100%|██████████| 6.13k/6.13k [00:00<00:00, 6.38MB/s]

[92m✓[0m Loaded [1m1[0m documents from [92mfinancial_extractions.jsonl[0m





In [17]:
# Browser downloads are only available inside Google Colab. Guard the import so the
# notebook still runs to completion elsewhere; both files are already on disk.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    for output_name in ("financial_visualization.html", "financial_extractions.jsonl"):
        files.download(output_name)
else:
    print("Not running in Colab; outputs were written to the current directory.")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>