In [6]:
! pip install docling
! pip install langextract
import os
import textwrap
import json

import langextract as lx


from docling.document_converter import DocumentConverter

Collecting docling
  Using cached docling-2.51.0-py3-none-any.whl.metadata (10 kB)
Collecting docling-core<3.0.0,>=2.42.0 (from docling-core[chunking]<3.0.0,>=2.42.0->docling)
  Using cached docling_core-2.48.0-py3-none-any.whl.metadata (6.5 kB)
Collecting docling-parse<5.0.0,>=4.4.0 (from docling)
  Using cached docling_parse-4.4.0-cp313-cp313-win_amd64.whl.metadata (9.8 kB)
Collecting docling-ibm-models<4,>=3.9.1 (from docling)
  Using cached docling_ibm_models-3.9.1-py3-none-any.whl.metadata (6.7 kB)
Collecting easyocr<2.0,>=1.7 (from docling)
  Using cached easyocr-1.7.2-py3-none-any.whl.metadata (10 kB)
Collecting accelerate<2,>=1.0.0 (from docling)
  Using cached accelerate-1.10.1-py3-none-any.whl.metadata (19 kB)
Collecting jsonschema<5.0.0,>=4.16.0 (from docling-core<3.0.0,>=2.42.0->docling-core[chunking]<3.0.0,>=2.42.0->docling)
  Using cached jsonschema-4.25.1-py3-none-any.whl.metadata (7.6 kB)
Collecting semchunk<3.0.0,>=2.2.0 (from docling-core[chunking]<3.0.0,>=2.42.0->doc

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Take the LangExtract API key from the environment instead of hardcoding it in
# the notebook. An already-exported key is left untouched; only when none is set
# is the user prompted for one (input is not echoed, so it never lands in output).
if not os.environ.get("LANGEXTRACT_API_KEY"):
    from getpass import getpass

    os.environ["LANGEXTRACT_API_KEY"] = getpass("LANGEXTRACT_API_KEY: ")


# Convert the DOCX report with Docling and export its content as text; this text
# is what the later extraction cell consumes as `input_text`.
converter = DocumentConverter()
doc = converter.convert("sample_financial_report.docx")
input_text = doc.document.export_to_text()
# Show only the first 500 characters as a sanity check of the conversion.
print("📄 Extracted text:\n", input_text[:500])

2025-09-10 17:43:22,342 - INFO - detected formats: [<InputFormat.DOCX: 'docx'>]
2025-09-10 17:43:22,369 - INFO - Going to convert document batch...
2025-09-10 17:43:22,372 - INFO - Initializing pipeline for SimplePipeline with options hash 64efdeb459220d9bbf631bfc20c2d4cd
2025-09-10 17:43:22,373 - INFO - Processing document sample_financial_report.docx
2025-09-10 17:43:22,449 - INFO - Finished converting document sample_financial_report.docx in 0.14 sec.


📄 Extracted text:
 # Financial Report - Q1 2025

This Loan Agreement is executed on **5th January 2023** between **PQR Finance Ltd (CIN: U67890DL2018PLC222222, PAN: BBBBP4321B)** and **Sunrise Textiles Pvt Ltd** . PQR Finance has sanctioned **Loan Account No. 876543210** in favor of Sunrise Textiles, with a **principal sanctioned amount of INR 25,00,000** . The loan carries an **annual interest rate of 8.2%** , and all charges are subject to **18% GST** on processing fees. The tenure of the loan is four years, wit


In [9]:
# Task description sent to the model alongside the few-shot examples.
# Built from explicit line literals so that every newline (and the trailing
# space after "entity type:") is visible in the source.
# NOTE(review): the entity types named here (Party & Identification, Monetary
# Value, Date/Time) differ from the extraction_class values used in `examples`
# (company, date, account_number, financial_metric) -- confirm which taxonomy
# is intended.
prompt = (
    "Extract contract-related financial and party entities in order of appearance.\n"
    "Use exact text for extractions. Do not paraphrase or overlap entities.\n"
    "Map each extraction to the closest entity type: \n"
    "Party & Identification, Monetary Value, or Date/Time.\n"
    "Include contextual attributes such as type (e.g., PAN, VAT, bank account, penalty type, etc.) or currency.\n"
)


In [10]:
# Few-shot example that shows the model the expected extraction format.
# The extractions are listed in the same order in which their text occurs in the
# example sentence, so the example itself follows the prompt's "in order of
# appearance" instruction. Every extraction_text is a verbatim substring of `text`.
examples = [
    lx.data.ExampleData(
        text="On 12th March 2024, ABC Bank (CIN: U12345MH2020PLC111111, PAN: AAAPA1234A) granted a loan of INR 50,00,000 to XYZ Enterprises Ltd with account number 123456789. The loan carries an interest rate of 7.5% per annum, with GST of 18% applicable. The maturity date is 12th March 2029.",
        extractions=[
            # "On 12th March 2024, ..."
            lx.data.Extraction(
                extraction_class="date",
                extraction_text="12th March 2024",
                attributes={"type": "effective_date"}
            ),
            # "... ABC Bank (CIN: ..., PAN: ...) ..."
            lx.data.Extraction(
                extraction_class="company",
                extraction_text="ABC Bank",
                attributes={
                    "identifier_type": "CIN",
                    "identifier": "U12345MH2020PLC111111",
                    "PAN": "AAAPA1234A"
                }
            ),
            # "... granted a loan of INR 50,00,000 ..."
            lx.data.Extraction(
                extraction_class="financial_metric",
                extraction_text="INR 50,00,000",
                attributes={"metric_type": "principal_amount", "currency": "INR"}
            ),
            # "... to XYZ Enterprises Ltd ..."
            lx.data.Extraction(
                extraction_class="company",
                extraction_text="XYZ Enterprises Ltd"
            ),
            # "... with account number 123456789."
            lx.data.Extraction(
                extraction_class="account_number",
                extraction_text="123456789",
                attributes={"account_type": "loan_account"}
            ),
            # "... an interest rate of 7.5% per annum ..."
            lx.data.Extraction(
                extraction_class="financial_metric",
                extraction_text="7.5%",
                attributes={"metric_type": "interest_rate", "rate_type": "annual"}
            ),
            # "... with GST of 18% applicable."
            lx.data.Extraction(
                extraction_class="financial_metric",
                extraction_text="18%",
                attributes={"metric_type": "tax", "tax_type": "GST"}
            ),
            # "The maturity date is 12th March 2029."
            lx.data.Extraction(
                extraction_class="date",
                extraction_text="12th March 2029",
                attributes={"type": "maturity_date"}
            )
        ]
    )
]

In [11]:
# Identifier of the model that performs the extraction.
EXTRACTION_MODEL_ID = "gemini-2.5-flash"

# Run LangExtract over the converted document text, steered by the task
# description in `prompt` and the few-shot `examples`.
result = lx.extract(
    model_id=EXTRACTION_MODEL_ID,
    examples=examples,
    prompt_description=prompt,
    text_or_documents=input_text,
)

2025-09-10 17:45:02,304 - INFO - Loaded provider plugin: gemini
2025-09-10 17:45:02,308 - INFO - Loaded provider plugin: ollama
2025-09-10 17:45:02,312 - INFO - Loaded provider plugin: openai
2025-09-10 17:45:05,353 - INFO - Starting document annotation.
2025-09-10 17:45:05,369 - INFO - Processing batch 0 with length 3
2025-09-10 17:45:05,370 - INFO - AFC is enabled with max remote calls: 10.
2025-09-10 17:45:05,373 - INFO - AFC is enabled with max remote calls: 10.
2025-09-10 17:45:05,378 - INFO - AFC is enabled with max remote calls: 10.
2025-09-10 17:45:09,921 - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent "HTTP/1.1 200 OK"
2025-09-10 17:45:09,924 - INFO - AFC remote call 1 is done.
2025-09-10 17:45:12,855 - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent "HTTP/1.1 200 OK"
2025-09-10 17:45:12,858 - INFO - AFC remote call 1 is done.
2025-09-10 17:45

In [12]:
# Write the annotated result to doc_extraction.jsonl in the current directory.
lx.io.save_annotated_documents(
    [result],
    output_dir=".",
    output_name="doc_extraction.jsonl",
)

[94m[1mLangExtract[0m: Saving to [92mdoc_extraction.jsonl[0m: 1 docs [00:00, 233.69 docs/s]

[92m✓[0m Saved [1m1[0m documents to [92mdoc_extraction.jsonl[0m





In [15]:
# Build the HTML visualization from the saved JSONL file. The returned value is
# either the markup itself or an object carrying it in a `.data` attribute, so
# unwrap it before writing to disk.
html_content = lx.visualize("doc_extraction.jsonl")
html_markup = getattr(html_content, "data", html_content)
with open("doc_visualization.html", mode="w", encoding="utf-8") as html_file:
    html_file.write(html_markup)

print("✅ Extraction complete. Results saved to 'doc_extraction.jsonl' and 'doc_visualization.html'")

[94m[1mLangExtract[0m: Loading [92mdoc_extraction.jsonl[0m: 100%|█████████▉| 11.3k/11.3k [00:00<00:00, 9.03MB/s]

[92m✓[0m Loaded [1m1[0m documents from [92mdoc_extraction.jsonl[0m
✅ Extraction complete. Results saved to 'doc_extraction.jsonl' and 'doc_visualization.html'



