In [11]:
!pip install langextract



In [12]:
!pip install docling



In [13]:
import langextract as lx
import textwrap

In [14]:
# Task description passed to LangExtract: the extraction rules followed by the
# three entity categories and their sub-types. The text already starts at
# column 0, so it is written as a plain literal (the leading backslash drops
# the first newline).
prompt = """\
Extract entities related to Party & Identification Information, Monetary Values, and Dates & Time Periods.
Follow these rules:
1. Use exact text spans from the input, no paraphrasing.
2. Do not overlap entities.
3. Provide meaningful attributes to add context.
4. Categories:
   - Party & Identification Information:
     • Entity Names: Company, institution, counterparty, regulator
     • Identifiers: Tax ID, GST/VAT, PAN, CIN, registration numbers
     • Account Numbers: Bank account, loan account, investment account IDs
   - Monetary Values:
     • Principal Amounts: Loan value, invoice total, transaction amount
     • Fees & Charges: Service fees, late fees, management charges
     • Interest Rates: Fixed/floating, annual %, APR, benchmark reference (LIBOR, SOFR)
     • Taxes: VAT, GST, withholding tax
     • Penalties/Fines: Early termination fee, default charges
   - Dates & Time Periods:
     • Effective Date, Maturity Date, Due Dates, Tenure/Duration
     • Historical Dates: Transaction date, invoice date, settlement date
"""


In [15]:
from docling.document_converter import DocumentConverter

# Location of the document whose text is used as the few-shot example
# (a local path here; the original note says a URL is accepted as well).
source = "/content/langextractdock.pdf"

# Run the Docling conversion, then export the converted document as a
# single Markdown string.
converter = DocumentConverter()
result = converter.convert(source)
doc = result.document.export_to_markdown()

In [16]:
# Show the Markdown string exported from the converted document so the
# example text can be checked before it is used as a few-shot example.
doc

'"On March 15, 2023, HDFC Bank Ltd reported a loan disbursement of INR 5,00,000 under account number 123456789. The loan tenure is 5 years with an interest rate of 7.5% per annum.",'

In [17]:
# Few-shot example for LangExtract, built on the text converted from the PDF.
# Every `extraction_text` is a verbatim span of `doc`, and the extractions are
# listed in the order the spans occur in that text (date, company, amount,
# account number, tenure, interest rate), which is the ordering the LangExtract
# documentation recommends for examples.
examples = [
    lx.data.ExampleData(
        text=doc,
        extractions=[
            lx.data.Extraction(
                extraction_class="date",
                extraction_text="March 15, 2023",
                attributes={"date_type":"transaction date"}
            ),
            lx.data.Extraction(
                extraction_class="company",
                extraction_text="HDFC Bank Ltd"
            ),
            lx.data.Extraction(
                extraction_class="monetary_value",
                extraction_text="INR 5,00,000",
                attributes={"value_type":"principal amount","currency":"INR"}
            ),
            lx.data.Extraction(
                extraction_class="account_number",
                extraction_text="123456789",
                attributes={"account_type":"loan account"}
            ),
            lx.data.Extraction(
                extraction_class="duration",
                extraction_text="5 years",
                attributes={"duration_type":"loan tenure"}
            ),
            lx.data.Extraction(
                extraction_class="interest_rate",
                extraction_text="7.5% per annum",
                attributes={"rate_type":"fixed"}
            ),
        ]
    )
]


In [18]:
# Text to run the extraction on: five one-sentence financial scenarios,
# joined with newlines (no trailing newline).
input_text = "\n".join([
    "On Jan 10, 2024, ICICI Bank sanctioned a home loan of INR 25,00,000 under loan account 987654321 with a tenure of 15 years and an annual interest rate of 8%.",
    "On Feb 5, 2024, Axis Bank imposed a late fee of INR 2,000 on credit card account 1122334455.",
    "Reliance Industries Ltd paid GST of INR 50,000 on March 12, 2023, under GSTIN 27AAACR5055K1ZL.",
    "SEBI fined ABC Securities INR 1,50,000 on Aug 18, 2022, for regulatory violations.",
    "On Dec 1, 2023, Tata Motors issued an invoice of INR 12,75,000 with a payment due date of Jan 15, 2024.",
])


In [19]:
import os
from getpass import getpass

# Credentials must never be stored in the notebook. The key is taken from the
# LANGEXTRACT_API_KEY environment variable; if it is not set, prompt for it
# without echoing so it never lands in the saved source or output.
# NOTE: a key that was previously committed in this cell is exposed and
# should be revoked/rotated.
if not os.environ.get("LANGEXTRACT_API_KEY"):
    os.environ["LANGEXTRACT_API_KEY"] = getpass("LANGEXTRACT_API_KEY: ")

# Run the extraction over `input_text`, guided by the prompt and the few-shot
# examples. Note that this rebinds `result`, which earlier held the Docling
# conversion result.
result = lx.extract(
    text_or_documents=input_text,
    prompt_description=prompt,
    examples=examples,
    model_id="gemini-2.5-flash"
)



In [20]:
# Persist the annotated extraction result to ./extraction_results.json.
lx.io.save_annotated_documents(
    [result],
    output_name="extraction_results.json",
    output_dir=".",
)


[94m[1mLangExtract[0m: Saving to [92mextraction_results.json[0m: 1 docs [00:00, 425.13 docs/s]

[92m✓[0m Saved [1m1[0m documents to [92mextraction_results.json[0m





In [21]:
# Build the visualization from the saved extraction results.
html_content = lx.visualize("extraction_results.json")

# In Colab/Jupyter the returned object carries the markup in `.data`;
# otherwise the return value is written as-is.
html_text = html_content.data if hasattr(html_content, 'data') else html_content

# Write with an explicit UTF-8 encoding: the default for open() depends on the
# platform locale and can fail or corrupt non-ASCII characters in the markup.
with open("visualization.html", "w", encoding="utf-8") as f:
    f.write(html_text)

[94m[1mLangExtract[0m: Loading [92mextraction_results.json[0m: 100%|██████████| 6.09k/6.09k [00:00<00:00, 9.22MB/s]

[92m✓[0m Loaded [1m1[0m documents from [92mextraction_results.json[0m





In [22]:
from IPython.display import HTML, display

# Render the visualization inline: use the `.data` attribute when the object
# has one, otherwise pass the object itself to HTML().
markup = getattr(html_content, 'data', html_content)
display(HTML(markup))
