In [39]:
! pip install langextract





[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [40]:
import langextract as lx
import textwrap

In [None]:

import getpass
import os

# Never paste a real key into the notebook: it would be saved with the file.
# Keep a key that is already exported in the environment, and only prompt
# (without echoing the input) when none is set.
if not os.environ.get("LANGEXTRACT_API_KEY"):
    os.environ["LANGEXTRACT_API_KEY"] = getpass.getpass("LangExtract API key: ")


In [42]:
# Task description sent to the model alongside the few-shot examples.
# The entity types named here are the extraction_class values used in
# `examples` (company, account_number, financial_metric, date), so the
# instructions and the demonstrations agree.
prompt = textwrap.dedent("""\
Extract contract-related financial and party entities in order of appearance.
Use exact text for extractions. Do not paraphrase or overlap entities.
Map each extraction to the closest entity type:
company, account_number, financial_metric, or date.
Include contextual attributes such as type (e.g., PAN, VAT, bank account, penalty type, etc.) or currency.
""")

In [43]:
# Single few-shot example that shows the model the expected output schema.
# Every extraction_text is an exact substring of `text`, and the extractions
# are listed in the order their text appears in `text`, matching the prompt's
# "in order of appearance" instruction.
examples = [
    lx.data.ExampleData(
        text="On 12th March 2024, ABC Bank (CIN: U12345MH2020PLC111111, PAN: AAAPA1234A) granted a loan of INR 50,00,000 to XYZ Enterprises Ltd with account number 123456789. The loan carries an interest rate of 7.5% per annum, with GST of 18% applicable. The maturity date is 12th March 2029.",
        extractions=[
            lx.data.Extraction(
                extraction_class="date",
                extraction_text="12th March 2024",
                attributes={"type": "effective_date"}
            ),
            lx.data.Extraction(
                extraction_class="company",
                extraction_text="ABC Bank",
                # Identifiers quoted in parentheses after the name are carried
                # as attributes instead of separate extractions.
                attributes={
                    "identifier_type": "CIN",
                    "identifier": "U12345MH2020PLC111111",
                    "PAN": "AAAPA1234A"
                }
            ),
            lx.data.Extraction(
                extraction_class="financial_metric",
                extraction_text="INR 50,00,000",
                attributes={"metric_type": "principal_amount", "currency": "INR"}
            ),
            lx.data.Extraction(
                extraction_class="company",
                extraction_text="XYZ Enterprises Ltd"
            ),
            lx.data.Extraction(
                extraction_class="account_number",
                extraction_text="123456789",
                attributes={"account_type": "loan_account"}
            ),
            lx.data.Extraction(
                extraction_class="financial_metric",
                extraction_text="7.5%",
                attributes={"metric_type": "interest_rate", "rate_type": "annual"}
            ),
            lx.data.Extraction(
                extraction_class="financial_metric",
                extraction_text="18%",
                attributes={"metric_type": "tax", "tax_type": "GST"}
            ),
            lx.data.Extraction(
                extraction_class="date",
                extraction_text="12th March 2029",
                attributes={"type": "maturity_date"}
            )
        ]
    )
]


In [44]:
# Sample document to run the extraction on: three paragraphs separated by a
# blank line, with a single trailing newline.
input_text = "\n\n".join([
    "On September 1, 2024, HDFC Bank Ltd (HDFCBANK.NS) announced that it had disbursed loans worth ₹5,200 crore under its retail lending program. The loans carried an average interest rate of 8.5% per annum, with a tenure of 5 years. The announcement followed the Reserve Bank of India’s notification requiring banks to disclose retail lending data by the end of Q2 FY2024.",
    "Meanwhile, in global markets, the US dollar traded at 1.08 against the euro and 146.5 per Japanese yen, sparking concerns of possible intervention by the Bank of Japan. The S&P 500 index closed at 4,520 points, gaining 1.2% on optimism around tech stocks, while Infosys Ltd (INFY) shares rose 2.3% in New York trading, settling at $18.40 per share.",
    "In corporate filings, Reliance Industries Ltd (RELIANCE.NS) reported a net profit of ₹16,200 crore for the quarter ending June 30, 2024, compared to ₹14,800 crore in the same quarter last year. The company also mentioned pending GST liabilities of ₹1,200 crore and penalty provisions of ₹250 crore related to its telecom business. Its CIN number L17110MH1973PLC019786 was cited in the regulatory filing with the Ministry of Corporate Affairs.",
]) + "\n"



In [45]:
# Run the extraction over the sample text, passing the prompt and the
# few-shot examples to the selected model.
result = lx.extract(
    model_id="gemini-2.5-flash",
    examples=examples,
    prompt_description=prompt,
    text_or_documents=input_text,
)




In [46]:
# Save the annotated document as JSONL in the current directory; the
# visualization step reads this file back by name.
lx.io.save_annotated_documents(
    [result],
    output_dir=".",
    output_name="extraction_results.jsonl",
)


[94m[1mLangExtract[0m: Saving to [92mextraction_results.jsonl[0m: 1 docs [00:00, 330.31 docs/s]

[92m✓[0m Saved [1m1[0m documents to [92mextraction_results.jsonl[0m





In [47]:

# Build the visualization from the JSONL file saved earlier.
html_content = lx.visualize("extraction_results.jsonl")

# lx.visualize may hand back a display object that carries its markup in
# `.data` (Jupyter/Colab) or a plain string; write whichever one we got.
with open("visualization.html", "w", encoding="utf-8") as html_file:
    html_file.write(getattr(html_content, "data", html_content))

print("✅ Extraction complete. Results saved to 'extraction_results.jsonl' and visualization.html")


[94m[1mLangExtract[0m: Loading [92mextraction_results.jsonl[0m: 100%|█████████▉| 7.70k/7.70k [00:00<00:00, 240kB/s]

[92m✓[0m Loaded [1m1[0m documents from [92mextraction_results.jsonl[0m
✅ Extraction complete. Results saved to 'extraction_results.jsonl' and visualization.html



