In [8]:
!pip install langextract




In [9]:
import langextract as lx
import textwrap, os


In [10]:
# 1. Define the extraction prompt
# Each instruction is kept as its own string; the pieces are joined with
# newlines and a trailing newline is appended before dedenting.
prompt = textwrap.dedent(
    "\n".join(
        [
            "Extract the following entities precisely from the text:",
            "1. Party & Identification: Company, institution, counterparty, regulator, Tax ID, GST/VAT, PAN, CIN, registration numbers, bank/loan/investment account IDs.",
            "2. Monetary Values: Principal amounts (loan, invoice, transaction amounts), fees/charges, interest rates (fixed/floating, annual %, APR, LIBOR/SOFR references), taxes (VAT, GST, withholding), penalties.",
            "3. Dates & Time Periods: Effective date, maturity date, payment due dates, tenure/duration, historical dates (transaction, invoice, settlement).",
            "Use exact spans from the text. Provide meaningful attributes (e.g., type: 'loan value', 'interest rate', etc.).",
        ]
    )
    + "\n"
)

In [11]:
# 2. Provide a guiding example
# One worked example: a short loan sentence paired with the spans that should
# be pulled out of it, in the order they appear in the text.
examples = [
    lx.data.ExampleData(
        text=(
            "ABC Ltd. with PAN AAAPL1234C took a loan of ₹50,00,000 at 12% annual interest. "
            "The effective date is 01-Apr-2023 and maturity date is 31-Mar-2028."
        ),
        extractions=[
            # Party, with its tax identifier carried as an attribute.
            lx.data.Extraction(
                extraction_class="Company",
                extraction_text="ABC Ltd.",
                attributes=dict(identifier="PAN AAAPL1234C"),
            ),
            # Monetary values.
            lx.data.Extraction(
                extraction_class="Principal Amount",
                extraction_text="₹50,00,000",
                attributes=dict(type="loan value"),
            ),
            lx.data.Extraction(
                extraction_class="Interest Rate",
                extraction_text="12% annual interest",
                attributes=dict(type="fixed"),
            ),
            # Dates (no attributes supplied for these).
            lx.data.Extraction(extraction_class="Effective Date", extraction_text="01-Apr-2023"),
            lx.data.Extraction(extraction_class="Maturity Date", extraction_text="31-Mar-2028"),
        ],
    )
]

In [42]:
import os

# Resolve the LangExtract API key.
# An already-exported environment variable wins. Colab Secrets are NOT part of
# the process environment, so when the variable is unset (or empty) the key is
# looked up through google.colab.userdata instead.
api_key = os.getenv("LANGEXTRACT_API_KEY")

if not api_key:
    try:
        from google.colab import userdata
    except ImportError:
        # Not running inside Colab: the environment variable was the only source.
        api_key = None
    else:
        try:
            api_key = userdata.get("LANGEXTRACT_API_KEY")
        except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
            # The secret does not exist, or this notebook was not granted access.
            api_key = None

if not api_key:
    raise ValueError("API key not found. Did you set it in Tools → Secrets with name LANGEXTRACT_API_KEY?")

# Set it for LangExtract
os.environ["LANGEXTRACT_API_KEY"] = api_key

print("API key loaded securely ✅")


API key loaded securely ✅


In [21]:
# Text to run the extraction on. It is defined before the call so that this
# cell works on a fresh kernel (Restart & Run All).
input_text = "XYZ Bank sanctioned a loan account 123456 to DEF Pvt. Ltd. with GSTIN 29ABCDE1234F1Z5. The principal amount is ₹10,00,000 with 10.5% floating interest. Payment due date: 15-Sep-2025."

# Run the extraction once, using the `prompt` and `examples` defined earlier
# in the notebook.
result = lx.extract(
    text_or_documents=input_text,
    prompt_description=prompt,
    examples=examples,
    model_id="gemini-2.5-flash",
)


In [44]:
# Persist the single annotated document as JSON Lines in the working directory.
lx.io.save_annotated_documents(
    [result], output_name="extraction_results.jsonl", output_dir="."
)


[94m[1mLangExtract[0m: Saving to [92mextraction_results.jsonl[0m: 1 docs [00:00, 1072.16 docs/s]

[92m✓[0m Saved [1m1[0m documents to [92mextraction_results.jsonl[0m





In [29]:
from IPython.display import display

# Build the visualization in this cell from the saved results file, so that
# display() does not rely on a variable that only a later cell creates.
html_content = lx.visualize("extraction_results.jsonl")
display(html_content)


In [46]:
html_content = lx.visualize("extraction_results.jsonl")

# lx.visualize may return an IPython HTML object (markup stored in `.data`) or
# a plain string. Calling str() on the HTML object would write its repr
# ("<IPython.core.display.HTML object>") instead of the markup.
html_str = html_content.data if hasattr(html_content, "data") else html_content

with open("visualization.html", "w", encoding="utf-8") as f:
    f.write(html_str)

print("✅ visualization.html saved successfully")


[94m[1mLangExtract[0m: Loading [92mextraction_results.jsonl[0m: 100%|██████████| 1.32k/1.32k [00:00<00:00, 2.86MB/s]

[92m✓[0m Loaded [1m1[0m documents from [92mextraction_results.jsonl[0m
✅ visualization.html saved successfully





In [47]:
from google.colab import files
# Ask Colab to download the visualization.html file written earlier.
# NOTE(review): google.colab is only importable inside a Colab runtime.
files.download("visualization.html")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [48]:
# Peek at the saved JSONL via the shell to sanity-check what was written.
!head extraction_results.jsonl


{"extractions": [{"extraction_class": "Company", "extraction_text": "XYZ Bank", "char_interval": {"start_pos": 0, "end_pos": 8}, "alignment_status": "match_exact", "extraction_index": 1, "group_index": 0, "description": null, "attributes": {"identifier": "loan account 123456"}}, {"extraction_class": "Company", "extraction_text": "DEF Pvt. Ltd.", "char_interval": {"start_pos": 45, "end_pos": 58}, "alignment_status": "match_exact", "extraction_index": 2, "group_index": 1, "description": null, "attributes": {"identifier": "GSTIN 29ABCDE1234F1Z5"}}, {"extraction_class": "Principal Amount", "extraction_text": "₹10,00,000", "char_interval": {"start_pos": 111, "end_pos": 121}, "alignment_status": "match_exact", "extraction_index": 3, "group_index": 2, "description": null, "attributes": {"type": "loan value"}}, {"extraction_class": "Interest Rate", "extraction_text": "10.5% floating interest", "char_interval": {"start_pos": 127, "end_pos": 150}, "alignment_status": "match_exact", "extraction_i

In [49]:
from IPython.display import HTML

# Generate visualization
html_content = lx.visualize("extraction_results.jsonl")

# Take the markup directly: an IPython HTML object exposes it as `.data`,
# while a plain string (returned outside a notebook front end) is already the
# markup. Re-wrapping in HTML(...) is unnecessary and fails on a plain string.
html_str = html_content.data if hasattr(html_content, "data") else html_content

# Save to file
with open("visualization.html", "w", encoding="utf-8") as f:
    f.write(html_str)

print("✅ visualization.html saved successfully")


[94m[1mLangExtract[0m: Loading [92mextraction_results.jsonl[0m: 100%|██████████| 1.32k/1.32k [00:00<00:00, 1.81MB/s]

[92m✓[0m Loaded [1m1[0m documents from [92mextraction_results.jsonl[0m
✅ visualization.html saved successfully





In [50]:
from google.colab import files
# Ask Colab to download the regenerated visualization.html file.
# NOTE(review): google.colab is only importable inside a Colab runtime.
files.download("visualization.html")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>