In [None]:
!pip install langextract


Collecting langextract
  Downloading langextract-1.0.9-py3-none-any.whl.metadata (19 kB)
Collecting async_timeout>=4.0.0 (from langextract)
  Downloading async_timeout-5.0.1-py3-none-any.whl.metadata (5.1 kB)
Collecting exceptiongroup>=1.1.0 (from langextract)
  Downloading exceptiongroup-1.3.0-py3-none-any.whl.metadata (6.7 kB)
Collecting ml-collections>=0.1.0 (from langextract)
  Downloading ml_collections-1.1.0-py3-none-any.whl.metadata (22 kB)
Downloading langextract-1.0.9-py3-none-any.whl (106 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.2/106.2 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading async_timeout-5.0.1-py3-none-any.whl (6.2 kB)
Downloading exceptiongroup-1.3.0-py3-none-any.whl (16 kB)
Downloading ml_collections-1.1.0-py3-none-any.whl (76 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.7/76.7 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ml-collections, exceptiongroup, async

In [5]:
import langextract as lx
import textwrap
import os

# Supply the LangExtract API key without storing it in the notebook source.
# The key is taken from the LANGEXTRACT_API_KEY environment variable when it is
# already set; otherwise it is requested interactively (input is not echoed and
# is not saved with the cell).
# NOTE(review): a literal key was previously committed in this cell — treat that
# key as compromised and revoke/rotate it.
if not os.environ.get("LANGEXTRACT_API_KEY"):
    from getpass import getpass

    os.environ["LANGEXTRACT_API_KEY"] = getpass("LangExtract API key: ")
# Task description handed to the model: the three groups of financial facts to
# extract (parties/identifiers, monetary values, dates/periods) plus the rule
# that every extraction must quote the source text verbatim and carry attributes.
# The leading backslash keeps the text from starting with an empty line.
prompt = textwrap.dedent(
    """\
Extract structured financial information with the following categories:

A. Party & Identification Information
- Extract names: Company, institution, counterparty, regulator.
- Extract identifiers: Tax ID, GST/VAT, PAN, CIN, registration numbers.
- Extract account numbers: Bank, loan, investment accounts.

B. Monetary Values
- Principal amounts: Loan value, invoice total, transaction amounts.
- Fees & charges: Service fees, late fees, management charges.
- Interest rates: Fixed/floating %, APR, benchmark reference (LIBOR, SOFR).
- Taxes: VAT, GST, withholding tax.
- Penalties/fines: Early termination fee, default charges.

C. Dates & Time Periods
- Effective date: Agreement or transaction start.
- Maturity date: Loan/investment closing.
- Due dates: Payment or installment schedule.
- Tenure/duration: Loan term, lock-in, ramp-up.
- Historical dates: Transaction, invoice, settlement dates.

Use exact text spans for extraction. Each extraction must include attributes to give context. Do not paraphrase.
"""
)

# Few-shot example that shows the model the extraction classes and attribute
# keys to produce. Every extraction_text is a verbatim span of the example text,
# and the extractions are listed in the order the spans appear in that text,
# which is what LangExtract's guidance on examples recommends (out-of-order or
# paraphrased examples can degrade span alignment).
# NOTE(review): this example text is identical to the first paragraph of
# input_text, so that paragraph is effectively pre-answered — consider using a
# distinct document as the example.
examples = [
    lx.data.ExampleData(
        text="Route Mobile Limited (CIN: U72900MH2004PLC146323, PAN: AACCR7740M, GSTIN: 27AAACJ5977A1ZL) issued Tax Invoice No. RML/INV/2024/145 on 31/10/2024 with a due date of 31/10/2024. The invoice included CGST @ 9% and SGST @ 9% on the service value. Payments are to be made to ICICI Bank, Account Number 038805001508, IFSC Code ICIC0001959. In case of non-payment by the due date, an interest of 12% per annum will be charged as late fees.",
        extractions=[
            lx.data.Extraction(extraction_class="party", extraction_text="Route Mobile Limited", attributes={"type":"company"}),
            lx.data.Extraction(extraction_class="identifier", extraction_text="CIN: U72900MH2004PLC146323", attributes={"id_type":"CIN"}),
            lx.data.Extraction(extraction_class="identifier", extraction_text="PAN: AACCR7740M", attributes={"id_type":"PAN"}),
            lx.data.Extraction(extraction_class="identifier", extraction_text="GSTIN: 27AAACJ5977A1ZL", attributes={"id_type":"GSTIN"}),
            # The same date string occurs twice in the text: first as the invoice date, then as the due date.
            lx.data.Extraction(extraction_class="date", extraction_text="31/10/2024", attributes={"date_type":"invoice_date"}),
            lx.data.Extraction(extraction_class="date", extraction_text="31/10/2024", attributes={"date_type":"due_date"}),
            lx.data.Extraction(extraction_class="tax", extraction_text="CGST @ 9%", attributes={"tax_type":"GST"}),
            lx.data.Extraction(extraction_class="tax", extraction_text="SGST @ 9%", attributes={"tax_type":"GST"}),
            lx.data.Extraction(extraction_class="monetary", extraction_text="service value", attributes={"value_type":"service_value"}),
            lx.data.Extraction(extraction_class="party", extraction_text="ICICI Bank", attributes={"type":"bank"}),
            lx.data.Extraction(extraction_class="account", extraction_text="038805001508", attributes={"account_type":"bank"}),
            lx.data.Extraction(extraction_class="interest_rate", extraction_text="12% per annum", attributes={"type":"late_fee"}),
        ]
    ),
    # ... include other ExampleData entries here (use corrected syntax as before)
]

# Document to run the extraction on: five blank-line-separated paragraphs
# (a tax invoice, a bond issue, a SOFR-linked trade finance announcement,
# a SARFAESI demand/auction notice, and an NCD issue).
# The text contains non-ASCII characters (the rupee sign), so anything written
# out from it should use a Unicode-capable encoding.
input_text = """Route Mobile Limited (CIN: U72900MH2004PLC146323, PAN: AACCR7740M, GSTIN: 27AAACJ5977A1ZL) issued Tax Invoice No. RML/INV/2024/145 on 31/10/2024 with a due date of 31/10/2024. The invoice included CGST @ 9% and SGST @ 9% on the service value. Payments are to be made to ICICI Bank, Account Number 038805001508, IFSC Code ICIC0001959. In case of non-payment by the due date, an interest of 12% per annum will be charged as late fees.

HDFC Bank Limited issued its 7.70% Fixed Rate Senior Unsecured Bonds (ISIN: INE040A08641) on the allotment date of 18 November 2022. The bonds have a face value of ₹10,00,000 each and carry a fixed coupon of 7.70% per annum payable annually. The maturity date for the bonds is 18 November 2025.

On 07 December 2021, HSBC India announced the completion of its inaugural trade finance transaction linked to the Secured Overnight Financing Rate (SOFR) benchmark.

Axis Bank Limited, acting under the SARFAESI Act, issued a demand notice on 28-02-2023 against borrower accounts LTR000305252822 and PHR044803663450. Physical possession of the secured asset was taken on 08-06-2024. The property was scheduled for auction with earnest money deposit due by 27-08-2024 and bids opening on 28-08-2024. As per Section 194 of the Income Tax Act, the buyer must also deduct applicable TDS at source.

Shriram Finance Limited issued Secured Redeemable Non-Convertible Debentures (NCDs) with a face value of ₹1,000 per debenture. The issue offered multiple series with tenors of 36 months, 60 months, and 120 months. The coupon rates ranged from 9.10% to 9.70% per annum depending on the series.
"""

# Call LangExtract with the Gemini model, handing it the document, the task
# description and the few-shot examples. The API key is looked up from the
# LANGEXTRACT_API_KEY environment variable and passed in explicitly.
result = lx.extract(
    model_id="gemini-2.5-flash",
    api_key=os.environ["LANGEXTRACT_API_KEY"],
    text_or_documents=input_text,
    prompt_description=prompt,
    examples=examples,
)

# Persist the annotated document as JSONL in the current directory.
lx.io.save_annotated_documents([result], output_name="financial_data.jsonl", output_dir=".")

# Build the interactive visualization from the JSONL file written above.
html_content = lx.visualize("financial_data.jsonl")

# Write the visualization to disk. The return value is either an object that
# exposes the markup on `.data` or the markup string itself, so handle both.
# The encoding is fixed to UTF-8 because the extracted text contains non-ASCII
# characters (e.g. the rupee sign); relying on the platform default encoding
# can raise UnicodeEncodeError or produce a mis-encoded file.
with open("financial_data_visualization.html", "w", encoding="utf-8") as f:
    f.write(getattr(html_content, "data", html_content))

[94m[1mLangExtract[0m: Saving to [92mfinancial_data.jsonl[0m: 1 docs [00:00, 332.83 docs/s]

[92m✓[0m Saved [1m1[0m documents to [92mfinancial_data.jsonl[0m



[94m[1mLangExtract[0m: Loading [92mfinancial_data.jsonl[0m: 100%|██████████| 11.3k/11.3k [00:00<00:00, 7.79MB/s]

[92m✓[0m Loaded [1m1[0m documents from [92mfinancial_data.jsonl[0m





In [6]:
# List the working directory to confirm the JSONL and HTML outputs were written.
!ls


financial_data.jsonl  financial_data_visualization.html  sample_data


In [7]:
from google.colab import files

# Hand the extraction results (JSONL) to the browser as a file download.
files.download("financial_data.jsonl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [11]:
# Hand the HTML visualization to the browser as a file download
# (uses `files` imported from google.colab in the previous download cell).
files.download("financial_data_visualization.html")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [9]:
# List the contents of the sample_data directory (not used by the extraction above).
!ls sample_data


anscombe.json		      mnist_test.csv
california_housing_test.csv   mnist_train_small.csv
california_housing_train.csv  README.md
