In [1]:
!pip install langextract


Collecting langextract
  Downloading langextract-1.0.9-py3-none-any.whl.metadata (19 kB)
Collecting async_timeout>=4.0.0 (from langextract)
  Downloading async_timeout-5.0.1-py3-none-any.whl.metadata (5.1 kB)
Collecting exceptiongroup>=1.1.0 (from langextract)
  Downloading exceptiongroup-1.3.0-py3-none-any.whl.metadata (6.7 kB)
Collecting ml-collections>=0.1.0 (from langextract)
  Downloading ml_collections-1.1.0-py3-none-any.whl.metadata (22 kB)
Downloading langextract-1.0.9-py3-none-any.whl (106 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.2/106.2 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading async_timeout-5.0.1-py3-none-any.whl (6.2 kB)
Downloading exceptiongroup-1.3.0-py3-none-any.whl (16 kB)
Downloading ml_collections-1.1.0-py3-none-any.whl (76 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.7/76.7 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ml-collections, exceptiongroup, async

In [2]:
!pip install docling

Collecting docling
  Downloading docling-2.50.0-py3-none-any.whl.metadata (10 kB)
Collecting docling-core<3.0.0,>=2.42.0 (from docling-core[chunking]<3.0.0,>=2.42.0->docling)
  Downloading docling_core-2.47.0-py3-none-any.whl.metadata (6.5 kB)
Collecting docling-parse<5.0.0,>=4.2.2 (from docling)
  Downloading docling_parse-4.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.5 kB)
Collecting docling-ibm-models<4,>=3.9.1 (from docling)
  Downloading docling_ibm_models-3.9.1-py3-none-any.whl.metadata (6.7 kB)
Collecting filetype<2.0.0,>=1.2.0 (from docling)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting pypdfium2!=4.30.1,<5.0.0,>=4.30.0 (from docling)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting easyocr<2.0,>=1.7 (from docling)
  Downloading eas

In [3]:
import langextract as le
import textwrap

In [4]:
import os
from getpass import getpass

# Never store the API key in the notebook: anything committed or shared with the file is
# public. The key that used to be hardcoded in this cell must be treated as leaked and
# revoked. Prefer a key already exported in the environment; otherwise ask for it
# interactively (getpass does not echo the value, so it stays out of the cell output).
if not os.environ.get("LANGEXTRACT_API_KEY"):
    os.environ["LANGEXTRACT_API_KEY"] = getpass("Enter LANGEXTRACT_API_KEY: ")

In [5]:
# Task description handed to the extractor as `prompt_description`.
# Built line by line; the empty final item supplies the trailing newline.
# NOTE(review): the entity types named here (Party & Identification, Monetary Value,
# Date/Time) differ from the extraction_class values used in `examples`
# (date, company, financial_metric, other_entity) — confirm which taxonomy is intended.
prompt = "\n".join(
    [
        "Extract contract-related financial and party entities in order of appearance.",
        "Use exact text for extractions. Do not paraphrase or overlap entities.",
        "Map each extraction to the closest entity type:",
        "Party & Identification, Monetary Value, or Date/Time.",
        "Include contextual attributes such as type (e.g., PAN, VAT, bank account, penalty type, etc.) or currency.",
        "",
    ]
)

In [6]:
# Few-shot examples for the extractor. Each entry pairs a passage with the spans to pull
# out of it, written as (extraction_class, extraction_text[, attributes]) in order of
# appearance. A span given without a third element is created with no `attributes`
# argument at all.
examples = [
    le.data.ExampleData(
        text=passage,
        extractions=[
            le.data.Extraction(
                extraction_class=label,
                extraction_text=span,
                **({"attributes": extra[0]} if extra else {}),
            )
            for label, span, *extra in spans
        ],
    )
    for passage, spans in [
        (
            "On October 5, 2024, ICICI Bank Ltd (ICICIBANK.NS) announced that it had sanctioned home loans worth ₹4,800 crore during Q2 FY2024.",
            [
                ("date", "October 5, 2024", {"type": "announcement_date"}),
                ("company", "ICICI Bank Ltd (ICICIBANK.NS)", {"identifier_type": "ticker", "identifier": "ICICIBANK.NS"}),
                ("financial_metric", "₹4,800 crore", {"metric_type": "home_loan_sanctioned", "currency": "INR", "fiscal_period": "Q2 FY2024"}),
                ("date", "Q2 FY2024", {"type": "fiscal_period"}),
            ],
        ),
        (
            "The loans carried an average interest rate of 8.2% per annum, with repayment tenures ranging from 10 to 20 years.",
            [
                ("financial_metric", "8.2% per annum", {"metric_type": "interest_rate", "rate_type": "annual"}),
                ("financial_metric", "10 to 20 years", {"metric_type": "repayment_tenure", "unit": "years"}),
            ],
        ),
        (
            "The disclosure followed the Securities and Exchange Board of India’s (SEBI) new mandate requiring lenders to publish segment-wise lending data.",
            [
                ("other_entity", "Securities and Exchange Board of India’s (SEBI)", {"entity_type": "regulator", "abbreviation": "SEBI"}),
                ("other_entity", "lenders", {"entity_type": "financial_institution"}),
            ],
        ),
        (
            "Shares of Tata Consultancy Services (TCS.NS) advanced 2.1% in New York trading, closing at $34.20 per ADR.",
            [
                ("company", "Tata Consultancy Services (TCS.NS)", {"identifier_type": "ticker", "identifier": "TCS.NS"}),
                ("financial_metric", "2.1%", {"metric_type": "stock_change", "company": "Tata Consultancy Services (TCS.NS)"}),
                ("financial_metric", "$34.20 per ADR", {"metric_type": "ADR_close", "company": "Tata Consultancy Services (TCS.NS)", "currency": "USD"}),
            ],
        ),
        (
            "Adani Enterprises Ltd (ADANIENT.NS) reported a consolidated net profit of ₹2,150 crore for the quarter ending September 30, 2024, compared to ₹1,890 crore in the same quarter last year.",
            [
                ("company", "Adani Enterprises Ltd (ADANIENT.NS)", {"identifier_type": "ticker", "identifier": "ADANIENT.NS"}),
                ("financial_metric", "₹2,150 crore", {"metric_type": "net_profit", "currency": "INR", "fiscal_period": "quarter ending September 30, 2024"}),
                ("financial_metric", "₹1,890 crore", {"metric_type": "net_profit", "currency": "INR", "fiscal_period": "same quarter last year"}),
                ("date", "September 30, 2024", {"type": "quarter_end"}),
            ],
        ),
        (
            "The company also disclosed pending regulatory dues of ₹320 crore and provisioned ₹75 crore for ongoing compliance matters.",
            [
                # Deliberately no attributes for this span, as in the original data.
                ("company", "The company"),
                ("financial_metric", "₹320 crore", {"metric_type": "regulatory_dues", "currency": "INR"}),
                ("financial_metric", "₹75 crore", {"metric_type": "provisioned_compliance", "currency": "INR"}),
            ],
        ),
        (
            "Its CIN number L51100GJ1993PLC019067 was referenced in the Ministry of Corporate Affairs filing.",
            [
                ("financial_metric", "L51100GJ1993PLC019067", {"metric_type": "CIN"}),
                ("other_entity", "Ministry of Corporate Affairs", {"entity_type": "government_agency"}),
            ],
        ),
    ]
]

In [7]:
# Sample passage to run the extractor on: three paragraphs separated by blank lines.
# The empty final item reproduces the blank line that ends the text.
input_text = "\n\n".join(
    [
        "On October 5, 2024, ICICI Bank Ltd (ICICIBANK.NS) announced that it had sanctioned home loans worth ₹4,800 crore during Q2 FY2024. The loans carried an average interest rate of 8.2% per annum, with repayment tenures ranging from 10 to 20 years. The disclosure followed the Securities and Exchange Board of India’s (SEBI) new mandate requiring lenders to publish segment-wise lending data.",
        "In the global currency markets, the British pound traded at 1.25 against the US dollar, while the yen weakened further to 148.2 per dollar, prompting speculation of coordinated central bank action. The NASDAQ Composite closed at 14,950 points, rising 1.5% as semiconductor stocks led gains. Meanwhile, shares of Tata Consultancy Services (TCS.NS) advanced 2.1% in New York trading, closing at $34.20 per ADR.",
        "In corporate earnings, Adani Enterprises Ltd (ADANIENT.NS) reported a consolidated net profit of ₹2,150 crore for the quarter ending September 30, 2024, compared to ₹1,890 crore in the same quarter last year. The company also disclosed pending regulatory dues of ₹320 crore and provisioned ₹75 crore for ongoing compliance matters. Its CIN number L51100GJ1993PLC019067 was referenced in the Ministry of Corporate Affairs filing.",
        "",
    ]
)

In [10]:
# Run the few-shot extraction over `input_text`, using `prompt` as the task description
# and `examples` as demonstrations.
# NOTE(review): presumably this calls the hosted model named by `model_id` using the
# LANGEXTRACT_API_KEY environment variable — confirm against the LangExtract docs.
result = le.extract(
    model_id="gemini-2.5-flash",
    examples=examples,
    prompt_description=prompt,
    text_or_documents=input_text,
)



In [11]:
# Persist the single annotated document as results.jsonl in the current working directory.
le.io.save_annotated_documents(
    [result],
    output_dir=".",
    output_name="results.jsonl",
)


[94m[1mLangExtract[0m: Saving to [92mresults.jsonl[0m: 1 docs [00:00, 425.77 docs/s]

[92m✓[0m Saved [1m1[0m documents to [92mresults.jsonl[0m





In [12]:
# Render the saved annotations to a standalone HTML page.
jsonl_path = "results.jsonl"  # must match the file written by save_annotated_documents
html_path = "visualization.html"

html_content = le.visualize(jsonl_path)

# In Jupyter/Colab the result may be a display object carrying the markup in `.data`;
# otherwise the returned value is written as-is.
html_markup = getattr(html_content, "data", html_content)
with open(html_path, "w", encoding="utf-8") as f:
    f.write(html_markup)

# Build the message from the paths actually used so it cannot drift from the real file
# names (it previously reported 'extraction_results.jsonl', a file that is never written).
print(f"✅ Extraction complete. Results saved to '{jsonl_path}' and {html_path}")

[94m[1mLangExtract[0m: Loading [92mresults.jsonl[0m: 100%|██████████| 10.1k/10.1k [00:00<00:00, 10.8MB/s]

[92m✓[0m Loaded [1m1[0m documents from [92mresults.jsonl[0m
✅ Extraction complete. Results saved to 'extraction_results.jsonl' and visualization.html



