In [2]:
! pip install langextract

Collecting langextract
  Downloading langextract-1.0.9-py3-none-any.whl.metadata (19 kB)
Collecting async_timeout>=4.0.0 (from langextract)
  Downloading async_timeout-5.0.1-py3-none-any.whl.metadata (5.1 kB)
Collecting exceptiongroup>=1.1.0 (from langextract)
  Downloading exceptiongroup-1.3.0-py3-none-any.whl.metadata (6.7 kB)
Collecting ml-collections>=0.1.0 (from langextract)
  Downloading ml_collections-1.1.0-py3-none-any.whl.metadata (22 kB)
Downloading langextract-1.0.9-py3-none-any.whl (106 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 106.2/106.2 kB 1.2 MB/s eta 0:00:00
Downloading async_timeout-5.0.1-py3-none-any.whl (6.2 kB)
Downloading exceptiongroup-1.3.0-py3-none-any.whl (16 kB)
Downloading ml_collections-1.1.0-py3-none-any.whl (76 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 76.7/76.7 kB 3.2 MB/s eta 0:00:00
Installing collected packages: ml-collections, exceptiongroup, async

In [3]:
import langextract as lx
import textwrap

In [4]:
import os
from getpass import getpass

# Credentials must never be stored in the notebook itself. Reuse a key that is
# already exported in the environment; otherwise prompt for it without echoing
# the value into the cell output.
# NOTE(review): a literal key was previously committed in this cell; treat it as
# compromised and revoke/rotate it.
if not os.environ.get("LANGEXTRACT_API_KEY"):
    os.environ["LANGEXTRACT_API_KEY"] = getpass("Enter your LangExtract (Gemini) API key: ")

In [5]:
# Task description passed to the extractor: what to pull out of the text, the
# entity categories to map onto, and which contextual attributes to attach.
# Built line by line; every line ends with a newline and none is indented.
prompt = (
    "Extract financial, corporate, and regulatory entities in order of appearance.\n"
    "Use the original wording for each extraction (do not paraphrase).\n"
    "Map each entity to the closest type:\n"
    "- Party & Identification (company, regulator, bank, stock exchange, identifier like CIN, PAN, ISIN)\n"
    "- Monetary Value (amounts, revenue, profit, liabilities, penalties, taxes, market cap)\n"
    "- Percentage/Ratio (ownership, shareholding, growth rate, interest rate, dividend yield, tax rate)\n"
    "- Date/Time (agreement date, filing date, reporting period, maturity date, etc.)\n"
    "- Financial Instrument (equity shares, bonds, loans, derivatives, account numbers, ISIN codes)\n"
    "- Market Reference (indices, stock tickers, share price, exchange rates)\n"
    "Include contextual attributes such as identifier type (CIN, PAN, ISIN), metric type (profit, revenue, penalty, loan amount), currency, instrument type, or regulator name.\n"
)


In [6]:
# Few-shot example that shows the model the expected extraction classes and
# attribute keys. Every extraction_text is copied verbatim from `text`, and the
# extractions are listed in the order they occur in `text`, matching the
# prompt's "in order of appearance" instruction (the reporting date is the
# first entity in the sentence, so it comes first).
examples = [
    lx.data.ExampleData(
        text="On 31st March 2024, Reliance Industries Ltd (CIN: L17110MH1973PLC019786, NSE: RELIANCE, BSE: 500325) reported a net profit of ₹18,000 crore with revenue of ₹2,10,000 crore. The filing with SEBI mentioned outstanding debt of USD 5 billion (ISIN: INE002A01018) and an interim dividend payout of 20%. The maturity date of certain bonds was noted as 15th June 2028.",
        extractions=[
            lx.data.Extraction(
                extraction_class="date",
                extraction_text="31st March 2024",
                attributes={"type": "reporting_date"}
            ),
            lx.data.Extraction(
                extraction_class="company",
                extraction_text="Reliance Industries Ltd",
                attributes={"identifier_type": "CIN", "identifier": "L17110MH1973PLC019786"}
            ),
            lx.data.Extraction(
                extraction_class="stock_ticker",
                extraction_text="NSE: RELIANCE"
            ),
            lx.data.Extraction(
                extraction_class="stock_ticker",
                extraction_text="BSE: 500325"
            ),
            lx.data.Extraction(
                extraction_class="financial_metric",
                extraction_text="₹18,000 crore",
                attributes={"metric_type": "net_profit", "currency": "INR"}
            ),
            lx.data.Extraction(
                extraction_class="financial_metric",
                extraction_text="₹2,10,000 crore",
                attributes={"metric_type": "revenue", "currency": "INR"}
            ),
            lx.data.Extraction(
                extraction_class="regulator",
                extraction_text="SEBI"
            ),
            lx.data.Extraction(
                extraction_class="financial_metric",
                extraction_text="USD 5 billion",
                attributes={"metric_type": "outstanding_debt", "currency": "USD"}
            ),
            lx.data.Extraction(
                extraction_class="instrument",
                extraction_text="ISIN: INE002A01018",
                attributes={"instrument_type": "bond"}
            ),
            lx.data.Extraction(
                extraction_class="percentage",
                extraction_text="20%",
                attributes={"metric_type": "dividend_payout"}
            ),
            lx.data.Extraction(
                extraction_class="date",
                extraction_text="15th June 2028",
                attributes={"type": "maturity_date"}
            )
        ]
    )
]


In [7]:
# Sample document to run the extraction on: three paragraphs (an Infosys
# results announcement, RBI/market data, and an HDFC Bank disclosure) separated
# by blank lines, with a trailing newline after the last paragraph.
input_text = (
    # Paragraph 1
    "On July 25, 2024, Infosys Ltd (NSE: INFY, BSE: 500209, ISIN: INE009A01021) announced\n"
    "a quarterly revenue of ₹38,500 crore and a net profit of ₹7,800 crore. The board declared an\n"
    "interim dividend of 30%, payable by August 15, 2024. The filing with SEBI also disclosed\n"
    "foreign currency debt of USD 1.2 billion, with maturity scheduled for December 31, 2026.\n"
    "\n"
    # Paragraph 2
    "Meanwhile, the Reserve Bank of India (RBI) maintained the repo rate at 6.5% in its monetary\n"
    "policy review. The Nifty 50 index closed at 22,150 points, while the Sensex ended at 73,200 points.\n"
    "Infosys shares traded at ₹1,480 on NSE, reflecting a P/E ratio of 22.5. The INR/USD exchange\n"
    "rate stood at 83.2 during the same period.\n"
    "\n"
    # Paragraph 3
    "In parallel, HDFC Bank Ltd (NSE: HDFCBANK, ISIN: INE040A01034) reported a net interest\n"
    "income (NII) of ₹24,500 crore and gross NPA ratio of 1.2%. The filing cited PAN: AAACH2702H\n"
    "and CIN: L65920MH1994PLC080618 in its disclosure to the Ministry of Corporate Affairs.\n"
)


In [8]:
# Run the extraction: the sample document, the task prompt and the few-shot
# examples are handed to lx.extract together with the model identifier.
# NOTE(review): presumably this performs a remote model call authenticated via
# LANGEXTRACT_API_KEY — confirm against the langextract documentation.
extract_options = {
    "text_or_documents": input_text,
    "prompt_description": prompt,
    "examples": examples,
    "model_id": "gemini-2.5-flash",
}
result = lx.extract(**extract_options)



In [9]:
# Persist the single annotated document as JSONL in the current directory;
# the visualization step later loads this file by name.
annotated_documents = [result]
lx.io.save_annotated_documents(
    annotated_documents,
    output_name="extraction_results.jsonl",
    output_dir=".",
)

[94m[1mLangExtract[0m: Saving to [92mextraction_results.jsonl[0m: 1 docs [00:00, 463.41 docs/s]

[92m✓[0m Saved [1m1[0m documents to [92mextraction_results.jsonl[0m





In [10]:
# Build the visualization from the JSONL file saved earlier.
html_content = lx.visualize("extraction_results.jsonl")

# Write the markup to disk. When the returned value exposes a `.data`
# attribute (the Jupyter/Colab display-object case) that attribute is written;
# otherwise the returned value itself is written.
with open("visualization.html", "w", encoding="utf-8") as html_file:
    html_markup = html_content.data if hasattr(html_content, 'data') else html_content
    html_file.write(html_markup)

print("✅ Extraction complete. Results saved to 'extraction_results.jsonl' and visualization.html")

[94m[1mLangExtract[0m: Loading [92mextraction_results.jsonl[0m: 100%|██████████| 9.24k/9.24k [00:00<00:00, 20.6MB/s]

[92m✓[0m Loaded [1m1[0m documents from [92mextraction_results.jsonl[0m
✅ Extraction complete. Results saved to 'extraction_results.jsonl' and visualization.html



