In [None]:
!pip install langextract

Collecting langextract
  Downloading langextract-1.0.9-py3-none-any.whl.metadata (19 kB)
Collecting async_timeout>=4.0.0 (from langextract)
  Downloading async_timeout-5.0.1-py3-none-any.whl.metadata (5.1 kB)
Collecting exceptiongroup>=1.1.0 (from langextract)
  Downloading exceptiongroup-1.3.0-py3-none-any.whl.metadata (6.7 kB)
Collecting ml-collections>=0.1.0 (from langextract)
  Downloading ml_collections-1.1.0-py3-none-any.whl.metadata (22 kB)
Downloading langextract-1.0.9-py3-none-any.whl (106 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.2/106.2 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading async_timeout-5.0.1-py3-none-any.whl (6.2 kB)
Downloading exceptiongroup-1.3.0-py3-none-any.whl (16 kB)
Downloading ml_collections-1.1.0-py3-none-any.whl (76 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.7/76.7 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ml-collections, exceptiongroup, async

In [None]:
!pip install docling

Collecting docling
  Downloading docling-2.54.0-py3-none-any.whl.metadata (11 kB)
Collecting docling-core<3.0.0,>=2.48.2 (from docling-core[chunking]<3.0.0,>=2.48.2->docling)
  Downloading docling_core-2.48.2-py3-none-any.whl.metadata (6.5 kB)
Collecting docling-parse<5.0.0,>=4.4.0 (from docling)
  Downloading docling_parse-4.5.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.8 kB)
Collecting docling-ibm-models<4,>=3.9.1 (from docling)
  Downloading docling_ibm_models-3.9.1-py3-none-any.whl.metadata (6.7 kB)
Collecting filetype<2.0.0,>=1.2.0 (from docling)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting pypdfium2!=4.30.1,<5.0.0,>=4.30.0 (from docling)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting easyocr<2.0,>=1.7 (from docling)
  Downloading eas

In [None]:
import langextract as lx
import textwrap

In [None]:
# Task description handed to LangExtract as `prompt_description`: the extraction
# rules followed by the three entity categories and their sub-types.
# One literal per prompt line; adjacent literals are concatenated by the parser.
prompt = textwrap.dedent(
    "Extract entities related to Party & Identification Information, Monetary Values, and Dates & Time Periods.\n"
    "Follow these rules:\n"
    "1. Use exact text spans from the input, no paraphrasing.\n"
    "2. Do not overlap entities.\n"
    "3. Provide meaningful attributes to add context.\n"
    "4. Categories:\n"
    "   - Party & Identification Information:\n"
    "     • Entity Names: Company, institution, counterparty, regulator\n"
    "     • Identifiers: Tax ID, GST/VAT, PAN, CIN, registration numbers\n"
    "     • Account Numbers: Bank account, loan account, investment account IDs\n"
    "   - Monetary Values:\n"
    "     • Principal Amounts: Loan value, invoice total, transaction amount\n"
    "     • Fees & Charges: Service fees, late fees, management charges\n"
    "     • Interest Rates: Fixed/floating, annual %, APR, benchmark reference (LIBOR, SOFR)\n"
    "     • Taxes: VAT, GST, withholding tax\n"
    "     • Penalties/Fines: Early termination fee, default charges\n"
    "   - Dates & Time Periods:\n"
    "     • Effective Date, Maturity Date, Due Dates, Tenure/Duration\n"
    "     • Historical Dates: Transaction date, invoice date, settlement date\n"
)


In [None]:
from docling.document_converter import DocumentConverter

# Convert the source PDF with Docling and keep its Markdown rendering in `doc`,
# which a later cell uses as the text of the few-shot example.
# NOTE: `result` is rebound further down by the lx.extract(...) cell, so it only
# refers to the Docling conversion result until that cell runs.
source = "/langextractdock.pdf"  # local path or URL of the document to convert; NOTE(review): absolute path at the filesystem root, presumably a Colab upload location - confirm
converter = DocumentConverter()
result = converter.convert(source)
doc=result.document.export_to_markdown()  # Markdown text of the converted document

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Show the Markdown text produced by the conversion; this same string is used
# as the `text` of the few-shot example built in the `examples` cell.
doc

'"On March 15, 2023, HDFC Bank Ltd reported a loan disbursement of INR 5,00,000 under account number 123456789. The loan tenure is 5 years with an interest rate of 7.5% per annum.",'

In [None]:
# Few-shot example that teaches the model the expected extraction classes and
# attribute keys. The example text is the Markdown extracted from the PDF (`doc`).
#
# Every `extraction_text` is a verbatim span of `doc`, and the extractions are
# listed in the order the spans appear in that text (date, company, amount,
# account number, tenure, rate). LangExtract's guidance asks for verbatim spans
# in order of appearance; the previous ordering put the company before the date
# and the account number before the amount.
examples = [
    lx.data.ExampleData(
        text=doc,
        extractions=[
            lx.data.Extraction(
                extraction_class="date",
                extraction_text="March 15, 2023",
                attributes={"date_type":"transaction date"}
            ),
            lx.data.Extraction(
                extraction_class="company",
                extraction_text="HDFC Bank Ltd"
            ),
            lx.data.Extraction(
                extraction_class="monetary_value",
                extraction_text="INR 5,00,000",
                attributes={"value_type":"principal amount","currency":"INR"}
            ),
            lx.data.Extraction(
                extraction_class="account_number",
                extraction_text="123456789",
                attributes={"account_type":"loan account"}
            ),
            lx.data.Extraction(
                extraction_class="duration",
                extraction_text="5 years",
                attributes={"duration_type":"loan tenure"}
            ),
            lx.data.Extraction(
                extraction_class="interest_rate",
                extraction_text="7.5% per annum",
                attributes={"rate_type":"fixed"}
            ),
        ]
    )
]


In [None]:
# Text to run the extraction on: five independent financial statements,
# one per line, joined with newlines (no trailing newline).
input_text = "\n".join(
    (
        "On Jan 10, 2024, ICICI Bank sanctioned a home loan of INR 25,00,000 under loan account 987654321 with a tenure of 15 years and an annual interest rate of 8%.",
        "On Feb 5, 2024, Axis Bank imposed a late fee of INR 2,000 on credit card account 1122334455.",
        "Reliance Industries Ltd paid GST of INR 50,000 on March 12, 2023, under GSTIN 27AAACR5055K1ZL.",
        "SEBI fined ABC Securities INR 1,50,000 on Aug 18, 2022, for regulatory violations.",
        "On Dec 1, 2023, Tata Motors issued an invoice of INR 12,75,000 with a payment due date of Jan 15, 2024.",
    )
)


In [None]:
import os
from getpass import getpass

# SECURITY: never commit an API key in a notebook. The key is taken from the
# LANGEXTRACT_API_KEY environment variable when it is already set; otherwise it
# is requested interactively (the input is not echoed and is not saved with the
# notebook). Any key that was previously hardcoded here must be treated as
# leaked and revoked/rotated.
if not os.environ.get("LANGEXTRACT_API_KEY"):
    os.environ["LANGEXTRACT_API_KEY"] = getpass("Enter your LangExtract / Gemini API key: ")

# Run the extraction over `input_text`, guided by the task description in
# `prompt` and the few-shot `examples`, using the Gemini model named below.
# NOTE: this rebinds `result`, which earlier held the Docling conversion result.
result = lx.extract(
    text_or_documents=input_text,
    prompt_description=prompt,
    examples=examples,
    model_id="gemini-2.5-flash",
)



In [None]:
# Persist the annotated extraction result to extraction_results.json in the
# current working directory so the visualization cell can load it from disk.
lx.io.save_annotated_documents(
    [result],
    output_name="extraction_results.json",
    output_dir=".",
)

[94m[1mLangExtract[0m: Saving to [92mextraction_results.json[0m: 1 docs [00:00, 340.47 docs/s]

[92m✓[0m Saved [1m1[0m documents to [92mextraction_results.json[0m





In [None]:
# Build the interactive visualization from the saved results file.
html_content = lx.visualize("extraction_results.json")

# In Colab/Jupyter the return value may be a display object carrying the markup
# in `.data`; otherwise it is used as the HTML string directly.
# The file is written as UTF-8 explicitly: without `encoding=` the platform's
# default encoding is used, which can fail or garble non-ASCII characters in
# the generated HTML.
with open("visualization.html", "w", encoding="utf-8") as f:
    f.write(getattr(html_content, "data", html_content))

[94m[1mLangExtract[0m: Loading [92mextraction_results.json[0m: 100%|██████████| 6.12k/6.12k [00:00<00:00, 10.6MB/s]

[92m✓[0m Loaded [1m1[0m documents from [92mextraction_results.json[0m





In [None]:
from IPython.display import display, HTML

# Render the visualization inline. When `html_content` exposes a `.data`
# attribute (Colab/Jupyter display object) that markup is shown; otherwise
# `html_content` itself is passed to HTML().
display(HTML(getattr(html_content, "data", html_content)))
