In [15]:
!pip install docling



In [16]:
!pip install langextract



In [51]:
from docling.document_converter import DocumentConverter

source = "/content/sample.pdf"  # document per local path or URL
converter = DocumentConverter()
result = converter.convert(source)

# Export to Markdown once and reuse the string: it is both printed here for
# inspection and passed to the extraction step later in the notebook.
markdown_text = result.document.export_to_markdown()
print(markdown_text)

On January 10, 2024, ICICI Bank issued a corporate loan of ₹25,00,000 to M/s Bright Future Ltd. at an interest rate of 6.2% per annum. The repayment tenure is 5 years, with installments due on the 5th of each quarter. Funds were disbursed to Account No. 1122334455 at HDFC Bank.

On June 30, 2023, State Bank of India sanctioned a personal loan of ₹8,50,000 to Ms. Priya Sharma under Agreement No. SB-2023-PI-78, carrying an interest rate of 7.5% p.a. linked to the RBI repo rate. The first EMI is scheduled on August 1, 2023, and payments will be auto-debited from Account 5566778899 at Axis Bank.


In [52]:
import os
from getpass import getpass

# Never hardcode the API key in the notebook: anything committed or shared
# with the notebook is exposed. A key that was previously stored in this cell
# should be treated as compromised and rotated.
#
# Use the key already present in the environment if there is one; otherwise
# prompt for it without echoing the input.
if not os.environ.get("LANGEXTRACT_API_KEY"):
    os.environ["LANGEXTRACT_API_KEY"] = getpass("LANGEXTRACT_API_KEY: ")


In [56]:
import langextract as lx
import textwrap

# Task description handed to the extraction model: one header line followed by
# one bullet per entity class, terminated by a blank line.
prompt = "\n".join(
    [
        "Extract the following entities in order of appearance:",
        "- ORGANIZATION (company, bank, firm, institution)",
        "- PERSON (individuals named in the document)",
        "- DATE (contract dates, deadlines, durations)",
        "- MONEY (loans, amounts, salaries, payments)",
        "- PERCENT (interest rates, growth rates, percentages)",
        "- ACCOUNT_NUMBER (bank accounts, policy numbers, transaction IDs)",
        "- ROLE (positions like Director, Manager, Borrower, Lender)",
        "- CONTRACT_REFERENCE (Agreement, Clause, Section references)",
        "- LOCATION (cities, offices, addresses)",
        # Two empty items reproduce the trailing newline plus the blank line
        # that closed the original triple-quoted text.
        "",
        "",
    ]
)


# --- Step 4: Provide a financial few-shot example ---
# A single worked example covering every entity class except ORGANIZATION's
# siblings is not enough to fix naming drift, so the class labels used here
# are kept identical to the ones listed in the prompt (e.g. "ORGANIZATION",
# not "ORG"); otherwise the example and the prompt disagree on the schema.
# Extractions are listed in their order of appearance in the example text.
examples = [
    lx.data.ExampleData(
        text="On March 15, 2023, ICICI Bank approved a business loan of ₹5,00,000 for Rohit Kumar, the Borrower, at its Mumbai branch under Agreement No. AG-2023/45 with an interest rate of 4.5% per annum. The loan amount was credited to Account No. 987654321.",
        extractions=[
            lx.data.Extraction(
                extraction_class="DATE",
                extraction_text="March 15, 2023",
                attributes={"type": "approval_date"}
            ),
            lx.data.Extraction(
                extraction_class="ORGANIZATION",
                extraction_text="ICICI Bank",
                attributes={"industry": "banking"}
            ),
            lx.data.Extraction(
                extraction_class="MONEY",
                extraction_text="₹5,00,000",
                attributes={"currency": "INR", "purpose": "business loan"}
            ),
            lx.data.Extraction(
                extraction_class="PERSON",
                extraction_text="Rohit Kumar",
                attributes={"role": "borrower"}
            ),
            lx.data.Extraction(
                extraction_class="ROLE",
                extraction_text="Borrower",
                attributes={"relation": "loan recipient"}
            ),
            lx.data.Extraction(
                extraction_class="LOCATION",
                extraction_text="Mumbai",
                attributes={"type": "branch location"}
            ),
            lx.data.Extraction(
                extraction_class="CONTRACT_REFERENCE",
                extraction_text="Agreement No. AG-2023/45",
                attributes={"type": "loan agreement"}
            ),
            lx.data.Extraction(
                extraction_class="PERCENT",
                extraction_text="4.5%",
                attributes={"type": "interest_rate", "timeframe": "per annum"}
            ),
            lx.data.Extraction(
                extraction_class="ACCOUNT_NUMBER",
                extraction_text="987654321",
                attributes={"type": "loan_account"}
            ),
        ]
    )
]



In [57]:
# --- Step 5: Run the extraction ---
# Note: this rebinds `result` (until now the docling conversion result) to the
# langextract output; the save step below reads it under this name.
result = lx.extract(
    # Model selection
    model_id="gemini-2.5-flash",
    # Inputs: converted document text, task description, few-shot examples
    text_or_documents=markdown_text,
    prompt_description=prompt,
    examples=examples,
    # Run tuning
    max_char_buffer=1000,
    extraction_passes=3,
    max_workers=20,
)



In [58]:
# --- Step 6: Save results to JSONL ---
# Writes the single annotated document into the current working directory.
lx.io.save_annotated_documents(
    [result],
    output_dir=".",
    output_name="extraction_results.jsonl",
)

[94m[1mLangExtract[0m: Saving to [92mextraction_results.jsonl[0m: 1 docs [00:00, 82.91 docs/s]

[92m✓[0m Saved [1m1[0m documents to [92mextraction_results.jsonl[0m





In [59]:
# --- Step 7: Generate visualization ---
html_content = lx.visualize("extraction_results.jsonl")

# Write with an explicit UTF-8 encoding: the source document contains
# non-ASCII characters (e.g. the rupee sign), and the platform default
# encoding is not guaranteed to be able to represent them.
with open("visualization.html", "w", encoding="utf-8") as f:
    if hasattr(html_content, 'data'):
        f.write(html_content.data)  # For Jupyter/Colab: markup is held in `.data`
    else:
        f.write(html_content)  # Otherwise the value is written as-is

print("✅ Extraction complete. Check 'extraction_results.jsonl' and 'visualization.html'")

[94m[1mLangExtract[0m: Loading [92mextraction_results.jsonl[0m: 100%|██████████| 5.42k/5.42k [00:00<00:00, 7.37MB/s]

[92m✓[0m Loaded [1m1[0m documents from [92mextraction_results.jsonl[0m
✅ Extraction complete. Check 'extraction_results.jsonl' and 'visualization.html'



