In [1]:
# Install required packages
!pip install langextract pypdf python-docx python-pptx

Collecting langextract
  Downloading langextract-1.0.9-py3-none-any.whl.metadata (19 kB)
Collecting pypdf
  Downloading pypdf-6.1.1-py3-none-any.whl.metadata (7.1 kB)
Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Collecting python-pptx
  Downloading python_pptx-1.0.2-py3-none-any.whl.metadata (2.5 kB)
Collecting async_timeout>=4.0.0 (from langextract)
  Downloading async_timeout-5.0.1-py3-none-any.whl.metadata (5.1 kB)
Collecting exceptiongroup>=1.1.0 (from langextract)
  Downloading exceptiongroup-1.3.0-py3-none-any.whl.metadata (6.7 kB)
Collecting ml-collections>=0.1.0 (from langextract)
  Downloading ml_collections-1.1.0-py3-none-any.whl.metadata (22 kB)
Collecting XlsxWriter>=0.5.7 (from python-pptx)
  Downloading xlsxwriter-3.2.9-py3-none-any.whl.metadata (2.7 kB)
Downloading langextract-1.0.9-py3-none-any.whl (106 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.2/106.2 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m


In [2]:
import os
import re
import string
import json
import textwrap
from typing import Dict, List, Optional, Union, Tuple
from google.colab import files
from pypdf import PdfReader
import docx
from pptx import Presentation
import langextract as lx
from langextract import data as lx_data

In [3]:
# Set API key
os.environ["LANGEXTRACT_API_KEY"] = "AIzaSyCTRrMUxRJRot5tX-eCSjbo433M2dayPjs"

In [7]:
class LangExtractFinancialEntityExtractor:
    def __init__(self):
        # Initialize LangExtract
        self.prompt = textwrap.dedent("""\
            Extract the following entities in order of appearance:
            - ORGANIZATION (company, bank, firm, institution)
            - PERSON (individuals named in the document)
            - DATE (contract dates, deadlines, durations)
            - MONEY (loans, amounts, salaries, payments)
            - PERCENT (interest rates, growth rates, percentages)
            - ACCOUNT_NUMBER (bank accounts, policy numbers, transaction IDs)
            - ROLE (positions like Director, Manager, Borrower, Lender)
            - CONTRACT_REFERENCE (Agreement, Clause, Section references)
            - LOCATION (cities, offices, addresses)
        """)

        # Define example document for LangExtract
        self.example_document = lx_data.Document(
            text="""
            This Strategic Partnership Agreement is executed on March 1, 2023, between TechNova Solutions Pvt. Ltd.,
            a software development company based in Hyderabad, India, and GlobalEdge Analytics Inc., a U.S.-based data analytics firm
            with offices in New York City, USA.

            Under this agreement, GlobalEdge will invest USD 2,000,000 in TechNova for the development of a joint AI research lab.
            The funds will be transferred to Account Number 5544332211 maintained with Axis Bank, Banjara Hills Branch, Hyderabad.

            The agreement will be valid for a term of 3 years starting from April 1, 2023, and is subject to renewal upon mutual consent.
            TechNova's CEO, Mr. Ramesh Varma, and GlobalEdge's Director of Partnerships, Ms. Linda Zhao, have signed the agreement
            on behalf of their respective companies.

            The profit-sharing ratio has been agreed at 60% (TechNova) and 40% (GlobalEdge).
            Agreement Reference Code: PARTNER-GE-TN-2023-0301.
            Any disputes will be resolved under the jurisdiction of the Telangana High Court.
            """
        )

        # Define extractions separately
        self.example_extractions = [
            lx_data.Extraction(
                extraction_class="DATE",
                extraction_text="March 1, 2023",
                attributes={"type": "signing_date"}
            ),
            lx_data.Extraction(
                extraction_class="ORG",
                extraction_text="TechNova Solutions Pvt. Ltd.",
                attributes={"industry": "software"}
            ),
            lx_data.Extraction(
                extraction_class="ORG",
                extraction_text="GlobalEdge Analytics Inc.",
                attributes={"industry": "data analytics"}
            ),
            lx_data.Extraction(
                extraction_class="LOCATION",
                extraction_text="Hyderabad, India",
                attributes={"type": "company_headquarters"}
            ),
            lx_data.Extraction(
                extraction_class="LOCATION",
                extraction_text="New York City, USA",
                attributes={"type": "global_office"}
            ),
            lx_data.Extraction(
                extraction_class="MONEY",
                extraction_text="USD 2,000,000",
                attributes={"purpose": "AI research investment"}
            ),
            lx_data.Extraction(
                extraction_class="ACCOUNT_NUMBER",
                extraction_text="5544332211",
                attributes={"bank": "Axis Bank", "branch": "Banjara Hills"}
            ),
            lx_data.Extraction(
                extraction_class="DATE",
                extraction_text="April 1, 2023",
                attributes={"type": "agreement_start_date"}
            ),
            lx_data.Extraction(
                extraction_class="PERSON",
                extraction_text="Mr. Ramesh Varma",
                attributes={"role": "CEO", "organization": "TechNova"}
            ),
            lx_data.Extraction(
                extraction_class="PERSON",
                extraction_text="Ms. Linda Zhao",
                attributes={"role": "Director of Partnerships", "organization": "GlobalEdge"}
            ),
            lx_data.Extraction(
                extraction_class="PERCENT",
                extraction_text="60%",
                attributes={"entity": "TechNova", "type": "profit_share"}
            ),
            lx_data.Extraction(
                extraction_class="PERCENT",
                extraction_text="40%",
                attributes={"entity": "GlobalEdge", "type": "profit_share"}
            ),
            lx_data.Extraction(
                extraction_class="CONTRACT_REFERENCE",
                extraction_text="PARTNER-GE-TN-2023-0301",
                attributes={"type": "partnership_agreement_id"}
            ),
            lx_data.Extraction(
                extraction_class="LOCATION",
                extraction_text="Telangana High Court",
                attributes={"type": "jurisdiction"}
            ),
        ]

        self.adani_companies = [
            "Adani Enterprises Limited", "Adani Ports and Special Economic Zone Limited",
            "Adani Power Limited", "Adani Transmission Limited", "Adani Gas Limited",
            "Adani Green Energy Limited", "Adani Total Gas Limited", "Adani Energy Solutions Limited",
            "Adani Airport Holdings Limited", "Adani Roads Transport Limited", "Adani New Industries Limited",
            "Adani Wilmar Limited", "Ambuja Cements Limited", "ACC Limited"
        ]

    def extract_text_from_pdf(self, pdf_path: str) -> str:
        """Extract text from PDF file."""
        reader = PdfReader(pdf_path)
        return "\n".join([page.extract_text() or "" for page in reader.pages])

    def extract_text_from_docx(self, docx_path: str) -> str:
        """Extract text from DOCX file."""
        doc = docx.Document(docx_path)
        return "\n".join([para.text for para in doc.paragraphs if para.text.strip()])

    def extract_text_from_ppt(self, ppt_path: str) -> str:
        """Extract text from PPT/PPTX file."""
        prs = Presentation(ppt_path)
        text = ""
        for slide in prs.slides:
            for shape in slide.shapes:
                if hasattr(shape, "text"):
                    text += shape.text + "\n"
        return text

    def preprocess_text(self, text: str) -> str:
        """Preprocess the extracted text."""
        text = re.sub(r'\s+', ' ', text)
        text = text.replace("₹", "INR ").replace("$", "USD ")
        text = ''.join(c for c in text if c in string.printable)
        return text.strip()

    def extract_with_langextract(self, text: str) -> Dict[str, List[str]]:
        """Extract entities using LangExtract API."""
        try:
            # Create a document with extractions
            example_doc_with_extractions = lx_data.Document(
                text=self.example_document.text,
                extractions=self.example_extractions
            )

            # Use LangExtract to extract entities
            result = lx.extract(
                text_or_documents=text,
                prompt_description=self.prompt,
                examples=[example_doc_with_extractions],
                model_id="gemini-2.5-flash",
                extraction_passes=3,
                max_workers=20,
                max_char_buffer=1000
            )

            # Initialize entity categories
            entities = {
                "company_names": [],
                "financial_events": [],
                "stock_prices": [],
                "revenue": [],
                "market_cap": [],
                "earnings": [],
                "financial_ratios": [],
                "financial_dates": [],
                "phone_numbers": []
            }

            # Process extractions
            for extraction in result.extractions:
                entity_class = extraction.extraction_class
                entity_text = extraction.extraction_text

                # Map LangExtract classes to our categories
                if entity_class == "ORG":
                    entities["company_names"].append(entity_text)
                elif entity_class == "MONEY":
                    # Categorize monetary values
                    if any(keyword in entity_text.lower() for keyword in ["revenue", "income", "sales"]):
                        entities["revenue"].append(entity_text)
                    elif any(keyword in entity_text.lower() for keyword in ["profit", "loss", "earnings", "pat"]):
                        entities["earnings"].append(entity_text)
                    elif any(keyword in entity_text.lower() for keyword in ["market cap", "capitalization"]):
                        entities["market_cap"].append(entity_text)
                    elif any(keyword in entity_text.lower() for keyword in ["price", "share"]):
                        entities["stock_prices"].append(entity_text)
                    else:
                        # Default to revenue if no specific category matches
                        entities["revenue"].append(entity_text)
                elif entity_class == "PERCENT":
                    entities["financial_ratios"].append(entity_text)
                elif entity_class == "DATE":
                    entities["financial_dates"].append(entity_text)
                elif entity_class == "ACCOUNT_NUMBER":
                    # Check if it's a phone number
                    if re.match(r"\+?\d{1,3}[-.\s]?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}", entity_text):
                        entities["phone_numbers"].append(entity_text)
                elif entity_class == "CONTRACT_REFERENCE":
                    # Treat contract references as financial events
                    entities["financial_events"].append(f"Reference: {entity_text}")

            return entities

        except Exception as e:
            print(f"Error extracting with LangExtract: {e}")
            # Return empty entities if extraction fails
            return {
                "company_names": [],
                "financial_events": [],
                "stock_prices": [],
                "revenue": [],
                "market_cap": [],
                "earnings": [],
                "financial_ratios": [],
                "financial_dates": [],
                "phone_numbers": []
            }

    def extract_company_names(self, text: str) -> List[str]:
        """Extract company names from the text using LangExtract."""
        # First, add known Adani companies if found in text
        companies = []
        for company in self.adani_companies:
            if company.lower() in text.lower():
                companies.append(company)

        # Use LangExtract to find more companies
        langextract_entities = self.extract_with_langextract(text)
        for company in langextract_entities["company_names"]:
            if company not in companies:
                companies.append(company)

        return list(set(companies))

    def extract_financial_events(self, text: str) -> List[str]:
        """Extract financial events from the text using LangExtract."""
        langextract_entities = self.extract_with_langextract(text)
        return langextract_entities["financial_events"]

    def extract_stock_prices(self, text: str) -> List[str]:
        """Extract stock prices from the text using LangExtract."""
        langextract_entities = self.extract_with_langextract(text)
        return langextract_entities["stock_prices"]

    def extract_revenue(self, text: str) -> List[str]:
        """Extract revenue information from the text using LangExtract."""
        langextract_entities = self.extract_with_langextract(text)
        return langextract_entities["revenue"]

    def extract_market_cap(self, text: str) -> List[str]:
        """Extract market capitalization from the text using LangExtract."""
        langextract_entities = self.extract_with_langextract(text)
        return langextract_entities["market_cap"]

    def extract_earnings(self, text: str) -> List[str]:
        """Extract earnings information from the text using LangExtract."""
        langextract_entities = self.extract_with_langextract(text)
        return langextract_entities["earnings"]

    def extract_financial_ratios(self, text: str) -> List[str]:
        """Extract financial ratios from the text using LangExtract."""
        langextract_entities = self.extract_with_langextract(text)
        return langextract_entities["financial_ratios"]

    def extract_financial_dates(self, text: str) -> List[str]:
        """Extract financial dates from the text using LangExtract."""
        langextract_entities = self.extract_with_langextract(text)
        return langextract_entities["financial_dates"]

    def extract_phone_numbers(self, text: str) -> List[str]:
        """Extract phone numbers from the text using LangExtract."""
        langextract_entities = self.extract_with_langextract(text)
        return langextract_entities["phone_numbers"]

    def process_document(self, file_path: str) -> Dict[str, Union[str, List[str]]]:
        """Process a document and extract financial entities using LangExtract."""
        if not os.path.exists(file_path):
            return {"error": f"File does not exist: {file_path}"}

        if file_path.lower().endswith(".pdf"):
            raw_text = self.extract_text_from_pdf(file_path)
        elif file_path.lower().endswith(".docx"):
            raw_text = self.extract_text_from_docx(file_path)
        elif file_path.lower().endswith((".ppt", ".pptx")):
            raw_text = self.extract_text_from_ppt(file_path)
        else:
            return {"error": "Unsupported file format. Please use PDF, DOCX, or PPT/PPTX."}

        if not raw_text.strip():
            return {"error": "No text could be extracted from the document."}

        text = self.preprocess_text(raw_text)

        # Extract all entities using LangExtract
        langextract_entities = self.extract_with_langextract(text)

        # Extract all entities
        company_names = self.extract_company_names(text)
        financial_events = langextract_entities["financial_events"]
        stock_prices = langextract_entities["stock_prices"]
        revenue = langextract_entities["revenue"]
        market_cap = langextract_entities["market_cap"]
        earnings = langextract_entities["earnings"]
        financial_ratios = langextract_entities["financial_ratios"]
        financial_dates = langextract_entities["financial_dates"]
        phone_numbers = langextract_entities["phone_numbers"]

        result = {
            "file_path": file_path,
            "text_length": len(text),
            "entities": {
                "company_names": company_names,
                "financial_events": financial_events,
                "stock_prices": stock_prices,
                "revenue": revenue,
                "market_cap": market_cap,
                "earnings": earnings,
                "financial_ratios": financial_ratios,
                "financial_dates": financial_dates,
                "phone_numbers": phone_numbers
            }
        }
        return result

In [8]:
# Initialize extractor
extractor = LangExtractFinancialEntityExtractor()

In [9]:
#Upload and process document
uploaded = files.upload()

if uploaded:
    filename = list(uploaded.keys())[0]
    results = extractor.process_document(filename)

    if "error" in results:
        print(f"Error: {results['error']}")
    else:
        print(f"\nDocument Analysis Results")
        print(f"Text Length: {results['text_length']} characters")

        for entity_type, entities in results["entities"].items():
            print(f"\n{entity_type.replace('_', ' ').title()} ({len(entities)} found)")
            if entities:
                for entity in entities[:10]:
                    print(f"- {entity}")
            else:
                print("No entities found.")

Saving Q1 FY26.pdf to Q1 FY26.pdf
Error extracting with LangExtract: Document.__init__() got an unexpected keyword argument 'extractions'
Error extracting with LangExtract: Document.__init__() got an unexpected keyword argument 'extractions'

Document Analysis Results
Text Length: 46472 characters

Company Names (10 found)
- Adani Total Gas Limited
- Adani Energy Solutions Limited
- Adani New Industries Limited
- Adani Enterprises Limited
- Adani Ports and Special Economic Zone Limited
- Adani Airport Holdings Limited
- Adani Roads Transport Limited
- Adani Green Energy Limited
- ACC Limited
- Adani Power Limited

Financial Events (0 found)
No entities found.

Stock Prices (0 found)
No entities found.

Revenue (0 found)
No entities found.

Market Cap (0 found)
No entities found.

Earnings (0 found)
No entities found.

Financial Ratios (0 found)
No entities found.

Financial Dates (0 found)
No entities found.

Phone Numbers (0 found)
No entities found.


In [10]:
# Save results
if 'results' in locals() and "error" not in results:
    output_filename = "langextract_financial_entities.json"
    with open(output_filename, 'w') as f:
        json.dump(results, f, indent=2)
    files.download(output_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>