In [26]:
# Install necessary libraries for BERT and document processing
!pip install transformers torch pypdf python-docx python-pptx

print("Required packages installed successfully!")

Required packages installed successfully!


In [31]:
import os
import re
import string
import json
from typing import Dict, List, Optional, Union, Tuple
from google.colab import files
from pypdf import PdfReader
import docx
from pptx import Presentation
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

In [32]:
class FinancialEntityExtractor:
    def __init__(self, model_name: str = "dslim/bert-base-NER"):
        self.model_name = model_name
        self.tokenizer = None
        self.model = None
        self.nlp_pipeline = None
        self._load_model()

        self.adani_companies = [
            "Adani Enterprises Limited", "Adani Ports and Special Economic Zone Limited",
            "Adani Power Limited", "Adani Transmission Limited", "Adani Gas Limited",
            "Adani Green Energy Limited", "Adani Total Gas Limited", "Adani Energy Solutions Limited",
            "Adani Airport Holdings Limited", "Adani Roads Transport Limited", "Adani New Industries Limited",
            "Adani Wilmar Limited", "Ambuja Cements Limited", "ACC Limited"
        ]

    def _load_model(self):
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForTokenClassification.from_pretrained(self.model_name)
        self.nlp_pipeline = pipeline(
            "ner",
            model=self.model,
            tokenizer=self.tokenizer,
            aggregation_strategy="simple"
        )

    def extract_text_from_pdf(self, pdf_path: str) -> str:
        reader = PdfReader(pdf_path)
        return "\n".join([page.extract_text() or "" for page in reader.pages])

    def extract_text_from_docx(self, docx_path: str) -> str:
        doc = docx.Document(docx_path)
        return "\n".join([para.text for para in doc.paragraphs if para.text.strip()])

    def extract_text_from_ppt(self, ppt_path: str) -> str:
        prs = Presentation(ppt_path)
        text = ""
        for slide in prs.slides:
            for shape in slide.shapes:
                if hasattr(shape, "text"):
                    text += shape.text + "\n"
        return text

    def preprocess_text(self, text: str) -> str:
        text = re.sub(r'\s+', ' ', text)
        text = text.replace("₹", "INR ").replace("$", "USD ")
        text = ''.join(c for c in text if c in string.printable)
        return text.strip()

    def extract_company_names(self, text: str) -> List[str]:
        companies = []
        for company in self.adani_companies:
            if company.lower() in text.lower():
                companies.append(company)

        bert_entities = self.nlp_pipeline(text)
        for ent in bert_entities:
            if ent["entity_group"] == "ORG":
                company_name = ent["word"]
                if 5 < len(company_name) < 50 and company_name not in companies:
                    companies.append(company_name.strip())

        return list(set(companies))

    def extract_financial_events(self, text: str) -> List[str]:
        events = []
        event_patterns = [
            r"(?:acquired|merged with|took over|acquisition of|merger with)\s+([A-Z][a-zA-Z0-9\s&.,-]+?)(?:\.|,|\s|$)",
            r"(?:declared|announced)\s+(?:a\s+)?(?:dividend|interim dividend|final dividend)\s+(?:of|@)\s+([₹$Rs]?\s?[0-9,]+(?:\.[0-9]+)?)(?:\s*(?:per share|/share))?",
            r"(?:AGM|EGM|Annual General Meeting|Extraordinary General Meeting|Board Meeting)[\s:]*([A-Za-z0-9\s,.-]+?)(?:\.|,|\s|$)",
            r"(?:launched|commissioned|inaugurated)\s+([A-Z][a-zA-Z0-9\s&.,-]+?)(?:\.|,|\s|$)",
            r"(?:secured|won|awarded)\s+(?:a\s+)?(?:contract|order|project)\s+(?:worth|of|valued at)?\s*([₹$Rs]?\s?[0-9,]+(?:\.[0-9]+)?\s?(Cr|Million|Billion|Lakh|Thousand|crore|mn|bn)?)",
            r"(?:raised|mopped up)\s+(?:funds|capital)\s+(?:of|worth|through)\s+([₹$Rs]?\s?[0-9,]+(?:\.[0-9]+)?\s?(Cr|Million|Billion|Lakh|Thousand|crore|mn|bn)?)(?:\s*(?:via|through)\s+([A-Za-z\s]+))?",
        ]
        for pattern in event_patterns:
            matches = re.findall(pattern, text, re.IGNORECASE)
            for match in matches:
                event_phrase = re.search(pattern, text, re.IGNORECASE)
                if event_phrase:
                    full_event = event_phrase.group(0).strip()
                    if len(full_event) > 10 and full_event not in events:
                        events.append(full_event)
        return list(set(events))

    def extract_stock_prices(self, text: str) -> List[str]:
        stock_prices = []
        stock_price_patterns = [
            r"(?:Stock Price|Share Price|Current Price|Price)[\s:]*([₹$Rs]?\s?[0-9,]+(?:\.[0-9]+)?)",
            r"([₹$Rs]?\s?[0-9,]+(?:\.[0-9]+)?)\s*(?:per share|/share)",
        ]
        for pattern in stock_price_patterns:
            matches = re.findall(pattern, text, re.IGNORECASE)
            for match in matches:
                if match and len(match.replace("₹", "").replace("$", "").replace("Rs", "").replace(" ", "").replace(",", "")) < 10:
                    if match not in stock_prices:
                        stock_prices.append(match.strip())
        return stock_prices

    def extract_revenue(self, text: str) -> List[str]:
        revenue = []
        revenue_patterns = [
            r"(?:Revenue|Total Income|Sales|Turnover)[\s:]*([₹$Rs]?\s?[0-9,]+(?:\.[0-9]+)?\s?(Cr|Million|Billion|Lakh|Thousand|crore|mn|bn)?)",
            r"Revenue\s+(?:grew to|of|was|reached)\s+([₹$Rs]?\s?[0-9,]+(?:\.[0-9]+)?\s?(Cr|Million|Billion|Lakh|Thousand|crore|mn|bn)?)"
        ]
        for pattern in revenue_patterns:
            matches = re.findall(pattern, text, re.IGNORECASE)
            for match in matches:
                if match and match not in revenue:
                    revenue.append(match.strip())
        return revenue

    def extract_market_cap(self, text: str) -> List[str]:
        market_cap = []
        market_cap_patterns = [
            r"(?:Market Cap|Market Capitalization|M Cap)[\s:]*([₹$Rs]?\s?[0-9,]+(?:\.[0-9]+)?\s?(Cr|Million|Billion|Lakh|Thousand|crore|mn|bn)?)",
        ]
        for pattern in market_cap_patterns:
            matches = re.findall(pattern, text, re.IGNORECASE)
            for match in matches:
                if match and match not in market_cap:
                    market_cap.append(match.strip())
        return market_cap

    def extract_earnings(self, text: str) -> List[str]:
        earnings = []
        earnings_patterns = [
            r"(?:Net (?:Profit|Loss)|PAT|Profit After Tax|Earnings)[\s:]*([₹$Rs]?\s?[0-9,]+(?:\.[0-9]+)?\s?(Cr|Million|Billion|Lakh|Thousand|crore|mn|bn)?)",
            r"(?:Profit Before Tax|PBT|EBITDA|EBIT)[\s:]*([₹$Rs]?\s?[0-9,]+(?:\.[0-9]+)?\s?(Cr|Million|Billion|Lakh|Thousand|crore|mn|bn)?)",
            r"(?:EPS|Earnings per share)[\s:]*([₹$Rs]?\s?[0-9,]+(?:\.[0-9]+)?)",
        ]
        for pattern in earnings_patterns:
            matches = re.findall(pattern, text, re.IGNORECASE)
            for match in matches:
                if match and match not in earnings:
                    earnings.append(match.strip())
        return earnings

    def extract_financial_ratios(self, text: str) -> List[str]:
        ratios = []
        ratio_patterns = [
            r"(?:P/E Ratio|Price Earnings Ratio)[\s:]*([0-9]+(?:\.[0-9]+)?)",
            r"(?:ROE|Return on Equity)[\s:]*([0-9]+(?:\.[0-9]+)?%?)",
            r"(?:ROCE|Return on Capital Employed)[\s:]*([0-9]+(?:\.[0-9]+)?%?)",
            r"(?:Debt Equity Ratio|D/E Ratio)[\s:]*([0-9]+(?:\.[0-9]+)?)",
        ]
        for pattern in ratio_patterns:
            matches = re.findall(pattern, text, re.IGNORECASE)
            for match in matches:
                if match and match not in ratios:
                    ratios.append(match.strip())
        return ratios

    def extract_financial_dates(self, text: str) -> List[str]:
        dates = []
        date_patterns = [
            r"\bQ[1-4]\s+FY\d{2,4}\b", r"\bFY\d{2,4}\b",
            r"\b\d{1,2}\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{4}\b",
            r"\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2},?\s+\d{4}\b",
            r"\b\d{1,2}/\d{1,2}/\d{4}\b", r"\b\d{1,2}-\d{1,2}-\d{4}\b", r"\b\d{4}\b"
        ]
        for pattern in date_patterns:
            matches = re.findall(pattern, text, re.IGNORECASE)
            for match in matches:
                if pattern == r"\b\d{4}\b" and (int(match) < 1900 or int(match) > 2100):
                    continue
                if match not in dates:
                    dates.append(match.strip())
        return dates

    def extract_phone_numbers(self, text: str) -> List[str]:
        phone_numbers = []
        phone_patterns = [r"\+?\d{1,3}[-.\s]?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}"]
        for pattern in phone_patterns:
            matches = re.findall(pattern, text)
            for match in matches:
                if match not in phone_numbers:
                    phone_numbers.append(match.strip())
        return phone_numbers

    def process_document(self, file_path: str) -> Dict[str, Union[str, List[str]]]:
        if not os.path.exists(file_path):
            return {"error": f"File does not exist: {file_path}"}

        if file_path.lower().endswith(".pdf"):
            raw_text = self.extract_text_from_pdf(file_path)
        elif file_path.lower().endswith(".docx"):
            raw_text = self.extract_text_from_docx(file_path)
        elif file_path.lower().endswith((".ppt", ".pptx")):
            raw_text = self.extract_text_from_ppt(file_path)
        else:
            return {"error": "Unsupported file format. Please use PDF, DOCX, or PPT/PPTX."}

        if not raw_text.strip():
            return {"error": "No text could be extracted from the document."}

        text = self.preprocess_text(raw_text)

        company_names = self.extract_company_names(text)
        financial_events = self.extract_financial_events(text)
        stock_prices = self.extract_stock_prices(text)
        revenue = self.extract_revenue(text)
        market_cap = self.extract_market_cap(text)
        earnings = self.extract_earnings(text)
        financial_ratios = self.extract_financial_ratios(text)
        financial_dates = self.extract_financial_dates(text)
        phone_numbers = self.extract_phone_numbers(text)

        result = {
            "file_path": file_path,
            "text_length": len(text),
            "entities": {
                "company_names": company_names,
                "financial_events": financial_events,
                "stock_prices": stock_prices,
                "revenue": revenue,
                "market_cap": market_cap,
                "earnings": earnings,
                "financial_ratios": financial_ratios,
                "financial_dates": financial_dates,
                "phone_numbers": phone_numbers
            }
        }
        return result

In [33]:
# Initialize extractor
extractor = FinancialEntityExtractor()

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


In [34]:
# Upload and process document
uploaded = files.upload()

if uploaded:
    filename = list(uploaded.keys())[0]
    results = extractor.process_document(filename)

    if "error" in results:
        print(f"Error: {results['error']}")
    else:
        print(f"\nDocument Analysis Results")
        print(f"Text Length: {results['text_length']} characters")

        for entity_type, entities in results["entities"].items():
            print(f"\n{entity_type.replace('_', ' ').title()} ({len(entities)} found)")
            if entities:
                for entity in entities[:10]:
                    print(f"- {entity}")
            else:
                print("No entities found.")
else:
    print("No file uploaded.")

Saving AEL_Earnings_Presentation_Q1-FY26.pdf to AEL_Earnings_Presentation_Q1-FY26 (5).pdf

Document Analysis Results
Text Length: 34766 characters

Company Names (25 found)
- Mumbai
- Adani Connex Data Center
- Transport & Logistics Adani Airports Holdings Ltd
- Energy & Utility ANIL Ecosystem
- Energy
- Airports and Roads Large Infra
- Primary Industries Mining services
- Adani Enterprises Limited
- Adani Roads Transport Limited
- Adani Power Limited

Financial Events (2 found)
- acquisition of NQXT
- commissioned Indias

Stock Prices (0 found)
No entities found.

Revenue (0 found)
No entities found.

Market Cap (0 found)
No entities found.

Earnings (0 found)
No entities found.

Financial Ratios (0 found)
No entities found.

Financial Dates (21 found)
- Q1 FY26
- Q1 FY25
- FY26
- FY25
- FY19
- FY20
- FY21
- FY22
- FY23
- FY24

Phone Numbers (0 found)
No entities found.


In [35]:
# Save results
if 'results' in locals() and "error" not in results:
    output_filename = "financial_entities.json"
    with open(output_filename, 'w') as f:
        json.dump(results, f, indent=2)
    files.download(output_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>