In [1]:
# Install dependencies
!pip install -q transformers pypdf python-docx python-pptx

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m322.5/322.5 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.8/472.8 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m175.3/175.3 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from pypdf import PdfReader
import docx
from pptx import Presentation
import os
import re
import string

In [3]:
# --- 1. Load BERT NER Model ---
model_name = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
nlp_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


In [4]:
# --- 2. Text Extraction Functions ---
def extract_text_from_pdf(pdf_path):
    reader = PdfReader(pdf_path)
    return "\n".join([page.extract_text() or "" for page in reader.pages])

def extract_text_from_docx(docx_path):
    doc = docx.Document(docx_path)
    return "\n".join([para.text for para in doc.paragraphs if para.text.strip()])

def extract_text_from_ppt(ppt_path):
    prs = Presentation(ppt_path)
    text = ""
    for slide in prs.slides:
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                text += shape.text + "\n"
    return text


In [5]:
# --- 3. Preprocessing Function ---
def preprocess_text(text):
    # Remove extra whitespace and newlines
    text = re.sub(r'\s+', ' ', text)
    # Normalize currency symbols
    text = text.replace("₹", "INR ").replace("$", "USD ")
    # Remove non-printable characters
    text = ''.join(c for c in text if c in string.printable)
    return text.strip()


In [6]:
# --- 4. Custom Regex for Financial Entities ---
def custom_entity_rules(text):
    patterns = {
        "EPS": r"(EPS|Earnings per share)[\s:]*([0-9]+(\.[0-9]+)?)",
        "DIVIDEND": r"(Dividend)[\s:]*([0-9]+(\.[0-9]+)?\s?(per share)?)",
        "REVENUE": r"(Revenue|Total Income)[\s:]*([0-9,]+(\.\d+)?\s?(Cr|Million|Billion)?)",
        "PROFIT_LOSS": r"(Net (Profit|Loss)|PAT)[\s:]*([0-9,]+(\.\d+)?\s?(Cr|Million|Billion)?)",
        "MARGIN": r"(Operating Margin|EBITDA Margin)[\s:]*([0-9]+(\.[0-9]+)?%)",
        "ASSET": r"(Total Assets)[\s:]*([0-9,]+(\.\d+)?\s?(Cr|Million|Billion)?)",
        "LIABILITY": r"(Total Liabilities)[\s:]*([0-9,]+(\.\d+)?\s?(Cr|Million|Billion)?)"
    }

    found_entities = []
    for label, pattern in patterns.items():
        matches = re.findall(pattern, text, re.IGNORECASE)
        for match in matches:
            value = match[1] if len(match) > 1 else match[0]
            found_entities.append({"entity": value.strip(), "type": label})
    return found_entities

In [7]:
# --- 5. Main Execution ---
file_path = input(" Enter path to financial document (.pdf, .docx, .pptx): ").strip()
if not os.path.exists(file_path):
    raise FileNotFoundError(" File does not exist.")

# Extract text
if file_path.lower().endswith(".pdf"):
    raw_text = extract_text_from_pdf(file_path)
elif file_path.lower().endswith(".docx"):
    raw_text = extract_text_from_docx(file_path)
elif file_path.lower().endswith((".ppt", ".pptx")):
    raw_text = extract_text_from_ppt(file_path)
else:
    raise ValueError(" Unsupported file format.")

# Preprocess text
text = preprocess_text(raw_text)
print("Text extracted and preprocessed successfully!\n")

# Run BERT NER
bert_entities = nlp_pipeline(text)

# Filter relevant BERT entities
financial_labels = {"ORG", "MONEY", "DATE", "PERCENT"}
bert_filtered = [{"entity": ent["word"], "type": ent["entity_group"]}
                 for ent in bert_entities if ent["entity_group"] in financial_labels]

# Run custom regex detection
custom_entities = custom_entity_rules(text)

# Combine and deduplicate
all_entities = bert_filtered + custom_entities
seen = set()
unique_entities = []
for e in all_entities:
    key = (e["entity"].lower(), e["type"])
    if key not in seen:
        seen.add(key)
        unique_entities.append(e)

# Display results
if unique_entities:
    print(f" Extracted Financial Entities:\n")
    for ent in unique_entities:
        print(f"{ent['entity']}  →  {ent['type']}")
else:
    print(" No financial entities found.")


 Enter path to financial document (.pdf, .docx, .pptx): /content/AEL_Earnings_Presentation_Q1-FY26.pdf
Text extracted and preprocessed successfully!

 Extracted Financial Entities:

Adani Enterprises Limited  →  ORG
Consolidated Financial Highlights  →  ORG
Hamburger Men  →  ORG
I  →  ORG
##BI  →  ORG
EBITDA  →  ORG
IRM  →  ORG
Commercial Mining Inc  →  ORG
Airports and Roads Large Infra  →  ORG
Mumbai  →  ORG
Copper Plant  →  ORG
EBIT  →  ORG
Energy & Utility ANIL Ecosystem  →  ORG
Adani Connex Data Center  →  ORG
Transport & Logistics Adani Airports Holdings Ltd  →  ORG
Adani Road Transport Ltd  →  ORG
Primary Industries Mining services  →  ORG
Mining, Metals and Industrials  →  ORG
A  →  ORG
Hamburg  →  ORG
Men  →  ORG
Green  →  ORG
In  →  ORG
Inc  →  ORG
Energy  →  ORG
Ada  →  ORG
