In [39]:
!pip install pdfplumber pytesseract pdf2image



In [40]:
import re
import pdfplumber
from pdf2image import convert_from_path
import tempfile


In [41]:
def extract_text_from_pdf(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"
    return text.strip()


In [42]:
def extract_emails(text):
    email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-z]{2,}'
    emails = list(set(re.findall(email_pattern, text)))
    return emails if emails else ["Not found"]

In [43]:
def extract_phones(text):
    # More comprehensive regex for various phone number formats
    phone_pattern = r'\+?\d{1,3}?[-.\s]?\(?\d{2,4}\)?[-.\s]?\d{3,4}[-.\s]?\d{4}|\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}|\d{10}'
    phones = re.findall(phone_pattern, text)
    # Join the tuple elements into a string and then strip whitespace if needed (though this regex shouldn't produce tuples)
    phones = [p.strip() for p in phones if p.strip()]
    return list(set(phones)) if phones else ["Not found"]

In [44]:
def extract_names(text):
    # Matches 'Name: John Doe' or capitalized pairs like 'Kalluri Harshitha'
    labeled_names = re.findall(r'(?:Name|Full Name)[:\s]+([A-Z][a-z]+\s[A-Z][a-z]+)', text)
    capitalized_names = re.findall(r'\b[A-Z][a-z]+\s[A-Z][a-z]+\b', text)
    names = list(set(labeled_names+capitalized_names))
    # Filter out words like "Phone Number", "Email ID"
    names = [n for n in names if "Email" not in n and "Phone" not in n]
    return names if names else ["Not found"]


In [45]:
def extract_info(pdf_path):
    text = extract_text_from_pdf(pdf_path)
    names = extract_names(text)
    emails = extract_emails(text)
    phones = extract_phones(text)

    return {"Names": names, "Emails": emails, "Phones": phones}

In [46]:
pdf_path = input("Enter PDF file path: ").strip()
result = extract_info(pdf_path)

print("\nExtracted Information:")
for key, values in result.items():
  print(f"{key}:")
  for val in values:
    print(f"  - {val}")

Enter PDF file path: /content/Forsys_AI_Quarterly_Performance_Report.pdf

Extracted Information:
Names:
  - Executive Team
  - Kalluri Harshitha
  - Priya
Ramesh
  - Amal Thomas
  - Rajesh Kumar
  - Pvt Ltd
  - Research
Date
Emails:
  - harshithakalluri0@gmail.com
  - amal@forsys.ai
  - priya.r@forsys.ai
Phones:
  - +1 408 555 1223
  - +91 309953693
