In [None]:

# Step 1: Install required packages
!pip install pymupdf

# Step 2: Import libraries
import fitz  # PyMuPDF for PDF processing
import re    # Regular expressions for pattern matching
from google.colab import files

def extract_contact_info(pdf_path):
    """
    MAIN FUNCTION: Extracts contact information from a PDF.

    Opens the document with PyMuPDF, concatenates the text of every page,
    and runs the name, email and phone extractors over that text.

    Args:
        pdf_path (str): Path to the PDF file
    Returns:
        dict: ``{"name": ..., "email": ..., "phone": ...}`` on success, or
        ``{"error": "Failed to process PDF: <reason>"}`` if anything raised
        while opening, reading or parsing the document. This function never
        raises; callers must check for the "error" key.
    """
    try:
        # The context manager closes the document even when get_text() raises
        # part-way through, so the underlying file handle is never leaked.
        with fitz.open(pdf_path) as doc:
            # join() avoids repeated string reallocation on large documents.
            full_text = "".join(page.get_text() for page in doc)

        # Extract individual components
        return {
            "name": extract_name(full_text),
            "email": extract_email(full_text),
            "phone": extract_phone(full_text),
        }

    except Exception as e:
        # Deliberate catch-all boundary: callers expect an error dict rather
        # than an exception.
        return {"error": f"Failed to process PDF: {str(e)}"}

def extract_email(text):
    """
    Extract the first email address found in the text.

    Pattern: username@domain.tld, where the TLD is two or more ASCII letters.

    Args:
        text (str): Text to search
    Returns:
        str: The first matching address, or "Not found" when the text is
        empty or contains no address.
    """
    if not text:
        return "Not found"

    # The TLD class is [A-Za-z]. Inside a character class "|" is a literal
    # pipe, not alternation, so writing [A-Z|a-z] would let pipe-separated
    # text such as "a@x.com|b@y.org" be swallowed into a single match.
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    match = re.search(email_pattern, text)
    return match.group(0) if match else "Not found"

def extract_phone(text):
    """
    Extract the first phone number that appears in the text.

    Supported formats (10-digit national number, optional country code):
        +CC 555-123-4567 / +CC (555) 123 4567   international prefix
        (555) 123-4567                          bracketed area code
        555-123-4567 / 555.123.4567 / 5551234567
        555 123 4567                            space separated

    Args:
        text (str): Text to search
    Returns:
        str: The earliest matching phone number in the text, or "Not found"
        when the text is empty or contains no number.
    """
    if not text:
        return "Not found"

    # A single alternation is used so that the number appearing earliest in
    # the text wins, and at any given position the most specific format
    # (with country code / brackets) is tried before the bare-digit forms.
    #
    # Boundaries use lookarounds rather than \b: "\b\(" and "\b\+" only match
    # when a word character directly precedes "(" or "+", which is never the
    # case for a number at the start of a line or after whitespace.
    phone_pattern = (
        r'(?<!\w)(?:'
        r'\+\d{1,3}[-.\s]?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}'
        r'|\(\d{3}\)\s*\d{3}[-.]?\d{4}'
        r'|\d{3}[-.]?\d{3}[-.]?\d{4}'
        r'|\d{3}\s\d{3}\s\d{4}'
        r')(?!\w)'
    )

    match = re.search(phone_pattern, text)
    return match.group(0) if match else "Not found"

def extract_name(text):
    """
    Extract a potential person name using a heuristic approach.

    Scans the first 10 lines of the text for runs of 2-3 consecutive
    capitalized words and returns the first run that contains none of the
    known non-name words (company suffixes, document headings, ...).

    This is a heuristic only: any capitalized phrase near the top of the
    document (for example a city and country) can be returned as the "name".

    Args:
        text (str): Text to search
    Returns:
        str: The first plausible name, or "Not found"
    """
    if not text:
        return "Not found"

    # Pattern for names (2-3 words, each starting with a capital letter
    # followed by lowercase letters).
    name_pattern = r'\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,2})\b'

    # Common false positives to exclude. Compared against whole words:
    # a substring test would reject real names such as "Abraham Lincoln"
    # or "Vincent Price" merely because they contain "inc".
    false_positives = {
        'company', 'incorporated', 'corporate', 'department',
        'llc', 'inc', 'ltd', 'page', 'chapter', 'section',
        'resume', 'cv', 'curriculum', 'vitae',
    }

    # Check first 10 lines where names typically appear
    for line in text.split('\n')[:10]:
        for candidate in re.findall(name_pattern, line):
            words = candidate.lower().split()
            # The pattern already guarantees mixed case and a minimum length,
            # so only the upper length bound needs checking here.
            if false_positives.isdisjoint(words) and len(candidate) <= 50:
                return candidate

    return "Not found"

def process_pdf():
    """
    Complete workflow for PDF processing in Google Colab.

    Prompts for a file upload, runs extract_contact_info() on the first
    uploaded file, and prints the extracted name, email and phone.

    Returns:
        dict | None: The dict returned by extract_contact_info() (which is
        ``{"error": ...}`` when the file could not be processed), or None
        when no file was uploaded.
    """
    print("📄 PDF Contact Information Extractor")
    print("=" * 50)

    # Upload PDF file
    print("1. Upload your PDF file:")
    uploaded = files.upload()

    if not uploaded:
        print("❌ No file uploaded. Exiting.")
        return

    # Only the first uploaded file is processed.
    pdf_filename = next(iter(uploaded))
    print(f"✅ File uploaded: {pdf_filename}")

    # Extract contact information
    print("\n2. Extracting contact information...")
    results = extract_contact_info(pdf_filename)

    # Surface extraction failures instead of printing three misleading
    # "Not found" lines for a file that could not be read at all.
    if "error" in results:
        print(f"\n❌ {results['error']}")
        return results

    # Display results
    print("\n3. Extraction Results:")
    print("=" * 30)
    print(f"📝 Name:  {results.get('name', 'Not found')}")
    print(f"📧 Email: {results.get('email', 'Not found')}")
    print(f"📞 Phone: {results.get('phone', 'Not found')}")

    return results


def extract_contacts_advanced(pdf_path):
    """
    ADVANCED VERSION: Basic extraction plus every email and phone found.

    Runs extract_contact_info() and, if that succeeds, adds:
        all_emails  (list[str]): unique emails, in order of first appearance
        all_phones  (list[str]): unique phone numbers, in order of first
                                 appearance
        text_length (int):       number of characters of extracted text
        pages       (int):       number of pages in the document

    Args:
        pdf_path (str): Path to the PDF file
    Returns:
        dict: The enriched result dict, or ``{"error": message}`` if the
        basic extraction failed or anything raised here. Never raises.
    """
    try:
        # Extract basic info
        basic_info = extract_contact_info(pdf_path)

        if "error" in basic_info:
            return basic_info

        # Open the document once for both the text and the page count; the
        # context manager guarantees the handle is closed even on error.
        with fitz.open(pdf_path) as doc:
            page_count = len(doc)
            full_text = "".join(page.get_text() for page in doc)

        # TLD class is [A-Za-z]; "|" inside a character class is a literal
        # pipe and would let pipe-separated addresses merge into one match.
        email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
        all_emails = re.findall(email_pattern, full_text)

        # One alternation, most specific format first, so each number is
        # reported once and in full. Lookarounds replace \b because "\b\("
        # and "\b\+" cannot match after whitespace or at the start of a line.
        phone_pattern = (
            r'(?<!\w)(?:'
            r'\+\d{1,3}[-.\s]?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}'
            r'|\(\d{3}\)\s*\d{3}[-.]?\d{4}'
            r'|\d{3}[-.]?\d{3}[-.]?\d{4}'
            r'|\d{3}\s\d{3}\s\d{4}'
            r')(?!\w)'
        )
        all_phones = re.findall(phone_pattern, full_text)

        # dict.fromkeys removes duplicates while keeping a stable order,
        # unlike set(), whose iteration order can vary between runs.
        basic_info.update({
            "all_emails": list(dict.fromkeys(all_emails)),
            "all_phones": list(dict.fromkeys(all_phones)),
            "text_length": len(full_text),
            "pages": page_count,
        })

        return basic_info

    except Exception as e:
        # Deliberate catch-all boundary: callers expect an error dict.
        return {"error": str(e)}

# Script entry point: runs only when this file/cell is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    # Run the interactive upload-and-extract workflow. The return value is
    # bound to `results`; it is None when no file was uploaded.
    results = process_pdf()

Collecting pymupdf
  Downloading pymupdf-1.26.5-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.5-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 24.1/24.1 MB 46.0 MB/s eta 0:00:00
Installing collected packages: pymupdf
Successfully installed pymupdf-1.26.5
📄 PDF Contact Information Extractor
1. Upload your PDF file:


Saving Mayur_Nage_Resume.docx to Mayur_Nage_Resume.docx
✅ File uploaded: Mayur_Nage_Resume.docx

2. Extracting contact information...

3. Extraction Results:
📝 Name:  Aurangabad India
📧 Email: mayurnage29@gmail.com
📞 Phone: 8239067102
