In [11]:
#!/usr/bin/env python3

import pytesseract
from PIL import Image
import pandas as pd
import re
import os
import glob
from pathlib import Path

# Optional: Uncomment and set if Tesseract is not in PATH (Windows)
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

class ContactExtractor:
    """Extract contact details from PNG images with OCR and export them to Excel.

    ``run`` drives the whole pipeline: every PNG in ``images_folder`` is
    converted to grayscale (and upscaled when small), passed through
    Tesseract, and the recognised text is parsed into one record per image.
    The records accumulate in ``self.contacts`` and are written to
    ``output_file`` as a spreadsheet.
    """

    def __init__(self, images_folder, output_file="clean_contacts.xlsx"):
        """Store the input/output locations and build the parsing patterns.

        Args:
            images_folder: Directory that holds the PNG images to process.
            output_file: Path of the Excel workbook written by
                ``export_to_excel``.
        """
        self.images_folder = images_folder
        self.output_file = output_file
        self.contacts = []

        self.patterns = {
            # TLD class is [A-Za-z]; a '|' inside the class would be a literal
            # pipe and would glue OCR column separators onto the address.
            'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
            # The digit lookarounds stop the pattern from carving a "phone
            # number" out of the middle of a longer digit run.
            'phone': (
                r'(?<!\d)(?:\+?1[-.\s]?)?\(?([0-9]{3})\)?[-.\s]?'
                r'([0-9]{3})[-.\s]?([0-9]{4})(?!\d)'
            ),
            # The marker must not be preceded by a letter, so the "x" in
            # "Box 42" or "Fax 555..." is ignored, while "4567x89" and
            # "ext. 89" are still recognised.
            'extension': r'(?<![A-Za-z])(?:extension|ext|x)\.?\s*(\d+)',
        }

        # Lower-case words that mark a line as a job title; the first keyword
        # found also becomes the record's "Role Type".
        self.title_keywords = [
            'director', 'manager', 'engineer', 'executive',
            'specialist', 'coordinator', 'analyst', 'assistant',
            'supervisor', 'officer'
        ]

    def clean_field(self, line):
        """Strip a leading field label ("Name:", "Job Title -", ...) from ``line``.

        The label must be a whole word, so text that merely starts with a
        label (for example "Named Accounts Manager") is returned unchanged.

        Args:
            line: One line of OCR text.

        Returns:
            The line without its label prefix, with surrounding whitespace
            removed.
        """
        return re.sub(
            r'^(?:Name|Full Name|Contact Name|Title|Job Title|Position)\b\s*[:\-]?\s*',
            '',
            line,
            flags=re.IGNORECASE
        ).strip()

    def preprocess_image(self, image_path):
        """Load an image as grayscale and upscale it if it is small.

        Images narrower than 800 px or shorter than 600 px are enlarged with
        LANCZOS resampling, keeping the aspect ratio, until both minimums are
        met.

        Args:
            image_path: Path of the image file to load.

        Returns:
            A grayscale ``PIL.Image.Image`` held fully in memory, or ``None``
            if the file could not be loaded (the error is printed).
        """
        try:
            # convert() always returns a new in-memory image, so the file
            # handle can be released as soon as the with-block ends.
            with Image.open(image_path) as source:
                image = source.convert('L')

            width, height = image.size
            if width < 800 or height < 600:
                scale = max(800 / width, 600 / height)
                image = image.resize(
                    (int(width * scale), int(height * scale)),
                    Image.Resampling.LANCZOS
                )

            return image
        except Exception as e:
            # Best effort: a single unreadable file must not abort the batch.
            print(f"Error loading {image_path}: {e}")
            return None

    def extract_text(self, image):
        """Run Tesseract on ``image`` and return the recognised text, stripped.

        Tesseract is called with ``--oem 3 --psm 6`` (default engine; page
        treated as one uniform block of text, per the Tesseract manual).
        """
        config = r'--oem 3 --psm 6'
        return pytesseract.image_to_string(image, config=config).strip()

    def extract_fields(self, text):
        """Parse OCR text into a contact record.

        Args:
            text: Raw text recognised from one image.

        Returns:
            A dict with the keys 'Contact Name', 'Title', 'Phone',
            'Phone Ext', 'Email', 'Role Type' and 'Raw Text'. Fields that
            could not be found are empty strings.
        """
        stripped_text = text.strip()
        contact = {
            'Contact Name': '',
            'Title': '',
            'Phone': '',
            'Phone Ext': '',
            'Email': '',
            'Role Type': '',
            'Raw Text': stripped_text
        }

        lines = [line.strip() for line in stripped_text.split("\n") if line.strip()]

        # Email: first address found anywhere in the text.
        email_match = re.search(self.patterns['email'], text)
        if email_match:
            contact['Email'] = email_match.group(0)

        # Phone: first number found, normalised to "(AAA) BBB-CCCC".
        phone_match = re.search(self.patterns['phone'], text)
        if phone_match:
            area, prefix, number = phone_match.groups()
            contact['Phone'] = f"({area}) {prefix}-{number}"

        # Extension: first "ext"/"extension"/"x" marker followed by digits.
        ext_match = re.search(self.patterns['extension'], text, re.IGNORECASE)
        if ext_match:
            contact['Phone Ext'] = ext_match.group(1)

        # Title + Role Type: first line that contains any title keyword.
        for line in lines:
            lowered = line.lower()
            keyword = next((k for k in self.title_keywords if k in lowered), None)
            if keyword is not None:
                contact['Title'] = self.clean_field(line)
                contact['Role Type'] = keyword.title()
                break

        # Contact Name: prefer an explicitly labelled line. The label has to
        # open the line, so "Company Name: Acme" is not mistaken for a person,
        # and the value has to contain more than a bare separator.
        for line in lines:
            match = re.match(
                r'(?:Contact Name|Full Name|Name)\b\s*[:\-]?\s*([^\s:\-].*)',
                line,
                re.IGNORECASE
            )
            if match:
                contact['Contact Name'] = match.group(1).strip()
                break

        # Fallback: first line of two to four words that looks like a name.
        if not contact['Contact Name']:
            for line in lines:
                lowered = line.lower()
                if any(word in lowered for word in ('phone', 'email', 'fax')):
                    continue
                # Unlabelled e-mail addresses or phone numbers are not names.
                if '@' in line or re.search(self.patterns['phone'], line):
                    continue
                candidate = self.clean_field(line)
                # Compare the cleaned line: the stored title already had its
                # "Title:" label removed, so the raw line would never match.
                if contact['Title'] and candidate == contact['Title']:
                    continue
                if re.search(r'[a-zA-Z]', line) and len(line.split()) in (2, 3, 4):
                    contact['Contact Name'] = candidate
                    break

        return contact

    def _find_png_files(self):
        """Return the sorted paths of all PNG files in ``images_folder``.

        The suffix check is case-insensitive (".png", ".PNG", ".Png") and each
        file is listed once regardless of how the filesystem treats case.
        Hidden files (leading dot) are skipped. A missing folder yields an
        empty list.
        """
        if not os.path.isdir(self.images_folder):
            return []

        paths = []
        for entry in os.listdir(self.images_folder):
            if entry.startswith('.') or not entry.lower().endswith('.png'):
                continue
            path = os.path.join(self.images_folder, entry)
            if os.path.isfile(path):
                paths.append(path)
        return sorted(paths)

    def process_images(self):
        """OCR every PNG in ``images_folder`` and append the parsed contacts.

        Images that cannot be loaded, or that produce no text, are skipped.
        Prints a message and returns without changes when no PNG is found.
        """
        files = self._find_png_files()

        if not files:
            print("No PNG files found.")
            return

        for image_path in files:
            image = self.preprocess_image(image_path)
            if image is None:
                continue
            text = self.extract_text(image)
            if text:
                self.contacts.append(self.extract_fields(text))

    def export_to_excel(self):
        """Write the collected contacts to ``output_file`` as an Excel sheet.

        Prints a message and writes nothing when no contacts were collected.
        """
        if not self.contacts:
            print("No contacts to export.")
            return
        df = pd.DataFrame(self.contacts)
        df.to_excel(self.output_file, index=False, engine='openpyxl')
        print(f"Exported {len(df)} contacts to {self.output_file}")

    def run(self):
        """Process all images, then export the results."""
        self.process_images()
        self.export_to_excel()

if __name__ == "__main__":
    # Machine-specific locations: point both at your own folders before running.
    IMAGES_FOLDER = "/Users/arjunkhatiwada/Documents/pngtoexcel/pngimages"
    OUTPUT_FILE = "/Users/arjunkhatiwada/Documents/pngtoexcel/clean_contacts.xlsx"

    # OCR every PNG in IMAGES_FOLDER and write the parsed contacts to OUTPUT_FILE.
    extractor = ContactExtractor(images_folder=IMAGES_FOLDER, output_file=OUTPUT_FILE)
    extractor.run()


Exported 3 contacts to /Users/arjunkhatiwada/Documents/pngtoexcel/clean_contacts.xlsx


In [3]:
!pip install pytesseract

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13


In [4]:
import pytesseract