In [2]:
import cv2
import pytesseract
import re

# Ensure Tesseract is in your PATH or specify the pytesseract path like so:
# pytesseract.pytesseract.tesseract_cmd = r'<full_path_to_your_tesseract_executable>'

def preprocess_image(image_path):
    """Load an image and binarize it for better OCR results.

    The image is read with OpenCV, converted to grayscale, smoothed with a
    5x5 Gaussian blur, and thresholded with Otsu's method.

    Args:
        image_path: Path to the image file to load.

    Returns:
        The thresholded image returned by ``cv2.threshold``.

    Raises:
        FileNotFoundError: If ``image_path`` is not an existing file.
        ValueError: If the file exists but OpenCV cannot decode it.
    """
    import os

    # Fail early with a clear message instead of an opaque OpenCV assertion
    # raised later by cvtColor.
    if not os.path.isfile(image_path):
        raise FileNotFoundError("Image file not found: {!r}".format(image_path))

    image = cv2.imread(image_path)
    # cv2.imread reports failure by returning None rather than raising.
    if image is None:
        raise ValueError("Could not decode image: {!r}".format(image_path))

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Blur before thresholding to suppress noise; with THRESH_OTSU the
    # threshold argument (0) is ignored and computed automatically.
    blur = cv2.GaussianBlur(gray, (5, 5), 0)
    thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
    return thresh

def extract_text(image):
    """Run Tesseract OCR on a preprocessed image and return the text.

    Args:
        image: The image to recognize, as produced by the preprocessing step.

    Returns:
        The string returned by ``pytesseract.image_to_string``.
    """
    # lang='eng' requests Tesseract's English language data.
    return pytesseract.image_to_string(image, lang='eng')

def extract_information(text):
    """Find email addresses and phone numbers in OCR-extracted text.

    The matches are printed to stdout and returned together with the
    unmodified input text so callers can do further analysis (e.g. names).

    Args:
        text: The raw text to search.

    Returns:
        A tuple ``(emails, phones, text)`` where ``emails`` and ``phones``
        are lists of matched strings and ``text`` is the input unchanged.
    """
    # The TLD class must be [A-Za-z]; writing [A-Z|a-z] would also accept a
    # literal '|', which OCR commonly emits for vertical separators.
    email_regex = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    # Optional leading '+', then at least ten characters made of digits,
    # spaces or hyphens, beginning and ending with a digit.
    phone_regex = r'\+?\d[\d -]{8,}\d'

    emails = re.findall(email_regex, text)
    phones = re.findall(phone_regex, text)

    print("Extracted Emails:", emails)
    print("Extracted Phone Numbers:", phones)

    # Names and other details would need specific patterns or post-processing;
    # the full text is returned for that purpose.
    return emails, phones, text

def main(image_path, output_path='extracted_info.txt'):
    """Run the OCR pipeline on one image and save the results to a file.

    The image is preprocessed, passed through OCR, and scanned for emails
    and phone numbers. The matches and the full extracted text are then
    written to ``output_path``.

    Args:
        image_path: Path to the business card image to process.
        output_path: Destination text file; defaults to
            ``'extracted_info.txt'`` in the current working directory.
    """
    preprocessed_image = preprocess_image(image_path)
    extracted_text = extract_text(preprocessed_image)
    emails, phones, text = extract_information(extracted_text)

    # OCR output may contain non-ASCII characters; an explicit UTF-8 encoding
    # avoids UnicodeEncodeError on platforms with a narrower default encoding.
    with open(output_path, 'w', encoding='utf-8') as file:
        file.write("Extracted Emails:\n")
        file.write("\n".join(emails))
        file.write("\nExtracted Phone Numbers:\n")
        file.write("\n".join(phones))
        file.write("\n\nFull Extracted Text:\n")
        file.write(text)

    print("Information extracted and saved to '{}'.".format(output_path))

# Script entry: runs the whole pipeline when this cell/module is executed.
# 'image path' is a placeholder; replace it with the path to your business
# card image before running.
image_path = 'image path'
main(image_path)


Extracted Emails: ['jim@jimsgraphics.com']
Extracted Phone Numbers: ['999-999-9999']
Information extracted and saved to 'extracted_info.txt'.
