In [13]:
import PyPDF2
from pdf2image import convert_from_path
import pytesseract

# Specify the path to the tesseract executable
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract' # Update with your tesseract path if different

# Your existing code here...
def extract_w2_data(pdf_file):
    # Open PDF
    reader = PyPDF2.PdfReader(pdf_file)

    # Convert to image for OCR processing
    images = convert_from_path(pdf_file)

    # Process first page
    text_content = pytesseract.image_to_string(images[2])

    return text_content


In [15]:
ocr_text=extract_w2_data('w2_form.pdf')

In [18]:
import re

def clean_text(text):
    """ Pre-process the text to remove unwanted lines and clean formatting issues """
    # Remove common irrelevant lines or phrases from OCR text
    text = re.sub(r"11 Nonqualified plans.*?\n", "", text)
    text = re.sub(r"12a See instructions.*?\n", "", text)
    text = re.sub(r"a Other.*?\n", "", text)
    text = re.sub(r"ze\s+", "", text)  # Remove "ze" clutter in addresses
    return text

def extract_w2_info(text):
    # Clean input text
    text = clean_text(text)

    # Initialize a dictionary to store extracted fields
    extracted_info = {
        "Employer": None,
        "Employer Address": None,
        "Employee": None,
        "Employee Address": None,
        "Wages": None,
        "Federal Tax Withheld": None,
        "Social Security Wages": None,
        "Social Security Tax Withheld": None,
        "Medicare Wages": None,
        "Medicare Tax Withheld": None,
        "State Wages": None,
        "State Tax Withheld": None,
    }

    # Define regex patterns for key fields
    patterns = {
        "Employer": r"Employer's name, address, and ZIP code\s*\n(.*?)\n",
        "Employer Address": r"Employer's name, address, and ZIP code\s*\n.*?\n(.*?)(?:\n|$)(.*?)\n",
        "Employee": r"2\. Employee Name and Address\.\n(.*?)\n",
        "Employee Address": r"2\. Employee Name and Address\.\n.*?\n(.*?)(?:\n|$)(.*?)\n",
        "Wages": r"1 Wages, tips, other comp\.\s*\n(\d+\.\d+)",
        "Federal Tax Withheld": r"2 Federal income tax withheld\s*\n(\d+\.\d+)",
        "Social Security Wages": r"3 Social security wages\s*\n(\d+\.\d+)",
        "Social Security Tax Withheld": r"4 Social security tax withheld\s*\n(\d+\.\d+)",
        "Medicare Wages": r"5 Medicare wages and tips\s*\n(\d+\.\d+)",
        "Medicare Tax Withheld": r"6 Medicare tax withheld\s*\n(\d+\.\d+)",
        "State Wages": r"16 State wages, tips, etc\.\s*\n(\d+\.\d+)",
        "State Tax Withheld": r"17 State income tax\s*\n(\d+\.\d+)",
    }

    # Apply patterns to extract information
    for key, pattern in patterns.items():
        match = re.search(pattern, text, re.DOTALL)
        if match:
            if key in ["Employer Address", "Employee Address"]:
                extracted_info[key] = " ".join([line.strip() for line in match.groups() if line])
            else:
                extracted_info[key] = match.group(1).strip()

    # Clean up extracted data
    for key, value in extracted_info.items():
        if not value:
            extracted_info[key] = "Not Found"

    return extracted_info

# Example OCR output (replace this with the actual OCR text)
ocr_text = ocr_text

# Extract information
info = extract_w2_info(ocr_text)

# Print the extracted fields
for key, value in info.items():
    print(f"{key}: {value}")


Employer: TRAILS BUSINESS CORP
Employer Address: 18 PEARL ST BURLINGTON, VT 05401
Employee: =— DAVID B ROCK
Employee Address: ad ATLANTA, GA 30303 13 Stat emp] Ret. planjard party sick pay
Wages: Not Found
Federal Tax Withheld: 3514.74
Social Security Wages: Not Found
Social Security Tax Withheld: 2005.74
Medicare Wages: Not Found
Medicare Tax Withheld: 3514.74
State Wages: Not Found
State Tax Withheld: Not Found


In [21]:
import re

def clean_numeric_value(value):
    """
    Clean and convert numeric values, handling potential OCR inconsistencies
    """
    if not value or value == 'N/A':
        return None
    try:
        # Remove commas and convert to float
        return float(str(value).replace(',', ''))
    except ValueError:
        return None

def extract_w2_info(text):
    """
    Extract key information from a W-2 document text with improved robustness
    """
    w2_info = {
        "employee_name": None,
        "employee_address": None,
        "employer_name": None,
        "employer_address": None,
        "employer_ein": None,
        "employee_ssn": None,
        "total_wages": None,
        "federal_tax_withheld": None,
        "social_security_wages": None,
        "social_security_tax_withheld": None,
        "medicare_wages": None,
        "medicare_tax_withheld": None,
        "social_security_tips": None,
        "state_wages": None,
        "state_tax_withheld": None
    }

    # Improved name and address extraction
    name_pattern = r'(DAVID B ROCK)\n(.*)\n(.*)'
    name_match = re.search(name_pattern, text)
    if name_match:
        w2_info["employee_name"] = name_match.group(1)
        w2_info["employee_address"] = f"{name_match.group(2)}, {name_match.group(3)}"

    # Employer extraction
    employer_pattern = r'(TRAILS BUSINESS CORP)\n(.*)\n(.*)'
    employer_match = re.search(employer_pattern, text)
    if employer_match:
        w2_info["employer_name"] = employer_match.group(1)
        w2_info["employer_address"] = f"{employer_match.group(2)}, {employer_match.group(3)}"

    # SSN extraction with better matching
    ssn_pattern = r'(XXX-XX-\d{4})'
    ssn_match = re.search(ssn_pattern, text)
    if ssn_match:
        w2_info["employee_ssn"] = ssn_match.group(1)

    # EIN extraction
    ein_pattern = r'(\d{2}-\d{7})'
    ein_match = re.search(ein_pattern, text)
    if ein_match:
        w2_info["employer_ein"] = ein_match.group(1)

    # Numeric value extraction with multiple approaches
    numeric_extractions = {
        "total_wages": r'Gross Pay\s+([\d,\.]+)',
        "social_security_wages": r'Reported W-2 Wages\s+\d+\.\d+\s+([\d,\.]+)',
        "federal_tax_withheld": r'Federal income tax withheld\n([\d,\.]+)',
        "social_security_tax_withheld": r'Social security tax withheld\n([\d,\.]+)',
        "medicare_wages": r'Medicare wages and tips\n([\d,\.]+)',
        "medicare_tax_withheld": r'Medicare tax withheld\n([\d,\.]+)',
        "social_security_tips": r'Social security tips\n([\d,\.]+)',
        "state_wages": r'State wages, tips, etc\.\n([\d,\.]+)',
        "state_tax_withheld": r'State income tax\n([\d,\.]+)'
    }

    for key, pattern in numeric_extractions.items():
        match = re.search(pattern, text)
        if match:
            w2_info[key] = clean_numeric_value(match.group(1))

    return w2_info

def main():
    # Read the OCR text
    # with open('w2_ocr_text.txt', 'r', encoding='utf-8') as file:
    #     ocr_text = file.read()

    # Extract information
    w2_data = extract_w2_info(ocr_text)

    # Pretty print the extracted information
    print("W-2 Information Extraction:")
    for key, value in w2_data.items():
        print(f"{key.replace('_', ' ').title()}: {value}")

if __name__ == "__main__":
    main()

W-2 Information Extraction:
Employee Name: DAVID B ROCK
Employee Address: 1451 PEACH STREET Gross Pay 3,514.74 3,514.74 3,514.74 3,514.74, ATLANTA, GA 30303 Less soc.sec.Tips Reported in Box 7 N/A 1,509.00 N/A N/A
Employer Name: TRAILS BUSINESS CORP
Employer Address: 18 PEARL ST, BURLINGTON, VT 05401
Employer Ein: 47-1111212
Employee Ssn: XXX-XX-1122
Total Wages: 3514.74
Federal Tax Withheld: 3514.74
Social Security Wages: None
Social Security Tax Withheld: 2005.74
Medicare Wages: None
Medicare Tax Withheld: 3514.74
Social Security Tips: None
State Wages: None
State Tax Withheld: None


In [22]:
ocr_text

"2021 W-2 and EARNINGS SUMMARY (D>\n\n \n\n \n\nEmployee Reference Copy This blue section is your Earnings Summary which provides more detailed\nWage and Tax . . i .\nStat t information on the generation of your W-2 statement. The reverse side\natemen OMB No. 1545-0008 includes instructions and other general information.\nCopy C for employee’srecords.\nd= Control number Dept. Corp. Employer use only\n000004 KH/D7W) 3\n\n \n\n \n\n \n\n \n\nc Employer's name, address, and ZIP code\nTRAILS BUSINESS CORP\n18 PEARL ST\nBURLINGTON, VT 05401\n\n1. Your Gross Pay was adjusted _as follows to produce your W-2 Statement.\nBatch #99391\n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n     \n \n\n \n\n          \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n

In [24]:
# prompt: filter important details from the cocr_text above

import re

def extract_key_details(ocr_text):
    """
    Extracts key details from the OCR text of a W-2 form.
    """
    details = {}

    # Regular expressions for key fields
    patterns = {
        "Employee Name": r"2\. Employee Name and Address\.\n(.*?)\n",
        "Employee Address": r"Employee Name and Address\n.*?\n(.*?)\n.*?\n", # Adjust for address format
        "Employer Name": r"Employer's name, address, and ZIP code\n(.*?)\n",
        "Employer Address": r"Employer's name, address, and ZIP code\n.*?\n(.*?)\n.*?\n", # Adjust for address format
        "SSN": r"Social Security number\s*(\d{3}-\d{2}-\d{4})",
        "Wages": r"Wages, tips, other compensation\s*\$?([\d,.]+)",
        "Federal Income Tax Withheld": r"Federal income tax withheld\s*\$?([\d,.]+)",
    }

    for key, pattern in patterns.items():
        match = re.search(pattern, ocr_text, re.DOTALL)
        if match:
            details[key] = match.group(1).strip() if match.group(1) else "Not Found"
        else:
            details[key] = "Not Found"

    return details


# Assuming 'ocr_text' contains the output from your OCR process
key_info = extract_key_details(ocr_text)

# Print the extracted details
for key, value in key_info.items():
    print(f"{key}: {value}")

Employee Name: 11 Nonqualified plans 12a See instructions for box 12
Employee Address: Not Found
Employer Name: TRAILS BUSINESS CORP
Employer Address: 18 PEARL ST
SSN: Not Found
Wages: Not Found
Federal Income Tax Withheld: 3514.74


In [3]:
pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [4]:
pip install pdf2image

Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pdf2image
Successfully installed pdf2image-1.17.0


In [10]:
!pip install pytesseract
!sudo apt install tesseract-ocr
!sudo apt install libtesseract-dev

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 52 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 4,816 kB in 1s (4,212 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debc

In [8]:
!apt-get update
!apt-get install poppler-utils

0% [Working]            Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
0% [Connecting to archive.ubuntu.com (185.125.190.83)] [1 InRelease 12.7 kB/129 kB 10%] [Connected t                                                                                                    Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:7 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [2,469 kB]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:9 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,196 kB]
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/p