###### Importing necessary libraries

In [1]:
import os
import re

import cv2
import pandas as pd
import pytesseract

###### Setting the path to Tesseract OCR

In [2]:
# Location of the Tesseract executable that pytesseract invokes.
# The TESSERACT_CMD environment variable, when set, overrides the path so the
# notebook can run on machines where Tesseract is installed elsewhere; when it
# is unset, the default Windows install location is used.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    'TESSERACT_CMD',
    'C:\\Program Files\\Tesseract-OCR\\tesseract.exe',
)

###### Reading all three invoice images with cv2 and storing each in a separate variable

In [3]:
def _read_invoice_image(path):
    """Load the image at ``path`` with OpenCV, failing loudly if it cannot be read.

    ``cv2.imread`` does not raise when a file is missing or unreadable; it
    returns ``None``. Checking here reports the offending path immediately
    instead of letting a later processing step fail on a ``None`` image.

    Parameters:
        path: Path of the image file to load.

    Returns:
        The image array returned by ``cv2.imread``.

    Raises:
        FileNotFoundError: If ``cv2.imread`` could not load the file.
    """
    image = cv2.imread(path)
    if image is None:
        raise FileNotFoundError(f"Could not read image file: {path!r}")
    return image


img1 = _read_invoice_image('invoice 1.jpg')
img2 = _read_invoice_image('invoice 2.jpg')
img3 = _read_invoice_image('invoice 3.jpg')

###### Converting the images from BGR to the RGB color space and storing each in a different variable

In [4]:
# Apply the same BGR -> RGB conversion to each loaded invoice image, keeping
# the results in the same order as the inputs.
img1_rgb, img2_rgb, img3_rgb = (
    cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)
    for bgr_image in (img1, img2, img3)
)

###### Extracting text content from the converted images using pytesseract and storing each in a different variable

In [5]:
# Run Tesseract OCR over each RGB image; the extracted strings come back in the
# same order as the images were supplied.
imgtext1, imgtext2, imgtext3 = map(
    pytesseract.image_to_string,
    (img1_rgb, img2_rgb, img3_rgb),
)

###### Storing all extracted text contents from the invoices in a list called 'invoices_text'

In [6]:
# OCR text of the invoices, in invoice order (1, 2, 3); downstream rows follow
# this order.
invoices_text = [
    imgtext1,
    imgtext2,
    imgtext3,
]

###### Creating a dictionary of common columns from all invoices

In [7]:
# Column headers common to all invoices, in the order they appear in the
# final table.
_INVOICE_COLUMNS = (
    'Name of Business',
    'Address',
    'Website',
    'Email',
    'Contact',
    'Bill To',
    'Ship To',
    'Invoice Date',
    'Subtotal',
    'Discount',
    'Tax Rate',
    'Total Tax',
    'Shipping/Handling',
    'Balance Due',
)

# One separate, initially empty list per column; each list collects that
# column's value for every invoice.
invoices_data = {column: [] for column in _INVOICE_COLUMNS}

###### Iterating through the invoices_text list, using regular expressions to extract each field from every invoice's text, and appending the extracted values to the corresponding columns of the invoices_data dictionary

In [8]:
# Regular expression used to extract each column's value from an invoice's
# OCR text. The numeric fields use a look-behind so that only the number that
# directly follows the label (and one whitespace character) is captured.
# NOTE(review): 'Bill To' and 'Ship To' share the same pattern, so both always
# receive the first "<NAME> Enterprise" match in the text -- confirm this is
# intended for invoices whose billing and shipping parties differ.
_FIELD_PATTERNS = {
    'Name of Business': r'Business Name',
    'Address': r'\b\d+\s[\w\s]+,\s[\w\s]+,\s[\w\s]+,\sZip\b',
    'Website': r'\bWebsite\b',
    'Email': r'\bEmail Address\b',
    'Contact': r'\bContact Number\b',
    'Bill To': r'[A-Z]+ Enterprise',
    'Ship To': r'[A-Z]+ Enterprise',
    'Invoice Date': r'\b\d{2}/\d{2}/\d{4}\b',
    'Subtotal': r'(?<=SUBTOTAL\s)\d+\.\d+',
    'Discount': r'(?<=DISCOUNT\s)\d+\.\d+',
    'Tax Rate': r'(?<=TAX RATE\s)\d+\.\d+',
    'Total Tax': r'(?<=TOTAL TAX\s)\d+\.\d+',
    'Shipping/Handling': r'(?<=SHIPPING/ HANDLING\s)\d+\.\d+',
    'Balance Due': r'€\s*\d+(?:,\d{3})*(?:\.\d{2})?',
}


def _extract_invoice_fields(text):
    """Return a ``{column: value}`` dict of the fields found in one invoice text.

    Each pattern in ``_FIELD_PATTERNS`` is searched for in ``text`` and the
    first match is used. A field whose pattern does not match is recorded as
    ``None`` rather than raising, so a single field missing from the OCR
    output does not abort the whole extraction.

    Parameters:
        text: OCR text of a single invoice.

    Returns:
        A dict with one entry per key of ``_FIELD_PATTERNS``; each value is
        the matched substring, or ``None`` when nothing matched.
    """
    fields = {}
    for column, pattern in _FIELD_PATTERNS.items():
        match = re.search(pattern, text)
        fields[column] = match.group() if match else None
    return fields


# Empty every column first so that re-running this cell rebuilds the table
# instead of appending duplicate rows to the existing lists.
for column_values in invoices_data.values():
    column_values.clear()

# One row per invoice: append each extracted field to its column.
for text in invoices_text:
    for column, value in _extract_invoice_fields(text).items():
        invoices_data[column].append(value)

###### Creating a DataFrame from the above dictionary and displaying it in a structured manner

In [9]:
# Build the table: each dictionary key becomes a column and each invoice a row.
df = pd.DataFrame(data=invoices_data)

# The bare expression on the last line makes the notebook render the table.
df

Unnamed: 0,Name of Business,Address,Website,Email,Contact,Bill To,Ship To,Invoice Date,Subtotal,Discount,Tax Rate,Total Tax,Shipping/Handling,Balance Due
0,Business Name,"123 Street Address, City, State, Zip",Website,Email Address,Contact Number,ABC Enterprise,ABC Enterprise,01/02/2022,0.0,0.0,0.0,0.0,0.0,€ 1000.00
1,Business Name,"123 Street Address, City, State, Zip",Website,Email Address,Contact Number,XYZ Enterprise,XYZ Enterprise,02/05/2022,0.0,0.0,0.0,0.0,0.0,€ 1800.00
2,Business Name,"123 Street Address, City, State, Zip",Website,Email Address,Contact Number,MY Enterprise,MY Enterprise,02/04/2022,0.0,0.0,0.0,0.0,0.0,€ 2000.00
