In [1]:
# PDF to JPG

In [2]:
from pdf2image import convert_from_path
import os
import glob

In [3]:
# Folder that receives one JPG per PDF page (created if missing).
output_dir = 'Converted_Images'
os.makedirs(output_dir, exist_ok=True)

# Folder scanned (non-recursively) for '*.pdf' files.
pdf_folder = 'PDFS'

# Rendering resolution handed to pdf2image.
dpi = 500

# Location of the Poppler binaries. Set the POPPLER_PATH environment variable
# to override it per machine; the fallback is the original install location.
poppler_path = os.environ.get(
    'POPPLER_PATH', r'C:\Program Files\poppler-23.05.0\Library\bin'
)

# glob makes no ordering promise, so sort for a reproducible conversion order.
pdf_files = sorted(glob.glob(os.path.join(pdf_folder, '*.pdf')))

if not pdf_files:
    # Make an empty/misspelled input folder visible instead of silently doing nothing.
    print("No PDF files found in:", pdf_folder)

for pdf_file in pdf_files:
    # convert_from_path returns one PIL image per page of the PDF.
    images = convert_from_path(pdf_file, dpi, poppler_path=poppler_path)

    # File name of the PDF without directory or extension.
    pdf_stem = os.path.splitext(os.path.basename(pdf_file))[0]

    for i, image in enumerate(images):
        # Page index is zero-based and not zero-padded: <stem>_page<i>.jpg
        image_path = os.path.join(output_dir, f'{pdf_stem}_page{i}.jpg')
        image.save(image_path)
        print("Converted:", image_path)


Converted: Converted_Images\inv - 1_page0.jpg
Converted: Converted_Images\inv - 2_page0.jpg
Converted: Converted_Images\inv - 3_page0.jpg
Converted: Converted_Images\inv - 3_page1.jpg
Converted: Converted_Images\inv - 3_page2.jpg
Converted: Converted_Images\inv - 3_page3.jpg
Converted: Converted_Images\inv - 3_page4.jpg
Converted: Converted_Images\inv - 3_page5.jpg
Converted: Converted_Images\inv - 3_page6.jpg
Converted: Converted_Images\inv - 3_page7.jpg
Converted: Converted_Images\inv - 3_page8.jpg
Converted: Converted_Images\inv - 3_page9.jpg
Converted: Converted_Images\inv - 3_page10.jpg
Converted: Converted_Images\inv - 3_page11.jpg
Converted: Converted_Images\inv - 3_page12.jpg
Converted: Converted_Images\inv - 3_page13.jpg
Converted: Converted_Images\inv - 3_page14.jpg
Converted: Converted_Images\inv - 3_page15.jpg
Converted: Converted_Images\inv - 3_page16.jpg
Converted: Converted_Images\inv - 3_page17.jpg
Converted: Converted_Images\inv - 3_page18.jpg
Converted: Converted_Imag

In [4]:
# Preprocessing

In [5]:
import cv2
import os
import numpy as np

In [6]:
# Destination folder for the preprocessed images. This rebinds `output_dir`;
# the folder is created on first use and left alone if it already exists.
output_dir = 'Processed_Images'
os.makedirs(name=output_dir, exist_ok=True)

In [7]:
# Color Correction
def color_correction(image):
    """Equalise local contrast of a BGR image with CLAHE on the LAB L channel.

    The image is converted BGR -> LAB, CLAHE (clipLimit=3.0, 8x8 tile grid) is
    applied to the L channel only, the a/b channels are passed through
    untouched, and the merged result is converted back LAB -> BGR.

    Returns the contrast-corrected BGR image.
    """
    lightness, chroma_a, chroma_b = cv2.split(cv2.cvtColor(image, cv2.COLOR_BGR2LAB))
    equalizer = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
    enhanced_lab = cv2.merge((equalizer.apply(lightness), chroma_a, chroma_b))
    return cv2.cvtColor(enhanced_lab, cv2.COLOR_LAB2BGR)

In [8]:
# Brightness Adjustment
def adjust_brightness(image, value):
    """Shift the brightness of a BGR image by adding ``value`` to its HSV V channel.

    Parameters:
        image: BGR image; it is converted to HSV and back with OpenCV.
        value: amount added to every V sample. Positive brightens, negative
            darkens. The result is limited to the range [0, 255].

    Returns:
        The adjusted image converted back to BGR, with the same channel dtype
        as the HSV conversion produced.
    """
    hsv_image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    h_channel, s_channel, v_channel = cv2.split(hsv_image)

    channel_dtype = v_channel.dtype
    if np.issubdtype(channel_dtype, np.integer):
        # Widen before adding: uint8 arithmetic wraps modulo 256
        # (e.g. 100 + 230 -> 74), so clipping the narrow sum afterwards would
        # never see an out-of-range value.
        shifted = v_channel.astype(np.int32) + value
    else:
        shifted = v_channel + value

    # Clip in the wide type, then cast back so all three channels share one
    # dtype again (cv2.merge needs channels of matching depth).
    v_channel = np.clip(shifted, 0, 255).astype(channel_dtype)

    adjusted_hsv_image = cv2.merge((h_channel, s_channel, v_channel))
    adjusted_image = cv2.cvtColor(adjusted_hsv_image, cv2.COLOR_HSV2BGR)
    return adjusted_image

In [9]:
# Calculate Image Entropy
def calculate_entropy(image):
    """Return the entropy, in bits, of the image's grayscale intensity histogram.

    The BGR image is converted to grayscale, a 256-bin histogram over
    [0, 256) is normalised to sum to one, and ``-sum(p * log2(p + eps))`` is
    returned. ``eps`` is the float machine epsilon, added so that empty bins
    do not evaluate ``log2(0)``.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    counts = cv2.calcHist([grayscale], [0], None, [256], [0, 256])
    probabilities = counts / counts.sum()
    tiny = np.finfo(float).eps
    return -(probabilities * np.log2(probabilities + tiny)).sum()

In [10]:
# Calculate Image Edge Density
def calculate_edge_density(image):
    """Return the fraction of pixels that Canny marks as edges.

    The BGR image is converted to grayscale and passed to ``cv2.Canny`` with
    thresholds 50 and 150; the result is the number of non-zero pixels in the
    edge map divided by its total pixel count (rows * columns).
    """
    edge_map = cv2.Canny(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY), 50, 150)
    rows, cols = edge_map.shape[0], edge_map.shape[1]
    return np.count_nonzero(edge_map) / (rows * cols)

In [11]:
# Cut-off compared against the mean pixel intensity in the processing loop.
threshold = 240

# Cut-off compared against the mean HSV saturation; tune as needed.
saturation_threshold = 8

# Folder holding the page images to preprocess.
input_folder = 'Converted_Images'

# Names of every entry in the input folder (os.listdir does not filter by type).
image_files = os.listdir(path=input_folder)

In [20]:
# Process each image file: measure it, decide whether it needs enhancement,
# and write either the enhanced or the untouched image to `output_dir`.

# Constants of the decision rule below (previously inline literals).
entropy_threshold = 4          # compared against calculate_entropy(image)
edge_density_threshold = 0.03  # compared against calculate_edge_density(image)
# NOTE(review): 230 is close to the full 0-255 range of the V channel it is
# added to - confirm this offset is really what is intended.
brightness_offset = 230

for file_name in image_files:
    # Construct the full path for the input and output images
    input_image_path = os.path.join(input_folder, file_name)
    output_image_path = os.path.join(output_dir, file_name)

    # cv2.imread does not raise on failure; it returns None for entries it
    # cannot decode (sub-folders, non-image files, unreadable paths).
    image = cv2.imread(input_image_path)
    if image is None:
        print("Skipped (not a readable image):", input_image_path)
        continue

    # Calculate image metrics
    average_intensity = np.mean(image)
    entropy = calculate_entropy(image)
    edge_density = calculate_edge_density(image)
    saturation = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)[:, :, 1].mean()

    # Preprocess only when the image is (dark OR low-entropy) AND
    # (edge-dense OR weakly saturated).
    looks_dark_or_flat = average_intensity < threshold or entropy < entropy_threshold
    looks_busy_or_grey = (
        edge_density > edge_density_threshold or saturation < saturation_threshold
    )

    if looks_dark_or_flat and looks_busy_or_grey:
        # Contrast correction first, then the brightness shift.
        color_corrected_image = color_correction(image)
        result_image = adjust_brightness(color_corrected_image, brightness_offset)
        saved_message = "Processed Image saved at:"
    else:
        # Keep the original pixels untouched.
        result_image = image
        saved_message = "Original Image saved at:"

    # cv2.imwrite reports failure through its return value rather than raising,
    # so only claim success when it actually wrote the file.
    if cv2.imwrite(output_image_path, result_image):
        print(saved_message, output_image_path)
    else:
        print("Failed to write image:", output_image_path)

Processed Image saved at: Processed_Images\inv - 1_page0.jpg
Original Image saved at: Processed_Images\inv - 2_page0.jpg
Original Image saved at: Processed_Images\inv - 3_page0.jpg
Original Image saved at: Processed_Images\inv - 3_page1.jpg
Original Image saved at: Processed_Images\inv - 3_page10.jpg
Processed Image saved at: Processed_Images\inv - 3_page11.jpg
Original Image saved at: Processed_Images\inv - 3_page12.jpg
Processed Image saved at: Processed_Images\inv - 3_page13.jpg
Processed Image saved at: Processed_Images\inv - 3_page14.jpg
Original Image saved at: Processed_Images\inv - 3_page15.jpg
Processed Image saved at: Processed_Images\inv - 3_page16.jpg
Original Image saved at: Processed_Images\inv - 3_page17.jpg
Processed Image saved at: Processed_Images\inv - 3_page18.jpg
Original Image saved at: Processed_Images\inv - 3_page19.jpg
Original Image saved at: Processed_Images\inv - 3_page2.jpg
Processed Image saved at: Processed_Images\inv - 3_page20.jpg
Original Image saved a

In [14]:
# OCR (invoice field extraction with the Mindee API)

In [2]:
from mindee import Client, documents
import os

In [3]:
# The Mindee API key is read from the environment so that no secret is stored
# in the notebook. Any key that was ever saved in this file should be treated
# as leaked and rotated.
mindee_api_key = os.environ.get("MINDEE_API_KEY")
if not mindee_api_key:
    raise RuntimeError(
        "Set the MINDEE_API_KEY environment variable before running the OCR cells."
    )

mindee_client = Client(api_key=mindee_api_key)

# Folder containing the preprocessed page images to send to the OCR service.
input_folder = "Processed_Images"


In [4]:
# Collect the names of every entry in the OCR input folder.
image_files = os.listdir(path=input_folder)


In [5]:
# Send each image to Mindee and print the extracted invoice fields.
# Every iteration issues one API request, which counts against the plan's
# request limit.
for file_name in image_files:
    # Construct the full path for the input image
    input_image_path = os.path.join(input_folder, file_name)

    print("Processing image:", file_name)

    # Create a document from the image
    input_doc = mindee_client.doc_from_path(input_image_path)

    # Parse the document using the TypeInvoiceV4 template
    api_response = input_doc.parse(documents.TypeInvoiceV4)

    # Check if relevant data is found
    if api_response.document is None:
        print("No relevant data found")
    else:
        # Retrieve the extracted information
        invoice_number = api_response.document.invoice_number
        total_amount = api_response.document.total_amount
        invoice_date = api_response.document.invoice_date
        reference_numbers = api_response.document.reference_numbers

        print("Invoice Number:", invoice_number)
        print("Total Amount:", total_amount)
        print("Invoice Date:", invoice_date)

        # Tolerate a missing list and non-string values so one odd field
        # cannot abort the whole run.
        for reference_number in reference_numbers or []:
            reference_value = reference_number.value
            # Values starting with "4" are reported as purchase order numbers.
            # NOTE(review): presumably a local numbering convention - confirm.
            if isinstance(reference_value, str) and reference_value.startswith("4"):
                print("Purchase Order Number:", reference_value)
            else:
                print("Reference Numbers:", reference_value)

    # Two separator rules between images
    print("---------------------------------------------------")
    print("---------------------------------------------------")

Processing image: inv - 1_page0.jpg


HTTPException: API 403 HTTP error: {"api_request": {"error": {"code": "PlanLimitReached", "details": "You have reached your maximum number of requests", "message": "Plan limit has been reached"}, "resources": [], "status": "failure", "status_code": 403, "url": "https://api.mindee.net/v1/products/mindee/invoices/v4/predict"}}