In [2]:
import cv2
import re
from paddleocr import PaddleOCR
import json
import os

# Initialize PaddleOCR once at module level; perform_ocr() below reuses this
# shared `ocr` instance for every image instead of constructing a new engine.
# The engine is configured with use_angle_cls=True and lang='en'
# (presumably: enable the text-orientation classifier and load the English
# models — confirm against the installed PaddleOCR version).
# NOTE(review): the recorded output of this cell is
# "ModuleNotFoundError: No module named 'paddle'", so the environment this was
# last run in appears to be missing the Paddle runtime that paddleocr needs.
ocr = PaddleOCR(use_angle_cls=True, lang='en')

# Preprocessing Function
def preprocess_image(image_path):
    """Load an image from disk and binarize it for OCR.

    The image is read as single-channel grayscale, smoothed with a 5x5
    Gaussian blur, and then binarized with a global threshold chosen by
    Otsu's method.

    Args:
        image_path: Path of the image file to load.

    Returns:
        The thresholded image as returned by ``cv2.threshold``.

    Raises:
        FileNotFoundError: If ``image_path`` does not point to an existing file.
        ValueError: If the file exists but OpenCV could not decode it.
    """
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)  # Load directly as grayscale

    # cv2.imread signals failure by returning None instead of raising, which
    # would otherwise surface later as an opaque error inside GaussianBlur.
    if image is None:
        if not os.path.isfile(image_path):
            raise FileNotFoundError(f"Image file not found: {image_path!r}")
        raise ValueError(f"Could not decode image file: {image_path!r}")

    image = cv2.GaussianBlur(image, (5, 5), 0)  # Smooth noise before thresholding

    # Global binarization; THRESH_OTSU picks the threshold automatically, so the
    # explicit threshold argument (0) is ignored.
    _, image = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return image

# OCR Function
def perform_ocr(image_path):
    """Preprocess a receipt image, run OCR on it, and return the text lines.

    Side effect: the preprocessed image is written to
    ``preprocessed_receipt.jpg`` in the current working directory (overwritten
    on every call) so it can be inspected visually.

    Args:
        image_path: Path of the image file to process.

    Returns:
        A list of recognized text strings, one per detected text line, in the
        order returned by the OCR engine. Empty when no text was detected.
    """
    # Preprocess the image
    preprocessed_image = preprocess_image(image_path)

    # Save the preprocessed image (optional, for visualization)
    cv2.imwrite("preprocessed_receipt.jpg", preprocessed_image)

    # Run OCR using the shared module-level engine
    results = ocr.ocr(preprocessed_image)

    # The first entry holds the detections for our single image; it can be
    # None (or empty) when no text is found, which must not be iterated.
    if not results or not results[0]:
        return []

    # Each detection is indexed as [box, (text, confidence)]; keep only the text.
    return [detection[1][0] for detection in results[0]]

# Extract Product-Price Pairs
def extract_product_price_pairs(lines):
    """Pair each priced text line with the price found in it.

    A price is a dollar sign followed by digits, optionally with a two-digit
    decimal part (e.g. ``$12`` or ``$12.34``). Only the first price in a line
    is used; lines without a price are skipped.

    Args:
        lines: Iterable of text lines (strings).

    Returns:
        A list of ``{"product": str, "price": str}`` dicts in input order,
        where ``product`` is the line with the matched price removed and
        surrounding whitespace stripped.
    """
    # Compiled once per call; non-capturing group since only the full match is used.
    price_pattern = re.compile(r"\$\d+(?:\.\d{2})?")
    pairs = []

    for line in lines:
        match = price_pattern.search(line)
        if match is None:
            continue

        price = match.group(0)
        # Cut out only the matched span. str.replace() would also delete the
        # same characters wherever else they occur (e.g. "$5" inside "$50.00").
        product = (line[:match.start()] + line[match.end():]).strip()
        pairs.append({"product": product, "price": price})
    return pairs

# Main Function
def process_receipt(image_path):
    """Run the receipt pipeline end to end for a single image.

    The image is passed through perform_ocr() to obtain text lines, which are
    then handed to extract_product_price_pairs().

    Args:
        image_path: Path of the receipt image to process.

    Returns:
        A dict with two keys: ``"pairs"`` (the extracted product/price pairs)
        and ``"total_items"`` (how many pairs were extracted).
    """
    extracted_pairs = extract_product_price_pairs(perform_ocr(image_path))
    return {
        "pairs": extracted_pairs,
        "total_items": len(extracted_pairs),
    }



ModuleNotFoundError: No module named 'paddle'

In [None]:
# Run the Pipeline
if __name__ == "__main__":

    # Folder containing images
    image_folder = 'images/'  # Replace with the path to your images folder
    image_extensions = (".jpg", ".png", ".jpeg")

    # Collect the image files in the folder. Extensions are compared
    # case-insensitively, and the names are sorted so the output order is stable.
    image_files = sorted(
        name for name in os.listdir(image_folder)
        if name.lower().endswith(image_extensions)
    )

    # process_receipt() handles one image path at a time, and os.listdir()
    # returns bare file names, so each name is joined with the folder and
    # processed individually. Results are keyed by file name.
    results = {}
    for file_name in image_files:
        results[file_name] = process_receipt(os.path.join(image_folder, file_name))

    # Print the results
    print(json.dumps(results, indent=4))

    # Save the results to a JSON file
    with open("output.json", "w") as output_file:
        json.dump(results, output_file, indent=4)
