In [67]:
import os
import cv2
import pytesseract
import pandas as pd
from PIL import Image, ImageEnhance, ImageFilter
import numpy as np
import re

# Path to Tesseract executable (only needed for Windows)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

class ReceiptOCR:
    def __init__(self, input_folder=None, output_folder=None, output_file="receipts_data.csv", lang="hun"):
        self.input_folder = os.path.abspath(input_folder) if input_folder else os.path.abspath("jpgs")
        self.output_folder = os.path.abspath(output_folder) if output_folder else os.path.abspath("pngs")
        self.output_file = output_file
        self.lang = lang
        os.makedirs(self.input_folder, exist_ok=True)  # Ensure input folder exists
        os.makedirs(self.output_folder, exist_ok=True)  # Ensure output folder exists

    # Convert image to PNG format to retain quality
    def convert_to_png(self, image_path):
        if image_path.lower().endswith(".jpg"):
            new_image_path = os.path.join(self.output_folder, os.path.basename(image_path).replace(".jpg", ".png"))
            img = Image.open(image_path)
            img.save(new_image_path, format="PNG", dpi=(300, 300))  # Set high DPI
            return new_image_path
        return image_path  # If already PNG, return original

    # Deskew the image if needed
    def deskew_image(self, image_path):
        image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

        if image is None:
            print(f"Error loading image: {image_path}")
            return image_path  # Return original path if image can't be loaded

        coords = np.column_stack(np.where(image > 0))

        if coords.shape[0] == 0:
            print(f"Warning: No text detected in {image_path}, skipping deskewing.")
            return image_path

        angle = cv2.minAreaRect(coords)[-1]

        if angle < -45:
            angle = -(90 + angle)
        else:
            angle = -angle

        (h, w) = image.shape[:2]
        center = (w // 2, h // 2)
        M = cv2.getRotationMatrix2D(center, angle, 1.0)
        rotated = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)

        deskewed_path = image_path.replace(".png", "_deskewed.png")
        cv2.imwrite(deskewed_path, rotated)
        return deskewed_path

    # Remove shadows and enhance contrast
    def remove_shadows(self, image_path):
        image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
        dilated = cv2.dilate(image, kernel, iterations=3)
        blurred = cv2.medianBlur(dilated, 7)
        result = cv2.absdiff(image, blurred)
        _, result = cv2.threshold(result, 50, 255, cv2.THRESH_BINARY)

        shadow_free_path = image_path.replace(".png", "_shadowfree.png")
        cv2.imwrite(shadow_free_path, result)
        return shadow_free_path

    # Preprocess image for better OCR accuracy
    def preprocess_image(self, image_path):
        image_path = self.convert_to_png(image_path)  # Convert to PNG first
#         image_path = self.deskew_image(image_path)  # Deskew the image
        image_path = self.remove_shadows(image_path)  # Remove shadows
        
        image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
        image = cv2.adaptiveThreshold(image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 31, 2)

        preprocessed_path = image_path.replace(".png", "_processed.png")
        cv2.imwrite(preprocessed_path, image)
        return preprocessed_path

    # Extract text from image with better OCR settings
    def extract_text(self, image_path):
        image_path = self.preprocess_image(image_path)  # Preprocess first
        custom_config = r'--oem 3 --psm 4 -c tessedit_char_whitelist="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789,.:-"'  # Use "single uniform block" mode
        text = pytesseract.image_to_string(image_path, lang=self.lang, config=custom_config)
        return self.clean_text(text)

    # Clean OCR output to remove noise
    def clean_text(self, text):
        replacements = {
            "1": "I", "5": "S", "0": "O", "8": "B",  # Number-letter fixes
            "ÖSSZESEN FA": "ÖSSZESEN", "KÁRTYA": "KÁRTYA",
        }
        for old, new in replacements.items():
            text = text.replace(old, new)
        text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
        return text

    # Extract key receipt details (date, total amount)
    def extract_receipt_details(self, text):
        date_pattern = r'\b(\d{2,4}[./-]\d{2}[./-]\d{2,4})\b'  # Matches 2025-03-06, 06/03/2025
        total_pattern = r'ÖSSZESEN[:\s]*([0-9]+[.,][0-9]{2})'  # Matches "ÖSSZESEN: 1234.56"

        date_match = re.search(date_pattern, text)
        total_match = re.search(total_pattern, text)

        date = date_match.group(1) if date_match else "Unknown"
        total = total_match.group(1) if total_match else "Unknown"

        return {"Date": date, "Total": total, "Cleaned Text": text}

    # Process all receipts in the folder
    def process_receipts(self):
        receipts_data = []

        for file in os.listdir(self.input_folder):
            if file.lower().endswith(".jpg"):
                file_path = os.path.join(self.input_folder, file)
                print(f"Processing: {file}")

                text = self.extract_text(file_path)
                receipt_details = self.extract_receipt_details(text)
                receipt_details["Filename"] = file
                receipts_data.append(receipt_details)

        # Save extracted data to CSV
        df = pd.DataFrame(receipts_data)
        df.to_csv(self.output_file, index=False, encoding="utf-8")
        print(f"Saved results to {self.output_file}")


In [70]:
import os
import cv2
import pytesseract
import pandas as pd
from PIL import Image, ImageEnhance, ImageFilter
import numpy as np
import re

# Path to Tesseract executable (only needed for Windows)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

class ReceiptOCR:
    def __init__(self, input_folder=None, output_folder=None, output_file="receipts_data.csv", lang="hun"):
        self.input_folder = os.path.abspath(input_folder) if input_folder else os.path.abspath("jpgs")
        self.output_folder = os.path.abspath(output_folder) if output_folder else os.path.abspath("pngs")
        self.output_file = output_file
        self.lang = lang
        os.makedirs(self.input_folder, exist_ok=True)  # Ensure input folder exists
        os.makedirs(self.output_folder, exist_ok=True)  # Ensure output folder exists

    # Convert image to PNG format to retain quality
    def convert_to_png(self, image_path):
        if image_path.lower().endswith(".jpg"):
            new_image_path = os.path.join(self.output_folder, os.path.basename(image_path).replace(".jpg", ".png"))
            img = Image.open(image_path)
            img.save(new_image_path, format="PNG", dpi=(300, 300))  # Set high DPI
            return new_image_path
        return image_path  # If already PNG, return original

    # Deskew the image if needed
    def deskew_image(self, image_path):
        image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
        if image is None:
            print(f"Error: Could not load image {image_path}")
            return image_path

        coords = np.column_stack(np.where(image > 0))
        if coords.shape[0] == 0:
            print(f"Warning: No text detected in {image_path}, skipping deskewing.")
            return image_path

        angle = cv2.minAreaRect(coords)[-1]
        if angle < -45:
            angle = -(90 + angle)
        else:
            angle = -angle

        (h, w) = image.shape[:2]
        center = (w // 2, h // 2)
        M = cv2.getRotationMatrix2D(center, angle, 1.0)
        rotated = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)

        deskewed_path = image_path.replace(".png", "_deskewed.png")
        cv2.imwrite(deskewed_path, rotated)
        return deskewed_path

    # Remove shadows and enhance contrast
    def remove_shadows(self, image_path):
        image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
        if image is None:
            print(f"Error: Could not load image {image_path}")
            return image_path

        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
        try:
            dilated = cv2.dilate(image, kernel, iterations=3)
        except cv2.error as e:
            print(f"Error during dilation: {e}")
            return image_path

        blurred = cv2.medianBlur(dilated, 7)
        result = cv2.absdiff(image, blurred)
        _, result = cv2.threshold(result, 50, 255, cv2.THRESH_BINARY)

        shadow_free_path = image_path.replace(".png", "_shadowfree.png")
        cv2.imwrite(shadow_free_path, result)
        return shadow_free_path

    # Preprocess image for better OCR accuracy
    def preprocess_image(self, image_path):
        image_path = self.convert_to_png(image_path)  # Convert to PNG first
        image_path = self.deskew_image(image_path)  # Deskew the image
        image_path = self.remove_shadows(image_path)  # Remove shadows

        image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
        if image is None:
            print(f"Error: Could not load image {image_path}")
            return image_path

        image = cv2.adaptiveThreshold(image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 31, 2)
        preprocessed_path = image_path.replace(".png", "_processed.png")
        cv2.imwrite(preprocessed_path, image)
        return preprocessed_path

    # Extract text from image with better OCR settings
    def extract_text(self, image_path):
        image_path = self.preprocess_image(image_path)  # Preprocess first
        custom_config = r'--oem 3 --psm 4 -c tessedit_char_whitelist="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789,.:-"'  # Use "single uniform block" mode
        text = pytesseract.image_to_string(image_path, lang=self.lang, config=custom_config)
        return self.clean_text(text)

    # Clean OCR output to remove noise
    def clean_text(self, text):
        replacements = {
            "1": "I", "5": "S", "0": "O", "8": "B",  # Number-letter fixes
            "ÖSSZESEN FA": "ÖSSZESEN", "KÁRTYA": "KÁRTYA",
        }
        for old, new in replacements.items():
            text = text.replace(old, new)
        text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
        return text

    # Extract key receipt details (date, total amount)
    def extract_receipt_details(self, text):
        date_pattern = r'\b(\d{2,4}[./-]\d{2}[./-]\d{2,4})\b'  # Matches 2025-03-06, 06/03/2025
        total_pattern = r'ÖSSZESEN[:\s]*([0-9]+[.,][0-9]{2})'  # Matches "ÖSSZESEN: 1234.56"

        date_match = re.search(date_pattern, text)
        total_match = re.search(total_pattern, text)

        date = date_match.group(1) if date_match else "Unknown"
        total = total_match.group(1) if total_match else "Unknown"

        return {"Date": date, "Total": total, "Cleaned Text": text}

    # Process all receipts in the folder
    def process_receipts(self):
        receipts_data = []

        for file in os.listdir(self.input_folder):
            if file.lower().endswith(".jpg"):
                file_path = os.path.join(self.input_folder, file)
                print(f"Processing: {file}")

                text = self.extract_text(file_path)
                receipt_details = self.extract_receipt_details(text)
                receipt_details["Filename"] = file
                receipts_data.append(receipt_details)

        # Save extracted data to CSV
        df = pd.DataFrame(receipts_data)
        df.to_csv(self.output_file, index=False, encoding="utf-8")
        print(f"Saved results to {self.output_file}")


In [72]:
print(os.listdir(self.output_folder))  # List all files in output folder
image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

if image is None:
    print(f"OpenCV cannot read: {image_path}")
else:
    print(f"Image loaded successfully: {image_path}")


NameError: name 'self' is not defined

In [71]:
# Example usage
if __name__ == "__main__":
    ocr = ReceiptOCR(input_folder=r"C:\Users\Bálint\Desktop\Asztal\Projekt\receipts\jpgs", 
                     output_folder=r"C:\Users\Bálint\Desktop\Asztal\Projekt\receipts\pngs", 
                     output_file="receipts_data.csv", lang="hun")
    ocr.process_receipts()

Processing: IMG_20250307_113911.jpg
Error: Could not load image C:\Users\Bálint\Desktop\Asztal\Projekt\receipts\pngs\IMG_20250307_113911.png
Error: Could not load image C:\Users\Bálint\Desktop\Asztal\Projekt\receipts\pngs\IMG_20250307_113911.png
Error: Could not load image C:\Users\Bálint\Desktop\Asztal\Projekt\receipts\pngs\IMG_20250307_113911.png
Processing: IMG_20250307_113924.jpg
Error: Could not load image C:\Users\Bálint\Desktop\Asztal\Projekt\receipts\pngs\IMG_20250307_113924.png
Error: Could not load image C:\Users\Bálint\Desktop\Asztal\Projekt\receipts\pngs\IMG_20250307_113924.png
Error: Could not load image C:\Users\Bálint\Desktop\Asztal\Projekt\receipts\pngs\IMG_20250307_113924.png


KeyboardInterrupt: 

In [58]:
import pandas as pd

# Define the file path
file_path = "receipts_data.csv"
df = pd.read_csv(file_path, encoding="utf-8")
pd.set_option('display.max_colwidth', None)
print(df["Cleaned Text"])

0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             SPAR Magyarország Kereskedelmi Kft. . 2060 Bicske SPAR út 0326/1 HRSZ. . É 195 Szupermnarket 1119 Budapest Fehérvári út 45. ADÓSZÁM: 10485824-2-07 C00 SZAFTI OUINOA KASA 300L j 099 ) — BO0 PÖTTY.TR PROT.NAT.ÉT4151 1 499 C00 SPAR LEBOMLÓ ZACSKÓ 19 I — C00 BANÁN KG 1,2 KG x 699 Ft/KG 839 BO0 ZOTT PROT.PUD. KAKAGSZOLG 449 BO0 PÖTTY.TR PROT.MÁL.ÉT 