In [None]:
import os
import time
def rename_images(input_dir):
    """Rename the image files in ``input_dir`` to ``receipt_<n><ext>``.

    Files are taken in sorted name order and numbered from 1. The original
    extension (including its letter case) is kept. A file that already carries
    its target name is left untouched, so running the function again on an
    already-renamed directory does not produce spurious ``_v1`` copies.

    Args:
        input_dir: Path of the directory whose images are renamed in place.

    Returns:
        None. Progress and skipped files are reported with ``print``.
    """
    if not os.path.isdir(input_dir):
        print(f"Directory '{input_dir}' does not exist.")
        return

    # Leading dots matter: without them a name such as "notes_png" would be
    # treated as an image.
    image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')
    image_files = sorted(
        name for name in os.listdir(input_dir)
        if name.lower().endswith(image_extensions)
        and os.path.isfile(os.path.join(input_dir, name))
    )

    for idx, filename in enumerate(image_files, start=1):
        ext = os.path.splitext(filename)[1]  # Get file extension
        new_name = f"receipt_{idx}{ext}"
        if new_name == filename:
            # Already in place; renaming would only collide with itself.
            print(f"'{filename}' already has its target name; skipping.")
            continue

        old_path = os.path.join(input_dir, filename)
        new_path = os.path.join(input_dir, new_name)

        # Never overwrite another file: fall back to receipt_<n>_v<k><ext>.
        attempt = 1
        while os.path.exists(new_path):
            new_name = f"receipt_{idx}_v{attempt}{ext}"
            new_path = os.path.join(input_dir, new_name)
            attempt += 1

        try:
            os.rename(old_path, new_path)
            print(f"Renamed '{filename}' to '{new_name}'")
        except PermissionError:
            print(f"Skipping '{filename}' due to permission error.")
# Directory containing receipt images; the call below renames the image files
# inside it in place.
input_dir = "big_receipt_dataset"
rename_images(input_dir)

In [None]:
import os
import json
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from PIL import Image
import io

# Azure AI Document Intelligence credentials are read from the environment so
# that no secret is stored in the notebook.
# NOTE(review): an API key was previously hard-coded in this cell; treat that
# key as leaked and rotate it.
endpoint = os.environ.get(
    "DOCUMENTINTELLIGENCE_ENDPOINT",
    "https://receiptprocess.cognitiveservices.azure.com/",
)
key = os.environ.get("DOCUMENTINTELLIGENCE_API_KEY")
if not key:
    raise RuntimeError(
        "Set the DOCUMENTINTELLIGENCE_API_KEY environment variable to your "
        "Azure AI Document Intelligence key before running this cell."
    )

# Directory containing receipt images
input_dir = "big_receipt_dataset"
# Directory to store JSON output
output_dir = "processed_json_images"

# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)

# Initialize the client
document_intelligence_client = DocumentIntelligenceClient(
    endpoint=endpoint, credential=AzureKeyCredential(key)
)

# Turn a date-like object into an ISO-style string
def serialize_date(date_obj):
    """Return ``date_obj`` formatted as ``YYYY-MM-DD``, or ``None`` when it is falsy."""
    return date_obj.strftime("%Y-%m-%d") if date_obj else None

# Function to safely get a value from a field
def get_field_value(field, field_type="string"):
    """Extract a plain value from an analyzed field.

    Args:
        field: Object exposing ``value_string``, ``value_date``,
            ``value_currency`` and ``value_number`` attributes, or a falsy
            value when the field was not found.
        field_type: Which attribute to read: ``"string"``, ``"date"``,
            ``"currency"`` or ``"number"``.

    Returns:
        The string (empty strings become ``None``), the date formatted by
        ``serialize_date``, the currency ``amount``, or the number. ``None``
        is returned when the field or the requested value is missing, or when
        ``field_type`` is not one of the names above.
    """
    if not field:
        return None
    if field_type == "string":
        return field.value_string or None
    if field_type == "date":
        return serialize_date(field.value_date) if field.value_date else None
    if field_type == "currency":
        currency = field.value_currency
        return currency.amount if currency is not None else None
    if field_type == "number":
        # Return the number as-is: a truthiness test would turn a genuine 0
        # into None.
        return field.value_number
    return None

# Function to resize the image if it's too large
def resize_image(image_path, max_size=(800, 800), max_quality=85):
    """Write a downscaled JPEG copy of ``image_path`` and return its path.

    The copy is saved next to the original as ``<name>_resized.jpg``; the
    original file is not modified.

    Args:
        image_path: Path of the image to shrink.
        max_size: ``(width, height)`` bounding box passed to
            ``Image.thumbnail``.
        max_quality: JPEG ``quality`` setting used when saving.

    Returns:
        Path of the resized JPEG file.
    """
    resized_image_path = os.path.splitext(image_path)[0] + "_resized.jpg"

    with Image.open(image_path) as img:
        print(f"Original image size: {os.path.getsize(image_path) / (1024 * 1024):.2f} MB")

        # Resize image to fit within max_size
        img.thumbnail(max_size)

        # JPEG cannot store alpha or palette data, so modes such as RGBA, LA
        # or P (common for PNG input) must be converted before saving.
        jpeg_ready = img if img.mode in ("RGB", "L", "CMYK") else img.convert("RGB")

        # Save resized image as JPEG with reduced quality
        jpeg_ready.save(resized_image_path, "JPEG", quality=max_quality)

        print(f"Resized image saved as: {resized_image_path}")
        print(f"Resized image size: {os.path.getsize(resized_image_path) / (1024 * 1024):.2f} MB")

    return resized_image_path

# Files larger than this are downscaled before being uploaded.
# NOTE(review): the original comment said "50 MB" while the code used 4 MB;
# the coded value is kept — confirm which limit is intended.
MAX_UPLOAD_BYTES = 4 * 1024 * 1024
SUPPORTED_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.pdf')
# Suffix of the copies written by resize_image next to the originals.
RESIZED_SUFFIX = "_resized.jpg"


def _receipt_to_dict(receipt_number, receipt):
    """Flatten one analyzed receipt document into a JSON-serialisable dict."""
    fields = receipt.fields or {}
    receipt_info = {
        "receipt_number": receipt_number,
        "receipt_type": receipt.doc_type,
        "merchant_name": get_field_value(fields.get("MerchantName")),
        "transaction_date": get_field_value(fields.get("TransactionDate"), "date"),
        "items": []
    }

    items_field = fields.get("Items")
    if items_field:
        for item in items_field.value_array or []:
            item_fields = item.value_object or {}
            receipt_info["items"].append({
                "description": get_field_value(item_fields.get("Description")),
                "quantity": get_field_value(item_fields.get("Quantity"), "number"),
                "price": get_field_value(item_fields.get("Price"), "currency"),
                "total_price": get_field_value(item_fields.get("TotalPrice"), "currency")
            })

    receipt_info["subtotal"] = get_field_value(fields.get("Subtotal"), "currency")
    receipt_info["tax"] = get_field_value(fields.get("TotalTax"), "currency")
    receipt_info["tip"] = get_field_value(fields.get("Tip"), "currency")
    receipt_info["total"] = get_field_value(fields.get("Total"), "currency")
    return receipt_info


# Loop through all files in the input directory
for filename in os.listdir(input_dir):
    lowered_name = filename.lower()

    # Check if it's a supported file (you can add more extensions if needed)
    if not lowered_name.endswith(SUPPORTED_EXTENSIONS):
        continue

    # Copies left behind by an earlier run's resize step are not receipts of
    # their own; processing them would duplicate results.
    if lowered_name.endswith(RESIZED_SUFFIX):
        print(f"Skipping previously resized copy: {filename}")
        continue

    file_path = os.path.join(input_dir, filename)
    print(f"Processing: {filename}")

    print("Checking the image size first before saving.")
    file_size = os.path.getsize(file_path)  # Size in bytes of the file
    print(f"Processing: {filename} | Size: {file_size / (1024 * 1024):.2f} MB")  # Convert to MB

    if file_size > MAX_UPLOAD_BYTES:
        if lowered_name.endswith('.pdf'):
            # resize_image opens the file as an image, which does not work for
            # PDFs, so the file is uploaded unchanged.
            print(f"File {filename} is too large ({file_size / (1024 * 1024):.2f} MB) but PDFs cannot be resized; uploading as-is.")
        else:
            print(f"File {filename} is too large ({file_size / (1024 * 1024):.2f} MB). Resizing...")
            resized_image_path = resize_image(file_path)  # Resize the image
            file_path = resized_image_path  # Use the resized image for processing
            print(f"Resized image saved as: {resized_image_path}")

    # Open the file and analyze
    with open(file_path, "rb") as receipt_file:
        poller = document_intelligence_client.begin_analyze_document(
            "prebuilt-receipt", receipt_file
        )
        receipts = poller.result()

    # Process and store extracted receipt information; a result without any
    # documents yields an empty list instead of raising.
    receipt_data = [
        _receipt_to_dict(idx + 1, receipt)
        for idx, receipt in enumerate(receipts.documents or [])
    ]

    # Convert to JSON format
    receipt_json = json.dumps(receipt_data, indent=4)

    # Write JSON output to the output directory, named after the original file
    output_path = os.path.join(output_dir, f"{os.path.splitext(filename)[0]}.json")
    with open(output_path, "w", encoding="utf-8") as json_file:
        json_file.write(receipt_json)

    print(f"Saved: {output_path}")

print("Processing complete! 🎉")


Processing: image_408.png
Checking the image size first before saving.
Processing: image_408.png | Size: 0.96 MB
Saved: processed_json_images/image_408.json
Processing: image_1933.png
Checking the image size first before saving.
Processing: image_1933.png | Size: 0.53 MB
Saved: processed_json_images/image_1933.json
Processing: image_420.png
Checking the image size first before saving.
Processing: image_420.png | Size: 0.52 MB
Saved: processed_json_images/image_420.json
Processing: image_346.png
Checking the image size first before saving.
Processing: image_346.png | Size: 0.81 MB
Saved: processed_json_images/image_346.json
Processing: image_1099.png
Checking the image size first before saving.
Processing: image_1099.png | Size: 0.18 MB
Saved: processed_json_images/image_1099.json
Processing: image_352.png
Checking the image size first before saving.
Processing: image_352.png | Size: 0.95 MB
Saved: processed_json_images/image_352.json
Processing: image_434.png
Checking the image size fi