In [None]:
# import os
# import time
# def rename_images(input_dir):
#     if not os.path.exists(input_dir):
#         print(f"Directory '{input_dir}' does not exist.")
#         return
#     image_files = sorted([f for f in os.listdir(input_dir) if f.lower().endswith(('png', 'jpg', 'jpeg', 'bmp', 'gif'))])
#     for idx, filename in enumerate(image_files, start=1):
#         ext = os.path.splitext(filename)[1]  # Get file extension
#         new_name = f"receipt_{idx}{ext}"
#         old_path = os.path.join(input_dir, filename)
#         new_path = os.path.join(input_dir, new_name)
#         # Handle permission error and ensure unique filenames
#         attempt = 1
#         while os.path.exists(new_path):
#             new_name = f"receipt_{idx}_v{attempt}{ext}"
#             new_path = os.path.join(input_dir, new_name)
#             attempt += 1
#         try:
#             os.rename(old_path, new_path)
#             print(f"Renamed '{filename}' to '{new_name}'")
#         except PermissionError:
#             print(f"Skipping '{filename}' due to permission error.")
# # Directory containing receipt images
# input_dir = "big_receipt_dataset"
# rename_images(input_dir)

In [1]:
import os
import json
import re
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from PIL import Image

# Azure AI Document Intelligence connection settings.
# SECURITY: the API key must never be committed with the notebook. It is read
# from the environment instead; any key that was previously stored in this
# file should be considered leaked and rotated in the Azure portal.
endpoint = os.environ.get(
    "DOCUMENTINTELLIGENCE_ENDPOINT",
    "https://receiptprocess.cognitiveservices.azure.com/",
)
key = os.environ.get("DOCUMENTINTELLIGENCE_API_KEY")
if not key:
    # Fail fast with an actionable message rather than sending an empty key.
    raise RuntimeError(
        "Set the DOCUMENTINTELLIGENCE_API_KEY environment variable to your "
        "Azure AI Document Intelligence key before running this cell."
    )

# Directory containing receipt images
input_dir = "big_receipt_dataset"
# Directory to store JSON output
output_dir = "processed_json_images"

# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)

# Initialize the client
document_intelligence_client = DocumentIntelligenceClient(
    endpoint=endpoint, credential=AzureKeyCredential(key)
)

def extract_number(filename):
    """Return the first run of digits found in *filename* as an int.

    When the name contains no digit at all, ``float('inf')`` is returned so
    that such names sort after every numbered name.
    """
    found = re.search(r"(\d+)", filename)
    if found is None:
        return float('inf')
    return int(found.group(1))

def serialize_date(date_obj):
    """Format *date_obj* as a ``YYYY-MM-DD`` string.

    A falsy argument (e.g. ``None``) yields ``None`` instead of a string.
    """
    if not date_obj:
        return None
    return date_obj.strftime("%Y-%m-%d")

def get_field_value(field, field_type="string"):
    """Safely read the typed value out of an analyzed document field.

    Args:
        field: A field object exposing ``value_string``, ``value_date``,
            ``value_currency`` and ``value_number`` attributes, or a falsy
            value (e.g. ``None``) when the field is absent.
        field_type: Which typed attribute to read: ``"string"``, ``"date"``,
            ``"currency"`` or ``"number"``.

    Returns:
        The string, the date formatted by ``serialize_date``, the currency
        ``amount``, or the number. ``None`` is returned when the field is
        missing, the requested value is not set, or ``field_type`` is not
        one of the supported names.
    """
    if not field:
        return None

    if field_type == "string":
        # An empty string is treated the same as "no value".
        return field.value_string or None

    if field_type == "date":
        value = field.value_date
        return serialize_date(value) if value is not None else None

    if field_type == "currency":
        currency = field.value_currency
        return currency.amount if currency is not None else None

    if field_type == "number":
        # Compare against None explicitly: a truthiness test would turn a
        # legitimate 0 (e.g. a zero quantity) into None.
        number = field.value_number
        return number if number is not None else None

    return None

def resize_image(image_path, max_size=(800, 800), max_quality=85):
    """Write a down-scaled JPEG copy of an image and return its path.

    The copy is saved next to the original as ``<name>_resized.jpg``. The
    file sizes before and after are printed in megabytes.

    Args:
        image_path: Path of the image to shrink.
        max_size: ``(width, height)`` bounding box handed to
            ``Image.thumbnail``.
        max_quality: JPEG quality setting used when saving the copy.

    Returns:
        The path of the resized JPEG file.
    """
    with Image.open(image_path) as img:
        print(f"Original image size: {os.path.getsize(image_path) / (1024 * 1024):.2f} MB")
        # JPEG cannot store alpha or palette modes (common for PNG input);
        # saving them directly raises OSError, so normalize to RGB first.
        if img.mode in ("RGB", "L", "CMYK"):
            shrunk = img
        else:
            shrunk = img.convert("RGB")
        shrunk.thumbnail(max_size)
        resized_image_path = os.path.splitext(image_path)[0] + "_resized.jpg"
        shrunk.save(resized_image_path, "JPEG", quality=max_quality)
        print(f"Resized image size: {os.path.getsize(resized_image_path) / (1024 * 1024):.2f} MB")
    return resized_image_path

def _extract_line_items(items_field):
    """Turn a receipt's "Items" field into a list of plain dictionaries."""
    if not items_field or not items_field.value_array:
        return []
    line_items = []
    for item in items_field.value_array:
        # Guard against an item that carries no structured object.
        item_fields = item.value_object or {}
        line_items.append({
            "description": get_field_value(item_fields.get("Description")),
            "quantity": get_field_value(item_fields.get("Quantity"), "number"),
            "price": get_field_value(item_fields.get("Price"), "currency"),
            "total_price": get_field_value(item_fields.get("TotalPrice"), "currency")
        })
    return line_items


def _build_receipt_info(receipt_number, receipt):
    """Flatten one analyzed receipt document into a JSON-serializable dict."""
    fields = receipt.fields or {}
    return {
        "receipt_number": receipt_number,
        "receipt_type": receipt.doc_type,
        "merchant_name": get_field_value(fields.get("MerchantName")),
        "transaction_date": get_field_value(fields.get("TransactionDate"), "date"),
        "items": _extract_line_items(fields.get("Items")),
        "subtotal": get_field_value(fields.get("Subtotal"), "currency"),
        "tax": get_field_value(fields.get("TotalTax"), "currency"),
        "tip": get_field_value(fields.get("Tip"), "currency"),
        "total": get_field_value(fields.get("Total"), "currency"),
    }


def process_receipt_batch(batch):
    """Analyze each receipt file in *batch* and save the result as JSON.

    For every supported file name in ``batch`` (resolved against
    ``input_dir``) the file is sent to the ``prebuilt-receipt`` model and the
    extracted fields are written to ``<output_dir>/<stem>.json``. Files larger
    than 4 MB are shrunk with ``resize_image`` before upload.

    Args:
        batch: Iterable of file names located in ``input_dir``. Names with an
            unsupported extension are skipped silently.
    """
    for filename in batch:
        if not filename.lower().endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.pdf')):
            continue

        source_path = os.path.join(input_dir, filename)
        print(f"Processing: {filename}")
        file_size = os.path.getsize(source_path)
        print(f"Size: {file_size / (1024 * 1024):.2f} MB")

        upload_path = source_path
        if file_size > 4 * 1024 * 1024:
            print(f"Resizing {filename}...")
            upload_path = resize_image(source_path)

        try:
            with open(upload_path, "rb") as receipt_file:
                poller = document_intelligence_client.begin_analyze_document("prebuilt-receipt", receipt_file)
                receipts = poller.result()
        finally:
            # Remove the temporary resized copy: left in input_dir it would be
            # listed as an extra receipt and shift the index-based batches.
            if upload_path != source_path and os.path.exists(upload_path):
                os.remove(upload_path)

        # `documents` may be None when the service recognizes no receipt.
        receipt_data = [
            _build_receipt_info(idx + 1, receipt)
            for idx, receipt in enumerate(receipts.documents or [])
        ]

        receipt_json = json.dumps(receipt_data, indent=4)
        output_path = os.path.join(output_dir, f"{os.path.splitext(filename)[0]}.json")
        with open(output_path, "w", encoding="utf-8") as json_file:
            json_file.write(receipt_json)

        print(f"Saved: {output_path}")

# Collect every supported receipt file and order the names by the first
# number they contain (names without a number end up last).
all_filenames = [
    name
    for name in os.listdir(input_dir)
    if name.lower().endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.pdf'))
]
all_filenames.sort(key=extract_number)

### MANUALLY CHANGE THESE VALUES BEFORE EACH RUN ###
# The batch is the half-open slice [start_index, end_index) of the sorted list.
start_index = 29   # First position (0-based) of the batch to process
end_index = 39    # One past the last position of the batch

batch = all_filenames[slice(start_index, end_index)]
print(f"Processing {len(batch)} receipts from index {start_index} to {end_index - 1}...")
process_receipt_batch(batch)

print("Processing complete! 🎉")

Processing 9 receipts from index 20 to 28...
Processing: receipt_21.jpg
Size: 0.13 MB
Saved: processed_json_images\receipt_21.json
Processing: receipt_22.jpg
Size: 0.06 MB
Saved: processed_json_images\receipt_22.json
Processing: receipt_23.jpg
Size: 0.11 MB
Saved: processed_json_images\receipt_23.json
Processing: receipt_24.jpg
Size: 0.13 MB
Saved: processed_json_images\receipt_24.json
Processing: receipt_25.jpg
Size: 0.07 MB
Saved: processed_json_images\receipt_25.json
Processing: receipt_26.jpg
Size: 0.11 MB
Saved: processed_json_images\receipt_26.json
Processing: receipt_27.jpg
Size: 0.15 MB
Saved: processed_json_images\receipt_27.json
Processing: receipt_28.jpg
Size: 0.11 MB
Saved: processed_json_images\receipt_28.json
Processing: receipt_29.jpg
Size: 0.03 MB
Saved: processed_json_images\receipt_29.json
Processing complete! 🎉
