In [1]:
import pytesseract
pytesseract.pytesseract.tesseract_cmd = "/opt/homebrew/bin/tesseract"

In [3]:
import os

input_folder = "prescriptions"
if os.path.exists(input_folder):
    print("'prescriptions' folder exists!")
    print("Sample files:", os.listdir(input_folder)[:5])
else:
    print(" Folder not found! Check the name and location.")

print("Current working directory:", os.getcwd())
print("Folders found:", os.listdir())

'prescriptions' folder exists!
Sample files: ['63.jpg', '77.jpg', '88.jpg', '89.jpg', '76.jpg']
Current working directory: /Users/owl/Desktop/ICMR_PROSPECT_Assignment
Folders found: ['.DS_Store', 'output', '.~ICMR_PROSPECT_Presentation.pptx', 'ICMR_PROSPECT_Presentation.pptx', 'pipeline.ipynb', '.ipynb_checkpoints', 'prescriptions']


In [5]:
import cv2
import json
import pandas as pd
import matplotlib.pyplot as plt

output_crop = "output/crops"
output_json = "output/json"
os.makedirs(output_crop, exist_ok=True)
os.makedirs(output_json, exist_ok=True)

In [7]:
def preprocess_image(image_path):
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        print(f" Warning: couldn't read {image_path}")
        return None
    blur = cv2.medianBlur(img, 3)
    _, binary = cv2.threshold(blur, 150, 255, cv2.THRESH_BINARY_INV)
    return binary

def extract_text_blocks(image_path):
    img = cv2.imread(image_path)
    if img is None:
        return []
    text = pytesseract.image_to_string(img)
    return text.strip().split('\n')

In [9]:
def simulate_structured_data(text_lines):
    structured = {
        "Doctor": None, "Patient": None, "Age": None, "Date": None,
        "Medicine": [], "Dosage": [], "Signature": None
    }
    for line in text_lines:
        line = line.strip()
        if "Dr" in line: structured["Doctor"] = line
        elif "Patient" in line: structured["Patient"] = line
        elif "age" in line.lower(): structured["Age"] = line
        elif "date" in line.lower(): structured["Date"] = line
        elif "mg" in line.lower() or "tablet" in line.lower(): structured["Medicine"].append(line)
        elif "1-0-1" in line or "1 tablet" in line: structured["Dosage"].append(line)
        elif "Who" in line: structured["Signature"] = line
    return structured

In [11]:
def process_all_images_with_logs():
    all_files = [f for f in os.listdir(input_folder) if f.endswith((".jpg", ".png"))]
    results = []
    skipped_files = []

    for fname in all_files:
        full_path = os.path.join(input_folder, fname)
        print(f"🔍 Processing: {fname}")
        text_lines = extract_text_blocks(full_path)

        if not text_lines or all(line.strip() == '' for line in text_lines):
            skipped_files.append(fname)
            continue

        structured = simulate_structured_data(text_lines)
        structured["filename"] = fname
        results.append(structured)

    with open(os.path.join(output_json, "structured_output.json"), "w") as f:
        json.dump(results, f, indent=2)

    print(f"\n Done! Extracted structured data from {len(results)} images.")
    print(f" Skipped {len(skipped_files)} images:", skipped_files[:5], "...")
    return results, skipped_files

output, skipped = process_all_images_with_logs()

🔍 Processing: 63.jpg
🔍 Processing: 77.jpg
🔍 Processing: 88.jpg
🔍 Processing: 89.jpg
🔍 Processing: 76.jpg
🔍 Processing: 62.jpg
🔍 Processing: 74.jpg
🔍 Processing: 60.jpg
🔍 Processing: 48.jpg
🔍 Processing: 49.jpg
🔍 Processing: 61.jpg
🔍 Processing: 75.jpg
🔍 Processing: 59.jpg
🔍 Processing: 71.jpg
🔍 Processing: 65.jpg
🔍 Processing: 64.jpg
🔍 Processing: 70.jpg
🔍 Processing: 58.jpg
🔍 Processing: 8.jpg
🔍 Processing: 66.jpg
🔍 Processing: 72.jpg
🔍 Processing: 99.jpg
🔍 Processing: 98.jpg
🔍 Processing: 73.jpg
🔍 Processing: 67.jpg
🔍 Processing: 9.jpg
🔍 Processing: 14.jpg
🔍 Processing: 28.jpg
🔍 Processing: 129.jpg
🔍 Processing: 101.jpg
🔍 Processing: 115.jpg
🔍 Processing: 114.jpg
🔍 Processing: 100.jpg
🔍 Processing: 128.jpg
🔍 Processing: 29.jpg
🔍 Processing: 15.jpg
🔍 Processing: 17.jpg
🔍 Processing: 116.jpg
🔍 Processing: 102.jpg
🔍 Processing: 103.jpg
🔍 Processing: 117.jpg
🔍 Processing: 16.jpg
🔍 Processing: 12.jpg
🔍 Processing: 113.jpg
🔍 Processing: 107.jpg
🔍 Processing: 106.jpg
🔍 Processing: 112.jpg
🔍

In [13]:
df = pd.DataFrame(output)
df.to_csv("output/structured_data.csv", index=False)
print("📄 CSV exported to 'output/structured_data.csv'")

📄 CSV exported to 'output/structured_data.csv'
