In [29]:
import cv2
import pytesseract
import pandas as pd
import re
import os
from PIL import Image

In [30]:
# ---------------- CONFIG ----------------
# Image file that is preprocessed and passed to OCR.
IMAGE_PATH = "../data/output/test.jpg"
# Workbook that parsed records are appended to (created when it does not exist yet).
EXCEL_PATH = "../data/output/voter_data.xlsx"
# Language argument handed to pytesseract; "+" combines several language packs.
TESS_LANG = "mar+eng"

In [None]:
def preprocess_image(img_path):
    """Load an image from disk and binarise it for OCR.

    The image is converted to grayscale, upscaled by a factor of two with
    bicubic interpolation, and thresholded with Otsu's method.

    Parameters
    ----------
    img_path : str
        Path of the image file to read.

    Returns
    -------
    numpy.ndarray
        Single-channel thresholded image.

    Raises
    ------
    FileNotFoundError
        If the file is missing or cannot be decoded as an image.
    """
    img = cv2.imread(img_path)
    # cv2.imread reports failure by returning None rather than raising, which
    # would otherwise surface later as an opaque error inside cvtColor.
    if img is None:
        raise FileNotFoundError(f"Could not read image: {img_path}")

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Resize for better OCR
    gray = cv2.resize(gray, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)

    # OTSU threshold (better for numbers); the computed threshold value itself
    # is not needed, only the binarised image.
    _, thresh = cv2.threshold(
        gray, 0, 255,
        cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )

    return thresh



def extract_text(processed_img):
    """Run Tesseract OCR on a preprocessed image array and return the raw text.

    The array is wrapped in a PIL image and recognised with the languages
    configured in ``TESS_LANG``.
    """
    return pytesseract.image_to_string(
        Image.fromarray(processed_img),
        lang=TESS_LANG,
    )


import re

# Lookup from each Devanagari digit to its ASCII counterpart.
MARATHI_DIGITS = dict(zip("०१२३४५६७८९", "0123456789"))


def marathi_to_english_digits(text):
    """Return ``text`` with every Devanagari digit replaced by its ASCII digit.

    All other characters are left unchanged.
    """
    return text.translate(str.maketrans(MARATHI_DIGITS))


def clean_value(value):
    """Normalise a captured field value.

    ASCII and full-width colons are removed, runs of whitespace are collapsed
    to a single space, and leading/trailing whitespace is stripped.

    Parameters
    ----------
    value : str or None
        Raw captured text.

    Returns
    -------
    str or None
        The cleaned text, or None when ``value`` is empty/None or nothing is
        left after cleaning.
    """
    if not value:
        return None

    cleaned = value.replace(":", "").replace("：", "")
    cleaned = re.sub(r"\s+", " ", cleaned).strip()

    # A capture made only of separators or whitespace carries no data, so it is
    # reported as missing, the same way an empty input is.
    return cleaned or None


def parse_fields(text):
    """Extract voter-record fields from OCR text.

    Devanagari digits are first converted to ASCII, then each field is located
    with a regular expression anchored on its printed Marathi label.

    Parameters
    ----------
    text : str
        Raw OCR output for one voter entry.

    Returns
    -------
    dict
        Keys ``sr_no``, ``epic_id``, ``full_name``, ``father_name``, ``age``,
        ``gender`` and ``house_no``. A field that is not found stays None.
        ``sr_no`` is never set here. ``gender`` is "Male" or "Female"; every
        other extracted value is a string.
    """
    text = marathi_to_english_digits(text)

    data = {
        "sr_no": None,        # MUST be filled externally
        "epic_id": None,
        "full_name": None,
        "father_name": None,
        "age": None,
        "gender": None,
        "house_no": None
    }

    # EPIC ID: three capital letters followed by seven digits
    epic_match = re.search(r"\b[A-Z]{3}\d{7}\b", text)
    if epic_match:
        data["epic_id"] = epic_match.group()

    # Full Name: rest of the line after the label
    fn_match = re.search(r"मतदाराचे पूर्ण\s*[:：]?\s*([^\n]+)", text)
    if fn_match:
        data["full_name"] = clean_value(fn_match.group(1))

    # Father Name: rest of the line after the label
    father_match = re.search(r"वडिलांचे नाव\s*[:：]?\s*([^\n]+)", text)
    if father_match:
        data["father_name"] = clean_value(father_match.group(1))

    # House Number (handles |:)
    house_match = re.search(r"घर क्रमांक\s*[|:：\-]*\s*(\d{1,4})", text)
    if house_match:
        data["house_no"] = house_match.group(1)

    # Age: taken from the OCR text itself. Only the text is available in this
    # function, and the digits have already been converted to ASCII above.
    age_match = re.search(r"वय\s*[:：\-]?\s*(\d{1,3})", text)
    if age_match:
        data["age"] = age_match.group(1)

    # Gender: prefer the value printed right after the label.
    gender_match = re.search(r"लिंग\s*[:：]?\s*(पु|स्त्री)", text)
    if gender_match:
        data["gender"] = "Male" if gender_match.group(1) == "पु" else "Female"
    # Standalone fallback. Whitespace lookarounds are used instead of \b
    # because Python's \w does not include Devanagari vowel signs, so \b would
    # match "पु" only at the start of a longer word (for example inside a name).
    elif re.search(r"(?<!\S)पु(?!\S)", text):
        data["gender"] = "Male"
    elif "स्त्री" in text:
        data["gender"] = "Female"

    return data


def save_to_excel(data):
    """Append one parsed record as a row to the workbook at ``EXCEL_PATH``.

    When the workbook already exists its rows are read and the new row is
    placed after them; otherwise the workbook is created with just this row.
    The file is rewritten in full either way, and a confirmation is printed.
    """
    new_row = pd.DataFrame([data])

    if not os.path.exists(EXCEL_PATH):
        combined = new_row
    else:
        previous_rows = pd.read_excel(EXCEL_PATH)
        combined = pd.concat([previous_rows, new_row], ignore_index=True)

    combined.to_excel(EXCEL_PATH, index=False)
    print("✅ Data saved to", EXCEL_PATH)


In [32]:
# Binarise the configured image, then run OCR on the result.
processed_img = preprocess_image(IMAGE_PATH)
text = extract_text(processed_img)

# Show the raw OCR output so parsing problems can be checked by eye.
print("\n---- OCR TEXT ----\n")
print(text)

# Turn the OCR text into a dictionary of fields.
data = parse_fields(text)
print("\n---- PARSED DATA ----\n")
print(data)

# Append the record to the workbook at EXCEL_PATH.
# Note: re-running this cell appends the same record again.
save_to_excel(data)


---- OCR TEXT ----

WRQ3609864 42/183/1038
मतदाराचे पूर्ण : नरेंद्र प्रल्हाद रामटेके
नांव

वडिलांचे नाव : प्रल्हाद रामटेके

घर क्रमांक |: २१०

वय : ५२ लिंग : पु



---- PARSED DATA ----

{'sr_no': None, 'epic_id': 'WRQ3609864', 'full_name': 'नरेंद्र प्रल्हाद रामटेके', 'father_name': 'प्रल्हाद रामटेके', 'age': '52', 'gender': 'Male', 'house_no': '210'}
✅ Data saved to ../data/output/voter_data.xlsx
