<a href="https://colab.research.google.com/github/akshaypradheep/vote-chori-ocr-colab/blob/main/Vote_Chori_OCR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!/usr/bin/env python3
"""
PDF Voter Block Extractor
-------------------------
Extracts voter block text from a structured PDF using Google Vision OCR.
Supports both Google Colab and local execution.

Author: Akshay Pradeep
Date: 2025-08-10
"""

!pip install pdf2image opencv-python google-cloud-vision google-cloud-translate PyPDF2 --quiet
!apt-get install poppler-utils -qq


import io
import os
import sys
import logging
from typing import Optional, Tuple, List

import numpy as np
from pdf2image import convert_from_path
from PIL import Image
from google.cloud import vision, translate_v2 as translate
from PyPDF2 import PdfReader
from google.colab import files

# ===============================
# 🔧 Configuration
# ===============================
#uploaded_pdf = files.upload()
#PDF_PATH: str = list(uploaded_pdf.keys())[0]

# Input files, resolved relative to the current working directory.
PDF_PATH: str = "2025-EROLLGEN-S06-94-FinalRoll-Revision2-GUJ-1-WI.pdf"
GOOGLE_CRED_FILE: str = "GOOGLE_APPLICATION_CREDENTIALS.json"

# Voter block grid. These values are used as pixel indices into the page
# image array: (BASE_X, BASE_Y) is the top-left corner of the first block,
# and each following block is offset by one block width/height.
BASE_X = 55
BASE_Y = 100
BLOCK_WIDTH = 785
BLOCK_HEIGHT = 330
COLUMNS = 3
ROWS = 10

# Processing flags
START_PAGE: int = 3               # first page handed to process_pdf
END_PAGE: Optional[int] = None    # None -> derived from the PDF page count
IS_SUMMARY: bool = True           # True -> last page is skipped when END_PAGE is None
SAVE_TO_TXT: bool = True          # write the extracted text to TXT_FILENAME
DEBUG_OCR: bool = True            # save every cropped block image to IMG_OUTPUT_DIR
TRANSLATE_TO_ENGLISH: bool = False  # run OCR text through Google Translate

# Output locations
IMG_OUTPUT_DIR: str = "ocr_blocks"  # only used when DEBUG_OCR is True
TXT_FILENAME: str = f"{os.path.splitext(PDF_PATH)[0]}.txt"
if DEBUG_OCR:
    os.makedirs(IMG_OUTPUT_DIR, exist_ok=True)


# Logging setup: DEBUG and above go to stdout; force=True replaces any
# handlers that were already installed on the root logger.
logging.basicConfig(
    level=logging.DEBUG,
    stream=sys.stdout,
    datefmt="%Y-%m-%d %H:%M:%S",
    format="[%(asctime)s] %(levelname)s - %(message)s",
    force=True,
)


# ===============================
# 🖥 Environment Detection
# ===============================
def is_colab() -> bool:
    """Report whether the ``google.colab`` package can be imported.

    Returns:
        True when the import succeeds, False when it raises ImportError.
    """
    try:
        import google.colab  # noqa: F401
    except ImportError:
        return False
    return True


# ===============================
# 🔑 Google Vision Client
# ===============================
def authenticate_google_vision(cred_path: str) -> vision.ImageAnnotatorClient:
    """Create a Google Vision client backed by the key file at ``cred_path``.

    When the key file is missing and the code runs in Colab, the user is
    prompted to upload one; the first uploaded file is renamed to
    ``cred_path``. Outside Colab a missing key file terminates the process.

    Args:
        cred_path: Path of the service-account JSON key file.

    Returns:
        A ``vision.ImageAnnotatorClient`` created after
        ``GOOGLE_APPLICATION_CREDENTIALS`` has been pointed at ``cred_path``.

    Raises:
        SystemExit: If the key file is missing outside Colab, or if nothing
            is uploaded inside Colab.
    """
    key_missing = not os.path.exists(cred_path)

    # Outside Colab there is no way to ask for the key interactively.
    if key_missing and not is_colab():
        sys.exit(f"Google Vision credentials not found at {cred_path}. Exiting.")

    if key_missing:
        from google.colab import files
        logging.warning("Google Vision API key not found — please upload your JSON key file.")
        uploaded = files.upload()
        if not uploaded:
            sys.exit("No credentials uploaded. Exiting.")
        # Only the first uploaded file is used; it is moved to the expected path.
        uploaded_name = next(iter(uploaded))
        os.rename(uploaded_name, cred_path)
        logging.info("Key file saved as: %s", cred_path)

    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = cred_path
    return vision.ImageAnnotatorClient()


# ===============================
# 🔑 Google Translate Client
# ===============================
def authenticate_google_translate(cred_path: str) -> translate.Client:
    """Create a Google Translate client backed by the key file at ``cred_path``.

    Args:
        cred_path: Path of the service-account JSON key file.

    Returns:
        A ``translate.Client`` created after
        ``GOOGLE_APPLICATION_CREDENTIALS`` has been pointed at ``cred_path``.

    Raises:
        SystemExit: If no file exists at ``cred_path``.
    """
    if os.path.exists(cred_path):
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = cred_path
        return translate.Client()

    sys.exit(f"Google Translate credentials not found at {cred_path}. Exiting.")


# ===============================
# 🌍 Translation
# ===============================
def translate_text_to_english(client: translate.Client, text: str) -> str:
    """Translate ``text`` to English using the Google Translate v2 client.

    The source language is not specified, so detection is left to the API.

    Args:
        client: Google Translate client used to perform the request.
        text: Plain text to translate (e.g. the OCR output of one block).

    Returns:
        The translated text. Empty or whitespace-only input is returned
        unchanged without calling the API.
    """
    if not text or not text.strip():
        return text

    # The API treats input as HTML unless told otherwise, which makes it
    # return HTML entities (e.g. "&#39;") and not reliably keep line breaks.
    # OCR output is plain text, so request plain-text handling explicitly.
    result = client.translate(text, target_language="en", format_="text")
    return result["translatedText"]


# ===============================
# 📄 PDF to Image Conversion
# ===============================
def convert_pdf_page_to_image(pdf_path: str, page_number: int, dpi: int = 300) -> np.ndarray:
    """Render a single page of a PDF and return it as a NumPy array.

    Args:
        pdf_path: Path of the PDF file.
        page_number: Page to render; passed as both ``first_page`` and
            ``last_page`` so only that page is converted.
        dpi: Rendering resolution handed to ``convert_from_path``.

    Returns:
        The first (and only requested) rendered page as ``np.ndarray``.
    """
    rendered_pages = convert_from_path(
        pdf_path,
        dpi=dpi,
        first_page=page_number,
        last_page=page_number,
    )
    page_image = rendered_pages[0]
    return np.array(page_image)


# ===============================
# ✂ Image Cropping
# ===============================
def crop_voter_block(image: np.ndarray, x: int, y: int, w: int, h: int) -> np.ndarray:
    """Slice a rectangular region out of ``image``.

    Args:
        image: Array indexed as ``[row, column, ...]``.
        x: Column index of the region's left edge.
        y: Row index of the region's top edge.
        w: Region width in columns.
        h: Region height in rows.

    Returns:
        The slice ``image[y:y + h, x:x + w]``.
    """
    row_span = slice(y, y + h)
    col_span = slice(x, x + w)
    return image[row_span, col_span]


# ===============================
# 🔍 Text Extraction
# ===============================
def extract_text_from_block(client: vision.ImageAnnotatorClient, block_img: np.ndarray) -> str:
    """Run Google Vision text detection on one cropped block image.

    Args:
        client: Vision client used to send the request.
        block_img: Image array of the block; it is PNG-encoded before upload.

    Returns:
        The stripped description of the first text annotation, or an empty
        string when the response contains no annotations.

    Raises:
        RuntimeError: If the response carries an error message.
    """
    # Encode the array as PNG in memory; the API receives raw image bytes.
    png_buffer = io.BytesIO()
    Image.fromarray(block_img).save(png_buffer, format="PNG")

    request_image = vision.Image(content=png_buffer.getvalue())
    response = client.text_detection(image=request_image)

    api_error = response.error.message
    if api_error:
        raise RuntimeError(f"Google Vision API error: {api_error}")

    annotations = response.text_annotations
    if not annotations:
        return ""
    return annotations[0].description.strip()


# ===============================
# 📜 Main Processing
# ===============================
def process_pdf(
    pdf_path: str,
    vision_client: vision.ImageAnnotatorClient,
    translate_client: Optional[translate.Client],
    start_page: int,
    end_page: Optional[int],
    save_to_txt: bool,
    txt_filename: str
) -> None:
    """OCR every voter block on a range of PDF pages.

    Each page is rendered to an image and cut into a ROWS x COLUMNS grid of
    blocks. Every block is sent to Google Vision, optionally translated, and
    its text is appended to the output file. A failure on one page is logged
    (and written to the output file) and processing moves on to the next page.

    Args:
        pdf_path: Path of the PDF to process.
        vision_client: Client used for text detection.
        translate_client: Client used for translation; only consulted when
            TRANSLATE_TO_ENGLISH is set and the client is not None.
        start_page: First page to process (inclusive).
        end_page: Last page to process (inclusive). When None it is derived
            from the PDF page count, minus the final page if IS_SUMMARY is set.
        save_to_txt: Whether to write the extracted text to ``txt_filename``.
        txt_filename: Output path of the UTF-8 text file.
    """
    base_pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]

    if end_page is None:
        # Parse the PDF once; IS_SUMMARY drops the final page from the range.
        page_count = len(PdfReader(pdf_path).pages)
        end_page = page_count - 1 if IS_SUMMARY else page_count

    txt_file = open(txt_filename, "w", encoding="utf-8") if save_to_txt else None

    # try/finally guarantees the output file is flushed and closed even when
    # the run is interrupted by something `except Exception` does not catch
    # (e.g. KeyboardInterrupt while waiting on an API call).
    try:
        for page_num in range(start_page, end_page + 1):
            logging.info("Processing page %d", page_num)
            if txt_file:
                txt_file.write(f"\nProcessing page {page_num}\n")

            try:
                page_img = convert_pdf_page_to_image(pdf_path, page_num)
                for row in range(ROWS):
                    for col in range(COLUMNS):
                        x = BASE_X + col * BLOCK_WIDTH
                        y = BASE_Y + row * BLOCK_HEIGHT
                        block_img = crop_voter_block(page_img, x, y, BLOCK_WIDTH, BLOCK_HEIGHT)

                        if DEBUG_OCR:
                            # Keep the exact crop that was sent to OCR for inspection.
                            img_filename = f"{base_pdf_name}-{page_num}-{row+1}x{col+1}.png"
                            img_path = os.path.join(IMG_OUTPUT_DIR, img_filename)
                            Image.fromarray(block_img).save(img_path)
                            logging.debug("[DEBUG_OCR] Saved cropped block to %s", img_path)

                        text = extract_text_from_block(vision_client, block_img)

                        if TRANSLATE_TO_ENGLISH and translate_client:
                            text = translate_text_to_english(translate_client, text)

                        logging.debug("Page %d, Row %d, Col %d", page_num, row + 1, col + 1)
                        if txt_file:
                            txt_file.write(f"\n🧩 Block (Page {page_num}, Row {row + 1}, Col {col + 1}):\n")
                            txt_file.write(text + "\n----\n")

            except Exception as e:
                # Best effort: record the failure and continue with the next page.
                error_msg = f"Failed on page {page_num}: {e}"
                logging.error(error_msg)
                if txt_file:
                    txt_file.write(error_msg + "\n")
    finally:
        if txt_file:
            txt_file.close()

    if txt_file:
        logging.info("✅ Saved output to %s", txt_filename)
        if is_colab():
            # In Colab, also offer the finished file as a browser download.
            from google.colab import files
            files.download(txt_filename)


# ===============================
# 🚀 Entry Point
# ===============================
if __name__ == "__main__":
    # Build the API clients first; the translate client is only created
    # when translation has been switched on.
    vision_client = authenticate_google_vision(GOOGLE_CRED_FILE)
    if TRANSLATE_TO_ENGLISH:
        translate_client = authenticate_google_translate(GOOGLE_CRED_FILE)
    else:
        translate_client = None

    process_pdf(
        pdf_path=PDF_PATH,
        vision_client=vision_client,
        translate_client=translate_client,
        start_page=START_PAGE,
        end_page=END_PAGE,
        save_to_txt=SAVE_TO_TXT,
        txt_filename=TXT_FILENAME,
    )