# Tibetan OCR Grid Search with Quality Scoring

Systematically tests OCR parameter combinations on Tibetan pecha PDFs.
Scores each result with PyBo tokenization: the quality score is the percentage of tokens PyBo recognizes as valid Tibetan words.

**How to use:**
1. Run cells 1-6 in order (setup)
2. Run cell 7 (quick test) to verify everything works
3. Run cell 8 (full grid search) for real results
4. Run cell 9 (analyze) to see what worked best

**Phase 2 TODO:** Explore binarization parameters (block_size, c) in Utils.binarize()

In [None]:
import os
import sys
import cv2
import csv
import json
import signal
import numpy as np
from pathlib import Path
from datetime import datetime
from dataclasses import dataclass, asdict
from typing import List, Dict, Tuple, Optional, Any
from tqdm import tqdm
import time
import logging
from logging.handlers import RotatingFileHandler

# PyBo for OCR quality scoring
try:
    from pybo import WordTokenizer
    PYBO_AVAILABLE = True
    print("✅ PyBo loaded for OCR quality scoring")
except ImportError:
    PYBO_AVAILABLE = False
    print("⚠️  PyBo not available - quality scoring disabled")
    print("   Install with: pip install git+https://github.com/OpenPecha/pybo.git")

# Make the project importable: use this file's folder when run as a script,
# otherwise (e.g. inside a notebook, where __file__ is undefined) the
# current working directory.
if "__file__" in globals():
    PROJECT_ROOT = Path(__file__).parent
else:
    PROJECT_ROOT = Path.cwd()
sys.path.insert(0, str(PROJECT_ROOT))

# Import BDRC modules
from BDRC.Data import (
    Encoding, LineMode, TPSMode, Platform,
    LineDetectionConfig, LayoutDetectionConfig, OCRModelConfig
)
from BDRC.Inference import OCRPipeline
from BDRC.Utils import import_local_models, get_platform
from BDRC.utils.pdf_extract import extract_images_from_pdf

print(f"Project root: {PROJECT_ROOT}")
print(f"Platform: {get_platform()}")


In [None]:
"""
=============================================================================
CONFIGURATION - Uncomment ONE file to process
=============================================================================
"""

# Base directory
BASE_DIR = Path.home() / "Documents" / "tibetan-ocr-app"

# Model paths (don't change)
OCR_MODELS_DIR = BASE_DIR / "OCRModels"
LINE_MODEL_PATH = BASE_DIR / "Models" / "Lines" / "PhotiLines.onnx"
LAYOUT_MODEL_PATH = BASE_DIR / "Models" / "Layout" / "photi.onnx"

# ============================================================================
# CHOOSE ONE FILE - Uncomment exactly ONE line
# ============================================================================

# --- UCHEN HIGH QUALITY (3 files, 16 pages total) ---
TARGET = BASE_DIR / "input_files/tibetan_texts/uchen_high_quality_examples/uchen high quality pdf.pdf"  # 10 pages
# TARGET = BASE_DIR / "input_files/tibetan_texts/uchen_high_quality_examples/uchen_high_quality_.pdf"  # 3 pages
# TARGET = BASE_DIR / "input_files/tibetan_texts/uchen_high_quality_examples/uchen_high_quality_pdf.pdf"  # 3 pages

# --- UCHEN MEDIUM QUALITY (2 files, 7 pages total) ---
# TARGET = BASE_DIR / "input_files/tibetan_texts/uchen_medium_quality_examples/uchen_medium quality.pdf"  # 3 pages
# TARGET = BASE_DIR / "input_files/tibetan_texts/uchen_medium_quality_examples/uchen_medium quality(1).pdf"  # 4 pages

# --- UCHEN POOR QUALITY (3 files, 9 pages total) ---
# TARGET = BASE_DIR / "input_files/tibetan_texts/uchen_poor_quality_examples/uchen_tsalyig_poor quality.pdf"  # 2 pages
# TARGET = BASE_DIR / "input_files/tibetan_texts/uchen_poor_quality_examples/uchen_poor quality.pdf"  # 3 pages
# TARGET = BASE_DIR / "input_files/tibetan_texts/uchen_poor_quality_examples/uchen_poor quality(1).pdf"  # 4 pages

# --- UMEH HIGH QUALITY (6 files, 18 pages total) ---
# TARGET = BASE_DIR / "input_files/tibetan_texts/umeh_high_quality_examples/umeh_druma_high quality.pdf"  # 3 pages
# TARGET = BASE_DIR / "input_files/tibetan_texts/umeh_high_quality_examples/umeh_drutsa_high quality.pdf"  # 3 pages
# TARGET = BASE_DIR / "input_files/tibetan_texts/umeh_high_quality_examples/umeh_dhernangdri_high quality(1).pdf"  # 3 pages
# TARGET = BASE_DIR / "input_files/tibetan_texts/umeh_high_quality_examples/umeh_dhernangdri_high quality.pdf"  # 3 pages
# TARGET = BASE_DIR / "input_files/tibetan_texts/umeh_high_quality_examples/umeh_dhernangdri_high quality(2).pdf"  # 3 pages
# TARGET = BASE_DIR / "input_files/tibetan_texts/umeh_high_quality_examples/umeh_tsugma khyug_high quality.pdf"  # 3 pages

# --- UMEH MEDIUM QUALITY (5 files, 25 pages total) ---
# TARGET = BASE_DIR / "input_files/tibetan_texts/umeh_medium_quality_examples/umeh_druchen_medium quality.pdf"  # 3 pages
# TARGET = BASE_DIR / "input_files/tibetan_texts/umeh_medium_quality_examples/umeh_druchen_medium quality(1).pdf"  # 4 pages
# TARGET = BASE_DIR / "input_files/tibetan_texts/umeh_medium_quality_examples/umeh_dhernangdri_medium quality.pdf"  # 12 pages
# TARGET = BASE_DIR / "input_files/tibetan_texts/umeh_medium_quality_examples/umeh_tsugmakhyug_medium quality.pdf"  # 3 pages
# TARGET = BASE_DIR / "input_files/tibetan_texts/umeh_medium_quality_examples/umeh_druchen_medium quality(2).pdf"  # 3 pages

# --- UMEH POOR QUALITY (4 files, 12 pages total) ---
# TARGET = BASE_DIR / "input_files/tibetan_texts/umeh_poor_quality_examples/umeh_dhernangdri_poor quality(2).pdf"  # 3 pages
# TARGET = BASE_DIR / "input_files/tibetan_texts/umeh_poor_quality_examples/umeh_drutsa_poor quality.pdf"  # 3 pages
# TARGET = BASE_DIR / "input_files/tibetan_texts/umeh_poor_quality_examples/umeh_khyugyig_poor quality.pdf"  # 3 pages
# TARGET = BASE_DIR / "input_files/tibetan_texts/umeh_poor_quality_examples/umeh_petsug_poor quality.pdf"  # 3 pages

# --- PECHAS WITH MORE TEXT (3 files, 9 pages total) ---
# TARGET = BASE_DIR / "input_files/tibetan_texts/Pechas with more text/pechas with more text 1.pdf"  # 3 pages
# TARGET = BASE_DIR / "input_files/tibetan_texts/Pechas with more text/pechas with more text 2.pdf"  # 3 pages
# TARGET = BASE_DIR / "input_files/tibetan_texts/Pechas with more text/pechas with more text 3.pdf"  # 3 pages

# --- PECHAS WITH LITTLE TEXT (3 files, 9 pages total) ---
# TARGET = BASE_DIR / "input_files/tibetan_texts/Pechas with little text/pechas with little text 1.pdf"  # 3 pages
# TARGET = BASE_DIR / "input_files/tibetan_texts/Pechas with little text/pechas with little text 2.pdf"  # 3 pages
# TARGET = BASE_DIR / "input_files/tibetan_texts/Pechas with little text/pechas with little text 3.pdf"  # 3 pages

# --- STANDALONE FILES (2 files, 14 pages total) ---
# TARGET = BASE_DIR / "input_files/tibetan_texts/sangs rgyas sman bla dngos.pdf"  # 2 pages - QUICKEST TEST
# TARGET = BASE_DIR / "input_files/tibetan_texts/sangs_rgyas_sman_gyi.pdf"  # 12 pages

# ============================================================================
# MODELS - Choose based on what you're processing
# ============================================================================

MODELS_TO_TEST = ["Woodblock", "Woodblock-Stacks", "Modern"]  # For Uchen
# MODELS_TO_TEST = ["Ume_Druma", "Ume_Petsuk", "Modern"]  # For Umeh

# ============================================================================
# Validation
# ============================================================================
# Fail early when the selected PDF is missing.
if not TARGET.exists():
    raise FileNotFoundError(f"❌ File not found: {TARGET}")

# Output layout: everything lives under BASE_DIR / "grid_search_results".
OUTPUT_DIR = BASE_DIR / "grid_search_results"
TEMP_DIR = OUTPUT_DIR / "temp_images"
CHECKPOINT_DIR = OUTPUT_DIR / "_checkpoints"
CHECKPOINT_FILE = CHECKPOINT_DIR / "progress.json"

# Create the directories parent-first (mkdir is not called with parents=True).
OUTPUT_DIR.mkdir(exist_ok=True)
TEMP_DIR.mkdir(exist_ok=True)
CHECKPOINT_DIR.mkdir(exist_ok=True)

print(f"✅ Target: {TARGET.name}")
print(f"   Models: {MODELS_TO_TEST}")
print(f"   Output: {OUTPUT_DIR}")


In [None]:
"""
=============================================================================
LOGGING SETUP
=============================================================================
"""

# All log files for this notebook go under <OUTPUT_DIR>/logs.
LOGS_DIR = OUTPUT_DIR / "logs"
LOGS_DIR.mkdir(exist_ok=True)
LOG_FILE = LOGS_DIR / "grid_search.log"

# File handler: rotates at 10 MiB and keeps up to 5 old files; timestamped lines.
file_handler = RotatingFileHandler(
    LOG_FILE, maxBytes=10*1024*1024, backupCount=5
)
file_handler.setLevel(logging.INFO)
file_handler.setFormatter(logging.Formatter(
    '%(asctime)s - %(levelname)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S'
))

# Console handler: message text only, no timestamp or level prefix.
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
console_handler.setFormatter(logging.Formatter('%(message)s'))

# NOTE(review): logging.basicConfig is documented to do nothing when the root
# logger already has handlers, so re-running this cell would build new handler
# objects that are never attached — confirm, and consider force=True (3.8+).
logging.basicConfig(level=logging.INFO, handlers=[file_handler, console_handler])
logger = logging.getLogger(__name__)

# Session banner recording which file and models this run targets.
logger.info("=" * 70)
logger.info("GRID SEARCH SESSION STARTED")
logger.info("=" * 70)
logger.info(f"Target File: {TARGET}")
logger.info(f"Models: {MODELS_TO_TEST}")
logger.info(f"Log file: {LOG_FILE}")
logger.info("=" * 70)


In [None]:
class OCRQualityScorer:
    """Score OCR text quality using PyBo tokenization. Higher = more valid Tibetan words.

    When PyBo is unavailable the scorer stays usable but every score is zero.
    """

    # POS tags that mark a token as "not a valid word". Tokens whose ``pos``
    # is missing/None are also counted as invalid (see ``score_text``).
    _INVALID_POS = frozenset({'NON_WORD', 'non-word', 'NO_POS', 'OTHER', 'X'})

    def __init__(self):
        if not PYBO_AVAILABLE:
            self.tokenizer = None
            print("⚠️  Quality scoring disabled (PyBo not available)")
        else:
            self.tokenizer = WordTokenizer()
            print("✅ Quality scorer initialized with PyBo")

    @staticmethod
    def _empty_metrics() -> Dict[str, Any]:
        """Return the all-zero metrics dict used when nothing can be scored."""
        return {'quality_score': 0.0, 'total_tokens': 0, 'valid_tokens': 0, 'invalid_tokens': 0}

    def score_text(self, text: str) -> Dict[str, Any]:
        """Score OCR text.

        Args:
            text: OCR output to evaluate. ``None``, empty and whitespace-only
                input all yield the zero metrics instead of raising.

        Returns:
            Dict with ``quality_score`` (0-100, rounded to 2 decimals: share
            of counted tokens with a valid POS tag) and the ``total_tokens``,
            ``valid_tokens`` and ``invalid_tokens`` counts. Zero metrics are
            also returned when no tokenizer is loaded or tokenization fails.
        """
        if not self.tokenizer or not text or not text.strip():
            return self._empty_metrics()

        try:
            total_tokens = 0
            valid_tokens = 0

            for token in self.tokenizer.tokenize(text):
                pos = getattr(token, 'pos', None)
                if pos == '':  # Skip punctuation
                    continue
                total_tokens += 1
                if pos and pos not in self._INVALID_POS:
                    valid_tokens += 1

            quality_score = (valid_tokens / total_tokens * 100) if total_tokens > 0 else 0.0
            return {
                'quality_score': round(quality_score, 2),
                'total_tokens': total_tokens,
                'valid_tokens': valid_tokens,
                'invalid_tokens': total_tokens - valid_tokens
            }
        except Exception as e:
            # Best effort: a tokenizer failure must not abort the grid search.
            print(f"  ⚠️  Quality scoring error: {e}")
            return self._empty_metrics()

quality_scorer = OCRQualityScorer()


In [None]:
"""
=============================================================================
PARAMETER GRID
=============================================================================

Strategy:
  - Use FULL_PARAMS for the FIRST image of each new category (1,728 combos)
  - Analyze results, then create a trimmed grid for that category
  - Use the trimmed grid for remaining images in that category
  
To add a new trimmed grid after analyzing a category:
  1. Copy the UCHEN_HIGH_PARAMS template
  2. Rename to match your category
  3. Fill in the winning values from your analysis
  4. Uncomment the PARAM_VALUES line for it below
=============================================================================
"""

@dataclass
class GridSearchParams:
    """One point of the search grid: an OCR model plus the settings for one run."""
    ocr_model_name: str
    line_mode: str       # "line" or "layout"
    class_threshold: float
    k_factor: float
    bbox_tolerance: float
    merge_lines: bool
    tps_threshold: float  # use_tps always True, threshold controls sensitivity

    def to_filename(self) -> str:
        """Return a file stem (no extension) that encodes every parameter."""
        merge_flag = "F" if not self.merge_lines else "T"
        pieces = (
            f"{self.ocr_model_name}",
            f"{self.line_mode}",
            f"k{self.k_factor}",
            f"bbox{self.bbox_tolerance}",
            f"merge-{merge_flag}",
            f"tps{self.tps_threshold}",
            f"conf{self.class_threshold}",
        )
        return "_".join(pieces)

# ============================================================================
# FULL GRID - 1,728 combos per image (use for first image of each category)
# ============================================================================
# 2 * 3 * 3 * 4 * 2 * 4 = 576 combinations per model; with the 3 entries in
# MODELS_TO_TEST that gives the 1,728 runs per image quoted above.
# NOTE(review): class_threshold is stored in GridSearchParams and written to
# the result files, but GridSearchOCR.run_ocr never passes it to
# pipeline.run_ocr, so its three values appear to repeat identical OCR work —
# confirm, then either wire it through or trim it to a single value.
FULL_PARAMS = {
    "line_mode": ["line", "layout"],
    "class_threshold": [0.7, 0.8, 0.9],
    "k_factor": [2.0, 2.5, 3.0],
    "bbox_tolerance": [2.5, 3.5, 4.0, 5.0],
    "merge_lines": [True, False],
    "tps_threshold": [0.1, 0.25, 0.5, 0.9],
}

# ============================================================================
# TRIMMED GRIDS - based on Phase 1 results per category
# After running full grid on first image, analyze and add trimmed params here
# ============================================================================

# Uchen High Quality: 12 combos per image (from 1,728)
# Based on: uchen high quality pdf - page 1 results
UCHEN_HIGH_PARAMS = {
    "line_mode": ["line"],              # line beat layout by 5pts
    "class_threshold": [0.7],           # no difference observed
    "k_factor": [2.5],                  # no difference observed
    "bbox_tolerance": [2.5, 3.5, 4.0, 5.0],  # showed meaningful spread, keep all
    "merge_lines": [True],              # 10pt advantage over False
    "tps_threshold": [0.5],             # no difference on clean pages
}

# Uchen Medium Quality: TBD after first run
# UCHEN_MED_PARAMS = {
#     "line_mode": [],
#     "class_threshold": [],
#     "k_factor": [],
#     "bbox_tolerance": [],
#     "merge_lines": [],
#     "tps_threshold": [],
# }

# Uchen Poor Quality: TBD after first run
# UCHEN_POOR_PARAMS = {
#     "line_mode": [],
#     "class_threshold": [],
#     "k_factor": [],
#     "bbox_tolerance": [],
#     "merge_lines": [],
#     "tps_threshold": [],
# }

# Umeh High Quality: TBD after first run
# UMEH_HIGH_PARAMS = {
#     "line_mode": [],
#     "class_threshold": [],
#     "k_factor": [],
#     "bbox_tolerance": [],
#     "merge_lines": [],
#     "tps_threshold": [],
# }

# Umeh Medium Quality: TBD after first run
# UMEH_MED_PARAMS = {
#     "line_mode": [],
#     "class_threshold": [],
#     "k_factor": [],
#     "bbox_tolerance": [],
#     "merge_lines": [],
#     "tps_threshold": [],
# }

# Umeh Poor Quality: TBD after first run
# UMEH_POOR_PARAMS = {
#     "line_mode": [],
#     "class_threshold": [],
#     "k_factor": [],
#     "bbox_tolerance": [],
#     "merge_lines": [],
#     "tps_threshold": [],
# }

# ============================================================================
# ACTIVE SELECTION - uncomment ONE line
# ============================================================================
PARAM_VALUES = FULL_PARAMS
# PARAM_VALUES = UCHEN_HIGH_PARAMS
# PARAM_VALUES = UCHEN_MED_PARAMS
# PARAM_VALUES = UCHEN_POOR_PARAMS
# PARAM_VALUES = UMEH_HIGH_PARAMS
# PARAM_VALUES = UMEH_MED_PARAMS
# PARAM_VALUES = UMEH_POOR_PARAMS


def generate_param_combinations() -> List[GridSearchParams]:
    """Generate all parameter combinations.

    Builds one GridSearchParams for every model in MODELS_TO_TEST crossed
    with the Cartesian product of the value lists in PARAM_VALUES.

    Returns:
        The combinations ordered by model first, then line_mode,
        class_threshold, k_factor, bbox_tolerance, merge_lines and
        tps_threshold, with the last axis varying fastest.

    Raises:
        KeyError: If PARAM_VALUES lacks one of the expected axes.
    """
    import itertools  # local import keeps this cell self-contained

    # Axis order fixes the iteration order (rightmost varies fastest).
    axes = (
        "line_mode", "class_threshold", "k_factor",
        "bbox_tolerance", "merge_lines", "tps_threshold",
    )
    return [
        GridSearchParams(ocr_model_name=model, **dict(zip(axes, values)))
        for model in MODELS_TO_TEST
        for values in itertools.product(*(PARAM_VALUES[axis] for axis in axes))
    ]


# Report the grid size: number of models times the length of every axis.
total_combos = len(MODELS_TO_TEST)
for vals in PARAM_VALUES.values():
    total_combos = total_combos * len(vals)

print(f"Parameter combinations per image: {total_combos}")
print(f"Models: {MODELS_TO_TEST}")

# One line per axis listing the values being searched.
for name, vals in PARAM_VALUES.items():
    print("  " + f"{name}: {vals}")

In [None]:
"""
=============================================================================
CORE ENGINE - OCR runner, checkpoint, results saving
=============================================================================
"""

class CheckpointManager:
    """Saves/loads progress for resumable grid search.

    Progress is the set of image-path strings that have been fully processed,
    stored as JSON under the ``completed_images`` key of ``checkpoint_file``.
    """

    def __init__(self, checkpoint_file: Path):
        self.checkpoint_file = checkpoint_file
        self.completed = self._load()

    def _load(self) -> set:
        """Return the saved set of completed images, or an empty set.

        A missing, unreadable or malformed checkpoint is not fatal: a warning
        is printed and the search simply starts with no completed images.
        """
        if self.checkpoint_file.exists():
            try:
                with open(self.checkpoint_file, 'r', encoding='utf-8') as f:
                    return set(json.load(f).get("completed_images", []))
            except (OSError, ValueError, AttributeError, TypeError) as e:
                # ValueError covers json.JSONDecodeError; AttributeError and
                # TypeError cover JSON that is valid but not the expected shape.
                print(f"Warning: Could not load checkpoint: {e}")
        return set()

    def save(self):
        """Write the checkpoint atomically.

        The data is written to a sibling ``.tmp`` file and then moved over the
        real checkpoint with ``os.replace``, so an interruption mid-write
        cannot leave a truncated checkpoint that would discard all progress.
        """
        data = {
            "completed_images": sorted(self.completed),  # sorted for stable output
            "last_updated": datetime.now().isoformat(),
        }
        tmp_file = self.checkpoint_file.with_name(self.checkpoint_file.name + ".tmp")
        with open(tmp_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)
        os.replace(tmp_file, self.checkpoint_file)

    def mark_completed(self, image_path: str):
        """Record ``image_path`` as done and persist immediately."""
        self.completed.add(image_path)
        self.save()

    def is_completed(self, image_path: str) -> bool:
        """Return True if ``image_path`` was already recorded as done."""
        return image_path in self.completed

    def get_completed_count(self) -> int:
        """Return how many images are recorded as done."""
        return len(self.completed)

    def reset(self):
        """Forget all progress and delete the checkpoint file if present."""
        self.completed = set()
        if self.checkpoint_file.exists():
            self.checkpoint_file.unlink()


class GracefulInterrupt:
    """Handle Ctrl+C gracefully, allowing current image to complete.

    Used as a context manager: while active, the first SIGINT only sets
    ``interrupted`` (callers poll it between images) and re-installs the
    previous handler, so a second Ctrl+C is handled the usual way.
    """

    def __init__(self):
        self.interrupted = False
        self._original_handler = None

    def __enter__(self):
        self._original_handler = signal.signal(signal.SIGINT, self._handler)
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self._restore()

    def _restore(self):
        """Re-install the handler that was active before ``__enter__``.

        ``signal.signal`` returns None when the previous handler was not
        installed from Python; passing None back would raise TypeError, so
        fall back to Python's default KeyboardInterrupt handler in that case.
        """
        previous = self._original_handler
        if previous is None:
            previous = signal.default_int_handler
        signal.signal(signal.SIGINT, previous)

    def _handler(self, signum, frame):
        """First Ctrl+C: announce, flag the interrupt, then restore the old handler."""
        print("\n\n⚠️  Interrupt received. Finishing current image then stopping...")
        print("   (Press Ctrl+C again to force quit)\n")
        self.interrupted = True
        self._restore()


class GridSearchOCR:
    """Runs OCR with different parameter combinations.

    Loads every local OCR model once and lazily builds one OCRPipeline per
    (model name, line mode) pair, reusing it across parameter combinations.
    """

    def __init__(self):
        self.platform = get_platform()
        self.ocr_models = {}   # model name -> model object from import_local_models
        self.pipelines = {}    # "<model>_<line_mode>" -> OCRPipeline

        # Line detection configs
        self.line_config = LineDetectionConfig(model_file=str(LINE_MODEL_PATH), patch_size=512)
        self.layout_config = LayoutDetectionConfig(
            model_file=str(LAYOUT_MODEL_PATH), patch_size=512,
            classes=["background", "image", "line", "caption", "margin"]
        )

        # Load OCR models
        print("\nLoading OCR models...")
        models = import_local_models(str(OCR_MODELS_DIR))
        for model in models:
            self.ocr_models[model.name] = model
            print(f"  ✅ Loaded: {model.name}")

    def run_ocr(self, image_path: Path, params: GridSearchParams) -> Tuple[bool, int, str, str, Dict]:
        """Run OCR on a single image.

        Args:
            image_path: Image file to read with ``cv2.imread``.
            params: Parameter combination for this run.

        Returns:
            ``(success, num_lines, text, error, quality_metrics)``. On success
            ``text`` holds the recognized lines separated by newlines and
            ``quality_metrics`` is the dict from ``quality_scorer.score_text``;
            on any failure (including an exception) ``success`` is False,
            ``error`` describes the problem and ``quality_metrics`` is ``{}``.
        """
        try:
            image = cv2.imread(str(image_path))
            if image is None:
                return False, 0, "", f"Failed to load image: {image_path}", {}

            # Get or create pipeline
            cache_key = f"{params.ocr_model_name}_{params.line_mode}"
            if cache_key not in self.pipelines:
                ocr_model = self.ocr_models.get(params.ocr_model_name)
                if ocr_model is None:
                    return False, 0, "", f"OCR model not found: {params.ocr_model_name}", {}
                det_config = self.line_config if params.line_mode == "line" else self.layout_config
                self.pipelines[cache_key] = OCRPipeline(
                    platform=self.platform, ocr_config=ocr_model.config, line_config=det_config
                )

            pipeline = self.pipelines[cache_key]

            # NOTE(review): params.class_threshold is not forwarded here, so
            # runs that differ only in class_threshold do identical work.
            # The pipeline API is not visible from this file — confirm whether
            # it accepts a threshold before wiring it through.
            status, result = pipeline.run_ocr(
                image=image,
                k_factor=params.k_factor,
                bbox_tolerance=params.bbox_tolerance,
                merge_lines=params.merge_lines,
                use_tps=True,
                tps_threshold=params.tps_threshold,
                target_encoding=Encoding.Unicode
            )

            if status.name == "SUCCESS":
                rot_mask, sorted_lines, ocr_lines, page_angle = result
                # One recognized line per text line; joining without a
                # separator would fuse the last and first words of adjacent
                # lines and distort the tokenizer-based quality score.
                full_text = "\n".join(line.text for line in ocr_lines)
                quality = quality_scorer.score_text(full_text)
                return True, len(ocr_lines), full_text, "", quality
            else:
                return False, 0, "", str(result), {}

        except Exception as e:
            # Deliberate catch-all: one failing combination must not stop the grid.
            return False, 0, "", str(e), {}


def get_test_images() -> Dict[str, List[Path]]:
    """Return the page images for TARGET as ``{pdf_stem: [image_paths]}``.

    Images live in ``TEMP_DIR / <pdf stem>``. If that folder already holds
    ``*.jpg`` files they are reused; otherwise the PDF is extracted into it.
    Paths are returned in sorted order.
    """
    stem = TARGET.stem
    image_dir = TEMP_DIR / stem
    image_dir.mkdir(exist_ok=True)

    pages = sorted(image_dir.glob("*.jpg"))
    if not pages:
        # Nothing cached yet: pull the pages out of the PDF.
        print(f"  Extracting images from {TARGET.name}...")
        extracted, total = extract_images_from_pdf(str(TARGET), str(image_dir))
        print(f"  Extracted {len(extracted)} images from {total} pages")
        pages = sorted(map(Path, extracted))
    else:
        print(f"  Using {len(pages)} existing images from {TARGET.name}")

    return {stem: pages}


def save_result(output_dir: Path, file_name: str, image_name: str, params: "GridSearchParams",
                success: bool, num_lines: int, ocr_text: str, error_message: str,
                processing_time: float, quality_metrics: Optional[Dict[str, Any]]) -> None:
    """Save a single OCR result to a text file.

    The report is written to
    ``output_dir / file_name / image_name / <params.to_filename()>.txt``
    (directories are created as needed), one entry per line.

    Args:
        output_dir: Root folder for all results.
        file_name: Source document name (used as a sub-folder).
        image_name: Page image name (used as a sub-folder).
        params: Parameter combination that produced this result.
        success: Whether OCR succeeded.
        num_lines: Number of lines detected.
        ocr_text: Recognized text; a placeholder is written when empty.
        error_message: Error description, written only when non-empty.
        processing_time: Elapsed seconds for this run.
        quality_metrics: Scoring dict; the section is omitted when empty.
    """
    result_dir = output_dir / file_name / image_name
    result_dir.mkdir(parents=True, exist_ok=True)

    filepath = result_dir / (params.to_filename() + ".txt")

    lines = [
        "=" * 70, "OCR RESULT", "=" * 70, "",
        f"File: {file_name}", f"Image: {image_name}", "",
        "PARAMETERS:",
        f"  OCR Model: {params.ocr_model_name}",
        f"  Line Mode: {params.line_mode}",
        f"  Class Threshold: {params.class_threshold}",
        f"  K-Factor: {params.k_factor}",
        f"  BBox Tolerance: {params.bbox_tolerance}",
        f"  Merge Lines: {params.merge_lines}",
        f"  TPS Threshold: {params.tps_threshold}", "",
        "RESULTS:",
        f"  Success: {success}",
        f"  Lines Detected: {num_lines}",
        f"  Processing Time: {processing_time:.2f}s",
    ]

    if quality_metrics:
        lines += [
            "", "QUALITY METRICS:",
            f"  Quality Score: {quality_metrics.get('quality_score', 0):.2f}/100",
            f"  Total Tokens: {quality_metrics.get('total_tokens', 0)}",
            f"  Valid Words: {quality_metrics.get('valid_tokens', 0)}",
            f"  Invalid Words: {quality_metrics.get('invalid_tokens', 0)}",
        ]

    if error_message:
        lines.append(f"  Error: {error_message}")

    lines += ["", "=" * 70, "OCR TEXT", "=" * 70, "",
              ocr_text if ocr_text else "[No text extracted]"]

    # Join with real newlines; an empty separator would collapse the whole
    # report (and the OCR text) onto a single unreadable line.
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write("\n".join(lines))


def save_summary_csv(output_dir: Path, all_results: List[Dict]):
    """Save summary CSV of all results.

    Writes ``output_dir / "summary.csv"``, overwriting any existing file.
    Column names and order come from the keys of the first row, so every
    row is expected to have the same keys.

    Args:
        output_dir: Folder that receives ``summary.csv``.
        all_results: One dict per result row. When empty, nothing is written
            and any existing summary is left untouched.
    """
    csv_path = output_dir / "summary.csv"
    if not all_results:
        print("No results to save")
        return
    fieldnames = list(all_results[0].keys())
    with open(csv_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(all_results)
    # Leading blank line separates the message from the progress output.
    print(f"\n?? Summary saved: {csv_path}")


def _load_previous_summary(csv_path: Path) -> List[Dict[str, str]]:
    """Read the rows of an earlier summary CSV; return [] if absent or unreadable."""
    if not csv_path.exists():
        return []
    try:
        with open(csv_path, 'r', newline='', encoding='utf-8') as f:
            return list(csv.DictReader(f))
    except (OSError, csv.Error, UnicodeDecodeError) as e:
        print(f"Warning: Could not read previous summary: {e}")
        return []


def run_grid_search(max_images: Optional[int] = None, resume: bool = True):
    """
    Run the full grid search.

    Every image of TARGET is run through every parameter combination; each
    run is written to its own text file and summarized in summary.csv.
    Images already recorded in the checkpoint are skipped. When resuming,
    the summary rows of those skipped images are carried over from the
    existing summary.csv, so the file covers the whole document and not
    just the images processed by this call.

    Args:
        max_images: Limit number of images (for testing)
        resume: Resume from checkpoint (default True). False clears the
            checkpoint and ignores any previous summary.

    Returns:
        The result rows (one dict per image/parameter combination) produced
        by THIS call; carried-over rows are written to the CSV only.
    """
    checkpoint = CheckpointManager(CHECKPOINT_FILE)
    ocr = GridSearchOCR()
    all_results = []
    previous_rows = []

    if resume and checkpoint.get_completed_count() > 0:
        print(f"\n?? Resuming: {checkpoint.get_completed_count()} images already done")
        # Images in the checkpoint will be skipped below; keep their rows.
        previous_rows = _load_previous_summary(OUTPUT_DIR / "summary.csv")
    elif not resume:
        checkpoint.reset()
        print("\n?? Starting fresh")

    # Get images
    print("\n" + "=" * 70)
    print("LOADING IMAGES")
    print("=" * 70)
    images_by_file = get_test_images()

    total_images = sum(len(imgs) for imgs in images_by_file.values())
    param_combos = generate_param_combinations()

    print("\n" + "=" * 70)
    print("GRID SEARCH")
    print("=" * 70)
    print(f"Images: {total_images}")
    print(f"Combinations per image: {len(param_combos)}")
    print(f"Total iterations: {total_images * len(param_combos)}")
    print(f"Output: {OUTPUT_DIR}")
    print("=" * 70)

    with GracefulInterrupt() as interrupt:
        images_processed = 0
        images_skipped = 0

        for file_name, image_paths in sorted(images_by_file.items()):
            if interrupt.interrupted:
                break

            if max_images:
                image_paths = image_paths[:max_images]

            print(f"\n?? {file_name} ({len(image_paths)} images)")

            for image_path in image_paths:
                # The interrupt flag is only checked between images, so the
                # current image always finishes all of its combinations.
                if interrupt.interrupted:
                    break

                image_key = str(image_path)
                image_name = image_path.stem

                if checkpoint.is_completed(image_key):
                    images_skipped += 1
                    continue

                print(f"\n   ??️  {image_name}")

                pbar = tqdm(param_combos, desc="      Params", leave=False)
                for params in pbar:
                    t0 = time.time()
                    success, num_lines, ocr_text, error, quality = ocr.run_ocr(image_path, params)
                    elapsed = time.time() - t0

                    save_result(OUTPUT_DIR, file_name, image_name, params,
                                success, num_lines, ocr_text, error, elapsed, quality)

                    all_results.append({
                        "file_name": file_name,
                        "image_name": image_name,
                        "ocr_model_name": params.ocr_model_name,
                        "line_mode": params.line_mode,
                        "class_threshold": params.class_threshold,
                        "k_factor": params.k_factor,
                        "bbox_tolerance": params.bbox_tolerance,
                        "merge_lines": params.merge_lines,
                        "tps_threshold": params.tps_threshold,
                        "success": success,
                        "num_lines_detected": num_lines,
                        "processing_time": elapsed,
                        "quality_score": quality.get('quality_score', 0.0),
                        "total_tokens": quality.get('total_tokens', 0),
                        "valid_tokens": quality.get('valid_tokens', 0),
                        "invalid_tokens": quality.get('invalid_tokens', 0),
                        "error": error[:100] if error else ""
                    })
                pbar.close()

                checkpoint.mark_completed(image_key)
                images_processed += 1
                print(f"      ✅ Done ({images_processed} processed, {images_skipped} skipped)")

        rows_to_save = all_results
        if all_results and previous_rows:
            # Carry over earlier rows that (a) use the current column set,
            # (b) belong to the document being processed now, and (c) were
            # not re-generated in this call (new rows win over old ones).
            expected_fields = set(all_results[0])
            redone = {(row["file_name"], row["image_name"]) for row in all_results}
            carried = [
                row for row in previous_rows
                if set(row) == expected_fields
                and row["file_name"] in images_by_file
                and (row["file_name"], row["image_name"]) not in redone
            ]
            rows_to_save = carried + all_results

        # With no new rows this only prints a notice and leaves the
        # existing summary.csv untouched.
        save_summary_csv(OUTPUT_DIR, rows_to_save)

    print("\n" + "=" * 70)
    if interrupt.interrupted:
        print("⚠️  INTERRUPTED - Progress saved, run again to resume")
    else:
        print("✅ GRID SEARCH COMPLETE")
    print("=" * 70)
    print(f"Processed: {images_processed}  Skipped: {images_skipped}")
    print(f"Results: {OUTPUT_DIR}")

    return all_results


def analyze_results():
    """Load and analyze results from summary.csv.

    Prints the success rate per model and, when at least one run succeeded,
    average quality scores by model, line mode and k-factor, the ten
    best-scoring runs, and a quality-score distribution.

    Returns:
        The summary as a pandas DataFrame, or None when summary.csv is
        missing or pandas is not installed.
    """
    csv_path = OUTPUT_DIR / "summary.csv"
    if not csv_path.exists():
        print("❌ No summary.csv found. Run grid search first.")
        return None

    try:
        import pandas as pd
    except ImportError:
        print("❌ pandas required: pip install pandas")
        return None

    df = pd.read_csv(csv_path)

    print("\n" + "=" * 70)
    print("ANALYSIS")
    print("=" * 70)
    print(f"\nTotal results: {len(df)}")

    # Element-wise comparison on a pandas column; `is True` would not work here.
    successful = df[df['success'] == True]  # noqa: E712

    print("\n?? Success Rate by Model:")
    print(df.groupby('ocr_model_name')['success'].mean().sort_values(ascending=False))

    if len(successful) > 0:
        print("\n✨ Avg Quality Score by Model:")
        print(successful.groupby('ocr_model_name')['quality_score'].mean().sort_values(ascending=False))

        print("\n✨ Avg Quality Score by Line Mode:")
        print(successful.groupby('line_mode')['quality_score'].mean())

        print("\n✨ Avg Quality Score by K-Factor:")
        print(successful.groupby('k_factor')['quality_score'].mean())

        print("\n?? TOP 10 (by quality score):")
        top10 = successful.nlargest(10, 'quality_score')[
            ['file_name', 'image_name', 'ocr_model_name', 'line_mode',
             'k_factor', 'bbox_tolerance', 'quality_score', 'num_lines_detected']
        ]
        print(top10.to_string(index=False))

        # NOTE(review): the distribution counts ALL rows, so failed runs
        # (recorded with quality_score 0.0) land in the "<50 (Poor)" bucket —
        # confirm whether it should be restricted to successful runs.
        print("\n?? Quality Distribution:")
        print(f"  >90 (Excellent): {len(df[df['quality_score'] > 90])}")
        print(f"  70-90 (Good):    {len(df[(df['quality_score'] >= 70) & (df['quality_score'] <= 90)])}")
        print(f"  50-70 (Fair):    {len(df[(df['quality_score'] >= 50) & (df['quality_score'] < 70)])}")
        print(f"  <50 (Poor):      {len(df[df['quality_score'] < 50])}")

    return df


In [None]:
"""
Quick test: 1 image, all parameter combos.
Verify everything works before the full run.
"""
# NOTE: run_grid_search(resume=False) resets the checkpoint itself, so the
# explicit reset below is redundant. Either way this cell discards the
# progress of any earlier run before processing a single image.
checkpoint = CheckpointManager(CHECKPOINT_FILE)
checkpoint.reset()
quick_results = run_grid_search(max_images=1, resume=False)


In [None]:
"""
Full grid search on all images from TARGET.
Resumes from checkpoint if interrupted.
Press kernel interrupt to stop gracefully.
"""
results = run_grid_search()


In [None]:
"""
View summary statistics and top parameter combos.
"""
df = analyze_results()


In [None]:
"""
Clear checkpoint to start fresh on next run.
"""
# checkpoint = CheckpointManager(CHECKPOINT_FILE)
# checkpoint.reset()
# print("✅ Checkpoint cleared")