# Grid Search Results Analysis

Analyze OCR grid search results. Run all cells top to bottom.

In [1]:
import pandas as pd
from pathlib import Path

BASE_DIR = Path.home() / "Documents" / "tibetan-ocr-app"
OUTPUT_DIR = BASE_DIR / "grid_search_results"
CSV_PATH = OUTPUT_DIR / "summary.csv"

df = pd.read_csv(CSV_PATH)
successful = df[df['success'] == True]
failed = df[df['success'] == False]

print(f"Total results: {len(df)}")
print(f"Successful: {len(successful)}")
print(f"Failed: {len(failed)}")
print(f"Images tested: {df['image_name'].nunique()}")


Total results: 1728
Successful: 1728
Failed: 0
Images tested: 1


In [2]:
print("=" * 70)
print("AVERAGE QUALITY SCORE BY PARAMETER")
print("=" * 70)

for param in ['ocr_model_name', 'line_mode', 'k_factor', 'bbox_tolerance',
              'merge_lines', 'tps_threshold', 'class_threshold']:
    print(f"\n--- {param} ---")
    print(successful.groupby(param)['quality_score'].mean().sort_values(ascending=False))


AVERAGE QUALITY SCORE BY PARAMETER

--- ocr_model_name ---
ocr_model_name
Woodblock-Stacks    64.782708
Modern              59.077292
Woodblock           41.793958
Name: quality_score, dtype: float64

--- line_mode ---
line_mode
line      57.721528
layout    52.714444
Name: quality_score, dtype: float64

--- k_factor ---
k_factor
2.5    55.353750
2.0    55.240208
3.0    55.060000
Name: quality_score, dtype: float64

--- bbox_tolerance ---
bbox_tolerance
2.5    60.267222
3.5    56.002500
4.0    54.524722
5.0    50.077500
Name: quality_score, dtype: float64

--- merge_lines ---
merge_lines
True     60.145833
False    50.290139
Name: quality_score, dtype: float64

--- tps_threshold ---
tps_threshold
0.10    55.217986
0.25    55.217986
0.50    55.217986
0.90    55.217986
Name: quality_score, dtype: float64

--- class_threshold ---
class_threshold
0.7    55.217986
0.8    55.217986
0.9    55.217986
Name: quality_score, dtype: float64


In [3]:
print("=" * 70)
print("TOP 20 PARAMETER COMBINATIONS")
print("=" * 70)

top20 = successful.nlargest(20, 'quality_score')[
    ['image_name', 'ocr_model_name', 'line_mode', 'k_factor',
     'bbox_tolerance', 'merge_lines', 'tps_threshold',
     'class_threshold', 'quality_score', 'num_lines_detected']
]
print(top20.to_string(index=False))


TOP 20 PARAMETER COMBINATIONS
                     image_name   ocr_model_name line_mode  k_factor  bbox_tolerance  merge_lines  tps_threshold  class_threshold  quality_score  num_lines_detected
uchen high quality pdf - page 1 Woodblock-Stacks      line       2.5             5.0         True           0.10              0.7          100.0                  10
uchen high quality pdf - page 1 Woodblock-Stacks      line       2.5             5.0         True           0.25              0.7          100.0                  10
uchen high quality pdf - page 1 Woodblock-Stacks      line       2.5             5.0         True           0.50              0.7          100.0                  10
uchen high quality pdf - page 1 Woodblock-Stacks      line       2.5             5.0         True           0.90              0.7          100.0                  10
uchen high quality pdf - page 1 Woodblock-Stacks      line       3.0             5.0         True           0.10              0.7          100.0 

In [4]:
print("=" * 70)
print("BOTTOM 20 PARAMETER COMBINATIONS (successful only)")
print("=" * 70)

bottom20 = successful.nsmallest(20, 'quality_score')[
    ['image_name', 'ocr_model_name', 'line_mode', 'k_factor',
     'bbox_tolerance', 'merge_lines', 'tps_threshold',
     'class_threshold', 'quality_score', 'num_lines_detected']
]
print(bottom20.to_string(index=False))


BOTTOM 20 PARAMETER COMBINATIONS (successful only)
                     image_name ocr_model_name line_mode  k_factor  bbox_tolerance  merge_lines  tps_threshold  class_threshold  quality_score  num_lines_detected
uchen high quality pdf - page 1      Woodblock    layout       2.5             5.0        False           0.10              0.7          22.58                  25
uchen high quality pdf - page 1      Woodblock    layout       2.5             5.0        False           0.25              0.7          22.58                  25
uchen high quality pdf - page 1      Woodblock    layout       2.5             5.0        False           0.50              0.7          22.58                  25
uchen high quality pdf - page 1      Woodblock    layout       2.5             5.0        False           0.90              0.7          22.58                  25
uchen high quality pdf - page 1      Woodblock    layout       3.0             5.0        False           0.10              0.7       

In [5]:
print("=" * 70)
print("QUALITY DISTRIBUTION")
print("=" * 70)
print(f"  >90 (Excellent): {len(successful[successful['quality_score'] > 90])}")
print(f"  70-90 (Good):    {len(successful[(successful['quality_score'] >= 70) & (successful['quality_score'] <= 90)])}")
print(f"  50-70 (Fair):    {len(successful[(successful['quality_score'] >= 50) & (successful['quality_score'] < 70)])}")
print(f"  <50 (Poor):      {len(successful[successful['quality_score'] < 50])}")


QUALITY DISTRIBUTION
  >90 (Excellent): 60
  70-90 (Good):    204
  50-70 (Fair):    876
  <50 (Poor):      588


In [6]:
print("=" * 70)
print("BEST vs WORST OCR OUTPUT")
print("=" * 70)

best = successful.nlargest(1, 'quality_score').iloc[0]
worst = successful.nsmallest(1, 'quality_score').iloc[0]

best_dir = OUTPUT_DIR / best['file_name'] / best['image_name']
worst_dir = OUTPUT_DIR / worst['file_name'] / worst['image_name']

best_files = list(best_dir.glob(f"{best['ocr_model_name']}_{best['line_mode']}_*"))
worst_files = list(worst_dir.glob(f"{worst['ocr_model_name']}_{worst['line_mode']}_*"))

if best_files:
    print(f"\nBEST (score={best['quality_score']}, model={best['ocr_model_name']}, mode={best['line_mode']}):")
    print("-" * 40)
    with open(best_files[0], 'r', encoding='utf-8') as f:
        print(f.read())

if worst_files:
    print(f"\nWORST (score={worst['quality_score']}, model={worst['ocr_model_name']}, mode={worst['line_mode']}):")
    print("-" * 40)
    with open(worst_files[0], 'r', encoding='utf-8') as f:
        print(f.read())


BEST vs WORST OCR OUTPUT

BEST (score=100.0, model=Woodblock-Stacks, mode=line):
----------------------------------------
OCR RESULT

File: uchen high quality pdf
Image: uchen high quality pdf - page 1

PARAMETERS:
  OCR Model: Woodblock-Stacks
  Line Mode: line
  Class Threshold: 0.9
  K-Factor: 2.5
  BBox Tolerance: 2.5
  Merge Lines: True
  TPS Threshold: 0.5

RESULTS:
  Success: True
  Lines Detected: 10
  Processing Time: 7.12s

QUALITY METRICS:
  Quality Score: 75.00/100
  Total Tokens: 12
  Valid Words: 9
  Invalid Words: 3

OCR TEXT

ལྟེ་བཿ།་
་
་
འརྩ་
སི་ཏུ་པཎ་ཆེན་ཆོས་ཀྱི་འབྱུང་གནས་ཀྱི་ཕྱོགས་སྒྲིགས་་
༅ཿ་
་ས་་་་།་
་
་
་་

WORST (score=22.58, model=Woodblock, mode=layout):
----------------------------------------
OCR RESULT

File: uchen high quality pdf
Image: uchen high quality pdf - page 1

PARAMETERS:
  OCR Model: Woodblock
  Line Mode: layout
  Class Threshold: 0.7
  K-Factor: 3.0
  BBox Tolerance: 3.5
  Merge Lines: True
  TPS Threshold: 0.5

RESULTS:
  Success: True
  Lines

In [7]:
# Change the parameters below to look at a specific result
MODEL = "Woodblock-Stacks"
MODE = "line"
IMAGE = "uchen high quality pdf - page 1"
FILE = "uchen high quality pdf"

result_dir = OUTPUT_DIR / FILE / IMAGE
matches = list(result_dir.glob(f"{MODEL}_{MODE}_*"))
print(f"Found {len(matches)} results for {MODEL}/{MODE}")
if matches:
    print(f"\nShowing: {matches[0].name}")
    print("-" * 40)
    with open(matches[0], 'r', encoding='utf-8') as f:
        print(f.read())

Found 288 results for Woodblock-Stacks/line

Showing: Woodblock-Stacks_line_k2.5_bbox2.5_merge-T_tps0.5_conf0.9.txt
----------------------------------------
OCR RESULT

File: uchen high quality pdf
Image: uchen high quality pdf - page 1

PARAMETERS:
  OCR Model: Woodblock-Stacks
  Line Mode: line
  Class Threshold: 0.9
  K-Factor: 2.5
  BBox Tolerance: 2.5
  Merge Lines: True
  TPS Threshold: 0.5

RESULTS:
  Success: True
  Lines Detected: 10
  Processing Time: 7.12s

QUALITY METRICS:
  Quality Score: 75.00/100
  Total Tokens: 12
  Valid Words: 9
  Invalid Words: 3

OCR TEXT

ལྟེ་བཿ།་
་
་
འརྩ་
སི་ཏུ་པཎ་ཆེན་ཆོས་ཀྱི་འབྱུང་གནས་ཀྱི་ཕྱོགས་སྒྲིགས་་
༅ཿ་
་ས་་་་།་
་
་
་་
