## Setup

In [None]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# Import preprocessing modules
from core.pdf.pdf_loader import load_pdf_to_images
from core.preprocessing.preprocess import run_full_preprocessing

# Import OCR modules (when implemented)
# from core.ocr.ocr_engine import extract_text_from_image
# from core.ocr.postprocess_text import clean_ocr_output, merge_text_lines

# OCR libraries (uncomment when installed)
# import pytesseract
# import easyocr

%matplotlib inline

## Load and Preprocess Sample

In [None]:
# Load the first page of the sample PDF, preprocess it, and plot the results.
pdf_path = Path("data/attention is all you need.pdf")

if not pdf_path.exists():
    # Nothing to load; report the missing file.
    print(f"PDF not found at {pdf_path}")
else:
    # Load with dpi=300 (higher DPI for better OCR) and keep only the first page
    pages = load_pdf_to_images(str(pdf_path), dpi=300)
    sample_page = pages[0]

    # Preprocess the first page; the result is indexed by name below
    results = run_full_preprocessing(sample_page)

    print(f"Loaded page with shape: {sample_page.shape}")

    # One subplot per preprocessing output: (key in `results`, panel title)
    panels = (
        ('gray', 'Grayscale'),
        ('otsu', 'Otsu Binary'),
        ('adaptive', 'Adaptive Binary'),
    )
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    for ax, (key, title) in zip(axes, panels):
        ax.imshow(results[key], cmap='gray')
        ax.set_title(title)
        ax.axis('off')
    plt.show()

## Test Tesseract OCR

In [None]:
# TODO: Test Tesseract OCR
# Requires the pytesseract Python wrapper:
# pip install pytesseract
# The Tesseract OCR engine itself must also be installed separately

# Example:
# import pytesseract
# 
# # Test on grayscale
# text_gray = pytesseract.image_to_string(results['gray'])
# print("Text from grayscale:")
# print(text_gray[:500])  # First 500 characters
# 
# # Test on binary
# text_binary = pytesseract.image_to_string(results['otsu'])
# print("\nText from Otsu binary:")
# print(text_binary[:500])

## Test EasyOCR

In [None]:
# TODO: Test EasyOCR (deep learning based)
# Requires easyocr installation:
# pip install easyocr

# Example:
# import easyocr
# 
# reader = easyocr.Reader(['en'])
# result = reader.readtext(results['gray'], detail=0)
# text_easyocr = '\n'.join(result)
# 
# print("Text from EasyOCR:")
# print(text_easyocr[:500])

## Compare OCR Results on Different Preprocessing

In [None]:
# TODO: Compare OCR accuracy on different preprocessing methods
# - Original grayscale
# - Histogram equalized
# - CLAHE
# - Otsu binary
# - Adaptive binary
# 
# Create a comparison table of:
# - Word count
# - Character count
# - Confidence scores
# - Processing time

## Text Cleaning and Postprocessing

In [None]:
# TODO: Test text cleaning functions
# Once postprocess_text module is implemented

# Example:
# from core.ocr.postprocess_text import clean_ocr_output
# 
# raw_text = "He11o    W0rld\n\n\n   Test"
# cleaned = clean_ocr_output(raw_text)
# 
# print("Raw:")
# print(repr(raw_text))
# print("\nCleaned:")
# print(repr(cleaned))

## OCR on Segmented Regions

In [None]:
# TODO: Once segmentation is implemented, test OCR on individual text blocks
# This should give better results than full-page OCR

# Example:
# from core.segmentation import segment_page_into_regions, extract_text_blocks
# 
# regions = segment_page_into_regions(results['otsu'])
# text_blocks = extract_text_blocks(sample_page, regions)
# 
# for idx, block in enumerate(text_blocks):
#     text = pytesseract.image_to_string(block)
#     print(f"\n--- Block {idx + 1} ---")
#     print(text)

## Quality Metrics

In [None]:
# TODO: If ground truth is available, calculate quality metrics
# - Character Error Rate (CER)
# - Word Error Rate (WER)
# - BLEU score

# Example:
# def calculate_wer(reference, hypothesis):
#     ref_words = reference.split()
#     hyp_words = hypothesis.split()
#     # Calculate Levenshtein distance
#     # Return error rate
#     pass

## Export Results

In [None]:
# TODO: Save OCR results to output directory
# output_dir = Path("outputs/ocr_text")
# output_dir.mkdir(parents=True, exist_ok=True)
# 
# with open(output_dir / "page_001.txt", 'w', encoding='utf-8') as f:
#     f.write(extracted_text)