## Setup

In [1]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# Import preprocessing modules
from core.pdf.pdf_loader import load_pdf_to_images
from core.preprocessing.preprocess import run_full_preprocessing

# Import segmentation modules (when implemented)
# from core.segmentation.segmenter import segment_page_into_regions
# from core.segmentation.morphology_utils import apply_dilation, apply_erosion

%matplotlib inline
plt.rcParams['figure.figsize'] = (15, 10)

ModuleNotFoundError: No module named 'core'

## Load and Preprocess Sample Page

In [None]:
# Load a sample page.
# The path is relative, so it is resolved against the notebook's working
# directory.
pdf_path = Path("data/attention is all you need.pdf")

if not pdf_path.exists():
    print(f"PDF not found at {pdf_path}")
else:
    # Rasterize the PDF; `pages` is indexed below, so it is expected to be a
    # sequence of page images.
    pages = load_pdf_to_images(str(pdf_path), dpi=200)

    if len(pages) == 0:
        # Guard against an empty result: indexing pages[0] would otherwise
        # raise IndexError and hide the real problem (nothing was rendered).
        print(f"No pages could be loaded from {pdf_path}")
    else:
        sample_page = pages[0]

        # Preprocess
        results = run_full_preprocessing(sample_page)
        binary = results['otsu']  # Use Otsu binarization as input for segmentation

        print(f"Loaded page with shape: {sample_page.shape}")
        print(f"Binary image shape: {binary.shape}")

## Morphological Operations Experiments

In [None]:
# TODO: Implement morphological operations testing
# Once the morphology_utils module is implemented, test:
# - Dilation to connect text components
# - Erosion to separate touching regions
# - Opening to remove noise
# - Closing to fill gaps

# Example:
# kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
# dilated = cv2.dilate(binary, kernel, iterations=2)
# 
# fig, axes = plt.subplots(1, 2)
# axes[0].imshow(binary, cmap='gray')
# axes[0].set_title('Original Binary')
# axes[1].imshow(dilated, cmap='gray')
# axes[1].set_title('After Dilation')
# plt.show()

## Connected Component Analysis

In [None]:
# TODO: Implement connected component analysis
# Use cv2.connectedComponentsWithStats to find regions
# Analyze and classify each component

# Example:
# num_labels, labels, stats, centroids = cv2.connectedComponentsWithStats(binary, connectivity=8)
# 
# print(f"Found {num_labels - 1} components (excluding background)")
# 
# # Visualize components
# colored_labels = np.zeros((binary.shape[0], binary.shape[1], 3), dtype=np.uint8)
# for label in range(1, num_labels):
#     colored_labels[labels == label] = np.random.randint(0, 255, 3)
# 
# plt.imshow(colored_labels)
# plt.title('Connected Components')
# plt.axis('off')
# plt.show()

## Text vs. Figure Classification

In [None]:
# TODO: Develop classification heuristics
# Features to consider:
# - Aspect ratio (width/height)
# - Area (in pixels)
# - Density (filled pixels / bounding box area)
# - Position on page
# - Proximity to other components

# Example classification logic:
# def classify_component(stats):
#     x, y, w, h, area = stats
#     aspect_ratio = w / h if h > 0 else 0
#     
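#     # Figures often have aspect ratio closer to 1 and cover a large area.
#     # Test for them first: a large region with a moderate aspect ratio
#     # would otherwise always match the broader 'text' test below.
#     if aspect_ratio > 0.5 and area > 5000:
#         return 'figure'
#     # Text typically has aspect ratio between 0.1 and 10
#     elif 0.1 < aspect_ratio < 10 and area > 100:
#         return 'text'
#     else:
#         return 'noise'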
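# 
# # Classify every component (label 0 is the background)
# classifications = [classify_component(stats[label]) for label in range(1, num_labels)]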

## Visualize Segmentation Results

In [None]:
# TODO: Once segmentation is implemented, visualize results
# regions = segment_page_into_regions(binary)
# 
# # Draw bounding boxes on original image
# display_img = sample_page.copy()
# 
# for text_block in regions['text_blocks']:
#     x, y, w, h = text_block['bbox']
#     cv2.rectangle(display_img, (x, y), (x+w, y+h), (0, 255, 0), 2)  # Green for text
# 
# for figure_block in regions['figure_blocks']:
#     x, y, w, h = figure_block['bbox']
#     cv2.rectangle(display_img, (x, y), (x+w, y+h), (255, 0, 0), 2)  # Blue for figures
# 
# plt.figure(figsize=(12, 16))
# plt.imshow(cv2.cvtColor(display_img, cv2.COLOR_BGR2RGB))
# plt.title('Segmentation Results: Green=Text, Blue=Figures')
# plt.axis('off')
# plt.show()

## Extract and Display Crops

In [None]:
# TODO: Extract and display individual text and figure crops
# Useful for OCR testing and quality assessment