In [4]:
# Interactive (ipympl) matplotlib backend: the labeling cell below collects boxes
# through a RectangleSelector attached to the displayed figure.
%matplotlib widget


In [None]:
# Minimal, clean labeling UI for GoodNotes taxonomy
import os, json
from typing import List, Dict, Any
import fitz, numpy as np, cv2
from PIL import Image, ImageDraw, ImageFont
import matplotlib.pyplot as plt
from pathlib import Path

try:
    import ipywidgets as W
    from matplotlib.widgets import RectangleSelector
    from IPython.display import display, clear_output
    W_OK = True
except Exception:
    W_OK = False

# Config: every location hangs off the notebook's own directory, so no absolute
# path is hardcoded anywhere below.
if '__file__' in globals():
    NOTEBOOK_DIR = Path(__file__).parent
else:
    # No __file__ defined (e.g. an interactive session): use the working directory.
    NOTEBOOK_DIR = Path.cwd()
REPO_ROOT = NOTEBOOK_DIR

# Resolution (dots per inch) used when a PDF page is rasterized for labeling.
RASTER_DPI = 400

# Input PDFs are searched under data/; everything the tool writes goes under dataset/.
DATA_DIR = REPO_ROOT / "data"
DATASET_ROOT = REPO_ROOT / "dataset"
IMAGES_DIR = DATASET_ROOT / "images"
VIZ_DIR = DATASET_ROOT / "viz"
ANN_PATH = DATASET_ROOT / "annotations.json"

# Make sure both output folders exist before the UI is used.
for _out_dir in (IMAGES_DIR, VIZ_DIR):
    os.makedirs(_out_dir, exist_ok=True)

def discover_pdfs(root: Path) -> List[str]:
    """Return the sorted paths of every PDF file found anywhere under `root`.

    Args:
        root: Directory to search recursively.

    Returns:
        Sorted list of path strings; empty when `root` does not exist.

    The extension is matched case-insensitively so `.PDF` files are found on
    case-sensitive filesystems too, and only regular files are returned: a
    directory that merely ends in `.pdf` cannot be opened as a document.
    """
    if not root.exists():
        return []
    return sorted(
        str(entry)
        for entry in root.rglob("*")
        if entry.is_file() and entry.suffix.lower() == ".pdf"
    )

# Snapshot of the PDFs under DATA_DIR at the time this cell runs. It seeds the PDF
# dropdown; the "Rescan PDFs" handler updates the dropdown options, not this list.
PDF_PATHS = discover_pdfs(DATA_DIR)

# Label taxonomy: one (class name, RGB 0-255 display color) pair per class.
# The position in this table is the numeric category id.
_CLASS_SPECS = [
    ("handwriting_paragraph", (230, 25, 75)),     # red
    ("handwriting_explanatory", (60, 180, 75)),   # green
    ("handwriting_drawing", (255, 225, 25)),      # yellow
    ("inserted_image", (0, 130, 200)),            # blue
    ("formula", (245, 130, 48)),                  # orange
    ("heading", (145, 30, 180)),                  # purple
]

# Class names in id order.
CLASSES = [spec_name for spec_name, _ in _CLASS_SPECS]
# Reverse lookup: class name -> category id.
CLASS_TO_ID = dict(zip(CLASSES, range(len(CLASSES))))

# Per-class colors (RGB 0-255), indexed by category id.
CLASS_COLOR_RGB = [spec_rgb for _, spec_rgb in _CLASS_SPECS]

def rgb255_to_mpl(rgb: tuple) -> tuple:
    """Scale an (R, G, B) triple from the 0-255 range to the 0.0-1.0 floats matplotlib takes.

    Only the first three items of `rgb` are read.
    """
    red, green, blue = rgb[0], rgb[1], rgb[2]
    scale = 255.0
    return (red / scale, green / scale, blue / scale)

# Utils
def page_to_image(pdf_path: str, page_idx: int, dpi: int) -> Image.Image:
    """Rasterize one page of a PDF into an RGB PIL image.

    Args:
        pdf_path: Path of the PDF to open.
        page_idx: Index of the page inside the document, as accepted by `doc[...]`.
        dpi: Target resolution; the page is scaled by `dpi / 72` on both axes.

    Returns:
        The rendered page as an RGB `PIL.Image.Image`.

    Errors raised by fitz (unreadable file, page index not in the document)
    propagate to the caller.
    """
    zoom = dpi / 72
    # Open the document as a context manager so its file handle is released after
    # every call, including when rendering fails; previously it was never closed.
    with fitz.open(pdf_path) as doc:
        page = doc[page_idx]
        # alpha=False requests a pixmap without an alpha channel, which is what the
        # 3-channel reshape below relies on.
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)
        arr = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, 3)
        return Image.fromarray(arr, mode='RGB')

# State shared by all UI callbacks (one page at a time).
CURRENT = dict(
    pdf_path=None,   # path of the PDF the loaded page came from
    page_idx=0,      # index of the loaded page
    image=None,      # rasterized page (PIL image); None until a page is loaded
    boxes=[],        # [x0, y0, x1, y1] boxes in image pixel coordinates
    labels=[],       # category id per box, parallel to `boxes`
)

# Proposals (optional)
def proposals_morph(img: Image.Image, blockSize=41, C=12, close_k=9, close_it=2, open_k=3, open_it=1, area_min_frac=0.0002):
    """Propose candidate boxes from a page image with thresholding and morphology.

    The page is converted to grayscale, binarized with an inverted adaptive mean
    threshold, closed and then opened with square kernels, and the bounding
    rectangle of every external contour becomes a candidate.

    Args:
        img: Page image; it is converted to RGB before processing.
        blockSize: Adaptive-threshold neighbourhood size (forced odd via `| 1`).
        C: Constant passed to the adaptive threshold.
        close_k, close_it: Kernel side and iteration count of the closing step.
        open_k, open_it: Kernel side and iteration count of the opening step.
        area_min_frac: Minimum box area as a fraction of the whole image area.

    Returns:
        List of `[x0, y0, x1, y1]` boxes. Candidates smaller than the minimum
        area, or under 10 px in width or height, are dropped.
    """
    grayscale = cv2.cvtColor(np.array(img.convert('RGB')), cv2.COLOR_RGB2GRAY)
    odd_block = int(blockSize) | 1
    binary = cv2.adaptiveThreshold(
        grayscale, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY_INV, odd_block, int(C)
    )

    close_side, open_side = int(close_k), int(open_k)
    close_kernel = np.ones((close_side, close_side), np.uint8)
    open_kernel = np.ones((open_side, open_side), np.uint8)
    merged = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, close_kernel, iterations=int(close_it))
    cleaned = cv2.morphologyEx(merged, cv2.MORPH_OPEN, open_kernel, iterations=int(open_it))

    contours, _ = cv2.findContours(cleaned, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    mask_h, mask_w = cleaned.shape
    min_area = (mask_w*mask_h) * float(area_min_frac)

    rects = (cv2.boundingRect(contour) for contour in contours)
    return [
        [left, top, left+bw, top+bh]
        for (left, top, bw, bh) in rects
        if not (bw*bh < min_area or bh < 10 or bw < 10)
    ]

# UI
# The whole UI lives in the else-branch: without ipywidgets (W_OK is False) only a
# notice is printed and none of the widgets or handlers below are defined.
if not W_OK:
    print("ipywidgets not available")
else:
    # Class assigned to newly drawn / proposed boxes.
    class_dd = W.Dropdown(options=CLASSES, value=CLASSES[0], description='Class')
    # Source selection: which PDF and which page index to load.
    pdf_dd = W.Dropdown(options=PDF_PATHS, description='PDF')
    # Passed as page_idx to page_to_image by the Load handler.
    page_inp = W.IntText(value=0, description='Page')
    load_btn = W.Button(description='Load', button_style='primary')
    save_btn = W.Button(description='Save', button_style='success')
    clear_btn = W.Button(description='Clear')
    list_btn = W.Button(description='List')
    del_last_btn = W.Button(description='Del last')
    # 1-based position, matching the numbering printed by the List handler.
    del_idx_inp = W.IntText(value=1, description='Del idx')
    del_idx_btn = W.Button(description='Del by idx')
    rescan_btn = W.Button(description='Rescan PDFs')

    # Automatic proposals: the two sliders feed proposals_morph's blockSize and C.
    prop_btn = W.Button(description='Propose+Add', button_style='warning')
    # Starts at 21 with step 2, so the slider only yields odd block sizes.
    bs_w = W.IntSlider(value=41, min=21, max=101, step=2, description='Blk')
    c_w = W.IntSlider(value=12, min=0, max=25, step=1, description='C')

    # Output area the figure and all status messages are rendered into.
    out = W.Output()
    # Module-level handle on the active RectangleSelector; refresh() replaces it.
    selector = None

    def refresh():
        """Redraw the loaded page with its boxes inside `out` and arm a fresh selector.

        Does nothing but print a hint when no page is loaded. Otherwise a new
        figure is created showing the page image, one colored rectangle and class
        label per box in CURRENT, and a RectangleSelector wired to `onselect`.
        The selector is stored in the module-level `selector` so it stays
        referenced while the figure is shown.
        """
        global selector
        with out:
            clear_output(wait=True)
            if CURRENT['image'] is None:
                print('Select PDF/Page and press Load')
                return
            # Retire the previous view before building the new one. Clearing the
            # output only removes the display; pyplot would otherwise keep every
            # earlier figure (each holding a full-resolution page) alive.
            stale = selector
            selector = None
            if stale is not None:
                try:
                    stale.disconnect_events()
                except Exception:
                    pass  # best effort: the old canvas may already be gone
                plt.close(stale.ax.figure)
            fig, ax = plt.subplots(1,1, figsize=(10,14))
            ax.imshow(np.array(CURRENT['image']))
            for (x0,y0,x1,y1), cls_idx in zip(CURRENT['boxes'], CURRENT['labels']):
                cls_idx = int(cls_idx)  # one normalized index for both color and name lookup
                edge = rgb255_to_mpl(CLASS_COLOR_RGB[cls_idx])
                rect = plt.Rectangle((x0,y0), x1-x0, y1-y0, fill=False, edgecolor=edge, linewidth=2.5)
                ax.add_patch(rect)
                # Label sits just above the box, clamped so it never leaves the top edge.
                ax.text(x0, max(0, y0-6), CLASSES[cls_idx], color='white', fontsize=9,
                        bbox=dict(facecolor=edge, alpha=0.7, pad=1))
            ax.set_axis_off()
            # Left button only; drags shorter than 5 data units are ignored.
            selector = RectangleSelector(ax, onselect, useblit=False, button=[1], minspanx=5, minspany=5, spancoords='data', interactive=True)
            plt.show()

    def onselect(eclick, erelease):
        """RectangleSelector callback: store the dragged rectangle as a new box.

        The press/release events are normalized to `[x0, y0, x1, y1]` with
        integer coordinates (x0 <= x1, y0 <= y1), labelled with the class chosen
        in the dropdown, and the view is redrawn. Events without data
        coordinates are ignored.
        """
        if eclick.xdata is None or erelease.xdata is None:
            return
        xs = (eclick.xdata, erelease.xdata)
        ys = (eclick.ydata, erelease.ydata)
        left, top = int(min(xs)), int(min(ys))
        right, bottom = int(max(xs)), int(max(ys))
        CURRENT['boxes'].append([left, top, right, bottom])
        CURRENT['labels'].append(CLASS_TO_ID[class_dd.value])
        refresh()

    def on_rescan(_):
        """Button handler: search DATA_DIR again and refill the PDF dropdown."""
        found = discover_pdfs(DATA_DIR)
        pdf_dd.options = found
        n_found = len(found)
        with out:
            print(f'Found {n_found} PDFs under data/')

    def on_load(_):
        """Button handler: rasterize the selected PDF page and make it the current page.

        The page is rendered first and CURRENT is only updated once rendering
        succeeded, so a failed load (bad page index, unreadable file) leaves the
        previously loaded page, its boxes and its identity untouched instead of
        pairing the old image with the new path/page. Failures are reported in
        the output area.
        """
        if not pdf_dd.options:
            with out:
                print('No PDFs found in data/. Place files into data/.')
            return
        pdf_path = pdf_dd.value
        page_idx = int(page_inp.value)
        try:
            image = page_to_image(pdf_path, page_idx, RASTER_DPI)
        except Exception as exc:
            with out:
                print(f'Could not load page {page_idx} of {pdf_path}: {exc}')
            return
        CURRENT['pdf_path'] = pdf_path
        CURRENT['page_idx'] = page_idx
        CURRENT['image'] = image
        CURRENT['boxes'] = []
        CURRENT['labels'] = []
        # refresh() draws the page and arms the RectangleSelector on its axes; no
        # second selector is created here, which would double up on the same axes.
        refresh()

    def on_save(_):
        """Button handler: persist the loaded page, a box-overlay preview and its record.

        Writes three things:
          * the clean page image to IMAGES_DIR,
          * a copy with boxes and class labels drawn on it to VIZ_DIR,
          * one record in ANN_PATH (image path, size, source PDF, page index and
            the boxes with their category ids), replacing any earlier record for
            the same image path.

        The file name is built from the PDF's parent folder name, the PDF name
        and the page index. ANN_PATH is read and written as UTF-8 because it is
        dumped with ensure_ascii=False. An existing file that cannot be parsed
        as a JSON list is moved aside (suffix `.corrupt`) rather than silently
        overwritten, and the new content is written to a temporary file that then
        replaces ANN_PATH in one step.
        """
        if CURRENT['image'] is None or CURRENT['pdf_path'] is None:
            with out:
                print('Load a page first')
            return
        src_dir = os.path.basename(os.path.dirname(CURRENT['pdf_path']))
        pdf_base = os.path.splitext(os.path.basename(CURRENT['pdf_path']))[0]

        def sanitize(s: str) -> str:
            # Keep alphanumerics, '_' and '-'; every other character becomes '_'.
            return ''.join(ch if ch.isalnum() or ch in ('_', '-') else '_' for ch in s.replace(' ', '_'))

        base = f"{sanitize(src_dir)}__{sanitize(pdf_base)}__p{CURRENT['page_idx']}.png"
        img_path = IMAGES_DIR / base
        viz_path = VIZ_DIR / base
        # Recreate the output folders in case they were removed after setup ran.
        os.makedirs(IMAGES_DIR, exist_ok=True)
        os.makedirs(VIZ_DIR, exist_ok=True)
        CURRENT['image'].save(img_path)

        # draw viz overlay (match UI colors/thickness)
        viz = CURRENT['image'].copy()
        draw = ImageDraw.Draw(viz)
        # try to get a readable font; fallback to default
        try:
            font = ImageFont.truetype("DejaVuSans.ttf", 16)
        except Exception:
            font = ImageFont.load_default()
        for (x0,y0,x1,y1), cls_idx in zip(CURRENT['boxes'], CURRENT['labels']):
            rgb = CLASS_COLOR_RGB[int(cls_idx)]
            # thicker rectangle via multiple offsets to simulate stroke
            for off in (-1,0,1):
                draw.rectangle([x0-off,y0-off,x1+off,y1+off], outline=rgb, width=1)
            # label background rectangle, placed above the box and clamped to the top edge
            label = CLASSES[int(cls_idx)]
            tw, th = draw.textbbox((0,0), label, font=font)[2:]
            pad = 2
            y_text = max(0, y0 - th - 4)
            draw.rectangle([x0, y_text, x0 + tw + 2*pad, y_text + th + 2*pad], fill=rgb)
            draw.text((x0 + pad, y_text + pad), label, fill=(255,255,255), font=font)
        viz.save(viz_path)

        # build record with repo-relative paths
        try:
            src_rel = str(Path(CURRENT['pdf_path']).relative_to(REPO_ROOT))
        except ValueError:
            # PDF lives outside REPO_ROOT: keep the path as given.
            src_rel = CURRENT['pdf_path']
        img_rel = str(img_path.relative_to(REPO_ROOT))
        rec = {
            'image': img_rel,
            'width': CURRENT['image'].width,
            'height': CURRENT['image'].height,
            'source_pdf': src_rel,
            'page_idx': int(CURRENT['page_idx']),
            'annotations': [{'bbox': b, 'category_id': int(c)} for b,c in zip(CURRENT['boxes'], CURRENT['labels'])]
        }

        db = []
        if ANN_PATH.exists():
            try:
                with open(ANN_PATH, 'r', encoding='utf-8') as f:
                    db = json.load(f)
                if not isinstance(db, list):
                    raise ValueError('top-level JSON value is not a list')
            except ValueError as exc:
                # Covers malformed JSON and undecodable bytes. Starting over with an
                # empty list is fine, but the old file is preserved so earlier
                # annotations can still be recovered by hand.
                backup_path = ANN_PATH.with_name(ANN_PATH.name + '.corrupt')
                os.replace(ANN_PATH, backup_path)
                with out:
                    print(f'Warning: could not read {ANN_PATH.name} ({exc}); moved it to {backup_path.name} and started a new file')
                db = []
        # replace any prior record with the same image path
        db = [r for r in db if not (isinstance(r, dict) and r.get('image') == img_rel)]
        db.append(rec)
        os.makedirs(DATASET_ROOT, exist_ok=True)
        # Write next to the target and swap it in, so an interrupted dump cannot
        # leave a truncated annotations file behind.
        tmp_path = ANN_PATH.with_name(ANN_PATH.name + '.tmp')
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(db, f, ensure_ascii=False, indent=2)
        os.replace(tmp_path, ANN_PATH)
        with out:
            print('Saved', img_rel, 'boxes:', len(rec['annotations']))

    def on_clear(_):
        """Button handler: drop every box and label of the current page, then redraw."""
        CURRENT.update(boxes=[], labels=[])
        refresh()

    def on_list(_):
        """Button handler: print the current boxes, numbered from 1, with their class names."""
        with out:
            print('Boxes:')
            numbered = enumerate(zip(CURRENT['boxes'], CURRENT['labels']), start=1)
            for pos, (box, cls_idx) in numbered:
                x0, y0, x1, y1 = box
                print(f" {pos:>3}: {CLASSES[cls_idx]} -> [{x0},{y0},{x1},{y1}]")

    def on_del_last(_):
        """Button handler: remove the most recently added box (if any) and redraw."""
        if not CURRENT['boxes']:
            return
        CURRENT['boxes'].pop()
        CURRENT['labels'].pop()
        refresh()

    def on_del_by_idx(_):
        """Button handler: remove the box at the 1-based position typed into 'Del idx'.

        Positions outside the current list are ignored.
        """
        pos = int(del_idx_inp.value) - 1
        if pos < 0 or pos >= len(CURRENT['boxes']):
            return
        del CURRENT['boxes'][pos]
        del CURRENT['labels'][pos]
        refresh()

    def on_props(_):
        """Button handler: add every automatic proposal as a box of the selected class.

        The 'Blk' and 'C' sliders are handed to proposals_morph as its block size
        and threshold constant; the proposals are appended to the existing boxes.
        """
        if CURRENT['image'] is None:
            with out:
                print('Load first')
            return
        proposed = proposals_morph(CURRENT['image'], bs_w.value, c_w.value)
        cls_id = CLASS_TO_ID[class_dd.value]
        CURRENT['boxes'].extend(proposed)
        CURRENT['labels'].extend([cls_id] * len(proposed))
        refresh()

    # Connect every button to its handler.
    for _button, _handler in (
        (load_btn, on_load),
        (save_btn, on_save),
        (clear_btn, on_clear),
        (list_btn, on_list),
        (del_last_btn, on_del_last),
        (del_idx_btn, on_del_by_idx),
        (prop_btn, on_props),
        (rescan_btn, on_rescan),
    ):
        _button.on_click(_handler)

    # Layout: source selection, box actions, proposal controls, then the output area.
    source_row = W.HBox([pdf_dd, page_inp, class_dd, rescan_btn])
    action_row = W.HBox([load_btn, save_btn, clear_btn, list_btn, del_last_btn, del_idx_inp, del_idx_btn])
    proposal_row = W.HBox([prop_btn, bs_w, c_w])
    ui = W.VBox([source_row, action_row, proposal_row, out])
    display(ui)
    print('Select PDF/Page, Load, draw boxes (left-drag), change Class as needed, Save.')


VBox(children=(HBox(children=(Dropdown(description='PDF', options=('/Users/aeshef/Desktop/CAREER & STUDIES/ВШЭ…

Select PDF/Page, Load, draw boxes (left-drag), change Class as needed, Save.


In [6]:
# Deduplicate annotations.json by (source_pdf,page_idx) when available; else by image; drop missing images
import os, json
from pathlib import Path

# Relies on ANN_PATH and REPO_ROOT from the labeling cell above.
if Path(ANN_PATH).exists():
    # Read and write as UTF-8 explicitly: the file is dumped with
    # ensure_ascii=False, so non-ASCII paths are stored as raw characters and the
    # platform's default encoding may not be able to represent them.
    with open(ANN_PATH, 'r', encoding='utf-8') as f:
        db = json.load(f)

    def key_for(rec: dict) -> tuple:
        """Identity of a record: (source_pdf, page_idx) when both exist, else (image, -1)."""
        sp = rec.get('source_pdf'); pi = rec.get('page_idx')
        if sp is not None and pi is not None:
            return (sp, int(pi))
        img = rec.get('image','')
        return (img, -1)

    seen = set()
    cleaned = []
    removed = 0   # records dropped as duplicates of an earlier record
    missing = 0   # records dropped because their image path is empty or not on disk

    for rec in db:
        img = rec.get('image') or ''
        img_path = Path(img)
        # Stored image paths are repo-relative; resolve them against REPO_ROOT.
        if not img_path.is_absolute():
            img_path = Path(REPO_ROOT) / img_path
        if not img or not img_path.exists():
            missing += 1
            continue
        k = key_for(rec)
        # NOTE(review): the first record with a given key wins. The Save handler
        # appends new records at the end of the file, so this keeps the older
        # record of a duplicate pair - confirm that is the intended policy.
        if k in seen:
            removed += 1
            continue
        seen.add(k)
        cleaned.append(rec)

    with open(ANN_PATH, 'w', encoding='utf-8') as f:
        json.dump(cleaned, f, ensure_ascii=False, indent=2)
    print(f'Deduplicated: kept {len(cleaned)}, removed {removed}, missing {missing}')
else:
    print('No annotations.json yet')


Deduplicated: kept 1, removed 0, missing 0
