# GroundedDINO + SAM2 — Чистый пайплайн

Минимальный ноутбук: рендер → детект (CV/GDINO+SAM2) → анализ → инжест → обзор → метрики.

In [1]:
# Configuration

# Inputs
MODEL_DIR = '/root/models'
PDF_PATH = '/root/data/playbook.pdf'
JSON_PATH = '/root/data/playbook.json'  # passed to the ingest step as --source-json
PAGES = [42, 45]  # page numbers handed to the render and detection steps

# Output locations
OUT_PAGES = 'out/page_images/PIK - Expert Guide - Platform IT Architecture - Playbook - v11'
OUT_DET = 'out/visual/grounded_regions'
INDEX_PATH = 'out/openai_embeddings.ndjson'

# Detector switch: False = full power (GroundedDINO+SAM2), True = fast CV segmentation.
USE_CV = False

# Model names handed to the analysis (--chat-model) and ingest (--model) scripts.
CHAT_MODEL = 'gpt-4o'
EMB_MODEL = 'text-embedding-3-large'


In [2]:
# Helpers
import subprocess, os, shlex
from pathlib import Path

def sh(cmd):
    """Echo a shell command, run it, and fail loudly on a non-zero exit.

    Args:
        cmd: Command line as a single string. It is executed through the
            shell (``shell=True``), so callers must quote any interpolated
            values themselves, e.g. with ``shlex.quote``.

    Raises:
        RuntimeError: If the command exits with a non-zero status. The
            message includes the exit code and the command line.
    """
    # Flush so the echo is emitted before the child process starts writing,
    # even when stdout is block-buffered.
    print('→', cmd, flush=True)
    res = subprocess.run(cmd, shell=True)
    if res.returncode != 0:
        # Name the command and its exit code so a failing cell can be
        # diagnosed from the traceback alone.
        raise RuntimeError(f'Command failed (exit code {res.returncode}): {cmd}')
# Environment sanity check: report whether OPENAI_API_KEY is set to a
# non-empty value and whether the model directory exists.
print(f'OPENAI_API_KEY set: {bool(os.getenv("OPENAI_API_KEY"))}')
print(f'MODEL_DIR exists: {Path(MODEL_DIR).exists()}')


OPENAI_API_KEY set: True
MODEL_DIR exists: True


In [7]:
# Render pages
import shlex

# Create the page-image output directory up front.
Path(OUT_PAGES).mkdir(parents=True, exist_ok=True)
# Space-separated page numbers; the CV segmentation command reuses this string.
pages = ' '.join(map(str, PAGES))
render_cmd = (
    'python ../scripts/render_pages.py'
    f' --pdf {shlex.quote(PDF_PATH)}'
    f' --pages {pages}'
    f' --outdir {shlex.quote(OUT_PAGES)}'
    ' --dpi 150'
)
sh(render_cmd)


→ python ../scripts/render_pages.py --pdf /root/data/playbook.pdf --pages 42 45 --outdir 'out/page_images/PIK - Expert Guide - Platform IT Architecture - Playbook - v11' --dpi 150


Rendered playbook.pdf page 42 -> out/page_images/PIK - Expert Guide - Platform IT Architecture - Playbook - v11/page-42.png
Rendered playbook.pdf page 45 -> out/page_images/PIK - Expert Guide - Platform IT Architecture - Playbook - v11/page-45.png


In [9]:
# Detect regions
imgs = [str(Path(OUT_PAGES) / f'page-{p}.png') for p in PAGES]
if not USE_CV:
    # Full detector: GroundedDINO+SAM2 over the explicit list of page images.
    Path(OUT_DET).mkdir(parents=True, exist_ok=True)
    quoted = ' '.join(map(shlex.quote, imgs))
    sh(f'python ../scripts/grounded_sam_detect.py --images {quoted} --outdir {shlex.quote(OUT_DET)} --prompts diagram canvas table legend node arrow')
else:
    # Fast path: CV segmentation over the rendered-pages directory.
    cv_dir = 'out/visual/cv_regions'
    Path(cv_dir).mkdir(parents=True, exist_ok=True)
    sh(f'python ../scripts/cv_segment.py --images-dir {shlex.quote(OUT_PAGES)} --pages {pages} --outdir {cv_dir}')
    # Point the following steps at the CV output; reached only when the
    # command above succeeded.
    # NOTE(review): this rebinding persists in the kernel — re-run the config
    # cell before switching USE_CV back to False.
    OUT_DET = cv_dir


→ python ../scripts/cv_segment.py --images-dir 'out/page_images/PIK - Expert Guide - Platform IT Architecture - Playbook - v11' --pages 42 45 --outdir out/visual/cv_regions
CV segmented page 42 -> 7 regions
CV segmented page 45 -> 11 regions


In [11]:
# Analyze the detected regions with the chat model.
# Results are written back into the detection directory (--outdir equals
# --detected-dir). CHAT_MODEL is shell-quoted like every other config value,
# so a model name containing shell-special characters cannot break the command.
sh(f'python ../scripts/analyze_detected_regions.py --detected-dir {shlex.quote(OUT_DET)} --all --outdir {shlex.quote(OUT_DET)} --chat-model {shlex.quote(CHAT_MODEL)} --skip-existing')


→ python ../scripts/analyze_detected_regions.py --detected-dir out/visual/cv_regions --all --outdir out/visual/cv_regions --chat-model gpt-4o --skip-existing


Analyzed 7 detected regions in unit 42 -> out/visual/cv_regions/42/regions
Analyzed 11 detected regions in unit 45 -> out/visual/cv_regions/45/regions


In [14]:
# Ingest and review
# Step 1: ingest the analyzed regions into the embeddings index at INDEX_PATH.
# EMB_MODEL is shell-quoted like every other config value, so a model name
# containing shell-special characters cannot break the command.
sh(f'python ../scripts/ingest_visual_artifacts.py --source-json {shlex.quote(JSON_PATH)} --regions-dir {shlex.quote(OUT_DET)} --out {shlex.quote(INDEX_PATH)} --model {shlex.quote(EMB_MODEL)}')
# Step 2: generate the HTML review page.
sh('python ../scripts/generate_visual_review.py --inline --out ../eval/visual_review.html')


→ python ../scripts/ingest_visual_artifacts.py --source-json /root/data/playbook.json --regions-dir out/visual/cv_regions --out out/openai_embeddings.ndjson --model text-embedding-3-large


Ingested 28 visual items into out/openai_embeddings.ndjson
→ python ../scripts/generate_visual_review.py --inline --out ../eval/visual_review.html
Wrote ../eval/visual_review.html


In [16]:
# Metrics (optional)
eval_queries = '../eval/queries.jsonl'
metrics_cmd = (
    'python ../scripts/eval_metrics.py'
    f' --index {shlex.quote(INDEX_PATH)}'
    f' --eval {shlex.quote(eval_queries)}'
    ' --prefer-visual'
)
sh(metrics_cmd)


→ python ../scripts/eval_metrics.py --index out/openai_embeddings.ndjson --eval ../eval/queries.jsonl --prefer-visual


recall@1: 0.000
recall@3: 0.013
recall@5: 0.038
ndcg@1: 0.000
ndcg@3: 0.004
ndcg@5: 0.009
MRR: 0.035 (over 78 annotated queries)
