# GroundedDINO + SAM — Detection (Colab Pro+) — commit 3767bc9
Детектор регионов: монтируем GCS (через сервис‑аккаунт), ставим Torch+детекторы, рендерим страницы, запускаем детекцию и грузим регионы в `gs://pik-artifacts-dev/grounded_regions/`.


In [None]:
#@title Run Control and Parameters
# SIGPIPE-friendly stdout (avoid BrokenPipeError in Colab pipes)
import signal
if hasattr(signal, 'SIGPIPE'):
    signal.signal(signal.SIGPIPE, signal.SIG_DFL)

# Toggle to start the pipeline
START_RUN = False  #@param {type:"boolean"}

# Key parameters (leave pages empty to process ALL pages)
PLAYBOOK_PDF = '/content/src_gcs/playbooks/PIK - Expert Guide - Platform IT Architecture - Playbook - v11.pdf'  #@param {type:"string"}
PAGES = []  #@param {type:"raw"}
FRAME_NAMES_INPUT = 'PIK - Platform IT Architecture Canvas - Table View - v01.png, PIK - Platform IT Architecture Canvases - v01.png, PIK - Expert Guide - Platform IT Architecture - Assessment - v01.png'  #@param {type:"string"}
PROMPTS_INPUT = 'diagram,canvas,table,legend,arrow,node'  #@param {type:"string"}
BOX_THRESHOLD = 0.35  #@param {type:"number"}
TEXT_THRESHOLD = 0.25  #@param {type:"number"}
TOPK = 12  #@param {type:"integer"}
DEVICE = 'auto'  #@param ["auto", "cuda", "cpu"]
USE_SAM2 = True  #@param {type:"boolean"}

REPORT_TO_GCS = True  #@param {type:"boolean"}
GCS_BUCKET = 'pik-artifacts-dev'  #@param {type:"string"}
RUN_TAG = ''  #@param {type:"string"}

# Derived lists from string inputs
FRAME_NAMES = [x.strip() for x in FRAME_NAMES_INPUT.split(',') if x.strip()]
PROMPTS = [x.strip() for x in PROMPTS_INPUT.split(',') if x.strip()]
OUT_PAGES_DIR = '/content/pages'
DETECT_OUT = '/content/grounded_regions'

# Helper to gate execution in subsequent cells
def require_start():
    if not START_RUN:
        raise SystemExit('Execution gated. Set START_RUN=True in the top cell and rerun.')

print('Configured. START_RUN=', START_RUN)
print('PDF:', PLAYBOOK_PDF)
print('PAGES (empty=ALL):', PAGES)
print('Frames:', FRAME_NAMES)
print('Prompts:', PROMPTS)


In [None]:
#@title Cell Execution Logger
import os, sys, json, time, uuid, warnings
from pathlib import Path
try:
  LOG_DIR  # noqa: F821
except NameError:
  RUN_ID = time.strftime('%Y%m%d-%H%M%S')
  LOCAL_LOG_ROOT = '/content/colab_runs'
  LOG_DIR = Path(LOCAL_LOG_ROOT)/RUN_ID
  LOG_DIR.mkdir(parents=True, exist_ok=True)

from IPython import get_ipython
ip = get_ipython()

class _Tee:
  def __init__(self, stream, buf_list):
    self._s = stream; self._b = buf_list
  def write(self, s):
    try: self._s.write(s)
    finally: self._b.append(s)
  def flush(self):
    try: self._s.flush()
    except Exception: pass

_celllog = {'i': None, 'start': None, 'buf_out':[], 'buf_err':[], 'warns':[], 'id': None}
_orig_out, _orig_err = sys.stdout, sys.stderr
_orig_showwarning = warnings.showwarning
LOG_JSONL = str(LOG_DIR/'cells.jsonl')

def _pre(cell_id):
  _celllog['i'] = ip.execution_count + 1
  _celllog['id'] = str(uuid.uuid4())
  _celllog['start'] = time.time()
  _celllog['buf_out'] = []
  _celllog['buf_err'] = []
  _celllog['warns'] = []
  sys.stdout = _Tee(_orig_out, _celllog['buf_out'])
  sys.stderr = _Tee(_orig_err, _celllog['buf_err'])
  def _sw(message, category, filename, lineno, file=None, line=None):
    _celllog['warns'].append({'message': str(message), 'category': getattr(category,'__name__', str(category)), 'filename': filename, 'lineno': lineno})
    return _orig_showwarning(message, category, filename, lineno, file, line)
  warnings.showwarning = _sw

def _post(result):
  # restore
  sys.stdout = _orig_out
  sys.stderr = _orig_err
  warnings.showwarning = _orig_showwarning
  end = time.time()
  i = _celllog.get('i')
  # Try to get cell source from history
  src = None
  try:
    ih = ip.user_ns.get('_ih', [])
    if i is not None and i < len(ih):
      src = ih[i]
  except Exception:
    src = None
  rec = {
    'cell_id': _celllog.get('id'),
    'execution_count': i,
    'start_ts': _celllog.get('start'),
    'end_ts': end,
    'duration_s': (end - _celllog['start']) if _celllog.get('start') else None,
    'success': bool(getattr(result, 'success', True)),
    'out': ''.join(_celllog.get('buf_out') or []),
    'err': ''.join(_celllog.get('buf_err') or []),
    'warnings': _celllog.get('warns') or [],
    'source': src,
  }
  try:
    with open(LOG_JSONL, 'a', encoding='utf-8') as f:
      f.write(json.dumps(rec, ensure_ascii=False) + '\n')
  except Exception as e:
    print('[cell-logger] write failed:', e)

ip.events.register('pre_run_cell', _pre)
ip.events.register('post_run_cell', _post)
print('[cell-logger] enabled ->', LOG_JSONL)


In [1]:
#@title Runtime & GPU
NOTEBOOK_VERSION = '3767bc9'
print('Notebook version:', NOTEBOOK_VERSION)
# Runtime & GPU
!nvidia-smi || true
import sys; print(sys.version)


Notebook version: ef4284f
Tue Sep 16 04:21:37 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   36C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                      

In [2]:
#@title Auth + gcsfuse setup
require_start()

# Install packages and prepare gcsfuse repo; auto-mount with SA from /content/Secrets if present
# Try Colab user auth (optional)
try:
  from google.colab import auth
  auth.authenticate_user()
  print('[auth] Colab user credentials OK')
except Exception as e:
  print('[auth] Skipping Colab user auth:', e)
!pip -q install google-cloud-storage gcsfs==2025.3.0 fsspec==2025.3.0
!sudo install -m 0755 -d /usr/share/keyrings
!curl -fsSL https://packages.cloud.google.com/apt/doc/apt-key.gpg -o /tmp/cloud.google.gpg
!sudo gpg --dearmor --yes --batch -o /usr/share/keyrings/cloud.google.gpg /tmp/cloud.google.gpg || sudo cp /tmp/cloud.google.gpg /usr/share/keyrings/cloud.google.gpg
!echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt gcsfuse-jammy main" | sudo tee /etc/apt/sources.list.d/gcsfuse.list >/dev/null
!sudo apt-get -q update
!sudo apt-get -q install -y gcsfuse poppler-utils
!mkdir -p /content/src_gcs /content/artifacts /content/pages /content/Secrets
import glob, os, subprocess, shlex

# Read SA key from Colab Secrets (GCS_SA_JSON / GCS_SA_JSON2 / secretName) and optional GOOGLE_API_KEY
try:
  from google.colab import userdata as _ud
  _sa = None; _sa_name = ''
  for _k in ('GCS_SA_JSON','GCS_SA_JSON2','secretName'):
    try:
      _v = _ud.get(_k)
    except Exception:
      _v = None
    if _v:
      _sa = _v; _sa_name = _k; break
  try:
    _gapi = _ud.get('GOOGLE_API_KEY')
  except Exception:
    _gapi = None
except Exception:
  _sa = None; _gapi = None; _sa_name = ''
if _gapi:
  os.environ['GOOGLE_API_KEY'] = _gapi
  print('[auth] GOOGLE_API_KEY loaded from Colab Secrets')
if _sa:
  os.makedirs('/content/Secrets', exist_ok=True)
  _key_path = '/content/Secrets/sa.json'
  open(_key_path,'w',encoding='utf-8').write(_sa)
  os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = _key_path
  print('[auth] SA key written to', _key_path, '(source:', _sa_name, ')')

# Skip auto-mount here; use the next cell 'Mount GCS buckets'
print('[auth] SA prepared. Proceed to mount in the next cell.')


[auth] Colab user credentials OK
Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:4 https://cli.github.com/packages stable InRelease [3,917 B]
Get:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:6 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:8 https://packages.cloud.google.com/apt gcsfuse-jammy InRelease [1,227 B]
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease [24.3 kB]
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:12 https://cli.github.com/packages stable/main amd64 Packages [346 B]
Get:13 https://developer.download.nvidia

In [None]:
#@title Compatibility Fixes (pip pins)
require_start()
# Do not upgrade ipython (Colab expects ipython==7.34.0)
# Remove xformers if present (version coupling to torch; optional)
!pip -q uninstall -y xformers || true

# Pin compatible versions for known conflicts
!pip -q install -U   "typing_extensions>=4.14.0,<5"   "filelock>=3.15"   "numpy<2.1,>=1.24"   gcsfs==2025.3.0 fsspec==2025.3.0

from importlib import metadata as md
def _ver(name, mod=None):
    try:
        return md.version(name)
    except Exception:
        try:
            m = __import__(mod or name)
            return getattr(m, "__version__", "unknown")
        except Exception:
            return "not installed"
print(
  "[compat]",
  "jedi=", _ver("jedi"),
  "typing_extensions=", _ver("typing_extensions","typing_extensions"),
  "filelock=", _ver("filelock"),
  "numpy=", _ver("numpy"),
  "gcsfs=", _ver("gcsfs"),
  "fsspec=", _ver("fsspec"),
)


In [3]:
#@title Mount GCS buckets
require_start()

# Robust mount with verbose logs and fallback info
import os, glob, subprocess, pathlib, textwrap
pathlib.Path('/content/src_gcs').mkdir(parents=True, exist_ok=True)
pathlib.Path('/content/artifacts').mkdir(parents=True, exist_ok=True)
pathlib.Path('/content/gcsfuse_tmp').mkdir(parents=True, exist_ok=True)
key = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS', '')
if (not key) and 'KEY' in globals(): key = KEY
if key and not os.path.isabs(key): key = os.path.join('/content', key)
if not (key and os.path.exists(key)):
  matches = glob.glob('/content/Secrets/*.json')
  if matches:
    key = matches[0]; os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = key
  else:
    raise SystemExit('Service account key not found — upload to /content/Secrets/*.json')
print('Using SA key:', key)
def mount(bucket, mnt):
  log=f'/content/gcsfuse_{bucket}.log'.replace('/', '_')
  cmd=['gcsfuse','--implicit-dirs','--key-file', key,'--temp-dir','/content/gcsfuse_tmp','--log-file',log, bucket, mnt]
  res=subprocess.run(cmd, capture_output=True, text=True)
  ok=(res.returncode==0)
  print(f'[mount] {bucket} -> {mnt}:', ok)
  if not ok:
    print('[mount] stdout:\n' + res.stdout)
    print('[mount] stderr:\n' + res.stderr)
    try:
      tail=subprocess.run(['bash','-lc', f'tail -n 80 {log}'], capture_output=True, text=True)
      if tail.stdout: print('[mount] log tail:\n' + tail.stdout)
    except Exception: pass
  return ok
subprocess.run(['fusermount','-u','/content/src_gcs'], check=False)
subprocess.run(['fusermount','-u','/content/artifacts'], check=False)
# Quick bucket existence check
for b in ('pik_source_bucket','pik-artifacts-dev'):
  subprocess.run(['bash','-lc', f'gsutil ls -b gs://{b} || true'], check=False)
ok1 = mount('pik_source_bucket','/content/src_gcs')
ok2 = mount('pik-artifacts-dev','/content/artifacts')
print('mount src=', ok1, ' mount artifacts=', ok2)
if not (ok1 and ok2):
  print(textwrap.dedent('''
    [hint] If mount keeps failing:
      - Check SA has Storage Object Admin on both buckets
      - Try fallback: copy files with gsutil (already used elsewhere in the notebook)
      - Ensure bucket names are correct and exist (see checks above)
    '''))


Using SA key: /content/Secrets/sa.json
[mount] pik_source_bucket -> /content/src_gcs: True
[mount] pik-artifacts-dev -> /content/artifacts: True
mount src= True  mount artifacts= True


In [4]:
#@title Install Torch + SAM/SAM2 + GroundedDINO
require_start()

# 2) Torch + детекторы — надёжная установка (без сборки wheel для GroundedDINO)
!pip -q install --upgrade pip setuptools wheel
# Ensure IPython+jedi and core libs for resolver
!pip -q install -U jedi>=0.16 typing_extensions>=4.14.0 filelock>=3.15
!pip -q install --upgrade --force-reinstall torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu124
!pip -q install 'numpy<2.1,>=1.24'
!pip -q install shapely timm opencv-python pycocotools addict yacs requests pillow
!pip -q install huggingface_hub
# xformers optional; skipped by default due to wheel/torch version coupling
# SAM
!pip -q install git+https://github.com/facebookresearch/segment-anything.git
# SAM2
!pip -q install git+https://github.com/facebookresearch/segment-anything-2.git
# GroundedDINO из исходников (подключим через sys.path)
!rm -rf /content/GroundingDINO
!git clone --depth 1 https://github.com/IDEA-Research/GroundingDINO.git /content/GroundingDINO
!pip -q install -r /content/GroundingDINO/requirements.txt
# Build C++/CUDA ops for GroundingDINO (ms_deform_attn _C)
!sudo apt-get -q install -y ninja-build
!pip -q install "git+https://github.com/IDEA-Research/GroundingDINO.git" || echo 'pip install from git failed; will import from source'
import sys, os, glob, subprocess
try:
  import groundingdino
  print('GroundingDINO pip install OK')
except Exception as e:
  print('GroundingDINO pip install failed; building from source:', e)
  cands = [p for p in glob.glob('/content/GroundingDINO/**/setup.py', recursive=True) if ('ms_deform' in p) or ('ops' in p)]
  for sp in cands:
    d=os.path.dirname(sp); print('Building ext in', d)
    subprocess.run([sys.executable, 'setup.py', 'build_ext', '--inplace'], cwd=d, check=False)
  if '/content/GroundingDINO' not in sys.path: sys.path.append('/content/GroundingDINO')
  import groundingdino
  print('GroundingDINO import OK (source fallback)')
# Build C++/CUDA ops for GroundingDINO (ms_deform_attn _C)
import sys
if '/content/GroundingDINO' not in sys.path: sys.path.append('/content/GroundingDINO')
from groundingdino.util.inference import Model
print('GroundedDINO import OK')
# Compatibility pins (final)
!pip -q uninstall -y xformers || true
!pip -q install -U 'numpy<2.1,>=1.24' typing_extensions>=4.14.0 filelock>=3.15
!pip -q install -U gcsfs==2025.3.0 fsspec==2025.3.0
import importlib, pkgutil;
print('[versions]',
      'torch', __import__('torch').__version__,
      'numpy', __import__('numpy').__version__,
      'typing_extensions', __import__('typing_extensions').__version__,
      'filelock', __import__('filelock').__version__)


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.8/1.8 MB[0m [31m52.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m35.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m70.7 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ipython 7.34.0 requires jedi>=0.16, which is not installed.[0m[31m
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the follo



In [5]:
#@title CUDA and C++ ops self-check
import warnings, importlib, torch, torchvision, os
warnings.filterwarnings('ignore', category=FutureWarning)
req = (DEVICE.lower() if 'DEVICE' in globals() else 'auto')
cuda_avail = torch.cuda.is_available()
print('torch:', torch.__version__, 'cuda:', torch.version.cuda, 'available:', cuda_avail)
print('torchvision:', torchvision.__version__)
tv_ops_ok=False
if cuda_avail:
  try:
    x=torch.rand(256,4,device='cuda'); y=torch.rand(256,4,device='cuda')
    from torchvision.ops import box_iou
    _=box_iou(x,y); tv_ops_ok=True; print('[OK] torchvision.ops on CUDA')
  except Exception as e:
    print('[WARN] torchvision.ops CUDA failed:', e)
try:
  m=importlib.import_module('groundingdino.models.GroundingDINO.ms_deform_attn')
  dino_ops_ok=bool(getattr(m,'_C', None))
  print('GroundingDINO _C present:', dino_ops_ok)
except Exception as e:
  dino_ops_ok=False; print('[WARN] GroundingDINO C++ ops import failed:', e)
# Hard assertions when DEVICE='cuda'
if req=='cuda':
  assert cuda_avail, 'CUDA requested but not available'
  assert tv_ops_ok, 'torchvision CUDA ops unavailable'
  assert dino_ops_ok, 'GroundingDINO C++ ops (_C) not built'
print('[SELF-CHECK] req=', req, ' cuda_avail=', cuda_avail, ' tv_ops_ok=', tv_ops_ok, ' dino_ops_ok=', dino_ops_ok)


torch: 2.5.1+cu124 cuda: 12.4 available: True
torchvision: 0.20.1+cu121
[OK] torchvision.ops on CUDA
GroundingDINO _C present: True
[SELF-CHECK] req= auto  cuda_avail= True  tv_ops_ok= True  dino_ops_ok= True


In [6]:
#@title Download/Resolve Model Weights
#@title Download/Resolve Model Weights
# (Optional) Download model weights if not present
import os, pathlib, shutil, subprocess
from typing import Optional
pathlib.Path('/content/models/groundingdino').mkdir(parents=True, exist_ok=True)
pathlib.Path('/content/models/sam').mkdir(parents=True, exist_ok=True)
GROUNDING_MODEL = '/content/models/groundingdino/groundingdino_swint_ogc.pth'
SAM_MODEL = '/content/models/sam/sam_vit_h_4b8939.pth'
GROUNDING_URL = 'https://github.com/IDEA-Research/GroundingDINO/releases/download/0.1.0/groundingdino_swint_ogc.pth'
SAM_URL = 'https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth'
SAM2_MODEL = '/content/models/sam2/sam2_hiera_large.pt'
SAM2_URL = 'https://huggingface.co/facebook/sam2-hiera-large/resolve/main/sam2_hiera_large.pt'
# Try to read HF token from Colab Keys or env
HF_TOKEN = os.getenv('HF_TOKEN', '')
try:
  from google.colab import userdata as _ud
  HF_TOKEN = _ud.get('HF_TOKEN') or HF_TOKEN
except Exception:
  pass
def _file_ok(p: str, min_size: int) -> bool:
  try:
    return os.path.exists(p) and os.path.getsize(p) >= min_size
  except Exception:
    return False
def _try_torch_load(p: str) -> bool:
  try:
    import torch
    torch.load(p, map_location='cpu')
    return True
  except Exception as e:
    print('[warn] torch.load failed:', e)
    return False
def _hf_download(repo_id: str, filename: str, dest: str) -> bool:
  try:
    from huggingface_hub import hf_hub_download, login
    if HF_TOKEN:
      try:
        login(token=HF_TOKEN)
      except Exception as e:
        print('[warn] HF login failed:', e)
    ckpt = hf_hub_download(repo_id=repo_id, filename=filename, local_dir=os.path.dirname(dest), local_dir_use_symlinks=False, token=HF_TOKEN or None)
    if ckpt != dest:
      shutil.copy2(ckpt, dest)
    return True
  except Exception as e:
    print('[warn] HF download failed:', e)
    return False
def _curl(url: str, dest: str, min_size: int) -> bool:
  cmd = f"curl -L --fail --retry 5 --retry-all-errors -o '{dest}.tmp' '{url}'"
  rc = subprocess.call(cmd, shell=True)
  if rc == 0 and _file_ok(dest + '.tmp', min_size):
    shutil.move(dest + '.tmp', dest)
    return True
  else:
    print('[warn] curl download insufficient or failed:', rc)
    try:
      os.remove(dest + '.tmp')
    except Exception:
      pass
    return False
# GroundedDINO (expect ~0.9GB)
MIN_DINO = 600_000_000
need_dino = (not _file_ok(GROUNDING_MODEL, MIN_DINO)) or (not _try_torch_load(GROUNDING_MODEL))
if need_dino:
  print('Downloading GroundingDINO weights (robust)...')
  try:
    os.remove(GROUNDING_MODEL)
  except Exception:
    pass
  # 1) Try GCS mirror if mounted
  gcs_mirror = '/content/artifacts/models/groundingdino/groundingdino_swint_ogc.pth'
  ok = _file_ok(gcs_mirror, MIN_DINO)
  if ok:
    try:
      shutil.copy2(gcs_mirror, GROUNDING_MODEL)
      print('[DINO] using GCS mirror')
      ok = True
    except Exception as e:
      print('[warn] copy from GCS mirror failed:', e); ok = False
  # 2) Try HF Hub (public repo)
  if (not ok):
    ok = _hf_download('ShilongLiu/GroundingDINO', 'groundingdino_swint_ogc.pth', GROUNDING_MODEL)
    if ok: print('[DINO] using HF Hub')
  # 3) Try GitHub release via curl
  if (not ok) or (not _file_ok(GROUNDING_MODEL, MIN_DINO)):
    ok = _curl(GROUNDING_URL, GROUNDING_MODEL, MIN_DINO)
    if ok: print('[DINO] using curl URL')
  # 4) Try gsutil from bucket path if accessible
  if (not ok) or (not _file_ok(GROUNDING_MODEL, MIN_DINO)):
    try:
      rc = subprocess.call(f"gsutil cp gs://pik-artifacts-dev/models/groundingdino/groundingdino_swint_ogc.pth '{GROUNDING_MODEL}'", shell=True)
      ok = (rc == 0) and _file_ok(GROUNDING_MODEL, MIN_DINO)
      if ok: print('[DINO] using gsutil mirror')
    except Exception as e:
      print('[warn] gsutil mirror copy failed:', e)
  if (not ok) or (not _file_ok(GROUNDING_MODEL, MIN_DINO)) or (not _try_torch_load(GROUNDING_MODEL)):
    raise SystemExit('Failed to fetch a valid GroundingDINO checkpoint')
# SAM (ViT-H is large; check size only)
MIN_SAM = 1_000_000_000
if not _file_ok(SAM_MODEL, MIN_SAM):
  print('Downloading SAM ViT-H weights (robust)...')
  # 1) Try GCS mirror if mounted
  sam_gcs_mirror = '/content/artifacts/models/sam/sam_vit_h_4b8939.pth'
  ok = _file_ok(sam_gcs_mirror, MIN_SAM)
  if ok:
    try:
      shutil.copy2(sam_gcs_mirror, SAM_MODEL)
      print('[SAM] using GCS mirror')
      ok = True
    except Exception as e:
      print('[warn] copy SAM from GCS mirror failed:', e); ok = False
  # 2) Try HF Hub
  if (not ok):
    ok = _hf_download('facebook/sam', 'sam_vit_h_4b8939.pth', SAM_MODEL)
    if ok: print('[SAM] using HF Hub')
  # 3) Try official URL via curl
  if (not ok) or (not _file_ok(SAM_MODEL, MIN_SAM)):
    ok = _curl(SAM_URL, SAM_MODEL, MIN_SAM)
    if ok: print('[SAM] using curl URL')
  # 4) Try gsutil mirror from bucket
  if (not ok) or (not _file_ok(SAM_MODEL, MIN_SAM)):
    try:
      rc = subprocess.call(f"gsutil cp gs://pik-artifacts-dev/models/sam/sam_vit_h_4b8939.pth '{SAM_MODEL}'", shell=True)
      ok = (rc == 0) and _file_ok(SAM_MODEL, MIN_SAM)
      if ok: print('[SAM] using gsutil mirror')
    except Exception as e:
      print('[warn] gsutil SAM mirror copy failed:', e)
  if (not ok) or (not _file_ok(SAM_MODEL, MIN_SAM)):
    raise SystemExit('Failed to fetch SAM ViT-H checkpoint')
# SAM2 (Hiera Large)
MIN_SAM2 = 700_000_000
if not _file_ok(SAM2_MODEL, MIN_SAM2):
  print('Downloading SAM2 Hiera Large weights (robust)...')
  # 1) Try GCS mirror if mounted
  sam2_gcs_mirror = '/content/artifacts/models/sam2/sam2_hiera_large.pt'
  ok = _file_ok(sam2_gcs_mirror, MIN_SAM2)
  if ok:
    try:
      shutil.copy2(sam2_gcs_mirror, SAM2_MODEL)
      print('[SAM2] using GCS mirror')
      ok = True
    except Exception as e:
      print('[warn] copy SAM2 from GCS mirror failed:', e); ok = False
  # 2) Try HF Hub
  if (not ok):
    ok = _hf_download('facebook/sam2-hiera-large', 'sam2_hiera_large.pt', SAM2_MODEL)
    if ok: print('[SAM2] using HF Hub')
  # 3) Try direct URL via curl
  if (not ok) or (not _file_ok(SAM2_MODEL, MIN_SAM2)):
    ok = _curl(SAM2_URL, SAM2_MODEL, MIN_SAM2)
    if ok: print('[SAM2] using curl URL')
  # 4) Try gsutil mirror from bucket
  if (not ok) or (not _file_ok(SAM2_MODEL, MIN_SAM2)):
    try:
      rc = subprocess.call(f"gsutil cp gs://pik-artifacts-dev/models/sam2/sam2_hiera_large.pt '{SAM2_MODEL}'", shell=True)
      ok = (rc == 0) and _file_ok(SAM2_MODEL, MIN_SAM2)
      if ok: print('[SAM2] using gsutil mirror')
    except Exception as e:
      print('[warn] gsutil SAM2 mirror copy failed:', e)
  if (not ok) or (not _file_ok(SAM2_MODEL, MIN_SAM2)):
    raise SystemExit('Failed to fetch SAM2 Hiera Large checkpoint')
import os as _os; print('[DINO] size=', _os.path.getsize(GROUNDING_MODEL)); print('GROUNDING_MODEL =', GROUNDING_MODEL)
import os as _os; print('[SAM2] size=', _os.path.getsize(SAM2_MODEL))
import os as _os; print('[SAM] size=', _os.path.getsize(SAM_MODEL)); print('SAM_MODEL       =', SAM_MODEL)

# Log weights info if logging enabled
try:
  import os as _os
  WEIGHTS_INFO = {
    'groundingdino': {'path': GROUNDING_MODEL, 'size': _os.path.getsize(GROUNDING_MODEL)},
    'sam': {'path': SAM_MODEL, 'size': _os.path.getsize(SAM_MODEL)},
    'sam2': ({'path': SAM2_MODEL, 'size': _os.path.getsize(SAM2_MODEL)} if _os.path.exists(SAM2_MODEL) else None),
  }
  if 'log_json' in globals(): log_json('weights.json', WEIGHTS_INFO)
except Exception as e:
  print('[LOG] weights info not recorded:', e)


Downloading GroundingDINO weights (robust)...
[DINO] using GCS mirror
Downloading SAM ViT-H weights (robust)...
[SAM] using GCS mirror
Downloading SAM2 Hiera Large weights (robust)...
[warn] copy SAM2 from GCS mirror failed: [Errno 2] No such file or directory: '/content/models/sam2/sam2_hiera_large.pt'


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.


sam2_hiera_large.pt:   0%|          | 0.00/898M [00:00<?, ?B/s]

[SAM2] using HF Hub
[DINO] size= 693997677
GROUNDING_MODEL = /content/models/groundingdino/groundingdino_swint_ogc.pth
[SAM2] size= 897952466
[SAM] size= 2564550879
SAM_MODEL       = /content/models/sam/sam_vit_h_4b8939.pth


In [7]:
#@title Selected Parameters (echo)
# This cell only echoes current config defined in the top control cell.
try:
    FRAME_NAMES
except NameError:
    FRAME_NAMES = []
try:
    PROMPTS
except NameError:
    PROMPTS = []
print('START_RUN=', START_RUN)
print('PDF:', PLAYBOOK_PDF)
print('PAGES (empty=ALL):', PAGES)
print('Frames:', FRAME_NAMES)
print('Prompts:', PROMPTS)
print('Device:', DEVICE, 'Use SAM2:', USE_SAM2)
print('Report to GCS:', REPORT_TO_GCS, 'Bucket:', GCS_BUCKET, 'Run tag:', RUN_TAG)


PDF: /content/src_gcs/playbooks/PIK - Expert Guide - Platform IT Architecture - Playbook - v11.pdf Pages: [4, 5, 6, 7, 8, 9, 10, 11]


In [8]:
#@title Logging helpers (GCS)
import os, sys, json, time, platform, socket, subprocess
from pathlib import Path
RUN_ID = time.strftime('%Y%m%d-%H%M%S') + (('-'+RUN_TAG.strip()) if ('RUN_TAG' in globals() and RUN_TAG.strip()) else '')
LOCAL_LOG_ROOT = '/content/artifacts/colab_runs' if os.path.exists('/content/artifacts') else '/content/colab_runs'
LOG_DIR = Path(LOCAL_LOG_ROOT)/RUN_ID
LOG_DIR.mkdir(parents=True, exist_ok=True)
def log_json(name, obj):
  p = LOG_DIR/name if isinstance(name, Path) else LOG_DIR/str(name)
  p.write_text(json.dumps(obj, ensure_ascii=False, indent=2), encoding='utf-8')
def log_kv(key, val):
  data = {}
  p = LOG_DIR/'run.json'
  if p.exists():
    try: data = json.loads(p.read_text(encoding='utf-8'))
    except Exception: data = {}
  data[key]=val
  p.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding='utf-8')
env = { 'python': sys.version, 'platform': platform.platform(), 'hostname': socket.gethostname(), 'device_req': (DEVICE if 'DEVICE' in globals() else 'auto') }
try:
  import torch
  env.update({'torch': torch.__version__, 'cuda': getattr(torch.version,'cuda',None), 'cuda_available': torch.cuda.is_available()})
except Exception: pass
log_json('env.json', env)
def upload_logs():
  if not ('REPORT_TO_GCS' in globals() and REPORT_TO_GCS):
    print('[LOG] REPORT_TO_GCS disabled'); return
  bucket = (GCS_BUCKET if 'GCS_BUCKET' in globals() else 'pik-artifacts-dev')
  prefix = f'colab_runs/{RUN_ID}'
  try:
    from google.cloud import storage
    client=storage.Client()
    b=client.bucket(bucket)
    for p in LOG_DIR.rglob('*'):
      if p.is_file():
        rel=str(p.relative_to(LOG_DIR)).replace('\\','/')
        blob=b.blob(f'{prefix}/{rel}')
        ctype='application/json' if p.suffix.lower()=='.json' else 'text/plain; charset=utf-8'
        blob.content_type=ctype
        blob.upload_from_filename(str(p))
    print(f'[LOG] uploaded to gs://{bucket}/{prefix}')
    return
  except Exception as e:
    print('[LOG] storage client failed, fallback to gsutil:', e)
  # gsutil fallback
  cmd=f"gsutil -m cp -r '{LOG_DIR}' gs://{bucket}/colab_runs/"
  subprocess.run(['bash','-lc', cmd], check=False)


In [9]:
#@title Render Pages to PNG
require_start()

# Render selected pages to PNG (robust: checks poppler + PDF presence; falls back to gsutil cp)
import os, shutil, pathlib, subprocess
from subprocess import check_call
pathlib.Path(OUT_PAGES_DIR).mkdir(parents=True, exist_ok=True)
# Ensure pdftoppm exists
if not shutil.which('pdftoppm'):
  print('Installing poppler-utils (pdftoppm)...')
  check_call(['bash','-lc','sudo apt-get -q update && sudo apt-get -q install -y poppler-utils'])
# Ensure source PDF exists; if not, copy from GCS via gsutil
src = PLAYBOOK_PDF
if not os.path.exists(src):
  print('PDF not found at', src, '; copying from GCS...')
  check_call(['bash','-lc','gsutil -m cp "gs://pik_source_bucket/playbooks/PIK - Expert Guide - Platform IT Architecture - Playbook - v11.pdf" /content/Playbook.pdf'])
  src = '/content/Playbook.pdf'
# Render pages
# If PAGES is empty, compute all pages via pdfinfo
def _detect_all_pages(pdf_path):
    import subprocess, re
    try:
        out = subprocess.check_output(['pdfinfo', pdf_path], text=True)
        m = re.search(r'^Pages:\\s+(\\d+)', out, re.M|re.I)
        return int(m.group(1)) if m else None
    except Exception:
        return None
pages_selected = list(PAGES) if isinstance(PAGES, list) else []
if not pages_selected:
    n = _detect_all_pages(src)
    if n:
        pages_selected = list(range(1, n+1))
    else:
        raise SystemExit('Cannot determine page count and PAGES is empty')
print('Pages to render:', pages_selected[:20], ('...' if len(pages_selected)>20 else ''))

for p in pages_selected:
  print('Rendering', p)
  check_call(['pdftoppm','-png','-singlefile','-r','150', src, f'{OUT_PAGES_DIR}/page-{p}'])
!ls -la /content/pages | head -n 10
def ensure_dir(d):
  pathlib.Path(d).mkdir(parents=True, exist_ok=True)



Rendering 4
Rendering 5
Rendering 6
Rendering 7
Rendering 8
Rendering 9
Rendering 10
Rendering 11
total 4044
drwxr-xr-x 2 root root   4096 Sep 16 04:32 .
drwxr-xr-x 1 root root   4096 Sep 16 04:32 ..
-rw-r--r-- 1 root root 512200 Sep 16 04:32 page-10.png
-rw-r--r-- 1 root root 512200 Sep 16 04:32 page-11.png
-rw-r--r-- 1 root root 512200 Sep 16 04:32 page-4.png
-rw-r--r-- 1 root root 512200 Sep 16 04:32 page-5.png
-rw-r--r-- 1 root root 512200 Sep 16 04:32 page-6.png
-rw-r--r-- 1 root root 512200 Sep 16 04:32 page-7.png
-rw-r--r-- 1 root root 512200 Sep 16 04:32 page-8.png


In [10]:
# Боевой режим: GroundedDINO → SAM (5 страниц + 3 фрейма)
require_start()

import os, json, pathlib, cv2, numpy as np, torch
from groundingdino.util.inference import Model
# SAM/SAM2 init with fallback and device control
_req = (DEVICE.lower() if 'DEVICE' in globals() else 'auto')
if _req == 'cuda' and not torch.cuda.is_available():
  print('[warn] CUDA requested but not available; using CPU')
  device = 'cpu'
elif _req == 'cpu':
  device = 'cpu'
else:
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Selected device:', device)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
sam2_predictor = None
if 'USE_SAM2' in globals() and USE_SAM2:
  try:
    from sam2.build_sam import build_sam2
    from sam2.sam2_image_predictor import SAM2ImagePredictor
    sam2_model = build_sam2('sam2_hiera_large', SAM2_MODEL, device=device)
    sam2_predictor = SAM2ImagePredictor(sam2_model)
    print('SAM2 ready on', device)
  except Exception as e:
    print('SAM2 init failed, fallback to SAM v1:', e)
    sam2_predictor = None
if sam2_predictor is None:
  from segment_anything import sam_model_registry, SamPredictor
  print('SAM v1 ready on', device)
CFG_PATH = '/content/GroundingDINO_SwinT_OGC.py'
CFG_URL = 'https://raw.githubusercontent.com/IDEA-Research/GroundingDINO/main/groundingdino/config/GroundingDINO_SwinT_OGC.py'
# Попытка скачать конфиг, если его нет
import urllib.request, urllib.error
def _download(url, path):
  try:
    urllib.request.urlretrieve(url, path)
    return os.path.exists(path) and os.path.getsize(path) > 1000
  except Exception:
    return False
if not os.path.exists(CFG_PATH):
  ok = _download(CFG_URL, CFG_PATH)
  if not ok:
    try:
      import groundingdino, os as _os
      CFG_PATH = _os.path.join(_os.path.dirname(groundingdino.__file__), 'config', 'GroundingDINO_SwinT_OGC.py')
      print('Using package config at', CFG_PATH)
    except Exception as e:
      raise FileNotFoundError('GroundingDINO config not found and download failed')
# Sanity check on DINO checkpoint
import torch
try:
  _ = torch.load(GROUNDING_MODEL, map_location='cpu')
except Exception as e:
  raise RuntimeError(f'GroundedDINO checkpoint invalid: {e}')
gd_model = Model(model_config_path=CFG_PATH, model_checkpoint_path=GROUNDING_MODEL, device=device)
def save_region(rdir, idx, img, xyxy):
  x0,y0,x1,y1 = map(int, xyxy)
  x0,y0 = max(0,x0), max(0,y0)
  crop = img[y0:y1, x0:x1] if y1>y0 and x1>x0 else img
  ok, buf = cv2.imencode('.png', crop)
  if ok: (rdir/f'region-{idx}.png').write_bytes(buf.tobytes())
  obj = { 'bbox': {'x':int(x0),'y':int(y0),'w':int(x1-x0),'h':int(y1-y0)}, 'text':'', 'image_b64':'' }
  (rdir/f'region-{idx}.json').write_text(json.dumps(obj, ensure_ascii=False), encoding='utf-8')

def ensure_dir(d):
    pathlib.Path(d).mkdir(parents=True, exist_ok=True)

def detect_one(image_path, out_root):
  img = cv2.imread(image_path); assert img is not None, image_path
  img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
  H,W = img_rgb.shape[:2]
  output = gd_model.predict_with_classes(image=img_rgb, classes=PROMPTS, box_threshold=BOX_THRESHOLD, text_threshold=TEXT_THRESHOLD)
  print(f"Output from predict_with_classes: {output}") # Debugging line
  boxes = output.xyxy # Access boxes from the Detections object
  logits = output.confidence # Access confidence scores from the Detections object
  phrases = [PROMPTS[class_id] for class_id in output.class_id] # Infer phrases from class_id and PROMPTS
  bxs = []
  for b in boxes:
    b = np.asarray(b, dtype=float)
    if b.max()<=1.01: x0,y0,x1,y1 = b[0]*W, b[1]*H, b[2]*W, b[3]*H
    else: x0,y0,x1,y1 = b
    bxs.append([x0,y0,x1,y1])
  out = os.path.join(out_root, pathlib.Path(image_path).stem, 'regions'); ensure_dir(out)
  for i,xyxy in enumerate(bxs[:TOPK], start=1): save_region(pathlib.Path(out), i, img, xyxy)
# Страницы
ensure_dir(DETECT_OUT)
for p in PAGES:
  detect_one(f'/content/pages/page-{p}.png', DETECT_OUT)
# Фреймы
ensure_dir('/content/grounded_frames') # Ensure directory exists
for name in FRAME_NAMES:
  f = f'/content/src_gcs/frames/{name}'
  if os.path.exists(f): detect_one(f, '/content/grounded_frames')
# Выгрузка
!gsutil -m rsync -r /content/grounded_regions gs://pik-artifacts-dev/grounded_regions/
!gsutil -m rsync -r /content/grounded_frames gs://pik-artifacts-dev/grounded_regions/
!gsutil ls -r gs://pik-artifacts-dev/grounded_regions | head -n 40

Selected device: cuda
SAM2 init failed, fallback to SAM v1: Cannot find primary config 'sam2_hiera_large'. Check that it's in your config search path.

Config search path:
	provider=hydra, path=pkg://hydra.conf
	provider=main, path=pkg://sam2
	provider=schema, path=structured://
SAM v1 ready on cuda




final text_encoder_type: bert-base-uncased


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]



Output from predict_with_classes: Detections(xyxy=array([[  34.62677  ,    3.7105103, 1873.7822   , 1057.823    ]],
      dtype=float32), mask=None, confidence=array([0.42326838], dtype=float32), class_id=array([0]), tracker_id=None, data={}, metadata={})
Output from predict_with_classes: Detections(xyxy=array([[  34.62677  ,    3.7105103, 1873.7822   , 1057.823    ]],
      dtype=float32), mask=None, confidence=array([0.42326838], dtype=float32), class_id=array([0]), tracker_id=None, data={}, metadata={})
Output from predict_with_classes: Detections(xyxy=array([[  34.62677  ,    3.7105103, 1873.7822   , 1057.823    ]],
      dtype=float32), mask=None, confidence=array([0.42326838], dtype=float32), class_id=array([0]), tracker_id=None, data={}, metadata={})
Output from predict_with_classes: Detections(xyxy=array([[  34.62677  ,    3.7105103, 1873.7822   , 1057.823    ]],
      dtype=float32), mask=None, confidence=array([0.42326838], dtype=float32), class_id=array([0]), tracker_id=None

In [11]:
#@title Upload Regions to GCS
require_start()

require_start()

# Upload regions to pik-artifacts-dev
!gsutil -m rsync -r /content/grounded_regions gs://pik-artifacts-dev/grounded_regions/
!gsutil ls gs://pik-artifacts-dev/grounded_regions/ | head -n 20

# Upload logs to GCS if enabled
try:
  if 'upload_logs' in globals(): upload_logs()
except Exception as e:
  print('[LOG] upload skipped:', e)



both the source and destination. Your crcmod installation isn't using the
module's C extension, so checksumming will run very slowly. If this is your
first rsync since updating gsutil, this rsync can take significantly longer than
usual. For help installing the extension, please see "gsutil help crcmod".

Building synchronization state...
Starting synchronization...
gs://pik-artifacts-dev/grounded_regions/PIK - Expert Guide - Platform IT Architecture - Assessment - v01/
gs://pik-artifacts-dev/grounded_regions/PIK - Platform IT Architecture Canvas - Table View - v01/
gs://pik-artifacts-dev/grounded_regions/PIK - Platform IT Architecture Canvases - v01/
gs://pik-artifacts-dev/grounded_regions/page-10/
gs://pik-artifacts-dev/grounded_regions/page-11/
gs://pik-artifacts-dev/grounded_regions/page-4/
gs://pik-artifacts-dev/grounded_regions/page-42/
gs://pik-artifacts-dev/grounded_regions/page-45/
gs://pik-artifacts-dev/grounded_regions/page-5/
gs://pik-artifacts-dev/grounded_regions/page-6/

In [None]:
#@title Upload Cell Logs to GCS
require_start()
from pathlib import Path
p = Path(LOG_DIR) / 'cells.jsonl'
if not p.exists():
  print('[log] cells.jsonl not found at', p)
else:
  bucket = GCS_BUCKET if 'GCS_BUCKET' in globals() else 'pik-artifacts-dev'
  prefix = f'colab_runs/{RUN_ID}' if 'RUN_ID' in globals() else 'colab_runs/manual'
  try:
    from google.cloud import storage
    client = storage.Client()
    b = client.bucket(bucket)
    blob = b.blob(f'{prefix}/cells.jsonl')
    blob.content_type = 'application/json'
    blob.upload_from_filename(str(p))
    print('[log] uploaded to', f'gs://{bucket}/{prefix}/cells.jsonl')
  except Exception as e:
    import subprocess
    print('[log] storage client failed, fallback to gsutil:', e)
    cmd = f"gsutil cp '{p}' gs://{bucket}/{prefix}/cells.jsonl"
    subprocess.run(['bash','-lc', cmd], check=False)
