In [None]:
# 1) Check GPU
import subprocess

# Run nvidia-smi, capture its stdout as text and echo it. A non-zero exit
# status raises CalledProcessError.
print(subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, text=True, check=True).stdout)


In [None]:
# 2) Setup: deps, code, datasets, model
import shutil
from pathlib import Path
from huggingface_hub import snapshot_download

WORK      = Path('/content/work/multimodal_ml')
MONET_DIR = WORK / 'monet'
VSTAR_DIR = WORK / 'datasets/vstar_bench'
MME_DIR   = WORK / 'datasets/MME-RealWorld-Lite'
GEO3K_DIR = WORK / 'datasets/geometry_3k'
MODEL_DIR = Path('/content/work/models/Monet-7B')
WORK.mkdir(parents=True, exist_ok=True)
(WORK / 'datasets').mkdir(exist_ok=True)

!apt-get -qq install -y git-lfs unzip && git lfs install
!python -m pip install -q --no-cache-dir 'setuptools>=77' pillow
!python -m pip uninstall -y -q torch torchvision torchaudio vllm transformers tokenizers || true
!python -m pip install -q --no-cache-dir --index-url https://download.pytorch.org/whl/cu126 torch==2.7.1 torchvision==0.22.1 torchaudio==2.7.1
!python -m pip install -q --no-cache-dir vllm==0.10.0 'transformers==4.56.1' qwen-vl-utils accelerate 'huggingface_hub>=0.25,<1.0' datasets==4.0.0

if not MONET_DIR.exists():
    !git clone --depth 1 https://github.com/NOVAglow646/Monet.git {MONET_DIR}
if not VSTAR_DIR.exists():
    !git clone --depth 1 https://huggingface.co/datasets/craigwu/vstar_bench {VSTAR_DIR}
if not (GEO3K_DIR / 'test').exists():
    !git clone --filter=blob:none --no-checkout --depth 1 https://github.com/lupantech/InterGPS /tmp/igps_tmp \
      && git -C /tmp/igps_tmp sparse-checkout set data/geometry3k \
      && git -C /tmp/igps_tmp checkout \
      && mv /tmp/igps_tmp/data/geometry3k {GEO3K_DIR} \
      && rm -rf /tmp/igps_tmp
    for z in sorted(GEO3K_DIR.rglob('*.zip')): !unzip -oq {z} -d {z.parent}
if not (MME_DIR / '.huggingface').exists():
    if MME_DIR.exists(): shutil.rmtree(MME_DIR)
    snapshot_download('yifanzhang114/MME-RealWorld-Lite', repo_type='dataset', local_dir=str(MME_DIR), local_dir_use_symlinks=False)
    for z in sorted(MME_DIR.rglob('*.zip')): !unzip -oq {z} -d {z.parent}
if not (MODEL_DIR / 'config.json').exists():
    MODEL_DIR.mkdir(parents=True, exist_ok=True)
    snapshot_download('NOVAglow646/Monet-7B', local_dir=str(MODEL_DIR), local_dir_use_symlinks=False)

print('Setup complete.')

In [None]:
# 3) Dataset sanity check (local files only, no inference)
from pathlib import Path
from collections import Counter
import json

# Local dataset locations inspected by this cell (nothing is downloaded here).
WORK = Path('/content/work/multimodal_ml')
DATASETS = WORK / 'datasets'
VSTAR_DIR = DATASETS / 'vstar_bench'
MME_DIR = DATASETS / 'MME-RealWorld-Lite'
MME_DATA = MME_DIR / 'data'
MME_IMGS = MME_DATA / 'imgs'
GEO3K_DIR = DATASETS / 'geometry_3k'

print('=== Dataset Roots ===')
for root in (VSTAR_DIR, MME_DIR, GEO3K_DIR):
    print(f'{root} exists={root.exists()}')

print()
print('=== V* quick view ===')
if VSTAR_DIR.exists():
    # Top-level entries of the V* checkout, in sorted order.
    for entry in sorted(VSTAR_DIR.iterdir()):
        print(entry.name)
    vstar_manifest = VSTAR_DIR / 'test_questions.jsonl'
    if vstar_manifest.exists():
        print()
        print('V* test_questions.jsonl head:')
        with vstar_manifest.open('r', encoding='utf-8') as manifest:
            # zip() against range(3) stops after the first three lines.
            for _, record in zip(range(3), manifest):
                print(record.rstrip())

print()
print('=== MME quick view ===')
for location in (MME_DIR, MME_DATA, MME_IMGS):
    print(f'{location} exists={location.exists()}')

# Histogram of lower-cased file suffixes under the MME root; files without a
# suffix are grouped under '(noext)'.
ext_counts = Counter(
    (found.suffix.lower() or '(noext)')
    for found in MME_DIR.rglob('*')
    if found.is_file()
)
print()
print('MME file-type counts (top 20):')
for suffix, total in ext_counts.most_common(20):
    print(f'{suffix}: {total}')

# Candidate annotation files, grouped by pattern in the order listed and
# sorted within each pattern.
cand = (
    [hit
     for pattern in ('*.jsonl', '*.json', '*.parquet', '*.csv', '*.tsv')
     for hit in sorted(MME_DATA.rglob(pattern))]
    if MME_DATA.exists() else []
)

print()
print('MME candidate annotation files (first 20):')
for hit in cand[:20]:
    print('-', hit.relative_to(MME_DIR))

# Canonical MME annotation file: report its size and schema, then check that the
# image paths of the first 20 rows can be found on disk.
ann = MME_DATA / 'MME-RealWorld-Lite.json'
if ann.exists():
    # Context manager so the file handle is closed as soon as parsing is done.
    with open(ann, 'r', encoding='utf-8') as f:
        rows = json.load(f)
    print()
    print(f'MME annotation rows: {len(rows)}')
    # Only a list of dict rows can be inspected below; anything else yields an
    # empty preview instead of an exception.
    rows_are_list = isinstance(rows, list)
    if rows_are_list and rows and isinstance(rows[0], dict):
        print('MME first-row keys:', list(rows[0].keys()))

    unresolved = []
    preview = []
    for r in (rows[:20] if rows_are_list else []):
        if not isinstance(r, dict):
            continue
        img_val = str(r.get('Image', '')).strip()
        p1 = Path(img_val)
        if p1.is_absolute():
            cands = [p1]
        else:
            # Try the path relative to data/, relative to data/imgs/, and by
            # bare file name inside data/imgs/.
            cands = [MME_DATA / img_val, MME_IMGS / img_val, MME_IMGS / Path(img_val).name]
        # is_file() rather than exists(): an empty 'Image' value would otherwise
        # resolve to the data directory itself and be reported as found.
        hit = next((c for c in cands if c.is_file()), None) if img_val else None
        target = hit if hit is not None else cands[0]
        resolved = hit is not None
        if len(preview) < 5:
            preview.append((img_val, str(target), resolved))
        if not resolved:
            unresolved.append(img_val)

    print()
    print('MME image resolution preview (first 5):')
    for raw, target, ok in preview:
        print(f'raw={raw} | resolved_path={target} | ok={ok}')
    print(f'unresolved_in_first20={len(unresolved)}')
else:
    print()
    print('WARNING: canonical annotation file not found:', ann)

print()
print('=== Geometry3K quick view ===')
# One line per split: number of sub-directories, or 'missing' when the split
# folder is absent.
for split in ('train', 'val', 'test'):
    split_dir = GEO3K_DIR / split
    if not split_dir.exists():
        print(f'{split}: missing')
        continue
    folder_count = len([entry for entry in split_dir.iterdir() if entry.is_dir()])
    print(f'{split}: {folder_count} problem folders')

print()
print('Dataset sanity check complete.')



In [None]:
# 4) V*Bench — Monet-7B full test set evaluation
import importlib, json, os, re, sys, traceback
from pathlib import Path
from PIL import Image
from transformers import AutoProcessor

# ── Configuration ─────────────────────────────────────────────────────────
WORK          = Path('/content/work/multimodal_ml')
MODEL_DIR     = Path('/content/work/models/Monet-7B')
VSTAR_DIR     = WORK / 'datasets/vstar_bench'
RESULTS_FILE  = WORK / 'results/vstar_results.json'
MAX_MODEL_LEN = 32768
SEED          = 0
SAVE_EVERY    = 50   # checkpoint the results file every N processed rows

# Set before the Monet/vLLM modules are imported below.
# NOTE(review): presumably read by the Monet vLLM patch; the latent ids match
# the token ids checked for `used_latent` later in this cell — confirm.
os.environ.update({
    'LATENT_SIZE': '10', 'LATENT_START_ID': '151666', 'LATENT_END_ID': '151667',
    'VLLM_USE_V1': '1', 'VLLM_WORKER_MULTIPROC_METHOD': 'spawn', 'VLLM_ENABLE_V1_MULTIPROCESSING': '0',
})
RESULTS_FILE.parent.mkdir(parents=True, exist_ok=True)

# ── Monet inference module ────────────────────────────────────────────────
sys.path.insert(0, str(WORK / 'monet'))
import inference.apply_vllm_monet
import inference.load_and_gen_vllm as lg

# Flip two engine flags in the repo's source file, then reload the module so
# the edited source is the one in use.
_f = WORK / 'monet/inference/load_and_gen_vllm.py'
_src = _f.read_text()
_src = _src.replace('enable_sleep_mode=True',     'enable_sleep_mode=False')
_src = _src.replace('enable_chunked_prefill=True', 'enable_chunked_prefill=False')
_f.write_text(_src)
lg = importlib.reload(lg)
lg.tqdm = lambda it, **kw: it   # replace the module's tqdm with a pass-through

# ── Engine — reinit if config changed, otherwise reuse ───────────────────
from vllm import LLM
try:
    _reuse = (isinstance(mllm, LLM)
              and mllm.llm_engine.model_config.max_model_len == MAX_MODEL_LEN)
except Exception:
    # `mllm` is undefined on a fresh kernel (NameError) or lacks the expected
    # attributes. `Exception` rather than a bare except so Ctrl-C still interrupts.
    _reuse = False

if not _reuse:
    try:
        mllm.shutdown()
    except Exception:
        pass   # best effort: there may be no previous engine, or it may not expose shutdown()
    import subprocess
    _smi = subprocess.check_output(
        ['nvidia-smi', '--query-gpu=memory.total,memory.free', '--format=csv,noheader,nounits'],
        text=True)
    # nvidia-smi prints one CSV row per GPU; only the first row is used.
    # NOTE(review): assumes the engine (tp=1) runs on the first listed GPU — confirm.
    total_gib, free_gib = [float(x) / 1024 for x in _smi.strip().splitlines()[0].split(',')]
    # Leave about 4 GiB of headroom and never request more than 92% of the card.
    util = round(min(0.92, (free_gib - 4) / total_gib), 3)
    if util <= 0:
        raise RuntimeError(
            f'Only {free_gib:.1f} GiB of {total_gib:.1f} GiB GPU memory is free; '
            'restart the runtime or free GPU memory before initialising the engine.')
    mllm, sampling_params = lg.vllm_mllm_init(
        str(MODEL_DIR), tp=1, gpu_memory_utilization=util, max_model_len=MAX_MODEL_LEN)
    processor = AutoProcessor.from_pretrained(str(MODEL_DIR), trust_remote_code=True)

sampling_params.seed = SEED

# ── Dataset — resume from checkpoint if interrupted ───────────────────────
# Keep only manifest rows whose image file is present locally.
all_rows = []
with open(VSTAR_DIR / 'test_questions.jsonl', encoding='utf-8') as f:
    for line in f:
        if line.strip():
            obj = json.loads(line)
            if (VSTAR_DIR / obj['image']).exists():
                all_rows.append(obj)

# Rows already present in the results file are skipped on this run.
results = json.loads(RESULTS_FILE.read_text()) if RESULTS_FILE.exists() else []
done_ids = {r['question_id'] for r in results}
remaining = [r for r in all_rows if r['question_id'] not in done_ids]
print(f'{len(all_rows)} total | {len(done_ids)} done | {len(remaining)} remaining')

# ── Inference ─────────────────────────────────────────────────────────────
def extract_answer(text):
    """Pull a single option letter (A-E) out of the model's reply.

    The first ``\\boxed{...}`` is used when its stripped content is exactly one
    letter A-E (either case). Otherwise the last standalone capital A-E found
    in the final 400 characters is returned, or ``None`` when there is none.
    """
    boxed = re.search(r'\\boxed\{([^}]*)\}', text)
    if boxed is not None:
        candidate = boxed.group(1).strip()
        if re.fullmatch(r'[A-Ea-e]', candidate) is not None:
            return candidate.upper()
    tail_hits = re.findall(r'\b([A-E])\b', text[-400:])
    if not tail_hits:
        return None
    return tail_hits[-1].upper()

def clean_output(text):
    """Swap whatever sits between abs_vis_token tags for a ``<latent>`` placeholder."""
    span = re.compile(r'(<abs_vis_token>)(.*?)(</abs_vis_token>)', flags=re.DOTALL)
    # Keep the opening and closing tags, drop the payload in between.
    return span.sub(lambda hit: hit.group(1) + '<latent>' + hit.group(3), text)

# One generation per remaining row. A failure on a row is logged and the row is
# left out of `results`, so a later run retries it.
for i, row in enumerate(remaining, 1):
    try:
        # Decode inside the try so an unreadable image skips the row instead of
        # aborting the run; the context manager closes the file handle.
        with Image.open(VSTAR_DIR / row['image']) as _im:
            image = _im.convert('RGB')
        inputs = lg.vllm_mllm_process_batch_from_messages([[
            {'role': 'user', 'content': [
                {'type': 'text',  'text':  row['text']},
                {'type': 'image', 'image': image},
            ]},
        ]], processor)
        output = mllm.generate(inputs, sampling_params=sampling_params, use_tqdm=False)[0].outputs[0]
    except Exception:
        print(f'[{i}/{len(remaining)}] ERROR {row["question_id"]}:\n{traceback.format_exc()}')
        output = None

    if output is not None:
        token_ids    = list(getattr(output, 'token_ids', []) or [])
        predicted    = extract_answer(output.text or '')
        ground_truth = str(row.get('label', '')).strip().upper()
        results.append({
            'question_id':  row['question_id'],
            'category':     row.get('category'),
            'image':        row['image'],
            'question':     row['text'],
            'output':       clean_output(output.text or ''),
            'predicted':    predicted,
            'ground_truth': ground_truth,
            'correct':      predicted == ground_truth,
            'used_latent':  151666 in token_ids or 151667 in token_ids,
            'finish':       output.finish_reason,
            'tokens':       len(token_ids),
        })

    # Checkpoint on schedule and on the last row, whether or not this row
    # succeeded; otherwise an error on the final row would leave the tail of
    # the run unsaved.
    if i % SAVE_EVERY == 0 or i == len(remaining):
        # Write to a sibling temp file and rename it over the target, so an
        # interrupted write cannot leave a truncated results file.
        _tmp = RESULTS_FILE.with_name(RESULTS_FILE.name + '.tmp')
        _tmp.write_text(json.dumps(results, indent=2))
        _tmp.replace(RESULTS_FILE)
        n       = len(results)
        correct = sum(r['correct'] for r in results)
        latent  = sum(r['used_latent'] for r in results)
        print(f'[{i}/{len(remaining)}] saved={n} acc={correct/max(n,1):.3f} latent={latent/max(n,1):.3f}')

# Summary over everything in `results`, including rows loaded from a previous run.
n       = len(results)
correct = sum(r['correct'] for r in results)
latent  = sum(r['used_latent'] for r in results)
print(f'\n=== V*Bench Results (n={n}) ===')
print(f'accuracy:    {correct}/{n} = {correct/max(n,1):.3f}')
print(f'latent_rate: {latent}/{n} = {latent/max(n,1):.3f}')
print(f'results →    {RESULTS_FILE}')

In [None]:
# 5) MME-RealWorld-Lite — Monet-7B benchmark
import importlib, json, os, random, re, sys, traceback
from collections import Counter
from pathlib import Path
from PIL import Image
from transformers import AutoProcessor

# ── Configuration ─────────────────────────────────────────────────────────
WORK          = Path('/content/work/multimodal_ml')
MODEL_DIR     = Path('/content/work/models/Monet-7B')
MME_DIR       = WORK / 'datasets/MME-RealWorld-Lite'
N_SAMPLE      = 20   # number of annotation rows evaluated by this cell
SEED          = 0
MAX_MODEL_LEN = 32768

# Set before the Monet/vLLM modules are imported below.
# NOTE(review): presumably read by the Monet vLLM patch — confirm.
os.environ.update({
    'LATENT_SIZE': '10', 'LATENT_START_ID': '151666', 'LATENT_END_ID': '151667',
    'VLLM_USE_V1': '1', 'VLLM_WORKER_MULTIPROC_METHOD': 'spawn', 'VLLM_ENABLE_V1_MULTIPROCESSING': '0',
})
random.seed(SEED)

# ── Monet inference module ────────────────────────────────────────────────
sys.path.insert(0, str(WORK / 'monet'))
import inference.apply_vllm_monet
import inference.load_and_gen_vllm as lg

# Flip two engine flags in the repo's source file, then reload the module so
# the edited source is the one in use.
_f = WORK / 'monet/inference/load_and_gen_vllm.py'
_src = _f.read_text()
_src = _src.replace('enable_sleep_mode=True',     'enable_sleep_mode=False')
_src = _src.replace('enable_chunked_prefill=True', 'enable_chunked_prefill=False')
_f.write_text(_src)
lg = importlib.reload(lg)
lg.tqdm = lambda it, **kw: it   # replace the module's tqdm with a pass-through

# ── Engine — reinit if config changed, otherwise reuse ───────────────────
from vllm import LLM
try:
    _reuse = (isinstance(mllm, LLM)
              and mllm.llm_engine.model_config.max_model_len == MAX_MODEL_LEN)
except Exception:
    # `mllm` is undefined on a fresh kernel (NameError) or lacks the expected
    # attributes. `Exception` rather than a bare except so Ctrl-C still interrupts.
    _reuse = False

if not _reuse:
    try:
        mllm.shutdown()
    except Exception:
        pass   # best effort: there may be no previous engine, or it may not expose shutdown()
    import subprocess
    _smi = subprocess.check_output(
        ['nvidia-smi', '--query-gpu=memory.total,memory.free', '--format=csv,noheader,nounits'],
        text=True)
    # nvidia-smi prints one CSV row per GPU; only the first row is used.
    # NOTE(review): assumes the engine (tp=1) runs on the first listed GPU — confirm.
    total_gib, free_gib = [float(x) / 1024 for x in _smi.strip().splitlines()[0].split(',')]
    # Leave about 4 GiB of headroom and never request more than 92% of the card.
    util = round(min(0.92, (free_gib - 4) / total_gib), 3)
    if util <= 0:
        raise RuntimeError(
            f'Only {free_gib:.1f} GiB of {total_gib:.1f} GiB GPU memory is free; '
            'restart the runtime or free GPU memory before initialising the engine.')
    mllm, sampling_params = lg.vllm_mllm_init(
        str(MODEL_DIR), tp=1, gpu_memory_utilization=util, max_model_len=MAX_MODEL_LEN)
    processor = AutoProcessor.from_pretrained(str(MODEL_DIR), trust_remote_code=True)

sampling_params.seed = SEED

# ── Dataset ───────────────────────────────────────────────────────────────
# Context manager so the annotation file handle is closed after parsing.
with open(MME_DIR / 'data/MME-RealWorld-Lite.json', encoding='utf-8') as f:
    rows = json.load(f)
print(f'{len(rows)} annotations | keys: {list(rows[0].keys()) if rows else []}')

# File name → path for every image under data/ (a later duplicate name
# overwrites an earlier one).
image_index = {
    f.name: f for f in (MME_DIR / 'data').rglob('*')
    if f.suffix.lower() in {'.jpg', '.jpeg', '.png', '.webp', '.bmp'}
}

# Draw the sample from a private RNG seeded with SEED, so the selection does
# not depend on whatever touched the global `random` state since the seed call
# above. NOTE(review): engine start-up may reseed or consume it — not verified.
samples = random.Random(SEED).sample(rows, min(N_SAMPLE, len(rows)))

# ── Eval ──────────────────────────────────────────────────────────────────
def extract_answer(text):
    """Return the predicted option letter (A-E) found in ``text``, or ``None``.

    Preference order: the first ``\\boxed{...}`` whose stripped content is a
    single letter A-E in either case; failing that, the last standalone capital
    A-E within the final 400 characters.
    """
    first_box = re.search(r'\\boxed\{([^}]*)\}', text)
    inner = first_box.group(1).strip() if first_box else None
    if inner is not None and re.fullmatch(r'[A-Ea-e]', inner):
        return inner.upper()
    standalone = re.findall(r'\b([A-E])\b', text[-400:])
    if standalone:
        return standalone[-1].upper()
    return None

finish_counts = Counter()   # finish_reason → number of generations
skip_counts   = Counter()   # reason → number of samples skipped
num_correct   = 0
num_latent    = 0

for i, row in enumerate(samples, 1):
    # Images are looked up by bare file name in the index built above.
    image_path = image_index.get(Path(row['Image']).name)
    if image_path is None:
        skip_counts['no_image'] += 1
        print(f'[{i:02d}] SKIP (not found): {row["Image"]}')
        continue

    question     = row['Text']
    # str() so a missing or non-string label counts as "no ground truth"
    # instead of raising.
    ground_truth = str(row.get('Ground truth', '')).strip().upper()
    choices      = row.get('Answer choices', [])
    if choices:
        # Add a '(A)'-style label only to choices that do not already start
        # with one, to avoid prompts like '(A) (A) ...'.
        # NOTE(review): the repo's example question in this notebook has
        # choices that already carry such labels — confirm against the dataset.
        labelled = [
            str(c) if re.match(r'\s*\([A-Za-z]\)', str(c)) else f'({chr(65+j)}) {c}'
            for j, c in enumerate(choices)
        ]
        question += '\n' + '\n'.join(labelled)
    question += '\nPut your final answer in \\boxed{}.'

    try:
        # Decode inside the try so an unreadable image is counted as an error
        # instead of aborting the loop; the context manager closes the handle.
        with Image.open(image_path) as _im:
            image = _im.convert('RGB')
        inputs = lg.vllm_mllm_process_batch_from_messages([[
            {'role': 'user', 'content': [{'type': 'text', 'text': question},
                                         {'type': 'image', 'image': image}]},
        ]], processor)
        output = mllm.generate(inputs, sampling_params=sampling_params, use_tqdm=False)[0].outputs[0]
    except Exception:
        skip_counts['error'] += 1
        print(f'[{i:02d}] ERROR:\n{traceback.format_exc()}')
        continue

    raw_text    = output.text or ''
    predicted   = extract_answer(raw_text)
    token_ids   = list(getattr(output, 'token_ids', []) or [])
    correct     = (predicted == ground_truth) if ground_truth else False
    used_latent = 151666 in token_ids or 151667 in token_ids

    finish_counts[str(output.finish_reason)] += 1
    num_correct += correct
    num_latent  += used_latent

    # Show only the flattened tail of the reply to keep the log short.
    tail = raw_text.replace('\n', ' ').strip()[-300:]
    print(f'[{i:02d}] correct={correct} latent={used_latent} pred={predicted} gt={ground_truth} finish={output.finish_reason} tokens={len(token_ids)}')
    print(f'     ...{tail}')

# Skipped samples (missing image or generation error) are excluded from the denominator.
processed = len(samples) - sum(skip_counts.values())
print(f'\n=== Results (n={processed}) ===')
print(f'accuracy:    {num_correct}/{processed} = {num_correct / max(processed, 1):.3f}')
print(f'latent_rate: {num_latent}/{processed} = {num_latent / max(processed, 1):.3f}')
print(f'finish:      {dict(finish_counts)}')
if skip_counts:
    print(f'skipped:     {dict(skip_counts)}')

In [None]:
# 6) Cleanup — free GPU memory
# Note: vLLM holds a CUDA private pool; full release requires a runtime restart.
# This does best-effort cleanup for repeated runs within the same session.
import gc, torch

try:    mllm.shutdown()
except: pass
try:    del mllm, sampling_params, processor
except: pass

gc.collect()
torch.cuda.synchronize()
torch.cuda.empty_cache()
!pkill -f vllm || true
!pkill -f ray  || true

print('Cleanup done. For full GPU release, restart the runtime.')

In [None]:
# 7) Repo inference example — vllm_inference_example.py reproduced in-notebook
# Runs the exact conversation from monet/inference/vllm_inference_example.py.
import importlib, os, re, subprocess, sys
from pathlib import Path
from PIL import Image
from transformers import AutoProcessor

# ── Configuration ─────────────────────────────────────────────────────────
WORK      = Path('/content/work/multimodal_ml')
MODEL_DIR = Path('/content/work/models/Monet-7B')

# Set before the Monet/vLLM modules are imported below.
# NOTE(review): presumably read by the Monet vLLM patch — confirm.
os.environ.update({
    'LATENT_SIZE': '10', 'LATENT_START_ID': '151666', 'LATENT_END_ID': '151667',
    'VLLM_USE_V1': '1', 'VLLM_WORKER_MULTIPROC_METHOD': 'spawn', 'VLLM_ENABLE_V1_MULTIPROCESSING': '0',
})

# ── Monet inference module ────────────────────────────────────────────────
sys.path.insert(0, str(WORK / 'monet'))
import inference.apply_vllm_monet
import inference.load_and_gen_vllm as lg

# Flip two engine flags in the repo's source file, then reload the module so
# the edited source is the one in use.
_f = WORK / 'monet/inference/load_and_gen_vllm.py'
_src = _f.read_text()
_src = _src.replace('enable_sleep_mode=True',      'enable_sleep_mode=False')
_src = _src.replace('enable_chunked_prefill=True',  'enable_chunked_prefill=False')
_f.write_text(_src)
lg = importlib.reload(lg)
lg.tqdm = lambda it, **kw: it   # replace the module's tqdm with a pass-through

# ── Engine — reuse any engine already alive in this session ──────────────
from vllm import LLM
try:
    _reuse = isinstance(mllm, LLM)
except NameError:
    _reuse = False

if not _reuse:
    _smi = subprocess.check_output(
        ['nvidia-smi', '--query-gpu=memory.total,memory.free', '--format=csv,noheader,nounits'],
        text=True)
    # nvidia-smi prints one CSV row per GPU; only the first row is used.
    # NOTE(review): assumes the engine (tp=1) runs on the first listed GPU — confirm.
    total_gib, free_gib = [float(x) / 1024 for x in _smi.strip().splitlines()[0].split(',')]
    # Leave about 4 GiB of headroom and never request more than 92% of the card.
    util = round(min(0.92, (free_gib - 4) / total_gib), 3)
    if util <= 0:
        raise RuntimeError(
            f'Only {free_gib:.1f} GiB of {total_gib:.1f} GiB GPU memory is free; '
            'restart the runtime or free GPU memory before initialising the engine.')
    mllm, sampling_params = lg.vllm_mllm_init(str(MODEL_DIR), tp=1, gpu_memory_utilization=util)
    processor = AutoProcessor.from_pretrained(str(MODEL_DIR), trust_remote_code=True)

# ── Verbatim from vllm_inference_example.py ──────────────────────────────
def replace_abs_vis_token_content(s: str) -> str:
    """Replace the text between abs_vis_token tags with a ``<latent>`` placeholder.

    Every ``<abs_vis_token>...</abs_vis_token>`` span in ``s`` keeps its opening
    and closing tag; whatever lies between them (newlines included) becomes the
    literal ``<latent>``. Text outside such spans is returned unchanged.
    """
    # Non-greedy body so each tag pair is handled on its own; DOTALL lets the
    # body span line breaks.
    pattern = re.compile(r'(<abs_vis_token>)(.*?)(</abs_vis_token>)', flags=re.DOTALL)
    return pattern.sub(r'\1<latent>\3', s)

# Prompt text of the example, assembled from its individual pieces.
question = ''.join((
    'Question:  Which car has the longest rental period? ',
    'The choices are listed below:\n',
    '(A)DB11 COUPE.\n',
    '(B) V12 VANTAGES COUPES.\n',
    '(C) VANQUISH VOLANTE.\n',
    '(D) V12 VOLANTE.\n',
    '(E) The image does not feature the time. ',
    'Put your final answer in \\boxed{}.',
))
image = Image.open(WORK / 'monet/images/example_question.png').convert('RGB')

# Single-turn conversation: the question text followed by the image.
_user_turn = {
    'role': 'user',
    'content': [
        {'type': 'text', 'text': question},
        {'type': 'image', 'image': image},
    ],
}
inputs = lg.vllm_mllm_process_batch_from_messages([[_user_turn]], processor)

_generations = mllm.generate(inputs, sampling_params=sampling_params, use_tqdm=False)
output    = _generations[0].outputs[0]
token_ids = list(getattr(output, 'token_ids', []) or [])
# True when either latent marker id appears among the generated token ids.
latent    = any(marker in token_ids for marker in (151666, 151667))

print(f'finish={output.finish_reason}  tokens={len(token_ids)}  latent={latent}')
print(replace_abs_vis_token_content(output.text or ''))

In [None]:
# 8) Geometry3K — Monet-7B full test set evaluation
import importlib, json, os, re, subprocess, sys, traceback
from pathlib import Path
from PIL import Image
from transformers import AutoProcessor

# ── Configuration ─────────────────────────────────────────────────────────
WORK          = Path('/content/work/multimodal_ml')
MODEL_DIR     = Path('/content/work/models/Monet-7B')
GEO3K_DIR     = WORK / 'datasets/geometry_3k/test'
RESULTS_FILE  = WORK / 'results/geo3k_results.json'
MAX_MODEL_LEN = 32768
SEED          = 0
SAVE_EVERY    = 50   # checkpoint the results file every N processed problems

# Set before the Monet/vLLM modules are imported below.
# NOTE(review): presumably read by the Monet vLLM patch — confirm.
os.environ.update({
    'LATENT_SIZE': '10', 'LATENT_START_ID': '151666', 'LATENT_END_ID': '151667',
    'VLLM_USE_V1': '1', 'VLLM_WORKER_MULTIPROC_METHOD': 'spawn', 'VLLM_ENABLE_V1_MULTIPROCESSING': '0',
})
RESULTS_FILE.parent.mkdir(parents=True, exist_ok=True)

# ── Monet inference module ────────────────────────────────────────────────
sys.path.insert(0, str(WORK / 'monet'))
import inference.apply_vllm_monet
import inference.load_and_gen_vllm as lg

# Flip two engine flags in the repo's source file, then reload the module so
# the edited source is the one in use.
_f = WORK / 'monet/inference/load_and_gen_vllm.py'
_src = _f.read_text()
_src = _src.replace('enable_sleep_mode=True',     'enable_sleep_mode=False')
_src = _src.replace('enable_chunked_prefill=True', 'enable_chunked_prefill=False')
_f.write_text(_src)
lg = importlib.reload(lg)
lg.tqdm = lambda it, **kw: it   # replace the module's tqdm with a pass-through

# ── Engine — reinit if config changed, otherwise reuse ───────────────────
from vllm import LLM
try:
    _reuse = (isinstance(mllm, LLM)
              and mllm.llm_engine.model_config.max_model_len == MAX_MODEL_LEN)
except Exception:
    # `mllm` is undefined on a fresh kernel (NameError) or lacks the expected
    # attributes. `Exception` rather than a bare except so Ctrl-C still interrupts.
    _reuse = False

if not _reuse:
    try:
        mllm.shutdown()
    except Exception:
        pass   # best effort: there may be no previous engine, or it may not expose shutdown()
    _smi = subprocess.check_output(
        ['nvidia-smi', '--query-gpu=memory.total,memory.free', '--format=csv,noheader,nounits'],
        text=True)
    # nvidia-smi prints one CSV row per GPU; only the first row is used.
    # NOTE(review): assumes the engine (tp=1) runs on the first listed GPU — confirm.
    total_gib, free_gib = [float(x) / 1024 for x in _smi.strip().splitlines()[0].split(',')]
    # Leave about 4 GiB of headroom and never request more than 92% of the card.
    util = round(min(0.92, (free_gib - 4) / total_gib), 3)
    if util <= 0:
        raise RuntimeError(
            f'Only {free_gib:.1f} GiB of {total_gib:.1f} GiB GPU memory is free; '
            'restart the runtime or free GPU memory before initialising the engine.')
    mllm, sampling_params = lg.vllm_mllm_init(
        str(MODEL_DIR), tp=1, gpu_memory_utilization=util, max_model_len=MAX_MODEL_LEN)
    processor = AutoProcessor.from_pretrained(str(MODEL_DIR), trust_remote_code=True)

sampling_params.seed = SEED

# ── Dataset — resume from checkpoint if interrupted ───────────────────────
# Problem folders are named by integer id. Anything else in the test folder
# (stray files, hidden entries) is ignored instead of breaking the numeric sort.
problem_dirs = sorted(
    (p for p in GEO3K_DIR.iterdir() if p.is_dir() and p.name.isdecimal()),
    key=lambda p: int(p.name))

# Problems already present in the results file are skipped on this run.
results  = json.loads(RESULTS_FILE.read_text()) if RESULTS_FILE.exists() else []
done_ids = {r['id'] for r in results}
remaining = [d for d in problem_dirs if str(d.name) not in done_ids]
print(f'{len(problem_dirs)} total | {len(done_ids)} done | {len(remaining)} remaining')

# ── Helpers ───────────────────────────────────────────────────────────────
def build_prompt(problem_text, choices):
    """Format a multiple-choice prompt: question, lettered choices, answer-format instruction."""
    first_letter = ord('A')
    # Choices are lettered (A), (B), ... in the order given.
    option_lines = [
        f'({chr(first_letter + position)}) {option}'
        for position, option in enumerate(choices)
    ]
    return '\n'.join([
        f'Question: {problem_text}',
        'Choices:',
        *option_lines,
        'Put your final answer in \\boxed{}.',
    ])

def extract_answer(text, choices):
    """Map the model's reply to an option letter (A-D), or ``None``.

    The first ``\\boxed{...}`` is checked first: a single letter A-D (either
    case) is returned upper-cased; otherwise its content is compared with each
    choice after upper-casing and collapsing whitespace runs, and the letter of
    the first equal choice is returned. If neither applies, the last standalone
    capital A-D within the final 400 characters is used.
    """
    def _canon(value):
        # Upper-case and squeeze runs of whitespace to single spaces.
        return re.sub(r'\s+', ' ', value.strip().upper())

    found = re.search(r'\\boxed\{([^}]*)\}', text)
    if found is not None:
        inner = found.group(1).strip()
        if re.fullmatch(r'[A-Da-d]', inner) is not None:
            return inner.upper()
        wanted = _canon(inner)
        for position, option in enumerate(choices):
            if _canon(option) == wanted:
                return chr(65 + position)
    tail_letters = re.findall(r'\b([A-D])\b', text[-400:])
    return tail_letters[-1].upper() if tail_letters else None

def clean_output(text):
    """Replace the content between abs_vis_token tags with ``<latent>``, keeping the tags."""
    tagged_span = re.compile(r'(<abs_vis_token>)(.*?)(</abs_vis_token>)', flags=re.DOTALL)
    return tagged_span.sub(lambda hit: hit.group(1) + '<latent>' + hit.group(3), text)

# ── Inference ─────────────────────────────────────────────────────────────
# One generation per remaining problem. A failure is logged and the problem is
# left out of `results`, so a later run retries it.
for i, prob_dir in enumerate(remaining, 1):
    try:
        # Reading the problem files and decoding the diagram happen inside the
        # try so one malformed problem folder cannot abort the whole run; the
        # context manager closes the image file handle.
        data     = json.loads((prob_dir / 'data.json').read_text())
        question = build_prompt(data['problem_text'], data['choices'])
        with Image.open(prob_dir / 'img_diagram.png') as _im:
            image = _im.convert('RGB')
        inputs = lg.vllm_mllm_process_batch_from_messages([[
            {'role': 'user', 'content': [
                {'type': 'text',  'text':  question},
                {'type': 'image', 'image': image},
            ]},
        ]], processor)
        output = mllm.generate(inputs, sampling_params=sampling_params, use_tqdm=False)[0].outputs[0]
    except Exception:
        print(f'[{i}/{len(remaining)}] ERROR {prob_dir.name}:\n{traceback.format_exc()}')
        output = None

    if output is not None:
        token_ids    = list(getattr(output, 'token_ids', []) or [])
        predicted    = extract_answer(output.text or '', data['choices'])
        ground_truth = str(data['answer']).strip().upper()
        results.append({
            'id':           str(prob_dir.name),
            'problem_type': data.get('problem_type_graph', []),
            'goal_type':    data.get('problem_type_goal', []),
            'question':     question,
            'output':       clean_output(output.text or ''),
            'predicted':    predicted,
            'ground_truth': ground_truth,
            'correct':      predicted == ground_truth,
            'used_latent':  151666 in token_ids or 151667 in token_ids,
            'finish':       output.finish_reason,
            'tokens':       len(token_ids),
        })

    # Checkpoint on schedule and on the last problem, whether or not this one
    # succeeded; otherwise an error on the final problem would leave the tail
    # of the run unsaved.
    if i % SAVE_EVERY == 0 or i == len(remaining):
        # Write to a sibling temp file and rename it over the target, so an
        # interrupted write cannot leave a truncated results file.
        _tmp = RESULTS_FILE.with_name(RESULTS_FILE.name + '.tmp')
        _tmp.write_text(json.dumps(results, indent=2))
        _tmp.replace(RESULTS_FILE)
        n       = len(results)
        correct = sum(r['correct'] for r in results)
        latent  = sum(r['used_latent'] for r in results)
        print(f'[{i}/{len(remaining)}] saved={n} acc={correct/max(n,1):.3f} latent={latent/max(n,1):.3f}')

# Summary over everything in `results`, including problems loaded from a previous run.
n       = len(results)
correct = sum(r['correct'] for r in results)
latent  = sum(r['used_latent'] for r in results)
print(f'\n=== Geometry3K Results (n={n}) ===')
print(f'accuracy:    {correct}/{n} = {correct/max(n,1):.3f}')
print(f'latent_rate: {latent}/{n} = {latent/max(n,1):.3f}')
print(f'results →    {RESULTS_FILE}')

In [None]:
# 9) Geometry3K — text-only baseline (question + choices, no image)
import importlib, json, os, re, subprocess, sys, traceback
from pathlib import Path
from transformers import AutoProcessor

# ── Configuration ─────────────────────────────────────────────────────────
WORK          = Path('/content/work/multimodal_ml')
MODEL_DIR     = Path('/content/work/models/Monet-7B')
GEO3K_DIR     = WORK / 'datasets/geometry_3k/test'
RESULTS_FILE  = WORK / 'results/geo3k_results_no_image.json'
MAX_MODEL_LEN = 32768
SEED          = 0
SAVE_EVERY    = 50   # checkpoint the results file every N processed problems

# Set before the Monet/vLLM modules are imported below.
# NOTE(review): presumably read by the Monet vLLM patch — confirm.
os.environ.update({
    'LATENT_SIZE': '10', 'LATENT_START_ID': '151666', 'LATENT_END_ID': '151667',
    'VLLM_USE_V1': '1', 'VLLM_WORKER_MULTIPROC_METHOD': 'spawn', 'VLLM_ENABLE_V1_MULTIPROCESSING': '0',
})
RESULTS_FILE.parent.mkdir(parents=True, exist_ok=True)

# ── Monet inference module ────────────────────────────────────────────────
sys.path.insert(0, str(WORK / 'monet'))
import inference.apply_vllm_monet
import inference.load_and_gen_vllm as lg

# Flip two engine flags in the repo's source file, then reload the module so
# the edited source is the one in use.
_f = WORK / 'monet/inference/load_and_gen_vllm.py'
_src = _f.read_text()
_src = _src.replace('enable_sleep_mode=True',     'enable_sleep_mode=False')
_src = _src.replace('enable_chunked_prefill=True', 'enable_chunked_prefill=False')
_f.write_text(_src)
lg = importlib.reload(lg)
lg.tqdm = lambda it, **kw: it   # replace the module's tqdm with a pass-through

# ── Engine — reinit if config changed, otherwise reuse ───────────────────
from vllm import LLM
try:
    _reuse = (isinstance(mllm, LLM)
              and mllm.llm_engine.model_config.max_model_len == MAX_MODEL_LEN)
except Exception:
    # `mllm` is undefined on a fresh kernel (NameError) or lacks the expected
    # attributes. `Exception` rather than a bare except so Ctrl-C still interrupts.
    _reuse = False

if not _reuse:
    try:
        mllm.shutdown()
    except Exception:
        pass   # best effort: there may be no previous engine, or it may not expose shutdown()
    _smi = subprocess.check_output(
        ['nvidia-smi', '--query-gpu=memory.total,memory.free', '--format=csv,noheader,nounits'],
        text=True)
    # nvidia-smi prints one CSV row per GPU; only the first row is used.
    # NOTE(review): assumes the engine (tp=1) runs on the first listed GPU — confirm.
    total_gib, free_gib = [float(x) / 1024 for x in _smi.strip().splitlines()[0].split(',')]
    # Leave about 4 GiB of headroom and never request more than 92% of the card.
    util = round(min(0.92, (free_gib - 4) / total_gib), 3)
    if util <= 0:
        raise RuntimeError(
            f'Only {free_gib:.1f} GiB of {total_gib:.1f} GiB GPU memory is free; '
            'restart the runtime or free GPU memory before initialising the engine.')
    mllm, sampling_params = lg.vllm_mllm_init(
        str(MODEL_DIR), tp=1, gpu_memory_utilization=util, max_model_len=MAX_MODEL_LEN)
    processor = AutoProcessor.from_pretrained(str(MODEL_DIR), trust_remote_code=True)

sampling_params.seed = SEED

# ── Dataset — resume from checkpoint if interrupted ───────────────────────
# Problem folders are named by integer id. Anything else in the test folder
# (stray files, hidden entries) is ignored instead of breaking the numeric sort.
problem_dirs = sorted(
    (p for p in GEO3K_DIR.iterdir() if p.is_dir() and p.name.isdecimal()),
    key=lambda p: int(p.name))

# Problems already present in the results file are skipped on this run.
results  = json.loads(RESULTS_FILE.read_text()) if RESULTS_FILE.exists() else []
done_ids = {r['id'] for r in results}
remaining = [d for d in problem_dirs if str(d.name) not in done_ids]
print(f'{len(problem_dirs)} total | {len(done_ids)} done | {len(remaining)} remaining')

# ── Helpers (identical to cell 8) ────────────────────────────────────────
def build_prompt(problem_text, choices):
    """Render the question and its lettered choices, ending with the boxed-answer instruction."""
    parts = [f'Question: {problem_text}', 'Choices:']
    # Letters are assigned A, B, C, ... in the order the choices are given.
    parts += [f'({chr(65 + position)}) {option}' for position, option in enumerate(choices)]
    parts += ['Put your final answer in \\boxed{}.']
    return '\n'.join(parts)

def extract_answer(text, choices):
    """Resolve the reply to an option letter A-D; ``None`` if nothing usable is found.

    Order of attempts: (1) the first ``\\boxed{...}`` holding a single letter
    A-D in either case; (2) that boxed content equal to one of ``choices``
    after upper-casing and collapsing whitespace; (3) the last standalone
    capital A-D in the final 400 characters of ``text``.
    """
    def _squash(value):
        # Case-insensitive, whitespace-insensitive comparison key.
        return re.sub(r'\s+', ' ', value.strip().upper())

    box = re.search(r'\\boxed\{([^}]*)\}', text)
    if box:
        content = box.group(1).strip()
        if re.fullmatch(r'[A-Da-d]', content):
            return content.upper()
        key = _squash(content)
        for slot, option in enumerate(choices):
            if key == _squash(option):
                return chr(65 + slot)
    trailing = re.findall(r'\b([A-D])\b', text[-400:])
    if trailing:
        return trailing[-1].upper()
    return None

# ── Inference — text only; bypass multimodal processor to avoid None image ──
# One text-only generation per remaining problem. A failure is logged and the
# problem is left out of `results`, so a later run retries it.
for i, prob_dir in enumerate(remaining, 1):
    try:
        # Reading the problem file happens inside the try so one malformed
        # problem folder cannot abort the whole run.
        data     = json.loads((prob_dir / 'data.json').read_text())
        question = build_prompt(data['problem_text'], data['choices'])
        # Render the chat template to text and tokenize it directly, so the
        # engine receives token ids and no image input.
        text   = processor.apply_chat_template(
            [{'role': 'user', 'content': question}],
            tokenize=False, add_generation_prompt=True)
        inputs = [{'prompt_token_ids': processor.tokenizer.encode(text, add_special_tokens=False)}]
        output = mllm.generate(inputs, sampling_params=sampling_params, use_tqdm=False)[0].outputs[0]
    except Exception:
        print(f'[{i}/{len(remaining)}] ERROR {prob_dir.name}:\n{traceback.format_exc()}')
        output = None

    if output is not None:
        token_ids    = list(getattr(output, 'token_ids', []) or [])
        predicted    = extract_answer(output.text or '', data['choices'])
        ground_truth = str(data['answer']).strip().upper()
        results.append({
            'id':           str(prob_dir.name),
            'problem_type': data.get('problem_type_graph', []),
            'goal_type':    data.get('problem_type_goal', []),
            'question':     question,
            'output':       output.text or '',
            'predicted':    predicted,
            'ground_truth': ground_truth,
            'correct':      predicted == ground_truth,
            'used_latent':  151666 in token_ids or 151667 in token_ids,
            'finish':       output.finish_reason,
            'tokens':       len(token_ids),
        })

    # Checkpoint on schedule and on the last problem, whether or not this one
    # succeeded; otherwise an error on the final problem would leave the tail
    # of the run unsaved.
    if i % SAVE_EVERY == 0 or i == len(remaining):
        # Write to a sibling temp file and rename it over the target, so an
        # interrupted write cannot leave a truncated results file.
        _tmp = RESULTS_FILE.with_name(RESULTS_FILE.name + '.tmp')
        _tmp.write_text(json.dumps(results, indent=2))
        _tmp.replace(RESULTS_FILE)
        n       = len(results)
        correct = sum(r['correct'] for r in results)
        print(f'[{i}/{len(remaining)}] saved={n} acc={correct/max(n,1):.3f}')

# Summary over everything in `results`, including problems loaded from a previous run.
n       = len(results)
correct = sum(r['correct'] for r in results)
print(f'\n=== Geometry3K No-Image Results (n={n}) ===')
print(f'accuracy: {correct}/{n} = {correct/max(n,1):.3f}')
print(f'results → {RESULTS_FILE}')


In [None]:
# 10) Geometry3K — logic form baseline (question + choices + diagram logic form, no image)
import importlib, json, os, re, subprocess, sys, traceback
from pathlib import Path
from transformers import AutoProcessor

# ── Configuration ─────────────────────────────────────────────────────────
WORK          = Path('/content/work/multimodal_ml')
MODEL_DIR     = Path('/content/work/models/Monet-7B')
GEO3K_DIR     = WORK / 'datasets/geometry_3k/test'
RESULTS_FILE  = WORK / 'results/geo3k_results_logic_form.json'
MAX_MODEL_LEN = 32768
SEED          = 0
SAVE_EVERY    = 50   # checkpoint the results file every N processed problems

# Set before the Monet/vLLM modules are imported below.
# NOTE(review): presumably read by the Monet vLLM patch — confirm.
os.environ.update({
    'LATENT_SIZE': '10', 'LATENT_START_ID': '151666', 'LATENT_END_ID': '151667',
    'VLLM_USE_V1': '1', 'VLLM_WORKER_MULTIPROC_METHOD': 'spawn', 'VLLM_ENABLE_V1_MULTIPROCESSING': '0',
})
RESULTS_FILE.parent.mkdir(parents=True, exist_ok=True)

# ── Monet inference module ────────────────────────────────────────────────
sys.path.insert(0, str(WORK / 'monet'))
import inference.apply_vllm_monet
import inference.load_and_gen_vllm as lg

# Flip two engine flags in the repo's source file, then reload the module so
# the edited source is the one in use.
_f = WORK / 'monet/inference/load_and_gen_vllm.py'
_src = _f.read_text()
_src = _src.replace('enable_sleep_mode=True',     'enable_sleep_mode=False')
_src = _src.replace('enable_chunked_prefill=True', 'enable_chunked_prefill=False')
_f.write_text(_src)
lg = importlib.reload(lg)
lg.tqdm = lambda it, **kw: it   # replace the module's tqdm with a pass-through

# ── Engine — reinit if config changed, otherwise reuse ───────────────────
from vllm import LLM
try:
    _reuse = (isinstance(mllm, LLM)
              and mllm.llm_engine.model_config.max_model_len == MAX_MODEL_LEN)
except Exception:
    # `mllm` is undefined on a fresh kernel (NameError) or lacks the expected
    # attributes. `Exception` rather than a bare except so Ctrl-C still interrupts.
    _reuse = False

if not _reuse:
    try:
        mllm.shutdown()
    except Exception:
        pass   # best effort: there may be no previous engine, or it may not expose shutdown()
    _smi = subprocess.check_output(
        ['nvidia-smi', '--query-gpu=memory.total,memory.free', '--format=csv,noheader,nounits'],
        text=True)
    # nvidia-smi prints one CSV row per GPU; only the first row is used.
    # NOTE(review): assumes the engine (tp=1) runs on the first listed GPU — confirm.
    total_gib, free_gib = [float(x) / 1024 for x in _smi.strip().splitlines()[0].split(',')]
    # Leave about 4 GiB of headroom and never request more than 92% of the card.
    util = round(min(0.92, (free_gib - 4) / total_gib), 3)
    if util <= 0:
        raise RuntimeError(
            f'Only {free_gib:.1f} GiB of {total_gib:.1f} GiB GPU memory is free; '
            'restart the runtime or free GPU memory before initialising the engine.')
    mllm, sampling_params = lg.vllm_mllm_init(
        str(MODEL_DIR), tp=1, gpu_memory_utilization=util, max_model_len=MAX_MODEL_LEN)
    processor = AutoProcessor.from_pretrained(str(MODEL_DIR), trust_remote_code=True)

sampling_params.seed = SEED

# ── Dataset — resume from checkpoint if interrupted ───────────────────────
# Problem folders are named by integer id. Anything else in the test folder
# (stray files, hidden entries) is ignored instead of breaking the numeric sort.
problem_dirs = sorted(
    (p for p in GEO3K_DIR.iterdir() if p.is_dir() and p.name.isdecimal()),
    key=lambda p: int(p.name))

# Problems already present in the results file are skipped on this run.
results  = json.loads(RESULTS_FILE.read_text()) if RESULTS_FILE.exists() else []
done_ids = {r['id'] for r in results}
remaining = [d for d in problem_dirs if str(d.name) not in done_ids]
print(f'{len(problem_dirs)} total | {len(done_ids)} done | {len(remaining)} remaining')

# ── Helpers ───────────────────────────────────────────────────────────────
def build_prompt(problem_text, choices, diagram_logic_form):
    """Build a prompt in which the diagram's logic-form lines stand in for the image.

    Layout: a ``Diagram:`` header, one line per logic-form entry, the question,
    the lettered choices, and the boxed-answer instruction.
    """
    lettered = [f'({chr(65 + position)}) {option}' for position, option in enumerate(choices)]
    return '\n'.join([
        'Diagram:',
        *diagram_logic_form,
        f'Question: {problem_text}',
        'Choices:',
        *lettered,
        'Put your final answer in \\boxed{}.',
    ])

def extract_answer(text, choices):
    """Turn the model's reply into an option letter A-D, or ``None``.

    A single letter A-D (either case) inside the first ``\\boxed{...}`` wins;
    next, boxed content equal to a choice after upper-casing and whitespace
    collapsing yields that choice's letter; last resort is the final standalone
    capital A-D within the last 400 characters.
    """
    def _normalise(value):
        # Upper-case, trim, and reduce whitespace runs to one space.
        return re.sub(r'\s+', ' ', value.strip().upper())

    hit = re.search(r'\\boxed\{([^}]*)\}', text)
    if hit is not None:
        payload = hit.group(1).strip()
        if re.fullmatch(r'[A-Da-d]', payload) is not None:
            return payload.upper()
        payload_key = _normalise(payload)
        for number, option in enumerate(choices):
            if payload_key == _normalise(option):
                return chr(65 + number)
    candidates = re.findall(r'\b([A-D])\b', text[-400:])
    return candidates[-1].upper() if candidates else None

# ── Inference — logic form as image substitute; bypass multimodal processor ──
# One text-only generation per remaining problem, with the diagram's logic form
# in the prompt. A failure is logged and the problem is left out of `results`,
# so a later run retries it.
for i, prob_dir in enumerate(remaining, 1):
    try:
        # Reading both problem files happens inside the try so one missing or
        # malformed file cannot abort the whole run.
        data       = json.loads((prob_dir / 'data.json').read_text())
        logic_form = json.loads((prob_dir / 'logic_form.json').read_text())
        diagram    = logic_form.get('diagram_logic_form', [])
        question   = build_prompt(data['problem_text'], data['choices'], diagram)
        # Render the chat template to text and tokenize it directly, so the
        # engine receives token ids and no image input.
        text   = processor.apply_chat_template(
            [{'role': 'user', 'content': question}],
            tokenize=False, add_generation_prompt=True)
        inputs = [{'prompt_token_ids': processor.tokenizer.encode(text, add_special_tokens=False)}]
        output = mllm.generate(inputs, sampling_params=sampling_params, use_tqdm=False)[0].outputs[0]
    except Exception:
        print(f'[{i}/{len(remaining)}] ERROR {prob_dir.name}:\n{traceback.format_exc()}')
        output = None

    if output is not None:
        token_ids    = list(getattr(output, 'token_ids', []) or [])
        predicted    = extract_answer(output.text or '', data['choices'])
        ground_truth = str(data['answer']).strip().upper()
        results.append({
            'id':           str(prob_dir.name),
            'problem_type': data.get('problem_type_graph', []),
            'goal_type':    data.get('problem_type_goal', []),
            'question':     question,
            'output':       output.text or '',
            'predicted':    predicted,
            'ground_truth': ground_truth,
            'correct':      predicted == ground_truth,
            'used_latent':  151666 in token_ids or 151667 in token_ids,
            'finish':       output.finish_reason,
            'tokens':       len(token_ids),
        })

    # Checkpoint on schedule and on the last problem, whether or not this one
    # succeeded; otherwise an error on the final problem would leave the tail
    # of the run unsaved.
    if i % SAVE_EVERY == 0 or i == len(remaining):
        # Write to a sibling temp file and rename it over the target, so an
        # interrupted write cannot leave a truncated results file.
        _tmp = RESULTS_FILE.with_name(RESULTS_FILE.name + '.tmp')
        _tmp.write_text(json.dumps(results, indent=2))
        _tmp.replace(RESULTS_FILE)
        n       = len(results)
        correct = sum(r['correct'] for r in results)
        print(f'[{i}/{len(remaining)}] saved={n} acc={correct/max(n,1):.3f}')

# Summary over everything in `results`, including problems loaded from a previous run.
n       = len(results)
correct = sum(r['correct'] for r in results)
print(f'\n=== Geometry3K Logic-Form Results (n={n}) ===')
print(f'accuracy: {correct}/{n} = {correct/max(n,1):.3f}')
print(f'results → {RESULTS_FILE}')
