In [None]:
# %%
# %% [0] (optional) allow big uploads + quiet analytics
import os
# Apply Gradio defaults only when the environment has not already chosen a
# value: a larger upload cap (MB) and analytics switched off.
if "GRADIO_MAX_UPLOAD_SIZE" not in os.environ:
    os.environ["GRADIO_MAX_UPLOAD_SIZE"] = "512"  # MB
if "GRADIO_ANALYTICS_ENABLED" not in os.environ:
    os.environ["GRADIO_ANALYTICS_ENABLED"] = "0"

In [None]:
# %%
# %% [1] Imports, env, model
import os, json, time, html, re
from dotenv import load_dotenv
from PIL import Image, ImageOps, ImageDraw
import gradio as gr
from openai import OpenAI
from pathlib import Path  
import datetime

# Folder that holds the bundled sample images
SAMPLE_DIR = Path("sample_images")

SAMPLE_FILES = [
    SAMPLE_DIR / file_name
    for file_name in (
        "bottles.jpg",
        "burger.jpg",
        "car.jpg",
        "man.jpg",
        "vegetables.jpg",
    )
]

# Session-scoped, in-memory record of every tag payload generated
TAG_LOG: list[dict] = []

# Open every sample once at startup and keep it as an RGB PIL image
SAMPLE_IMAGES = [Image.open(sample_path).convert("RGB") for sample_path in SAMPLE_FILES]

# API keys / endpoints are read from api.env (not committed)
load_dotenv("api.env")
API_KEY = os.getenv("MOONDREAM_API_KEY", "").strip()
ENDPOINT = os.getenv("MOONDREAM_ENDPOINT", "http://localhost:2020/v1").strip()

# The OpenAI client is optional: it stays None when no key is configured
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "").strip()
OA_CLIENT = None
if OPENAI_API_KEY:
    OA_CLIENT = OpenAI(api_key=OPENAI_API_KEY)

# Moondream model – cloud when an API key is present, otherwise the local endpoint
import moondream as md
if API_KEY:
    MODEL = md.vl(api_key=API_KEY)
    SOURCE = "moondream-cloud"
else:
    MODEL = md.vl(endpoint=ENDPOINT)
    SOURCE = "moondream-local"


In [None]:
# %%
# %% [2] Helpers

def exif_rgb(img: Image.Image, max_side: int = 1024) -> Image.Image:
    """
    Normalize an input PIL image for model use.

    Steps:
    - Convert to RGB
    - Apply the EXIF orientation (rotate/flip if needed)
    - Downscale so the longest side is at most `max_side` pixels

    Args:
        img: Source PIL image.
        max_side: Upper bound, in pixels, for the longest side of the result.

    Returns:
        The normalized RGB image. Images whose longest side is already within
        `max_side` are not resized.
    """
    img = ImageOps.exif_transpose(img.convert("RGB"))
    w, h = img.size
    longest = max(w, h)
    if longest > max_side:
        scale = max_side / float(longest)
        # Clamp each side to >= 1 px: for extreme aspect ratios the short side
        # would otherwise truncate to 0, which resize() rejects.
        new_size = (max(1, int(w * scale)), max(1, int(h * scale)))
        img = img.resize(new_size, Image.LANCZOS)
    return img


def to_px_box(obj, w, h):
    """
    Scale a normalized bounding box to pixel coordinates.

    `obj` must provide "x_min", "y_min", "x_max" and "y_max"; x values are
    multiplied by `w`, y values by `h`, and each product is truncated to int.
    Returns `[x_min, y_min, x_max, y_max]` in pixels.
    """
    key_and_extent = (("x_min", w), ("y_min", h), ("x_max", w), ("y_max", h))
    return [int(obj[key] * extent) for key, extent in key_and_extent]


def to_px_point(p, w, h):
    """
    Scale a normalized point to pixel coordinates.

    `p` must provide "x" and "y"; they are multiplied by `w` and `h`
    respectively and truncated to int. Returns `[x, y]` in pixels.
    """
    px_x = int(p["x"] * w)
    px_y = int(p["y"] * h)
    return [px_x, px_y]

def crop_from_norm_box(img: Image.Image, box_norm: dict) -> Image.Image:
    """
    Cut out the region described by a normalized box.

    `box_norm` carries "x_min", "y_min", "x_max", "y_max" as fractions of the
    image width/height. The pixel box is clamped to the image bounds; if the
    clamped box has no area, the original image is returned unchanged.
    """
    width, height = img.size

    # Scale to pixels, then clamp to the image bounds.
    left = max(int(box_norm["x_min"] * width), 0)
    top = max(int(box_norm["y_min"] * height), 0)
    right = min(int(box_norm["x_max"] * width), width)
    bottom = min(int(box_norm["y_max"] * height), height)

    has_area = right > left and bottom > top
    if not has_area:
        return img
    return img.crop((left, top, right, bottom))


def jdump(obj) -> str:
    """Serialize `obj` to 2-space-indented JSON, keeping non-ASCII characters as-is."""
    encoder = json.JSONEncoder(ensure_ascii=False, indent=2)
    return encoder.encode(obj)


def make_code_details(json_text: str) -> str:
    """Return an empty string: the "Show Code" block is disabled, so `json_text` is ignored."""
    disabled_markup = ""
    return disabled_markup


def _steer(q: str) -> str:
    """
    Add gentle instructions to a user question so that the model:
    - Answers in one clear sentence
    - Avoids pronouns like 'this' or 'that'
    - Admits uncertainty when needed
    """
    return (
        "Answer the question in one clear, complete sentence. "
        "If you‚Äôre unsure, say you‚Äôre unsure and why. "
        "Avoid deictic words like 'this' or 'that'.\n\n"
        f"Question: {q.strip()}"
    )


def _is_low_info(ans: str) -> bool:
    """
    Heuristic to detect unhelpful answers such as 'this', 'that', or
    very short single words, which should trigger a retry.
    """
    a = (ans or "").strip().lower()
    return (len(a) <= 4) or a in {"this", "that", "yes", "no", "unknown", "n/a"}


# Styled overlays like the playground
def draw_overlay(base: Image.Image, detections=None, points=None) -> Image.Image:
    """
    Paint detection boxes and point markers over a copy of an image.

    Args:
        base: PIL image to draw on; it is converted to RGBA for compositing.
        detections: Optional list of dicts whose 'box' holds `[x1, y1, x2, y2]`.
        points: Optional list of dicts whose 'xy' holds `[x, y]`.

    Returns:
        A new RGB image in which each box is a red outline and each point is
        a layered marker (translucent blue disc, blue ring, white ring,
        white center dot).
    """
    boxes = detections if detections else []
    markers = points if points else []

    # Draw on a transparent layer so the translucent glow blends properly.
    background = base.convert("RGBA")
    layer = Image.new("RGBA", background.size, (0, 0, 0, 0))
    pen = ImageDraw.Draw(layer)

    box_color = (239, 68, 68, 255)   # red-500
    for item in boxes:
        left, top, right, bottom = (int(coord) for coord in item["box"])
        pen.rectangle([left, top, right, bottom], outline=box_color, width=3)

    blue_r, blue_g, blue_b = 59, 130, 246      # blue-500
    # (radius, ellipse style) from the outermost shape to the innermost;
    # the order is the paint order.
    marker_shapes = (
        (16, {"fill": (blue_r, blue_g, blue_b, 70)}),                   # glow
        (11, {"outline": (blue_r, blue_g, blue_b, 255), "width": 3}),   # blue ring
        (7, {"outline": (255, 255, 255, 220), "width": 2}),             # thin white ring
        (3, {"fill": (255, 255, 255, 255)}),                            # center dot
    )
    for item in markers:
        cx, cy = (int(coord) for coord in item["xy"])
        for radius, shape_style in marker_shapes:
            pen.ellipse(
                [cx - radius, cy - radius, cx + radius, cy + radius],
                **shape_style,
            )

    return Image.alpha_composite(background, layer).convert("RGB")


# Output panel sections (PROMPT / REASONING / RESULT)
def _panel_sections(mode: str, sections: dict[str, str], meta: dict,
                    extra_chip: str | None = None) -> str:
    """
    Render a Moondream-style HTML panel for the right-hand result area.

    Args:
        mode: Short label for the mode, e.g. 'query', 'caption', 'point', 'detect', 'agent'.
        sections: Mapping of title -> body text (e.g. {'PROMPT': '...', 'RESULT': '...'}).
        meta: Metadata dict, typically containing 'ms' (latency in ms) and 'source'.
        extra_chip: Optional extra chip text (e.g. 'LENGTH: LONG').

    Returns:
        HTML string used to populate a Gradio HTML component.
    """
    chips = [f"<span class='chip'>{mode.upper()}</span>"]
    if extra_chip:
        chips.append(f"<span class='chip chip--muted'>{html.escape(extra_chip)}</span>")

    rt_ms = meta.get("ms")
    rt_html = (
        f"<div class='meta'>Response time: {int(rt_ms)}ms</div>"
        if isinstance(rt_ms, (int, float)) else ""
    )

    parts = [f"<div class='chips'>{''.join(chips)}</div>"]
    for title, body in sections.items():
        if not body:
            continue
        safe = html.escape(body)
        parts.append(f"<div class='section-h'>{title}</div>")
        parts.append(f"<div class='body'>{safe}</div>")
    parts.append(rt_html)
    return f"<div class='panel'>{''.join(parts)}</div>"


# Caption helper with length control
def caption_with_length(image: Image.Image, length: str) -> tuple[str, dict]:
    """
    Caption `image` at the requested length.

    The native `MODEL.caption(image, length=...)` call is tried first. If it
    raises, a default caption is fetched and `MODEL.query` is asked to rewrite
    it to the requested length; if that rewrite also fails, the default
    caption is used as-is.

    Args:
        image: Image to caption.
        length: "short", "normal", or anything else (treated as long in the
                fallback prompt).

    Returns:
        (text, meta) where meta holds "ms" (elapsed milliseconds), "source"
        and "length".
    """
    t0 = time.time()
    try:
        text = MODEL.caption(image, length=length)["caption"]
    except Exception:
        # Best-effort fallback: rewrite a default-length caption via a query.
        base = MODEL.caption(image)["caption"]
        # Every branch must be an f-string so the caption text is actually
        # interpolated into the rewrite prompt.
        if length == "short":
            prompt = (
                "Rewrite this caption into EXACTLY 1 sentence. "
                f"No headings or line breaks. Caption: {base}"
            )
        elif length == "normal":
            prompt = (
                "Rewrite this caption into EXACTLY 3 sentences as a single paragraph. "
                f"No headings or bullet points. Caption: {base}"
            )
        else:
            prompt = (
                "Expand this caption into TWO short paragraphs separated by a blank line. "
                "Use 4 to 6 sentences total across both paragraphs. "
                "Stay factual and grounded in the image; no lists or headings. "
                f"Caption: {base}"
            )
        try:
            text = MODEL.query(image, prompt)["answer"]
        except Exception:
            text = base

    # A long caption that came back as one paragraph is split in two at a
    # sentence boundary near the middle (only when it has >= 4 sentences).
    if length == "long" and "\n" not in text.strip():
        sents = re.split(r'(?<=[.!?])\s+', text.strip())
        if len(sents) >= 4:
            mid = max(2, len(sents) // 2)
            text = " ".join(sents[:mid]) + "\n\n" + " ".join(sents[mid:])

    meta = {
        "ms": int((time.time() - t0) * 1000),
        "source": SOURCE,
        "length": length,
    }
    return text, meta


def suggest_questions(image: Image.Image, n: int = 3) -> list[str]:
    """
    Ask Moondream for n short, concrete example questions about the image.

    Args:
        image: Image to ask about. `None` yields an empty list.
        n: Maximum number of questions to return; also the number requested
           from the model.

    Returns:
        Up to n question strings. For a non-None image the result falls back
        to generic questions when the model call fails or its reply yields
        nothing usable.
    """
    if image is None:
        return []

    default_qs = [
        "What are the main objects in this image?",
        "What is the setting or environment?",
        "Is there anything unusual or interesting here?",
    ]

    prompt = (
        "You are helping a user explore this image in an interactive playground. "
        f"Analyze the image carefully and propose {n} short, concrete factual questions "
        "the user might ask specifically about THIS image. "
        "Vary the questions so they touch on things like: environment/natural features, "
        "structures/objects, and any visible activity or context if present.\n\n"
        "Respond ONLY with a JSON array of strings, like:\n"
        "[\"Question about the environment\", \"Question about objects\", \"Question about activity\"]"
    )

    try:
        raw = MODEL.query(image, prompt)["answer"].strip()
    except Exception:
        # Model call failed → just use defaults
        return default_qs[:n]

    qs: list[str] = []

    # Try JSON first. Only the outermost [...] span is parsed, so a reply
    # wrapped in code fences or preceded by a short preamble still works.
    start, end = raw.find("["), raw.rfind("]")
    if 0 <= start < end:
        try:
            data = json.loads(raw[start:end + 1])
        except ValueError:
            data = None
        if isinstance(data, list):
            qs = [str(x).strip() for x in data if str(x).strip()]

    # Fallback: split by lines / semicolons / bullets, then drop a leading
    # list marker ("-", "*", "•", "1.", "2)"). Hyphens inside a question are
    # left alone so words like "well-lit" are not torn apart.
    if not qs:
        parts = re.split(r"[\n;•]+", raw)
        cleaned = (
            re.sub(r"^\s*(?:[-*•]+|\d+[.):])\s*", "", part).strip()
            for part in parts
        )
        qs = [text for text in cleaned if len(text) > 5]

    # Final safety: if we *still* have nothing, use the defaults
    if not qs:
        qs = default_qs

    return qs[:n]

def generate_image_tags(image: Image.Image) -> dict:
    """
    Generate structured tags for an image.

    Returns a dict like:
    {
      "caption": "...",
      "objects": [...],
      "scene": [...],
      "activities": [...],
      "risk_tags": [...]
    }

    The caption comes from Moondream. When an OpenAI client is configured,
    the caption is sent to it to derive the tag lists and the result is
    appended to TAG_LOG; otherwise a caption-only structure is returned
    (and nothing is logged).

    Args:
        image: Image to tag. `None` yields {"error": "No image provided."}.

    Returns:
        The tag dict. It may also carry "note", "raw" or "error" keys that
        describe a fallback path.
    """
    if image is None:
        return {"error": "No image provided."}

    base = exif_rgb(image, max_side=1024)

    # 1) Base caption from Moondream
    try:
        cap = MODEL.caption(base)["caption"].strip()
    except Exception as e:
        cap = f"(caption error: {e})"

    # 2) If no OpenAI client, return a simple structure
    if OA_CLIENT is None:
        return {
            "caption": cap,
            "objects": [],
            "scene": [],
            "activities": [],
            "risk_tags": [],
            "note": "OPENAI_API_KEY not set; returning caption-only tags.",
        }

    # 3) Use GPT-4.1-mini to turn the caption into structured tags
    system_prompt = """
You turn image captions into structured tags for downstream search and analysis.

You ONLY see a natural-language caption (no raw pixels). From that caption,
infer high-level tags. Be conservative and avoid hallucinations.

Return WELL-FORMED JSON with exactly these keys:
- "caption": string (the original or slightly cleaned-up caption)
- "objects": list of short strings (e.g., ["person", "ladder", "monitor"])
- "scene": list of short strings describing the setting or environment
  (e.g., ["lab", "office", "outdoor", "desert", "warehouse"])
- "activities": list of short strings for any actions or processes
  (e.g., ["maintaining equipment", "assembling hardware"])
- "risk_tags": list of short strings for any safety/logistics/terrain risks
  (e.g., ["trip hazard", "tight workspace", "steep slope", "food safety"])

If the caption does not support something, keep the corresponding list empty.

You MUST respond with **only** JSON, no backticks, no commentary.
""".strip()

    user_prompt = json.dumps({"caption": cap}, ensure_ascii=False)

    try:
        resp = OA_CLIENT.chat.completions.create(
            model="gpt-4.1-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            temperature=0.1,
        )
        raw = (resp.choices[0].message.content or "").strip()
        try:
            data = json.loads(raw)
        except ValueError:
            data = None
        # Valid JSON that is not an object (a list, string, number...) is
        # treated like a parse failure; otherwise the dict operations below
        # would raise AttributeError.
        if not isinstance(data, dict):
            data = {
                "caption": cap,
                "objects": [],
                "scene": [],
                "activities": [],
                "risk_tags": [],
                "raw": raw,
                "note": "Failed to parse JSON; raw model output attached.",
            }
    except Exception as e:
        data = {
            "caption": cap,
            "objects": [],
            "scene": [],
            "activities": [],
            "risk_tags": [],
            "error": f"Tagging error: {e}",
        }

    # Ensure the keys exist
    data.setdefault("caption", cap)
    for key in ["objects", "scene", "activities", "risk_tags"]:
        if not isinstance(data.get(key), list):
            data[key] = []

    # Log it for the session. The timestamp keeps the previous
    # "<naive ISO UTC>Z" format without using the deprecated utcnow().
    now_utc = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    TAG_LOG.append(
        {
            "timestamp": now_utc.isoformat() + "Z",
            "source": "moondream-playground",
            "caption": data.get("caption", cap),
            "tags": data,
        }
    )

    return data

def run_vision_agent(
    image: Image.Image,
    question: str,
    detailed: bool = True,
) -> str:
    """
    A NASA-flavored, semi-agentic vision reasoning pipeline that works for
    *any* image type (terrain, equipment, people, food, documents, etc.).

    Pipeline:
      0) Caption with Moondream
      1) Planner (GPT-4.1-mini) – decides WHAT to detect / inspect
      2) Optional Moondream detect + region recaption
      3) Final NASA-style reasoning (GPT-4.1-mini) – uses caption, detections,
         region captions, and world knowledge
      4) Confidence pass – small JSON pass to assign a 0–100 confidence score

    Args:
        image: Image to analyze; `None` produces a "please upload" panel.
        question: User question; blank input produces a "please enter" panel.
        detailed: Selects the longer (True) or tighter (False) style hint for
                  the final reasoning prompt.

    Returns:
        An HTML string formatted with _panel_sections(...) so it looks like
        the Moondream playground results box. Failures of individual steps
        are reported inside the panel text rather than raised.
    """
    t_total = time.time()

    # ---------- Guard rails / early exits ----------
    if OA_CLIENT is None:
        sections = {
            "RESULT": (
                "⚠️ OpenAI API key not configured.\n\n"
                "Set OPENAI_API_KEY in your environment to enable the vision reasoning agent."
            )
        }
        meta = {"ms": 0, "source": "openai-missing"}
        return _panel_sections("agent", sections, meta)

    if image is None:
        sections = {"RESULT": "Please upload an image first."}
        meta = {"ms": 0, "source": SOURCE}
        return _panel_sections("agent", sections, meta)

    if not (question or "").strip():
        sections = {"RESULT": "Please enter a question about the image."}
        meta = {"ms": 0, "source": SOURCE}
        return _panel_sections("agent", sections, meta)

    user_q = question.strip()
    base = exif_rgb(image, max_side=1024)

    # --------------------------
    # STEP 0 — BASE CAPTION
    # --------------------------
    try:
        caption = MODEL.caption(base)["caption"].strip()
    except Exception as e:
        caption = f"(caption error: {e})"

    # --------------------------
    # STEP 1 — UNIVERSAL PLANNER
    # --------------------------
    planner_prompt_system = """
You are a planning module for a NASA-focused, general-purpose vision agent.

You NEVER see the image itself. You ONLY see:
- A natural-language CAPTION of the image.
- A USER QUESTION about the image.

You are helping NASA engineers, scientists, and analysts answer questions about:
- Terrain, natural hazards, weather / environmental risk
- Infrastructure, equipment, vehicles, lab setups
- Human activity, PPE / safety compliance, accessibility
- Food / supplies, logistics, or other everyday scenes

Tools available to downstream steps (you just PLAN; you do NOT run them):
- detect(image, label): detects instances of a given object noun
  (e.g., "person", "helmet", "ladder", "boulder", "vehicle").
- region_caption(image_region): caption a cropped region of the image.

Your job:
  • Understand what the user question is really trying to assess
    (e.g., safety, accessibility, hazards, logistics, calibration, etc.).
  • Decide whether object detection would add useful signal.
  • If so, choose up to 6 SHORT NOUN labels (no adjectives) relevant to the question.
  • Optionally request region-level captions if zooming in will help.
  • If detection is unnecessary, return an empty list.

Examples of good labels:
  ["person", "ladder", "helmet"]
  ["vehicle", "road", "barrier"]
  ["rock", "slope", "water"]

BAD labels (not allowed):
  ["large person", "red car", "blurry plate"]  # adjectives → forbidden

Your output MUST be ONLY valid JSON with this shape:

{
  "intent": "short summary of what the question is trying to understand",
  "profile": "safety" | "accessibility" | "terrain" | "equipment" | "logistics" | "general",
  "detect_labels": ["noun1", "noun2", ...],
  "use_region_captions": true or false,
  "notes": "1-3 short sentences explaining why you chose these labels/tools"
}
""".strip()

    planner_prompt_user = f"""
CAPTION: {caption}

USER QUESTION: {user_q}

Decide what (if anything) should be detected and whether region-level captions would help.
Return ONLY JSON, no extra text.
""".strip()

    try:
        plan_resp = OA_CLIENT.chat.completions.create(
            model="gpt-4.1-mini",
            messages=[
                {"role": "system", "content": planner_prompt_system},
                {"role": "user", "content": planner_prompt_user},
            ],
            temperature=0.2,
        )
        plan_raw = (plan_resp.choices[0].message.content or "").strip()
        try:
            plan = json.loads(plan_raw)
        except ValueError:
            plan = {}
    except Exception:
        # Planner unavailable: continue with an empty plan (no detection).
        plan = {}

    # Valid JSON that is not an object (list, string, ...) has no .get();
    # treat it as an empty plan instead of crashing.
    if not isinstance(plan, dict):
        plan = {}

    detect_labels = plan.get("detect_labels", []) or []
    if not isinstance(detect_labels, list):
        detect_labels = []
    detect_labels = [str(x).strip() for x in detect_labels if str(x).strip()]

    intent = plan.get("intent", "") or ""
    # Coerce to str so an unexpected (unhashable) value cannot break the
    # profile_hint lookup below.
    profile = str(plan.get("profile", "") or "general")
    use_region_captions = bool(plan.get("use_region_captions", False))
    notes = plan.get("notes", "") or ""

    # --------------------------
    # STEP 2 — RUN DETECTION (OPTIONAL)
    # --------------------------
    detections = []
    region_caps = []  # per-object zoom-in captions

    if detect_labels:
        for lb in detect_labels:
            try:
                res = MODEL.detect(base, lb)
                objs = res.get("objects", [])
                detections.append(
                    {
                        "label": lb,
                        "count": len(objs),
                    }
                )

                # Optional region-level captions for first few objects
                if use_region_captions and objs:
                    # At most 3 region captions per label, to bound latency
                    for o in objs[:3]:
                        try:
                            crop = crop_from_norm_box(base, o)
                            rc = MODEL.caption(crop)["caption"].strip()
                        except Exception as e:
                            rc = f"(region caption error: {e})"
                        region_caps.append(
                            {
                                "label": lb,
                                "box": o,
                                "caption": rc,
                            }
                        )
            except Exception as e:
                detections.append(
                    {
                        "label": lb,
                        "error": str(e),
                    }
                )

    # --------------------------
    # STEP 3 — FINAL REASONING
    # --------------------------
    style_hint = (
        "Give detailed, step-by-step reasoning (4–7 bullets) and a rich answer in 2–4 sentences."
        if detailed
        else "Keep the reasoning tight (3 bullets) and the final answer in 1–2 sentences."
    )

    profile_hint = {
        "safety": "Focus on physical safety, hazards, PPE usage, and risk to people or hardware.",
        "accessibility": "Focus on how easy or hard it is for people or vehicles to access and move through this environment.",
        "terrain": "Focus on terrain, slope, obstacles, and environmental stability or hazards.",
        "equipment": "Focus on equipment condition, proper usage, and potential failure or maintenance issues.",
        "logistics": "Focus on supplies, organization, throughput, and practical logistics.",
        "general": "Give a balanced, NASA-context-friendly analysis.",
    }.get(profile, "Give a balanced, NASA-context-friendly analysis.")

    reasoning_system = f"""
You are a vision reasoning assistant supporting NASA engineers, scientists, and operators.

You DO NOT see the image. You ONLY know:
- A natural-language overall caption of the image.
- Optional detection outputs (object counts).
- Optional region-level captions for detected areas.
- General world knowledge (e.g., hazards, safety, accessibility, physics, equipment usage).
- The user question.

Your responsibilities:
  • Provide careful, honest visual reasoning focused on NASA-style concerns.
  • Use the caption, detections, and region captions faithfully.
  • NEVER hallucinate objects not supported by caption/detections/regions.
  • If the question asks about food, calories, safety, hazards, accessibility, equipment, etc.,
    use general knowledge to approximate answers while being transparent about uncertainty.
  • If the image content does NOT actually support the question, clearly say so and explain why.

Special focus profile:
  {profile_hint}

{style_hint}

Format your response EXACTLY like this:

Reasoning:
- bullet 1
- bullet 2
- bullet 3
(3–7 bullets total, depending on the style hint)

Answer:
1–3 sentences directly answering the question using the available evidence.
If giving any numerical estimate (e.g., calories, risk level), clearly mark it as approximate.
""".strip()

    vision_context = {
        "caption": caption,
        "intent": intent,
        "profile": profile,
        "planner_notes": notes,
        "detect_labels": detect_labels,
        "detections": detections,
        "region_captions": region_caps,
    }

    reasoning_user = (
        "vision_context = "
        + json.dumps(vision_context, ensure_ascii=False)
        + "\n\nUSER QUESTION:\n"
        + user_q
    )

    try:
        final_resp = OA_CLIENT.chat.completions.create(
            model="gpt-4.1-mini",
            messages=[
                {"role": "system", "content": reasoning_system},
                {"role": "user", "content": reasoning_user},
            ],
            temperature=0.3,
        )
        answer = (final_resp.choices[0].message.content or "").strip()
    except Exception as e:
        answer = f"(reasoning error: {e})"

    # --------------------------
    # STEP 4 — CONFIDENCE PASS
    # --------------------------
    # Best-effort: any failure here simply omits the CONFIDENCE section.
    confidence_label = ""
    try:
        conf_system = """
You assign confidence scores to vision reasoning answers.

You are given:
- The original user question.
- A structured summary of the tools used.
- The final reasoning + answer text.

Your job:
  • Assign an overall confidence score from 0 to 100.
  • Provide a short label ("low", "medium", "high").
  • Provide one short sentence explaining why.

Return ONLY JSON like:
{"confidence": 78, "label": "medium", "why": "Short reason..."}
""".strip()

        tools_summary = {
            "profile": profile,
            "detect_labels": detect_labels,
            "detections": detections,
            "region_captions_used": bool(region_caps),
        }

        conf_user = json.dumps(
            {
                "question": user_q,
                "tools": tools_summary,
                "answer": answer,
            },
            ensure_ascii=False,
        )

        conf_resp = OA_CLIENT.chat.completions.create(
            model="gpt-4.1-mini",
            messages=[
                {"role": "system", "content": conf_system},
                {"role": "user", "content": conf_user},
            ],
            temperature=0.0,
        )
        conf_raw = (conf_resp.choices[0].message.content or "").strip()
        try:
            conf_data = json.loads(conf_raw)
        except ValueError:
            conf_data = {}
        if not isinstance(conf_data, dict):
            conf_data = {}

        c_val = conf_data.get("confidence", None)
        c_label = (conf_data.get("label") or "").lower()
        c_why = conf_data.get("why") or ""

        if isinstance(c_val, (int, float)):
            # Clamp the score into the 0..100 range
            c_val = max(0, min(int(c_val), 100))
            confidence_label = f"{c_val}/100 ({c_label or 'confidence'})"
            if c_why:
                confidence_label += f" – {c_why}"
        elif c_label or c_why:
            confidence_label = (c_label or "uncertain").capitalize()
            if c_why:
                confidence_label += f" – {c_why}"
    except Exception:
        confidence_label = ""

    # --------------------------
    # FORMAT AS PANEL (Moondream-style)
    # --------------------------
    tools_lines = [
        f"Planner intent: {intent or '(none)'}",
        f"Profile: {profile}",
        f"Detect labels: {', '.join(detect_labels) if detect_labels else '(none)'}",
        f"Planner notes: {notes or '(none)'}",
        f"Region captions: {'yes' if region_caps else 'no'}",
    ]
    tools_text = "\n".join(tools_lines)

    sections = {
        "PROMPT": user_q,
        "TOOLS": tools_text,
        "RESULT": answer,
    }
    if confidence_label:
        sections["CONFIDENCE"] = confidence_label

    meta = {
        "ms": int((time.time() - t_total) * 1000),
        "source": SOURCE,
    }

    # Mode label "agent" will show as a chip in the UI
    return _panel_sections("agent", sections, meta)



In [None]:
# %%
# %% [3] Core run

def run(mode, img, question, reasoning, cap_len, labels):
    """
    Core handler for the Moondream playground modes.

    Args:
        mode: One of "Query", "Caption", "Point", or "Detect". Any value other
              than the first three is handled as Detect.
        img: PIL image from the Gradio input. `None` hides all outputs.
        question: Textbox content; used as the question in Query mode.
        reasoning: If True and mode == "Query", request additional reasoning from Moondream.
        cap_len: Caption length selector ("short", "normal", "long") for Caption mode.
        labels: Label string used for Point/Detect modes. Detect accepts a
                comma-separated list; a blank value falls back to "object".

    Returns:
        A triple of Gradio updates:
            (updated_image, html_panel, code_html)
        which are wired to:
            - `out_img`
            - `out_panel`
            - `code_html` (currently empty, because make_code_details
              returns "")

        Any exception raised while handling a mode is caught and rendered as
        an ERROR panel next to the original image.
    """
    def _updates(img_value=None, panel_html="", code_html="", show=False):
        # One visibility flag drives all three output components.
        return (
            gr.update(value=img_value, visible=show),
            gr.update(value=panel_html, visible=show),
            gr.update(value=code_html, visible=show),
        )

    if img is None:
        return _updates(show=False)

    try:
        base = exif_rgb(img, max_side=1024)
        W, H = base.size

        # --- QUERY ---
        if mode == "Query":
            q = (question or "").strip()
            if not q:
                # No question yet: show canned suggestions instead of calling the model.
                data = {
                    "mode": "query",
                    "text": "",
                    "suggestions": [
                        "What objects are most prominent?",
                        "What is the likely setting or environment?",
                        "Are any brand names or labels visible?",
                    ],
                    "meta": {"source": SOURCE, "ms": 0},
                }
                right = _panel_sections(
                    "query",
                    {"RESULT": "Suggestions: " + " • ".join(data["suggestions"])},
                    data["meta"],
                )
                return _updates(
                    img_value=base,
                    panel_html=right,
                    code_html=make_code_details(jdump(data)),
                    show=True,
                )

            sections = {"PROMPT": q}
            reason_ms = 0

            if reasoning:
                t_r = time.time()
                rationale = MODEL.query(
                    base,
                    "Briefly explain the reasoning you used to answer the question. "
                    "Keep it concise and factual.\n\nQuestion: " + q,
                )["answer"]
                sections["REASONING"] = rationale
                reason_ms = int((time.time() - t_r) * 1000)

            t0 = time.time()

            # Try to ground the model with a quick caption, then ask the steered question.
            ans = ""
            try:
                cap = MODEL.caption(base)["caption"].strip()
                if cap:
                    primed_q = f"Image summary: {cap}\n\n{_steer(q)}"
                    ans = MODEL.query(base, primed_q)["answer"].strip()
            except Exception:
                # Deliberate best-effort: the direct query below is the fallback.
                pass

            # Fallback: ask the steered question directly.
            if not ans:
                ans = MODEL.query(base, _steer(q))["answer"].strip()

            # Retry once if the reply is low-information (e.g., "This").
            if _is_low_info(ans):
                ans = MODEL.query(
                    base,
                    _steer(q) + "\n\nBe specific and name the object/place. Avoid pronouns.",
                )["answer"].strip()

            meta = {
                "ms": int((time.time() - t0) * 1000) + reason_ms,
                "source": SOURCE,
            }
            sections["RESULT"] = ans.strip()

            payload = {
                "mode": "query",
                "prompt": q,
                "reasoning": sections.get("REASONING", ""),
                "result": ans,
                "meta": meta,
            }
            right = _panel_sections("query", sections, meta)
            return _updates(
                img_value=base,
                panel_html=right,
                code_html=make_code_details(jdump(payload)),
                show=True,
            )

        # --- CAPTION ---
        if mode == "Caption":
            caption_text, meta = caption_with_length(base, cap_len)
            sections = {"RESULT": caption_text}
            payload = {
                "mode": "caption",
                "text": caption_text,
                "meta": meta,
                "length": cap_len,
            }
            right = _panel_sections(
                "caption", sections, meta,
                extra_chip=f"LENGTH: {cap_len.upper()}",
            )
            return _updates(
                img_value=base,
                panel_html=right,
                code_html=make_code_details(jdump(payload)),
                show=True,
            )

        # --- POINT ---
        if mode == "Point":
            # Strip first, then default: a whitespace-only label must fall
            # back to "object" rather than become an empty label.
            label = (labels or "").strip() or "object"
            t0 = time.time()
            pts_norm = MODEL.point(base, label)["points"]  # normalized
            meta = {
                "ms": int((time.time() - t0) * 1000),
                "source": SOURCE,
                "points": len(pts_norm),
            }
            pts_px = [{"xy": to_px_point(p, W, H)} for p in pts_norm]
            overlay = draw_overlay(base, detections=[], points=pts_px)
            summary = f"Found {len(pts_norm)} point(s) for '{label}'."
            sections = {"RESULT": summary}
            payload = {"mode": "point", "points": pts_norm, "meta": meta}
            right = _panel_sections("point", sections, meta)
            return _updates(
                img_value=overlay,
                panel_html=right,
                code_html=make_code_details(jdump(payload)),
                show=True,
            )

        # --- DETECT ---
        # Comma-separated labels; blank input falls back to a single "object".
        lbs = [t.strip() for t in (labels or "").split(",") if t.strip()] or ["object"]
        t0 = time.time()
        all_norm = []
        for lb in lbs:
            res = MODEL.detect(base, lb)
            for o in res["objects"]:
                all_norm.append({"label": lb, **o})

        meta = {
            "ms": int((time.time() - t0) * 1000),
            "source": SOURCE,
            "detections": len(all_norm),
        }
        all_px = [{"label": d["label"], "box": to_px_box(d, W, H)} for d in all_norm]
        overlay = draw_overlay(base, detections=all_px, points=[])
        summary = f"Detections for: {', '.join(lbs)} ({len(all_norm)} total)."
        sections = {"RESULT": summary}
        payload = {"mode": "detect", "detections": all_norm, "meta": meta}
        right = _panel_sections("detect", sections, meta)
        return _updates(
            img_value=overlay,
            panel_html=right,
            code_html=make_code_details(jdump(payload)),
            show=True,
        )

    except Exception as e:
        # Top-level UI boundary: surface the error in the panel instead of raising.
        payload = {"error": str(e)}
        right = (
            "<div class='panel'><div class='chips'>"
            "<span class='chip'>ERROR</span></div>"
            f"<div class='body'>Runtime error: {html.escape(str(e))}</div></div>"
        )
        return _updates(
            img_value=img,
            panel_html=right,
            code_html=make_code_details(jdump(payload)),
            show=True,
        )


In [None]:
# Stylesheet for the whole app; it is handed to Gradio through
# `gr.Blocks(css=custom_css)`.  The text is a raw string so the CSS reaches
# the browser exactly as written here.
#
# Maintenance notes:
# - Rules are grouped by the banner comments inside the stylesheet.
# - Each selector under `.agent-col` is declared exactly once.  Earlier
#   revisions repeated `.agent-col .prompt-footer`, `.agent-col .prompt-hint`
#   and the `.agent-col` arrow rule; the later copy silently won the cascade,
#   so edits to the earlier copy had no effect.  Only the winning declarations
#   are kept.
# - Keep the stylesheet ASCII-only so it survives any re-encoding of the file.
custom_css = r"""
/* =========================================================
   GLOBAL BACKGROUND / RESET
   ========================================================= */

html, body, .gradio-container {
  height: 100%;
  margin: 0;
}

.gradio-container {
  min-height: 100vh;
  max-width: none !important;
  margin: 0 !important;
  padding: 0 !important;
  box-sizing: border-box;
  background:
    radial-gradient(circle at top left, rgba(239, 68, 68, 0.3), transparent 55%),
    radial-gradient(circle at bottom right, rgba(251, 146, 60, 0.28), transparent 55%),
    linear-gradient(135deg, #fef2f2, #fee2e2, #fecaca);
  font-family: system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI",
               Roboto, "Helvetica Neue", Arial, sans-serif;
  position: relative;
  overflow: hidden;
}

.gradio-container::before {
  content: "";
  position: absolute;
  top: -50%;
  left: -50%;
  width: 200%;
  height: 200%;
  background:
    radial-gradient(circle at 20% 50%, rgba(220, 38, 38, 0.15) 0%, transparent 50%),
    radial-gradient(circle at 80% 80%, rgba(239, 68, 68, 0.12) 0%, transparent 50%),
    radial-gradient(circle at 40% 20%, rgba(251, 113, 133, 0.1) 0%, transparent 50%);
  animation: gradientShift 15s ease infinite;
  pointer-events: none;
  z-index: 0;
}

.gradio-container > * {
  position: relative;
  z-index: 1;
}

@keyframes gradientShift {
  0%, 100% {
    transform: translate(0, 0) rotate(0deg);
    opacity: 1;
  }
  33% {
    transform: translate(-5%, 5%) rotate(1deg);
    opacity: 0.9;
  }
  66% {
    transform: translate(5%, -5%) rotate(-1deg);
    opacity: 0.95;
  }
}

/* =========================================================
   OUTER SHELL & MAIN CARD (CENTERED RECTANGLE)
   ========================================================= */

.app-shell {
  width: 100%;
  max-width: 1440px;
  margin: 0 auto;
  padding: 40px 32px;
  box-sizing: border-box;
  min-height: calc(100vh - 80px);
  display: flex !important;
  align-items: center;
  justify-content: center;
}

@media (max-width: 900px) {
  .app-shell {
    align-items: flex-start;
    padding: 24px 16px;
  }
}

.app-shell > .gr-column {
  flex: 0 0 auto !important;
  height: auto !important;
  min-height: 0 !important;
}

/* White rectangle card */
.page-card {
  background: #ffffff;
  border-radius: 24px;
  box-shadow: 0 22px 60px rgba(15,23,42,.20);
  padding: 48px 64px 56px;
  max-width: 1600px;
  width: 95%;
  margin: 0 auto;
  height: auto !important;
  min-height: 0 !important;
  max-height: fit-content !important;
}

/* Stop Gradio from forcing full-height on inner elements */
.page-card > .gr-column,
.page-card > .gr-group,
.page-card > .gr-row,
.page-card .gr-block,
.page-card .gr-box,
.page-card .gr-panel {
  height: auto !important;
  min-height: 0 !important;
  flex: 0 0 auto !important;
  flex-grow: 0 !important;
}

/* =========================================================
   HERO HEADER
   ========================================================= */

.hero-chip {
  display: inline-flex;
  align-items: center;
  justify-content: center;
  padding: 4px 14px;
  border-radius: 999px;
  background: #eef2ff;
  color: #6366f1;
  font-size: 12px;
  font-weight: 600;
  letter-spacing: .08em;
  text-transform: uppercase;
  margin: 0 auto 18px;
  width: fit-content;
}

.hero-title {
  font-size: 48px;
  line-height: 1.05;
  font-weight: 800;
  color: #0f172a;
  margin: 0 0 14px;
  text-align: center;
}

.hero-subtitle {
  margin: 0 0 36px;
  font-size: 16px;
  line-height: 1.7;
  color: #4b5563;
  text-align: center;
  max-width: 900px;
  margin-left: auto;
  margin-right: auto;
}

/* =========================================================
   3-STEP ROW
   ========================================================= */

.hero-steps {
  display: grid;
  grid-template-columns: repeat(3, minmax(0, 1fr));
  gap: 24px;
  margin-bottom: 32px;
}

.hero-step {
  display: flex;
  gap: 12px;
  align-items: flex-start;
  background: #f8fafc;
  border-radius: 16px;
  border: 1px solid #e2e8f0;
  padding: 18px 20px;
}

@media (max-width: 900px) {
  .hero-steps {
    grid-template-columns: 1fr;
  }
}

.hero-step-num {
  flex-shrink: 0;
  width: 26px;
  height: 26px;
  border-radius: 999px;
  background: #2563eb;
  color: #f9fafb;
  display: flex;
  align-items: center;
  justify-content: center;
  font-size: 13px;
  font-weight: 600;
  box-shadow: 0 8px 18px rgba(37,99,235,.35);
}

.hero-step-body h3 {
  margin: 0 0 4px;
  font-size: 14px;
  font-weight: 700;
  color: #0f172a;
}

.hero-step-body p {
  margin: 0;
  font-size: 13px;
  color: #6b7280;
  line-height: 1.45;
}

/* =========================================================
   HOME BUTTON ROW
   ========================================================= */

.hero-buttons {
  display: flex !important;
  justify-content: center;
  align-items: center;
  flex-wrap: wrap;
  gap: 14px;
  margin-top: 10px;
}

.hero-buttons > * {
  flex: 0 0 auto !important;
  width: auto !important;
}

.hero-btn {
  border-radius: 999px !important;
  padding: 11px 22px !important;
  font-size: 14px !important;
  font-weight: 600 !important;
  border: 0 !important;
  min-width: 0 !important;
  cursor: pointer !important;
  box-shadow: 0 12px 26px rgba(15,23,42,.18) !important;
}

.hero-btn--primary {
  background: #2563eb !important;
  color: #f9fafb !important;
}

.hero-btn--primary:hover {
  filter: brightness(1.02);
}

.hero-btn--ghost {
  background: #ecfdf5 !important;
  color: #0f172a !important;
  box-shadow: 0 8px 20px rgba(16,185,129,.16) !important;
}

.hero-btn--ghost:hover {
  filter: brightness(1.01);
}

/* =========================================================
   PLAYGROUND CARD WRAPPER
   ========================================================= */

.play-card {
  background: #ffffff;
  border-radius: 14px;
  box-shadow: 0 10px 24px rgba(2,6,23,.06);
  padding: 16px;
}

.page-card .play-card > .gr-row,
.page-card .play-card > .gr-column {
  width: 100%;
}

/* =========================================================
   PROMPT SURFACE
   ========================================================= */

:root {
  --arrow-size: 40px;
  --arrow-gap: 18px;
}

.prompt-surface {
  display: flex !important;
  flex-direction: column;
  border: 1px solid #e5e7eb;
  border-radius: 12px;
  background: #ffffff;
  box-shadow: 0 1px 2px rgba(0,0,0,.04);
  position: relative;
}

.prompt-input,
.prompt-input > div,
.prompt-input .wrap {
  margin: 0 !important;
}

.prompt-input textarea {
  border: 0 !important;
  box-shadow: none !important;
  background: #ffffff !important;
  padding: 14px 16px 30px 16px !important;
  resize: vertical;
  min-height: 160px;
  border-top-left-radius: 12px;
  border-top-right-radius: 12px;
}

/* Small helper text under the agent textbox */
.prompt-hint {
  margin: 4px 12px 6px;
  font-size: 12px;
  line-height: 1.5;
  color: #6b7280;
}

.prompt-hint p {
  margin: 0;
}

/* =========================================================
   PROMPT FOOTER
   ========================================================= */

.prompt-footer {
  display: flex !important;
  align-items: center;
  flex-wrap: nowrap;
  gap: 16px;
  padding: 6px 12px;
  border-top: 0 !important;
  border-bottom-left-radius: 12px;
  border-bottom-right-radius: 12px;
  background: transparent !important;
  position: relative;
  padding-right: calc(var(--arrow-size) + var(--arrow-gap) + 6px);
  overflow: hidden;
  margin-top: 4px;
}

.prompt-footer > div {
  background: transparent !important;
  border: 0 !important;
  box-shadow: none !important;
}

.prompt-footer .controls {
  display: inline-flex !important;
  align-items: center;
  gap: 12px;
  flex: 0 1 auto;
  white-space: nowrap;
  min-width: 0;
}

/* Mode radio as pills */
.prompt-footer .mode-strip {
  display: flex !important;
  align-items: center;
  gap: 8px;
  white-space: nowrap;
  min-width: 0;
  flex: 0 0 auto;
}

.mode-strip input[type="radio"] {
  position: absolute;
  opacity: 0;
  pointer-events: none;
  width: 0;
  height: 0;
}

.mode-strip label {
  background: transparent;
  border: 1px solid transparent;
  border-radius: 10px;
  padding: 4px 10px;
  cursor: pointer;
  user-select: none;
  font-weight: 500;
  white-space: nowrap;
}

.mode-strip input[type="radio"]:checked + label {
  background: #ffffff;
  border-color: #d1d5db;
}

.mode-strip label:hover {
  filter: brightness(0.98);
}

.mode-strip label:focus-visible {
  outline: 2px solid #94a3b8;
  outline-offset: 2px;
}

/* Reasoning toggle */
.switch {
  display: inline-flex !important;
  align-items: center;
  gap: 8px;
  white-space: nowrap;
  flex: 0 0 auto;
  margin: 0;
}

.switch label {
  display: flex;
  align-items: center;
  gap: 8px;
  cursor: pointer;
  font-weight: 500;
  margin: 0;
}

.switch input[type="checkbox"] {
  appearance: none;
  width: 36px;
  height: 20px;
  border-radius: 999px;
  background: #e5e7eb;
  border: 1px solid #d1d5db;
  position: relative;
  outline: none;
  cursor: pointer;
}

.switch input[type="checkbox"]::after {
  content: "";
  position: absolute;
  top: 1px;
  left: 1px;
  width: 16px;
  height: 16px;
  border-radius: 50%;
  background: #ffffff;
  box-shadow: 0 1px 2px rgba(0,0,0,.2);
  transition: left .15s ease;
}

.switch input[type="checkbox"]:checked {
  background: #111827;
}

.switch input[type="checkbox"]:checked::after {
  left: calc(100% - 17px);
}

/* Arrow submit button */
.prompt-footer .gr-button,
.prompt-footer button {
  min-width: 0 !important;
  padding: 0 !important;
  box-shadow: none !important;
  line-height: 1 !important;
}

.submit-arrow {
  width: var(--arrow-size) !important;
  height: var(--arrow-size) !important;
  border-radius: 9999px !important;
  border: 1px solid #d1d5db !important;
  background: #ffffff !important;
  display: inline-flex !important;
  align-items: center !important;
  justify-content: center !important;
  font-size: 18px !important;
  line-height: 1 !important;
  position: absolute !important;
  right: var(--arrow-gap) !important;
  top: 50% !important;
  transform: translateY(-50%) !important;
  cursor: pointer;
}

.submit-arrow:hover {
  filter: brightness(0.97);
}

.submit-arrow.loading {
  pointer-events: none;
  color: transparent;
}

.submit-arrow.loading::after {
  content: "";
  position: absolute;
  left: 50%;
  top: 50%;
  transform: translate(-50%, -50%);
  width: 12px;
  height: 12px;
  border-radius: 50%;
  border: 2px solid #111827;
  border-right-color: transparent;
  animation: spin .9s linear infinite;
}

@keyframes spin {
  to { transform: translate(-50%, -50%) rotate(360deg); }
}

/* =========================================================
   OUTPUT PANEL
   ========================================================= */

.panel .chips {
  margin-bottom: 8px;
}

.chip {
  display: inline-block;
  font-weight: 600;
  font-size: 12px;
  letter-spacing: .02em;
  padding: 3px 8px;
  border-radius: 4px;
  background: #f3f4f6;
  color: #111827;
  border: 1px solid #d1d5db;
  margin-right: 6px;
}

.chip--muted {
  color: #334155;
  border: 1px solid #94a3b8;
  background: transparent;
}

.panel .section-h {
  font-weight: 700;
  text-transform: uppercase;
  font-size: 12px;
  color: #111827;
  margin: 8px 0 4px;
}

.gradio-container .panel .body,
.gradio-container .panel .body * {
  white-space: pre-line;
  line-height: 1.6;
  color: #111827;
  font-size: 15px !important;
  font-weight: 400 !important;
  font-family: inherit !important;
}

.panel .meta {
  color: #6b7280;
  font-size: 12px;
  margin-top: 12px;
}

/* =========================================================
   SUGGESTION CHIPS
   ========================================================= */

.sugg-row {
  position: relative;
  display: flex !important;
  justify-content: center;
  align-items: center;
  gap: 8px;
  margin: -24px auto 6px;
  width: fit-content;
  background: transparent !important;
  border: 0 !important;
  box-shadow: none !important;
  z-index: 2;
  flex-wrap: wrap;
}

.suggest-pill {
  display: inline-flex !important;
  align-items: center;
  justify-content: center;
  padding: 4px 12px !important;
  border-radius: 9999px !important;
  background: #f5f5f5 !important;
  border: 1px solid #e5e7eb !important;
  box-shadow: none !important;
  color: #4b5563 !important;
  font-size: 13px !important;
  font-weight: 400 !important;
  cursor: pointer !important;
  min-width: 0 !important;
  max-width: 260px;
  white-space: normal !important;
  text-align: center;
}

.suggest-pill:hover {
  background: #e5e7eb !important;
}

.suggest-pill:focus-visible {
  outline: 2px solid #94a3b8 !important;
  outline-offset: 2px !important;
}

.suggest-pill + .suggest-pill {
  margin-left: 8px;
}

.suggest-pill.loading {
  opacity: 0.65 !important;
  pointer-events: none !important;
  font-style: italic !important;
}


/* =========================================================
   SAMPLE IMAGE STRIP
   ========================================================= */

.sample-gallery {
  margin: 2px 0 12px 0;
  padding: 0 !important;
  border: 0 !important;
  box-shadow: none !important;
  background: transparent !important;
  height: auto !important;
  max-height: none !important;
  overflow: visible !important;
}

.sample-gallery > div,
.sample-gallery > div > div {
  padding: 0 !important;
  margin: 0 !important;
  border: 0 !important;
  box-shadow: none !important;
  background: transparent !important;
  height: auto !important;
  max-height: none !important;
  overflow: visible !important;
}

.sample-gallery div[style*="overflow"] {
  height: auto !important;
  max-height: none !important;
  overflow: visible !important;
}

.sample-gallery [data-testid="gallery"] {
  display: flex !important;
  flex-wrap: nowrap !important;
  gap: 10px;
  padding: 0 !important;
  margin: 0 !important;
  height: auto !important;
  max-height: none !important;
  border: 0 !important;
  box-shadow: none !important;
  background: transparent !important;
}

.sample-gallery [data-testid="gallery-item"] {
  flex: 0 0 80px !important;   /* was 120px */
  height: 55px !important;     /* was 80px  */
  display: flex !important;
  align-items: center !important;
  justify-content: center !important;
  border-radius: 10px !important;
  overflow: hidden !important;
  border: 1px solid #e5e7eb !important;
  background: #f9fafb !important;
  padding: 0 !important;
}


.sample-gallery img {
  width: 100% !important;
  height: 100% !important;
  object-fit: cover !important;
  display: block !important;
  cursor: pointer !important;
}

/* =========================================================
   TIGHTEN VERTICAL WHITESPACE INSIDE EACH PAGE CARD
   ========================================================= */

/* Shrink generic top/bottom margins between blocks/rows/columns */
.page-card .gr-block,
.page-card .gr-row,
.page-card .gr-column,
.page-card .gr-box {
  margin-top: 6px !important;
  margin-bottom: 6px !important;
}

/* Don't add extra space at the very top / bottom of the card */
.page-card > .gr-block:first-child,
.page-card > .gr-row:first-child,
.page-card > .gr-column:first-child {
  margin-top: 0 !important;
}
.page-card > .gr-block:last-child,
.page-card > .gr-row:last-child,
.page-card > .gr-column:last-child {
  margin-bottom: 0 !important;
}

.results-heading {
  margin-top: 8px !important;
}

.results-heading h1,
.results-heading h2,
.results-heading p {
  margin-top: 0 !important;
  margin-bottom: 6px !important;
}

/* =========================================================
   FIX WHITESPACE ISSUES
   ========================================================= */

/* Make the play-card row height auto instead of stretching */
.play-card > .gr-row {
  height: auto !important;
  align-items: flex-start !important;
}

/* Prevent columns from stretching to match each other */
.play-card .gr-column {
  align-self: flex-start !important;
}


/* Remove only Gradio's own "Built with Gradio" footer */
.gradio-container footer,
.gradio-container #footer {
  display: none !important;
  visibility: hidden !important;
  opacity: 0 !important;
  height: 0 !important;
  padding: 0 !important;
  margin: 0 !important;
}

/* Vision Agent run button styling */
.agent-run-btn {
  width: 100% !important;
  border-radius: 999px !important;
  padding: 10px 16px !important;
  font-size: 14px !important;
  font-weight: 600 !important;
  background: #111827 !important;
  color: #f9fafb !important;
  border: 0 !important;
  box-shadow: 0 10px 20px rgba(15,23,42,.20) !important;
  cursor: pointer !important;
}

.agent-run-btn:hover {
  filter: brightness(1.03);
}

/* =========================================================
   VISION AGENT ARROW - FORCE SAME STYLE AS MOONDREAM
   ========================================================= */

.agent-arrow {
  width: var(--arrow-size) !important;
  height: var(--arrow-size) !important;
  border-radius: 9999px !important;
  border: 1px solid #d1d5db !important;
  background: #ffffff !important;
  display: inline-flex !important;
  align-items: center !important;
  justify-content: center !important;
  font-size: 18px !important;
  line-height: 1 !important;
  position: absolute !important;
  right: var(--arrow-gap) !important;
  top: 50% !important;
  transform: translateY(-50%) !important;
  cursor: pointer;
}

.agent-arrow:hover {
  filter: brightness(0.97);
}

.agent-arrow.loading {
  pointer-events: none;
  color: transparent;
}

.agent-arrow.loading::after {
  content: "";
  position: absolute;
  left: 50%;
  top: 50%;
  transform: translate(-50%, -50%);
  width: 12px;
  height: 12px;
  border-radius: 50%;
  border: 2px solid #111827;
  border-right-color: transparent;
  animation: spin .9s linear infinite;
}

/* =========================================================
   IMAGE UPLOAD - SHARED FOR AGENT + MOONDREAM
   ========================================================= */

/* Only touch the left image column in the play-card rows */
.image-col [data-testid="image"] {
  width: 100% !important;
  min-height: 320px !important;
  max-height: 320px !important;
  display: flex !important;
  align-items: center !important;
  justify-content: center !important;
  background: #f8fafc;
  border-radius: 12px;
  overflow: hidden;
}

/* Fill by height, so extra space is left/right, not top/bottom */
.image-col [data-testid="image"] img,
.image-col [data-testid="image"] canvas {
  height: 100% !important;
  width: auto !important;
  max-width: none !important;
  object-fit: contain !important;
  display: block !important;
}

/* -----------------------------------------------
   Vision Agent: fix tip + arrow overlap
   ----------------------------------------------- */

/* Let the footer breathe and stop clipping the arrow */
.agent-col .prompt-footer {
  overflow: visible !important;            /* no clipping */
  padding-top: 8px !important;
  padding-bottom: 8px !important;
  min-height: 60px !important;             /* enough room for the circle */
  align-items: center !important;
}

/* Nudge the textarea so there's a tiny gap above the hint */
.agent-col .prompt-input textarea {
  margin-bottom: 2px !important;
}

/* Keep the arrow perfectly centered inside the (now taller) footer */
.agent-col .submit-arrow,
.agent-col .agent-arrow {
  top: 50% !important;
  transform: translateY(-50%) !important;
}

/* -----------------------------------------------
   Vision Agent Tip: collapsible pill
   ----------------------------------------------- */

/* Tight hint spacing so the tip does not spill into the footer */
.agent-col .prompt-hint {
  margin: 4px 0 4px !important;
  padding: 0 4px !important;
}

/* Container */
.agent-col .agent-tip {
  width: 100%;
  font-size: 13px;
}

/* Hide the checkbox */
.agent-col .agent-tip-toggle {
  display: none;
}

/* Header bar */
.agent-col .agent-tip-header {
  display: flex;
  align-items: center;
  gap: 8px;
  padding: 6px 10px;
  border-radius: 999px;
  background: #f3f4ff;
  border: 1px solid #e5e7ff;
  cursor: pointer;
  user-select: none;
  box-shadow: 0 6px 16px rgba(79,70,229,0.12);
}

.agent-col .agent-tip-label {
  font-weight: 600;
  color: #4f46e5;
}

.agent-col .agent-tip-text {
  color: #4b5563;
  white-space: nowrap;
  overflow: hidden;
  text-overflow: ellipsis;
}

.agent-col .agent-tip-chevron {
  margin-left: auto;
  font-size: 12px;
  transition: transform .15s ease;
}

/* Body content */
.agent-col .agent-tip-body {
  margin-top: 6px;
  padding: 8px 12px;
  border-radius: 12px;
  background: #f9fafb;
  border: 1px solid #e5e7eb;
  color: #4b5563;
  line-height: 1.5;
  max-height: 999px;
  opacity: 1;
  overflow: hidden;
  transition: max-height .18s ease, opacity .18s ease, margin-top .18s ease, padding .18s ease;
}

.agent-col .agent-tip-body ul {
  padding-left: 18px;
  margin: 0;
}

/* Collapsed state: when checkbox is NOT checked */
.agent-col .agent-tip-toggle:not(:checked) + .agent-tip-header + .agent-tip-body {
  max-height: 0;
  opacity: 0;
  margin-top: 0;
  padding-top: 0;
  padding-bottom: 0;
  border-width: 0;
}

/* Rotate chevron when collapsed */
.agent-col .agent-tip-toggle:not(:checked) + .agent-tip-header .agent-tip-chevron {
  transform: rotate(-90deg);
}

"""

In [None]:
with gr.Blocks(css=custom_css) as demo:
    # Outer gradient shell
    with gr.Column(elem_classes=["app-shell"]):

        # =========================================================
        # HOME (hero)
        # =========================================================
        with gr.Column(visible=True,
                       elem_classes=["page-card"]) as home_page:

            gr.HTML(
                """
<div style="text-align: center;">
  <div class="hero-chip">NASA AI LAB</div>
</div>
<h1 class="hero-title">ImageTagger Studio</h1>
<p class="hero-subtitle">
Explore NASA imagery with vision models and ChatGPT.
Choose a workspace to start tagging, searching, or reasoning over your images.
</p>

<div class="hero-steps">
  <div class="hero-step">
    <div class="hero-step-num">1</div>
    <div class="hero-step-body">
      <h3>Pick a workspace</h3>
      <p>Start in the vision reasoning agent or jump straight into the Moondream playground.</p>
    </div>
  </div>

  <div class="hero-step">
    <div class="hero-step-num">2</div>
    <div class="hero-step-body">
      <h3>Load your data</h3>
      <p>Upload images or connect to NASA data sources in the future.</p>
    </div>
  </div>

  <div class="hero-step">
    <div class="hero-step-num">3</div>
    <div class="hero-step-body">
      <h3>Explore & iterate</h3>
      <p>Ask questions, tag content, and refine prompts for your workflows.</p>
    </div>
  </div>
</div>
                """
            )

            with gr.Row(elem_classes=["hero-buttons"]):
                home_to_agent = gr.Button(
                    "ü§ñ Vision Reasoning Agent",
                    scale=0,
                    elem_classes=["hero-btn", "hero-btn--primary"],
                )
                home_to_moon = gr.Button(
                    "üñºÔ∏è Open Moondream Playground",
                    scale=0,
                    elem_classes=["hero-btn", "hero-btn--ghost"],
                )

        # =========================================================
        # VISION REASONING AGENT PAGE
        # =========================================================
        with gr.Column(visible=False,
                    elem_classes=["page-card"]) as vision_agent_page:

            with gr.Row():
                back_from_agent = gr.Button("‚Üê Back to home", scale=0)

            gr.Markdown(
                """
            ## Vision Reasoning Agent

            This workspace chains Moondream + GPT-4.1-mini into a semi-agentic pipeline.

            - Uses Moondream to caption the scene and optionally detect key objects / regions.
            - Uses a planner to decide which tools to call (if any).
            - Uses GPT-4.1-mini to produce a NASA-style reasoning trace and final answer.
            - Assigns an overall confidence score based on the tools and reasoning.

            Good questions focus on **safety**, **accessibility**, **equipment**, **terrain**, or **logistics**.
            """
            )


            gr.Markdown("**Sample Images**")
            agent_sample_gallery = gr.Gallery(
                value=SAMPLE_IMAGES,
                show_label=False,
                columns=len(SAMPLE_IMAGES),
                rows=1,
                allow_preview=False,
                elem_classes=["sample-gallery"],
                height=10,
            )

            # ---------- TOP: image + prompt card (same vibe as Moondream) ----------
            with gr.Row(elem_classes=["play-card"]):
                with gr.Column(scale=4, elem_classes=["image-col"]):
                    agent_img = gr.Image(
                        type="pil",
                        sources=["upload"],
                        image_mode="RGB",
                        label=None,
                        show_label=False,
                        interactive=True,
                        height=320,
                        elem_classes=["moondream-img"], 
                    )

                with gr.Column(scale=6, elem_classes=["agent-col"]):
                    with gr.Group(elem_classes=["prompt-surface"]):
                        vision_question = gr.Textbox(
                            label=None,
                            placeholder=(
                                "Ask a higher-level question about this scene.\n\n"
                                "Good questions focus on safety, accessibility, logistics, or risk. For example:\n"
                                "‚Ä¢ Based on this picture, how accessible is this area for people or vehicles?\n"
                                "‚Ä¢ What safety hazards or failure modes should a NASA team watch for here?\n"
                                "‚Ä¢ What logistics or maintenance issues can you infer from this setup, and how confident are you?"
                            ),
                            lines=7,
                            show_label=False,
                            container=False,
                            elem_classes=["prompt-input"],
                        )

                        # footer row with arrow button, same as Moondream
                        with gr.Row(elem_classes=["prompt-footer"]):
                            with gr.Row(elem_classes=["controls"]):
                                agent_detailed = gr.Checkbox(
                                    label="Detailed reasoning",
                                    value=True,
                                    container=False,
                                    show_label=False,
                                    elem_classes=["switch"],
                                    scale=0,
                                )

                            run_agent_btn = gr.Button(
                                "‚ûú",
                                elem_classes=["submit-arrow"],
                                scale=0,
                            )

            # ---------- BOTTOM: results card ----------
            gr.Markdown("## Results", elem_classes=["results-heading"])
            with gr.Row(elem_classes=["play-card"]):
                vision_answer = gr.HTML(
                    value="",
                    visible=False,
                )


        # =========================================================
        # MOONDREAM PLAYGROUND PAGE (existing UI)
        # =========================================================
        with gr.Column(visible=False,
                       elem_classes=["page-card"]) as moondream_page:

            with gr.Row():
                back_from_moon = gr.Button("‚Üê Back to home", scale=0)

            gr.Markdown("## Moondream Playground")

            gr.Markdown("**Sample Images**")
            moon_sample_gallery = gr.Gallery(
                value=SAMPLE_IMAGES,
                show_label=False,
                columns=len(SAMPLE_IMAGES),
                rows=1,
                allow_preview=False,
                elem_classes=["sample-gallery"],
                height=10,
            )

            # ---------- TOP: two columns ----------
            with gr.Row(elem_classes=["play-card"]):
                # Left: image
                with gr.Column(scale=1, elem_classes=["image-col"]):
                    img_in = gr.Image(
                        type="pil",
                        sources=["upload"],
                        image_mode="RGB",
                        label=None,
                        show_label=False,
                        interactive=True,
                        height = 320,
                        elem_classes=["moondream-img"],
                    )

                # Right: prompt card
                with gr.Column(scale=2):
                    with gr.Group(elem_classes=["prompt-surface"]):
                        # text area (shown for Query/Point/Detect)
                        with gr.Group(visible=True) as prompt_wrap:
                            question = gr.Textbox(
                                label=None,
                                placeholder="Enter a question‚Ä¶",
                                lines=7,
                                show_label=False,
                                container=False,
                                elem_classes=["prompt-input"],
                            )

                        # suggestions row (starts hidden)
                        with gr.Row(
                            visible=False, elem_classes=["sugg-row"]
                        ) as sugg_row:
                            sugg_btn1 = gr.Button(
                                "", visible=False, elem_classes=["suggest-pill"]
                            )
                            sugg_btn2 = gr.Button(
                                "", visible=False, elem_classes=["suggest-pill"]
                            )
                            sugg_btn3 = gr.Button(
                                "", visible=False, elem_classes=["suggest-pill"]
                            )

                        # footer ...
                        with gr.Row(elem_classes=["prompt-footer"]):
                            with gr.Row(elem_classes=["controls"]):
                                mode_radio = gr.Radio(
                                    ["Query", "Caption", "Point", "Detect"],
                                    value="Query",
                                    show_label=False,
                                    container=False,
                                    interactive=True,
                                    elem_classes=["mode-strip"],
                                    scale=0,
                                )

                                reasoning = gr.Checkbox(
                                    label="Reasoning",
                                    value=False,
                                    container=False,
                                    show_label=False,
                                    elem_classes=["switch"],
                                    visible=True,   # Query is default
                                    scale=0,
                                )

                            submit_btn = gr.Button(
                                "‚ûú",
                                elem_classes=["submit-arrow"],
                                scale=0,
                            )

                    # caption length row
                    with gr.Row(visible=False) as opts_caption:
                        cap_len = gr.Radio(
                            [("Short", "short"),
                             ("Normal", "normal"),
                             ("Long", "long")],
                            value="normal",
                            label="Caption length",
                        )

            # ---------- BOTTOM: results ----------
            gr.Markdown("## Results", elem_classes=["results-heading"])
            with gr.Row(elem_classes=["play-card"]):
                with gr.Column(scale=1):
                    out_img = gr.Image(label=None, height=320, visible=False)
                    code_html = gr.HTML(visible=False)
                with gr.Column(scale=1):
                    out_panel = gr.HTML(visible=False)

            # ---------- TAGGING / EXPORT ----------
            gr.Markdown("## Image Tags (optional)", elem_classes=["results-heading"])
            with gr.Row(elem_classes=["play-card"]):
                with gr.Column(scale=1):
                    # Runs _on_generate_tags on the current input image.
                    tag_btn = gr.Button(
                        "Generate tags for this image",
                        elem_id="tag-btn",
                    )
                    # NOTE: despite the label, this button is wired to
                    # _download_tag_log, whose result is written into
                    # `tag_output`; no file component is involved.
                    download_log_btn = gr.Button(
                        "Download tag log (JSON)",
                        elem_id="tag-download-btn",
                    )
                with gr.Column(scale=2):
                    # Shared JSON viewer for both buttons; starts empty and hidden.
                    tag_output = gr.JSON(
                        label="Latest tags",
                        value=None,
                        visible=False,
                    )


            
    # =========================================================
    # LOGIC / CALLBACKS (outside columns but inside Blocks)
    # =========================================================
    def _run_vision_agent(img, q, detailed):
        """Gradio callback: forward the click inputs to `run_vision_agent`.

        `detailed` is passed by keyword; whatever the helper returns is handed
        back unchanged for the `vision_answer` output.
        """
        return run_vision_agent(img, q, detailed=detailed)

    def _show_agent_thinking():
        """Return an update that shows a "thinking" placeholder panel.

        Runs before the agent call so the answer area gives feedback while
        the request is in flight.

        Returns:
            A `gr.update` that sets the placeholder HTML and makes the
            target component visible.
        """
        # Local is named `panel_html` so the imported `html` module is not
        # shadowed. The ellipsis is a Unicode escape (U+2026): the previous
        # literal was a mis-decoded UTF-8 sequence shown verbatim to users.
        panel_html = (
            "<div class='panel'>"
            "<div class='chips'><span class='chip'>AGENT</span></div>"
            "<div class='section-h'>STATUS</div>"
            "<div class='body'>Thinking through the question\u2026</div>"
            "</div>"
        )
        return gr.update(value=panel_html, visible=True)

    def _start_agent_loading():
        """Return an update that adds the "loading" CSS class to the agent button."""
        # keep submit-arrow + agent-arrow AND add loading
        return gr.update(elem_classes=["submit-arrow", "agent-arrow", "loading"])

    def _stop_agent_loading():
        """Return an update that drops the "loading" CSS class from the agent button."""
        # restore both base classes, no loading
        return gr.update(elem_classes=["submit-arrow", "agent-arrow"])

    # Agent run pipeline, executed step by step on click:
    #   1. mark the button as loading
    #   2. show the placeholder panel in `vision_answer`
    #   3. run the agent and write its result to `vision_answer`
    #   4. clear the loading mark
    run_agent_btn.click(
        _start_agent_loading,
        inputs=None,
        outputs=[run_agent_btn],
    ).then(
        _show_agent_thinking,
        inputs=None,
        outputs=[vision_answer],
    ).then(
        _run_vision_agent,
        inputs=[agent_img, vision_question, agent_detailed],
        outputs=[vision_answer],
    ).then(
        _stop_agent_loading,
        inputs=None,
        outputs=[run_agent_btn],
    )


    def _show_thinking(mode):
        """Build the "thinking" placeholder shown while a request runs.

        Args:
            mode: Current value of the mode selector; a falsy value is
                treated as "query". It is upper-cased for the chip label.

        Returns:
            Three updates, in the order the submit chain wires them:
            (out_img, out_panel, code_html). The image and code block are
            cleared and hidden; the panel shows the placeholder HTML.
        """
        mode_label = (mode or "query").upper()
        # Local is named `panel_html` so the imported `html` module is not
        # shadowed. The ellipsis is a Unicode escape (U+2026): the previous
        # literal was a mis-decoded UTF-8 sequence shown verbatim to users.
        panel_html = (
            "<div class='panel'>"
            f"<div class='chips'><span class='chip'>{mode_label}</span></div>"
            "<div class='section-h'>STATUS</div>"
            "<div class='body'>Thinking through the question\u2026</div>"
            "</div>"
        )
        return (
            gr.update(value=None, visible=False),       # out_img
            gr.update(value=panel_html, visible=True),  # out_panel
            gr.update(value="", visible=False),         # code_html
        )


    def _on_mode_change(mode):
        """Adapt the prompt area to the selected mode.

        Args:
            mode: Selected mode; the recognised values are "Query",
                "Point", "Detect" and "Caption".

        Returns:
            Four updates, in wiring order:
            (prompt_wrap, question, opts_caption, reasoning).
        """
        placeholders = {
            # Ellipsis written as an escape (U+2026); the previous literal
            # was a mis-decoded UTF-8 sequence.
            "Query": "Enter a question\u2026",
            "Point": "Type the object to point at (e.g., 'grapes')",
            "Detect": "Comma-separated labels (e.g., 'bottle, grapes')",
            "Caption": "",
        }
        is_caption = mode == "Caption"
        takes_prompt = mode in ("Query", "Point", "Detect")
        return (
            gr.update(visible=takes_prompt),          # prompt_wrap
            gr.update(
                # .get() instead of indexing: a None or unexpected mode now
                # yields an empty placeholder rather than a KeyError.
                placeholder=placeholders.get(mode, ""),
                # NOTE(review): value=None is forwarded as an explicit value,
                # which presumably clears the textbox on every mode switch,
                # not only for "Caption" - confirm that this is intended.
                value="" if is_caption else None,
            ),                                        # question
            gr.update(visible=is_caption),            # opts_caption
            gr.update(visible=(mode == "Query")),     # reasoning
        )

    # Re-layout the prompt area whenever the mode selector changes.
    mode_radio.change(
        _on_mode_change,
        mode_radio,
        [prompt_wrap, question, opts_caption, reasoning],
    )

    def _hide_results_on_new_image(_):
        """Clear and hide every result widget.

        The single argument (the new image value) is ignored. The returned
        updates are ordered (out_img, out_panel, code_html).
        """
        cleared_image = gr.update(value=None, visible=False)
        cleared_panel = gr.update(value="", visible=False)
        cleared_code = gr.update(value="", visible=False)
        return cleared_image, cleared_panel, cleared_code

    # Stale results must not linger next to a newly selected image.
    img_in.change(
        _hide_results_on_new_image,
        img_in,
        [out_img, out_panel, code_html],
    )

    def _start_sugg_loading(img, mode):
        """Put the suggestion row into its "loading" state.

        Args:
            img: Current input image, or None when nothing is loaded.
            mode: Current mode; suggestions are only offered for "Query".

        Returns:
            Four updates ordered (sugg_row, sugg_btn1, sugg_btn2, sugg_btn3).
            Without an image, or outside Query mode, the row and all pills
            are hidden. Otherwise the row is shown with the first pill as a
            disabled loading indicator and the other two hidden.
        """
        def _hidden_pill():
            # A fresh update per output slot, so no single dict object is
            # shared between several outputs.
            return gr.update(value="", visible=False, interactive=False)

        if img is None or mode != "Query":
            return gr.update(visible=False), _hidden_pill(), _hidden_pill(), _hidden_pill()

        loading_pill = gr.update(
            # Ellipsis written as an escape (U+2026); the previous literal
            # was a mis-decoded UTF-8 sequence shown verbatim to users.
            value="Generating suggestions\u2026",
            visible=True,
            interactive=False,
            elem_classes=["suggest-pill", "loading"],
        )
        return gr.update(visible=True), loading_pill, _hidden_pill(), _hidden_pill()

    def _update_suggestions(img, mode):
        """Fill the suggestion pills with questions about the current image.

        Args:
            img: Current input image, or None when nothing is loaded.
            mode: Current mode; suggestions are only offered for "Query".

        Returns:
            Always exactly four updates, ordered
            (sugg_row, sugg_btn1, sugg_btn2, sugg_btn3), matching the wired
            outputs. Without an image, or outside Query mode, everything is
            hidden. Unused pills are hidden and disabled.
        """
        pill_count = 3  # number of suggestion buttons wired as outputs

        if img is None or mode != "Query":
            hidden = [gr.update(value="", visible=False) for _ in range(pill_count)]
            return (gr.update(visible=False), *hidden)

        fallback_qs = [
            "What are the main objects in this image?",
            "What is the setting or environment?",
            "Is there anything unusual or interesting here?",
        ]

        try:
            qs = suggest_questions(exif_rgb(img), n=pill_count)
        except Exception:
            # Suggestions are a best-effort nicety: on any failure fall back
            # to the generic questions instead of surfacing an error.
            qs = fallback_qs

        # Normalise the helper's result before using it:
        #  - anything that is not a list/tuple is treated as "no suggestions"
        #    (previously this raised TypeError outside the try block);
        #  - blank or non-string entries are dropped;
        #  - the list is capped at `pill_count`, because returning more
        #    updates than wired outputs makes the callback fail.
        if not isinstance(qs, (list, tuple)):
            qs = []
        qs = [q for q in qs if isinstance(q, str) and q.strip()][:pill_count]
        if not qs:
            qs = fallback_qs

        updates = [
            gr.update(
                value=q,
                visible=True,
                interactive=True,
                # Resets the classes, dropping "loading" from the first pill.
                elem_classes=["suggest-pill"],
            )
            for q in qs
        ]
        # Pad with hidden pills so the tuple length always matches the outputs.
        updates += [
            gr.update(value="", visible=False, interactive=False)
            for _ in range(pill_count - len(qs))
        ]

        return (gr.update(visible=True), *updates)

    # Refresh the suggestion pills whenever the image or the mode changes:
    # first switch the row to its loading state, then fill in the questions.
    # Both components get the same two-step chain, registered in the same
    # order as before (image first, then mode).
    for _sugg_source in (img_in, mode_radio):
        _sugg_source.change(
            _start_sugg_loading,
            inputs=[img_in, mode_radio],
            outputs=[sugg_row, sugg_btn1, sugg_btn2, sugg_btn3],
        ).then(
            _update_suggestions,
            inputs=[img_in, mode_radio],
            outputs=[sugg_row, sugg_btn1, sugg_btn2, sugg_btn3],
        )

    # --- Sample image handlers ---

    def _load_sample_for_moondream(evt: gr.SelectData):
        """Return the preloaded sample image for the clicked gallery thumbnail.

        `evt.index` is used directly as an index into `SAMPLE_IMAGES`; the
        returned object is the shared, preloaded PIL image (not a copy).
        """
        idx = evt.index  # which thumbnail was clicked
        # return the corresponding PIL image
        return SAMPLE_IMAGES[idx]

    # Clicking a sample thumbnail loads it into the Moondream input image.
    # No explicit inputs: the handler only uses the selection event data.
    moon_sample_gallery.select(
        _load_sample_for_moondream,
        inputs=None,
        outputs=[img_in],
    )

    def _load_sample_for_agent(evt: gr.SelectData):
        """Return the preloaded sample image for the clicked gallery thumbnail.

        Same lookup as `_load_sample_for_moondream`: `evt.index` indexes
        `SAMPLE_IMAGES` and the shared PIL image is returned as-is.
        """
        idx = evt.index
        return SAMPLE_IMAGES[idx]

    # Clicking a sample thumbnail loads it into the agent's input image.
    agent_sample_gallery.select(
        _load_sample_for_agent,
        inputs=None,
        outputs=[agent_img],
    )

    def _use_suggestion(text):
        """Return an update that sets the target component's value to `text`."""
        return gr.update(value=text)

    # Each pill is its own input, so clicking it copies the pill's value
    # (presumably its label text, i.e. the suggested question) into `question`.
    sugg_btn1.click(_use_suggestion, inputs=sugg_btn1, outputs=question)
    sugg_btn2.click(_use_suggestion, inputs=sugg_btn2, outputs=question)
    sugg_btn3.click(_use_suggestion, inputs=sugg_btn3, outputs=question)

    def _run_bridge(mode, img, text, reasoning_val, cap_len):
        """Adapt the UI inputs to the positional signature of `run`.

        Args:
            mode: Selected mode.
            img: Input image.
            text: Contents of the prompt box.
            reasoning_val: State of the reasoning toggle; only honoured in
                "Query" mode and forced to False otherwise.
            cap_len: Selected caption length.

        Returns:
            Whatever `run(mode, img, text, reasoning, cap_len, labels)`
            returns. `labels` is the prompt text (or "" when it is falsy)
            for "Point" and "Detect", and "" for every other mode.
        """
        use_reasoning = reasoning_val if mode == "Query" else False
        labels = (text or "") if mode in ("Point", "Detect") else ""
        return run(mode, img, text, use_reasoning, cap_len, labels)

    def _start_loading():
        """Return an update that adds the "loading" CSS class to the submit button."""
        return gr.update(elem_classes=["submit-arrow", "loading"])

    def _stop_loading():
        """Return an update that restores the submit button's base CSS class."""
        return gr.update(elem_classes=["submit-arrow"])

    # Clicking the arrow and pressing Enter in the prompt box run the same
    # four-step chain, so it is registered once per trigger (button first,
    # then textbox, as before):
    #   loading on -> placeholder panel -> model call -> loading off
    for _run_trigger in (submit_btn.click, question.submit):
        _run_trigger(
            _start_loading,
            inputs=None,
            outputs=[submit_btn],
        ).then(
            _show_thinking,
            inputs=[mode_radio],  # the placeholder chip shows the mode
            outputs=[out_img, out_panel, code_html],
        ).then(
            _run_bridge,
            inputs=[mode_radio, img_in, question, reasoning, cap_len],
            outputs=[out_img, out_panel, code_html],
        ).then(
            _stop_loading,
            inputs=None,
            outputs=[submit_btn],
        )

    # =========================================================
    # TAG GENERATION + EXPORT CALLBACKS
    # =========================================================

    def _on_generate_tags(img):
        """Handle the 'Generate tags for this image' button.

        With an image, the result of `generate_image_tags(img)` is shown;
        without one, a small JSON object carrying a friendly error message
        is shown instead. Either way the returned update makes the target
        component visible.
        """
        if img is not None:
            payload = generate_image_tags(img)
        else:
            payload = {"error": "Upload an image or select a sample image first."}
        return gr.update(value=payload, visible=True)


    def _download_tag_log():
        """Show the session's tag log in the JSON viewer.

        Despite the name, no file is produced: the callback's result is
        written to the JSON output component. A real download would need
        the log written to disk and served through a file component.

        Returns:
            A `gr.update` carrying a copy of `TAG_LOG` (or a one-element
            placeholder list when the log is empty) and `visible=True`.
            The viewer is created hidden, so returning the bare value left
            it invisible when this button was used before any tags had
            been generated.
        """
        entries = list(TAG_LOG) or [
            {"note": "No tags have been generated in this session yet."}
        ]
        return gr.update(value=entries, visible=True)


    # Generate tags for the current input image and show them.
    tag_btn.click(
        _on_generate_tags,
        inputs=[img_in],
        outputs=[tag_output],
    )

    # Show the accumulated tag log in the same JSON viewer.
    download_log_btn.click(
        _download_tag_log,
        inputs=None,
        outputs=[tag_output],
    )


    # --- Navigation between pages ---
    def show_home():
        """Show the home page and hide the other two pages.

        Returns one visibility update per page container, ordered
        (home_page, vision_agent_page, moondream_page).
        """
        page_flags = (True, False, False)
        return tuple(gr.update(visible=flag) for flag in page_flags)

    def show_agent():
        """Show the vision-agent page and hide the other two pages.

        Returns one visibility update per page container, ordered
        (home_page, vision_agent_page, moondream_page).
        """
        page_flags = (False, True, False)
        return tuple(gr.update(visible=flag) for flag in page_flags)

    def show_moondream():
        """Show the Moondream page and hide the other two pages.

        Returns one visibility update per page container, ordered
        (home_page, vision_agent_page, moondream_page).
        """
        page_flags = (False, False, True)
        return tuple(gr.update(visible=flag) for flag in page_flags)


    # Home page -> feature pages. Every navigation handler returns three
    # visibility updates, always in the order
    # (home_page, vision_agent_page, moondream_page).
    home_to_agent.click(
        show_agent,
        None,
        [home_page, vision_agent_page, moondream_page],
    )
    home_to_moon.click(
        show_moondream,
        None,
        [home_page, vision_agent_page, moondream_page],
    )

    # Feature pages -> back to the home page.
    back_from_agent.click(
        show_home,
        None,
        [home_page, vision_agent_page, moondream_page],
    )
    back_from_moon.click(
        show_home,
        None,
        [home_page, vision_agent_page, moondream_page],
    )


In [None]:
# %% launch
# Best-effort shutdown before relaunching, so this cell can be re-run:
# any error from close() (including `demo` not being defined yet) is
# deliberately ignored.
# NOTE(review): if the cell that builds `demo` was re-run as well, `demo`
# is a new object and this presumably does not stop the older server - confirm.
try:
    demo.close()
except Exception:
    pass

# Enable the request queue, then start the app.
demo.queue().launch(
    inline=True,               # render inside the notebook output
    show_error=True,           # surface callback errors in the UI
    server_name="127.0.0.1",   # listen on the loopback interface only
    server_port=None,          # no fixed port requested
)