# 🧑‍🏫 Lab X — Benchmarking Anthropic Models for Web-to-HTML Reconstruction

**Goals**

1. **Load** a screenshot of a webapp page (PNG/JPG)  
2. **Invoke** multiple Claude models on AWS Bedrock  
3. **Ask** each model to reproduce the HTML that generated the page  
4. **Capture & compare** latency, token usage, reasoning chains, and cost  
5. **Render** each generated HTML in the notebook for quick visual feedback

---

In [None]:
# 0. Environment Setup

# Install the notebook's dependencies: the AWS SDK (boto3), image handling
# (pillow), and table/console output helpers (pandas, tabulate, rich, tqdm).
%pip install -q boto3 pillow pandas tabulate rich tqdm


# 1. Imports & Configuration

In [None]:
from __future__ import annotations
import os, json, logging, base64
from dataclasses import dataclass, field
from datetime import datetime
from time import sleep
from typing import Any, Dict, List

import boto3
from botocore.config import Config
import pandas as pd
from rich.console import Console
from rich.table import Table
from tabulate import tabulate

# For inline HTML rendering
from IPython.display import HTML, display

# ─── Logging & AWS Bedrock client ─────────────────────────────────────────
# Configure root logging once, then create the notebook's logger and the Rich
# console used for status output.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    level=logging.INFO,
)
log = logging.getLogger("webapp_html_benchmark")
console = Console()

# Region comes from the environment, falling back to us-east-1.
AWS_REGION = os.environ.get("AWS_REGION", "us-east-1")

# Client settings: 10 s to connect, 300 s to read a response, up to 3 attempts.
bedrock_cfg = Config(
    retries={"max_attempts": 3},
    read_timeout=300,
    connect_timeout=10,
)
BEDROCK = boto3.client(
    "bedrock-runtime",
    config=bedrock_cfg,
    region_name=AWS_REGION,
)


# 2. Model Catalogue & Cost Reference

In [None]:
@dataclass
class ModelConfig:
    """Settings and list prices for one benchmarked model variant.

    Prices are expressed in USD per one million tokens; use
    :attr:`per_token_rates` to get the per-token equivalents.
    """

    # Short identifier for the variant, e.g. "sonnet3.7_low".
    key: str
    # Human-readable label shown in console output and result rows.
    display_name: str
    # Model identifier passed to Bedrock as ``modelId``.
    model_id: str
    # Upper bound on generated tokens for the request.
    max_tokens: int = 10_000
    # Sampling temperature for the request.
    temperature: float = 0.0
    # Optional "thinking" request section; an empty dict means "not set".
    thinking: Dict[str, Any] = field(default_factory=dict)
    # Input / output price in USD per one million tokens.
    price_in_per_1M: float = 0.0
    price_out_per_1M: float = 0.0

    @property
    def per_token_rates(self) -> tuple[float, float]:
        """Return the ``(input, output)`` price in USD for a single token."""
        tokens_per_million = 1e6
        usd_per_input_token = self.price_in_per_1M / tokens_per_million
        usd_per_output_token = self.price_out_per_1M / tokens_per_million
        return usd_per_input_token, usd_per_output_token

# Price table keyed by model family: (input, output) in USD per one million
# tokens. Update these values with your own pricing.
PRICING = {
    "haiku3.5": (0.8, 4.0),
    "sonnet3.5v2": (3.0, 15.0),
    "sonnet3.7": (3.0, 15.0),
}

# helper to prefix model IDs by region
def _geo_prefix(region: str) -> str:
    if region.startswith("us-"): return "us."
    if region.startswith("eu-"): return "eu."
    return "ap."

# Region-derived prefix prepended to every model ID below.
PFX = _geo_prefix(AWS_REGION)


def _price_fields(pricing_key: str) -> Dict[str, float]:
    """Turn a PRICING entry into the ModelConfig price keyword arguments."""
    usd_in, usd_out = PRICING[pricing_key]
    return {"price_in_per_1M": usd_in, "price_out_per_1M": usd_out}


# Both Claude 3.7 Sonnet variants share one model ID and differ only in their
# thinking budget.
_SONNET_37_ID = f"{PFX}anthropic.claude-3-7-sonnet-20250219-v1:0"

# Models to benchmark, in the order they are invoked.
EVAL_MODELS: list[ModelConfig] = [
    ModelConfig(
        key="haiku3.5",
        display_name="Claude 3.5 Haiku",
        model_id=f"{PFX}anthropic.claude-3-5-haiku-20241022-v1:0",
        **_price_fields("haiku3.5"),
    ),
    ModelConfig(
        key="sonnet3.5v2",
        display_name="Claude 3.5 Sonnet v2",
        model_id=f"{PFX}anthropic.claude-3-5-sonnet-20241022-v2:0",
        **_price_fields("sonnet3.5v2"),
    ),
    ModelConfig(
        key="sonnet3.7_low",
        display_name="Claude 3.7 Sonnet (low reasoning)",
        model_id=_SONNET_37_ID,
        thinking={"type": "enabled", "budget_tokens": 2048},
        **_price_fields("sonnet3.7"),
    ),
    # NOTE(review): per Anthropic's extended-thinking docs the thinking budget
    # counts toward max_tokens, which would leave roughly 1.8k tokens for the
    # HTML here — confirm this is intended.
    ModelConfig(
        key="sonnet3.7_high",
        display_name="Claude 3.7 Sonnet (high reasoning)",
        model_id=_SONNET_37_ID,
        max_tokens=10_000,
        thinking={"type": "enabled", "budget_tokens": 8192},
        **_price_fields("sonnet3.7"),
    ),
]

def show_pricing():
    """Print a Rich table of input/output prices for every model in EVAL_MODELS."""
    price_table = Table(title="Price — USD per million tokens")
    price_table.add_column("Model")
    # Both price columns are right-aligned.
    for heading in ("Input $/M", "Output $/M"):
        price_table.add_column(heading, justify="right")
    for model in EVAL_MODELS:
        price_table.add_row(
            model.display_name,
            f"{model.price_in_per_1M:.2f}",
            f"{model.price_out_per_1M:.2f}",
        )
    console.print(price_table)

show_pricing()


# 3. Load & Encode Webapp Screenshot

In [None]:
from PIL import Image

# path to your screenshot
SCREENSHOT_PATH = "webapp_screenshot.png"

# read & base64-encode
with open(SCREENSHOT_PATH, "rb") as f:
    raw_bytes = f.read()
b64_image = base64.b64encode(raw_bytes).decode("utf-8")

# prompt template
# NOTE(review): the base64 string is pasted into the prompt as plain text, so
# the model receives a long run of characters rather than an image (and a
# typical screenshot is very large once encoded). The Anthropic Messages API
# expects images as {"type": "image", "source": {...}} content blocks —
# consider sending the screenshot that way instead.
#
# The original literal contained "\<" escapes (markdown residue). Those are
# invalid Python escape sequences: they trigger a warning and leave literal
# backslashes in the prompt, so they are removed here.
HTML_PROMPT = f"""
Below is a screenshot of a web application, encoded in base64:
{b64_image}
Please reproduce the minimal, well-formatted HTML source (no CSS or JS) that,
when rendered, would produce a visually equivalent page. Provide **only** the
<html>…</html> code block—no explanations.
"""

In [None]:
def build_payload(
    prompt: str,
    cfg: ModelConfig,
    image_b64: str | None = None,
    image_media_type: str = "image/png",
) -> Dict[str,Any]:
    """Build the Anthropic Messages request body for a Bedrock invocation.

    Args:
        prompt: Text of the single user message.
        cfg: Model settings; ``max_tokens``, ``temperature`` and ``thinking``
            are read from it.
        image_b64: Optional base64-encoded image. When given, the user message
            is sent as an image content block followed by the text prompt;
            when omitted, the message content is just ``prompt`` (the
            original behaviour).
        image_media_type: MIME type of ``image_b64`` (for example
            ``"image/png"`` or ``"image/jpeg"``).

    Returns:
        A JSON-serialisable dict ready to be passed as the request body.
    """
    if image_b64:
        content: Any = [
            {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": image_media_type,
                    "data": image_b64,
                },
            },
            {"type": "text", "text": prompt},
        ]
    else:
        content = prompt

    body: Dict[str, Any] = {
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": cfg.max_tokens,
        "messages": [{"role": "user", "content": content}],
    }

    if cfg.thinking:
        body["thinking"] = cfg.thinking

    # Extended thinking is not compatible with a custom temperature: the API
    # rejects such requests, so the field is left out when thinking is enabled.
    thinking_enabled = bool(cfg.thinking) and cfg.thinking.get("type") == "enabled"
    if not thinking_enabled:
        body["temperature"] = cfg.temperature
    return body

def invoke_model(cfg: ModelConfig, prompt: str) -> Dict[str,Any]:
    """Invoke one model on Bedrock and collect output, timing and cost metrics.

    Args:
        cfg: Model to call and its request settings and prices.
        prompt: Text prompt sent as the user message.

    Returns:
        On success, a dict with ``status == "OK"`` and the keys ``name``,
        ``latency_s``, ``tokens_in``, ``tokens_out``, ``thinking_tokens``,
        ``cost_usd``, ``html``, ``reasoning`` and ``stop_reason``.
        On failure (request error or unreadable response), a dict with
        ``status == "ERROR"``, ``name`` and ``error``; no exception is raised,
        so one failing model does not abort a benchmark run.
    """
    # Monotonic clock: unaffected by wall-clock adjustments during the call.
    from time import perf_counter

    console.log(f"→ Invoking {cfg.display_name}")
    request = json.dumps(build_payload(prompt, cfg))
    started = perf_counter()
    try:
        resp = BEDROCK.invoke_model(
            modelId=cfg.model_id,
            body=request,
            contentType="application/json",
            accept="application/json"
        )
        # Reading and parsing the body is part of the round trip, so it is
        # timed and guarded together with the request itself.
        data = json.loads(resp["body"].read())
    except Exception as e:
        log.warning("Invocation of %s failed: %s", cfg.display_name, e)
        return {"name":cfg.display_name, "status":"ERROR", "error":str(e)}
    latency = perf_counter() - started

    # extract model output & reasoning
    blocks = data.get("content") or []
    full_text = "".join(
        blk.get("text", "") for blk in blocks if blk.get("type") == "text"
    ).strip()
    reasoning = "".join(
        blk.get("thinking", "") for blk in blocks if blk.get("type") == "thinking"
    ).strip()

    # Prefer the token counts reported in the response's "usage" section;
    # whitespace word counts are only a rough fallback when it is missing.
    usage = data.get("usage") or {}
    tokens_in = usage.get("input_tokens")
    if tokens_in is None:
        tokens_in = len(prompt.split())
    tokens_out = usage.get("output_tokens")
    if tokens_out is None:
        tokens_out = len(full_text.split())

    rate_in, rate_out = cfg.per_token_rates
    cost_usd = round(tokens_in*rate_in + tokens_out*rate_out, 4)

    sleep(1)  # pacing

    return {
        "name":           cfg.display_name,
        "status":         "OK",
        "latency_s":      round(latency,2),
        "tokens_in":      tokens_in,
        "tokens_out":     tokens_out,
        # Approximation: a whitespace word count of the reasoning text, not a
        # tokenizer count.
        "thinking_tokens": len(reasoning.split()),
        "cost_usd":       cost_usd,
        "html":           full_text,
        "reasoning":      reasoning,
        # e.g. "max_tokens" indicates the HTML was cut off by the token limit.
        "stop_reason":    data.get("stop_reason"),
    }


In [None]:
# run all models
import re
from html import escape as _escape_attr

results = [invoke_model(cfg, HTML_PROMPT) for cfg in EVAL_MODELS]

# report failed runs instead of silently dropping them
for failed in (r for r in results if r["status"] != "OK"):
    console.print(
        f"✗ {failed['name']}: {failed.get('error', 'unknown error')}",
        style="red",
        markup=False,  # error text may contain square brackets
    )

# show metrics table
ok_results = [r for r in results if r["status"]=="OK"]
_METRIC_COLUMNS = ["name","latency_s","tokens_in","tokens_out","thinking_tokens","cost_usd"]
console.rule("🏷  Run Metrics")
if ok_results:
    metrics_df = pd.DataFrame(ok_results)[_METRIC_COLUMNS]
    print(tabulate(metrics_df, headers="keys", tablefmt="pretty", showindex=False))
else:
    # Selecting columns from an empty frame raises KeyError, so build an empty
    # frame with the expected columns instead.
    metrics_df = pd.DataFrame(columns=_METRIC_COLUMNS)
    console.print("No successful runs — nothing to tabulate.")

# Models often wrap their answer in a markdown code fence; pull out the fenced
# content so the fence markers are not rendered as page text.
_FENCE_RE = re.compile(r"```[a-zA-Z]*[ \t]*\n(.*?)```", re.DOTALL)


def _extract_markup(answer: str) -> str:
    """Return the content of the first fenced block in *answer*, or *answer* itself."""
    fenced = _FENCE_RE.search(answer)
    return (fenced.group(1) if fenced else answer).strip()


# display each HTML and render it
for res in ok_results:
    console.rule(f"🔧 Output — {res['name']}")
    markup = _extract_markup(res["html"])
    # show raw HTML in a code block
    print("```html")
    print(markup)
    print("```")
    # Render inside a sandboxed iframe so generated scripts cannot run and
    # generated styles cannot leak into the notebook page. The outer <div>
    # avoids IPython's "consider using IFrame" warning for bare <iframe> HTML.
    display(HTML(
        '<div><iframe sandbox srcdoc="'
        + _escape_attr(markup, quote=True)
        + '" style="width:100%;height:600px;border:1px solid #ccc"></iframe></div>'
    ))
    if res["reasoning"]:
        console.print("[i]Reasoning chain captured:[/i]")
        # markup=False: reasoning text may contain "[...]" that Rich would
        # otherwise try to parse as style tags.
        console.print(
            res["reasoning"][:1000] + ("…" if len(res["reasoning"])>1000 else ""),
            markup=False,
        )


# 6. Discussion & Next Steps
Latency vs. Quality: …

Cost trade-offs: …

When to surface full reasoning: …

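Tip: you can easily tweak temperature, max_tokens, or swap in other
Anthropic models hosted on Bedrock by adding another ModelConfig and rerunning
the above cells. Models from other providers on Bedrock (e.g. Meta Llama,
Mistral) and models hosted elsewhere (e.g. Google Gemini via Vertex,
OpenAI GPT-4, LLaMA on-prem) use different request formats, so they also need
their own payload builder and client.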

---

**Key points**  
- We read and base64-encode the screenshot so it travels in the JSON payload.  
- `invoke_model` collects everything: latency, in/out tokens, cost, and any reasoning.  
- Rendering returned HTML is as simple as `display(HTML(generated_html))`.  

Feel free to adapt this scaffold to more advanced “cascades” (e.g. feeding the HTML back into a second mo