# 🧑‍🏫 Lab X — Benchmarking Anthropic Models for Web-to-HTML Reconstruction

**Goals**

1. **Load** a screenshot of a webapp page (PNG/JPG)  
2. **Invoke** multiple Claude models on AWS Bedrock  
3. **Ask** each model to reproduce the HTML that generated the page  
4. **Capture & compare** latency, token usage, reasoning chains, and cost  
5. **Render** each generated HTML in the notebook for quick visual feedback


In [1]:
# 0. Environment Setup

# install AWS SDK, image handling, and table/output helpers
%pip install -q boto3 pillow pandas tabulate rich tqdm

Note: you may need to restart the kernel to use updated packages.


# 1. Imports & Configuration

In [2]:
from pathlib import Path
import json
import logging
import base64
from dataclasses import dataclass, field
from datetime import datetime
from time import sleep
from typing import Any, Dict, List, Optional, Tuple

import boto3
from botocore.config import Config
import pandas as pd
from rich.console import Console
from rich.table import Table
from tabulate import tabulate
from tqdm.auto import tqdm
from IPython.display import HTML, display

# Logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s"
)
logger = logging.getLogger("webapp_html_benchmark")
console = Console()

# AWS Bedrock client
AWS_REGION = "us-east-1"  # or os.getenv("AWS_REGION")
bedrock_config = Config(connect_timeout=10, read_timeout=300, retries={"max_attempts": 3})
bedrock_client = boto3.client(
    "bedrock-runtime",
    region_name=AWS_REGION,
    config=bedrock_config
)

2025-05-06 17:49:28,200 [INFO] botocore.credentials: Found credentials in shared credentials file: ~/.aws/credentials


# 2. Model Catalogue & Cost Reference

In [3]:
@dataclass
class ModelConfig:
    key: str
    display_name: str
    model_id: str
    max_tokens: int = 10_000
    temperature: float = 1.0
    thinking: Dict[str, Any] = field(default_factory=dict)
    price_in_per_1M: float = 0.0
    price_out_per_1M: float = 0.0

    @property
    def rate_in(self) -> float:
        return self.price_in_per_1M / 1e6

    @property
    def rate_out(self) -> float:
        return self.price_out_per_1M / 1e6

# Update with your region‐prefix logic if needed
def _geo_prefix(region: str) -> str:
    if region.startswith("us-"):
        return "us."
    if region.startswith("eu-"):
        return "eu."
    return "ap."

PFX = _geo_prefix(AWS_REGION)

# Per‐million‐token pricing
PRICING = {
    "haiku3.5":    (0.8, 4.0),
    "sonnet3.5v2": (3.0,15.0),
    "sonnet3.7":   (3.0,15.0),
}

EVAL_MODELS: List[ModelConfig] = [
    ModelConfig(
        key="haiku3.5",
        display_name="Claude 3.5 Haiku",
        model_id=f"{PFX}anthropic.claude-3-5-haiku-20241022-v1:0",
        price_in_per_1M=PRICING["haiku3.5"][0],
        price_out_per_1M=PRICING["haiku3.5"][1],
    ),
    ModelConfig(
        key="sonnet3.5v2",
        display_name="Claude 3.5 Sonnet v2",
        model_id=f"{PFX}anthropic.claude-3-5-sonnet-20241022-v2:0",
        price_in_per_1M=PRICING["sonnet3.5v2"][0],
        price_out_per_1M=PRICING["sonnet3.5v2"][1],
    ),
    ModelConfig(
        key="sonnet3.7_low",
        display_name="Claude 3.7 Sonnet (low reasoning)",
        model_id=f"{PFX}anthropic.claude-3-7-sonnet-20250219-v1:0",
        thinking={"type": "enabled", "budget_tokens": 2_048},
        price_in_per_1M=PRICING["sonnet3.7"][0],
        price_out_per_1M=PRICING["sonnet3.7"][1],
    ),
    ModelConfig(
        key="sonnet3.7_high",
        display_name="Claude 3.7 Sonnet (high reasoning)",
        model_id=f"{PFX}anthropic.claude-3-7-sonnet-20250219-v1:0",
        max_tokens=10_000,
        thinking={"type": "enabled", "budget_tokens": 8_192},
        price_in_per_1M=PRICING["sonnet3.7"][0],
        price_out_per_1M=PRICING["sonnet3.7"][1],
    ),
]

def show_pricing_table(models: List[ModelConfig]) -> None:
    table = Table(title="Price — USD per million tokens")
    table.add_column("Model", no_wrap=True)
    table.add_column("Input $/M", justify="right")
    table.add_column("Output $/M", justify="right")
    for m in models:
        table.add_row(m.display_name, f"{m.price_in_per_1M:.2f}", f"{m.price_out_per_1M:.2f}")
    console.print(table)

# Display pricing
show_pricing_table(EVAL_MODELS)

# 3. Load & Encode Webapp Screenshot

In [4]:
from PIL import Image
from io import BytesIO

def encode_image_to_base64(
    image_path: Path,
    max_dim: int = 512,
    jpeg_quality: int = 30
) -> str:
    """
    Opens an image, downscales it so its longest side is max_dim,
    saves it as a JPEG with the given quality into memory,
    and returns the base64 string.
    """
    if not image_path.exists():
        logger.error(f"Screenshot not found: {image_path}")
        raise FileNotFoundError(f"{image_path} does not exist")

    # 1) Open & downscale
    img = Image.open(image_path)
    img.thumbnail((max_dim, max_dim), Image.LANCZOS)

    # 2) JPEG-compress into a buffer
    buffer = BytesIO()
    img = img.convert("RGB")  # JPEG requires no alpha
    img.save(buffer, format="JPEG", quality=jpeg_quality, optimize=True)
    buffer.seek(0)

    # 3) Base64 encode
    b64 = base64.b64encode(buffer.read()).decode("utf-8")
    return b64

# Usage:
SCREENSHOT_PATH = Path("data") / "nextflix.png"
encoded_image = encode_image_to_base64(
    SCREENSHOT_PATH,
    max_dim=512,        # scale longest side to ≤512px
    jpeg_quality=30     # tune between 1–100 for size vs. fidelity
)

HTML_PROMPT = f"""
Below is a downscaled, JPEG-compressed screenshot of a web application, encoded in base64:
{encoded_image}

Please reproduce the minimal, well-formatted HTML source (CSS inline, no JS)
that, when rendered, would produce a visually equivalent page.
Provide **only** the <html>…</html> code block—no explanations.
"""

# 4. Invocation & Parsing

In [5]:
from datetime import datetime, timezone

def build_payload(prompt: str, cfg: ModelConfig) -> Dict[str, Any]:
    return {
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": cfg.max_tokens,
        "temperature": cfg.temperature,
        "anthropic_version": "bedrock-2023-05-31",
        **({"thinking": cfg.thinking} if cfg.thinking else {}),
    }

def parse_bedrock_response(
    body_bytes: bytes,
    prompt: str,
    cfg: ModelConfig
) -> Tuple[str, str, int, int]:
    """
    Returns (full_text, reasoning, tokens_in, tokens_out).
    Attempts to use reported usage if present, else falls back to splitting on whitespace.
    """
    data = json.loads(body_bytes)

    # 1) Try to pull 'usage' fields if available
    usage = data.get("usage", {})
    tokens_in  = usage.get("prompt_tokens", 0) or usage.get("promptTokens", 0) or 0
    tokens_out = usage.get("completion_tokens", 0) or usage.get("completionTokens", 0) or 0

    # 2) Extract text + reasoning
    content = data.get("content") or data.get("completions") or []
    texts, thoughts = [], []
    if isinstance(content, list):
        for chunk in content:
            if chunk.get("type") == "text":
                texts.append(chunk.get("text",""))
            if "thinking" in chunk:
                thoughts.append(chunk["thinking"])
    else:
        texts = [ data.get("completion","") ]
    full_text = "".join(texts).strip()
    reasoning = "".join(thoughts).strip()

    # 3) Fallback to whitespace‐split counts if usage wasn't provided
    if tokens_in == 0:
        tokens_in = len(prompt.split())
    if tokens_out == 0:
        tokens_out = len(full_text.split())

    return full_text, reasoning, tokens_in, tokens_out

def invoke_model(cfg: ModelConfig, prompt: str) -> Dict[str, Any]:
    logger.info(f"Invoking model: {cfg.display_name}")
    payload = build_payload(prompt, cfg)
    start = datetime.now(timezone.utc)

    try:
        resp = bedrock_client.invoke_model(
            modelId=cfg.model_id,
            contentType="application/json",
            accept="application/json",
            body=json.dumps(payload).encode("utf-8"),
        )
    except Exception as exc:
        logger.error(f"Error from {cfg.display_name}: {exc}")
        return {"name": cfg.display_name, "status": "ERROR", "error": str(exc)}

    latency = (datetime.now(timezone.utc) - start).total_seconds()
    body_bytes = resp["body"].read()

    # ← pass prompt along so fallback works
    html, reasoning, tokens_in, tokens_out = parse_bedrock_response(body_bytes, prompt, cfg)

    cost = round(tokens_in * cfg.rate_in + tokens_out * cfg.rate_out, 4)
    sleep(1)  # pacing between calls

    return {
        "name":            cfg.display_name,
        "status":          "OK",
        "latency_s":       round(latency, 2),
        "tokens_in":       tokens_in,
        "tokens_out":      tokens_out,
        "thinking_tokens": len(reasoning.split()),
        "cost_usd":        cost,
        "html":            html,
        "reasoning":       reasoning,
    }


In [6]:
# Which model do you want to benchmark today?  
# Change this to any key in EVAL_MODELS: "haiku3.5", "sonnet3.5v2", "sonnet3.7_low", "sonnet3.7_high"
SELECT_MODEL = "sonnet3.5v2"

# Find the matching ModelConfig (will error if you typo)
selected_cfgs = [m for m in EVAL_MODELS if m.key == SELECT_MODEL]
if not selected_cfgs:
    raise ValueError(f"No model matching key={SELECT_MODEL!r}. Valid keys: {[m.key for m in EVAL_MODELS]}")
# Optionally: you could support SELECT_MODEL = None to run all again
#    if SELECT_MODEL is None: selected_cfgs = EVAL_MODELS

In [7]:
# After:
results = []
for cfg in selected_cfgs:
    results.append(invoke_model(cfg, HTML_PROMPT))

# split successes / failures exactly as before...
successes = [r for r in results if r["status"] == "OK"]
failures  = [r for r in results if r["status"] != "OK"]

if failures:
    console.print("[yellow]⚠️ Model invocation failed:[/yellow]")
    console.print(f" • [bold]{failures[0]['name']}[/bold]: {failures[0]['error']}")

if successes:
    df = pd.DataFrame(successes)[
        ["name","latency_s","tokens_in","tokens_out","thinking_tokens","cost_usd"]
    ]
    console.rule(f"🏷 Run Metrics — {SELECT_MODEL}")
    print(tabulate(df, headers="keys", tablefmt="pretty", showindex=False))
else:
    console.print("[bold red]❌ No successful runs.[/bold red]")

2025-05-06 17:49:28,668 [INFO] webapp_html_benchmark: Invoking model: Claude 3.5 Sonnet v2


+----------------------+-----------+-----------+------------+-----------------+----------+
|         name         | latency_s | tokens_in | tokens_out | thinking_tokens | cost_usd |
+----------------------+-----------+-----------+------------+-----------------+----------+
| Claude 3.5 Sonnet v2 |   9.46    |    41     |    109     |        0        |  0.0018  |
+----------------------+-----------+-----------+------------+-----------------+----------+


# 6. Render Outputs Inline

In [8]:
for res in successes:
    console.rule(f"🔧 Output — {res['name']}")
    # raw HTML
    display(HTML(f"<pre style='white-space: pre-wrap; border:1px solid #ddd; padding:10px;'>"
                 f"{res['html']}</pre>"))
    # visual render
    display(HTML(res["html"]))
    if res["reasoning"]:
        console.print("[italic]Captured reasoning (truncated):[/italic]")
        console.print(res["reasoning"][:500] + ("…" if len(res["reasoning"]) > 500 else ""))

# 7. Discussion & Next Steps
Latency vs. Quality: …

Cost trade-offs: …

When to surface full reasoning: …

Tip: you can easily tweak temperature, max_tokens, or swap in other
Bedrock-hosted models (e.g. Google Gemini via Vertex, OpenAI GPT 4, LLaMA on-prem)
by adding another ModelConfig and rerunning the above cells.

pgsql
Copy
Edit

**Key points**  
- We read and base64-encode the screenshot so it travels in the JSON payload.  
- `invoke_model` collects everything: latency, in/out tokens, cost, and any reasoning.  
- Rendering returned HTML is as simple as `display(HTML(generated_html))`.  

Feel free to adapt this scaffold to more advanced “cascades” (e.g. feeding the HTML back into a second mo