## 1. Setup and Data Loading

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import json

# Set theme
# Make "plotly_white" the default template so figures created after this point
# use it without passing `template=` explicitly.
import plotly.io as pio
pio.templates.default = "plotly_white"

In [None]:
# Load routing results from CSV into `results_df`.
# Point the path below at your own results file if it lives elsewhere.
results_df = pd.read_csv("../data/routing_results.csv")

# Quick sanity check: how many rows were read and which columns are available
n_rows = len(results_df)
column_names = list(results_df.columns)
print(f"Total requests: {n_rows:,}")
print(f"Columns: {column_names}")

# Last expression of the cell: rich preview of the first rows
results_df.head()

## 2. Routing Decision Distribution

Visualize how requests were routed: weak only, strong only, or weak → strong (evaluator triggered).

In [None]:
# Count routing decisions (value_counts orders by frequency, descending)
routing_counts = results_df["model_used"].value_counts()

# Colours are keyed by routing decision rather than by position, so a given
# decision keeps the same colour as in the other charts of this notebook no
# matter which decision happens to be the most frequent in the data.
routing_colors = {'weak': '#2ecc71', 'strong': '#e74c3c', 'weak_then_strong': '#f39c12'}
fallback_color = '#95a5a6'  # neutral grey for any decision not listed above

# Create labels with percentages (share of all requests)
n_requests = len(results_df)
labels = [
    f"{model}<br>({count / n_requests * 100:.1f}%)"
    for model, count in routing_counts.items()
]
values = routing_counts.tolist()
slice_colors = [routing_colors.get(model, fallback_color) for model in routing_counts.index]

# Create donut chart (pie with a hole for the centre annotation)
fig = go.Figure(data=[go.Pie(
    labels=labels,
    values=values,
    hole=0.4,
    marker_colors=slice_colors,
)])

fig.update_layout(
    title="Routing Decision Distribution",
    # Text placed in the middle of the donut hole
    annotations=[dict(text='Routing<br>Decisions', x=0.5, y=0.5, font_size=14, showarrow=False)],
    height=500,
)

fig.show()

## 3. Cost Comparison Analysis

Compare costs between our smart routing system and baseline (always using strong model).

In [None]:
# Calculate total and average costs of the smart router
total_cost = results_df["cost_usd"].sum()
avg_cost = results_df["cost_usd"].mean()

# Calculate baseline (always strong) cost.
# This is a placeholder: the baseline is estimated as a fixed multiple of the
# observed router cost. It is NOT the per-request price ratio between the
# strong and weak models; replace it with a value derived from your own data.
BASELINE_COST_MULTIPLIER = 2.5
baseline_cost = total_cost * BASELINE_COST_MULTIPLIER

# Prepare data (the per-request average is kept for reference; only the
# totals are plotted below)
cost_data = pd.DataFrame({
    "Strategy": ["Smart Router", "Always Strong (Baseline)"],
    "Total Cost ($)": [total_cost, baseline_cost],
    "Avg Cost per Request ($)": [avg_cost, baseline_cost / len(results_df)],
})

# Savings shown in the chart subtitle; guard the ratio so a zero baseline
# (e.g. no recorded cost) reports 0% instead of NaN.
savings_usd = baseline_cost - total_cost
savings_pct = (1 - total_cost / baseline_cost) * 100 if baseline_cost else 0.0

# Create bar chart of total cost per strategy
fig = go.Figure()

fig.add_trace(go.Bar(
    name='Total Cost',
    x=cost_data["Strategy"],
    y=cost_data["Total Cost ($)"],
    text=[f"${v:.2f}" for v in cost_data["Total Cost ($)"]],
    textposition='outside',
    marker_color=['#2ecc71', '#e74c3c'],
))

fig.update_layout(
    title=f"Cost Comparison<br><sub>Savings: ${savings_usd:.2f} ({savings_pct:.1f}%)</sub>",
    yaxis_title="Total Cost (USD)",
    showlegend=False,
    height=500,
)

fig.show()

In [None]:
# Per-decision cost summary: total, mean and request count, plus each
# decision's share (in percent) of the overall cost computed earlier.
cost_by_model = (
    results_df
    .groupby("model_used")["cost_usd"]
    .agg(['sum', 'mean', 'count'])
    .assign(total_contribution=lambda frame: frame['sum'] / total_cost * 100)
)

print("Cost breakdown by routing decision:")
print(cost_by_model)

# One text label per bar: dollar amount on top, percentage share underneath
bar_text = [
    f"${amount:.2f}<br>({share:.1f}%)"
    for amount, share in zip(cost_by_model['sum'], cost_by_model['total_contribution'])
]
decision_palette = {'weak': '#2ecc71', 'strong': '#e74c3c', 'weak_then_strong': '#f39c12'}

# Bar chart of total cost per routing decision
fig = px.bar(
    cost_by_model.reset_index(),
    x='model_used',
    y='sum',
    color='model_used',
    text=bar_text,
    labels={'model_used': 'Routing Decision', 'sum': 'Total Cost ($)'},
    title="Cost Contribution by Routing Decision",
    color_discrete_map=decision_palette,
)
fig.update_traces(textposition='outside')
fig.update_layout(showlegend=False, height=500)
fig.show()

## 4. Quality Score Distributions

Analyze the distribution of quality scores from the evaluator model.

In [None]:
# Evaluator threshold drawn on the charts; adjust to your config
evaluator_threshold = 0.7

# Keep only the rows that carry a weak-response quality score
has_weak_score = results_df["weak_quality_score"].notna()
weak_evaluated = results_df[has_weak_score]
weak_scores = weak_evaluated["weak_quality_score"]

print(f"Weak responses evaluated: {len(weak_evaluated):,}")
print(f"Mean quality score: {weak_scores.mean():.3f}")
print(f"Median quality score: {weak_scores.median():.3f}")

# Histogram of the scores
score_histogram = go.Histogram(
    x=weak_scores,
    nbinsx=50,
    name="Quality Scores",
    marker_color='#3498db',
    opacity=0.7,
)
fig = go.Figure(data=[score_histogram])

# Dashed vertical marker at the threshold
fig.add_vline(
    x=evaluator_threshold,
    line_dash="dash",
    line_color="red",
    annotation_text=f"Threshold ({evaluator_threshold})",
    annotation_position="top right",
)

histogram_layout = dict(
    title="Distribution of Weak Response Quality Scores",
    xaxis_title="Quality Score",
    yaxis_title="Count",
    showlegend=False,
    height=500,
)
fig.update_layout(**histogram_layout)

fig.show()

In [None]:
# Weak-response quality scores, one box per final routing decision.
# Uses `weak_evaluated` and `evaluator_threshold` from the previous cell.
box_options = dict(
    x="model_used",
    y="weak_quality_score",
    color="model_used",
    title="Quality Score Distribution by Routing Decision",
    labels={"model_used": "Final Routing", "weak_quality_score": "Quality Score"},
    color_discrete_map={'weak': '#2ecc71', 'weak_then_strong': '#f39c12'},
)
fig = px.box(weak_evaluated, **box_options)

# Dashed horizontal marker at the evaluator threshold
fig.add_hline(
    y=evaluator_threshold,
    line_dash="dash",
    line_color="red",
    annotation_text="Threshold",
)

fig.update_layout(showlegend=False, height=500)
fig.show()

## 5. Latency Analysis

Examine request latency across different routing decisions.

In [None]:
# Latency summary per routing decision: mean, median and tail percentiles
latency_by_decision = results_df.groupby("model_used")["latency_ms"]
latency_stats = latency_by_decision.agg(
    Mean='mean',
    Median='median',
    P95=lambda series: series.quantile(0.95),
    P99=lambda series: series.quantile(0.99),
)

print("Latency statistics by routing decision (ms):")
print(latency_stats.round(0))

# Violin plot with an embedded box plot; only outlier points are drawn
violin_options = dict(
    x="model_used",
    y="latency_ms",
    color="model_used",
    box=True,
    points="outliers",
    title="Latency Distribution by Routing Decision",
    labels={"model_used": "Routing Decision", "latency_ms": "Latency (ms)"},
    color_discrete_map={'weak': '#2ecc71', 'strong': '#e74c3c', 'weak_then_strong': '#f39c12'},
)
fig = px.violin(results_df, **violin_options)

fig.update_layout(showlegend=False, height=500)
fig.show()

## 6. Cost-Quality Trade-off Curves

Explore the relationship between cost and quality at different threshold settings.

In [None]:
# Simulate different threshold settings
# This would ideally use actual data from multiple threshold experiments


def simulate_tradeoff(threshold):
    """Return the simulated operating point for one threshold value.

    The model is purely illustrative: strong-model usage falls linearly as the
    threshold rises, and both cost and quality are linear in that usage.

    Args:
        threshold: Threshold setting to simulate.

    Returns:
        dict with keys "threshold", "strong_model_usage_pct" (percent),
        "relative_cost" and "relative_quality" (both relative to always-strong,
        as fractions).
    """
    # Simulate: higher threshold = more weak usage = lower cost but potentially lower quality
    strong_pct = (1 - threshold) * 100
    return {
        "threshold": threshold,
        "strong_model_usage_pct": strong_pct,
        "relative_cost": 0.2 + (strong_pct / 100) * 0.8,  # Cost relative to always-strong
        "relative_quality": 0.7 + (strong_pct / 100) * 0.3,  # Quality relative to always-strong
    }


# np.arange with a fractional step accumulates floating-point error
# (e.g. it yields 0.7000000000000001 instead of 0.7), so round the grid to
# one decimal to get clean threshold values.
thresholds = np.round(np.arange(0.1, 1.0, 0.1), 1)
simulated_data = [simulate_tradeoff(threshold) for threshold in thresholds]

sim_df = pd.DataFrame(simulated_data)

# Create trade-off curve
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=sim_df["relative_cost"] * 100,
    y=sim_df["relative_quality"] * 100,
    mode='lines+markers',
    name='Cost-Quality Frontier',
    text=[f"Threshold: {t:.1f}<br>Strong: {s:.0f}%" for t, s in zip(sim_df["threshold"], sim_df["strong_model_usage_pct"])],
    hovertemplate='<b>Cost:</b> %{x:.1f}%<br><b>Quality:</b> %{y:.1f}%<br>%{text}<extra></extra>',
    marker=dict(size=10, color=sim_df["threshold"], colorscale='Viridis', showscale=True, colorbar=dict(title="Threshold")),
    line=dict(width=2),
))

# Add current operating point.
# Compute it directly from the simulation instead of looking it up in sim_df
# with an exact float comparison: that lookup came back empty for 0.7 and
# raised IndexError, and would also fail for any threshold not on the grid.
current_threshold = 0.7
current_point = simulate_tradeoff(current_threshold)
fig.add_trace(go.Scatter(
    x=[current_point["relative_cost"] * 100],
    y=[current_point["relative_quality"] * 100],
    mode='markers',
    name='Current Setting',
    marker=dict(size=15, color='red', symbol='star'),
))

fig.update_layout(
    title="Cost vs Quality Trade-off Curve",
    xaxis_title="Relative Cost (%)",
    yaxis_title="Relative Quality (%)",
    height=600,
    hovermode='closest',
)

fig.show()

## 7. Routing Score vs Actual Decision

Analyze how well the RouteLLM routing score correlates with final routing decisions.

In [None]:
# Router threshold drawn on the chart; adjust to your config
routing_threshold = 0.5

# Per-request cost against routing score, coloured by the decision taken
scatter_options = dict(
    x="routing_score",
    y="cost_usd",
    color="model_used",
    title="Routing Score vs Actual Cost",
    labels={"routing_score": "RouteLLM Score (Strong Win Rate)", "cost_usd": "Cost ($)"},
    color_discrete_map={'weak': '#2ecc71', 'strong': '#e74c3c', 'weak_then_strong': '#f39c12'},
    opacity=0.6,
    height=500,
)
fig = px.scatter(results_df, **scatter_options)

# Dashed vertical marker at the router threshold
fig.add_vline(
    x=routing_threshold,
    line_dash="dash",
    line_color="purple",
    annotation_text=f"Router Threshold ({routing_threshold})",
)

fig.show()

## 8. Summary Dashboard

Create a comprehensive dashboard with key metrics.

In [None]:
# Calculate key metrics
total_requests = len(results_df)
weak_only = (results_df["model_used"] == "weak").sum()
strong_only = (results_df["model_used"] == "strong").sum()
weak_then_strong = (results_df["model_used"] == "weak_then_strong").sum()
avg_cost = results_df["cost_usd"].mean()
total_cost = results_df["cost_usd"].sum()
avg_latency = results_df["latency_ms"].mean()
evaluator_trigger_rate = (weak_then_strong / total_requests) * 100

# Share of requests that ended up calling the strong model, either directly
# or after the evaluator escalated a weak response.
strong_usage_pct = (strong_only + weak_then_strong) / total_requests * 100

# Per-request cost of the always-strong baseline. Reuses `baseline_cost` from
# the cost comparison in section 3 so the delta shown here agrees with the
# savings reported there (a separately hard-coded factor would always show
# the same fixed percentage regardless of the baseline assumption).
baseline_avg_cost = baseline_cost / total_requests

# Create subplots: a 2x3 grid of indicator tiles
fig = make_subplots(
    rows=2, cols=3,
    subplot_titles=(
        "Total Requests", "Avg Cost/Request", "Avg Latency",
        "Strong Model Usage", "Evaluator Trigger Rate", "Total Cost"
    ),
    specs=[
        [{"type": "indicator"}, {"type": "indicator"}, {"type": "indicator"}],
        [{"type": "indicator"}, {"type": "indicator"}, {"type": "indicator"}]
    ],
)

# Add indicators
fig.add_trace(go.Indicator(
    mode="number",
    value=total_requests,
    number={"font": {"size": 50}},
), row=1, col=1)

fig.add_trace(go.Indicator(
    mode="number+delta",
    value=avg_cost,
    number={"prefix": "$", "font": {"size": 50}},
    delta={
        "reference": baseline_avg_cost,
        "relative": True,
        # For a cost metric a decrease is the good outcome, so swap the
        # default colours: green when below baseline, red when above.
        "decreasing": {"color": "#2ecc71"},
        "increasing": {"color": "#e74c3c"},
    },
), row=1, col=2)

fig.add_trace(go.Indicator(
    mode="number",
    value=avg_latency,
    number={"suffix": "ms", "font": {"size": 50}},
), row=1, col=3)

fig.add_trace(go.Indicator(
    mode="gauge+number",
    value=strong_usage_pct,
    number={"suffix": "%", "font": {"size": 40}},
    gauge={"axis": {"range": [0, 100]}, "bar": {"color": "#e74c3c"}},
), row=2, col=1)

fig.add_trace(go.Indicator(
    mode="gauge+number",
    value=evaluator_trigger_rate,
    number={"suffix": "%", "font": {"size": 40}},
    gauge={"axis": {"range": [0, 100]}, "bar": {"color": "#f39c12"}},
), row=2, col=2)

fig.add_trace(go.Indicator(
    mode="number",
    value=total_cost,
    number={"prefix": "$", "font": {"size": 50}},
), row=2, col=3)

fig.update_layout(
    title_text="Smart LLM Routing - Summary Dashboard",
    height=600,
)

fig.show()

## Conclusion

This notebook provides comprehensive visualization of the smart LLM routing system's performance:

- **Cost Savings**: Significant reduction compared to always-strong baseline
- **Quality Maintenance**: Evaluator ensures quality by triggering strong model when needed
- **Latency Trade-offs**: Understand performance characteristics of different routing decisions
- **Optimization Opportunities**: Use threshold tuning to find optimal cost-quality balance

Next steps:
1. Run more experiments with different threshold settings
2. Collect human evaluation data for quality assessment
3. A/B test against production traffic
4. Monitor metrics in real-time with dashboard