# Dynamic Batching Benchmark on CUDA (Google Colab)

This notebook sets up the environment, runs the dynamic batching benchmark on a CUDA GPU (for example, a Tesla T4 on Google Colab), and plots the results.

In [None]:
# 1. Clone the repository
!git clone https://github.com/adarsh-gadepalli/inference-batching.git
%cd inference-batching

# 2. Install dependencies
!pip install -r requirements.txt

# 3. Verify CUDA is available
import torch
print(f"CUDA Available: {torch.cuda.is_available()}")
print(f"Device Name: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")

In [None]:
# 4. Run the comparison benchmark
# compare.py is executed as a shell command from the current working directory.
# Per the original note, it runs the server and the benchmark client in the
# same environment.
# NOTE(review): the plotting cell loads "results.json" from the working
# directory -- presumably compare.py writes that file; confirm.
!python compare.py

In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl

# -----------------------------
# Modern styling
# -----------------------------
# One light gray is shared by the axes frame and the grid lines.
_EDGE_GRAY = "#E5E7EB"

_MODERN_STYLE = {
    # White canvas and plotting area.
    "figure.facecolor": "white",
    "axes.facecolor": "white",
    # Thin, light frame around each axes.
    "axes.edgecolor": _EDGE_GRAY,
    "axes.linewidth": 1.0,
    # Solid, subtle grid lines.
    "grid.color": _EDGE_GRAY,
    "grid.linestyle": "-",
    "grid.linewidth": 0.8,
    # Grid enabled, horizontal lines only.
    "axes.grid": True,
    "axes.grid.axis": "y",
    # Typography.
    "font.size": 12,
    "axes.titlesize": 16,
    "axes.labelsize": 12,
    # Legends are drawn without a surrounding box.
    "legend.frameon": False,
}

# Apply the settings globally so every figure created afterwards picks them up.
mpl.rcParams.update(_MODERN_STYLE)

# Bar colour for each batching method, keyed by the capitalized method label
# that the plotting helper looks up.
_GRAY, _AMBER, _EMERALD = "#9CA3AF", "#F59E0B", "#10B981"

COLORS = {
    "None": _GRAY,
    "Dynamic": _AMBER,
    "Continuous": _EMERALD,
}

# -----------------------------
# Load results
# -----------------------------
# Parse the benchmark output from the working directory; the context manager
# closes the file as soon as the JSON has been read.
with open("results.json") as results_file:
    data = json.load(results_file)

# Load the parsed results into a DataFrame.
df = pd.DataFrame(data)

# Two-line tick label per experiment, e.g. "100 requests\n8 concurrent".
# The label is built column-wise on purpose: a row-wise `df.apply(..., axis=1)`
# hands each row over as a Series, and when every column is numeric that row is
# upcast to float, so integer counts would be rendered as "100.0" / "8.0".
df["experiment_id"] = (
    df["requests"].astype(str)
    + " requests\n"
    + df["concurrency"].astype(str)
    + " concurrent"
)

# -----------------------------
# Melt helpers
# -----------------------------
def melt_metric(prefix, value_name, frame=None):
    """Reshape one metric from wide to long format, one row per (experiment, method).

    Args:
        prefix: Metric prefix; the columns ``{prefix}_none``, ``{prefix}_dynamic``
            and ``{prefix}_continuous`` are melted.
        value_name: Name given to the resulting value column.
        frame: Wide DataFrame holding ``experiment_id`` and the three metric
            columns. Defaults to the notebook-level ``df`` so existing
            two-argument calls keep working.

    Returns:
        DataFrame with columns ``experiment_id``, ``Method`` and ``value_name``.
        ``Method`` is an ordered categorical: None < Dynamic < Continuous.

    Raises:
        KeyError: If ``experiment_id`` or one of the metric columns is missing.
    """
    source = df if frame is None else frame
    method_order = ["None", "Dynamic", "Continuous"]

    long_form = source.melt(
        id_vars=["experiment_id"],
        value_vars=[f"{prefix}_{name.lower()}" for name in method_order],
        var_name="Method",
        value_name=value_name,
    )

    # Every melted column name starts with "<prefix>_"; slice exactly that
    # leading part off (instead of replacing every occurrence) and capitalize
    # what remains to obtain the display label.
    labels = long_form["Method"].str[len(prefix) + 1:].str.capitalize()
    long_form["Method"] = pd.Categorical(
        labels, categories=method_order, ordered=True
    )
    return long_form

# Long-format frames for the three metrics, produced in a fixed order:
# request throughput, token throughput, latency.
df_melt, df_melt_tps, df_melt_lat = (
    melt_metric(metric_prefix, column_label)
    for metric_prefix, column_label in (
        ("throughput", "Requests/Sec"),
        ("tps", "Tokens/Sec"),
        ("latency", "Latency (ms)"),
    )
)

# -----------------------------
# Modern bar plot helper
# -----------------------------
def modern_barplot(ax, df, x, y, title, ylabel):
    """Draw a grouped bar chart: one bar per batching method at every x category.

    Args:
        ax: Matplotlib Axes to draw on.
        df: Long-format DataFrame with a "Method" column, the category column
            ``x`` and the numeric value column ``y``.
        x: Name of the column whose unique values become the x-axis groups
            (kept in order of first appearance).
        y: Name of the column providing the bar heights.
        title: Axes title.
        ylabel: Y-axis label.

    Bar heights are aligned to the x categories by value rather than by row
    position: repeated (category, method) rows are averaged, and a missing
    combination leaves a gap instead of shifting the bars or raising a
    shape-mismatch error.
    """
    methods = ["None", "Dynamic", "Continuous"]
    width = 0.22
    categories = list(df[x].unique())
    positions = list(range(len(categories)))
    # Offset that centres the whole group of bars on its tick.
    centre = (len(methods) - 1) / 2

    for i, method in enumerate(methods):
        sub = df[df["Method"] == method]
        # One height per category, in tick order.
        heights = sub.groupby(x, sort=False)[y].mean().reindex(categories)
        ax.bar(
            [pos + (i - centre) * width for pos in positions],
            heights.to_numpy(),
            width=width,
            label=method,
            color=COLORS[method],
        )

    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.set_xlabel("Experiment configuration")
    ax.set_xticks(positions)
    ax.set_xticklabels(categories)
    # Drop the top/right frame lines for a cleaner look.
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)

# -----------------------------
# Plot
# -----------------------------
# Three stacked panels, one per metric.
fig, axes = plt.subplots(3, 1, figsize=(11, 14))

# (long-format frame, value column, panel title, y-axis label) for each panel,
# listed top to bottom.
panel_specs = [
    (df_melt, "Requests/Sec", "Request Throughput", "Requests / sec"),
    (df_melt_tps, "Tokens/Sec", "Token Generation Throughput", "Tokens / sec"),
    (df_melt_lat, "Latency (ms)", "Average Latency", "Latency (ms)"),
]

for panel_ax, (panel_df, value_col, panel_title, y_label) in zip(axes, panel_specs):
    modern_barplot(
        panel_ax,
        panel_df,
        "experiment_id",
        value_col,
        panel_title,
        y_label,
    )

# All panels share the same methods and colours, so a single figure-level
# legend is built from the first panel's handles.
legend_handles, legend_labels = axes[0].get_legend_handles_labels()
fig.legend(legend_handles, legend_labels, loc="upper center", ncol=3)

# Keep the top 4% of the figure free for the shared legend.
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()
