# Dynamic Batching Benchmark on CUDA (Google Colab)

This notebook sets up the environment for the dynamic batching benchmark on a Tesla T4 GPU, runs the comparison, and plots the results.

In [None]:
# 1. Clone the repository
!git clone https://github.com/adarsh-gadepalli/inference-batching.git
%cd inference-batching

# 2. Install dependencies
!pip install -r requirements.txt

# 3. Verify CUDA is available
import torch
print(f"CUDA Available: {torch.cuda.is_available()}")
print(f"Device Name: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")

In [None]:
# 4. Run the comparison benchmark
# This runs the server and benchmark client in the same environment.
# compare.py is resolved relative to the current working directory, so this
# cell relies on the `%cd` into the cloned repository from the setup cell.
# NOTE(review): the visualization cell reads 'results.json' — presumably
# written by compare.py; confirm against the script.
!python compare.py

In [None]:
# 5. Visualize Results
import json
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Load data
with open('results.json', 'r') as f:
    data = json.load(f)

df = pd.DataFrame(data)

# Create clearer labels for experiments.
# Built column-wise instead of with a row-wise apply: apply(axis=1) hands each
# row over as a Series with one common dtype, so in an all-numeric frame the
# integer counts would be rendered as floats (e.g. "100.0 reqs").
df['experiment_id'] = (
    df['requests'].astype(str) + ' reqs\n' + df['concurrency'].astype(str) + ' conn'
)

# Batching strategies compared; each metric has one column per strategy,
# named '<prefix><method>' (e.g. 'throughput_dynamic').
METHODS = ('none', 'dynamic', 'continuous')


def plot_metric(frame, prefix, value_name, title, ax):
    """Draw one grouped bar chart comparing the batching methods for a metric.

    Args:
        frame: Results DataFrame containing 'experiment_id' and the columns
            '<prefix>none', '<prefix>dynamic', '<prefix>continuous'.
        prefix: Column-name prefix identifying the metric (e.g. 'tps_').
        value_name: Name of the melted value column; also used as the y-axis label.
        title: Title for the subplot.
        ax: Matplotlib Axes to draw on.

    Returns:
        The long-form (melted) DataFrame that was plotted.
    """
    melted = frame.melt(id_vars=['experiment_id'],
                        value_vars=[f'{prefix}{method}' for method in METHODS],
                        var_name='Method', value_name=value_name)
    # regex=False: the prefix is a literal string, not a pattern.
    melted['Method'] = melted['Method'].str.replace(prefix, '', regex=False).str.title()

    sns.barplot(data=melted, x='experiment_id', y=value_name, hue='Method', ax=ax)
    ax.set_title(title, fontsize=14)
    ax.set_xlabel('')
    return melted


# Set style
sns.set_theme(style="whitegrid")
fig, axes = plt.subplots(3, 1, figsize=(12, 18))

# 1. Throughput (Requests/Sec)
df_melt = plot_metric(df, 'throughput_', 'Requests/Sec',
                      'Request Throughput (Higher is Better)', axes[0])

# 2. Tokens Per Second (TPS)
df_melt_tps = plot_metric(df, 'tps_', 'Tokens/Sec',
                          'Token Generation Throughput (TPS) (Higher is Better)', axes[1])

# 3. Latency (Avg)
df_melt_lat = plot_metric(df, 'latency_', 'Latency (ms)',
                          'Average Latency (Lower is Better)', axes[2])
axes[2].set_yscale('log') # Log scale because "None" is often huge
# Only the bottom panel carries an x-axis label; the panels share the same categories.
axes[2].set_xlabel('Experiment Configuration')

plt.tight_layout()
plt.show()