diff --git a/docs/diagrams/plot_examples/multi_run/config_experiment_classification/pareto_curve_throughput_per_gpu_vs_interactivity.png b/docs/diagrams/plot_examples/multi_run/config_experiment_classification/pareto_curve_throughput_per_gpu_vs_interactivity.png new file mode 100644 index 000000000..241db7f3e Binary files /dev/null and b/docs/diagrams/plot_examples/multi_run/config_experiment_classification/pareto_curve_throughput_per_gpu_vs_interactivity.png differ diff --git a/docs/diagrams/plot_examples/multi_run/config_experiment_classification/ttft_vs_throughput.png b/docs/diagrams/plot_examples/multi_run/config_experiment_classification/ttft_vs_throughput.png new file mode 100644 index 000000000..f61078dc0 Binary files /dev/null and b/docs/diagrams/plot_examples/multi_run/config_experiment_classification/ttft_vs_throughput.png differ diff --git a/docs/diagrams/plot_examples/multi_run/pareto_curve_throughput_per_gpu_vs_interactivity.png b/docs/diagrams/plot_examples/multi_run/pareto_curve_throughput_per_gpu_vs_interactivity.png index b202fc631..17c00dc89 100644 Binary files a/docs/diagrams/plot_examples/multi_run/pareto_curve_throughput_per_gpu_vs_interactivity.png and b/docs/diagrams/plot_examples/multi_run/pareto_curve_throughput_per_gpu_vs_interactivity.png differ diff --git a/docs/diagrams/plot_examples/multi_run/pareto_curve_throughput_per_gpu_vs_latency.png b/docs/diagrams/plot_examples/multi_run/pareto_curve_throughput_per_gpu_vs_latency.png index 9e4a9d5ed..4d3305811 100644 Binary files a/docs/diagrams/plot_examples/multi_run/pareto_curve_throughput_per_gpu_vs_latency.png and b/docs/diagrams/plot_examples/multi_run/pareto_curve_throughput_per_gpu_vs_latency.png differ diff --git a/docs/diagrams/plot_examples/multi_run/theme_dark_mode/pareto_curve_throughput_per_gpu_vs_interactivity.png b/docs/diagrams/plot_examples/multi_run/theme_dark_mode/pareto_curve_throughput_per_gpu_vs_interactivity.png index 754d1dc46..92fce1549 100644 Binary files a/docs/diagrams/plot_examples/multi_run/theme_dark_mode/pareto_curve_throughput_per_gpu_vs_interactivity.png and b/docs/diagrams/plot_examples/multi_run/theme_dark_mode/pareto_curve_throughput_per_gpu_vs_interactivity.png differ diff --git a/docs/diagrams/plot_examples/multi_run/theme_dark_mode/pareto_curve_throughput_per_gpu_vs_latency.png b/docs/diagrams/plot_examples/multi_run/theme_dark_mode/pareto_curve_throughput_per_gpu_vs_latency.png index 0338df32f..06021ab43 100644 Binary files a/docs/diagrams/plot_examples/multi_run/theme_dark_mode/pareto_curve_throughput_per_gpu_vs_latency.png and b/docs/diagrams/plot_examples/multi_run/theme_dark_mode/pareto_curve_throughput_per_gpu_vs_latency.png differ diff --git a/docs/diagrams/plot_examples/multi_run/theme_dark_mode/ttft_vs_throughput.png b/docs/diagrams/plot_examples/multi_run/theme_dark_mode/ttft_vs_throughput.png index 9dc55eabf..58e3d1839 100644 Binary files a/docs/diagrams/plot_examples/multi_run/theme_dark_mode/ttft_vs_throughput.png and b/docs/diagrams/plot_examples/multi_run/theme_dark_mode/ttft_vs_throughput.png differ diff --git a/docs/diagrams/plot_examples/multi_run/ttft_vs_throughput.png b/docs/diagrams/plot_examples/multi_run/ttft_vs_throughput.png index 1d66c0110..44524bd13 100644 Binary files a/docs/diagrams/plot_examples/multi_run/ttft_vs_throughput.png and b/docs/diagrams/plot_examples/multi_run/ttft_vs_throughput.png differ diff --git a/docs/diagrams/plot_examples/single_run/timeslices/timeslices_throughput_warning.png 
b/docs/diagrams/plot_examples/single_run/timeslices/timeslices_throughput_warning.png new file mode 100644 index 000000000..fe1890f94 Binary files /dev/null and b/docs/diagrams/plot_examples/single_run/timeslices/timeslices_throughput_warning.png differ diff --git a/docs/tutorials/plot.md b/docs/tutorials/plot.md index 789113fda..258c57039 100644 --- a/docs/tutorials/plot.md +++ b/docs/tutorials/plot.md @@ -5,229 +5,300 @@ SPDX-License-Identifier: Apache-2.0 # Visualization and Plotting with AIPerf -Generate PNG visualizations from AIPerf profiling data with automatic mode detection (single-run analysis or multi-run comparison), NVIDIA brand styling, and support for multi-run comparisons and single-run analysis. +Generate PNG visualizations from AIPerf profiling data with automatic mode detection, NVIDIA brand styling, and support for multi-run comparisons and single-run analysis. ## Overview -The `aiperf plot` command generates static PNG visualizations from your profiling results. It automatically detects whether to show multi-run comparison plots or single-run time series analysis based on your directory structure, making it easy to visualize performance trends without manual configuration. +The `aiperf plot` command automatically detects whether to generate multi-run comparison plots or single-run time series analysis based on your directory structure. It integrates GPU telemetry and timeslice data when available. **Key Features:** -- **Automatic mode detection**: Compares multiple runs or analyzes single runs based on directory structure -- **GPU telemetry integration**: Visualize power, utilization, memory, and temperature metrics -- **Timeslice support**: View performance evolution across time windows +- Automatic mode detection (multi-run comparison vs single-run analysis) +- GPU telemetry integration (power, utilization, memory, temperature) +- Timeslice support (performance evolution across time windows) +- Configurable plots via `~/.aiperf/plot_config.yaml` ## Quick Start ```bash -# Analyze a single profiling run (outputs to /plots/) +# Analyze a single profiling run aiperf plot -# Compare multiple runs in a directory (outputs to /plots/) +# Compare multiple runs in a directory aiperf plot -# Compare all runs in multiple directories (outputs to /plots/) -aiperf plot ... +# Compare all runs across multiple directories +aiperf plot -# Compare multiple specific runs (outputs to /plots/) -aiperf plot ... +# Compare specific runs +aiperf plot # Specify custom output location aiperf plot --output -# Use dark theme for presentations +# Use dark theme aiperf plot --theme dark ``` -The command automatically detects visualization mode: -- **Multi-run comparison**: When directory contains multiple run subdirectories -- **Single-run analysis**: When directory contains `profile_export.jsonl` directly +**Output directory logic:** +- If `--output` specified: uses that path +- Otherwise: `/plots/` +- Default (no paths): `./artifacts/plots/` + +**Customize plots**: Edit `~/.aiperf/plot_config.yaml` (auto-created on first run) to enable/disable plots or customize visualizations. See [Plot Configuration](#plot-configuration-yaml) for details. 
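+
+The output-directory logic above can be summarized in a short sketch. This is illustrative only: the helper name `resolve_output_dir` is not part of the AIPerf CLI, it simply mirrors the rules listed above.
+
+```python
+from pathlib import Path
+
+
+def resolve_output_dir(paths: list[str] | None, output: str | None) -> Path:
+    """Sketch of the rules above: --output wins, otherwise <first input path>/plots."""
+    input_paths = [Path(p) for p in (paths or ["./artifacts"])]  # default input path
+    return Path(output) if output else input_paths[0] / "plots"
+
+
+# resolve_output_dir(None, None)                    -> Path("artifacts/plots")
+# resolve_output_dir(["runs/sweep"], None)          -> Path("runs/sweep/plots")
+# resolve_output_dir(["runs/sweep"], "out/figures") -> Path("out/figures")
+```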
## Visualization Modes -The plot command automatically detects the visualization mode based on directory structure: +The plot command automatically detects visualization mode based on directory structure: ### Multi-Run Comparison Mode -**Detected when:** -- Directory contains multiple run subdirectories -- Multiple paths are specified as arguments +Compares metrics across multiple profiling runs to identify optimal configurations. + +**Auto-detected when:** +- Directory contains multiple run subdirectories, OR +- Multiple paths specified as arguments -**Example directory structures:** +**Example:** ``` -artifacts/sweep_qwen/ # Contains multiple runs +artifacts/sweep_qwen/ ├── Qwen3-0.6B-concurrency1/ ├── Qwen3-0.6B-concurrency2/ └── Qwen3-0.6B-concurrency4/ ``` -**Generated plots (3 default):** -2. **TTFT vs Throughput** - Time to first token vs request throughput across concurrency levels -4. **Token Throughput per GPU vs Latency** - GPU efficiency vs latency (when GPU telemetry available) -5. **Token Throughput per GPU vs Interactivity** - GPU efficiency vs TTFT (when GPU telemetry available) +**Default plots (3):** +1. **TTFT vs Throughput** - Time to first token vs request throughput +2. **Token Throughput per GPU vs Latency** - GPU efficiency vs latency (requires GPU telemetry) +3. **Token Throughput per GPU vs Interactivity** - GPU efficiency vs TTFT (requires GPU telemetry) + +> [!TIP] +> Use [Experiment Classification](#experiment-classification) to assign semantic colors (grey for baselines, green for treatments) for clearer visual distinction. -#### Example Multi-Run Visualizations +#### Example Visualizations ![TTFT vs Throughput](../diagrams/plot_examples/multi_run/ttft_vs_throughput.png) -The TTFT vs Throughput plot shows how time to first token varies with request throughput across different concurrency levels, helping identify configurations that balance responsiveness with system load. +Shows how time to first token varies with request throughput across concurrency levels, helping identify configurations that balance responsiveness with load. ![Pareto Curve: Throughput per GPU vs Latency](../diagrams/plot_examples/multi_run/pareto_curve_throughput_per_gpu_vs_latency.png) -The Pareto curve highlights optimal configurations that maximize GPU efficiency while minimizing latency. Points on the Pareto frontier represent the best trade-offs between these metrics. +Highlights optimal configurations on the Pareto frontier that maximize GPU efficiency while minimizing latency. ![Pareto Curve: Throughput per GPU vs Interactivity](../diagrams/plot_examples/multi_run/pareto_curve_throughput_per_gpu_vs_interactivity.png) -This Pareto curve shows the trade-off between GPU efficiency (tokens/sec/GPU) and interactivity (TTFT), helping identify configurations that maximize GPU utilization while maintaining acceptable first-token latency. +Shows the trade-off between GPU efficiency and interactivity (TTFT). ### Single-Run Analysis Mode -**Detected when:** +Analyzes performance over time for a single profiling run. + +**Auto-detected when:** - Directory contains `profile_export.jsonl` directly -- Path points to a single profiling run -**Example directory structure:** +**Example:** ``` -artifacts/single_run/ # Single run directory +artifacts/single_run/ └── profile_export.jsonl ``` -**Generated plots (4+ default):** -1. **TTFT Over Time** - Scatter plot of time to first token for each request -2. **Inter-Token Latency Over Time** - Scatter plot of ITL for each request -3. 
**Request Latency Over Time** - Area chart showing end-to-end latency progression -4. **Dispersed Throughput Over Time** - Event-based throughput showing continuous token generation rate +**Default plots (4+):** +1. **TTFT Over Time** - Time to first token per request +2. **Inter-Token Latency Over Time** - ITL per request +3. **Request Latency Over Time** - End-to-end latency progression +4. **Dispersed Throughput Over Time** - Continuous token generation rate **Additional plots (when data available):** -- **Timeslice plots**: TTFT, ITL, throughput, and latency metrics across time windows (when `--slice-duration` was used) -- **GPU telemetry plots**: GPU utilization and memory usage over time (when `--gpu-telemetry` was used) +- Timeslice plots (when `--slice-duration` used during profiling) +- GPU telemetry plots (when `--gpu-telemetry` used during profiling) -#### Example Single-Run Time Series Visualizations +#### Example Visualizations ![TTFT Over Time](../diagrams/plot_examples/single_run/time_series/ttft_over_time.png) -The TTFT Over Time scatter plot shows the time to first token for each request throughout the benchmark run, helping identify patterns in prefill latency and potential warm-up or degradation effects. +Time to first token for each request, revealing prefill latency patterns and potential warm-up effects. ![Inter-Token Latency Over Time](../diagrams/plot_examples/single_run/time_series/itl_over_time.png) -The ITL Over Time scatter plot displays inter-token latency for each request, revealing generation performance consistency and identifying outliers or performance variations over the run duration. +Inter-token latency per request, showing generation performance consistency. ![Request Latency Over Time](../diagrams/plot_examples/single_run/time_series/latency_over_time.png) -The Request Latency Over Time area chart shows end-to-end latency progression throughout the run, providing a holistic view of system performance including both prefill and generation phases. +End-to-end latency progression throughout the run. -### Dispersed Throughput Over Time +### Dispersed Throughput -The **Dispersed Throughput Over Time** plot uses an event-based approach to accurately represent token generation rates. Unlike traditional binning methods that create artificial spikes when requests complete, this visualization distributes tokens evenly across the time they were actually generated. +The **Dispersed Throughput Over Time** plot uses an event-based approach for accurate token generation rate visualization. Unlike binning methods that create artificial spikes, this distributes tokens evenly across their actual generation time: +- **Prefill phase** (request_start → TTFT): 0 tok/sec +- **Generation phase** (TTFT → request_end): constant rate = output_tokens / (request_end - TTFT) -**How it works:** -- **Prefill phase** (`request_start` → `TTFT`): No output tokens (0 tok/sec) -- **Generation phase** (`TTFT` → `request_end`): Constant token rate = `output_tokens / (request_end - TTFT)` -- Plot shows sum of all concurrent token generation rates -- Throughput changes only at discrete events (request starts generating or completes) +This provides smooth, continuous representation that correlates better with server metrics like GPU utilization. 
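+
+The event sweep behind this plot fits in a few lines. The sketch below is a simplified illustration, not the AIPerf implementation (the plot package exports a `calculate_throughput_events` helper, presumably the production version of this calculation); request tuples use absolute timestamps in seconds, with the first-token time standing in for TTFT.
+
+```python
+def dispersed_throughput_curve(requests):
+    """Step curve of total tokens/sec from (start_s, first_token_s, end_s, output_tokens) tuples."""
+    events = []  # (time, rate_delta): generation start adds a rate, completion removes it
+    for start, first_token, end, tokens in requests:  # prefill (start -> first_token) adds 0 tok/sec
+        if end <= first_token:
+            continue  # skip degenerate requests with no generation phase
+        rate = tokens / (end - first_token)  # constant rate over the generation phase
+        events.append((first_token, rate))
+        events.append((end, -rate))
+
+    curve, total = [], 0.0
+    for t, delta in sorted(events):
+        total += delta
+        curve.append((t, total))  # throughput only changes at these discrete events
+    return curve
+
+
+# Two overlapping requests: the summed rate is 50 tok/sec, then 100, then 50, then 0.
+curve = dispersed_throughput_curve([(0.0, 1.0, 5.0, 200), (0.5, 2.0, 6.0, 200)])
+```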
-**Why dispersed vs binning?** +![Dispersed Throughput Over Time](../diagrams/plot_examples/single_run/dispersed_throughput_over_time.png) -Traditional binning creates artificial spikes, requires arbitrary bin size selection, and misses patterns between bins. Dispersed throughput provides smooth, continuous representation that accurately correlates with server metrics like GPU utilization. +## Customization Options -![Dispersed Throughput Over Time](../diagrams/plot_examples/single_run/dispersed_throughput_over_time.png) +### Plot Configuration YAML -## Command Options +Customize which plots are generated and how they appear by editing `~/.aiperf/plot_config.yaml`. -### Basic Options +#### Enable/Disable Plots -```bash -# Analyze a single profiling run (outputs to /plots/) -aiperf plot +**Multi-run plots:** +```yaml +visualization: + multi_run_defaults: + - pareto_curve_throughput_per_gpu_vs_latency + - pareto_curve_throughput_per_gpu_vs_interactivity + - ttft_vs_throughput +``` -# Compare multiple runs in a directory (outputs to /plots/) -aiperf plot +**Single-run plots:** +```yaml +visualization: + single_run_defaults: + - ttft_over_time + - itl_over_time + - dispersed_throughput_over_time + # ... add or remove plots +``` -# Compare all runs in multiple directories (outputs to /plots/) -aiperf plot ... +#### Customize Plot Grouping -# Compare multiple specific runs (outputs to /plots/) -aiperf plot ... +Multi-run comparison plots group runs to create colored lines/series. Customize the `groups:` field in plot presets: -# Specify custom output location -aiperf plot --output +**Group by model** (useful for comparing different models): +```yaml +multi_run_plots: + ttft_vs_throughput: + groups: [model] +``` -# Use dark theme for presentations -aiperf plot --theme dark +**Group by directory** (useful for hierarchical experiments): +```yaml +multi_run_plots: + ttft_vs_throughput: + groups: [experiment_group] ``` -### Output Directory Logic +**Group by run name** (default - each run is separate): +```yaml +multi_run_plots: + ttft_vs_throughput: + groups: [run_name] +``` -The output directory follows this logic: -1. If `--output` is specified, use that path -2. Otherwise, use `/plots/` -3. Default first input path is `./artifacts` if no paths specified +> [!NOTE] +> When experiment classification is enabled, all multi-run plots automatically group by `experiment_group` to preserve treatment variants with semantic colors. -**Examples:** -```bash -# Outputs to: ./artifacts/plots/ -aiperf plot +> [!TIP] +> See the CONFIGURATION GUIDE section in `~/.aiperf/plot_config.yaml` for detailed customization options. + +### Experiment Classification + +Classify runs as "baseline" or "treatment" for semantic color assignment in multi-run comparisons. + +**Configuration** (`~/.aiperf/plot_config.yaml`): +```yaml +experiment_classification: + baselines: + - "*baseline*" # Glob patterns + - "*_agg_*" + treatments: + - "*treatment*" + - "*_disagg_*" + default: treatment # Fallback when no match +``` -# Outputs to: /plots/ -aiperf plot +**Result:** +- **Baselines**: Grey shades, listed first in legend +- **Treatments**: NVIDIA green shades, listed after baselines +- **Use case**: Clear visual distinction for A/B testing -# Outputs to: -aiperf plot --output +> [!IMPORTANT] +> When enabled, **all multi-run plots automatically group by experiment_group** (directory name) to preserve individual treatment variants with semantic baseline/treatment colors. 
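+
+Under the hood, classification is a plain glob match (see `DataLoader._classify_experiment_type` in this change). The sketch below reproduces the idea in isolation; the real loader also matches each pattern against the full run path, not just the run name.
+
+```python
+from fnmatch import fnmatch
+
+
+def classify_run(run_name: str, baselines: list[str], treatments: list[str],
+                 default: str = "treatment") -> str:
+    """Simplified sketch: baseline patterns are checked before treatment patterns."""
+    if any(fnmatch(run_name, pattern) for pattern in baselines):
+        return "baseline"
+    if any(fnmatch(run_name, pattern) for pattern in treatments):
+        return "treatment"
+    return default  # fallback when nothing matches
+
+
+# classify_run("baseline_moderate_io", ["*baseline*", "*_agg_*"], ["*treatment*"])  -> "baseline"
+# classify_run("vllm_disagg_run", ["*baseline*", "*_agg_*"], ["*_disagg_*"])        -> "treatment"
+```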
+
+**Pattern notes**: Patterns use glob syntax (`*` = wildcard) and are case-sensitive; baseline patterns are checked before treatment patterns, and the first match wins.
+
+#### Example
+
+**Directory structure:**
+```
+artifacts/
+├── baseline_moderate_io_isl100_osl200_streaming/      # Grey
+│   ├── concurrency_1/
+│   └── concurrency_2/
+├── treatment_large_context_isl500_osl50_streaming/    # Green
+│   ├── concurrency_1/
+│   └── concurrency_2/
+└── treatment_long_generation_isl50_osl500_streaming/  # Blue
+    ├── concurrency_1/
+    └── concurrency_2/
+```

-## Theme Options
+**Result**: 3 lines in plots (1 baseline + 2 treatments, each with semantic colors)

-Choose between light and dark themes for your plots:
+**Advanced**: Use `group_extraction_pattern` to aggregate variants:
+```yaml
+group_extraction_pattern: '^(treatment_\d+)'  # Groups treatment_1_varA + treatment_1_varB → "treatment_1"
+```
+
+> [!TIP]
+> See `src/aiperf/plot/default_plot_config.yaml` for all configuration options.
+
+![Pareto Curve with Experiment Classification](../diagrams/plot_examples/multi_run/config_experiment_classification/pareto_curve_throughput_per_gpu_vs_interactivity.png)
+
+![TTFT vs Throughput with Experiment Classification](../diagrams/plot_examples/multi_run/config_experiment_classification/ttft_vs_throughput.png)
+
+### Theme Options

```bash
# Light theme (default)
aiperf plot
-aiperf plot --theme light
-
-# Dark theme
+# Dark theme (for presentations)
aiperf plot --theme dark
```

-### Dark Theme Examples
-
-The dark theme uses a dark background optimized for presentations and low-light environments while maintaining NVIDIA brand colors and readability.
+The dark theme uses a dark background optimized for presentations while maintaining NVIDIA brand colors.

#### Multi-Run Dark Theme

![TTFT vs Throughput (Dark)](../diagrams/plot_examples/multi_run/theme_dark_mode/ttft_vs_throughput.png)

-![Pareto Curve: Throughput per GPU vs Latency (Dark)](../diagrams/plot_examples/multi_run/theme_dark_mode/pareto_curve_throughput_per_gpu_vs_latency.png)
+![Pareto Curve: Latency (Dark)](../diagrams/plot_examples/multi_run/theme_dark_mode/pareto_curve_throughput_per_gpu_vs_latency.png)

-![Pareto Curve: Throughput per GPU vs Interactivity (Dark)](../diagrams/plot_examples/multi_run/theme_dark_mode/pareto_curve_throughput_per_gpu_vs_interactivity.png)
+![Pareto Curve: Interactivity (Dark)](../diagrams/plot_examples/multi_run/theme_dark_mode/pareto_curve_throughput_per_gpu_vs_interactivity.png)

#### Single-Run Dark Theme

-![GPU Utilization and Throughput Over Time (Dark)](../diagrams/plot_examples/single_run/time_series/theme_dark_mode/gpu_utilization_and_throughput_over_time.png)
+![GPU Utilization (Dark)](../diagrams/plot_examples/single_run/time_series/theme_dark_mode/gpu_utilization_and_throughput_over_time.png)

-![Inter-Token Latency Over Time (Dark)](../diagrams/plot_examples/single_run/time_series/theme_dark_mode/itl_over_time.png)
+![ITL Over Time (Dark)](../diagrams/plot_examples/single_run/time_series/theme_dark_mode/itl_over_time.png)

![ITL Across Timeslices (Dark)](../diagrams/plot_examples/single_run/time_series/theme_dark_mode/timeslices_itl.png)

-## GPU Telemetry Integration
+## Advanced Features
+
+### GPU Telemetry Integration

-When GPU telemetry is collected during profiling (via `--gpu-telemetry` flag), the plot command automatically includes GPU metrics in visualizations.
+When GPU telemetry is collected (via `--gpu-telemetry` flag during profiling), plots automatically include GPU metrics.
-**Multi-run plots** (when telemetry available across runs): +**Multi-run plots** (when telemetry available): - Token Throughput per GPU vs Latency -- Token Throughput per GPU vs Interactivity (TTFT) +- Token Throughput per GPU vs Interactivity -**Single-run plots** (time series with telemetry): +**Single-run plots** (time series): - GPU Utilization Over Time - GPU Memory Usage Over Time ![GPU Utilization and Throughput Over Time](../diagrams/plot_examples/single_run/time_series/gpu_utilization_and_throughput_over_time.png) > [!TIP] -> For GPU telemetry setup and detailed analysis, see the [GPU Telemetry Tutorial](gpu-telemetry.md). +> See the [GPU Telemetry Tutorial](gpu-telemetry.md) for setup and detailed analysis. -## Timeslice Integration +### Timeslice Integration -When timeslice data is available (via `--slice-duration` during profiling), the plot command automatically generates timeslice visualizations showing performance evolution across time windows. +When timeslice data is available (via `--slice-duration` during profiling), plots show performance evolution across time windows. **Generated timeslice plots:** - TTFT Across Timeslices @@ -244,67 +315,60 @@ These help identify warm-up effects, performance degradation, and steady-state b ![Latency Across Timeslices](../diagrams/plot_examples/single_run/timeslices/timeslices_latency.png) > [!TIP] -> For detailed timeslice configuration and analysis, see the [Timeslices Tutorial](timeslices.md). - +> See the [Timeslices Tutorial](timeslices.md) for configuration and analysis. ## Output Files -The plot command generates the following files in the output directory: +Plots are saved as PNG files in the output directory: ``` plots/ -├── pareto_curve_latency_vs_throughput.png ├── ttft_vs_throughput.png -├── output_token_throughput_per_user_vs_concurrency.png -├── dispersed_throughput_over_time.png (for single-run analysis) -├── token_throughput_per_gpu_vs_latency.png (if GPU telemetry available) -├── token_throughput_per_gpu_vs_ttft.png (if GPU telemetry available) -└── ... (additional plots based on mode and available data) +├── pareto_curve_throughput_per_gpu_vs_latency.png +├── pareto_curve_throughput_per_gpu_vs_interactivity.png +├── ttft_over_time.png (single-run) +├── dispersed_throughput_over_time.png (single-run) +├── gpu_utilization_and_throughput_over_time.png (if GPU telemetry) +└── timeslices_*.png (if timeslice data available) ``` - - ## Best Practices > [!TIP] -> **Consistent Configurations**: When comparing runs, keep all parameters identical except the one you're testing (e.g., only vary concurrency). This ensures plots show the impact of that specific parameter. -> Future features in interactive mode will allow pop-ups to show specific configurations of plotted runs. +> **Consistent Configurations**: When comparing runs, vary only one parameter (e.g., concurrency) while keeping others constant. This isolates the impact of that specific parameter. > [!TIP] -> **Include Warmup**: Use `--warmup-request-count` to ensure the server reaches steady state before measurement. This reduces noise in your visualizations. +> **Use Experiment Classification**: Configure [experiment classification](#experiment-classification) to distinguish baselines from treatments with semantic colors. + +> [!TIP] +> **Include Warmup**: Use `--warmup-request-count` to ensure steady state before measurement, reducing noise in visualizations. > [!WARNING] -> **Directory Structure**: The plot command relies on consistent directory naming. 
Ensure all runs you want to compare are in subdirectories of a common parent directory. +> **Directory Structure**: Ensure consistent naming - runs to compare must be in subdirectories of a common parent. > [!NOTE] -> **GPU Metrics**: GPU telemetry plots only appear when telemetry data is available. Make sure DCGM is running and accessible during profiling. See [GPU Telemetry Tutorial](gpu-telemetry.md). +> **GPU Metrics**: GPU telemetry plots only appear when telemetry data is available. Ensure DCGM is running during profiling. See [GPU Telemetry Tutorial](gpu-telemetry.md). ## Troubleshooting ### No Plots Generated -**Problem**: Running `aiperf plot` but no PNG files appear. - **Solutions**: -- Verify the input directory contains valid profiling data (`profile_export.jsonl` files) -- Check that the output directory is writable -- Look for error messages in the console output +- Verify input directory contains valid `profile_export.jsonl` files +- Check output directory is writable +- Review console output for error messages ### Missing GPU Plots -**Problem**: Expected GPU telemetry plots but they don't appear. - **Solutions**: -- Verify GPU telemetry was collected during profiling (check `gpu_telemetry_export.jsonl` for telemetry data) -- Ensure DCGM exporter was running and accessible during profiling -- Confirm telemetry data is present in the profile exports +- Verify `gpu_telemetry_export.jsonl` exists and contains data +- Ensure DCGM exporter was running during profiling +- Check telemetry data is present in profile exports ### Incorrect Mode Detection -**Problem**: Multi-run data showing single-run plots or vice versa. - **Solutions**: -- Check directory structure matches expected format: +- Check directory structure: - Multi-run: parent directory with multiple run subdirectories - Single-run: directory with `profile_export.jsonl` directly inside - Ensure all run directories contain valid `profile_export.jsonl` files @@ -312,6 +376,6 @@ plots/ ## Related Documentation - [Working with Profile Exports](working-with-profile-exports.md) - Understanding profiling data format -- [GPU Telemetry](gpu-telemetry.md) - Collecting GPU metrics during profiling +- [GPU Telemetry](gpu-telemetry.md) - Collecting GPU metrics - [Timeslices](timeslices.md) - Time-windowed performance analysis -- [Request Rate and Concurrency](request-rate-concurrency.md) - Load generation strategies for sweeps +- [Request Rate and Concurrency](request-rate-concurrency.md) - Load generation strategies diff --git a/src/aiperf/cli.py b/src/aiperf/cli.py index f7d4446e6..0391fb382 100644 --- a/src/aiperf/cli.py +++ b/src/aiperf/cli.py @@ -40,6 +40,8 @@ def plot( paths: list[str] | None = None, output: str | None = None, theme: str = "light", + config: str | None = None, + verbose: bool = False, ) -> None: """Generate PNG visualizations from AIPerf profiling data. @@ -48,12 +50,28 @@ def plot( whether to generate multi-run comparison plots or single-run time series plots based on the directory structure. + On first run, automatically creates ~/.aiperf/plot_config.yaml which you can edit to + customize plots, including experiment classification (baseline vs treatment runs). + Use --config to specify a different config file. + + Examples: + # Generate plots (auto-creates ~/.aiperf/plot_config.yaml on first run) + aiperf plot + + # Use custom config + aiperf plot --config my_plots.yaml + + # Show detailed error tracebacks + aiperf plot --verbose + Args: paths: Paths to profiling run directories. 
Defaults to ./artifacts if not specified. output: Directory to save generated plots. Defaults to /plots if not specified. theme: Plot theme to use: 'light' (white background) or 'dark' (dark background). Defaults to 'light'. + config: Path to custom plot configuration YAML file. If not specified, auto-creates and uses ~/.aiperf/plot_config.yaml. + verbose: Show detailed error tracebacks in console (errors are always logged to /aiperf_plot.log). """ - with exit_on_error(title="Error Running Plot Command"): + with exit_on_error(title="Error Running Plot Command", show_traceback=verbose): from aiperf.plot.cli_runner import run_plot_controller - run_plot_controller(paths, output, theme=theme) + run_plot_controller(paths, output, theme=theme, config=config, verbose=verbose) diff --git a/src/aiperf/cli_utils.py b/src/aiperf/cli_utils.py index 9768956d5..52e45319d 100644 --- a/src/aiperf/cli_utils.py +++ b/src/aiperf/cli_utils.py @@ -68,6 +68,7 @@ class exit_on_error(AbstractContextManager): text_color: The text color to use. title: The title of the error. exit_code: The exit code to use. + show_traceback: Whether to show the full exception traceback. Defaults to True. """ def __init__( @@ -77,12 +78,14 @@ def __init__( text_color: "StyleType | None" = None, title: str = "Error", exit_code: int = 1, + show_traceback: bool = True, ): self.message: RenderableType = message self.text_color: StyleType | None = text_color self.title: str = title self.exit_code: int = exit_code self.exceptions: tuple[type[BaseException], ...] = exceptions + self.show_traceback: bool = show_traceback def __enter__(self): return self @@ -98,13 +101,17 @@ def __exit__(self, exc_type, exc_value, traceback): from rich.console import Console console = Console() - console.print_exception( - show_locals=True, - max_frames=10, - word_wrap=True, - width=console.width, - ) - console.file.flush() + + # Only show full traceback if requested + if self.show_traceback: + console.print_exception( + show_locals=True, + max_frames=10, + word_wrap=True, + width=console.width, + ) + console.file.flush() + message = ( self.message.format(e=exc_value) if isinstance(self.message, str) diff --git a/src/aiperf/common/enums/__init__.py b/src/aiperf/common/enums/__init__.py index 3f61b0b62..0a6f9cf13 100644 --- a/src/aiperf/common/enums/__init__.py +++ b/src/aiperf/common/enums/__init__.py @@ -67,6 +67,7 @@ MetricValueType, MetricValueTypeInfo, MetricValueTypeVarT, + PlotMetricDirection, PowerMetricUnit, PowerMetricUnitInfo, TemperatureMetricUnit, @@ -153,6 +154,7 @@ "MetricValueTypeInfo", "MetricValueTypeVarT", "ModelSelectionStrategy", + "PlotMetricDirection", "PowerMetricUnit", "PowerMetricUnitInfo", "PromptSource", diff --git a/src/aiperf/common/enums/metric_enums.py b/src/aiperf/common/enums/metric_enums.py index 69b3db5ea..1b0750a31 100644 --- a/src/aiperf/common/enums/metric_enums.py +++ b/src/aiperf/common/enums/metric_enums.py @@ -414,6 +414,16 @@ class MetricType(CaseInsensitiveStrEnum): Examples: request throughput, output token throughput, etc.""" +class PlotMetricDirection(CaseInsensitiveStrEnum): + """Direction indicating whether higher or lower metric values are better for plotting purposes.""" + + HIGHER = "higher" + """Higher values are better (e.g., throughput, accuracy).""" + + LOWER = "lower" + """Lower values are better (e.g., latency, error rate).""" + + class MetricValueTypeInfo(BasePydanticEnumInfo): """Information about a metric value type.""" diff --git a/src/aiperf/plot/__init__.py b/src/aiperf/plot/__init__.py index 
3a444848d..08a5a728b 100644 --- a/src/aiperf/plot/__init__.py +++ b/src/aiperf/plot/__init__.py @@ -12,6 +12,10 @@ from aiperf.plot.cli_runner import ( run_plot_controller, ) +from aiperf.plot.config import ( + PlotConfig, + logger, +) from aiperf.plot.constants import ( ALL_STAT_KEYS, AVAILABLE_STATS, @@ -23,6 +27,7 @@ DEFAULT_PLOT_HEIGHT, DEFAULT_PLOT_WIDTH, DEFAULT_PNG_OUTPUT_DIR, + DERIVED_METRIC_DIRECTIONS, LIGHT_THEME_COLORS, NON_METRIC_KEYS, NVIDIA_BORDER_DARK, @@ -35,6 +40,7 @@ NVIDIA_GREEN, NVIDIA_TEXT_LIGHT, NVIDIA_WHITE, + OUTLIER_RED, PLOT_FONT_FAMILY, PLOT_LOG_FILE, PROFILE_EXPORT_AIPERF_JSON, @@ -49,6 +55,7 @@ DataLoader, DataSource, DerivedMetricCalculator, + ExperimentClassificationConfig, MetricSpec, ModeDetector, PlotGenerator, @@ -66,6 +73,7 @@ auto_select_label_by, calculate_rolling_percentiles, calculate_throughput_events, + detect_directional_outliers, detect_swept_parameters, flatten_config, get_nvidia_color_scheme, @@ -98,11 +106,16 @@ TimeSliceHandler, ) from aiperf.plot.logging import ( + setup_console_only_logging, setup_plot_logging, ) from aiperf.plot.metric_names import ( + get_aggregated_metrics, get_all_metric_display_names, + get_gpu_metrics, get_metric_display_name, + get_request_metrics, + get_timeslice_metrics, ) from aiperf.plot.plot_controller import ( PlotController, @@ -125,11 +138,13 @@ "DEFAULT_PLOT_HEIGHT", "DEFAULT_PLOT_WIDTH", "DEFAULT_PNG_OUTPUT_DIR", + "DERIVED_METRIC_DIRECTIONS", "DataLoadError", "DataLoader", "DataSource", "DerivedMetricCalculator", "DualAxisHandler", + "ExperimentClassificationConfig", "HistogramHandler", "LIGHT_THEME_COLORS", "MetricSpec", @@ -147,6 +162,7 @@ "NVIDIA_GREEN", "NVIDIA_TEXT_LIGHT", "NVIDIA_WHITE", + "OUTLIER_RED", "PLOT_FONT_FAMILY", "PLOT_LOG_FILE", "PROFILE_EXPORT_AIPERF_JSON", @@ -154,6 +170,7 @@ "PROFILE_EXPORT_JSONL", "PROFILE_EXPORT_TIMESLICES_CSV", "ParetoHandler", + "PlotConfig", "PlotController", "PlotError", "PlotGenerationError", @@ -179,14 +196,21 @@ "auto_select_label_by", "calculate_rolling_percentiles", "calculate_throughput_events", + "detect_directional_outliers", "detect_swept_parameters", "flatten_config", + "get_aggregated_metrics", "get_all_metric_display_names", + "get_gpu_metrics", "get_metric_display_name", "get_nvidia_color_scheme", + "get_request_metrics", + "get_timeslice_metrics", + "logger", "prepare_request_timeseries", "prepare_timeslice_metrics", "run_plot_controller", + "setup_console_only_logging", "setup_plot_logging", "validate_request_uniformity", ] diff --git a/src/aiperf/plot/cli_runner.py b/src/aiperf/plot/cli_runner.py index 75c6060fb..8a7ca4f00 100644 --- a/src/aiperf/plot/cli_runner.py +++ b/src/aiperf/plot/cli_runner.py @@ -4,7 +4,7 @@ from pathlib import Path -from aiperf.plot.constants import PlotMode, PlotTheme +from aiperf.plot.constants import PLOT_LOG_FILE, PlotMode, PlotTheme from aiperf.plot.plot_controller import PlotController @@ -13,6 +13,8 @@ def run_plot_controller( output: str | None = None, mode: PlotMode | str = PlotMode.PNG, theme: PlotTheme | str = PlotTheme.LIGHT, + config: str | None = None, + verbose: bool = False, ) -> None: """Generate plots from AIPerf profiling data. @@ -21,6 +23,8 @@ def run_plot_controller( output: Directory to save generated plots. Defaults to /plots if not specified. mode: Output mode for plots. Defaults to PNG. theme: Plot theme to use (LIGHT or DARK). Defaults to LIGHT. + config: Path to custom plot configuration YAML file. If not specified, uses default config. + verbose: Show detailed error tracebacks in console. 
""" input_paths = paths or ["./artifacts"] input_paths = [Path(p) for p in input_paths] @@ -32,14 +36,19 @@ def run_plot_controller( if isinstance(theme, str): theme = PlotTheme(theme.lower()) + config_path = Path(config) if config else None + controller = PlotController( paths=input_paths, output_dir=output_dir, mode=mode, theme=theme, + config_path=config_path, + verbose=verbose, ) generated_files = controller.run() print(f"\nGenerated {len(generated_files)} plots") print(f"Saved to: {output_dir}") + print(f"Logs: {output_dir / PLOT_LOG_FILE}") diff --git a/src/aiperf/plot/config.py b/src/aiperf/plot/config.py new file mode 100644 index 000000000..5f05584a8 --- /dev/null +++ b/src/aiperf/plot/config.py @@ -0,0 +1,510 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +""" +Plot configuration loader for YAML-based plot definitions. + +Loads plot specifications from YAML files with the following priority: +1. Custom path (if provided via --config flag) +2. User home config (~/.aiperf/plot_config.yaml) - auto-created on first run +3. Default shipped config (src/aiperf/plot/default_plot_config.yaml) +""" + +import logging +import shutil +from pathlib import Path + +from ruamel.yaml import YAML + +from aiperf.plot.constants import ALL_STAT_KEYS +from aiperf.plot.core.plot_specs import ( + DataSource, + ExperimentClassificationConfig, + MetricSpec, + PlotSpec, + PlotType, + Style, + TimeSlicePlotSpec, +) +from aiperf.plot.metric_names import ( + get_aggregated_metrics, + get_gpu_metrics, + get_request_metrics, + get_timeslice_metrics, +) + +logger = logging.getLogger(__name__) + + +def _detect_invalid_stat_pattern(metric_name: str) -> str | None: + """ + Detect if metric name has an invalid stat-like suffix pattern. + + Args: + metric_name: Full metric name + + Returns: + The invalid stat suffix if detected (e.g., "p67"), None otherwise + """ + if "_" not in metric_name: + return None + + _, potential_stat = metric_name.rsplit("_", 1) + + if potential_stat in ["avg", "min", "max", "std"]: + return None + + if ( + potential_stat.startswith("p") + and potential_stat[1:].isdigit() + and potential_stat not in ALL_STAT_KEYS + ): + return potential_stat + + return None + + +def _parse_and_validate_metric_name(metric_name: str) -> tuple[str, str | None]: + """ + Parse and validate metric name format. + + Supports two formats: + 1. {metric_name}_{stat} - e.g., "request_latency_p50" + 2. 
{metric_name} - e.g., "request_number" + + Args: + metric_name: Metric shortcut name + + Returns: + Tuple of (base_metric_name, stat) where stat is None if no suffix + + Raises: + ValueError: If metric name has invalid stat suffix pattern + """ + if "_" not in metric_name: + return (metric_name, None) + + base_name, potential_stat = metric_name.rsplit("_", 1) + + if potential_stat in ALL_STAT_KEYS: + return (base_name, potential_stat) + + invalid_stat = _detect_invalid_stat_pattern(metric_name) + if invalid_stat: + import difflib + + close_matches = difflib.get_close_matches( + invalid_stat, ALL_STAT_KEYS, n=3, cutoff=0.6 + ) + + error_msg = ( + f"Invalid stat suffix '{invalid_stat}' in metric '{metric_name}'.\n\n" + ) + error_msg += "Valid stat suffixes are:\n" + error_msg += f" {', '.join(ALL_STAT_KEYS)}\n" + + if close_matches: + error_msg += "\nDid you mean one of these?\n" + for match in close_matches: + error_msg += f" - {base_name}_{match}\n" + + raise ValueError(error_msg) + + return (metric_name, None) + + +class PlotConfig: + """ + Load and manage plot configuration from YAML. + + Supports loading from multiple sources with priority: + 1. Custom config path (CLI override) + 2. User home config (~/.aiperf/plot_config.yaml) + 3. Default shipped config + + Args: + config_path: Optional custom path to YAML config file + """ + + def __init__(self, config_path: Path | None = None, verbose: bool = False) -> None: + """ + Initialize plot configuration loader. + + Args: + config_path: Optional custom path to YAML config file + verbose: Show detailed error tracebacks in console + """ + self.custom_path = config_path + self.verbose = verbose + self.resolved_path = self._resolve_config_path() + self.config = self._load_yaml() + + def _resolve_config_path(self) -> Path: + """ + Resolve which config file to use based on priority. + + Priority: + 1. Custom path via --config flag (explicit override) + 2. ~/.aiperf/plot_config.yaml (auto-created from default on first run) + 3. System default (fallback only, indicates package issue) + + Console messages: + - Shows "Using config: " when using customized config (Priority 1 or 2) + - Shows creation message when auto-creating config on first run + - Silent when using system defaults + + Returns: + Path to the configuration file to use + + Raises: + FileNotFoundError: If custom path is specified but doesn't exist + """ + # Priority 1: Custom path via CLI + if self.custom_path: + if not self.custom_path.exists(): + raise FileNotFoundError( + f"Configuration file not found: {self.custom_path}" + ) + print(f"Using config: {self.custom_path}") + return self.custom_path + + # Priority 2: User home config (auto-create if missing) + user_config = Path.home() / ".aiperf" / "plot_config.yaml" + if not user_config.exists(): + default_config = Path(__file__).parent / "default_plot_config.yaml" + if not default_config.exists(): + raise FileNotFoundError( + f"Default plot config not found at {default_config}. " + "This indicates a package installation issue." + ) + + user_config.parent.mkdir(parents=True, exist_ok=True) + shutil.copy(default_config, user_config) + + print(f"\nCreated plot configuration: {user_config}") + print( + " Edit this file to customize plots (changes take effect on next run)\n" + ) + else: + print(f"Using config: {user_config}") + + return user_config + + def _load_yaml(self) -> dict: + """ + Load and parse YAML configuration file. 
+ + Returns: + Dictionary containing the parsed YAML configuration + + Raises: + FileNotFoundError: If config file doesn't exist + ValueError: If YAML is invalid or malformed + """ + if not self.resolved_path.exists(): + raise FileNotFoundError( + f"Configuration file not found: {self.resolved_path}" + ) + + try: + yaml = YAML(typ="safe") + with open(self.resolved_path, encoding="utf-8") as f: + config = yaml.load(f) + + if not isinstance(config, dict): + raise ValueError( + f"Invalid YAML config: expected dictionary, got {type(config).__name__}" + ) + + if "visualization" not in config: + raise ValueError( + "Invalid YAML config: missing 'visualization' top-level key" + ) + + return config + + except Exception as e: + raise ValueError( + f"Failed to load YAML config from {self.resolved_path}: {e}" + ) from e + + def get_multi_run_plot_specs(self) -> list[PlotSpec]: + """ + Get plot specifications for multi-run comparison plots. + + Returns: + List of PlotSpec objects for multi-run visualizations + + Raises: + ValueError: If multi_run section is missing or invalid + """ + viz_config = self.config.get("visualization", {}) + + defaults = viz_config.get("multi_run_defaults", []) + if not isinstance(defaults, list): + raise ValueError( + f"Invalid config: 'visualization.multi_run_defaults' must be a list, " + f"got {type(defaults).__name__}" + ) + + presets = viz_config.get("multi_run_plots", {}) + if not isinstance(presets, dict): + raise ValueError( + f"Invalid config: 'visualization.multi_run_plots' must be a dict, " + f"got {type(presets).__name__}" + ) + + plot_specs = [] + for plot_name in defaults: + try: + if plot_name not in presets: + raise ValueError( + f"Plot '{plot_name}' listed in multi_run_defaults but not found in multi_run_plots" + ) + + preset = presets[plot_name] + plot_spec = self._preset_to_plot_spec(plot_name, preset) + plot_specs.append(plot_spec) + except Exception as e: + error_context = ( + f"Failed to parse multi_run plot preset '{plot_name}'\n" + f"Config file: {self.resolved_path}\n" + f"Preset: {preset if plot_name in presets else ''}\n" + f"Error: {e}" + ) + logger.error(error_context, exc_info=True) + + raise ValueError( + f"Config validation failed for multi_run plot '{plot_name}'. " + f"Check the configuration file at {self.resolved_path}" + ) from e + + return plot_specs + + def get_single_run_plot_specs(self) -> list[PlotSpec]: + """ + Get plot specifications for single-run time series plots. 
+ + Returns: + List of PlotSpec objects for single-run visualizations + + Raises: + ValueError: If single_run section is missing or invalid + """ + viz_config = self.config.get("visualization", {}) + + defaults = viz_config.get("single_run_defaults", []) + if not isinstance(defaults, list): + raise ValueError( + f"Invalid config: 'visualization.single_run_defaults' must be a list, " + f"got {type(defaults).__name__}" + ) + + presets = viz_config.get("single_run_plots", {}) + if not isinstance(presets, dict): + raise ValueError( + f"Invalid config: 'visualization.single_run_plots' must be a dict, " + f"got {type(presets).__name__}" + ) + + plot_specs = [] + for plot_name in defaults: + try: + if plot_name not in presets: + raise ValueError( + f"Plot '{plot_name}' listed in single_run_defaults but not found in single_run_plots" + ) + + preset = presets[plot_name] + plot_spec = self._preset_to_plot_spec(plot_name, preset) + plot_specs.append(plot_spec) + except Exception as e: + error_context = ( + f"Failed to parse single_run plot preset '{plot_name}'\n" + f"Config file: {self.resolved_path}\n" + f"Preset: {preset if plot_name in presets else ''}\n" + f"Error: {e}" + ) + logger.error(error_context, exc_info=True) + + raise ValueError( + f"Config validation failed for single_run plot '{plot_name}'. " + f"Check the configuration file at {self.resolved_path}" + ) from e + + return plot_specs + + def get_experiment_classification_config( + self, + ) -> ExperimentClassificationConfig | None: + """ + Get experiment classification configuration for baseline/treatment assignment. + + Returns: + ExperimentClassificationConfig object if section exists, None otherwise + + Raises: + ValueError: If experiment_classification section is invalid + """ + exp_class_config = self.config.get("experiment_classification") + + if exp_class_config is None: + return None + + if not isinstance(exp_class_config, dict): + raise ValueError( + f"Invalid config: 'experiment_classification' must be a dict, " + f"got {type(exp_class_config).__name__}" + ) + + try: + return ExperimentClassificationConfig(**exp_class_config) + except Exception as e: + raise ValueError( + f"Failed to parse experiment_classification config: {e}" + ) from e + + def _preset_to_plot_spec( + self, name: str, preset: dict + ) -> PlotSpec | TimeSlicePlotSpec: + """ + Convert preset dictionary to PlotSpec object. 
+ + Args: + name: Plot name/key from YAML + preset: Preset dictionary with simplified format + + Returns: + PlotSpec or TimeSlicePlotSpec object + + Raises: + ValueError: If preset is invalid + """ + if not isinstance(preset, dict): + raise ValueError( + f"Expected dictionary for preset, got {type(preset).__name__}" + ) + + plot_type_str = preset.get("type") + if not plot_type_str: + raise ValueError(f"Missing 'type' field in preset '{name}'") + plot_type = PlotType(plot_type_str) + + metrics = [] + + x_metric = preset.get("x") + if x_metric: + metrics.append( + self._expand_metric_shortcut(x_metric, "x", preset.get("source")) + ) + + y_metric = preset.get("y") + if y_metric: + y_stat = preset.get("stat") + metrics.append( + self._expand_metric_shortcut( + y_metric, "y", preset.get("source"), y_stat + ) + ) + + y2_metric = preset.get("y2") + if y2_metric: + metrics.append(self._expand_metric_shortcut(y2_metric, "y2", None)) + + if not metrics: + raise ValueError(f"No metrics defined in preset '{name}'") + + exp_class_config = self.get_experiment_classification_config() + if exp_class_config is not None: + # When experiment classification is enabled, ALWAYS use experiment_group + groups = "experiment_group" + logger.info( + f"Classification enabled for plot '{name}': forcing groups={groups}" + ) + else: + # When classification disabled, use explicit YAML setting or default + groups = preset.get("groups") + if groups is None or groups == []: + groups = ["run_name"] + logger.info( + f"Classification disabled for plot '{name}': using groups={groups}" + ) + + spec_kwargs = { + "name": name, + "plot_type": plot_type, + "metrics": metrics, + "title": preset.get("title"), + "filename": f"{name}.png", + "label_by": preset.get("labels"), + "group_by": groups, + } + + if "primary_style" in preset: + spec_kwargs["primary_style"] = Style(**preset["primary_style"]) + if "secondary_style" in preset: + spec_kwargs["secondary_style"] = Style(**preset["secondary_style"]) + if "supplementary_col" in preset: + spec_kwargs["supplementary_col"] = preset["supplementary_col"] + + if "use_slice_duration" in preset: + spec_kwargs["use_slice_duration"] = preset["use_slice_duration"] + return TimeSlicePlotSpec(**spec_kwargs) + + return PlotSpec(**spec_kwargs) + + def _expand_metric_shortcut( + self, + metric_name: str, + axis: str, + source_override: str | None = None, + stat_override: str | None = None, + ) -> MetricSpec: + """ + Expand metric shortcut to full MetricSpec using dynamic pattern matching. + + Supports two formats: + 1. {metric_name}_{stat} - e.g., "time_to_first_token_avg", "request_latency_p90" + 2. 
{metric_name} - e.g., "request_number", "timestamp" + + Args: + metric_name: Metric shortcut name + axis: Axis assignment ("x", "y", "y2") + source_override: Override data source (for timeslice plots) + stat_override: Override stat (for timeslice plots) + + Returns: + MetricSpec object + + Raises: + ValueError: If metric name or stat is not recognized + """ + base_name, stat = _parse_and_validate_metric_name(metric_name) + source: DataSource + + if base_name in get_aggregated_metrics(): + source = DataSource.AGGREGATED + elif base_name in get_request_metrics(): + source = DataSource.REQUESTS + elif base_name in get_timeslice_metrics(): + source = DataSource.TIMESLICES + elif base_name in get_gpu_metrics(): + source = DataSource.GPU_TELEMETRY + else: + all_known = ( + get_aggregated_metrics() + + get_request_metrics() + + get_timeslice_metrics() + + get_gpu_metrics() + ) + raise ValueError( + f"Unknown metric: '{base_name}' (from shortcut '{metric_name}'). " + f"Known metrics: {all_known}" + ) + + if source_override: + source = DataSource(source_override) + if stat_override: + stat = stat_override + + return MetricSpec(name=base_name, source=source, axis=axis, stat=stat) diff --git a/src/aiperf/plot/constants.py b/src/aiperf/plot/constants.py index b9a4dbe4a..b25f98ca2 100644 --- a/src/aiperf/plot/constants.py +++ b/src/aiperf/plot/constants.py @@ -60,6 +60,14 @@ class PlotTheme(CaseInsensitiveStrEnum): NVIDIA_CARD_BG = "#252525" OUTLIER_RED = "#E74C3C" +# Direction indicators for derived metrics (not in MetricRegistry) +# Maps metric name to direction: True = ↑ (higher is better), False = ↓ (lower is better) +DERIVED_METRIC_DIRECTIONS = { + "output_token_throughput_per_gpu": True, + "output_token_throughput_per_user": True, +} + + DARK_THEME_COLORS = { "primary": NVIDIA_GREEN, "secondary": NVIDIA_GOLD, diff --git a/src/aiperf/plot/core/__init__.py b/src/aiperf/plot/core/__init__.py index 6b4d7e750..c44b9b976 100644 --- a/src/aiperf/plot/core/__init__.py +++ b/src/aiperf/plot/core/__init__.py @@ -26,10 +26,12 @@ ) from aiperf.plot.core.plot_generator import ( PlotGenerator, + detect_directional_outliers, get_nvidia_color_scheme, ) from aiperf.plot.core.plot_specs import ( DataSource, + ExperimentClassificationConfig, MetricSpec, PlotSpec, PlotType, @@ -52,6 +54,7 @@ "DataLoader", "DataSource", "DerivedMetricCalculator", + "ExperimentClassificationConfig", "MetricSpec", "ModeDetector", "PlotGenerator", @@ -69,6 +72,7 @@ "auto_select_label_by", "calculate_rolling_percentiles", "calculate_throughput_events", + "detect_directional_outliers", "detect_swept_parameters", "flatten_config", "get_nvidia_color_scheme", diff --git a/src/aiperf/plot/core/data_loader.py b/src/aiperf/plot/core/data_loader.py index 26f01db90..553bef926 100644 --- a/src/aiperf/plot/core/data_loader.py +++ b/src/aiperf/plot/core/data_loader.py @@ -10,6 +10,7 @@ """ import json +from fnmatch import fnmatch from pathlib import Path from typing import Any @@ -26,6 +27,7 @@ PROFILE_EXPORT_JSONL, PROFILE_EXPORT_TIMESLICES_CSV, ) +from aiperf.plot.core.plot_specs import ExperimentClassificationConfig from aiperf.plot.exceptions import DataLoadError @@ -54,6 +56,14 @@ class RunMetadata(AIPerfBaseModel): was_cancelled: bool = Field( default=False, description="Whether the profiling run was cancelled early" ) + experiment_type: str = Field( + default="treatment", + description="Classification of run as 'baseline' or 'treatment' for visualization", + ) + experiment_group: str = Field( + default="", + description="Experiment group 
identifier extracted from run name or path for grouping variants", + ) class RunData(AIPerfBaseModel): @@ -148,8 +158,18 @@ class DataLoader(AIPerfLoggerMixin): and parse them into structured formats for visualization. """ - def __init__(self): + def __init__( + self, + classification_config: ExperimentClassificationConfig | None = None, + ): + """ + Initialize DataLoader. + + Args: + classification_config: Configuration for baseline/treatment classification + """ super().__init__() + self.classification_config = classification_config def load_run(self, run_path: Path, load_per_request_data: bool = True) -> RunData: """ @@ -762,6 +782,76 @@ def parse_line(line: str) -> dict: ) return df + def _classify_experiment_type(self, run_path: Path, run_name: str) -> str: + """ + Classify run as baseline or treatment. + + Priority (highest to lowest): + 1. Pattern matching from plot_config.yaml + 2. Default from plot_config.yaml (or "treatment" if no config) + + Args: + run_path: Path to the run directory + run_name: Name of the run (typically directory name) + + Returns: + "baseline" or "treatment" + """ + if self.classification_config: + for pattern in self.classification_config.baselines: + if fnmatch(run_name, pattern) or fnmatch(str(run_path), pattern): + return "baseline" + + for pattern in self.classification_config.treatments: + if fnmatch(run_name, pattern) or fnmatch(str(run_path), pattern): + return "treatment" + + return self.classification_config.default + + return "treatment" + + def _extract_experiment_group(self, run_path: Path, run_name: str) -> str: + """ + Extract experiment group identifier from run path. + + If experiment classification is configured and the parent directory matches + any baseline or treatment pattern, uses parent directory name. + Otherwise uses run directory name. 
+ + Args: + run_path: Path to the run directory + run_name: Name of the run (typically directory name) + + Returns: + Experiment group identifier for grouping runs + """ + # Try parent-based grouping if classification config exists + if self.classification_config: + parent = run_path.parent + if parent and parent.name: + parent_name = parent.name + + # Check if parent matches any baseline pattern + for pattern in self.classification_config.baselines: + if fnmatch(parent_name, pattern): + return parent_name + + # Check if parent matches any treatment pattern + for pattern in self.classification_config.treatments: + if fnmatch(parent_name, pattern): + return parent_name + + # Fallback: use run_name + result = run_name if run_name else str(run_path.name) + + if not result: + self.warning( + f"Could not extract experiment_group from {run_path}, using full path" + ) + result = str(run_path) + + return result + def _extract_metadata( self, run_path: Path, @@ -826,6 +916,10 @@ def _extract_metadata( else: duration_seconds = duration / 1e9 + experiment_type = self._classify_experiment_type(run_path, run_name) + + experiment_group = self._extract_experiment_group(run_path, run_name) + return RunMetadata( run_name=run_name, run_path=run_path, @@ -837,4 +931,6 @@ def _extract_metadata( start_time=start_time, end_time=end_time, was_cancelled=was_cancelled, + experiment_type=experiment_type, + experiment_group=experiment_group, ) diff --git a/src/aiperf/plot/core/data_preparation.py b/src/aiperf/plot/core/data_preparation.py index 3b34c2d32..e26d1bfb9 100644 --- a/src/aiperf/plot/core/data_preparation.py +++ b/src/aiperf/plot/core/data_preparation.py @@ -290,9 +290,7 @@ def prepare_timeslice_metrics( plot_dfs.append(stat_df) if not plot_dfs: - raise DataLoadError( - f"No timeslice data for {metric_name} ({', '.join(stats)})" - ) + raise DataLoadError(f"No timeslice data for {metric_name} ({', '.join(stats)})") plot_df = plot_dfs[0] for df in plot_dfs[1:]: diff --git a/src/aiperf/plot/core/plot_generator.py b/src/aiperf/plot/core/plot_generator.py index 992592843..23820367b 100644 --- a/src/aiperf/plot/core/plot_generator.py +++ b/src/aiperf/plot/core/plot_generator.py @@ -9,16 +9,23 @@ plots, line charts, and time series. """ +import logging + import matplotlib.colors as mcolors import numpy as np import pandas as pd import plotly.graph_objects as go import seaborn as sns +from aiperf.common.enums import PlotMetricDirection +from aiperf.common.enums.metric_enums import MetricFlags +from aiperf.metrics.metric_registry import MetricRegistry from aiperf.plot.constants import ( DARK_THEME_COLORS, + DERIVED_METRIC_DIRECTIONS, LIGHT_THEME_COLORS, NVIDIA_GOLD, + NVIDIA_GRAY, NVIDIA_GREEN, OUTLIER_RED, PLOT_FONT_FAMILY, @@ -134,6 +141,7 @@ def __init__(self, theme: PlotTheme = PlotTheme.LIGHT, color_pool_size: int = 10 self._group_color_registry: dict[str, str] = {} self._color_pool: list[str] = self._generate_color_pool(color_pool_size) self._next_color_index: int = 0 + self._shown_warnings: set[str] = set() def _generate_color_pool(self, pool_size: int) -> list[str]: """Generate master color pool for consistent group coloring. 
@@ -261,40 +269,322 @@ def _get_base_layout( return layout def _prepare_groups( - self, df: pd.DataFrame, group_by: str | None - ) -> tuple[list[str | None], dict[str, str]]: + self, + df: pd.DataFrame, + group_by: str | None, + experiment_types: dict[str, str] | None = None, + group_display_names: dict[str, str] | None = None, + ) -> tuple[list[str | None], dict[str, str], dict[str, str]]: """ Prepare group list and color mapping for multi-series plots. - Uses a persistent color registry to ensure the same group always gets - the same color across all plots in a session. New groups are assigned - colors sequentially from a pre-generated color pool. + Supports two modes: + 1. Experiment groups coloring: When experiment_types provided, uses NVIDIA brand colors + (baselines=grey, treatments=green) with custom legend ordering. + 2. Other coloring: Uses distinct seaborn colors for each group. Args: df: DataFrame containing the data - group_by: Column name to group by (e.g., "model", "concurrency"), or - None for no grouping + group_by: Column name to group by (e.g., "model", "concurrency"), list of column names, + or None for no grouping + experiment_types: Optional mapping of group_name -> "baseline"|"treatment" + If provided, uses default NVIDIA colors. If None, use seaborn colors. + group_display_names: Optional mapping of group_name -> display_name for legends Returns: - Tuple of (groups, group_colors) where: - - groups: Sorted list of group values, or [None] if no grouping + Tuple of (groups, group_colors, group_display_names) where: + - groups: Sorted list of group values (baselines first, then treatments), + or [None] if no grouping - group_colors: Dict mapping group values to color hex codes + - group_display_names: Dict mapping group values to display names (or empty dict) """ - if group_by and group_by in df.columns: - groups = sorted(df[group_by].unique()) + logger = logging.getLogger(__name__) - for group in groups: - if group not in self._group_color_registry: - color_index = self._next_color_index % len(self._color_pool) - self._group_color_registry[group] = self._color_pool[color_index] - self._next_color_index += 1 + if not group_by or group_by not in df.columns: + logger.info(f"No grouping applied (group_by={group_by})") + return [None], {}, {} + + groups = sorted(df[group_by].unique()) + logger.info( + f"Preparing groups with group_by='{group_by}': found {len(groups)} unique values: {groups}" + ) + + # Experiment groups coloring: Use grey for baselines, green for first treatment, and distinct seaborn colors for remaining treatments + if experiment_types: + baselines = [g for g in groups if experiment_types.get(g) == "baseline"] + treatments = [g for g in groups if experiment_types.get(g) == "treatment"] + + baselines = sorted(baselines) + treatments = sorted(treatments) + + ordered_groups = baselines + treatments - group_colors = {group: self._group_color_registry[group] for group in groups} - else: - groups = [None] group_colors = {} - return groups, group_colors + for group in baselines: + group_colors[group] = NVIDIA_GRAY + + if len(treatments) > 0: + group_colors[treatments[0]] = NVIDIA_GREEN + + if len(treatments) > 1: + seaborn_colors = sns.color_palette( + "bright", n_colors=len(treatments) - 1 + ).as_hex() + for i, group in enumerate(treatments[1:]): + group_colors[group] = seaborn_colors[i] + + logger.info( + f"Applied semantic coloring: {len(baselines)} baselines, {len(treatments)} treatments" + ) + logger.info(f" Baselines: {baselines}") + logger.info(f" 
Treatments: {treatments}") + logger.info(f" Color assignments: {group_colors}") + + self._validate_line_count(len(ordered_groups)) + + display_names = group_display_names or {} + + return ordered_groups, group_colors, display_names + + # Other coloring: Use distinct seaborn colors for each group + for group in groups: + if group not in self._group_color_registry: + color_index = self._next_color_index % len(self._color_pool) + self._group_color_registry[group] = self._color_pool[color_index] + self._next_color_index += 1 + + group_colors = {group: self._group_color_registry[group] for group in groups} + return groups, group_colors, {} + + def _validate_line_count(self, n_traces: int) -> None: + """Warn if more than 4 lines/traces in a single plot (once per session).""" + if n_traces > 4: + warning_key = f"too_many_traces_{n_traces}" + if warning_key not in self._shown_warnings: + self._shown_warnings.add(warning_key) + logger = logging.getLogger(__name__) + logger.warning( + f"Plot contains {n_traces} traces, which exceeds the recommended " + f"maximum of 4 for clarity." + ) + + def _get_metric_direction(self, metric_tag: str) -> PlotMetricDirection | str: + """ + Get direction indicator for metric. + + Checks MetricRegistry first, then falls back to derived metrics registry. + + Args: + metric_tag: Metric tag name (e.g., "request_latency", "output_token_throughput_per_gpu") + + Returns: + PlotMetricDirection.HIGHER if higher is better (LARGER_IS_BETTER or derived metric marked as True) + PlotMetricDirection.LOWER if lower is better (not LARGER_IS_BETTER or derived metric marked as False) + "" if metric not found in either registry + """ + try: + metric_class = MetricRegistry.get_class(metric_tag) + if metric_class.has_flags(MetricFlags.LARGER_IS_BETTER): + return PlotMetricDirection.HIGHER + return PlotMetricDirection.LOWER + except Exception: + pass + + if metric_tag in DERIVED_METRIC_DIRECTIONS: + return ( + PlotMetricDirection.HIGHER + if DERIVED_METRIC_DIRECTIONS[metric_tag] + else PlotMetricDirection.LOWER + ) + + logger = logging.getLogger(__name__) + logger.debug(f"Could not determine direction for metric: {metric_tag}") + return "" + + def _compute_pareto_frontier( + self, + x_values: np.ndarray, + y_values: np.ndarray, + x_direction: PlotMetricDirection, + y_direction: PlotMetricDirection, + ) -> np.ndarray: + """ + Compute Pareto frontier using O(n log n) sweep algorithm. + + The algorithm leverages the fact that after sorting by x-coordinate, we can + scan once (left-to-right or right-to-left depending on metric directions) + and track the best y-value seen so far to determine Pareto optimality. 
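To make that sweep concrete, a minimal standalone sketch for one direction combination (lower x is better, higher y is better); the data points here are made up:

```python
# Minimal sketch of the left-to-right sweep (x: lower is better, y: higher is better).
# The points are made up; x must already be sorted ascending.
import numpy as np

x = np.array([10.0, 20.0, 30.0, 40.0])  # e.g., request latency
y = np.array([5.0, 4.0, 9.0, 9.0])      # e.g., throughput per GPU

is_pareto = np.zeros(len(x), dtype=bool)
best_y = float("-inf")
for i in range(len(x)):          # ascending x: cheaper points first
    if y[i] >= best_y:           # keep points that match or beat the best y seen so far
        is_pareto[i] = True
        best_y = y[i]

print(is_pareto.tolist())  # [True, False, True, True]
```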
+ + Args: + x_values: X-axis metric values (must already be sorted ascending) + y_values: Y-axis metric values (corresponding to x_values) + x_direction: Whether higher or lower x is better + y_direction: Whether higher or lower y is better + + Returns: + Boolean array where True indicates point is on Pareto frontier + """ + n = len(x_values) + + if n == 0: + return np.array([], dtype=bool) + if n == 1: + return np.array([True], dtype=bool) + + is_pareto = np.zeros(n, dtype=bool) + + if x_direction == PlotMetricDirection.LOWER: + if y_direction == PlotMetricDirection.HIGHER: + best_y = float("-inf") + for i in range(n): + if y_values[i] >= best_y: + is_pareto[i] = True + best_y = y_values[i] + else: + best_y = float("inf") + for i in range(n): + if y_values[i] <= best_y: + is_pareto[i] = True + best_y = y_values[i] + else: + if y_direction == PlotMetricDirection.HIGHER: + best_y = float("-inf") + for i in range(n - 1, -1, -1): + if y_values[i] >= best_y: + is_pareto[i] = True + best_y = y_values[i] + else: + best_y = float("inf") + for i in range(n - 1, -1, -1): + if y_values[i] <= best_y: + is_pareto[i] = True + best_y = y_values[i] + + return is_pareto + + def _direction_to_arrow(self, direction: PlotMetricDirection | str) -> str: + """ + Convert a PlotMetricDirection to its unicode arrow representation. + + Args: + direction: PlotMetricDirection enum value or empty string + + Returns: + "↑" (U+2191) if direction is HIGHER + "↓" (U+2193) if direction is LOWER + "" if direction is empty string + """ + if direction == PlotMetricDirection.HIGHER: + return "\u2191" + elif direction == PlotMetricDirection.LOWER: + return "\u2193" + return "" + + def _generate_optimal_direction_subtitle(self, x_metric: str, y_metric: str) -> str: + """ + Generate subtitle describing optimal direction for 2D plot. + + Args: + x_metric: X-axis metric tag + y_metric: Y-axis metric tag + + Returns: + Explanatory subtitle or empty string if directions unknown + """ + x_dir = self._get_metric_direction(x_metric) + y_dir = self._get_metric_direction(y_metric) + + if not x_dir or not y_dir: + return "" + + # Determine quadrant name + if x_dir == PlotMetricDirection.LOWER and y_dir == PlotMetricDirection.HIGHER: + quadrant = "upper-left" + elif ( + x_dir == PlotMetricDirection.HIGHER and y_dir == PlotMetricDirection.HIGHER + ): + quadrant = "upper-right" + elif x_dir == PlotMetricDirection.LOWER and y_dir == PlotMetricDirection.LOWER: + quadrant = "lower-left" + else: # x_dir == PlotMetricDirection.HIGHER and y_dir == PlotMetricDirection.LOWER + quadrant = "lower-right" + + x_name = get_metric_display_name(x_metric) + y_name = get_metric_display_name(y_metric) + x_word = "low" if x_dir == PlotMetricDirection.LOWER else "high" + y_word = "high" if y_dir == PlotMetricDirection.HIGHER else "low" + + # Convert directions to arrows for display + x_arrow = self._direction_to_arrow(x_dir) + y_arrow = self._direction_to_arrow(y_dir) + + return f"Optimal: {quadrant} quadrant ({y_word} {y_name} {y_arrow}, {x_word} {x_name} {x_arrow})" + + def _add_optimal_quadrant_shading( + self, + fig: go.Figure, + x_metric: str, + y_metric: str, + x_data: list[float], + y_data: list[float], + ) -> None: + """ + Add semi-transparent shading to optimal quadrant of 2D plot. 
+ + Args: + fig: Plotly figure to modify + x_metric: X-axis metric tag + y_metric: Y-axis metric tag + x_data: List of x-axis values + y_data: List of y-axis values + """ + x_dir = self._get_metric_direction(x_metric) + y_dir = self._get_metric_direction(y_metric) + + if not x_dir or not y_dir or not x_data or not y_data: + return + + x_lower_is_better = x_dir == PlotMetricDirection.LOWER + y_higher_is_better = y_dir == PlotMetricDirection.HIGHER + + # Find optimal corner point + optimal_x = min(x_data) if x_lower_is_better else max(x_data) + optimal_y = max(y_data) if y_higher_is_better else min(y_data) + + # Calculate rectangle bounds + x_min, x_max = min(x_data), max(x_data) + y_min, y_max = min(y_data), max(y_data) + + rect_x0 = x_min if x_lower_is_better else optimal_x + rect_x1 = optimal_x if x_lower_is_better else x_max + rect_y0 = optimal_y if y_higher_is_better else y_min + rect_y1 = y_max if y_higher_is_better else optimal_y + + # Add semi-transparent green overlay + fig.add_shape( + type="rect", + x0=rect_x0, + x1=rect_x1, + y0=rect_y0, + y1=rect_y1, + fillcolor="rgba(118, 185, 0, 0.08)", # Very light NVIDIA green + line_width=0, + layer="below", + ) + + # Add star annotation at optimal corner + fig.add_annotation( + x=optimal_x, + y=optimal_y, + text="\u2605 Optimal", # ★ + showarrow=False, + font=dict(size=14, color=NVIDIA_GREEN), + xanchor="right" if x_lower_is_better else "left", + yanchor="bottom" if y_higher_is_better else "top", + xshift=-10 if x_lower_is_better else 10, + yshift=10 if y_higher_is_better else -10, + ) def create_pareto_plot( self, @@ -306,6 +596,8 @@ def create_pareto_plot( title: str | None = None, x_label: str | None = None, y_label: str | None = None, + experiment_types: dict[str, str] | None = None, + group_display_names: dict[str, str] | None = None, ) -> go.Figure: """Create a Pareto curve plot showing trade-offs between two metrics. @@ -336,7 +628,26 @@ def create_pareto_plot( x_label = x_label or get_metric_display_name(x_metric) y_label = y_label or get_metric_display_name(y_metric) - groups, group_colors = self._prepare_groups(df_sorted, group_by) + # Add direction indicators to axis labels + x_direction = self._get_metric_direction(x_metric) + y_direction = self._get_metric_direction(y_metric) + if x_direction: + x_label = f"{x_label} {self._direction_to_arrow(x_direction)}" + if y_direction: + y_label = f"{y_label} {self._direction_to_arrow(y_direction)}" + + # Generate subtitle with optimal direction hint + subtitle = self._generate_optimal_direction_subtitle(x_metric, y_metric) + if subtitle: + title = f"{title}
{subtitle}" + + groups, group_colors, display_names = self._prepare_groups( + df_sorted, group_by, experiment_types, group_display_names + ) + + # Collect all data points for optimal quadrant shading + all_x_data = [] + all_y_data = [] for group in groups: if group is None: @@ -344,15 +655,38 @@ def create_pareto_plot( group_color = self._get_palette_colors(1)[0] group_name = "Data" else: - group_data = df_sorted[df_sorted[group_by] == group].sort_values( - x_metric - ) + # df_sorted is already sorted by x_metric, filtering preserves order + group_data = df_sorted[df_sorted[group_by] == group] group_color = group_colors[group] - group_name = group + # Use display name if available, otherwise use group ID + group_name = display_names.get(group, group) + + # Collect data for optimal quadrant shading + all_x_data.extend(group_data[x_metric].tolist()) + all_y_data.extend(group_data[y_metric].tolist()) + + # Calculate Pareto frontier for this group based on metric directions + x_dir = self._get_metric_direction(x_metric) + y_dir = self._get_metric_direction(y_metric) + + if not x_dir or not y_dir: + missing = [] + if not x_dir: + missing.append(f"x-axis metric '{x_metric}'") + if not y_dir: + missing.append(f"y-axis metric '{y_metric}'") + + raise ValueError( + f"Cannot determine optimization direction for {' and '.join(missing)}. " + f"Metrics must be registered in MetricRegistry with LARGER_IS_BETTER flag " + f"or defined in DERIVED_METRIC_DIRECTIONS. Add the metric(s) to ensure " + f"correct Pareto frontier calculation." + ) + + x_values = group_data[x_metric].values + y_values = group_data[y_metric].values + is_pareto = self._compute_pareto_frontier(x_values, y_values, x_dir, y_dir) - # Calculate Pareto frontier for this group using vectorized operations - max_y_cumulative = group_data[y_metric].cummax() - is_pareto = group_data[y_metric] == max_y_cumulative df_pareto = group_data[is_pareto].copy() if not df_pareto.empty: @@ -438,6 +772,11 @@ def create_pareto_plot( layout = self._get_base_layout(title, x_label, y_label) fig.update_layout(layout) + # Add optimal quadrant shading + self._add_optimal_quadrant_shading( + fig, x_metric, y_metric, all_x_data, all_y_data + ) + return fig def create_scatter_line_plot( @@ -450,6 +789,8 @@ def create_scatter_line_plot( title: str | None = None, x_label: str | None = None, y_label: str | None = None, + experiment_types: dict[str, str] | None = None, + group_display_names: dict[str, str] | None = None, ) -> go.Figure: """Create a scatter plot with connecting lines. @@ -477,8 +818,27 @@ def create_scatter_line_plot( x_label = x_label or get_metric_display_name(x_metric) y_label = y_label or get_metric_display_name(y_metric) + # Add direction indicators to axis labels + x_direction = self._get_metric_direction(x_metric) + y_direction = self._get_metric_direction(y_metric) + if x_direction: + x_label = f"{x_label} {self._direction_to_arrow(x_direction)}" + if y_direction: + y_label = f"{y_label} {self._direction_to_arrow(y_direction)}" + + # Generate subtitle with optimal direction hint + subtitle = self._generate_optimal_direction_subtitle(x_metric, y_metric) + if subtitle: + title = f"{title}
{subtitle}" + # Prepare groups and colors - groups, group_colors = self._prepare_groups(df_sorted, group_by) + groups, group_colors, display_names = self._prepare_groups( + df_sorted, group_by, experiment_types, group_display_names + ) + + # Collect all data points for optimal quadrant shading + all_x_data = [] + all_y_data = [] for group in groups: if group is None: @@ -486,11 +846,15 @@ def create_scatter_line_plot( group_color = self._get_palette_colors(1)[0] group_name = "Data" else: - group_data = df_sorted[df_sorted[group_by] == group].sort_values( - x_metric - ) + # df_sorted is already sorted by x_metric, filtering preserves order + group_data = df_sorted[df_sorted[group_by] == group] group_color = group_colors[group] - group_name = group + # Use display name if available, otherwise use group ID + group_name = display_names.get(group, group) + + # Collect data for optimal quadrant shading + all_x_data.extend(group_data[x_metric].tolist()) + all_y_data.extend(group_data[y_metric].tolist()) # Shadow layer fig.add_trace( @@ -540,6 +904,11 @@ def create_scatter_line_plot( layout = self._get_base_layout(title, x_label, y_label) fig.update_layout(layout) + # Add optimal quadrant shading + self._add_optimal_quadrant_shading( + fig, x_metric, y_metric, all_x_data, all_y_data + ) + return fig def create_time_series_scatter( @@ -1105,8 +1474,14 @@ def create_timeslice_scatter( # Configure custom ticks with range labels at center positions layout["xaxis"]["tickmode"] = "array" - tick_positions = [i * slice_duration + slice_duration / 2 for i in range(int(max_slice) + 1)] - tick_labels = [f"{int(i * slice_duration)}-{int((i + 1) * slice_duration)}" for i in range(int(max_slice) + 1)] + tick_positions = [ + i * slice_duration + slice_duration / 2 + for i in range(int(max_slice) + 1) + ] + tick_labels = [ + f"{int(i * slice_duration)}-{int((i + 1) * slice_duration)}" + for i in range(int(max_slice) + 1) + ] layout["xaxis"]["tickvals"] = tick_positions layout["xaxis"]["ticktext"] = tick_labels layout["xaxis"]["tickangle"] = -45 diff --git a/src/aiperf/plot/core/plot_specs.py b/src/aiperf/plot/core/plot_specs.py index 201097839..b85404de9 100644 --- a/src/aiperf/plot/core/plot_specs.py +++ b/src/aiperf/plot/core/plot_specs.py @@ -6,8 +6,9 @@ from enum import Enum from typing import Literal -from pydantic import Field +from pydantic import Field, field_validator +from aiperf.common.config import BaseConfig from aiperf.common.models import AIPerfBaseModel @@ -44,6 +45,34 @@ class Style(AIPerfBaseModel): ) +class ExperimentClassificationConfig(BaseConfig): + """Configuration for classifying runs as baseline or treatment.""" + + baselines: list[str] = Field( + default_factory=list, + description="List of glob patterns to match baseline runs (e.g., '*_agg_*', '*baseline*')", + ) + treatments: list[str] = Field( + default_factory=list, + description="List of glob patterns to match treatment runs (e.g., '*_disagg_*', '*kvrouter*')", + ) + default: Literal["baseline", "treatment"] = Field( + default="treatment", + description="Default classification when no patterns match", + ) + group_extraction_pattern: str | None = Field( + default=r"^(baseline|treatment_\d+)", + description="Regex pattern to extract experiment group from run name or parent directory names. " + "First capture group is used. Example: '^(baseline|treatment_\\d+)' extracts 'treatment_1' " + "from 'treatment_1_large_input_small_output'. 
Used for grouping treatment variants.", + ) + group_display_names: dict[str, str] | None = Field( + default=None, + description="Optional mapping of experiment group IDs to human-readable display names for legends. " + "Example: {'baseline': 'Baseline', 'treatment_1': 'Large Input Small Output'}", + ) + + class DataSource(Enum): """Data sources for plot metrics.""" @@ -59,6 +88,7 @@ class PlotType(Enum): SCATTER = "scatter" AREA = "area" HISTOGRAM = "histogram" + TIMESLICE = "timeslice" PARETO = "pareto" SCATTER_LINE = "scatter_line" DUAL_AXIS = "dual_axis" @@ -94,12 +124,50 @@ class PlotSpec(AIPerfBaseModel): ) label_by: str | None = Field( default=None, - description="Column to use for labeling points (for multi-series plots)", + description="Column to use for labeling points (single column only). " + "Must be provided as a single-element list in YAML (e.g., [concurrency]).", ) group_by: str | None = Field( default=None, - description="Column to use for grouping data (for multi-series plots)", + description="Column to use for grouping data into separate series (single column only). " + "Must be provided as a single-element list in YAML (e.g., [model]). " + "Note: When experiment_classification is enabled, this is auto-overridden to 'experiment_group'.", ) + + @field_validator("label_by", "group_by", mode="before") + @classmethod + def _normalize_list_to_string(cls, v: str | list[str] | None) -> str | None: + """Convert single-element list to string. + + Args: + v: Single-element list, string, or None + + Returns: + String value or None + + Raises: + ValueError: If v is not a single-element list, string, or None + """ + if v is None: + return None + + if isinstance(v, str): + return v + + if isinstance(v, list): + if len(v) == 0: + return None + if len(v) == 1: + return v[0] + raise ValueError( + f"Multi-column grouping is not supported. 
" + f"Provide a single column as a string or single-element list, got: {v}" + ) + + raise ValueError( + f"label_by and group_by must be a string or list, got {type(v).__name__}" + ) + primary_style: Style | None = Field( default=None, description="Style configuration for primary (y) axis trace", @@ -122,217 +190,3 @@ class TimeSlicePlotSpec(PlotSpec): description="Whether to pass slice_duration to the plot generator " "for proper time-based x-axis formatting", ) - - -# Single-run plot specifications -SINGLE_RUN_PLOT_SPECS: list[PlotSpec] = [ - PlotSpec( - name="ttft_over_time", - plot_type=PlotType.SCATTER, - metrics=[ - MetricSpec(name="request_number", source=DataSource.REQUESTS, axis="x"), - MetricSpec( - name="time_to_first_token", source=DataSource.REQUESTS, axis="y" - ), - ], - title="TTFT Per Request Over Time", - filename="ttft_over_time.png", - ), - PlotSpec( - name="itl_over_time", - plot_type=PlotType.SCATTER, - metrics=[ - MetricSpec(name="request_number", source=DataSource.REQUESTS, axis="x"), - MetricSpec( - name="inter_token_latency", source=DataSource.REQUESTS, axis="y" - ), - ], - title="Inter-Token Latency Per Request Over Time", - filename="itl_over_time.png", - ), - PlotSpec( - name="latency_over_time", - plot_type=PlotType.SCATTER_WITH_PERCENTILES, - metrics=[ - MetricSpec(name="timestamp", source=DataSource.REQUESTS, axis="x"), - MetricSpec(name="request_latency", source=DataSource.REQUESTS, axis="y"), - ], - title="Request Latency Over Time with Percentiles", - filename="latency_over_time.png", - ), - PlotSpec( - name="dispersed_throughput_over_time", - plot_type=PlotType.AREA, - metrics=[ - MetricSpec(name="timestamp_s", source=DataSource.REQUESTS, axis="x"), - MetricSpec( - name="throughput_tokens_per_sec", source=DataSource.REQUESTS, axis="y" - ), - ], - title="Dispersed Output Token Throughput Over Time", - filename="dispersed_throughput_over_time.png", - ), -] - - -# Timeslice plot specifications -TIMESLICE_PLOT_SPECS: list[TimeSlicePlotSpec] = [ - TimeSlicePlotSpec( - name="timeslices_ttft", - plot_type=PlotType.HISTOGRAM, - metrics=[ - MetricSpec(name="Timeslice", source=DataSource.TIMESLICES, axis="x"), - MetricSpec( - name="Time to First Token", - source=DataSource.TIMESLICES, - axis="y", - stat="avg", - ), - ], - title="Average Time to First Token Across Time Slices", - filename="timeslices_ttft.png", - use_slice_duration=True, - ), - TimeSlicePlotSpec( - name="timeslices_itl", - plot_type=PlotType.HISTOGRAM, - metrics=[ - MetricSpec(name="Timeslice", source=DataSource.TIMESLICES, axis="x"), - MetricSpec( - name="Inter Token Latency", - source=DataSource.TIMESLICES, - axis="y", - stat="avg", - ), - ], - title="Average Inter Token Latency Across Time Slices", - filename="timeslices_itl.png", - use_slice_duration=True, - ), - TimeSlicePlotSpec( - name="timeslices_throughput", - plot_type=PlotType.HISTOGRAM, - metrics=[ - MetricSpec(name="Timeslice", source=DataSource.TIMESLICES, axis="x"), - MetricSpec( - name="Request Throughput", - source=DataSource.TIMESLICES, - axis="y", - stat="avg", - ), - ], - title="Average Request Throughput Across Time Slices", - filename="timeslices_throughput.png", - use_slice_duration=True, - ), - TimeSlicePlotSpec( - name="timeslices_latency", - plot_type=PlotType.HISTOGRAM, - metrics=[ - MetricSpec(name="Timeslice", source=DataSource.TIMESLICES, axis="x"), - MetricSpec( - name="Request Latency", - source=DataSource.TIMESLICES, - axis="y", - stat="avg", - ), - ], - title="Average Request Latency Across Time Slices", - 
filename="timeslices_latency.png", - use_slice_duration=True, - ), -] - - -# GPU plot specifications -GPU_PLOT_SPECS: list[PlotSpec] = [ - PlotSpec( - name="gpu_utilization_and_throughput_over_time", - plot_type=PlotType.DUAL_AXIS, - metrics=[ - MetricSpec(name="timestamp_s", source=DataSource.REQUESTS, axis="x"), - MetricSpec( - name="throughput_tokens_per_sec", source=DataSource.REQUESTS, axis="y" - ), - MetricSpec( - name="gpu_utilization", source=DataSource.GPU_TELEMETRY, axis="y2" - ), - ], - title="Output Token Throughput with GPU Utilization", - filename="gpu_utilization_and_throughput_over_time.png", - primary_style=Style(mode="lines", line_shape="hv", fill=None), - secondary_style=Style(mode="lines", line_shape=None, fill="tozeroy"), - supplementary_col="active_requests", - ), -] - - -# Multi-run comparison plot specifications -MULTI_RUN_PLOT_SPECS: list[PlotSpec] = [ - PlotSpec( - name="pareto_curve_throughput_per_gpu_vs_latency", - plot_type=PlotType.PARETO, - metrics=[ - MetricSpec( - name="request_latency", - source=DataSource.AGGREGATED, - axis="x", - stat="avg", - ), - MetricSpec( - name="output_token_throughput_per_gpu", - source=DataSource.AGGREGATED, - axis="y", - stat="avg", - ), - ], - title="Pareto Curve: Token Throughput per GPU vs Latency", - filename="pareto_curve_throughput_per_gpu_vs_latency.png", - label_by="concurrency", - group_by="model", - ), - PlotSpec( - name="ttft_vs_throughput", - plot_type=PlotType.SCATTER_LINE, - metrics=[ - MetricSpec( - name="time_to_first_token", - source=DataSource.AGGREGATED, - axis="x", - stat="p50", - ), - MetricSpec( - name="request_throughput", - source=DataSource.AGGREGATED, - axis="y", - stat="avg", - ), - ], - title="TTFT vs Throughput", - filename="ttft_vs_throughput.png", - label_by="concurrency", - group_by="model", - ), - PlotSpec( - name="pareto_curve_throughput_per_gpu_vs_interactivity", - plot_type=PlotType.SCATTER_LINE, - metrics=[ - MetricSpec( - name="output_token_throughput_per_gpu", - source=DataSource.AGGREGATED, - axis="x", - stat="avg", - ), - MetricSpec( - name="output_token_throughput_per_user", - source=DataSource.AGGREGATED, - axis="y", - stat="avg", - ), - ], - title="Pareto Curve: Token Throughput per GPU vs Interactivity", - filename="pareto_curve_throughput_per_gpu_vs_interactivity.png", - label_by="concurrency", - group_by="model", - ), -] diff --git a/src/aiperf/plot/default_plot_config.yaml b/src/aiperf/plot/default_plot_config.yaml new file mode 100644 index 000000000..b0d05b816 --- /dev/null +++ b/src/aiperf/plot/default_plot_config.yaml @@ -0,0 +1,184 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +################################################################################ +# AIPerf Plot Configuration +################################################################################ +# +# This file defines which plots are generated by 'aiperf plot'. 
+# +# CUSTOMIZATION: +# • First run: Auto-creates ~/.aiperf/plot_config.yaml +# • Edit ~/.aiperf/plot_config.yaml to customize +# • Changes take effect on next run +# +# QUICK START: +# • Enable/disable plots: Edit the *_defaults lists below +# • Customize plots: Modify preset definitions +# • Add new plots: Add to presets, then reference in defaults +# +################################################################################ + +visualization: + # ============================================================================= + # MULTI-RUN COMPARISON: Default plots when comparing multiple runs + # ============================================================================= + multi_run_defaults: + - pareto_curve_throughput_per_gpu_vs_latency + - pareto_curve_throughput_per_gpu_vs_interactivity + - ttft_vs_throughput + + # ============================================================================= + # SINGLE-RUN ANALYSIS: Default plots for analyzing one run over time + # ============================================================================= + single_run_defaults: + - ttft_over_time + - itl_over_time + - latency_over_time + - dispersed_throughput_over_time + - timeslices_ttft + - timeslices_itl + - timeslices_throughput + - timeslices_latency + - gpu_utilization_and_throughput_over_time + + # ============================================================================= + # MULTI-RUN COMPARISON PRESETS + # ============================================================================= + multi_run_plots: + # CONFIGURATION GUIDE: + # groups: Controls how runs are grouped into lines/series (determines colors) + # - Accepts a single column name as a list: [model] or [concurrency] + # - When experiment_classification is enabled: Auto-overridden to [experiment_group] + # + # labels: Controls what text appears at each data point + # - Accepts a single column name as a list: [model] or [concurrency] + pareto_curve_throughput_per_gpu_vs_latency: + type: pareto + x: request_latency_avg + y: output_token_throughput_per_gpu_avg + labels: [concurrency] + groups: [model] + title: "Pareto Curve: Token Throughput per GPU vs Latency" + + pareto_curve_throughput_per_gpu_vs_interactivity: + type: scatter_line + x: output_token_throughput_per_gpu_avg + y: output_token_throughput_per_user_avg + labels: [concurrency] + groups: [model] + title: "Pareto Curve: Token Throughput per GPU vs Interactivity" + + ttft_vs_throughput: + type: scatter_line + x: time_to_first_token_avg + y: request_throughput_avg + labels: [concurrency] + groups: [model] + title: "TTFT vs Throughput" + + # ============================================================================= + # SINGLE-RUN PRESETS (time-series over duration of profiling run) + # ============================================================================= + single_run_plots: + ttft_over_time: + type: scatter + x: request_number + y: time_to_first_token + title: "TTFT Per Request Over Time" + + itl_over_time: + type: scatter + x: request_number + y: inter_token_latency + title: "Inter-Token Latency Per Request Over Time" + + latency_over_time: + type: scatter_with_percentiles + x: timestamp + y: request_latency + title: "Request Latency Over Time with Percentiles" + + dispersed_throughput_over_time: + type: area + x: timestamp_s + y: throughput_tokens_per_sec + title: "Dispersed Output Token Throughput Over Time" + + timeslices_ttft: + type: timeslice + x: Timeslice + y: Time to First Token + stat: avg + source: timeslices + title: "Time to First Token 
Across Time Slices" + use_slice_duration: true + + timeslices_itl: + type: timeslice + x: Timeslice + y: Inter Token Latency + stat: avg + source: timeslices + title: "Inter Token Latency Across Time Slices" + use_slice_duration: true + + timeslices_throughput: + type: timeslice + x: Timeslice + y: Request Throughput + stat: avg + source: timeslices + title: "Request Throughput Across Time Slices" + use_slice_duration: true + + timeslices_latency: + type: timeslice + x: Timeslice + y: Request Latency + stat: avg + source: timeslices + title: "Request Latency Across Time Slices" + use_slice_duration: true + + gpu_utilization_and_throughput_over_time: + type: dual_axis + x: timestamp_s + y: throughput_tokens_per_sec + y2: gpu_utilization + title: "Output Token Throughput with GPU Utilization" + primary_style: + mode: lines + line_shape: hv + secondary_style: + mode: lines + fill: tozeroy + supplementary_col: active_requests + + +# ============================================================================== +# EXPERIMENT CLASSIFICATION (Optional) +# ============================================================================== +# +# Classify runs as "baseline" or "treatment" for semantic color assignment: +# • Baselines: Grey shades | Treatments: NVIDIA green shades +# • Legend: Baselines first, then treatments (alphabetically sorted) +# +# BEHAVIOR: +# When enabled, ALL multi-run plots automatically group by experiment_type +# (overrides groups: [model] setting above for clean baseline vs treatment comparison) +# +# PRIORITY: Pattern matching > Default fallback +# Patterns: Glob patterns (e.g., "*baseline*" matches baseline, my_baseline_1) +# Default: Fallback when no patterns match +# +### UNCOMMENT BELOW TO ENABLE EXPERIMENT CLASSIFICATION ### +# experiment_classification: +# baselines: +# - "*_agg_*" +# - "*baseline*" +# treatments: +# - "*_disagg_*" +# - "*kvrouter*" +# - "*treatment*" +# default: treatment diff --git a/src/aiperf/plot/exporters/base.py b/src/aiperf/plot/exporters/base.py index cf3e54e5a..39b6514fa 100644 --- a/src/aiperf/plot/exporters/base.py +++ b/src/aiperf/plot/exporters/base.py @@ -44,7 +44,9 @@ def __init__( super().__init__() self.output_dir = Path(output_dir) self.theme = theme - self.plot_generator = PlotGenerator(theme=theme, color_pool_size=color_pool_size) + self.plot_generator = PlotGenerator( + theme=theme, color_pool_size=color_pool_size + ) @abstractmethod def export(self, *args, **kwargs): diff --git a/src/aiperf/plot/exporters/png/multi_run.py b/src/aiperf/plot/exporters/png/multi_run.py index 7fb8e210d..a45a817a9 100644 --- a/src/aiperf/plot/exporters/png/multi_run.py +++ b/src/aiperf/plot/exporters/png/multi_run.py @@ -17,7 +17,7 @@ from aiperf.plot.constants import DEFAULT_PERCENTILE, NON_METRIC_KEYS from aiperf.plot.core.data_loader import RunData from aiperf.plot.core.data_preparation import flatten_config -from aiperf.plot.core.plot_specs import MULTI_RUN_PLOT_SPECS, PlotSpec +from aiperf.plot.core.plot_specs import PlotSpec from aiperf.plot.core.plot_type_handlers import PlotTypeHandlerFactory from aiperf.plot.exporters.png.base import BasePNGExporter @@ -34,13 +34,20 @@ class MultiRunPNGExporter(BasePNGExporter): 5. 
Token Throughput per GPU vs Interactivity (conditional on telemetry) """ - def export(self, runs: list[RunData], available_metrics: dict) -> list[Path]: + def export( + self, + runs: list[RunData], + available_metrics: dict, + plot_specs: list[PlotSpec], + classification_config=None, + ) -> list[Path]: """ Export multi-run comparison plots as PNG files. Args: runs: List of RunData objects with aggregated metrics available_metrics: Dictionary with display_names and units for metrics + plot_specs: List of plot specifications defining which plots to generate Returns: List of Path objects for generated PNG files @@ -49,11 +56,11 @@ def export(self, runs: list[RunData], available_metrics: dict) -> list[Path]: self.output_dir.mkdir(parents=True, exist_ok=True) - df = self._runs_to_dataframe(runs, available_metrics) + df = self._runs_to_dataframe(runs, available_metrics, classification_config) generated_files = [] - for spec in MULTI_RUN_PLOT_SPECS: + for spec in plot_specs: try: if not self._can_generate_plot(spec, df): self.debug(f"Skipping {spec.name} - required columns not available") @@ -111,7 +118,7 @@ def _create_plot_from_spec( return handler.create_plot(spec, df, available_metrics) def _runs_to_dataframe( - self, runs: list[RunData], available_metrics: dict + self, runs: list[RunData], available_metrics: dict, classification_config=None ) -> pd.DataFrame: """ Convert list of run data into a DataFrame for plotting. @@ -129,10 +136,13 @@ def _runs_to_dataframe( for run in runs: row = {} + row["run_name"] = run.metadata.run_name row["model"] = run.metadata.model or "Unknown" row["concurrency"] = run.metadata.concurrency or 1 row["request_count"] = run.metadata.request_count row["duration_seconds"] = run.metadata.duration_seconds + row["experiment_type"] = run.metadata.experiment_type + row["experiment_group"] = run.metadata.experiment_group if run.metadata.endpoint_type: row["endpoint_type"] = run.metadata.endpoint_type @@ -163,4 +173,61 @@ def _runs_to_dataframe( rows.append(row) - return pd.DataFrame(rows) + df = pd.DataFrame(rows) + + if "experiment_group" in df.columns: + if classification_config and classification_config.group_display_names: + df["group_display_name"] = ( + df["experiment_group"] + .map(classification_config.group_display_names) + .fillna(df["experiment_group"]) + ) + else: + df["group_display_name"] = df["experiment_group"] + + if "experiment_group" in df.columns: + unique_groups = df["experiment_group"].unique() + self.info( + f"DataFrame has {len(unique_groups)} unique experiment_groups: {sorted(unique_groups)}" + ) + + if "experiment_type" in df.columns: + unique_types = df["experiment_type"].unique() + self.info( + f"DataFrame has {len(unique_types)} unique experiment_types: {sorted(unique_types)}" + ) + + return df + + def _extract_experiment_types( + self, df: pd.DataFrame, group_by: str | None + ) -> dict[str, str] | None: + """ + Extract experiment types mapping from DataFrame. 
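In other words, the mapping is simply "group value → first observed experiment type"; a rough standalone illustration with made-up rows:

```python
# Rough illustration of the group -> experiment_type mapping described above;
# the DataFrame rows are made up.
import pandas as pd

df = pd.DataFrame(
    {
        "experiment_group": ["baseline", "baseline", "treatment_1", "treatment_1"],
        "experiment_type": ["baseline", "baseline", "treatment", "treatment"],
    }
)

experiment_types = {
    group: df.loc[df["experiment_group"] == group, "experiment_type"].iloc[0]
    for group in df["experiment_group"].unique()
}
print(experiment_types)  # {'baseline': 'baseline', 'treatment_1': 'treatment'}
```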
+ + Args: + df: DataFrame with aggregated metrics + group_by: Column name to group by (determines keys for experiment_types dict) + + Returns: + Dictionary mapping group values to experiment_type ("baseline" or "treatment"), + or None if group_by is not specified or experiment_type column is missing + """ + if not group_by or group_by not in df.columns: + return None + + if "experiment_type" not in df.columns: + return None + + # Create mapping from group values to experiment_type + experiment_types = {} + for group_val in df[group_by].unique(): + group_df = df[df[group_by] == group_val] + experiment_types[group_val] = group_df["experiment_type"].iloc[0] + + if experiment_types: + self.info( + f"Extracted experiment_types mapping for group_by='{group_by}': {experiment_types}" + ) + + return experiment_types diff --git a/src/aiperf/plot/exporters/png/single_run.py b/src/aiperf/plot/exporters/png/single_run.py index fb12732c1..6dd9ecd51 100644 --- a/src/aiperf/plot/exporters/png/single_run.py +++ b/src/aiperf/plot/exporters/png/single_run.py @@ -14,9 +14,6 @@ import aiperf.plot.handlers.single_run_handlers # noqa: F401 from aiperf.plot.core.data_loader import RunData from aiperf.plot.core.plot_specs import ( - GPU_PLOT_SPECS, - SINGLE_RUN_PLOT_SPECS, - TIMESLICE_PLOT_SPECS, DataSource, PlotSpec, ) @@ -39,13 +36,16 @@ class SingleRunPNGExporter(BasePNGExporter): - Latency across time slices """ - def export(self, run: RunData, available_metrics: dict) -> list[Path]: + def export( + self, run: RunData, available_metrics: dict, plot_specs: list[PlotSpec] + ) -> list[Path]: """ Export single-run time series plots as PNG files. Args: run: RunData object with per-request data available_metrics: Dictionary with display_names and units for metrics + plot_specs: List of plot specifications defining which plots to generate Returns: List of Path objects for generated PNG files @@ -56,9 +56,7 @@ def export(self, run: RunData, available_metrics: dict) -> list[Path]: generated_files = [] - all_specs = SINGLE_RUN_PLOT_SPECS + TIMESLICE_PLOT_SPECS + GPU_PLOT_SPECS - - for spec in all_specs: + for spec in plot_specs: try: if not self._can_generate_plot(spec, run): self.debug(f"Skipping {spec.name} - required data not available") diff --git a/src/aiperf/plot/handlers/multi_run_handlers.py b/src/aiperf/plot/handlers/multi_run_handlers.py index f9e6021d6..b3b0714e3 100644 --- a/src/aiperf/plot/handlers/multi_run_handlers.py +++ b/src/aiperf/plot/handlers/multi_run_handlers.py @@ -58,6 +58,58 @@ def _get_metric_label( return display_name return metric_name + def _extract_experiment_types( + self, data: pd.DataFrame, group_by: str | None + ) -> dict[str, str] | None: + """ + Extract experiment types from DataFrame for experiment groups color assignment. + + Args: + data: DataFrame with aggregated metrics + group_by: Column name to group by + + Returns: + Dictionary mapping group values to experiment_type, or None + """ + if not group_by or group_by not in data.columns: + return None + + if "experiment_type" not in data.columns: + return None + + experiment_types = {} + for group_val in data[group_by].unique(): + group_df = data[data[group_by] == group_val] + experiment_types[group_val] = group_df["experiment_type"].iloc[0] + + return experiment_types + + def _extract_group_display_names( + self, data: pd.DataFrame, group_by: str | None + ) -> dict[str, str] | None: + """ + Extract group display names from DataFrame for legend labels. 
+ + Args: + data: DataFrame with aggregated metrics + group_by: Column name to group by + + Returns: + Dictionary mapping group values to display names, or None + """ + if not group_by or group_by not in data.columns: + return None + + if "group_display_name" not in data.columns: + return None + + display_names = {} + for group_val in data[group_by].unique(): + group_df = data[data[group_by] == group_val] + display_names[group_val] = group_df["group_display_name"].iloc[0] + + return display_names + @PlotTypeHandlerFactory.register(PlotType.PARETO) class ParetoHandler(BaseMultiRunHandler): @@ -88,6 +140,9 @@ def create_plot( y_metric.name, y_metric.stat or "avg", available_metrics ) + experiment_types = self._extract_experiment_types(data, spec.group_by) + group_display_names = self._extract_group_display_names(data, spec.group_by) + return self.plot_generator.create_pareto_plot( df=data, x_metric=x_metric.name, @@ -97,6 +152,8 @@ def create_plot( title=spec.title, x_label=x_label, y_label=y_label, + experiment_types=experiment_types, + group_display_names=group_display_names, ) @@ -129,6 +186,9 @@ def create_plot( y_metric.name, y_metric.stat or "avg", available_metrics ) + experiment_types = self._extract_experiment_types(data, spec.group_by) + group_display_names = self._extract_group_display_names(data, spec.group_by) + return self.plot_generator.create_scatter_line_plot( df=data, x_metric=x_metric.name, @@ -138,4 +198,6 @@ def create_plot( title=spec.title, x_label=x_label, y_label=y_label, + experiment_types=experiment_types, + group_display_names=group_display_names, ) diff --git a/src/aiperf/plot/handlers/single_run_handlers.py b/src/aiperf/plot/handlers/single_run_handlers.py index a776bb9d8..1595564e8 100644 --- a/src/aiperf/plot/handlers/single_run_handlers.py +++ b/src/aiperf/plot/handlers/single_run_handlers.py @@ -192,7 +192,7 @@ def create_plot( ) -@PlotTypeHandlerFactory.register(PlotType.HISTOGRAM) +@PlotTypeHandlerFactory.register(PlotType.TIMESLICE) class TimeSliceHandler(BaseSingleRunHandler): """Handler for timeslice scatter plot type.""" @@ -283,12 +283,11 @@ def _get_average_for_timeslice_metric( return avg, label, std +@PlotTypeHandlerFactory.register(PlotType.HISTOGRAM) class HistogramHandler(BaseSingleRunHandler): - """Handler for histogram/bar chart plots (preserved for future use). + """Handler for histogram/bar chart plots. - This handler is not currently registered to any PlotType and won't generate - plots automatically. It's kept available for future use when bar chart - visualization is needed. + Generates histogram/bar chart visualizations for timeslice data. """ def can_handle(self, spec: PlotSpec, data: RunData) -> bool: diff --git a/src/aiperf/plot/logging.py b/src/aiperf/plot/logging.py index 09907d547..19c993cd1 100644 --- a/src/aiperf/plot/logging.py +++ b/src/aiperf/plot/logging.py @@ -21,21 +21,52 @@ _logger = AIPerfLogger(__name__) +def setup_console_only_logging(log_level: str = "INFO") -> None: + """ + Set up console-only logging for plot operations. + + This is a fallback mode used when the output directory is not available + or cannot be created. Configures logging to output only to console via + RichHandler without a file handler. + + Args: + log_level: Logging level (e.g., "DEBUG", "INFO", "WARNING"). Defaults to "INFO". 
+ """ + root_logger = logging.getLogger() + level = log_level.upper() + root_logger.setLevel(level) + + for existing_handler in root_logger.handlers[:]: + root_logger.removeHandler(existing_handler) + + rich_handler = RichHandler( + rich_tracebacks=True, + show_path=True, + console=Console(), + show_time=True, + show_level=True, + tracebacks_show_locals=False, + log_time_format="%H:%M:%S.%f", + omit_repeated_times=False, + ) + rich_handler.setLevel(level) + root_logger.addHandler(rich_handler) + + def setup_plot_logging(output_dir: Path, log_level: str = "INFO") -> None: """ Set up logging for the plot command. Configures logging to output to both console (via RichHandler) and a log - file in the output directory. This should be called at the start of the - plot command before any plot operations. + file in the output directory. This function can be called multiple times + safely as it clears existing handlers before adding new ones. + + Console output shows WARNING+ by default, or all logs when verbose (DEBUG/INFO level). + File output always captures all logs at the specified level. Args: output_dir: Directory where plot outputs (and logs) will be saved. log_level: Logging level (e.g., "DEBUG", "INFO", "WARNING"). Defaults to "INFO". - - Examples: - >>> from pathlib import Path - >>> setup_plot_logging(Path("./artifacts/plots"), log_level="INFO") """ root_logger = logging.getLogger() @@ -45,6 +76,8 @@ def setup_plot_logging(output_dir: Path, log_level: str = "INFO") -> None: for existing_handler in root_logger.handlers[:]: root_logger.removeHandler(existing_handler) + # Console handler: show all logs if verbose (DEBUG level), otherwise only WARNING+ + console_level = level if level == "DEBUG" else "WARNING" rich_handler = RichHandler( rich_tracebacks=True, show_path=True, @@ -55,7 +88,7 @@ def setup_plot_logging(output_dir: Path, log_level: str = "INFO") -> None: log_time_format="%H:%M:%S.%f", omit_repeated_times=False, ) - rich_handler.setLevel(level) + rich_handler.setLevel(console_level) root_logger.addHandler(rich_handler) output_dir.mkdir(parents=True, exist_ok=True) diff --git a/src/aiperf/plot/metric_names.py b/src/aiperf/plot/metric_names.py index 9b81f87aa..2babe8513 100644 --- a/src/aiperf/plot/metric_names.py +++ b/src/aiperf/plot/metric_names.py @@ -9,6 +9,7 @@ derived metrics. 
""" +from aiperf.common.enums.metric_enums import MetricFlags, MetricType from aiperf.gpu_telemetry.constants import GPU_TELEMETRY_METRICS_CONFIG from aiperf.metrics.metric_registry import MetricRegistry @@ -29,6 +30,41 @@ "output_token_throughput_per_gpu": "Output Token Throughput Per GPU", } +# Pre-compute metric lists by data source at module load time +_AGGREGATED_METRICS: list[str] = MetricRegistry.tags_applicable_to( + MetricFlags.NONE, + MetricFlags.NONE, + MetricType.RECORD, + MetricType.DERIVED, +) + [ + # Add derived metrics calculated during data loading (not in MetricRegistry) + "output_token_throughput_per_gpu", +] + +_REQUEST_METRICS: list[str] = MetricRegistry.tags_applicable_to( + MetricFlags.NONE, + MetricFlags.NONE, + MetricType.RECORD, +) + [ + "request_number", + "timestamp", + "timestamp_s", + "throughput_tokens_per_sec", + "active_requests", +] + +_TIMESLICE_METRICS: list[str] = [ + "Timeslice", + "Time to First Token", + "Inter Token Latency", + "Request Throughput", + "Request Latency", +] + +_GPU_METRICS: list[str] = [ + field_name for _, field_name, _ in GPU_TELEMETRY_METRICS_CONFIG +] + def get_all_metric_display_names() -> dict[str, str]: """ @@ -66,3 +102,83 @@ def get_metric_display_name(metric_tag: str) -> str: 'Unknown Metric' """ return _ALL_METRIC_NAMES.get(metric_tag, metric_tag.replace("_", " ").title()) + + +def get_aggregated_metrics() -> list[str]: + """ + Get metrics available in aggregated statistics (RECORD + DERIVED types). + + These are metrics that have aggregated statistics like avg, min, max, std, and + percentiles computed across all requests. + + Returns: + List of metric tags available in aggregated data + + Examples: + >>> metrics = get_aggregated_metrics() + >>> 'request_latency' in metrics + True + >>> 'time_to_first_token' in metrics + True + """ + return _AGGREGATED_METRICS + + +def get_request_metrics() -> list[str]: + """ + Get metrics available in per-request data (RECORD type + computed columns). + + These are metrics available in the requests DataFrame, including both metrics + from MetricRegistry and computed columns added during data preparation. + + Returns: + List of metric/column names available in requests data + + Examples: + >>> metrics = get_request_metrics() + >>> 'request_number' in metrics + True + >>> 'timestamp' in metrics + True + >>> 'request_latency' in metrics + True + """ + return _REQUEST_METRICS + + +def get_timeslice_metrics() -> list[str]: + """ + Get display names of metrics available in timeslice data. + + These are the human-readable column names used in the timeslice CSV exports. + + Returns: + List of display names for timeslice metrics + + Examples: + >>> metrics = get_timeslice_metrics() + >>> 'Timeslice' in metrics + True + >>> 'Time to First Token' in metrics + True + """ + return _TIMESLICE_METRICS + + +def get_gpu_metrics() -> list[str]: + """ + Get field names of metrics available in GPU telemetry data. + + These are the field names used in GPU telemetry DataFrames. 
+ + Returns: + List of GPU telemetry metric field names + + Examples: + >>> metrics = get_gpu_metrics() + >>> 'gpu_utilization' in metrics + True + >>> 'gpu_memory_used' in metrics + True + """ + return _GPU_METRICS diff --git a/src/aiperf/plot/plot_controller.py b/src/aiperf/plot/plot_controller.py index 4b00604ed..bdbae388d 100644 --- a/src/aiperf/plot/plot_controller.py +++ b/src/aiperf/plot/plot_controller.py @@ -2,12 +2,19 @@ # SPDX-License-Identifier: Apache-2.0 """Plot controller for generating visualizations from profiling data.""" +import logging from pathlib import Path +from aiperf.plot.config import PlotConfig from aiperf.plot.constants import PlotMode, PlotTheme from aiperf.plot.core.data_loader import DataLoader from aiperf.plot.core.mode_detector import ModeDetector, VisualizationMode from aiperf.plot.exporters import MultiRunPNGExporter, SingleRunPNGExporter +from aiperf.plot.logging import setup_console_only_logging, setup_plot_logging + +logger = logging.getLogger(__name__) + +__all__ = ["PlotController"] class PlotController: @@ -22,6 +29,8 @@ class PlotController: output_dir: Directory to save generated plots mode: Output mode (currently only PNG supported) theme: Plot theme (LIGHT or DARK). Defaults to LIGHT. + config_path: Optional path to custom plot configuration YAML file + verbose: Show detailed error tracebacks in console """ def __init__( @@ -30,13 +39,35 @@ def __init__( output_dir: Path, mode: PlotMode = PlotMode.PNG, theme: PlotTheme = PlotTheme.LIGHT, + config_path: Path | None = None, + verbose: bool = False, ): self.paths = paths self.output_dir = output_dir self.mode = mode self.theme = theme - self.loader = DataLoader() + self.verbose = verbose + + log_level = "DEBUG" if verbose else "INFO" + try: + setup_plot_logging(output_dir, log_level=log_level) + except Exception as e: + setup_console_only_logging(log_level=log_level) + logger.warning( + f"Could not set up file logging to {output_dir}: {e}. Using console only." + ) + self.mode_detector = ModeDetector() + self.plot_config = PlotConfig(config_path, verbose=verbose) + + classification_config = self.plot_config.get_experiment_classification_config() + if classification_config: + print( + "Experiment classification enabled: grouping runs by baseline/treatment patterns" + ) + self.loader = DataLoader( + classification_config=classification_config, + ) def run(self) -> list[Path]: """Execute plot generation pipeline. @@ -109,14 +140,21 @@ def _export_multi_run_plots(self, run_dirs: list[Path]) -> list[Path]: run_data = self.loader.load_run(run_dir, load_per_request_data=False) runs.append(run_data) except Exception as e: - print(f"Warning: Failed to load run from {run_dir}: {e}") + logger.warning(f"Failed to load run from {run_dir}: {e}") if not runs: raise ValueError("Failed to load any valid profiling runs") available = self.loader.get_available_metrics(runs[0]) + plot_specs = self.plot_config.get_multi_run_plot_specs() + classification_config = self.plot_config.get_experiment_classification_config() exporter = MultiRunPNGExporter(self.output_dir, theme=self.theme) - return exporter.export(runs, available) + return exporter.export( + runs, + available, + plot_specs=plot_specs, + classification_config=classification_config, + ) def _export_single_run_plots(self, run_dir: Path) -> list[Path]: """Export single-run time series plots. 
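For orientation, a hedged usage sketch of the controller as wired above; the run directories and theme choice are illustrative:

```python
# Hedged usage sketch; the run directories are hypothetical.
from pathlib import Path

from aiperf.plot.constants import PlotMode, PlotTheme
from aiperf.plot.plot_controller import PlotController

controller = PlotController(
    paths=[Path("./artifacts/run_a"), Path("./artifacts/run_b")],
    output_dir=Path("./artifacts/plots"),
    mode=PlotMode.PNG,
    theme=PlotTheme.DARK,
    verbose=True,  # DEBUG console logging plus the file log in output_dir
)
generated_pngs = controller.run()
```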
@@ -129,5 +167,6 @@ def _export_single_run_plots(self, run_dir: Path) -> list[Path]: """ run_data = self.loader.load_run(run_dir, load_per_request_data=True) available = self.loader.get_available_metrics(run_data) + plot_specs = self.plot_config.get_single_run_plot_specs() exporter = SingleRunPNGExporter(self.output_dir, theme=self.theme) - return exporter.export(run_data, available) + return exporter.export(run_data, available, plot_specs=plot_specs) diff --git a/tests/unit/plot/test_cli_runner.py b/tests/unit/plot/test_cli_runner.py index 44b8ae8cd..c0e6d2331 100644 --- a/tests/unit/plot/test_cli_runner.py +++ b/tests/unit/plot/test_cli_runner.py @@ -286,6 +286,7 @@ def test_all_parameters_passed_to_controller( output=output, mode=PlotMode.PNG, theme=PlotTheme.DARK, + verbose=True, ) mock_controller_class.assert_called_once_with( @@ -293,6 +294,8 @@ def test_all_parameters_passed_to_controller( output_dir=Path(output), mode=PlotMode.PNG, theme=PlotTheme.DARK, + config_path=None, + verbose=True, ) def test_invalid_mode_string_raises_value_error( diff --git a/tests/unit/plot/test_data_loader.py b/tests/unit/plot/test_data_loader.py index d049df866..41bae4ec2 100644 --- a/tests/unit/plot/test_data_loader.py +++ b/tests/unit/plot/test_data_loader.py @@ -1176,9 +1176,7 @@ def test_get_metric_returns_none_for_empty_aggregated(self, tmp_path: Path) -> N metric = run_data.get_metric("time_to_first_token") assert metric is None - def test_get_metric_prefers_nested_metrics_over_flat( - self, tmp_path: Path - ) -> None: + def test_get_metric_prefers_nested_metrics_over_flat(self, tmp_path: Path) -> None: """Test that nested 'metrics' structure is preferred over flat when both exist.""" aggregated = { "time_to_first_token": {"avg": 100.0, "unit": "ms"}, diff --git a/tests/unit/plot/test_experiment_classification.py b/tests/unit/plot/test_experiment_classification.py new file mode 100644 index 000000000..d988dd8ff --- /dev/null +++ b/tests/unit/plot/test_experiment_classification.py @@ -0,0 +1,339 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +""" +Tests for experiment classification functionality in DataLoader. 
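The tests below exercise glob-pattern classification; a compact sketch of the matching order they rely on (baseline patterns first, then treatment patterns, then the default), simplified here to run names only:

```python
# Simplified sketch of the classification order: baselines, then treatments, then default.
# Patterns and run names mirror the examples used in the tests.
from fnmatch import fnmatch

baselines = ["*_agg_*", "*baseline*"]
treatments = ["*_disagg_*", "*treatment*"]
default = "treatment"


def classify(run_name: str) -> str:
    if any(fnmatch(run_name, p) for p in baselines):
        return "baseline"
    if any(fnmatch(run_name, p) for p in treatments):
        return "treatment"
    return default


print(classify("model_agg_config"))  # baseline
print(classify("model_disagg_v2"))   # treatment
print(classify("random_run_name"))   # treatment (default)
```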
+""" + +from pathlib import Path + +import pytest + +from aiperf.plot.core.data_loader import DataLoader +from aiperf.plot.core.plot_specs import ExperimentClassificationConfig + + +class TestDataLoaderExperimentClassification: + """Tests for experiment classification logic.""" + + def test_classify_with_baseline_pattern(self, tmp_path: Path) -> None: + """Test classification with baseline pattern match.""" + config = ExperimentClassificationConfig( + baselines=["*baseline*"], + treatments=["*treatment*"], + default="treatment", + ) + loader = DataLoader(classification_config=config) + + run_path = tmp_path / "my_baseline_run" + run_name = "my_baseline_run" + + result = loader._classify_experiment_type(run_path, run_name) + assert result == "baseline" + + def test_classify_with_treatment_pattern(self, tmp_path: Path) -> None: + """Test classification with treatment pattern match.""" + config = ExperimentClassificationConfig( + baselines=["*baseline*"], + treatments=["*treatment*"], + default="treatment", + ) + loader = DataLoader(classification_config=config) + + run_path = tmp_path / "treatment_v1" + run_name = "treatment_v1" + + result = loader._classify_experiment_type(run_path, run_name) + assert result == "treatment" + + def test_classify_with_multiple_patterns(self, tmp_path: Path) -> None: + """Test classification with multiple patterns.""" + config = ExperimentClassificationConfig( + baselines=["*_agg_*", "*baseline*"], + treatments=["*_disagg_*", "*treatment*"], + default="treatment", + ) + loader = DataLoader(classification_config=config) + + test_cases = [ + ("model_agg_config", "baseline"), + ("my_baseline", "baseline"), + ("model_disagg_v2", "treatment"), + ("treatment_large_io", "treatment"), + ] + + for run_name, expected in test_cases: + run_path = tmp_path / run_name + result = loader._classify_experiment_type(run_path, run_name) + assert result == expected, f"Failed for {run_name}" + + def test_classify_first_matching_pattern_wins(self, tmp_path: Path) -> None: + """Test that first matching pattern wins.""" + # baseline patterns checked first + config = ExperimentClassificationConfig( + baselines=["*model*"], + treatments=["*model*"], # Same pattern + default="treatment", + ) + loader = DataLoader(classification_config=config) + + run_path = tmp_path / "model_test" + run_name = "model_test" + + result = loader._classify_experiment_type(run_path, run_name) + # baseline patterns are checked first, so should return baseline + assert result == "baseline" + + def test_classify_uses_default_when_no_match(self, tmp_path: Path) -> None: + """Test that default is used when no patterns match.""" + config = ExperimentClassificationConfig( + baselines=["*baseline*"], + treatments=["*treatment*"], + default="treatment", + ) + loader = DataLoader(classification_config=config) + + run_path = tmp_path / "random_run_name" + run_name = "random_run_name" + + result = loader._classify_experiment_type(run_path, run_name) + assert result == "treatment" + + def test_classify_uses_baseline_default(self, tmp_path: Path) -> None: + """Test that default can be set to baseline.""" + config = ExperimentClassificationConfig( + baselines=["*baseline*"], + treatments=["*treatment*"], + default="baseline", + ) + loader = DataLoader(classification_config=config) + + run_path = tmp_path / "random_run_name" + run_name = "random_run_name" + + result = loader._classify_experiment_type(run_path, run_name) + assert result == "baseline" + + def test_classify_without_config(self, tmp_path: Path) -> None: + 
"""Test classification without config falls back to treatment.""" + loader = DataLoader(classification_config=None) + + run_path = tmp_path / "any_run" + run_name = "any_run" + + result = loader._classify_experiment_type(run_path, run_name) + assert result == "treatment" + + def test_classify_matches_full_path(self, tmp_path: Path) -> None: + """Test that patterns can match against full path.""" + config = ExperimentClassificationConfig( + baselines=["*/experiment/*baseline*"], + treatments=["*treatment*"], + default="treatment", + ) + loader = DataLoader(classification_config=config) + + run_path = tmp_path / "experiment" / "baseline_run" + run_name = "baseline_run" + + result = loader._classify_experiment_type(run_path, run_name) + assert result == "baseline" + + def test_classify_case_sensitive(self, tmp_path: Path) -> None: + """Test that pattern matching is case-sensitive.""" + config = ExperimentClassificationConfig( + baselines=["*baseline*"], + treatments=["*treatment*"], + default="treatment", + ) + loader = DataLoader(classification_config=config) + + # Uppercase should not match lowercase pattern + run_path = tmp_path / "BASELINE_run" + run_name = "BASELINE_run" + + result = loader._classify_experiment_type(run_path, run_name) + assert result == "treatment" # Falls back to default + + def test_extract_experiment_group_parent_matches_baseline( + self, tmp_path: Path + ) -> None: + """Test that runs with parent matching baseline pattern use parent name.""" + config = ExperimentClassificationConfig( + baselines=["*baseline*"], + treatments=["*treatment*"], + default="treatment", + ) + loader = DataLoader(classification_config=config) + + # Create: baseline/concurrency_1/ + parent = tmp_path / "baseline" + nested = parent / "concurrency_1" + nested.mkdir(parents=True) + + result = loader._extract_experiment_group(nested, "concurrency_1") + assert result == "baseline" + + def test_extract_experiment_group_parent_matches_treatment( + self, tmp_path: Path + ) -> None: + """Test that runs with parent matching treatment pattern use parent name.""" + config = ExperimentClassificationConfig( + baselines=["*baseline*"], + treatments=["*treatment*"], + default="treatment", + ) + loader = DataLoader(classification_config=config) + + # Create: treatment_1/concurrency_1/ + parent = tmp_path / "treatment_1" + nested = parent / "concurrency_1" + nested.mkdir(parents=True) + + result = loader._extract_experiment_group(nested, "concurrency_1") + assert result == "treatment_1" + + def test_extract_experiment_group_multiple_treatments_separate_groups( + self, tmp_path: Path + ) -> None: + """Test that treatment_1 and treatment_2 are separate groups.""" + config = ExperimentClassificationConfig( + baselines=["*baseline*"], + treatments=["*treatment*"], + default="treatment", + ) + loader = DataLoader(classification_config=config) + + # Create: treatment_1/run/ and treatment_2/run/ + for i in [1, 2]: + parent = tmp_path / f"treatment_{i}" + nested = parent / "run" + nested.mkdir(parents=True) + + result = loader._extract_experiment_group(nested, "run") + assert result == f"treatment_{i}" + + def test_extract_experiment_group_parent_no_match_uses_run_name( + self, tmp_path: Path + ) -> None: + """Test that if parent doesn't match any pattern, uses run name.""" + config = ExperimentClassificationConfig( + baselines=["*baseline*"], + treatments=["*treatment*"], + default="treatment", + ) + loader = DataLoader(classification_config=config) + + # Create: artifacts/random_run/ + parent = tmp_path / 
"artifacts" + nested = parent / "random_run" + nested.mkdir(parents=True) + + result = loader._extract_experiment_group(nested, "random_run") + assert result == "random_run" # Parent doesn't match, use run name + + def test_extract_experiment_group_without_config(self, tmp_path: Path) -> None: + """Test that without classification config, uses run name.""" + loader = DataLoader(classification_config=None) + + # Create: baseline/concurrency_8/ + parent = tmp_path / "baseline" + nested = parent / "concurrency_8" + nested.mkdir(parents=True) + + result = loader._extract_experiment_group(nested, "concurrency_8") + assert result == "concurrency_8" # No config, use run name + + def test_extract_experiment_group_no_valid_parent(self, tmp_path: Path) -> None: + """Test that runs without valid parent use run name.""" + config = ExperimentClassificationConfig( + baselines=["*baseline*"], + treatments=["*treatment*"], + default="treatment", + ) + loader = DataLoader(classification_config=config) + + # Top-level run (parent is tmp_path, which likely doesn't match patterns) + run_path = tmp_path / "treatment_1" + run_name = "treatment_1" + + result = loader._extract_experiment_group(run_path, run_name) + # Parent (tmp_path) unlikely to match "*treatment*", so use run_name + assert result == "treatment_1" + + @pytest.mark.parametrize( + "pattern,run_name,expected", + [ + ("*", "anything", "baseline"), # Wildcard matches all + ("exact_match", "exact_match", "baseline"), # Exact match + ("*prefix", "my_prefix", "baseline"), # Suffix match + ("suffix*", "suffix_test", "baseline"), # Prefix match + ("*middle*", "has_middle_part", "baseline"), # Middle match + ], + ) + def test_classify_with_various_patterns( + self, tmp_path: Path, pattern: str, run_name: str, expected: str + ) -> None: + """Test classification with various glob patterns.""" + config = ExperimentClassificationConfig( + baselines=[pattern], + treatments=[], + default="treatment", + ) + loader = DataLoader(classification_config=config) + + run_path = tmp_path / run_name + result = loader._classify_experiment_type(run_path, run_name) + assert result == expected + + def test_empty_pattern_lists(self, tmp_path: Path) -> None: + """Test with empty baseline and treatment lists.""" + config = ExperimentClassificationConfig( + baselines=[], + treatments=[], + default="baseline", + ) + loader = DataLoader(classification_config=config) + + run_path = tmp_path / "test_run" + run_name = "test_run" + + result = loader._classify_experiment_type(run_path, run_name) + assert result == "baseline" # Should use default + + def test_extract_experiment_group_nested_structure(self, tmp_path: Path) -> None: + """Test that nested runs group by immediate parent if it matches.""" + config = ExperimentClassificationConfig( + baselines=["*baseline*"], + treatments=["*treatment*"], + default="treatment", + ) + loader = DataLoader(classification_config=config) + + # Create: experiments/treatment_3/concurrency_8/ + nested = tmp_path / "experiments" / "treatment_3" / "concurrency_8" + nested.mkdir(parents=True) + + result = loader._extract_experiment_group(nested, "concurrency_8") + # Immediate parent "treatment_3" matches "*treatment*" + assert result == "treatment_3" + + def test_extract_experiment_group_deeply_nested_checks_immediate_parent( + self, tmp_path: Path + ) -> None: + """Test that only immediate parent is checked, not ancestors.""" + config = ExperimentClassificationConfig( + baselines=["*baseline*"], + treatments=["*treatment*"], + default="treatment", + ) + 
loader = DataLoader(classification_config=config) + + # Create: treatment_1/artifacts/run/ + # Parent "artifacts" doesn't match patterns, so use run name + nested = tmp_path / "treatment_1" / "artifacts" / "run" + nested.mkdir(parents=True) + + result = loader._extract_experiment_group(nested, "run") + # Immediate parent "artifacts" doesn't match, use run name + assert result == "run" diff --git a/tests/unit/plot/test_logging.py b/tests/unit/plot/test_logging.py new file mode 100644 index 000000000..a967dd660 --- /dev/null +++ b/tests/unit/plot/test_logging.py @@ -0,0 +1,207 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +""" +Tests for plot logging configuration. + +This module tests the logging setup functions for console-only and file-based +logging configurations. +""" + +import logging + +import pytest +from rich.logging import RichHandler + +from aiperf.plot.constants import PLOT_LOG_FILE +from aiperf.plot.logging import setup_console_only_logging, setup_plot_logging + + +class TestSetupConsoleOnlyLogging: + """Tests for setup_console_only_logging function.""" + + def test_console_only_logging_default_level(self): + """Verifies INFO level set by default.""" + setup_console_only_logging() + + root_logger = logging.getLogger() + assert root_logger.level == logging.INFO + + @pytest.mark.parametrize("level", ["DEBUG", "WARNING", "ERROR"]) + def test_console_only_logging_custom_level(self, level): + """Tests DEBUG, WARNING, ERROR levels.""" + setup_console_only_logging(log_level=level) + + root_logger = logging.getLogger() + assert root_logger.level == getattr(logging, level) + + def test_console_only_logging_handler_cleanup(self): + """Removes existing handlers before adding new.""" + root_logger = logging.getLogger() + existing_handler = logging.StreamHandler() + root_logger.addHandler(existing_handler) + + setup_console_only_logging() + + assert len(root_logger.handlers) == 1 + assert isinstance(root_logger.handlers[0], RichHandler) + + def test_console_only_logging_rich_handler_configuration(self): + """Verifies RichHandler settings.""" + setup_console_only_logging() + + root_logger = logging.getLogger() + handler = root_logger.handlers[0] + + assert isinstance(handler, RichHandler) + assert handler.level == logging.INFO + + def test_console_only_logging_uppercase_conversion(self): + """Converts 'info' to 'INFO'.""" + setup_console_only_logging(log_level="info") + + root_logger = logging.getLogger() + assert root_logger.level == logging.INFO + + def test_console_only_logging_multiple_calls(self): + """Safely handles multiple setup calls.""" + setup_console_only_logging() + setup_console_only_logging() + + root_logger = logging.getLogger() + assert len(root_logger.handlers) == 1 + + def test_console_only_logging_handler_level_matches_root(self): + """Handler level matches root logger level.""" + setup_console_only_logging(log_level="DEBUG") + + root_logger = logging.getLogger() + handler = root_logger.handlers[0] + + assert root_logger.level == logging.DEBUG + assert handler.level == logging.DEBUG + + +class TestSetupPlotLogging: + """Tests for setup_plot_logging function.""" + + def test_setup_plot_logging_default_level(self, tmp_path): + """Verifies INFO level set by default.""" + setup_plot_logging(tmp_path) + + root_logger = logging.getLogger() + assert root_logger.level == logging.INFO + + @pytest.mark.parametrize("level", ["DEBUG", "WARNING", "ERROR"]) + def 
test_setup_plot_logging_custom_level(self, tmp_path, level): + """Tests DEBUG, WARNING, ERROR levels.""" + setup_plot_logging(tmp_path, log_level=level) + + root_logger = logging.getLogger() + assert root_logger.level == getattr(logging, level) + + def test_setup_plot_logging_console_level_logic(self, tmp_path): + """DEBUG shows all, others show WARNING+.""" + setup_plot_logging(tmp_path, log_level="DEBUG") + root_logger = logging.getLogger() + rich_handler = next( + h for h in root_logger.handlers if isinstance(h, RichHandler) + ) + assert rich_handler.level == logging.DEBUG + + setup_plot_logging(tmp_path, log_level="INFO") + root_logger = logging.getLogger() + rich_handler = next( + h for h in root_logger.handlers if isinstance(h, RichHandler) + ) + assert rich_handler.level == logging.WARNING + + setup_plot_logging(tmp_path, log_level="WARNING") + root_logger = logging.getLogger() + rich_handler = next( + h for h in root_logger.handlers if isinstance(h, RichHandler) + ) + assert rich_handler.level == logging.WARNING + + def test_setup_plot_logging_creates_output_directory(self, tmp_path): + """Creates output_dir with parents=True.""" + nested_dir = tmp_path / "level1" / "level2" / "output" + + setup_plot_logging(nested_dir) + + assert nested_dir.exists() + assert nested_dir.is_dir() + + def test_setup_plot_logging_log_file_creation(self, tmp_path): + """Creates log file at correct path.""" + setup_plot_logging(tmp_path) + + log_file = tmp_path / PLOT_LOG_FILE + assert log_file.exists() + assert log_file.is_file() + + def test_setup_plot_logging_file_handler_encoding(self, tmp_path): + """Verifies utf-8 encoding.""" + setup_plot_logging(tmp_path) + + root_logger = logging.getLogger() + file_handler = next( + h for h in root_logger.handlers if isinstance(h, logging.FileHandler) + ) + assert file_handler.encoding == "utf-8" + + def test_setup_plot_logging_file_handler_level(self, tmp_path): + """File handler uses specified level.""" + setup_plot_logging(tmp_path, log_level="DEBUG") + + root_logger = logging.getLogger() + file_handler = next( + h for h in root_logger.handlers if isinstance(h, logging.FileHandler) + ) + assert file_handler.level == logging.DEBUG + + def test_setup_plot_logging_handler_cleanup(self, tmp_path): + """Removes existing handlers before adding new.""" + root_logger = logging.getLogger() + existing_handler = logging.StreamHandler() + root_logger.addHandler(existing_handler) + + setup_plot_logging(tmp_path) + + assert len(root_logger.handlers) == 2 + assert any(isinstance(h, RichHandler) for h in root_logger.handlers) + assert any(isinstance(h, logging.FileHandler) for h in root_logger.handlers) + + def test_setup_plot_logging_multiple_calls(self, tmp_path): + """Safely handles multiple setup calls.""" + setup_plot_logging(tmp_path) + setup_plot_logging(tmp_path) + + root_logger = logging.getLogger() + assert len(root_logger.handlers) == 2 + + def test_setup_plot_logging_log_file_created(self, tmp_path): + """Verifies log file is created at correct location.""" + setup_plot_logging(tmp_path) + + log_file_path = tmp_path / PLOT_LOG_FILE + assert log_file_path.exists() + assert log_file_path.is_file() + + def test_setup_plot_logging_uppercase_conversion(self, tmp_path): + """Converts 'info' to 'INFO'.""" + setup_plot_logging(tmp_path, log_level="info") + + root_logger = logging.getLogger() + assert root_logger.level == logging.INFO + + def test_setup_plot_logging_handler_count(self, tmp_path): + """Ensures exactly 2 handlers: RichHandler and FileHandler.""" + 
setup_plot_logging(tmp_path) + + root_logger = logging.getLogger() + assert len(root_logger.handlers) == 2 + assert sum(isinstance(h, RichHandler) for h in root_logger.handlers) == 1 + assert ( + sum(isinstance(h, logging.FileHandler) for h in root_logger.handlers) == 1 + ) diff --git a/tests/unit/plot/test_multi_run_handlers.py b/tests/unit/plot/test_multi_run_handlers.py new file mode 100644 index 000000000..af6fe6c14 --- /dev/null +++ b/tests/unit/plot/test_multi_run_handlers.py @@ -0,0 +1,613 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +""" +Tests for multi-run plot type handlers. + +This module tests the handler classes that create comparison plots from multiple +profiling runs, including Pareto curves and scatter line plots. +""" + +from unittest.mock import MagicMock + +import pandas as pd +import pytest + +from aiperf.plot.constants import DEFAULT_PERCENTILE +from aiperf.plot.core.plot_generator import PlotGenerator +from aiperf.plot.core.plot_specs import DataSource, MetricSpec, PlotSpec, PlotType +from aiperf.plot.handlers.multi_run_handlers import ( + BaseMultiRunHandler, + ParetoHandler, + ScatterLineHandler, +) + + +@pytest.fixture +def sample_multi_run_dataframe(): + """ + Create a sample DataFrame with multi-run data for testing. + + Returns: + DataFrame with experiment_group, experiment_type, group_display_name columns + """ + return pd.DataFrame( + { + "run_name": ["run1", "run2", "run3"], + "experiment_group": ["baseline", "treatment_a", "treatment_b"], + "experiment_type": ["baseline", "treatment", "treatment"], + "group_display_name": [ + "Baseline Model", + "Treatment A", + "Treatment B", + ], + "concurrency": [1, 2, 4], + "request_latency": [100.0, 150.0, 200.0], + "request_throughput": [10.0, 15.0, 20.0], + } + ) + + +@pytest.fixture +def mock_plot_generator(): + """ + Create a mock PlotGenerator for testing handlers. + + Returns: + MagicMock of PlotGenerator with create_pareto_plot and create_scatter_line_plot + """ + generator = MagicMock(spec=PlotGenerator) + generator.create_pareto_plot.return_value = MagicMock() + generator.create_scatter_line_plot.return_value = MagicMock() + return generator + + +@pytest.fixture +def sample_available_metrics(): + """ + Create a sample available_metrics dictionary for testing. 
+ + Returns: + Dict with metric display_names and units + """ + return { + "request_latency": { + "display_name": "Request Latency", + "unit": "ms", + }, + "request_throughput": { + "display_name": "Request Throughput", + "unit": "req/s", + }, + "time_to_first_token": { + "display_name": "Time to First Token", + "unit": "ms", + }, + } + + +class TestBaseMultiRunHandler: + """Tests for BaseMultiRunHandler base class functionality.""" + + def test_get_metric_label_with_available_metrics( + self, mock_plot_generator, sample_available_metrics + ): + """Test label formatting with display_name and unit from available_metrics.""" + handler = BaseMultiRunHandler(mock_plot_generator) + label = handler._get_metric_label( + "request_latency", "p50", sample_available_metrics + ) + assert label == "Request Latency (p50) (ms)" + + def test_get_metric_label_with_stat_filtering( + self, mock_plot_generator, sample_available_metrics + ): + """Verify 'avg' and 'value' stats are filtered out from label.""" + handler = BaseMultiRunHandler(mock_plot_generator) + + label_avg = handler._get_metric_label( + "request_latency", "avg", sample_available_metrics + ) + assert label_avg == "Request Latency (ms)" + + label_value = handler._get_metric_label( + "request_latency", "value", sample_available_metrics + ) + assert label_value == "Request Latency (ms)" + + def test_get_metric_label_without_available_metrics(self, mock_plot_generator): + """Fallback to metric_name when not in available_metrics.""" + handler = BaseMultiRunHandler(mock_plot_generator) + label = handler._get_metric_label("unknown_metric", "p50", {}) + assert label == "unknown_metric" + + def test_get_metric_label_with_stat_inclusion( + self, mock_plot_generator, sample_available_metrics + ): + """Stats like 'p50', 'p99' are included in label.""" + handler = BaseMultiRunHandler(mock_plot_generator) + + label_p50 = handler._get_metric_label( + "request_latency", "p50", sample_available_metrics + ) + assert label_p50 == "Request Latency (p50) (ms)" + + label_p99 = handler._get_metric_label( + "request_latency", "p99", sample_available_metrics + ) + assert label_p99 == "Request Latency (p99) (ms)" + + def test_get_metric_label_without_unit(self, mock_plot_generator): + """Test label formatting when metric has no unit.""" + handler = BaseMultiRunHandler(mock_plot_generator) + available_metrics = { + "request_latency": { + "display_name": "Request Latency", + } + } + label = handler._get_metric_label("request_latency", "p50", available_metrics) + assert label == "Request Latency (p50)" + + def test_extract_experiment_types_with_valid_data( + self, mock_plot_generator, sample_multi_run_dataframe + ): + """Extract experiment types from DataFrame.""" + handler = BaseMultiRunHandler(mock_plot_generator) + experiment_types = handler._extract_experiment_types( + sample_multi_run_dataframe, "experiment_group" + ) + + assert experiment_types is not None + assert experiment_types["baseline"] == "baseline" + assert experiment_types["treatment_a"] == "treatment" + assert experiment_types["treatment_b"] == "treatment" + + def test_extract_experiment_types_without_group_by( + self, mock_plot_generator, sample_multi_run_dataframe + ): + """Returns None when no group_by specified.""" + handler = BaseMultiRunHandler(mock_plot_generator) + experiment_types = handler._extract_experiment_types( + sample_multi_run_dataframe, None + ) + assert experiment_types is None + + def test_extract_experiment_types_missing_group_by_column( + self, mock_plot_generator, 
sample_multi_run_dataframe + ): + """Returns None when group_by column not in DataFrame.""" + handler = BaseMultiRunHandler(mock_plot_generator) + experiment_types = handler._extract_experiment_types( + sample_multi_run_dataframe, "nonexistent_column" + ) + assert experiment_types is None + + def test_extract_experiment_types_without_experiment_type_column( + self, mock_plot_generator + ): + """Returns None when experiment_type column missing.""" + df = pd.DataFrame( + { + "experiment_group": ["baseline", "treatment_a"], + "request_latency": [100.0, 150.0], + } + ) + handler = BaseMultiRunHandler(mock_plot_generator) + experiment_types = handler._extract_experiment_types(df, "experiment_group") + assert experiment_types is None + + def test_extract_experiment_types_deduplication(self, mock_plot_generator): + """Takes first value when group has multiple experiment types.""" + df = pd.DataFrame( + { + "experiment_group": ["baseline", "baseline"], + "experiment_type": ["baseline", "treatment"], + } + ) + handler = BaseMultiRunHandler(mock_plot_generator) + experiment_types = handler._extract_experiment_types(df, "experiment_group") + + assert experiment_types is not None + assert experiment_types["baseline"] == "baseline" + + def test_extract_group_display_names_with_valid_data( + self, mock_plot_generator, sample_multi_run_dataframe + ): + """Extract display names from DataFrame.""" + handler = BaseMultiRunHandler(mock_plot_generator) + display_names = handler._extract_group_display_names( + sample_multi_run_dataframe, "experiment_group" + ) + + assert display_names is not None + assert display_names["baseline"] == "Baseline Model" + assert display_names["treatment_a"] == "Treatment A" + assert display_names["treatment_b"] == "Treatment B" + + def test_extract_group_display_names_without_group_by( + self, mock_plot_generator, sample_multi_run_dataframe + ): + """Returns None when no group_by specified.""" + handler = BaseMultiRunHandler(mock_plot_generator) + display_names = handler._extract_group_display_names( + sample_multi_run_dataframe, None + ) + assert display_names is None + + def test_extract_group_display_names_missing_group_by_column( + self, mock_plot_generator, sample_multi_run_dataframe + ): + """Returns None when group_by column not in DataFrame.""" + handler = BaseMultiRunHandler(mock_plot_generator) + display_names = handler._extract_group_display_names( + sample_multi_run_dataframe, "nonexistent_column" + ) + assert display_names is None + + def test_extract_group_display_names_without_column(self, mock_plot_generator): + """Returns None when group_display_name column missing.""" + df = pd.DataFrame( + { + "experiment_group": ["baseline", "treatment_a"], + "request_latency": [100.0, 150.0], + } + ) + handler = BaseMultiRunHandler(mock_plot_generator) + display_names = handler._extract_group_display_names(df, "experiment_group") + assert display_names is None + + +class TestParetoHandler: + """Tests for ParetoHandler class.""" + + def test_can_handle_with_all_columns_present( + self, mock_plot_generator, sample_multi_run_dataframe + ): + """Returns True when all metrics available in DataFrame.""" + handler = ParetoHandler(mock_plot_generator) + spec = PlotSpec( + name="test_pareto", + title="Test Pareto", + plot_type=PlotType.PARETO, + filename="test.png", + metrics=[ + MetricSpec( + name="request_latency", source=DataSource.AGGREGATED, axis="x" + ), + MetricSpec( + name="request_throughput", source=DataSource.AGGREGATED, axis="y" + ), + ], + ) + assert 
handler.can_handle(spec, sample_multi_run_dataframe) is True + + def test_can_handle_with_missing_metric( + self, mock_plot_generator, sample_multi_run_dataframe + ): + """Returns False when metric column missing from DataFrame.""" + handler = ParetoHandler(mock_plot_generator) + spec = PlotSpec( + name="test_pareto", + title="Test Pareto", + plot_type=PlotType.PARETO, + filename="test.png", + metrics=[ + MetricSpec( + name="nonexistent_metric", source=DataSource.AGGREGATED, axis="x" + ), + MetricSpec( + name="request_throughput", source=DataSource.AGGREGATED, axis="y" + ), + ], + ) + assert handler.can_handle(spec, sample_multi_run_dataframe) is False + + def test_can_handle_with_concurrency_metric( + self, mock_plot_generator, sample_multi_run_dataframe + ): + """Special handling for 'concurrency' column - always available.""" + handler = ParetoHandler(mock_plot_generator) + spec = PlotSpec( + name="test_pareto", + title="Test Pareto", + plot_type=PlotType.PARETO, + filename="test.png", + metrics=[ + MetricSpec(name="concurrency", source=DataSource.AGGREGATED, axis="x"), + MetricSpec( + name="request_throughput", source=DataSource.AGGREGATED, axis="y" + ), + ], + ) + assert handler.can_handle(spec, sample_multi_run_dataframe) is True + + def test_create_plot_with_concurrency_as_x_metric( + self, mock_plot_generator, sample_multi_run_dataframe, sample_available_metrics + ): + """Uses 'Concurrency Level' label for concurrency metric.""" + handler = ParetoHandler(mock_plot_generator) + spec = PlotSpec( + name="test_pareto", + title="Test Pareto", + plot_type=PlotType.PARETO, + filename="test.png", + metrics=[ + MetricSpec(name="concurrency", source=DataSource.AGGREGATED, axis="x"), + MetricSpec( + name="request_throughput", + source=DataSource.AGGREGATED, + axis="y", + stat="avg", + ), + ], + ) + + handler.create_plot(spec, sample_multi_run_dataframe, sample_available_metrics) + + mock_plot_generator.create_pareto_plot.assert_called_once() + call_kwargs = mock_plot_generator.create_pareto_plot.call_args[1] + assert call_kwargs["x_label"] == "Concurrency Level" + assert call_kwargs["x_metric"] == "concurrency" + + def test_create_plot_with_regular_metrics( + self, mock_plot_generator, sample_multi_run_dataframe, sample_available_metrics + ): + """Uses _get_metric_label for both axes with regular metrics.""" + handler = ParetoHandler(mock_plot_generator) + spec = PlotSpec( + name="test_pareto", + title="Test Pareto", + plot_type=PlotType.PARETO, + filename="test.png", + metrics=[ + MetricSpec( + name="request_latency", source=DataSource.AGGREGATED, axis="x" + ), + MetricSpec( + name="request_throughput", source=DataSource.AGGREGATED, axis="y" + ), + ], + ) + + handler.create_plot(spec, sample_multi_run_dataframe, sample_available_metrics) + + mock_plot_generator.create_pareto_plot.assert_called_once() + call_kwargs = mock_plot_generator.create_pareto_plot.call_args[1] + assert call_kwargs["x_label"] == f"Request Latency ({DEFAULT_PERCENTILE}) (ms)" + assert call_kwargs["y_label"] == "Request Throughput (req/s)" + + def test_create_plot_passes_experiment_types( + self, mock_plot_generator, sample_multi_run_dataframe, sample_available_metrics + ): + """Verifies experiment_types passed to plot_generator.""" + handler = ParetoHandler(mock_plot_generator) + spec = PlotSpec( + name="test_pareto", + title="Test Pareto", + plot_type=PlotType.PARETO, + filename="test.png", + metrics=[ + MetricSpec( + name="request_latency", source=DataSource.AGGREGATED, axis="x" + ), + MetricSpec( + 
name="request_throughput", source=DataSource.AGGREGATED, axis="y" + ), + ], + group_by=["experiment_group"], + ) + + handler.create_plot(spec, sample_multi_run_dataframe, sample_available_metrics) + + mock_plot_generator.create_pareto_plot.assert_called_once() + call_kwargs = mock_plot_generator.create_pareto_plot.call_args[1] + assert call_kwargs["experiment_types"] is not None + assert call_kwargs["experiment_types"]["baseline"] == "baseline" + assert call_kwargs["experiment_types"]["treatment_a"] == "treatment" + + def test_create_plot_passes_display_names( + self, mock_plot_generator, sample_multi_run_dataframe, sample_available_metrics + ): + """Verifies group_display_names passed to plot_generator.""" + handler = ParetoHandler(mock_plot_generator) + spec = PlotSpec( + name="test_pareto", + title="Test Pareto", + plot_type=PlotType.PARETO, + filename="test.png", + metrics=[ + MetricSpec( + name="request_latency", source=DataSource.AGGREGATED, axis="x" + ), + MetricSpec( + name="request_throughput", source=DataSource.AGGREGATED, axis="y" + ), + ], + group_by=["experiment_group"], + ) + + handler.create_plot(spec, sample_multi_run_dataframe, sample_available_metrics) + + mock_plot_generator.create_pareto_plot.assert_called_once() + call_kwargs = mock_plot_generator.create_pareto_plot.call_args[1] + assert call_kwargs["group_display_names"] is not None + assert call_kwargs["group_display_names"]["baseline"] == "Baseline Model" + assert call_kwargs["group_display_names"]["treatment_a"] == "Treatment A" + + +class TestScatterLineHandler: + """Tests for ScatterLineHandler class.""" + + def test_can_handle_with_all_columns_present( + self, mock_plot_generator, sample_multi_run_dataframe + ): + """Returns True when all metrics available in DataFrame.""" + handler = ScatterLineHandler(mock_plot_generator) + spec = PlotSpec( + name="test_scatter", + title="Test Scatter Line", + plot_type=PlotType.SCATTER_LINE, + filename="test.png", + metrics=[ + MetricSpec( + name="request_latency", source=DataSource.AGGREGATED, axis="x" + ), + MetricSpec( + name="request_throughput", source=DataSource.AGGREGATED, axis="y" + ), + ], + ) + assert handler.can_handle(spec, sample_multi_run_dataframe) is True + + def test_can_handle_with_missing_metric( + self, mock_plot_generator, sample_multi_run_dataframe + ): + """Returns False when metric column missing from DataFrame.""" + handler = ScatterLineHandler(mock_plot_generator) + spec = PlotSpec( + name="test_scatter", + title="Test Scatter Line", + plot_type=PlotType.SCATTER_LINE, + filename="test.png", + metrics=[ + MetricSpec( + name="nonexistent_metric", source=DataSource.AGGREGATED, axis="x" + ), + MetricSpec( + name="request_throughput", source=DataSource.AGGREGATED, axis="y" + ), + ], + ) + assert handler.can_handle(spec, sample_multi_run_dataframe) is False + + def test_can_handle_with_concurrency_metric( + self, mock_plot_generator, sample_multi_run_dataframe + ): + """Special handling for 'concurrency' column - always available.""" + handler = ScatterLineHandler(mock_plot_generator) + spec = PlotSpec( + name="test_scatter", + title="Test Scatter Line", + plot_type=PlotType.SCATTER_LINE, + filename="test.png", + metrics=[ + MetricSpec(name="concurrency", source=DataSource.AGGREGATED, axis="x"), + MetricSpec( + name="request_throughput", source=DataSource.AGGREGATED, axis="y" + ), + ], + ) + assert handler.can_handle(spec, sample_multi_run_dataframe) is True + + def test_create_plot_with_concurrency_as_x_metric( + self, mock_plot_generator, 
sample_multi_run_dataframe, sample_available_metrics + ): + """Uses 'Concurrency Level' label for concurrency metric.""" + handler = ScatterLineHandler(mock_plot_generator) + spec = PlotSpec( + name="test_scatter", + title="Test Scatter Line", + plot_type=PlotType.SCATTER_LINE, + filename="test.png", + metrics=[ + MetricSpec(name="concurrency", source=DataSource.AGGREGATED, axis="x"), + MetricSpec( + name="request_throughput", + source=DataSource.AGGREGATED, + axis="y", + stat="avg", + ), + ], + ) + + handler.create_plot(spec, sample_multi_run_dataframe, sample_available_metrics) + + mock_plot_generator.create_scatter_line_plot.assert_called_once() + call_kwargs = mock_plot_generator.create_scatter_line_plot.call_args[1] + assert call_kwargs["x_label"] == "Concurrency Level" + assert call_kwargs["x_metric"] == "concurrency" + + def test_create_plot_with_regular_metrics( + self, mock_plot_generator, sample_multi_run_dataframe, sample_available_metrics + ): + """Uses _get_metric_label for both axes with regular metrics.""" + handler = ScatterLineHandler(mock_plot_generator) + spec = PlotSpec( + name="test_scatter", + title="Test Scatter Line", + plot_type=PlotType.SCATTER_LINE, + filename="test.png", + metrics=[ + MetricSpec( + name="request_latency", source=DataSource.AGGREGATED, axis="x" + ), + MetricSpec( + name="request_throughput", source=DataSource.AGGREGATED, axis="y" + ), + ], + ) + + handler.create_plot(spec, sample_multi_run_dataframe, sample_available_metrics) + + mock_plot_generator.create_scatter_line_plot.assert_called_once() + call_kwargs = mock_plot_generator.create_scatter_line_plot.call_args[1] + assert call_kwargs["x_label"] == f"Request Latency ({DEFAULT_PERCENTILE}) (ms)" + assert call_kwargs["y_label"] == "Request Throughput (req/s)" + + def test_create_plot_passes_experiment_types( + self, mock_plot_generator, sample_multi_run_dataframe, sample_available_metrics + ): + """Verifies experiment_types passed to plot_generator.""" + handler = ScatterLineHandler(mock_plot_generator) + spec = PlotSpec( + name="test_scatter", + title="Test Scatter Line", + plot_type=PlotType.SCATTER_LINE, + filename="test.png", + metrics=[ + MetricSpec( + name="request_latency", source=DataSource.AGGREGATED, axis="x" + ), + MetricSpec( + name="request_throughput", source=DataSource.AGGREGATED, axis="y" + ), + ], + group_by=["experiment_group"], + ) + + handler.create_plot(spec, sample_multi_run_dataframe, sample_available_metrics) + + mock_plot_generator.create_scatter_line_plot.assert_called_once() + call_kwargs = mock_plot_generator.create_scatter_line_plot.call_args[1] + assert call_kwargs["experiment_types"] is not None + assert call_kwargs["experiment_types"]["baseline"] == "baseline" + assert call_kwargs["experiment_types"]["treatment_a"] == "treatment" + + def test_create_plot_passes_display_names( + self, mock_plot_generator, sample_multi_run_dataframe, sample_available_metrics + ): + """Verifies group_display_names passed to plot_generator.""" + handler = ScatterLineHandler(mock_plot_generator) + spec = PlotSpec( + name="test_scatter", + title="Test Scatter Line", + plot_type=PlotType.SCATTER_LINE, + filename="test.png", + metrics=[ + MetricSpec( + name="request_latency", source=DataSource.AGGREGATED, axis="x" + ), + MetricSpec( + name="request_throughput", source=DataSource.AGGREGATED, axis="y" + ), + ], + group_by=["experiment_group"], + ) + + handler.create_plot(spec, sample_multi_run_dataframe, sample_available_metrics) + + 
mock_plot_generator.create_scatter_line_plot.assert_called_once() + call_kwargs = mock_plot_generator.create_scatter_line_plot.call_args[1] + assert call_kwargs["group_display_names"] is not None + assert call_kwargs["group_display_names"]["baseline"] == "Baseline Model" + assert call_kwargs["group_display_names"]["treatment_a"] == "Treatment A" diff --git a/tests/unit/plot/test_plot_config.py b/tests/unit/plot/test_plot_config.py new file mode 100644 index 000000000..8782beab2 --- /dev/null +++ b/tests/unit/plot/test_plot_config.py @@ -0,0 +1,1076 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +""" +Tests for PlotConfig class. + +Tests YAML configuration loading, validation, and conversion to PlotSpec objects. +""" + +from pathlib import Path + +import pytest + +from aiperf.plot.config import PlotConfig +from aiperf.plot.core.plot_specs import ( + DataSource, + MetricSpec, + PlotSpec, + PlotType, + TimeSlicePlotSpec, +) + + +class TestPlotConfigLoading: + """Tests for PlotConfig loading and priority.""" + + def test_auto_create_user_config(self, tmp_path, monkeypatch): + """Test that user config is auto-created on first access if it doesn't exist.""" + # Create fake home directory + fake_home = tmp_path / "home" + fake_home.mkdir() + monkeypatch.setattr(Path, "home", lambda: fake_home) + + # User config should not exist yet + user_config = fake_home / ".aiperf" / "plot_config.yaml" + assert not user_config.exists() + + # Instantiate PlotConfig - should auto-create user config + config = PlotConfig() + + # Verify user config was created + assert user_config.exists() + assert config.resolved_path == user_config + assert config.config is not None + assert "visualization" in config.config + + def test_load_custom_config(self, tmp_path): + """Test loading from a custom config path.""" + custom_config = tmp_path / "custom_config.yaml" + custom_config.write_text( + """ +visualization: + multi_run: + - name: test_plot + plot_type: scatter + metrics: + - name: x_metric + source: aggregated + axis: x + - name: y_metric + source: aggregated + axis: y + single_run: [] +""" + ) + + config = PlotConfig(custom_config) + + assert config.resolved_path == custom_config + assert config.config["visualization"]["multi_run"][0]["name"] == "test_plot" + + def test_load_user_config(self, tmp_path, monkeypatch): + """Test loading from user home config (~/.aiperf/plot_config.yaml).""" + # Create fake home directory + fake_home = tmp_path / "home" + fake_home.mkdir() + monkeypatch.setenv("HOME", str(fake_home)) + + # Create user config + user_config_dir = fake_home / ".aiperf" + user_config_dir.mkdir() + user_config = user_config_dir / "plot_config.yaml" + user_config.write_text( + """ +visualization: + multi_run: + - name: user_plot + plot_type: pareto + metrics: + - name: x + source: aggregated + axis: x + - name: y + source: aggregated + axis: y + single_run: [] +""" + ) + + # Monkeypatch Path.home() to return fake_home + monkeypatch.setattr(Path, "home", lambda: fake_home) + + config = PlotConfig() + + assert config.resolved_path == user_config + assert config.config["visualization"]["multi_run"][0]["name"] == "user_plot" + + def test_config_priority(self, tmp_path, monkeypatch): + """Test that CLI config takes priority over user config.""" + # Create fake home + fake_home = tmp_path / "home" + fake_home.mkdir() + + # Create user config + user_config_dir = fake_home / ".aiperf" + user_config_dir.mkdir() + 
user_config = user_config_dir / "plot_config.yaml" + user_config.write_text( + """ +visualization: + multi_run: + - name: user_plot + plot_type: scatter + metrics: + - name: x + source: aggregated + axis: x + - name: y + source: aggregated + axis: y + single_run: [] +""" + ) + + # Create CLI config + cli_config = tmp_path / "cli_config.yaml" + cli_config.write_text( + """ +visualization: + multi_run: + - name: cli_plot + plot_type: pareto + metrics: + - name: x + source: aggregated + axis: x + - name: y + source: aggregated + axis: y + single_run: [] +""" + ) + + # Monkeypatch Path.home() + monkeypatch.setattr(Path, "home", lambda: fake_home) + + # CLI config should take priority + config = PlotConfig(cli_config) + + assert config.resolved_path == cli_config + assert config.config["visualization"]["multi_run"][0]["name"] == "cli_plot" + + def test_missing_config_file(self, tmp_path): + """Test error when custom config file doesn't exist.""" + missing_config = tmp_path / "missing.yaml" + + with pytest.raises(FileNotFoundError, match="Configuration file not found"): + PlotConfig(missing_config) + + def test_invalid_yaml_syntax(self, tmp_path): + """Test error handling for invalid YAML syntax.""" + invalid_config = tmp_path / "invalid.yaml" + invalid_config.write_text("{ invalid: yaml: syntax") + + with pytest.raises(ValueError, match="Failed to load YAML config"): + PlotConfig(invalid_config) + + def test_missing_visualization_key(self, tmp_path): + """Test error when YAML is missing 'visualization' top-level key.""" + config_file = tmp_path / "config.yaml" + config_file.write_text("other_key: value") + + with pytest.raises(ValueError, match="missing 'visualization' top-level key"): + PlotConfig(config_file) + + +class TestPlotSpecConversion: + """Tests for converting YAML to PlotSpec objects.""" + + def test_get_multi_run_plot_specs(self, tmp_path): + """Test getting multi-run plot specs from config.""" + config_file = tmp_path / "config.yaml" + config_file.write_text( + """ +visualization: + multi_run_defaults: + - test_plot + multi_run_plots: + test_plot: + type: scatter_line + x: request_latency_p50 + y: request_throughput_avg + title: "Test Plot" + labels: [concurrency] + groups: [model] + single_run_defaults: [] + single_run_plots: {} +""" + ) + + config = PlotConfig(config_file) + specs = config.get_multi_run_plot_specs() + + assert len(specs) == 1 + assert isinstance(specs[0], PlotSpec) + assert specs[0].name == "test_plot" + assert specs[0].plot_type == PlotType.SCATTER_LINE + assert specs[0].title == "Test Plot" + assert specs[0].filename == "test_plot.png" + assert specs[0].label_by == "concurrency" + assert specs[0].group_by == "model" + + # Check metrics + assert len(specs[0].metrics) == 2 + assert isinstance(specs[0].metrics[0], MetricSpec) + assert specs[0].metrics[0].name == "request_latency" + assert specs[0].metrics[0].source == DataSource.AGGREGATED + assert specs[0].metrics[0].axis == "x" + assert specs[0].metrics[0].stat == "p50" + + def test_get_single_run_plot_specs(self, tmp_path): + """Test getting single-run plot specs from config.""" + config_file = tmp_path / "config.yaml" + config_file.write_text( + """ +visualization: + multi_run_defaults: [] + multi_run_plots: {} + single_run_defaults: + - ttft_plot + single_run_plots: + ttft_plot: + type: scatter + x: request_number + y: time_to_first_token + title: "TTFT Over Time" +""" + ) + + config = PlotConfig(config_file) + specs = config.get_single_run_plot_specs() + + assert len(specs) == 1 + assert 
isinstance(specs[0], PlotSpec) + assert specs[0].name == "ttft_plot" + assert specs[0].plot_type == PlotType.SCATTER + assert len(specs[0].metrics) == 2 + assert specs[0].metrics[0].source == DataSource.REQUESTS + + def test_timeslice_plot_spec_conversion(self, tmp_path): + """Test conversion of TimeSlicePlotSpec with use_slice_duration field.""" + config_file = tmp_path / "config.yaml" + config_file.write_text( + """ +visualization: + multi_run_defaults: [] + multi_run_plots: {} + single_run_defaults: + - timeslice_plot + single_run_plots: + timeslice_plot: + type: histogram + x: Timeslice + y: Time to First Token + stat: avg + source: timeslices + use_slice_duration: true +""" + ) + + config = PlotConfig(config_file) + specs = config.get_single_run_plot_specs() + + assert len(specs) == 1 + assert isinstance(specs[0], TimeSlicePlotSpec) + assert specs[0].use_slice_duration is True + + def test_multiple_plot_specs(self, tmp_path): + """Test loading multiple plot specs.""" + config_file = tmp_path / "config.yaml" + config_file.write_text( + """ +visualization: + multi_run_defaults: + - plot1 + - plot2 + multi_run_plots: + plot1: + type: scatter + x: request_latency_p50 + y: request_throughput_avg + plot2: + type: pareto + x: request_latency_avg + y: output_token_throughput_per_gpu_avg + single_run_defaults: [] + single_run_plots: {} +""" + ) + + config = PlotConfig(config_file) + specs = config.get_multi_run_plot_specs() + + assert len(specs) == 2 + assert specs[0].name == "plot1" + assert specs[1].name == "plot2" + + def test_missing_required_field(self, tmp_path): + """Test error when required field is missing in plot spec.""" + config_file = tmp_path / "config.yaml" + config_file.write_text( + """ +visualization: + multi_run_defaults: + - incomplete_plot + multi_run_plots: + incomplete_plot: + # Missing type field + x: request_latency_p50 + y: request_throughput_avg + single_run_defaults: [] + single_run_plots: {} +""" + ) + + config = PlotConfig(config_file) + + with pytest.raises( + ValueError, + match="Config validation failed for multi_run plot 'incomplete_plot'", + ): + config.get_multi_run_plot_specs() + + def test_invalid_enum_value(self, tmp_path): + """Test error when enum value is invalid.""" + config_file = tmp_path / "config.yaml" + config_file.write_text( + """ +visualization: + multi_run_defaults: + - bad_plot + multi_run_plots: + bad_plot: + type: invalid_type + x: request_latency_p50 + y: request_throughput_avg + single_run_defaults: [] + single_run_plots: {} +""" + ) + + config = PlotConfig(config_file) + + with pytest.raises(ValueError): + config.get_multi_run_plot_specs() + + def test_empty_multi_run_list(self, tmp_path): + """Test handling of empty multi_run defaults.""" + config_file = tmp_path / "config.yaml" + config_file.write_text( + """ +visualization: + multi_run_defaults: [] + multi_run_plots: {} + single_run_defaults: [] + single_run_plots: {} +""" + ) + + config = PlotConfig(config_file) + specs = config.get_multi_run_plot_specs() + + assert specs == [] + + def test_default_config_structure(self, tmp_path, monkeypatch): + """Test that default config has expected structure and valid specs.""" + # Create fake home directory for auto-creation + fake_home = tmp_path / "home" + fake_home.mkdir() + monkeypatch.setattr(Path, "home", lambda: fake_home) + + config = PlotConfig() + + # Test multi-run specs + multi_run_specs = config.get_multi_run_plot_specs() + assert len(multi_run_specs) > 0 + for spec in multi_run_specs: + assert isinstance(spec, PlotSpec) + assert 
spec.name + assert spec.plot_type + assert len(spec.metrics) > 0 + + # Test single-run specs + single_run_specs = config.get_single_run_plot_specs() + assert len(single_run_specs) > 0 + for spec in single_run_specs: + assert isinstance(spec, PlotSpec) + assert spec.name + assert spec.plot_type + assert len(spec.metrics) > 0 + + +class TestExperimentClassificationOverride: + """Tests for automatic groups override when experiment classification is enabled.""" + + def test_classification_enabled_overrides_groups_to_experiment_type( + self, tmp_path: Path + ) -> None: + """Test that when classification is enabled, groups are overridden to experiment_group.""" + config_file = tmp_path / "config.yaml" + config_file.write_text( + """ +experiment_classification: + baselines: + - "*baseline*" + treatments: + - "*treatment*" + default: treatment + +visualization: + multi_run_defaults: + - test_plot + multi_run_plots: + test_plot: + type: scatter_line + x: request_latency_avg + y: request_throughput_avg + groups: [model] + single_run_defaults: [] + single_run_plots: {} +""" + ) + + config = PlotConfig(config_file) + specs = config.get_multi_run_plot_specs() + + assert len(specs) == 1 + # Should override to experiment_group + assert specs[0].group_by == "experiment_group" + + def test_classification_disabled_respects_original_groups( + self, tmp_path: Path + ) -> None: + """Test that without classification, original groups setting is used.""" + config_file = tmp_path / "config.yaml" + config_file.write_text( + """ +# experiment_classification commented out (disabled) + +visualization: + multi_run_defaults: + - test_plot + multi_run_plots: + test_plot: + type: scatter_line + x: request_latency_avg + y: request_throughput_avg + groups: [model] + single_run_defaults: [] + single_run_plots: {} +""" + ) + + config = PlotConfig(config_file) + specs = config.get_multi_run_plot_specs() + + assert len(specs) == 1 + # Should keep original groups setting + assert specs[0].group_by == "model" + + def test_classification_overrides_all_group_types(self, tmp_path: Path) -> None: + """Test that classification overrides groups regardless of original value.""" + config_file = tmp_path / "config.yaml" + config_file.write_text( + """ +experiment_classification: + baselines: + - "*baseline*" + treatments: + - "*treatment*" + default: treatment + +visualization: + multi_run_defaults: + - plot1 + - plot2 + - plot3 + multi_run_plots: + plot1: + type: scatter_line + x: request_latency_avg + y: request_throughput_avg + groups: [model] + plot2: + type: scatter_line + x: request_latency_avg + y: request_throughput_avg + groups: [run_name] + plot3: + type: scatter_line + x: request_latency_avg + y: request_throughput_avg + groups: [concurrency] + single_run_defaults: [] + single_run_plots: {} +""" + ) + + config = PlotConfig(config_file) + specs = config.get_multi_run_plot_specs() + + assert len(specs) == 3 + # All should be overridden to experiment_group + for spec in specs: + assert spec.group_by == "experiment_group" + + def test_classification_with_nested_directory_structure( + self, tmp_path: Path + ) -> None: + """Test that classification works with nested baseline/treatment directories.""" + config_file = tmp_path / "config.yaml" + config_file.write_text( + """ +experiment_classification: + baselines: + - "*baseline*" + treatments: + - "*treatment*" + default: treatment + +visualization: + multi_run_defaults: + - test_plot + multi_run_plots: + test_plot: + type: scatter_line + x: request_latency_avg + y: 
request_throughput_avg + groups: [model] + single_run_defaults: [] + single_run_plots: {} +""" + ) + + config = PlotConfig(config_file) + specs = config.get_multi_run_plot_specs() + + # Verify override happens + assert specs[0].group_by == "experiment_group" + + # Verify classification config is accessible + classification = config.get_experiment_classification_config() + assert classification is not None + assert "*baseline*" in classification.baselines + assert "*treatment*" in classification.treatments + + +class TestPlotSpecDetails: + """Tests for detailed PlotSpec field handling.""" + + def test_optional_fields(self, tmp_path): + """Test that optional fields are properly handled.""" + config_file = tmp_path / "config.yaml" + config_file.write_text( + """ +visualization: + multi_run_defaults: + - minimal_plot + multi_run_plots: + minimal_plot: + type: scatter + x: request_latency_p50 + y: request_throughput_avg + # Only required fields, no optional ones + single_run_defaults: [] + single_run_plots: {} +""" + ) + + config = PlotConfig(config_file) + specs = config.get_multi_run_plot_specs() + + assert specs[0].title is None + assert specs[0].filename == "minimal_plot.png" # Auto-generated + assert specs[0].label_by is None + assert specs[0].group_by == "run_name" # Smart default when groups omitted + + def test_dual_axis_plot_spec(self, tmp_path): + """Test dual-axis plot spec with y2 axis.""" + config_file = tmp_path / "config.yaml" + config_file.write_text( + """ +visualization: + multi_run_defaults: [] + multi_run_plots: {} + single_run_defaults: + - dual_axis_plot + single_run_plots: + dual_axis_plot: + type: dual_axis + x: timestamp_s + y: throughput_tokens_per_sec + y2: gpu_utilization + primary_style: + mode: lines + line_shape: hv + secondary_style: + mode: lines + fill: tozeroy + supplementary_col: active_requests +""" + ) + + config = PlotConfig(config_file) + specs = config.get_single_run_plot_specs() + + assert len(specs) == 1 + assert specs[0].plot_type == PlotType.DUAL_AXIS + assert len(specs[0].metrics) == 3 + assert specs[0].metrics[2].axis == "y2" + assert specs[0].primary_style.mode == "lines" + assert specs[0].primary_style.line_shape == "hv" + assert specs[0].secondary_style.fill == "tozeroy" + assert specs[0].supplementary_col == "active_requests" + + +class TestMultiValueGrouping: + """Tests for multi-value label_by and group_by support.""" + + def test_single_value_list_label_by(self, tmp_path): + """Test single value in list format for label_by is converted to string.""" + config_file = tmp_path / "config.yaml" + config_file.write_text( + """ +visualization: + multi_run_defaults: + - test_plot + multi_run_plots: + test_plot: + type: scatter_line + x: request_latency_p50 + y: request_throughput_avg + labels: [concurrency] + single_run_defaults: [] + single_run_plots: {} +""" + ) + + config = PlotConfig(config_file) + specs = config.get_multi_run_plot_specs() + + assert len(specs) == 1 + assert specs[0].label_by == "concurrency" + + def test_single_value_list_group_by(self, tmp_path): + """Test single value in list format for group_by is converted to string.""" + config_file = tmp_path / "config.yaml" + config_file.write_text( + """ +visualization: + multi_run_defaults: + - test_plot + multi_run_plots: + test_plot: + type: pareto + x: request_latency_avg + y: output_token_throughput_per_gpu_avg + groups: [model] + single_run_defaults: [] + single_run_plots: {} +""" + ) + + config = PlotConfig(config_file) + specs = config.get_multi_run_plot_specs() + + assert 
len(specs) == 1 + assert specs[0].group_by == "model" + + def test_multi_element_list_raises_error(self, tmp_path): + """Test that multi-element lists in groups raise validation error.""" + config_file = tmp_path / "config.yaml" + config_file.write_text( + """ +visualization: + multi_run_defaults: + - test_plot + multi_run_plots: + test_plot: + type: pareto + x: request_latency_avg + y: request_throughput_avg + groups: [model, endpoint_type] + labels: [concurrency] + single_run_defaults: [] + single_run_plots: {} +""" + ) + + config = PlotConfig(config_file) + with pytest.raises(ValueError, match="Config validation failed"): + config.get_multi_run_plot_specs() + + def test_omitted_fields_for_auto_detection(self, tmp_path): + """Test that omitted label_by and group_by default to None for auto-detection.""" + config_file = tmp_path / "config.yaml" + config_file.write_text( + """ +visualization: + multi_run_defaults: + - test_plot + multi_run_plots: + test_plot: + type: pareto + x: request_latency_avg + y: output_token_throughput_per_gpu_avg + # label_by and group_by omitted for auto-detection + single_run_defaults: [] + single_run_plots: {} +""" + ) + + config = PlotConfig(config_file) + specs = config.get_multi_run_plot_specs() + + assert len(specs) == 1 + assert specs[0].label_by is None + assert specs[0].group_by == "run_name" # Smart default when groups omitted + + def test_explicit_null_for_auto_detection(self, tmp_path): + """Test explicit null/None values for auto-detection.""" + config_file = tmp_path / "config.yaml" + config_file.write_text( + """ +visualization: + multi_run_defaults: + - test_plot + multi_run_plots: + test_plot: + type: scatter_line + x: request_latency_p50 + y: request_throughput_avg + labels: null + groups: null + single_run_defaults: [] + single_run_plots: {} +""" + ) + + config = PlotConfig(config_file) + specs = config.get_multi_run_plot_specs() + + assert len(specs) == 1 + assert specs[0].label_by is None + assert specs[0].group_by == "run_name" # Smart default when groups omitted + + +class TestDynamicMetricShortcuts: + """Tests for dynamic metric shortcut resolution.""" + + def test_aggregated_metric_with_avg(self, tmp_path): + """Test dynamic resolution of aggregated metric with avg stat.""" + config_file = tmp_path / "config.yaml" + config_file.write_text( + """ +visualization: + multi_run_defaults: + - test_plot + multi_run_plots: + test_plot: + type: scatter_line + x: time_to_first_token_avg + y: request_throughput_avg + single_run_defaults: [] + single_run_plots: {} +""" + ) + + config = PlotConfig(config_file) + specs = config.get_multi_run_plot_specs() + + assert len(specs) == 1 + assert specs[0].metrics[0].name == "time_to_first_token" + assert specs[0].metrics[0].stat == "avg" + assert specs[0].metrics[0].source == DataSource.AGGREGATED + + def test_aggregated_metric_with_percentiles(self, tmp_path): + """Test dynamic resolution with different percentiles.""" + config_file = tmp_path / "config.yaml" + config_file.write_text( + """ +visualization: + multi_run_defaults: + - p50_plot + - p90_plot + - p99_plot + multi_run_plots: + p50_plot: + type: scatter + x: request_latency_p50 + y: request_throughput_avg + p90_plot: + type: scatter + x: request_latency_p90 + y: request_throughput_avg + p99_plot: + type: scatter + x: request_latency_p99 + y: request_throughput_avg + single_run_defaults: [] + single_run_plots: {} +""" + ) + + config = PlotConfig(config_file) + specs = config.get_multi_run_plot_specs() + + assert len(specs) == 3 + assert 
specs[0].metrics[0].stat == "p50" + assert specs[1].metrics[0].stat == "p90" + assert specs[2].metrics[0].stat == "p99" + + def test_all_stat_types(self, tmp_path): + """Test that all stat types work: avg, min, max, std, p1-p99.""" + config_file = tmp_path / "config.yaml" + config_file.write_text( + """ +visualization: + multi_run_defaults: + - test_avg + - test_min + - test_max + - test_std + - test_p1 + - test_p95 + multi_run_plots: + test_avg: + type: scatter + x: request_latency_avg + y: request_throughput_avg + test_min: + type: scatter + x: request_latency_min + y: request_throughput_avg + test_max: + type: scatter + x: request_latency_max + y: request_throughput_avg + test_std: + type: scatter + x: request_latency_std + y: request_throughput_avg + test_p1: + type: scatter + x: request_latency_p1 + y: request_throughput_avg + test_p95: + type: scatter + x: request_latency_p95 + y: request_throughput_avg + single_run_defaults: [] + single_run_plots: {} +""" + ) + + config = PlotConfig(config_file) + specs = config.get_multi_run_plot_specs() + + assert len(specs) == 6 + assert specs[0].metrics[0].stat == "avg" + assert specs[1].metrics[0].stat == "min" + assert specs[2].metrics[0].stat == "max" + assert specs[3].metrics[0].stat == "std" + assert specs[4].metrics[0].stat == "p1" + assert specs[5].metrics[0].stat == "p95" + + def test_request_metrics_without_stat(self, tmp_path): + """Test request metrics without stat suffix.""" + config_file = tmp_path / "config.yaml" + config_file.write_text( + """ +visualization: + multi_run_defaults: [] + multi_run_plots: {} + single_run_defaults: + - test_plot + single_run_plots: + test_plot: + type: scatter + x: request_number + y: time_to_first_token +""" + ) + + config = PlotConfig(config_file) + specs = config.get_single_run_plot_specs() + + assert len(specs) == 1 + assert specs[0].metrics[0].name == "request_number" + assert specs[0].metrics[0].stat is None + assert specs[0].metrics[0].source == DataSource.REQUESTS + assert specs[0].metrics[1].name == "time_to_first_token" + assert specs[0].metrics[1].stat is None + + def test_invalid_metric_name(self, tmp_path): + """Test that invalid metric names raise helpful errors.""" + config_file = tmp_path / "config.yaml" + config_file.write_text( + """ +visualization: + multi_run_defaults: + - bad_plot + multi_run_plots: + bad_plot: + type: scatter + x: nonexistent_metric_avg + y: request_throughput_avg + single_run_defaults: [] + single_run_plots: {} +""" + ) + + config = PlotConfig(config_file) + + with pytest.raises( + ValueError, match="Config validation failed for multi_run plot 'bad_plot'" + ): + config.get_multi_run_plot_specs() + + def test_invalid_stat_type(self, tmp_path): + """Test that invalid stat types raise helpful errors.""" + config_file = tmp_path / "config.yaml" + config_file.write_text( + """ +visualization: + multi_run_defaults: + - bad_plot + multi_run_plots: + bad_plot: + type: scatter + x: request_latency_p999 + y: request_throughput_avg + single_run_defaults: [] + single_run_plots: {} +""" + ) + + config = PlotConfig(config_file) + + with pytest.raises( + ValueError, match="Config validation failed for multi_run plot 'bad_plot'" + ): + config.get_multi_run_plot_specs() + + +class TestMetricNameValidation: + """Tests for _parse_and_validate_metric_name function.""" + + def test_parse_metric_with_avg_stat(self): + """Test parsing metric with avg stat suffix.""" + from aiperf.plot.config import _parse_and_validate_metric_name + + base, stat = 
_parse_and_validate_metric_name("request_latency_avg") + assert base == "request_latency" + assert stat == "avg" + + def test_parse_metric_with_percentile_stats(self): + """Test parsing metrics with valid percentile suffixes.""" + from aiperf.plot.config import _parse_and_validate_metric_name + + for p in [1, 5, 10, 25, 50, 75, 90, 95, 99]: + base, stat = _parse_and_validate_metric_name(f"metric_p{p}") + assert base == "metric" + assert stat == f"p{p}" + + def test_parse_metric_with_all_basic_stats(self): + """Test parsing metrics with min, max, std stats.""" + from aiperf.plot.config import _parse_and_validate_metric_name + + for stat_type in ["min", "max", "std"]: + base, stat = _parse_and_validate_metric_name(f"latency_{stat_type}") + assert base == "latency" + assert stat == stat_type + + def test_parse_metric_without_stat(self): + """Test parsing simple metric name without stat suffix.""" + from aiperf.plot.config import _parse_and_validate_metric_name + + base, stat = _parse_and_validate_metric_name("request_number") + assert base == "request_number" + assert stat is None + + def test_invalid_percentile_p100_raises_error(self): + """Test that p100 raises an error with suggestions.""" + from aiperf.plot.config import _parse_and_validate_metric_name + + with pytest.raises(ValueError) as exc_info: + _parse_and_validate_metric_name("metric_p100") + + error_msg = str(exc_info.value) + assert "Invalid stat suffix 'p100'" in error_msg + assert "Valid stat suffixes are:" in error_msg + assert "p99" in error_msg + + def test_invalid_percentile_p999_raises_error(self): + """Test that p999 raises an error.""" + from aiperf.plot.config import _parse_and_validate_metric_name + + with pytest.raises(ValueError) as exc_info: + _parse_and_validate_metric_name("metric_p999") + + assert "Invalid stat suffix 'p999'" in str(exc_info.value) + + def test_metric_with_underscore_in_name(self): + """Test metric with underscores in base name.""" + from aiperf.plot.config import _parse_and_validate_metric_name + + base, stat = _parse_and_validate_metric_name("time_to_first_token_p50") + assert base == "time_to_first_token" + assert stat == "p50" + + def test_invalid_percentile_p42_raises_error(self): + """Test that p42 raises an error with suggestions.""" + from aiperf.plot.config import _parse_and_validate_metric_name + + with pytest.raises(ValueError) as exc_info: + _parse_and_validate_metric_name("metric_p42") + + error_msg = str(exc_info.value) + assert "Invalid stat suffix 'p42'" in error_msg + + def test_invalid_stat_suffix_raises_helpful_error(self): + """Test that invalid stat suffixes like p67 raise helpful errors.""" + from aiperf.plot.config import _parse_and_validate_metric_name + + with pytest.raises(ValueError) as exc_info: + _parse_and_validate_metric_name("latency_p67") + + error_msg = str(exc_info.value) + assert "Invalid stat suffix 'p67'" in error_msg + assert "Valid stat suffixes are:" in error_msg + assert "p50" in error_msg + assert "Did you mean" in error_msg + assert "latency_p75" in error_msg or "latency_p50" in error_msg + + def test_fuzzy_matching_suggests_close_percentiles(self): + """Test that fuzzy matching suggests numerically close percentiles.""" + from aiperf.plot.config import _parse_and_validate_metric_name + + with pytest.raises(ValueError) as exc_info: + _parse_and_validate_metric_name("latency_p92") + + error_msg = str(exc_info.value) + assert "p90" in error_msg or "p95" in error_msg + + def test_valid_percentiles_do_not_raise(self): + """Test that valid percentiles don't 
raise errors.""" + from aiperf.plot.config import _parse_and_validate_metric_name + + base, stat = _parse_and_validate_metric_name("metric_p50") + assert base == "metric" + assert stat == "p50" + + base, stat = _parse_and_validate_metric_name("metric_p95") + assert base == "metric" + assert stat == "p95" diff --git a/tests/unit/plot/test_plot_controller.py b/tests/unit/plot/test_plot_controller.py index dfad16923..99d99fcc5 100644 --- a/tests/unit/plot/test_plot_controller.py +++ b/tests/unit/plot/test_plot_controller.py @@ -273,9 +273,9 @@ def mock_load_run(run_dir, **kwargs): result = controller._export_multi_run_plots(multiple_run_dirs) - # Verify warning was printed + # Verify warning was logged captured = capsys.readouterr() - assert "Warning: Failed to load run" in captured.out + assert "Failed to load run" in captured.out # Verify export was still called with successful runs assert result == [tmp_path / "plot1.png"] diff --git a/tests/unit/plot/test_plot_generator.py b/tests/unit/plot/test_plot_generator.py index dea9dcf57..adb6cbc51 100644 --- a/tests/unit/plot/test_plot_generator.py +++ b/tests/unit/plot/test_plot_generator.py @@ -8,6 +8,8 @@ type is created correctly with proper styling and data handling. """ +from unittest import mock + import numpy as np import pandas as pd import plotly.graph_objects as go @@ -17,6 +19,7 @@ DARK_THEME_COLORS, NVIDIA_CARD_BG, NVIDIA_DARK_BG, + NVIDIA_GRAY, NVIDIA_GREEN, NVIDIA_TEXT_LIGHT, NVIDIA_WHITE, @@ -113,9 +116,13 @@ def test_create_pareto_plot_custom_labels(self, plot_generator, multi_run_df): ) # Verify custom labels are used - assert fig.layout.title.text == title - assert fig.layout.xaxis.title.text == x_label - assert fig.layout.yaxis.title.text == y_label + # Custom titles now include optimal direction subtitle + assert fig.layout.title.text.startswith(title) + assert "
" in fig.layout.title.text + assert "Optimal:" in fig.layout.title.text + # Axis labels now include directional arrows + assert fig.layout.xaxis.title.text.startswith(x_label) + assert fig.layout.yaxis.title.text.startswith(y_label) def test_create_pareto_plot_no_grouping(self, plot_generator): """Test Pareto plot without grouping.""" @@ -354,7 +361,7 @@ def test_dynamic_model_color_assignment(self, plot_generator): assert len(fig.data) > 0 # Test that groups are registered in the color registry - groups, color_map = plot_generator._prepare_groups(df, "model") + groups, color_map, display_names = plot_generator._prepare_groups(df, "model") # Verify all models get colors assert len(color_map) == 3 @@ -375,10 +382,10 @@ def test_dynamic_model_color_assignment(self, plot_generator): def test_color_consistency_across_models(self, plot_generator): """Test that same model gets same color across different calls.""" df1 = pd.DataFrame({"model": ["ModelX", "ModelY", "ModelZ"]}) - groups1, colors1 = plot_generator._prepare_groups(df1, "model") + groups1, colors1, display_names1 = plot_generator._prepare_groups(df1, "model") df2 = pd.DataFrame({"model": ["ModelX", "ModelY", "ModelZ"]}) - groups2, colors2 = plot_generator._prepare_groups(df2, "model") + groups2, colors2, display_names2 = plot_generator._prepare_groups(df2, "model") # Same models should get same colors across calls assert colors1 == colors2 @@ -392,7 +399,7 @@ def test_color_assignment_with_many_models(self, plot_generator): # Create more models than available colors in the pool (default 10) model_names = [f"Model{i}" for i in range(15)] df = pd.DataFrame({"model": model_names}) - groups, color_map = plot_generator._prepare_groups(df, "model") + groups, color_map, display_names = plot_generator._prepare_groups(df, "model") # All models should get a color assert len(color_map) == 15 @@ -934,9 +941,12 @@ def test_detect_outliers_throughput_vs_latency(self): latency_values, "time_to_first_token", run_avg, run_std ) # Only 130.0 is outlier (above upper bound 110.0) - assert latency_outliers[2] == True - assert latency_outliers[0] == False - assert latency_outliers[1] == False + is_95_outlier = latency_outliers[0] + is_105_outlier = latency_outliers[1] + is_130_outlier = latency_outliers[2] + assert not is_95_outlier + assert not is_105_outlier + assert is_130_outlier # Throughput metrics: low values are bad throughput_values = np.array([105.0, 95.0, 70.0]) @@ -944,9 +954,12 @@ def test_detect_outliers_throughput_vs_latency(self): throughput_values, "request_throughput", run_avg, run_std ) # Only 70.0 is outlier (below lower bound 90.0) - assert throughput_outliers[2] == True - assert throughput_outliers[0] == False - assert throughput_outliers[1] == False + is_105_throughput_outlier = throughput_outliers[0] + is_95_throughput_outlier = throughput_outliers[1] + is_70_throughput_outlier = throughput_outliers[2] + assert not is_105_throughput_outlier + assert not is_95_throughput_outlier + assert is_70_throughput_outlier def test_detect_outliers_with_slice_stds(self): """Test outlier detection incorporates per-slice standard deviations.""" @@ -968,7 +981,7 @@ def test_detect_outliers_with_slice_stds(self): outliers_no_slice = detect_directional_outliers( values, "time_to_first_token", run_avg, run_std, slice_stds=None ) - assert outliers_no_slice[3] == True + assert outliers_no_slice[3] def test_detect_outliers_mismatched_slice_stds_length(self): """Test slice_stds length mismatch defaults to zeros.""" @@ -982,8 +995,8 @@ def 
test_detect_outliers_mismatched_slice_stds_length(self): ) # Should use zeros for slice_stds (upper bound = 80) - assert outliers[3] == True - assert outliers[0] == False + assert outliers[3] + assert not outliers[0] class TestDarkTheme: @@ -1076,7 +1089,7 @@ def test_prepare_groups_none_group_by(self): plot_gen = PlotGenerator() df = pd.DataFrame({"model": ["a", "b", "c"]}) - groups, color_map = plot_gen._prepare_groups(df, group_by=None) + groups, color_map, display_names = plot_gen._prepare_groups(df, group_by=None) # Should return [None] groups and empty color_map assert groups == [None] @@ -1089,7 +1102,7 @@ def test_color_cycling_more_groups_than_pool(self): model_names = [f"model-{i:02d}" for i in range(12)] df = pd.DataFrame({"model": model_names}) - groups, color_map = plot_gen._prepare_groups(df, "model") + groups, color_map, display_names = plot_gen._prepare_groups(df, "model") # All models should get a color assert len(color_map) == 12 @@ -1120,8 +1133,689 @@ def test_prepare_groups_missing_column_returns_no_groups(self): df = pd.DataFrame({"model": ["a", "b", "c"]}) # Try to group by non-existent column - groups, color_map = plot_gen._prepare_groups(df, "nonexistent_column") + groups, color_map, display_names = plot_gen._prepare_groups( + df, "nonexistent_column" + ) # Should return [None] and empty color_map (no grouping) assert groups == [None] assert color_map == {} + + +class TestGetNvidiaColorScheme: + """Tests for get_nvidia_color_scheme function.""" + + def test_brand_colors_less_than_requested(self): + """Returns only NVIDIA colors when n_colors <= 2.""" + from aiperf.plot.core.plot_generator import get_nvidia_color_scheme + + colors = get_nvidia_color_scheme(n_colors=1, use_brand_colors=True) + assert len(colors) == 1 + assert colors[0] == NVIDIA_GREEN + + colors = get_nvidia_color_scheme(n_colors=2, use_brand_colors=True) + assert len(colors) == 2 + assert colors[0] == NVIDIA_GREEN + + def test_brand_colors_with_palette_expansion(self): + """Adds seaborn colors when n_colors > 2.""" + from aiperf.plot.core.plot_generator import get_nvidia_color_scheme + + colors = get_nvidia_color_scheme(n_colors=5, use_brand_colors=True) + assert len(colors) == 5 + assert colors[0] == NVIDIA_GREEN + # Remaining colors should be from seaborn palette + for color in colors[2:]: + assert color.startswith("#") + + def test_without_brand_colors(self): + """Uses only seaborn palette when use_brand_colors=False.""" + from aiperf.plot.core.plot_generator import get_nvidia_color_scheme + + colors = get_nvidia_color_scheme(n_colors=5, use_brand_colors=False) + assert len(colors) == 5 + # Should not start with NVIDIA brand colors + assert colors[0] != NVIDIA_GREEN + # All should be valid hex colors + for color in colors: + assert color.startswith("#") + + def test_bright_palette_with_brand_colors(self): + """Verifies bright palette used with brand colors.""" + from aiperf.plot.core.plot_generator import get_nvidia_color_scheme + + colors = get_nvidia_color_scheme( + n_colors=5, palette_name="bright", use_brand_colors=True + ) + assert len(colors) == 5 + assert colors[0] == NVIDIA_GREEN + + def test_deep_palette_without_brand_colors(self): + """Verifies deep palette used without brand colors.""" + from aiperf.plot.core.plot_generator import get_nvidia_color_scheme + + colors = get_nvidia_color_scheme( + n_colors=5, palette_name="deep", use_brand_colors=False + ) + assert len(colors) == 5 + for color in colors: + assert color.startswith("#") + + def test_color_pool_cycling(self): + """Verifies 
colors are valid hex strings.""" + from aiperf.plot.core.plot_generator import get_nvidia_color_scheme + + colors = get_nvidia_color_scheme(n_colors=15, use_brand_colors=True) + assert len(colors) == 15 + for color in colors: + assert color.startswith("#") + assert len(color) == 7 # #RRGGBB format + + +class TestGetMetricDirection: + """Tests for _get_metric_direction method.""" + + def test_get_metric_direction_from_registry(self): + """Uses MetricRegistry when available.""" + from unittest.mock import patch + + from aiperf.common.enums import PlotMetricDirection + from aiperf.common.enums.metric_enums import MetricFlags + + plot_gen = PlotGenerator() + + # Mock a metric with LARGER_IS_BETTER flag + with patch( + "aiperf.plot.core.plot_generator.MetricRegistry.get_class" + ) as mock_get_class: + mock_metric = type( + "MockMetric", + (), + { + "has_flags": lambda flags: flags == MetricFlags.LARGER_IS_BETTER, + }, + ) + mock_get_class.return_value = mock_metric + + direction = plot_gen._get_metric_direction("test_throughput") + assert direction == PlotMetricDirection.HIGHER + + def test_get_metric_direction_fallback_to_derived(self): + """Falls back to DERIVED_METRIC_DIRECTIONS.""" + from unittest.mock import patch + + from aiperf.common.enums import PlotMetricDirection + + plot_gen = PlotGenerator() + + # Mock MetricRegistry to raise exception + with ( + patch( + "aiperf.plot.core.plot_generator.MetricRegistry.get_class", + side_effect=Exception, + ), + patch( + "aiperf.plot.core.plot_generator.DERIVED_METRIC_DIRECTIONS", + {"custom_throughput_metric": True}, + ), + ): + direction = plot_gen._get_metric_direction("custom_throughput_metric") + assert direction == PlotMetricDirection.HIGHER + + def test_get_metric_direction_default_to_empty_string(self): + """Returns empty string for unknown metrics.""" + from unittest.mock import patch + + plot_gen = PlotGenerator() + + # Mock MetricRegistry to raise exception and empty derived directions + with ( + patch( + "aiperf.plot.core.plot_generator.MetricRegistry.get_class", + side_effect=Exception, + ), + patch("aiperf.plot.core.plot_generator.DERIVED_METRIC_DIRECTIONS", {}), + ): + direction = plot_gen._get_metric_direction("unknown_metric") + assert direction == "" + + +class TestAddOptimalQuadrantShading: + """Tests for _add_optimal_quadrant_shading method.""" + + def test_quadrant_shading_both_lower_is_better(self): + """Lower-left quadrant shaded when both metrics lower is better.""" + from aiperf.common.enums import PlotMetricDirection + + plot_gen = PlotGenerator() + fig = go.Figure() + + x_data = [100.0, 150.0, 200.0] + y_data = [10.0, 15.0, 20.0] + + # Mock metric directions - both lower is better + with mock.patch.object( + plot_gen, + "_get_metric_direction", + side_effect=lambda m: PlotMetricDirection.LOWER, + ): + plot_gen._add_optimal_quadrant_shading( + fig, "latency_metric", "another_latency", x_data, y_data + ) + + # Should have added shape and annotation + assert len(fig.layout.shapes) == 1 + assert len(fig.layout.annotations) == 1 + + # Verify optimal corner is lower-left (min x, min y) + shape = fig.layout.shapes[0] + assert shape.x0 == min(x_data) + assert shape.y0 == min(y_data) + + def test_quadrant_shading_both_higher_is_better(self): + """Upper-right quadrant shaded when both metrics higher is better.""" + from aiperf.common.enums import PlotMetricDirection + + plot_gen = PlotGenerator() + fig = go.Figure() + + x_data = [100.0, 150.0, 200.0] + y_data = [10.0, 15.0, 20.0] + + # Mock metric directions - both higher is better 
+ with mock.patch.object( + plot_gen, + "_get_metric_direction", + side_effect=lambda m: PlotMetricDirection.HIGHER, + ): + plot_gen._add_optimal_quadrant_shading( + fig, "throughput_metric", "another_throughput", x_data, y_data + ) + + # Should have added shape and annotation + assert len(fig.layout.shapes) == 1 + assert len(fig.layout.annotations) == 1 + + # Verify optimal corner is upper-right (max x, max y) + shape = fig.layout.shapes[0] + assert shape.x0 == max(x_data) + assert shape.y1 == max(y_data) + + @pytest.mark.parametrize( # fmt: skip + "x_dir,y_dir,expected_optimal_x,expected_optimal_y", + [ + ("LOWER", "HIGHER", "min", "max"), # Lower-right quadrant + ("HIGHER", "LOWER", "max", "min"), # Upper-left quadrant + ("LOWER", "LOWER", "min", "min"), # Lower-left quadrant + ("HIGHER", "HIGHER", "max", "max"), # Upper-right quadrant + ], + ) + def test_quadrant_shading_mixed_directions( + self, x_dir, y_dir, expected_optimal_x, expected_optimal_y + ): + """Test all 4 direction combinations.""" + from aiperf.common.enums import PlotMetricDirection + + plot_gen = PlotGenerator() + fig = go.Figure() + + x_data = [100.0, 150.0, 200.0] + y_data = [10.0, 15.0, 20.0] + + x_direction = ( + PlotMetricDirection.LOWER + if x_dir == "LOWER" + else PlotMetricDirection.HIGHER + ) + y_direction = ( + PlotMetricDirection.LOWER + if y_dir == "LOWER" + else PlotMetricDirection.HIGHER + ) + + with mock.patch.object( + plot_gen, + "_get_metric_direction", + side_effect=lambda m: x_direction if "x" in m else y_direction, + ): + plot_gen._add_optimal_quadrant_shading( + fig, "x_metric", "y_metric", x_data, y_data + ) + + # Verify shape was added + assert len(fig.layout.shapes) == 1 + + # Verify optimal point is correct + expected_x = min(x_data) if expected_optimal_x == "min" else max(x_data) + expected_y = min(y_data) if expected_optimal_y == "min" else max(y_data) + + annotation = fig.layout.annotations[0] + assert annotation.x == expected_x + assert annotation.y == expected_y + + def test_quadrant_shading_math_calculation(self): + """Verifies rectangle bounds calculation.""" + from aiperf.common.enums import PlotMetricDirection + + plot_gen = PlotGenerator() + fig = go.Figure() + + x_data = [50.0, 100.0, 150.0] + y_data = [5.0, 10.0, 15.0] + + # Lower x, higher y (lower-right quadrant) + with mock.patch.object( + plot_gen, + "_get_metric_direction", + side_effect=lambda m: ( + PlotMetricDirection.LOWER + if m == "latency" + else PlotMetricDirection.HIGHER + ), + ): + plot_gen._add_optimal_quadrant_shading( + fig, "latency", "throughput", x_data, y_data + ) + + shape = fig.layout.shapes[0] + # Optimal point: (min x=50, max y=15) + # Rectangle: x0=50, x1=50, y0=15, y1=15... 
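+        # With x lower-is-better and y higher-is-better, the optimal corner of this
+        # data is (min x, max y) = (50.0, 15.0), which is also the extreme of the data
+        # range, so the asserted rectangle bounds all equal that corner:
+        # x0 == x1 == 50.0 and y0 == y1 == 15.0.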
+        assert shape.x0 == min(x_data)  # 50
+        assert shape.x1 == min(x_data)  # 50 (optimal x)
+        assert shape.y0 == max(y_data)  # 15 (optimal y)
+        assert shape.y1 == max(y_data)  # 15
+
+    def test_quadrant_shading_annotation_text(self):
+        """Verifies 'Optimal' annotation."""
+        from aiperf.common.enums import PlotMetricDirection
+
+        plot_gen = PlotGenerator()
+        fig = go.Figure()
+
+        x_data = [100.0, 150.0, 200.0]
+        y_data = [10.0, 15.0, 20.0]
+
+        with mock.patch.object(
+            plot_gen,
+            "_get_metric_direction",
+            side_effect=lambda m: PlotMetricDirection.HIGHER,
+        ):
+            plot_gen._add_optimal_quadrant_shading(
+                fig, "metric1", "metric2", x_data, y_data
+            )
+
+        annotation = fig.layout.annotations[0]
+        assert "Optimal" in annotation.text
+        assert "\u2605" in annotation.text  # Star symbol
+
+    def test_quadrant_shading_skipped_when_no_direction(self):
+        """No shading when metric direction unknown."""
+        plot_gen = PlotGenerator()
+        fig = go.Figure()
+
+        x_data = [100.0, 150.0, 200.0]
+        y_data = [10.0, 15.0, 20.0]
+
+        # Mock to return empty string (unknown direction)
+        with mock.patch.object(plot_gen, "_get_metric_direction", return_value=""):
+            plot_gen._add_optimal_quadrant_shading(
+                fig, "unknown_metric", "another_unknown", x_data, y_data
+            )
+
+        # Should not add shapes or annotations
+        assert len(fig.layout.shapes) == 0
+        assert len(fig.layout.annotations) == 0
+
+
+class TestPrepareGroupsExperimentTypes:
+    """Tests for _prepare_groups with experiment_types logic."""
+
+    def test_prepare_groups_experiment_types_baseline_vs_treatment(self):
+        """Separates baselines from treatments."""
+        plot_gen = PlotGenerator()
+        df = pd.DataFrame(
+            {
+                "experiment_group": [
+                    "baseline_a",
+                    "treatment_a",
+                    "baseline_b",
+                    "treatment_b",
+                ],
+                "value": [1, 2, 3, 4],
+            }
+        )
+        experiment_types = {
+            "baseline_a": "baseline",
+            "treatment_a": "treatment",
+            "baseline_b": "baseline",
+            "treatment_b": "treatment",
+        }
+
+        groups, colors, display_names = plot_gen._prepare_groups(
+            df, "experiment_group", experiment_types
+        )
+
+        # Baselines should come first, then treatments
+        assert groups[:2] == ["baseline_a", "baseline_b"]
+        assert groups[2:] == ["treatment_a", "treatment_b"]
+
+        # Baselines should be gray
+        assert colors["baseline_a"] == NVIDIA_GRAY
+        assert colors["baseline_b"] == NVIDIA_GRAY
+
+        # First treatment should be green
+        assert colors["treatment_a"] == NVIDIA_GREEN
+
+    def test_prepare_groups_experiment_types_single_treatment(self):
+        """Single treatment gets green color."""
+        plot_gen = PlotGenerator()
+        df = pd.DataFrame(
+            {
+                "experiment_group": ["baseline", "treatment"],
+                "value": [1, 2],
+            }
+        )
+        experiment_types = {
+            "baseline": "baseline",
+            "treatment": "treatment",
+        }
+
+        groups, colors, display_names = plot_gen._prepare_groups(
+            df, "experiment_group", experiment_types
+        )
+
+        assert colors["baseline"] == NVIDIA_GRAY
+        assert colors["treatment"] == NVIDIA_GREEN
+
+    def test_prepare_groups_experiment_types_multiple_treatments(self):
+        """Multiple treatments: first=green, rest=seaborn colors."""
+        plot_gen = PlotGenerator()
+        df = pd.DataFrame(
+            {
+                "experiment_group": [
+                    "baseline",
+                    "treatment1",
+                    "treatment2",
+                    "treatment3",
+                ],
+                "value": [1, 2, 3, 4],
+            }
+        )
+        experiment_types = {
+            "baseline": "baseline",
+            "treatment1": "treatment",
+            "treatment2": "treatment",
+            "treatment3": "treatment",
+ } + + groups, colors, display_names = plot_gen._prepare_groups( + df, "experiment_group", experiment_types + ) + + assert colors["baseline"] == NVIDIA_GRAY + assert colors["treatment1"] == NVIDIA_GREEN + # Other treatments should have different colors + assert colors["treatment2"] != NVIDIA_GREEN + assert colors["treatment2"] != NVIDIA_GRAY + assert colors["treatment3"] != NVIDIA_GREEN + assert colors["treatment3"] != NVIDIA_GRAY + + def test_prepare_groups_with_string_input(self): + """Accepts string input directly (validator converts lists to strings).""" + plot_gen = PlotGenerator() + df = pd.DataFrame( + { + "model": ["model_a", "model_b"], + "value": [1, 2], + } + ) + + # Pass string directly (validator already converted list to string) + groups, colors, display_names = plot_gen._prepare_groups(df, group_by="model") + + # Should successfully group by model + assert groups == ["model_a", "model_b"] + assert len(colors) == 2 + + +class TestParetoFrontierOptimization: + """Tests for optimized O(n log n) Pareto frontier calculation.""" + + @pytest.mark.parametrize( + "x_dir,y_dir,x_vals,y_vals,expected", + [ + # LOWER x, HIGHER y (classic Pareto: minimize x, maximize y) + ("LOWER", "HIGHER", [1, 2, 3, 4], [4, 5, 3, 6], [True, True, False, True]), + ("LOWER", "HIGHER", [1, 2, 3], [3, 2, 1], [True, False, False]), + # HIGHER x, HIGHER y (maximize both) + ( + "HIGHER", + "HIGHER", + [1, 2, 3, 4], + [4, 5, 3, 6], + [False, False, False, True], + ), + ("HIGHER", "HIGHER", [1, 2, 3], [1, 2, 3], [False, False, True]), + # LOWER x, LOWER y (minimize both) + ("LOWER", "LOWER", [1, 2, 3, 4], [4, 3, 5, 2], [True, True, False, True]), + ("LOWER", "LOWER", [1, 2, 3], [3, 2, 1], [True, True, True]), + # HIGHER x, LOWER y (maximize x, minimize y) + ( + "HIGHER", + "LOWER", + [1, 2, 3, 4], + [4, 5, 3, 2], + [False, False, False, True], + ), + ("HIGHER", "LOWER", [1, 2, 3], [3, 2, 1], [False, False, True]), + # Edge cases: duplicate x values + ("LOWER", "HIGHER", [1, 1, 2], [5, 3, 6], [True, False, True]), + # Edge cases: duplicate y values + ("LOWER", "HIGHER", [1, 2, 3], [5, 5, 5], [True, True, True]), + # Edge cases: all points on frontier (monotonic increase) + ("LOWER", "HIGHER", [1, 2, 3, 4], [1, 2, 3, 4], [True, True, True, True]), + # Edge cases: single best point + ("HIGHER", "HIGHER", [1, 2, 3], [1, 1, 10], [False, False, True]), + ], # fmt: skip + ) + def test_pareto_frontier_directions( + self, plot_generator, x_dir, y_dir, x_vals, y_vals, expected + ): + """Test Pareto frontier calculation for all direction combinations.""" + from aiperf.common.enums import PlotMetricDirection + + x_direction = PlotMetricDirection(x_dir) + y_direction = PlotMetricDirection(y_dir) + + x_array = np.array(x_vals, dtype=float) + y_array = np.array(y_vals, dtype=float) + + result = plot_generator._compute_pareto_frontier( + x_array, y_array, x_direction, y_direction + ) + + expected_array = np.array(expected, dtype=bool) + np.testing.assert_array_equal( + result, + expected_array, + err_msg=f"Failed for x_dir={x_dir}, y_dir={y_dir}, x={x_vals}, y={y_vals}", + ) + + def test_pareto_empty_array(self, plot_generator): + """Test with empty arrays.""" + from aiperf.common.enums import PlotMetricDirection + + result = plot_generator._compute_pareto_frontier( + np.array([]), + np.array([]), + PlotMetricDirection.LOWER, + PlotMetricDirection.HIGHER, + ) + assert len(result) == 0 + assert result.dtype == bool + + def test_pareto_single_point(self, plot_generator): + """Test with single point.""" + from 
aiperf.common.enums import PlotMetricDirection + + result = plot_generator._compute_pareto_frontier( + np.array([1.0]), + np.array([2.0]), + PlotMetricDirection.LOWER, + PlotMetricDirection.HIGHER, + ) + np.testing.assert_array_equal(result, [True]) + + def test_pareto_two_points(self, plot_generator): + """Test with two points - various domination scenarios.""" + from aiperf.common.enums import PlotMetricDirection + + # Point 2 dominates point 1 (minimize x, maximize y) + # Data must be sorted by x ascending (1.0 comes before 2.0) + result = plot_generator._compute_pareto_frontier( + np.array([1.0, 2.0]), # Sorted by x + np.array([2.0, 1.0]), # Point 1 has y=2, point 2 has y=1 + PlotMetricDirection.LOWER, + PlotMetricDirection.HIGHER, + ) + # Point 1 (x=1, y=2) is on frontier, Point 2 (x=2, y=1) is dominated + np.testing.assert_array_equal(result, [True, False]) + + # Both points on frontier (non-dominated, moving away from each other) + result = plot_generator._compute_pareto_frontier( + np.array([1.0, 2.0]), # Sorted by x + np.array([1.0, 2.0]), # Both increase together + PlotMetricDirection.LOWER, + PlotMetricDirection.HIGHER, + ) + # For minimize x, maximize y: point 1 (x=1, y=1) is on frontier + # Point 2 (x=2, y=2) has worse x but better y, so it's also on frontier + np.testing.assert_array_equal(result, [True, True]) + + def test_pareto_backwards_compatibility(self, multi_run_df): + """ + Verify that the optimized algorithm produces identical results to the + O(n²) algorithm for typical multi-run data. + """ + from aiperf.common.enums import PlotMetricDirection + + plot_gen = PlotGenerator() + df = multi_run_df.sort_values("request_latency") + + # Test on latency (LOWER) vs throughput (HIGHER) - classic Pareto + x_vals = df["request_latency"].values + y_vals = df["request_throughput"].values + + result = plot_gen._compute_pareto_frontier( + x_vals, y_vals, PlotMetricDirection.LOWER, PlotMetricDirection.HIGHER + ) + + # Verify at least one point is on the frontier + assert np.any(result), "At least one point should be on Pareto frontier" + + # Verify no point on the frontier is dominated by another point on the frontier + pareto_points = np.where(result)[0] + for i in pareto_points: + for j in pareto_points: + if i == j: + continue + # For minimize x, maximize y: j should not have both (x_j <= x_i and y_j >= y_i) with strict inequality + if x_vals[j] < x_vals[i] and y_vals[j] > y_vals[i]: + pytest.fail( + f"Point {j} dominates point {i}, but both are on frontier" + ) + + def test_pareto_performance_large_dataset(self, plot_generator): + """Benchmark with large dataset to verify O(n log n) performance.""" + import time + + from aiperf.common.enums import PlotMetricDirection + + # Generate 1000 random points + np.random.seed(42) + n = 1000 + x_vals = np.random.rand(n) * 100 + y_vals = np.random.rand(n) * 100 + + # Sort by x (as the real algorithm expects) + sorted_indices = np.argsort(x_vals) + x_vals = x_vals[sorted_indices] + y_vals = y_vals[sorted_indices] + + start = time.time() + result = plot_generator._compute_pareto_frontier( + x_vals, y_vals, PlotMetricDirection.LOWER, PlotMetricDirection.HIGHER + ) + elapsed = time.time() - start + + # Should complete in well under 0.1 seconds for 1000 points + assert elapsed < 0.1, f"Algorithm took {elapsed}s for {n} points (too slow)" + assert np.any(result), "Should have at least one point on frontier" + + def test_pareto_all_points_identical(self, plot_generator): + """Test when all points have identical coordinates.""" + from 
aiperf.common.enums import PlotMetricDirection + + # All points are identical, so all should be on the frontier + result = plot_generator._compute_pareto_frontier( + np.array([5.0, 5.0, 5.0]), + np.array([3.0, 3.0, 3.0]), + PlotMetricDirection.LOWER, + PlotMetricDirection.HIGHER, + ) + # All identical points are considered on the frontier (use >= comparison) + np.testing.assert_array_equal(result, [True, True, True]) + + def test_pareto_raises_error_for_unknown_metric_directions(self, multi_run_df): + """Test that ValueError is raised when metric directions are unknown.""" + from unittest.mock import patch + + plot_gen = PlotGenerator() + + # Mock _get_metric_direction to return empty string (unknown direction) + with ( + patch.object(plot_gen, "_get_metric_direction", return_value=""), + pytest.raises( + ValueError, + match="Cannot determine optimization direction for x-axis metric 'request_latency' and y-axis metric 'request_throughput'", + ), + ): + plot_gen.create_pareto_plot( + df=multi_run_df, + x_metric="request_latency", + y_metric="request_throughput", + label_by="concurrency", + group_by="model", + ) + + def test_pareto_raises_error_for_one_unknown_metric(self, multi_run_df): + """Test that ValueError is raised when one metric direction is unknown.""" + from unittest.mock import patch + + from aiperf.common.enums import PlotMetricDirection + + plot_gen = PlotGenerator() + + # Mock _get_metric_direction to return known for x, unknown for y + def mock_direction(metric): + if metric == "request_latency": + return PlotMetricDirection.LOWER + return "" + + with ( + patch.object(plot_gen, "_get_metric_direction", side_effect=mock_direction), + pytest.raises( + ValueError, + match="Cannot determine optimization direction for y-axis metric 'request_throughput'", + ), + ): + plot_gen.create_pareto_plot( + df=multi_run_df, + x_metric="request_latency", + y_metric="request_throughput", + label_by="concurrency", + group_by="model", + ) diff --git a/tests/unit/plot/test_png_exporter.py b/tests/unit/plot/test_png_exporter.py index 619267884..22f62e91e 100644 --- a/tests/unit/plot/test_png_exporter.py +++ b/tests/unit/plot/test_png_exporter.py @@ -19,6 +19,12 @@ prepare_request_timeseries, validate_request_uniformity, ) +from aiperf.plot.core.plot_specs import ( + DataSource, + MetricSpec, + PlotSpec, + PlotType, +) from aiperf.plot.exporters.png import MultiRunPNGExporter, SingleRunPNGExporter @@ -36,6 +42,233 @@ def single_run_exporter(tmp_path): return SingleRunPNGExporter(output_dir) +@pytest.fixture +def sample_plot_specs(): + """Create plot specs for single-run testing (matches original hardcoded specs).""" + from aiperf.plot.core.plot_specs import Style, TimeSlicePlotSpec + + # Single-run plot specifications + single_run_specs = [ + PlotSpec( + name="ttft_over_time", + plot_type=PlotType.SCATTER, + metrics=[ + MetricSpec(name="request_number", source=DataSource.REQUESTS, axis="x"), + MetricSpec( + name="time_to_first_token", source=DataSource.REQUESTS, axis="y" + ), + ], + title="TTFT Per Request Over Time", + filename="ttft_over_time.png", + ), + PlotSpec( + name="itl_over_time", + plot_type=PlotType.SCATTER, + metrics=[ + MetricSpec(name="request_number", source=DataSource.REQUESTS, axis="x"), + MetricSpec( + name="inter_token_latency", source=DataSource.REQUESTS, axis="y" + ), + ], + title="Inter-Token Latency Per Request Over Time", + filename="itl_over_time.png", + ), + PlotSpec( + name="latency_over_time", + plot_type=PlotType.SCATTER_WITH_PERCENTILES, + metrics=[ + 
MetricSpec(name="timestamp", source=DataSource.REQUESTS, axis="x"), + MetricSpec( + name="request_latency", source=DataSource.REQUESTS, axis="y" + ), + ], + title="Request Latency Over Time with Percentiles", + filename="latency_over_time.png", + ), + PlotSpec( + name="dispersed_throughput_over_time", + plot_type=PlotType.AREA, + metrics=[ + MetricSpec(name="timestamp_s", source=DataSource.REQUESTS, axis="x"), + MetricSpec( + name="throughput_tokens_per_sec", + source=DataSource.REQUESTS, + axis="y", + ), + ], + title="Dispersed Output Token Throughput Over Time", + filename="dispersed_throughput_over_time.png", + ), + ] + + # Timeslice plot specifications + timeslice_specs = [ + TimeSlicePlotSpec( + name="timeslices_ttft", + plot_type=PlotType.HISTOGRAM, + metrics=[ + MetricSpec(name="Timeslice", source=DataSource.TIMESLICES, axis="x"), + MetricSpec( + name="Time to First Token", + source=DataSource.TIMESLICES, + axis="y", + stat="avg", + ), + ], + title="Average Time to First Token Across Time Slices", + filename="timeslices_ttft.png", + use_slice_duration=True, + ), + TimeSlicePlotSpec( + name="timeslices_itl", + plot_type=PlotType.HISTOGRAM, + metrics=[ + MetricSpec(name="Timeslice", source=DataSource.TIMESLICES, axis="x"), + MetricSpec( + name="Inter Token Latency", + source=DataSource.TIMESLICES, + axis="y", + stat="avg", + ), + ], + title="Average Inter Token Latency Across Time Slices", + filename="timeslices_itl.png", + use_slice_duration=True, + ), + TimeSlicePlotSpec( + name="timeslices_throughput", + plot_type=PlotType.HISTOGRAM, + metrics=[ + MetricSpec(name="Timeslice", source=DataSource.TIMESLICES, axis="x"), + MetricSpec( + name="Request Throughput", + source=DataSource.TIMESLICES, + axis="y", + stat="avg", + ), + ], + title="Average Request Throughput Across Time Slices", + filename="timeslices_throughput.png", + use_slice_duration=True, + ), + TimeSlicePlotSpec( + name="timeslices_latency", + plot_type=PlotType.HISTOGRAM, + metrics=[ + MetricSpec(name="Timeslice", source=DataSource.TIMESLICES, axis="x"), + MetricSpec( + name="Request Latency", + source=DataSource.TIMESLICES, + axis="y", + stat="avg", + ), + ], + title="Average Request Latency Across Time Slices", + filename="timeslices_latency.png", + use_slice_duration=True, + ), + ] + + # GPU plot specifications + gpu_specs = [ + PlotSpec( + name="gpu_utilization_and_throughput_over_time", + plot_type=PlotType.DUAL_AXIS, + metrics=[ + MetricSpec(name="timestamp_s", source=DataSource.REQUESTS, axis="x"), + MetricSpec( + name="throughput_tokens_per_sec", + source=DataSource.REQUESTS, + axis="y", + ), + MetricSpec( + name="gpu_utilization", source=DataSource.GPU_TELEMETRY, axis="y2" + ), + ], + title="Output Token Throughput with GPU Utilization", + filename="gpu_utilization_and_throughput_over_time.png", + primary_style=Style(mode="lines", line_shape="hv", fill=None), + secondary_style=Style(mode="lines", line_shape=None, fill="tozeroy"), + supplementary_col="active_requests", + ), + ] + + return single_run_specs + timeslice_specs + gpu_specs + + +@pytest.fixture +def sample_multi_run_plot_specs(): + """Create plot specs for multi-run testing (matches original hardcoded specs).""" + return [ + PlotSpec( + name="pareto_curve_throughput_per_gpu_vs_latency", + plot_type=PlotType.PARETO, + metrics=[ + MetricSpec( + name="request_latency", + source=DataSource.AGGREGATED, + axis="x", + stat="avg", + ), + MetricSpec( + name="output_token_throughput_per_gpu", + source=DataSource.AGGREGATED, + axis="y", + stat="avg", + ), + ], + 
title="Pareto Curve: Token Throughput per GPU vs Latency", + filename="pareto_curve_throughput_per_gpu_vs_latency.png", + label_by=None, + group_by=None, + ), + PlotSpec( + name="ttft_vs_throughput", + plot_type=PlotType.SCATTER_LINE, + metrics=[ + MetricSpec( + name="time_to_first_token", + source=DataSource.AGGREGATED, + axis="x", + stat="p50", + ), + MetricSpec( + name="request_throughput", + source=DataSource.AGGREGATED, + axis="y", + stat="avg", + ), + ], + title="TTFT vs Throughput", + filename="ttft_vs_throughput.png", + label_by=None, + group_by=None, + ), + PlotSpec( + name="pareto_curve_throughput_per_gpu_vs_interactivity", + plot_type=PlotType.SCATTER_LINE, + metrics=[ + MetricSpec( + name="output_token_throughput_per_gpu", + source=DataSource.AGGREGATED, + axis="x", + stat="avg", + ), + MetricSpec( + name="output_token_throughput_per_user", + source=DataSource.AGGREGATED, + axis="y", + stat="avg", + ), + ], + title="Pareto Curve: Token Throughput per GPU vs Interactivity", + filename="pareto_curve_throughput_per_gpu_vs_interactivity.png", + label_by=["concurrency"], + group_by=None, + ), + ] + + @pytest.fixture def sample_multi_run_data(tmp_path): """Create sample multi-run data for testing.""" @@ -182,16 +415,17 @@ def test_export_multi_run_creates_files( multi_run_exporter, sample_multi_run_data, sample_available_metrics, + sample_multi_run_plot_specs, ): - """Test that multi-run export creates PNG files.""" + """Test that multi-run export creates PNG files from provided specs.""" generated_files = multi_run_exporter.export( - sample_multi_run_data, sample_available_metrics + sample_multi_run_data, sample_available_metrics, sample_multi_run_plot_specs ) - # Should generate 3 plots for multi-run - assert len(generated_files) == 3 + # Should generate plots based on the specs provided (config-driven) + assert len(generated_files) > 0 - # Check that files exist + # Check that files exist and are PNGs for file_path in generated_files: assert file_path.exists() assert file_path.suffix == ".png" @@ -201,28 +435,33 @@ def test_export_multi_run_creates_expected_plots( multi_run_exporter, sample_multi_run_data, sample_available_metrics, + sample_multi_run_plot_specs, ): - """Test that expected plot files are created.""" + """Test that plots matching the provided specs are created.""" generated_files = multi_run_exporter.export( - sample_multi_run_data, sample_available_metrics + sample_multi_run_data, sample_available_metrics, sample_multi_run_plot_specs ) # Get filenames filenames = {f.name for f in generated_files} - # Check expected files - assert "pareto_curve_throughput_per_gpu_vs_latency.png" in filenames - assert "ttft_vs_throughput.png" in filenames - assert "pareto_curve_throughput_per_gpu_vs_interactivity.png" in filenames + # Check that filenames match the specs provided (config-driven) + spec_filenames = {spec.filename for spec in sample_multi_run_plot_specs} + assert filenames.issubset(spec_filenames), ( + f"Generated unexpected files: {filenames - spec_filenames}" + ) def test_export_multi_run_creates_summary( self, multi_run_exporter, sample_multi_run_data, sample_available_metrics, + sample_multi_run_plot_specs, ): """Test that summary file is created.""" - multi_run_exporter.export(sample_multi_run_data, sample_available_metrics) + generated_files = multi_run_exporter.export( + sample_multi_run_data, sample_available_metrics, sample_multi_run_plot_specs + ) summary_path = multi_run_exporter.output_dir / "summary.txt" assert summary_path.exists() @@ -230,7 +469,7 @@ def 
test_export_multi_run_creates_summary( # Check summary content content = summary_path.read_text(encoding="utf-8") assert "AIPerf Plot Export Summary" in content - assert "Generated 3 plots" in content + assert f"Generated {len(generated_files)} plot" in content def test_runs_to_dataframe_with_metric_result_objects( self, multi_run_exporter, tmp_path @@ -349,16 +588,17 @@ def test_export_single_run_creates_files( single_run_exporter, sample_single_run_data, sample_available_metrics, + sample_plot_specs, ): - """Test that single-run export creates PNG files.""" + """Test that single-run export creates PNG files from provided specs.""" generated_files = single_run_exporter.export( - sample_single_run_data, sample_available_metrics + sample_single_run_data, sample_available_metrics, sample_plot_specs ) - # Should generate 4 plots for single-run (ttft, itl, latency, dispersed_throughput) - assert len(generated_files) == 4 + # Should generate plots based on available data and specs (config-driven) + assert len(generated_files) > 0 - # Check that files exist + # Check that files exist and are PNGs for file_path in generated_files: assert file_path.exists() assert file_path.suffix == ".png" @@ -368,25 +608,27 @@ def test_export_single_run_creates_expected_plots( single_run_exporter, sample_single_run_data, sample_available_metrics, + sample_plot_specs, ): - """Test that expected plot files are created for single run.""" + """Test that plots matching the provided specs are created for single run.""" generated_files = single_run_exporter.export( - sample_single_run_data, sample_available_metrics + sample_single_run_data, sample_available_metrics, sample_plot_specs ) # Get filenames filenames = {f.name for f in generated_files} - # Check expected files - assert "ttft_over_time.png" in filenames - assert "itl_over_time.png" in filenames - assert "latency_over_time.png" in filenames - assert "dispersed_throughput_over_time.png" in filenames + # Check that filenames match specs that can be generated with available data + spec_filenames = {spec.filename for spec in sample_plot_specs} + assert filenames.issubset(spec_filenames), ( + f"Generated unexpected files: {filenames - spec_filenames}" + ) def test_export_single_run_with_no_per_request_data( self, single_run_exporter, sample_available_metrics, + sample_plot_specs, tmp_path, ): """Test handling of single run with no per-request data.""" @@ -403,7 +645,9 @@ def test_export_single_run_with_no_per_request_data( slice_duration=None, ) - generated_files = single_run_exporter.export(run_data, sample_available_metrics) + generated_files = single_run_exporter.export( + run_data, sample_available_metrics, sample_plot_specs + ) # Should return empty list when no data available assert len(generated_files) == 0 @@ -456,6 +700,7 @@ def test_export_single_run_with_timeslice_data( sample_single_run_data, sample_timeslice_data, sample_available_metrics, + sample_plot_specs, tmp_path, ): """Test that timeslice plots are generated when timeslice data is available.""" @@ -469,15 +714,18 @@ def test_export_single_run_with_timeslice_data( ) generated_files = single_run_exporter.export( - run_with_timeslices, sample_available_metrics + run_with_timeslices, sample_available_metrics, sample_plot_specs ) - # Should generate 3 regular plots + 1 dispersed_throughput + 1 timeslice plot = 5 - assert len(generated_files) == 5 + # Should generate more plots with timeslice data available + assert len(generated_files) > 0 - # Check that timeslice plot is in the generated files + # 
Check that at least one timeslice plot is in the generated files filenames = {f.name for f in generated_files} - assert "timeslices_ttft.png" in filenames + timeslice_plots = [f for f in filenames if "timeslices_" in f] + assert len(timeslice_plots) > 0, ( + "Expected at least one timeslice plot to be generated" + ) # Validate that the timeslice plot was created successfully ttft_data = sample_timeslice_data[ @@ -508,21 +756,23 @@ def test_timeslices_plot_handles_missing_data_gracefully( single_run_exporter, sample_single_run_data, sample_available_metrics, + sample_plot_specs, ): """Test that missing timeslice data is handled gracefully.""" # Run without timeslice data (None) generated_files = single_run_exporter.export( - sample_single_run_data, sample_available_metrics + sample_single_run_data, sample_available_metrics, sample_plot_specs ) - # Should generate 3 regular plots + 1 dispersed_throughput = 4 (no timeslice plots) - assert len(generated_files) == 4 + # Should generate plots, but no timeslice plots since data is missing + assert len(generated_files) > 0 filenames = {f.name for f in generated_files} - assert "ttft_over_time.png" in filenames - assert "itl_over_time.png" in filenames - assert "latency_over_time.png" in filenames - # No timeslice plots + # Should not generate timeslice plots when data is missing + timeslice_plots = [f for f in filenames if "timeslices_" in f] + assert len(timeslice_plots) == 0, ( + "Should not generate timeslice plots without data" + ) assert "timeslices_ttft.png" not in filenames def test_uniform_requests_no_warning( @@ -530,6 +780,7 @@ def test_uniform_requests_no_warning( single_run_exporter, tmp_path, sample_available_metrics, + sample_plot_specs, ): """Test that uniform requests (identical ISL/OSL) show no warning.""" per_request_data = pd.DataFrame( @@ -570,11 +821,11 @@ def test_uniform_requests_no_warning( assert is_uniform is True assert warning is None - generated_files = single_run_exporter.export(run_data, sample_available_metrics) - throughput_plot = [ - f for f in generated_files if "timeslices_throughput" in f.name - ] - assert len(throughput_plot) == 1 + generated_files = single_run_exporter.export( + run_data, sample_available_metrics, sample_plot_specs + ) + throughput_plot = [f for f in generated_files if "throughput" in f.name] + assert len(throughput_plot) > 0 fig = single_run_exporter.plot_generator.create_time_series_histogram( df=timeslice_data[timeslice_data["Stat"] == "avg"][ @@ -598,6 +849,7 @@ def test_non_uniform_isl_shows_warning( single_run_exporter, tmp_path, sample_available_metrics, + sample_plot_specs, ): """Test that non-uniform ISL (varying input lengths) shows warning.""" per_request_data = pd.DataFrame( @@ -691,6 +943,7 @@ def test_non_uniform_osl_shows_warning( single_run_exporter, tmp_path, sample_available_metrics, + sample_plot_specs, ): """Test that non-uniform OSL (varying output lengths) shows warning.""" per_request_data = pd.DataFrame( @@ -738,6 +991,7 @@ def test_warning_only_on_throughput_plot( single_run_exporter, tmp_path, sample_available_metrics, + sample_plot_specs, ): """Test that warning appears only on throughput plot, not other metrics.""" per_request_data = pd.DataFrame( @@ -791,7 +1045,9 @@ def test_warning_only_on_throughput_plot( slice_duration=10.0, ) - generated_files = single_run_exporter.export(run_data, sample_available_metrics) + generated_files = single_run_exporter.export( + run_data, sample_available_metrics, sample_plot_specs + ) assert len(generated_files) > 0 @@ -915,20 
+1171,26 @@ def test_on_demand_loading_missing_file( class TestSharedExporterFunctionality: """Tests for shared functionality across both exporters.""" - def test_output_directory_created(self, tmp_path, sample_multi_run_data): + def test_output_directory_created( + self, tmp_path, sample_multi_run_data, sample_multi_run_plot_specs + ): """Test that output directory is created if it doesn't exist.""" output_dir = tmp_path / "new_directory" / "plots" assert not output_dir.exists() exporter = MultiRunPNGExporter(output_dir) - exporter.export(sample_multi_run_data, {"display_names": {}, "units": {}}) + exporter.export( + sample_multi_run_data, + {"display_names": {}, "units": {}}, + sample_multi_run_plot_specs, + ) # Directory should be created assert output_dir.exists() assert output_dir.is_dir() def test_export_handles_missing_metrics_gracefully( - self, tmp_path, sample_available_metrics + self, tmp_path, sample_available_metrics, sample_multi_run_plot_specs ): """Test that export handles missing metrics without crashing.""" output_dir = tmp_path / "plots" @@ -953,7 +1215,9 @@ def test_export_handles_missing_metrics_gracefully( ] # Should not raise an exception - generated_files = exporter.export(incomplete_data, sample_available_metrics) + generated_files = exporter.export( + incomplete_data, sample_available_metrics, sample_multi_run_plot_specs + ) # May generate fewer plots if metrics are missing assert isinstance(generated_files, list) @@ -1013,7 +1277,12 @@ def requests_df_for_gpu(self): ) def test_generate_gpu_plots_with_telemetry( - self, tmp_path, sample_available_metrics, gpu_telemetry_df, requests_df_for_gpu + self, + tmp_path, + sample_available_metrics, + sample_plot_specs, + gpu_telemetry_df, + requests_df_for_gpu, ): """Test that GPU plots are generated when telemetry data is available.""" exporter = SingleRunPNGExporter(output_dir=tmp_path) @@ -1032,7 +1301,7 @@ def test_generate_gpu_plots_with_telemetry( gpu_telemetry=gpu_telemetry_df, ) - all_files = exporter.export(run, sample_available_metrics) + all_files = exporter.export(run, sample_available_metrics, sample_plot_specs) # Check that GPU files were generated gpu_files = [f for f in all_files if "gpu" in f.name] @@ -1041,7 +1310,9 @@ def test_generate_gpu_plots_with_telemetry( assert file_path.exists() assert file_path.suffix == ".png" - def test_generate_gpu_plots_no_telemetry(self, tmp_path, sample_available_metrics): + def test_generate_gpu_plots_no_telemetry( + self, tmp_path, sample_available_metrics, sample_plot_specs + ): """Test that no GPU plots are generated when telemetry data is missing.""" exporter = SingleRunPNGExporter(output_dir=tmp_path) @@ -1059,13 +1330,13 @@ def test_generate_gpu_plots_no_telemetry(self, tmp_path, sample_available_metric gpu_telemetry=None, ) - all_files = exporter.export(run, sample_available_metrics) + all_files = exporter.export(run, sample_available_metrics, sample_plot_specs) gpu_files = [f for f in all_files if "gpu" in f.name] assert gpu_files == [] def test_generate_gpu_plots_empty_telemetry( - self, tmp_path, sample_available_metrics + self, tmp_path, sample_available_metrics, sample_plot_specs ): """Test that no GPU plots are generated when telemetry DataFrame is empty.""" exporter = SingleRunPNGExporter(output_dir=tmp_path) @@ -1084,13 +1355,18 @@ def test_generate_gpu_plots_empty_telemetry( gpu_telemetry=pd.DataFrame(), ) - all_files = exporter.export(run, sample_available_metrics) + all_files = exporter.export(run, sample_available_metrics, sample_plot_specs) gpu_files = 
[f for f in all_files if "gpu" in f.name] assert gpu_files == [] def test_generate_gpu_utilization_with_throughput( - self, tmp_path, sample_available_metrics, gpu_telemetry_df, requests_df_for_gpu + self, + tmp_path, + sample_available_metrics, + sample_plot_specs, + gpu_telemetry_df, + requests_df_for_gpu, ): """Test GPU utilization with throughput overlay plot generation.""" exporter = SingleRunPNGExporter(output_dir=tmp_path) @@ -1109,18 +1385,16 @@ def test_generate_gpu_utilization_with_throughput( gpu_telemetry=gpu_telemetry_df, ) - all_files = exporter.export(run, sample_available_metrics) - gpu_files = [ - f - for f in all_files - if f.name == "gpu_utilization_and_throughput_over_time.png" - ] + all_files = exporter.export(run, sample_available_metrics, sample_plot_specs) + gpu_files = [f for f in all_files if "gpu" in f.name.lower()] - assert len(gpu_files) == 1 - assert gpu_files[0].exists() + # Should generate GPU-related plots when GPU telemetry is available + assert len(gpu_files) > 0 + for f in gpu_files: + assert f.exists() def test_generate_gpu_utilization_no_requests( - self, tmp_path, sample_available_metrics, gpu_telemetry_df + self, tmp_path, sample_available_metrics, sample_plot_specs, gpu_telemetry_df ): """Test GPU utilization plot when requests data is missing.""" exporter = SingleRunPNGExporter(output_dir=tmp_path) @@ -1139,7 +1413,7 @@ def test_generate_gpu_utilization_no_requests( gpu_telemetry=gpu_telemetry_df, ) - all_files = exporter.export(run, sample_available_metrics) + all_files = exporter.export(run, sample_available_metrics, sample_plot_specs) gpu_util_files = [ f for f in all_files @@ -1149,7 +1423,7 @@ def test_generate_gpu_utilization_no_requests( assert gpu_util_files == [] def test_generate_dispersed_throughput_over_time( - self, tmp_path, sample_available_metrics, requests_df_for_gpu + self, tmp_path, sample_available_metrics, sample_plot_specs, requests_df_for_gpu ): """Test dispersed throughput over time plot generation.""" exporter = SingleRunPNGExporter(output_dir=tmp_path) @@ -1168,16 +1442,16 @@ def test_generate_dispersed_throughput_over_time( gpu_telemetry=None, ) - all_files = exporter.export(run, sample_available_metrics) - throughput_files = [ - f for f in all_files if f.name == "dispersed_throughput_over_time.png" - ] + all_files = exporter.export(run, sample_available_metrics, sample_plot_specs) + throughput_files = [f for f in all_files if "throughput" in f.name.lower()] - assert len(throughput_files) == 1 - assert throughput_files[0].exists() + # Should generate throughput plots when requests data is available + assert len(throughput_files) > 0 + for f in throughput_files: + assert f.exists() def test_generate_gpu_plots_multi_gpu_aggregation( - self, tmp_path, sample_available_metrics, requests_df_for_gpu + self, tmp_path, sample_available_metrics, sample_plot_specs, requests_df_for_gpu ): """Test that GPU plots aggregate data correctly across multiple GPUs.""" exporter = SingleRunPNGExporter(output_dir=tmp_path) @@ -1218,7 +1492,7 @@ def test_generate_gpu_plots_multi_gpu_aggregation( gpu_telemetry=multi_gpu_df, ) - all_files = exporter.export(run, sample_available_metrics) + all_files = exporter.export(run, sample_available_metrics, sample_plot_specs) gpu_files = [f for f in all_files if "gpu" in f.name] assert len(gpu_files) > 0 @@ -1571,3 +1845,151 @@ def test_axis_labels_from_available_metrics( assert "Custom Throughput Label" in str(fig.layout.yaxis.title.text) assert "Custom GPU Label" in str(fig.layout.yaxis2.title.text) + + 
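The `TestPlotSpecListValidation` class that follows exercises how `PlotSpec` coerces list-valued `label_by`/`group_by` into plain strings and empty lists into `None`. A minimal sketch of that behavior, assuming a Pydantic `mode="before"` field validator; the model and validator names here are hypothetical, not the PR's actual implementation:

```python
# Hypothetical sketch of the list -> string coercion exercised by the tests below.
from pydantic import BaseModel, field_validator


class PlotSpecSketch(BaseModel):
    label_by: str | list[str] | None = None
    group_by: str | list[str] | None = None

    @field_validator("label_by", "group_by", mode="before")
    @classmethod
    def _coerce_list(cls, value):
        """Coerce single-element lists to strings and empty lists to None."""
        if isinstance(value, list):
            if not value:
                return None  # [] -> None
            return value[0]  # ["concurrency"] -> "concurrency"
            # (multi-element lists are not covered by the tests shown here)
        return value  # strings and None pass through unchanged


assert PlotSpecSketch(label_by=["concurrency"]).label_by == "concurrency"
assert PlotSpecSketch(label_by=[]).label_by is None
assert PlotSpecSketch(group_by="model").group_by == "model"
assert PlotSpecSketch().group_by is None
```
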
+class TestPlotSpecListValidation: + """Tests for PlotSpec validation with list-based label_by and group_by.""" + + def test_label_by_single_element_list(self): + """Test that label_by with a single-element list is converted to string.""" + spec = PlotSpec( + name="test_plot", + plot_type=PlotType.PARETO, + metrics=[ + MetricSpec( + name="request_latency", + source=DataSource.AGGREGATED, + axis="x", + stat="avg", + ), + MetricSpec( + name="request_throughput", + source=DataSource.AGGREGATED, + axis="y", + stat="avg", + ), + ], + label_by=["concurrency"], + ) + + assert spec.label_by == "concurrency" + + def test_group_by_single_element_list(self): + """Test that group_by with a single-element list is converted to string.""" + spec = PlotSpec( + name="test_plot", + plot_type=PlotType.PARETO, + metrics=[ + MetricSpec( + name="request_latency", + source=DataSource.AGGREGATED, + axis="x", + stat="avg", + ), + MetricSpec( + name="request_throughput", + source=DataSource.AGGREGATED, + axis="y", + stat="avg", + ), + ], + group_by=["model"], + ) + + assert spec.group_by == "model" + + def test_label_by_accepts_plain_string(self): + """Test that plain strings are accepted for label_by (backward compatibility).""" + spec = PlotSpec( + name="test_plot", + plot_type=PlotType.PARETO, + metrics=[ + MetricSpec( + name="request_latency", + source=DataSource.AGGREGATED, + axis="x", + stat="avg", + ), + MetricSpec( + name="request_throughput", + source=DataSource.AGGREGATED, + axis="y", + stat="avg", + ), + ], + label_by="concurrency", + ) + assert spec.label_by == "concurrency" + + def test_group_by_accepts_plain_string(self): + """Test that plain strings are accepted for group_by (backward compatibility).""" + spec = PlotSpec( + name="test_plot", + plot_type=PlotType.PARETO, + metrics=[ + MetricSpec( + name="request_latency", + source=DataSource.AGGREGATED, + axis="x", + stat="avg", + ), + MetricSpec( + name="request_throughput", + source=DataSource.AGGREGATED, + axis="y", + stat="avg", + ), + ], + group_by="model", + ) + assert spec.group_by == "model" + + def test_label_by_and_group_by_as_none(self): + """Test that label_by and group_by can be None.""" + spec = PlotSpec( + name="test_plot", + plot_type=PlotType.PARETO, + metrics=[ + MetricSpec( + name="request_latency", + source=DataSource.AGGREGATED, + axis="x", + stat="avg", + ), + MetricSpec( + name="request_throughput", + source=DataSource.AGGREGATED, + axis="y", + stat="avg", + ), + ], + label_by=None, + group_by=None, + ) + + assert spec.label_by is None + assert spec.group_by is None + + def test_empty_list_converts_to_none(self): + """Test that empty lists are converted to None.""" + spec = PlotSpec( + name="test_plot", + plot_type=PlotType.PARETO, + metrics=[ + MetricSpec( + name="request_latency", + source=DataSource.AGGREGATED, + axis="x", + stat="avg", + ), + MetricSpec( + name="request_throughput", + source=DataSource.AGGREGATED, + axis="y", + stat="avg", + ), + ], + label_by=[], + ) + + assert spec.label_by is None diff --git a/tests/unit/plot/test_swept_params.py b/tests/unit/plot/test_swept_params.py index af0423dc6..03c7c8ef6 100644 --- a/tests/unit/plot/test_swept_params.py +++ b/tests/unit/plot/test_swept_params.py @@ -13,7 +13,6 @@ import pytest from aiperf.plot.core.swept_params import ( - DEFAULT_IGNORE_PARAMS, auto_select_group_by, auto_select_label_by, detect_swept_parameters,