# CUPED Comparison: Previous vs Next Period

**Objectives:** Compare outcomes between a previous period and a next period using CUPED variance reduction while treating each `LLMScore` metric independently. The workflow builds the CUPED baseline from the previous period only, applies the adjustment to both periods, and analyzes per-case and per-metric effects with bootstrap uncertainty quantification.

In [1]:
# Cell 1 — Configuration & Library Imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
from typing import Dict
from IPython.display import display

try:
    from statsmodels.api import OLS, add_constant
except ModuleNotFoundError:
    OLS = None
    add_constant = None

# Plot settings
sns.set_theme(style='whitegrid')
plt.rcParams['figure.figsize'] = (10, 6)

# Configuration
PREVIOUS_PATH = Path('data/UnchangedEvalRuns.csv')
NEXT_PATH = Path('data/FakeEvalChangeOnlyOne.csv')
N_BOOT = 10_000
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

print('Configuration loaded:')
print(f"Previous path: {PREVIOUS_PATH}")
print(f"Next path: {NEXT_PATH}")
print(f"Bootstrap iterations: {N_BOOT}")
print(f"Random seed: {RANDOM_SEED}")

Configuration loaded:
Previous path: data\UnchangedEvalRuns.csv
Next path: data\FakeEvalChangeOnlyOne.csv
Bootstrap iterations: 10000
Random seed: 42


In [2]:
# Cell 2 — Robust File Loaders
REQUIRED_COLUMNS = ['RunId', 'TrialId', 'TestCaseId', 'LLMScore', 'Value']

def load_dataset(path: Path) -> pd.DataFrame:
    if not path.exists():
        raise FileNotFoundError(f"Input file not found: {path}")
    if path.suffix.lower() == '.csv':
        df = pd.read_csv(path)
    elif path.suffix.lower() in {'.parquet', '.pq'}:
        df = pd.read_parquet(path)
    else:
        raise ValueError(f"Unsupported file type for {path}. Use CSV or Parquet.")
    missing_cols = set(REQUIRED_COLUMNS) - set(df.columns)
    if missing_cols:
        raise ValueError(f"Dataset {path} is missing required columns: {sorted(missing_cols)}")
    # Drop TrialID explicitly as instructed (single TrialID only).
    df = df.drop(columns=['TrialId'])
    df['Value'] = pd.to_numeric(df['Value'], errors='coerce')
    if df['Value'].isna().any():
        bad_rows = df[df['Value'].isna()]
        raise ValueError(
            f"Non-numeric values detected in 'Value' column for dataset {path}. "
            f"Offending row indices (first 5): {bad_rows.index.tolist()[:5]}"
        )
    return df

previous_df = load_dataset(PREVIOUS_PATH)
next_df = load_dataset(NEXT_PATH)
print('Loaded previous:', previous_df.shape)
print('Loaded next:', next_df.shape)


Loaded previous: (1566, 4)
Loaded next: (261, 4)


In [3]:
# Cell 3 — Data Hygiene & Coverage Snapshot

def coverage_snapshot(prev: pd.DataFrame, nxt: pd.DataFrame) -> pd.DataFrame:
    summary_rows = []
    metrics = sorted(set(prev['LLMScore']).union(nxt['LLMScore']))
    for metric in metrics:
        prev_cases = set(prev.loc[prev['LLMScore'] == metric, 'TestCaseId'])
        next_cases = set(nxt.loc[nxt['LLMScore'] == metric, 'TestCaseId'])
        inter = prev_cases & next_cases
        coverage = len(inter) / len(next_cases) if next_cases else np.nan
        summary_rows.append({
            'LLMScore': metric,
            'cases_prev': len(prev_cases),
            'cases_next': len(next_cases),
            'cases_intersection': len(inter),
            'coverage_rate_next': coverage,
            'low_overlap_flag': (not np.isnan(coverage)) and (coverage < 0.7)
        })
    return pd.DataFrame(summary_rows)

prev_counts = previous_df.groupby('LLMScore').agg(
    rows=('TestCaseId', 'size'),
    unique_cases=('TestCaseId', pd.Series.nunique)
).reset_index()
next_counts = next_df.groupby('LLMScore').agg(
    rows=('TestCaseId', 'size'),
    unique_cases=('TestCaseId', pd.Series.nunique)
).reset_index()
print('Previous snapshot:')
display(prev_counts)
print('Next snapshot:')
display(next_counts)
coverage_df = coverage_snapshot(previous_df, next_df)
print('Coverage overview:')
display(coverage_df)

Previous snapshot:


Unnamed: 0,LLMScore,rows,unique_cases
0,Accuracy,522,87
1,Efficiency,522,87
2,Relevance,522,87


Next snapshot:


Unnamed: 0,LLMScore,rows,unique_cases
0,Accuracy,87,87
1,Efficiency,87,87
2,Relevance,87,87


Coverage overview:


Unnamed: 0,LLMScore,cases_prev,cases_next,cases_intersection,coverage_rate_next,low_overlap_flag
0,Accuracy,87,87,87,1.0,False
1,Efficiency,87,87,87,1.0,False
2,Relevance,87,87,87,1.0,False


In [4]:
# Cell 4 — Build CUPED Baseline (X) from Previous
X_baseline = (
    previous_df
    .groupby(['TestCaseId', 'LLMScore'], as_index=False)['Value']
    .mean()
    .rename(columns={'Value': 'X_baseline'})
)
metric_baseline_stats = (
    X_baseline.groupby('LLMScore')['X_baseline']
    .agg(['mean', 'std', 'count'])
    .rename(columns={'count': 'n_cases'})
    .reset_index()
)
print('Baseline preview:')
display(X_baseline.head())
print('Baseline statistics per metric:')
display(metric_baseline_stats)

Baseline preview:


Unnamed: 0,TestCaseId,LLMScore,X_baseline
0,1,Accuracy,4.0
1,1,Efficiency,4.0
2,1,Relevance,4.0
3,2,Accuracy,3.166667
4,2,Efficiency,2.666667


Baseline statistics per metric:


Unnamed: 0,LLMScore,mean,std,n_cases
0,Accuracy,2.183908,1.053797,87
1,Efficiency,3.310577,0.752408,87
2,Relevance,3.438697,0.680627,87


In [5]:
# Cell 5 — Estimate θ per Metric (CUPED coefficient)
prev_with_baseline = previous_df.merge(X_baseline, on=['TestCaseId', 'LLMScore'], how='left')
theta_rows = []
for metric, group in prev_with_baseline.groupby('LLMScore'):
    x = group['X_baseline']
    y = group['Value']
    x_centered = x - x.mean()
    y_centered = y - y.mean()
    var_x = x_centered.var(ddof=0)
    if np.isclose(var_x, 0):
        theta = 0.0
        warn = True
    else:
        cov = np.mean(x_centered * y_centered)
        theta = cov / var_x
        warn = False
    corr_xy = np.corrcoef(x, y)[0, 1] if len(group) > 1 else np.nan
    ols_slope = np.nan
    if OLS is not None and not np.isclose(var_x, 0):
        X = add_constant(x_centered)
        model = OLS(y_centered, X).fit()
        ols_slope = model.params[1]
    theta_rows.append({
        'LLMScore': metric,
        'theta': theta,
        'corr_XY': corr_xy,
        'var_X': var_x,
        'N_rows_prev': len(group),
        'OLS_slope': ols_slope,
        'variance_or_corr_flag': warn or (not np.isnan(corr_xy) and abs(corr_xy) < 0.05)
    })

theta_df = pd.DataFrame(theta_rows)
print('Theta estimates per metric:')
display(theta_df)

Theta estimates per metric:


  ols_slope = model.params[1]
  ols_slope = model.params[1]
  ols_slope = model.params[1]


Unnamed: 0,LLMScore,theta,corr_XY,var_X,N_rows_prev,OLS_slope,variance_or_corr_flag
0,Accuracy,1.0,0.699721,1.097723,522,1.0,False
1,Efficiency,1.0,0.809969,0.55961,522,1.0,False
2,Relevance,1.0,0.79049,0.457928,522,1.0,False


In [6]:
# Cell 6 — Apply CUPED to Previous & Next
mean_X_baseline = X_baseline.groupby('LLMScore')['X_baseline'].mean().rename('mean_X_baseline')
X_with_mean = X_baseline.merge(mean_X_baseline, on='LLMScore', how='left')

def apply_cuped(df: pd.DataFrame, label: str) -> pd.DataFrame:
    merged = df.merge(X_with_mean, on=['TestCaseId', 'LLMScore'], how='left')
    merged = merged.merge(theta_df[['LLMScore', 'theta']], on='LLMScore', how='left')
    merged['X_baseline'] = merged['X_baseline'].fillna(merged['mean_X_baseline'])
    merged['theta'] = merged['theta'].fillna(0.0)
    merged['Value_CUPED'] = merged['Value'] - merged['theta'] * (merged['X_baseline'] - merged['mean_X_baseline'])
    merged['Period'] = label
    return merged

previous_cuped = apply_cuped(previous_df, 'Previous')
next_cuped = apply_cuped(next_df, 'Next')

variance_rows = []
for metric, group in pd.concat([previous_cuped, next_cuped]).groupby('LLMScore'):
    var_raw = group['Value'].var(ddof=0)
    var_cuped = group['Value_CUPED'].var(ddof=0)
    reduction = 1 - (var_cuped / var_raw) if var_raw > 0 else np.nan
    variance_rows.append({
        'LLMScore': metric,
        'var_raw': var_raw,
        'var_cuped': var_cuped,
        'variance_reduction_pct': reduction * 100 if not np.isnan(reduction) else np.nan
    })
variance_df = pd.DataFrame(variance_rows)
print('Variance diagnostics per metric:')
display(variance_df)

Variance diagnostics per metric:


Unnamed: 0,LLMScore,var_raw,var_cuped,variance_reduction_pct
0,Accuracy,2.279321,1.118709,50.919204
1,Efficiency,0.845569,0.29457,65.163147
2,Relevance,0.737186,0.283262,61.575258


In [7]:
# Cell 7 — Per-Case & Per-Metric Comparisons (CUPED space)
all_cuped = pd.concat([previous_cuped, next_cuped], ignore_index=True)
per_case = (
    all_cuped
    .groupby(['TestCaseId', 'LLMScore', 'Period'])['Value_CUPED']
    .mean()
    .unstack('Period')
    .rename(columns={'Previous': 'Prev_CUPED_Mean', 'Next': 'Next_CUPED_Mean'})
)
per_case['Diff'] = per_case['Next_CUPED_Mean'] - per_case['Prev_CUPED_Mean']
per_case = per_case.reset_index()
print('Per-case CUPED summary:')
display(per_case.head())

def cohen_d(diff_values: np.ndarray) -> float:
    diff_values = diff_values[~np.isnan(diff_values)]
    if diff_values.size == 0:
        return np.nan
    mean_diff = diff_values.mean()
    std_diff = diff_values.std(ddof=1)
    return mean_diff / std_diff if std_diff > 0 else np.nan

metric_summary_rows = []
for metric, group in per_case.groupby('LLMScore'):
    diffs = group['Diff'].dropna().values
    prev_mean = group['Prev_CUPED_Mean'].mean()
    next_mean = group['Next_CUPED_Mean'].mean()
    percent_change = (next_mean / prev_mean - 1) if prev_mean != 0 else np.nan
    share_improved = np.mean(diffs > 0) if len(diffs) else np.nan
    metric_summary_rows.append({
        'LLMScore': metric,
        'mean_diff': np.mean(diffs) if len(diffs) else np.nan,
        'cohens_d': cohen_d(diffs),
        'percent_change': percent_change,
        'share_improved': share_improved,
        'n_cases': len(diffs)
    })
metric_summary_df = pd.DataFrame(metric_summary_rows)
print('Per-metric CUPED summary:')
display(metric_summary_df)

Per-case CUPED summary:


Period,TestCaseId,LLMScore,Next_CUPED_Mean,Prev_CUPED_Mean,Diff
0,1,Accuracy,2.580518,2.183908,0.39661
1,1,Efficiency,3.400792,3.310577,0.090215
2,1,Relevance,3.442425,3.438697,0.003727
3,2,Accuracy,2.218469,2.183908,0.034561
4,2,Efficiency,2.682404,3.310577,-0.628174


Per-metric CUPED summary:


Unnamed: 0,LLMScore,mean_diff,cohens_d,percent_change,share_improved,n_cases
0,Accuracy,-0.203914,-0.210296,-0.093371,0.413793,87
1,Efficiency,0.144672,0.270047,0.0437,0.781609,87
2,Relevance,0.188412,0.340322,0.054792,0.770115,87


In [8]:
# Cell 8 — Bootstrap Inference (clustered by TestCaseID)
from numpy.random import default_rng
rng = default_rng(RANDOM_SEED)

def cluster_bootstrap_mean(diffs: pd.Series, ids: pd.Series, n_boot: int) -> np.ndarray:
    unique_ids = ids.unique()
    diff_map = pd.Series(diffs.values, index=ids.values).to_dict()
    boot_means = np.empty(n_boot)
    for i in range(n_boot):
        sampled_ids = rng.choice(unique_ids, size=len(unique_ids), replace=True)
        sampled_values = np.array([diff_map[idx] for idx in sampled_ids])
        boot_means[i] = sampled_values.mean()
    return boot_means

bootstrap_rows = []
bootstrap_results: Dict[str, np.ndarray] = {}
for metric, group in per_case.groupby('LLMScore'):
    mask = group['Diff'].notna()
    if not mask.any():
        continue
    diffs = group.loc[mask, 'Diff']
    ids = group.loc[mask, 'TestCaseId']
    boot_means = cluster_bootstrap_mean(diffs, ids, N_BOOT)
    bootstrap_results[metric] = boot_means
    se = boot_means.std(ddof=1)
    ci_lo, ci_hi = np.percentile(boot_means, [2.5, 97.5])
    bootstrap_rows.append({
        'LLMScore': metric,
        'mean_diff': diffs.mean(),
        'SE': se,
        'CI_lo_95': ci_lo,
        'CI_hi_95': ci_hi,
        'N_cases': len(diffs)
    })
bootstrap_df = pd.DataFrame(bootstrap_rows)
print('Bootstrap summary per metric:')
display(bootstrap_df)

pooled_diffs = per_case[['TestCaseId', 'LLMScore', 'Diff']].dropna()
if not pooled_diffs.empty:
    pooled_boot_means = []
    unique_pairs = pooled_diffs[['TestCaseId', 'LLMScore']].apply(lambda x: tuple(x), axis=1).values
    diff_map = {key: val for key, val in zip(unique_pairs, pooled_diffs['Diff'].values)}
    for _ in range(N_BOOT):
        sampled_pairs = rng.choice(unique_pairs, size=len(unique_pairs), replace=True)
        pooled_boot_means.append(np.mean([diff_map[key] for key in sampled_pairs]))
    print('Overall pooled bootstrap mean diff (descriptive):')
    print(pd.Series(pooled_boot_means).describe(percentiles=[0.025, 0.5, 0.975]))


Bootstrap summary per metric:


Unnamed: 0,LLMScore,mean_diff,SE,CI_lo_95,CI_hi_95,N_cases
0,Accuracy,-0.203914,0.102973,-0.407062,-0.001659,87
1,Efficiency,0.144672,0.056575,0.0319,0.253119,87
2,Relevance,0.188412,0.058094,0.074562,0.303024,87


Overall pooled bootstrap mean diff (descriptive):
count    10000.000000
mean         0.043513
std          0.045051
min         -0.150922
2.5%        -0.044616
50%          0.043646
97.5%        0.130498
max          0.198156
dtype: float64


In [9]:
# Cell 9 — Visuals: Distribution & Effect Views
plot_dir = Path('outputs/plots')
plot_dir.mkdir(parents=True, exist_ok=True)

def plot_metric_kdes(metric: str):
    metric_data = all_cuped[all_cuped['LLMScore'] == metric]
    plt.figure()
    sns.kdeplot(data=metric_data, x='Value_CUPED', hue='Period', common_norm=False)
    plt.title(f'CUPED Value Distribution — {metric}')
    plt.xlabel('Value (CUPED)')
    plt.ylabel('Density')
    plt.tight_layout()
    path = plot_dir / f"kde_cuped_{metric}.png"
    plt.savefig(path)
    plt.close()
    return path

def plot_bootstrap_kde(metric: str, boot_means: np.ndarray):
    plt.figure()
    sns.kdeplot(boot_means, fill=True)
    plt.axvline(0, color='black', linestyle='--', label='Zero')
    plt.axvline(np.mean(boot_means), color='blue', linestyle='-', label='Mean')
    ci_lo, ci_hi = np.percentile(boot_means, [2.5, 97.5])
    plt.axvline(ci_lo, color='red', linestyle='--', label='95% CI')
    plt.axvline(ci_hi, color='red', linestyle='--')
    plt.title(f'Bootstrap Mean Diff Distribution — {metric}')
    plt.xlabel('Bootstrap Mean Diff')
    plt.ylabel('Density')
    plt.legend()
    plt.tight_layout()
    path = plot_dir / f"kde_bootstrap_{metric}.png"
    plt.savefig(path)
    plt.close()
    return path

metric_plot_paths = {}
for metric in all_cuped['LLMScore'].unique():
    metric_plot_paths[metric] = {'cuped_kde': plot_metric_kdes(metric)}
    if metric in bootstrap_results:
        metric_plot_paths[metric]['bootstrap_kde'] = plot_bootstrap_kde(metric, bootstrap_results[metric])

fig, ax = plt.subplots(figsize=(max(8, 2 * max(1, len(bootstrap_df))), 6))
bar_plot = sns.barplot(data=bootstrap_df, x='LLMScore', y='mean_diff', palette='viridis', ax=ax)
positions = ax.get_xticks()
ax.errorbar(positions, bootstrap_df['mean_diff'],
            yerr=[bootstrap_df['mean_diff'] - bootstrap_df['CI_lo_95'],
                  bootstrap_df['CI_hi_95'] - bootstrap_df['mean_diff']],
            fmt='none', c='black', capsize=5)
ax.axhline(0, color='black', linestyle='--')
ax.set_title('Mean CUPED Difference per Metric with 95% CI')
ax.set_xlabel('LLMScore')
ax.set_ylabel('Mean Diff (Next - Previous)')
fig.tight_layout()
bar_plot_path = plot_dir / 'mean_diff_bar.png'
fig.savefig(bar_plot_path)
plt.close(fig)

print('Plot files saved:')
for metric, paths in metric_plot_paths.items():
    for kind, path in paths.items():
        print(metric, kind, path)
print('Bar chart path:', bar_plot_path)

Plot files saved:
Efficiency cuped_kde outputs\plots\kde_cuped_Efficiency.png
Efficiency bootstrap_kde outputs\plots\kde_bootstrap_Efficiency.png
Relevance cuped_kde outputs\plots\kde_cuped_Relevance.png
Relevance bootstrap_kde outputs\plots\kde_bootstrap_Relevance.png
Accuracy cuped_kde outputs\plots\kde_cuped_Accuracy.png
Accuracy bootstrap_kde outputs\plots\kde_bootstrap_Accuracy.png
Bar chart path: outputs\plots\mean_diff_bar.png



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  bar_plot = sns.barplot(data=bootstrap_df, x='LLMScore', y='mean_diff', palette='viridis', ax=ax)


In [10]:
# Cell 10 — Multiple-Comparison Context (optional)
try:
    from statsmodels.stats.multitest import multipletests
    from scipy.stats import t
except ImportError:
    multipletests = None
    t = None

if multipletests is not None and t is not None and not per_case.empty:
    p_values = []
    for metric, group in per_case.groupby('LLMScore'):
        diffs = group['Diff'].dropna()
        if len(diffs) < 2:
            p_values.append(np.nan)
            continue
        mean_diff = diffs.mean()
        std_diff = diffs.std(ddof=1)
        if std_diff == 0:
            p_values.append(np.nan)
            continue
        t_stat = mean_diff / (std_diff / np.sqrt(len(diffs)))
        p_val = 2 * (1 - t.cdf(abs(t_stat), df=len(diffs) - 1))
        p_values.append(p_val)
    temp_df = bootstrap_df.copy()
    temp_df['p_value_ttest'] = p_values
    valid_mask = temp_df['p_value_ttest'].notna()
    if valid_mask.any():
        _, q_values, _, _ = multipletests(temp_df.loc[valid_mask, 'p_value_ttest'], method='fdr_bh')
        temp_df.loc[valid_mask, 'q_value_bh'] = q_values
    print('Multiple comparison table (exploratory):')
    display(temp_df)
else:
    print('Multiple comparison analysis skipped (dependencies unavailable or insufficient data).')

Multiple comparison table (exploratory):


Unnamed: 0,LLMScore,mean_diff,SE,CI_lo_95,CI_hi_95,N_cases,p_value_ttest,q_value_bh
0,Accuracy,-0.203914,0.102973,-0.407062,-0.001659,87,0.053055,0.053055
1,Efficiency,0.144672,0.056575,0.0319,0.253119,87,0.013625,0.020437
2,Relevance,0.188412,0.058094,0.074562,0.303024,87,0.002084,0.006253


In [11]:
# Cell 11 — Sensitivity Analyses
intersection_cases = set(previous_df['TestCaseId']).intersection(set(next_df['TestCaseId']))
per_case_intersection = per_case[per_case['TestCaseId'].isin(intersection_cases)].copy()
intersection_summary = (
    per_case_intersection.groupby('LLMScore')['Diff']
    .agg(mean_diff_intersection='mean', n_cases_intersection='count')
)

def winsorize(series: pd.Series, lower=0.025, upper=0.975) -> pd.Series:
    if series.isna().all():
        return series
    lower_bound = series.quantile(lower)
    upper_bound = series.quantile(upper)
    return series.clip(lower_bound, upper_bound)

per_case_winsor = per_case.copy()
per_case_winsor['Diff_winsor'] = per_case.groupby('LLMScore')['Diff'].transform(winsorize)
winsor_summary = (
    per_case_winsor.groupby('LLMScore')['Diff_winsor']
    .agg(mean_diff_winsor='mean')
)
median_summary = (
    per_case.groupby('LLMScore')['Diff']
    .agg(median_diff='median')
)

sensitivity_df = (
    metric_summary_df[['LLMScore', 'mean_diff', 'n_cases']]
    .merge(intersection_summary, on='LLMScore', how='left')
    .merge(winsor_summary, on='LLMScore', how='left')
    .merge(median_summary, on='LLMScore', how='left')
)
print('Sensitivity analyses overview:')
display(sensitivity_df)

Sensitivity analyses overview:


Unnamed: 0,LLMScore,mean_diff,n_cases,mean_diff_intersection,n_cases_intersection,mean_diff_winsor,median_diff
0,Accuracy,-0.203914,87,-0.203914,87,-0.189034,-0.149064
1,Efficiency,0.144672,87,0.144672,87,0.15809,0.178276
2,Relevance,0.188412,87,0.188412,87,0.191468,0.166726


In [12]:
# Cell 12 — Final Summary Tables & Exports
output_dir = Path('outputs')
output_dir.mkdir(exist_ok=True)
per_case_path = output_dir / 'per_case_cuped_diff.csv'
metric_summary_path = output_dir / 'metric_cuped_summary_raw.csv'
bootstrap_path = output_dir / 'metric_cuped_summary_bootstrap.csv'
theta_path = output_dir / 'theta_table.csv'
variance_path = output_dir / 'variance_reduction_by_metric.csv'
coverage_path = output_dir / 'coverage_by_metric.csv'

per_case.to_csv(per_case_path, index=False)
metric_summary_df.to_csv(metric_summary_path, index=False)
bootstrap_df.to_csv(bootstrap_path, index=False)
theta_df.to_csv(theta_path, index=False)
variance_df.to_csv(variance_path, index=False)
coverage_df.to_csv(coverage_path, index=False)

print('Exports saved:')
for path in [per_case_path, metric_summary_path, bootstrap_path, theta_path, variance_path, coverage_path]:
    print(path)
print('Plots directory:', plot_dir)

Exports saved:
outputs\per_case_cuped_diff.csv
outputs\metric_cuped_summary_raw.csv
outputs\metric_cuped_summary_bootstrap.csv
outputs\theta_table.csv
outputs\variance_reduction_by_metric.csv
outputs\coverage_by_metric.csv
Plots directory: outputs\plots


## Interpretation Guide

* **Primary metric:** The CUPED-adjusted mean difference (Next − Previous) summarizes the directional effect for each `LLMScore`.
* **Uncertainty:** 95% bootstrap confidence intervals that exclude zero indicate stronger evidence that the effect is non-zero.
* **Variance reduction:** Compare raw vs CUPED variance; larger reductions imply a more informative baseline (`corr(X, Y)` close to ±1).
* **Coverage:** Low overlap between previous and next test cases weakens comparability—consult the intersection-only sensitivity table.
* **Multiple metrics:** Use the exploratory q-values to control for multiple comparisons before declaring broad improvements.
* **Practical impact:** Review Cohen’s d, percent change, and share of improved cases to gauge real-world significance.
* **Diagnostics:** Negative variance reduction or negligible correlations highlight metrics where CUPED may not help.