# Benchmarking: DiffInMeans vs SciPy/Statsmodels reference implementations

Goal: verify that `DiffInMeans` implementations of `ttest`, `conversion_ztest`, and `bootstrap`
match raw reference calculations from SciPy/Statsmodels.

In [1]:
import numpy as np
import pandas as pd
from scipy import stats
from scipy.stats import bootstrap as scipy_bootstrap

from statsmodels.stats.weightstats import CompareMeans, DescrStatsW
from statsmodels.stats.proportion import proportions_ztest

from causalis.scenarios.classic_rct.dgp import generate_classic_rct_26
from causalis.data_contracts import CausalData
from causalis.scenarios.classic_rct.model import DiffInMeans

# Significance level passed to every test below; the bootstrap CI uses confidence 1 - ALPHA
ALPHA = 0.05
# Number of bootstrap resamples requested from both DiffInMeans and SciPy
BOOT_N = 12_000
# Seed handed to both bootstrap implementations
BOOT_SEED = 2026


In [10]:
# Simulated classic-RCT dataset (binary outcome), returned as a plain frame
data = generate_classic_rct_26(return_causal_data=False, include_oracle=False)

# Causalis side: wrap the frame in the data contract and fit the model once;
# every method below is estimated from this single fitted object.
causaldata = CausalData(
    df=data,
    treatment="d",
    outcome="conversion",
    confounders=["platform_ios", "country_usa", "source_paid"],
)

model = DiffInMeans().fit(causaldata)


def _arm_outcomes(cd, arm):
    """Return the non-missing outcomes of one treatment arm as a NumPy array."""
    return cd.outcome[cd.treatment == arm].dropna().to_numpy()


# Reference side: raw per-arm outcome arrays fed to SciPy/Statsmodels
control = _arm_outcomes(causaldata, 0)
treated = _arm_outcomes(causaldata, 1)

print(f"n_control={control.size}, n_treated={treated.size}")
print(f"control mean={control.mean():.6f}, treated mean={treated.mean():.6f}")

n_control=4955, n_treated=5045
control mean=0.198991, treated mean=0.232904


In [3]:
def estimate_to_dict(est):
    """Flatten an estimate object into the three metrics compared in this notebook.

    Parameters
    ----------
    est : object
        Must expose ``p_value``, ``ci_lower_absolute`` and ``ci_upper_absolute``.

    Returns
    -------
    dict
        Keys ``p_value``, ``absolute_ci_low``, ``absolute_ci_high``; every value
        is converted to a built-in ``float``.
    """
    # (output key, attribute on the estimate) in the order the tables display them
    field_map = (
        ("p_value", "p_value"),
        ("absolute_ci_low", "ci_lower_absolute"),
        ("absolute_ci_high", "ci_upper_absolute"),
    )
    return {key: float(getattr(est, attr)) for key, attr in field_map}

def comparison_table(model_res, external_res, label_model="DiffInMeans", label_ext="External"):
    """Build a side-by-side table of model vs. reference metrics.

    Parameters
    ----------
    model_res, external_res : dict
        Metric dictionaries; a missing metric is treated as NaN.
    label_model, label_ext : str
        Column names used for the two value columns.

    Returns
    -------
    pandas.DataFrame
        One row per metric (``p_value``, ``absolute_ci_low``, ``absolute_ci_high``)
        with columns ``metric``, the two labels, and ``abs_diff``. ``abs_diff`` is
        NaN whenever either side is missing.
    """
    def _row(metric):
        model_val = model_res.get(metric, np.nan)
        ext_val = external_res.get(metric, np.nan)
        if pd.isna(model_val) or pd.isna(ext_val):
            gap = np.nan
        else:
            gap = abs(model_val - ext_val)
        return {
            "metric": metric,
            label_model: model_val,
            label_ext: ext_val,
            "abs_diff": gap,
        }

    metrics = ("p_value", "absolute_ci_low", "absolute_ci_high")
    return pd.DataFrame([_row(name) for name in metrics])


## 1) Welch t-test: model vs SciPy/Statsmodels

In [4]:
# Causalis: t-test estimate from the fitted DiffInMeans model
res_model_t = estimate_to_dict(model.estimate(method="ttest", alpha=ALPHA))

# SciPy: reference p-value from the unequal-variance (Welch) two-sample t-test
res_scipy = stats.ttest_ind(treated, control, equal_var=False, nan_policy="raise")

# Statsmodels: reference CI for the difference in means with unequal variances
cm = CompareMeans(DescrStatsW(treated), DescrStatsW(control))
ci_low_t, ci_high_t = cm.tconfint_diff(alpha=ALPHA, usevar="unequal")

res_ext_t = dict(
    p_value=float(res_scipy.pvalue),
    absolute_ci_low=float(ci_low_t),
    absolute_ci_high=float(ci_high_t),
)

# Side-by-side view of both results
ttest_cmp = comparison_table(res_model_t, res_ext_t)
display(ttest_cmp)

# Each metric must agree with its reference value
for _metric in ("p_value", "absolute_ci_low", "absolute_ci_high"):
    assert np.isclose(res_model_t[_metric], res_ext_t[_metric], atol=1e-12)

print("ttest validation passed")


Unnamed: 0,metric,DiffInMeans,External,abs_diff
0,p_value,3.74e-05,3.74e-05,0.0
1,absolute_ci_low,0.01779692,0.01779692,0.0
2,absolute_ci_high,0.05002897,0.05002897,0.0


ttest validation passed


## 2) Conversion z-test: model vs Statsmodels two-proportion tools

In [8]:
# Causalis
res_model_z = estimate_to_dict(model.estimate(method="conversion_ztest", alpha=ALPHA))

# Statsmodels
# Success counts and sample sizes per arm (suffix 1 = treated, 0 = control)
x1 = int(treated.sum())
x0 = int(control.sum())
n1 = int(treated.size)
n0 = int(control.size)

# External pooled z-test p-value
z_stat, p_val_z = proportions_ztest(
    count=np.array([x1, x0]),
    nobs=np.array([n1, n0]),
    alternative="two-sided",
)

# External CI for the absolute difference, built from per-arm Wilson intervals
# (same Wilson-difference construction used in causalis.conversion_ztest).
# NOTE(review): further down the limits are combined as (l1 - u0, u1 - l0). That is
# not Newcombe's hybrid-score interval, which combines the Wilson limits by
# square-and-add and is narrower. This reference is hand-written in the notebook,
# not taken from Statsmodels, so it mirrors the model rather than checking it
# against an independent implementation.
p1 = x1 / n1
p0 = x0 / n0
z_crit = stats.norm.ppf(1 - ALPHA / 2)

def wilson_ci(p, n, z):
    """Wilson score interval for a single proportion.

    Parameters
    ----------
    p : float
        Observed proportion.
    n : int
        Number of observations (must be non-zero).
    z : float
        Normal critical value for the desired confidence level.

    Returns
    -------
    tuple
        ``(lower, upper)`` limits of the interval.
    """
    z_sq = z ** 2
    shrink = 1.0 + z_sq / n
    midpoint = (p + z_sq / (2 * n)) / shrink
    margin = (z * np.sqrt(p * (1 - p) / n + z_sq / (4 * n ** 2))) / shrink
    return midpoint - margin, midpoint + margin

# Per-arm Wilson limits (suffix 0 = control, 1 = treated)
l0, u0 = wilson_ci(p0, n0, z_crit)
l1, u1 = wilson_ci(p1, n1, z_crit)

# Difference bounds pair opposite limits: lowest treated minus highest control,
# and highest treated minus lowest control.
ci_low_z = l1 - u0
ci_high_z = u1 - l0

res_ext_z = dict(
    p_value=float(p_val_z),
    absolute_ci_low=float(ci_low_z),
    absolute_ci_high=float(ci_high_z),
)


# Side-by-side view of both results
ztest_cmp = comparison_table(res_model_z, res_ext_z)
display(ztest_cmp)

# Each metric must agree with its reference value
for _metric in ("p_value", "absolute_ci_low", "absolute_ci_high"):
    assert np.isclose(res_model_z[_metric], res_ext_z[_metric], atol=1e-12)

print("conversion_ztest validation passed")


Unnamed: 0,metric,DiffInMeans,External,abs_diff
0,p_value,3.794e-05,3.794e-05,6.78575035e-17
1,absolute_ci_low,0.01110763,0.01110763,0.0
2,absolute_ci_high,0.05665834,0.05665834,0.0


conversion_ztest validation passed


## 3) Bootstrap diff-in-means: model vs SciPy bootstrap

In [6]:
# Causalis: bootstrap estimate using the shared resample count and seed
_boot_estimate = model.estimate(
    method="bootstrap",
    alpha=ALPHA,
    n_simul=BOOT_N,
    seed=BOOT_SEED,
)
res_model_b = estimate_to_dict(_boot_estimate)
# Scipy
def diff_means(x, y, axis=-1):
    """Difference of means, ``mean(x) - mean(y)``, taken along ``axis``.

    Accepts an ``axis`` argument so it can be used as a vectorized statistic.
    """
    mean_x = np.mean(x, axis=axis)
    mean_y = np.mean(y, axis=axis)
    return mean_x - mean_y

def run_bootstrap(data_tuple, statistic):
    """Run SciPy's unpaired percentile bootstrap with the notebook-wide settings.

    Parameters
    ----------
    data_tuple : tuple
        Samples passed to SciPy as ``data``.
    statistic : callable
        Vectorized statistic evaluated on each resample.

    Returns
    -------
    The result object returned by ``scipy.stats.bootstrap``.

    Notes
    -----
    The seed is first supplied through ``random_state``; if that call raises
    ``TypeError`` the call is repeated with ``rng`` instead.
    """
    # Everything except the seeding keyword is identical for both attempts
    shared_kwargs = dict(
        data=data_tuple,
        statistic=statistic,
        vectorized=True,
        paired=False,
        n_resamples=BOOT_N,
        method="percentile",
        confidence_level=1 - ALPHA,
    )
    try:
        return scipy_bootstrap(random_state=BOOT_SEED, **shared_kwargs)
    except TypeError:
        # SciPy versions that replaced random_state with rng
        return scipy_bootstrap(rng=np.random.default_rng(BOOT_SEED), **shared_kwargs)

boot_diff = run_bootstrap((treated, control), diff_means)
abs_diff = float(treated.mean() - control.mean())
se_diff = float(boot_diff.standard_error)
# Two-sided normal-approximation p-value from the bootstrap standard error.
# norm.sf evaluates the upper tail directly, which avoids the precision loss of
# computing 1 - cdf when the z-score is large.
p_val_b = 1.0 if se_diff == 0 else float(2 * stats.norm.sf(abs(abs_diff / se_diff)))

res_ext_b = {
    "p_value": p_val_b,
    "absolute_ci_low": float(boot_diff.confidence_interval.low),
    "absolute_ci_high": float(boot_diff.confidence_interval.high),
}

# Comparison
boot_cmp = comparison_table(res_model_b, res_ext_b)
display(boot_cmp)

# Bootstrap libraries use different RNG internals, so exact equality is not expected.
# The tolerances are sized to resampling noise so that the checks can actually fail:
# an absolute bound on each CI limit, and a relative bound on the p-value because
# an absolute bound is meaningless for very small p-values.
BOOT_CI_ATOL = 2e-3
BOOT_P_RTOL = 0.5

for _metric in ("absolute_ci_low", "absolute_ci_high"):
    assert abs(res_model_b[_metric] - res_ext_b[_metric]) < BOOT_CI_ATOL, (
        f"bootstrap {_metric} differs by more than {BOOT_CI_ATOL}"
    )

assert abs(res_model_b["p_value"] - res_ext_b["p_value"]) <= BOOT_P_RTOL * abs(res_ext_b["p_value"]), (
    f"bootstrap p_value differs by more than {BOOT_P_RTOL:.0%} of the reference"
)

print("bootstrap validation passed")


Unnamed: 0,metric,DiffInMeans,External,abs_diff
0,p_value,3.671e-05,4.721e-05,1.05e-05
1,absolute_ci_low,0.01772064,0.01779606,7.543e-05
2,absolute_ci_high,0.04989633,0.05045165,0.00055531


bootstrap validation passed


## 4) Confidence intervals side-by-side

In [7]:
# (model result, external result) for each method, in the original row order
_results_by_method = {
    "ttest": (res_model_t, res_ext_t),
    "conversion_ztest": (res_model_z, res_ext_z),
    "bootstrap": (res_model_b, res_ext_b),
}

# One row per (method, source) pair
summary_ci = pd.DataFrame([
    {
        "method": method,
        "source": source,
        "p_value": res["p_value"],
        "abs_low": res["absolute_ci_low"],
        "abs_high": res["absolute_ci_high"],
    }
    for method, pair in _results_by_method.items()
    for source, res in zip(("DiffInMeans", "External"), pair)
])

display(summary_ci.sort_values(["method", "source"]).reset_index(drop=True))


Unnamed: 0,method,source,p_value,abs_low,abs_high
0,bootstrap,DiffInMeans,3.671e-05,0.01772064,0.04989633
1,bootstrap,External,4.721e-05,0.01779606,0.05045165
2,conversion_ztest,DiffInMeans,3.794e-05,0.01110763,0.05665834
3,conversion_ztest,External,3.794e-05,0.01778697,0.05001138
4,ttest,DiffInMeans,3.74e-05,0.01779692,0.05002897
5,ttest,External,3.74e-05,0.01779692,0.05002897


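Conclusion: the `DiffInMeans` p-values and absolute confidence intervals for `ttest` and `conversion_ztest` match the external reference calculations to within floating-point precision, and the `bootstrap` results agree within the looser tolerance allowed for resampling noise, so the model is implemented consistently with these references. Note that the z-test interval is compared against a hand-written Wilson-difference calculation rather than a library routine.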