In [1]:
import sys
sys.path.append(r"C:\Users\ASUS\Documents\GitHub\Web3CausalInference\src")

# example_att_est.py
import numpy as np

from UnifiedCausal import UnifiedCausalTester, CausalRules
from minVarCatBoostTuner import MinVarCatBoostTuner
from nAUUCCatBoostTuner import NAUUCCatBoostTuner
from NAUUCCatBoostTunerV2 import NAUUCCatBoostTunerV2
from causaldata import nhefs_complete


import pandas as pd
import os.path
import os


In [2]:
try:
    df = nhefs_complete.load_pandas().data.copy()
except Exception:
    df = nhefs_complete.load_pandas().copy()

t_col = "qsmk"
y_col = "wt82_71"
y_nc_col = None

# 协变量（可按需增减；以下为常见一组）
base_covs = [
    "sex", "race", "age", "education",
    "smokeintensity", "smokeyrs",
    "exercise", "active",
    "wt71", "ht", "bmix",  # bmix 若不存在，可用 bmi
    "alcohol", "marital"
]
# 兼容：若某些列不存在，自动剔除
covs = [c for c in base_covs if c in df.columns]
if "bmi" in df.columns and "bmix" not in df.columns:
    covs.append("bmi")

In [3]:
# --- 构造 X / T / Y ---
X_df = pd.get_dummies(df[covs], drop_first=True)   # one-hot → 全数值
X = X_df.to_numpy()
feature_names = list(X_df.columns)

T = df[t_col].to_numpy().ravel().astype(int)
Y = df[y_col].to_numpy().ravel().astype(float)
y_nc = None

In [4]:
df.shape

(1566, 67)

In [5]:
# ===== 2) 公共规则 & 估计器配置（按需调快/调严）=====
rules = CausalRules(
    smd_max=0.3, ovl_min=0.50, ks_max=0.60, ess_min=0.70,
    placebo_alpha=0.09, nc_alpha=0.10, top_k_smd=8
)

reg, clf = MinVarCatBoostTuner(verbose=0,n_trials=50).fit_return_models(X,T,Y)


[I 2025-09-18 21:48:19,620] A new study created in memory with name: no-name-46599549-b82a-40a6-9756-c2d959100ceb
[I 2025-09-18 21:48:21,999] Trial 0 finished with value: 0.703232279306612 and parameters: {'iterations': 305, 'od_wait': 161, 'reg_depth': 5, 'clf_depth': 4, 'reg_loss': 'RMSE', 'reg_lr': 0.07779618980323089, 'reg_l2': 0.005301039627227607, 'reg_subsample': 0.829959672010499, 'clf_lr': 0.001936945130623209, 'clf_l2': 0.06847505363786201, 'clf_subsample': 0.7627793224147521, 'clf_class_wt': None}. Best is trial 0 with value: 0.703232279306612.
[I 2025-09-18 21:48:33,342] Trial 1 finished with value: 471.3419491676873 and parameters: {'iterations': 387, 'od_wait': 137, 'reg_depth': 9, 'clf_depth': 6, 'reg_loss': 'RMSE', 'reg_lr': 0.0032951579039732074, 'reg_l2': 0.2723893685109325, 'reg_subsample': 0.8649138839074996, 'clf_lr': 0.07809172459103658, 'clf_l2': 0.03078800562781488, 'clf_subsample': 0.7751133666190527, 'clf_class_wt': 'SqrtBalanced'}. Best is trial 0 with value:

In [6]:
common_kwargs = dict(
    n_splits=5,
    trim=0.01,
    ps_clip=(0.05, 0.95),     # PS 裁剪，稳健
    weight_clip=10.0,         # 权重裁剪，稳健
    n_jobs=-1,                 # 单测/示例下设 1；真跑可以开大
    n_jobs_placebo=-1,
    random_state=2025,
    verbose=0,
    rules=rules,
    regressor=reg,
    classifier=clf
)


# ===== 3) ATE =====
print("\n=== ATE on full sample ===")
ate = UnifiedCausalTester(estimand="ATE", **common_kwargs)
ate.fit(X, T, Y, X_names=covs, y_nc=None, placebo_runs=10)
print(ate.report())



=== ATE on full sample ===


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

【ATE 可信度诊断报告（并行 + tqdm）】
- 点估计 = 3.353088, 95%CI = [2.593328, 4.112849]
- 重叠性: OVL=0.836 (阈值≥0.5) ✅; KS=0.170 (阈值≤0.6) ✅; 权重尾部: {'w1_p99': 4.478666418246574, 'w0_p99': 2.2026115722590585}
- 平衡性: |SMD|_max=0.124 (阈值≤0.3) ✅；Top8 失衡特征：
    · sex: 0.124
    · active: -0.111
    · exercise: -0.103
    · race: -0.101
    · smokeintensity: 0.076
    · education: 0.066
    · wt71: -0.061
    · age: 0.032
- 安慰剂置换: p=0.091 (应≥0.09) ✅
- ForestDR 排序能力（overlap e∈[0.30,0.70]，OOF） 覆盖率=83.91%，n=1314
    · nAUUC=0.008 (area_model=39.89, area_oracle=5147.62)
    · Policy@10/20/30 = 1000.25 / 1198.54 / 1283.78
- 各项通过: {'overlap_pass': True, 'balance_pass': True, 'placebo_pass': True, 'negctrl_pass': True}  => 结论：可信（通过）


In [7]:
# 取得 OOF 倾向分数，后面做重叠带
e_oof = ate.result_["diag"]["e"]

# ===== 6) “重叠带” ATT（可选，更稳的口径）=====
print("\n=== ATT on overlap band e∈[0.3,0.7] (recommended when overlap is weak) ===")
band = (e_oof >= 0.3) & (e_oof <= 0.7)
print(f"Overlap-band coverage (treated kept): "
    f"{int((T[band]==1).sum())}/{int((T==1).sum())} "
    f"= {(T[band]==1).sum()/(T==1).sum():.1%}")



=== ATT on overlap band e∈[0.3,0.7] (recommended when overlap is weak) ===
Overlap-band coverage (treated kept): 362/403 = 89.8%


In [8]:
# ===== 6) “重叠带” ATT（可选，更稳的口径）=====
print("\n=== ATT on overlap band e∈[0.3,0.7] (recommended when overlap is weak) ===")
band = (e_oof >= 0.3) & (e_oof <= 0.7)
print(f"Overlap-band coverage (treated kept): "
    f"{int((T[band]==1).sum())}/{int((T==1).sum())} "
    f"= {(T[band]==1).sum()/(T==1).sum():.1%}")

att_band = UnifiedCausalTester(estimand="ATT", **common_kwargs)
att_band.fit(X[band], T[band], Y[band], X_names=feature_names, y_nc=y_nc, placebo_runs=10)
print(att_band.report())



=== ATT on overlap band e∈[0.3,0.7] (recommended when overlap is weak) ===
Overlap-band coverage (treated kept): 362/403 = 89.8%


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

【ATT 可信度诊断报告（并行 + tqdm）】
- 点估计 = 3.383176, 95%CI = [2.201528, 4.564823]
- 重叠性: OVL=0.856 (阈值≥0.5) ✅; KS=0.164 (阈值≤0.6) ✅; 权重尾部: {'w_control_p99': 1.242994043189571, 'w_control_max': 1.5101231177308427}
- 平衡性: |SMD|_max=0.116 (阈值≤0.3) ✅；Top8 失衡特征：
    · age: 0.116
    · race_1: -0.107
    · education_5: 0.095
    · exercise_2: 0.089
    · wt71: 0.086
    · active_2: 0.084
    · smokeintensity: -0.069
    · sex_1: -0.067
- 安慰剂置换: p=0.091 (应≥0.09) ✅
- ForestDR 排序能力（overlap e∈[0.30,0.70]，OOF） 覆盖率=89.65%，n=1178
    · nAUUC=0.000 (area_model=-28.75, area_oracle=4633.60)
    · Policy@10/20/30 = 909.18 / 1140.00 / 1403.37
- 各项通过: {'overlap_pass': True, 'balance_pass': True, 'placebo_pass': True, 'negctrl_pass': True}  => 结论：可信（通过）
