# 05b Rebuild Top Groups and Refit Models

This notebook rebuilds a model for the top groups selected in `03a_GI_validate_CATE_estimators.ipynb` / `04a_GI_rank_CATE_estimators.ipynb`, using the optimal ensemble configuration saved by 04b.

- Identify units that fall in the top quantile [0.8, 1] across 12 CV folds with frequency >= 0.33.
- Compute Neyman t-statistic and p-value for that subgroup.
- Retune estimators with 4-fold CV and refit on all units.


In [1]:
# Setup and imports
import os
import pickle
import warnings
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from scipy.stats import norm

from methods.causal_functions import get_subgroup_t_statistic, get_Neyman_ATE, get_subgroup_CATE_std
from methods.cate_estimator_wrappers import CATEEstimatorResults, XLearnerWrapper


# Config
outcome = "fausebal"  # adjust as needed
DATA_DIR = Path("output/analysis")
PARAMS_DIR = Path("output/params")
ANALYSIS_DIR = DATA_DIR / outcome
PARAMS_PATH = PARAMS_DIR / outcome / f"{outcome}_tuned_params.pkl"
IMPUTATION_META = PARAMS_DIR / outcome / "analysis_imputation_meta.pkl"

# NOTE(review): pickle / joblib loading can execute arbitrary code; only point
# these paths at artifacts produced by this project's own pipeline.
with open(IMPUTATION_META, 'rb') as f:
    meta = pickle.load(f)

features = meta["features"]
treatment_var = meta.get("treatment_var", "TREATED")
all_outcomes = meta.get("outcomes", [outcome])

# Load the fitted estimator libraries (03a/04a cache)
FITTED_LIBS_PATH = ANALYSIS_DIR / f"{outcome}_fitted_libraries.pkl"
fitted_libraries = joblib.load(FITTED_LIBS_PATH)

# Load optimal ensemble (estimator names, Q*, k*) from 04b
INTERMEDIATE_PATH = Path("output/intermediate/grid_search")
optimal_config_path = INTERMEDIATE_PATH / f"{outcome}_optimal_config.pkl"

optimal_config = joblib.load(optimal_config_path)
top_estimator_names = optimal_config["estimators"]
print(f"Loaded optimal ensemble from 04b: {top_estimator_names}")
print(f"  Configuration: Q*={optimal_config['Q_star']}, k*={optimal_config['k_star']}")

# Unperturbed library; every trainval unit is used as both train and validation
# index, so the resulting fits are "fold-free".
lib_pert_none = fitted_libraries["pert_none"]
n_samples_tv = len(next(iter(lib_pert_none.values())).y)
train_indices = np.arange(n_samples_tv)
val_indices = np.arange(n_samples_tv)

# Ensemble members that are actually present in the unperturbed library.
# Members that are absent cannot be refit; say so instead of skipping silently.
available_names = [name for name in top_estimator_names if name in lib_pert_none]
missing_names = [name for name in top_estimator_names if name not in lib_pert_none]
if missing_names:
    warnings.warn(
        f"Ensemble estimators missing from the 'pert_none' library and skipped: {missing_names}"
    )

# Train fold-free results and compute average ITEs on trainval.
# The cache is only reused when it holds exactly the current ensemble members;
# otherwise a changed 04b configuration would silently yield stale results.
TOP_RESULTS_PKL = ANALYSIS_DIR / f"{outcome}_05b_top_results.pkl"
top_results = None
if TOP_RESULTS_PKL.exists():
    cached_results = joblib.load(TOP_RESULTS_PKL)
    if set(cached_results) == set(available_names):
        top_results = cached_results
        print(f"Loaded {len(top_results)} top estimator results from {TOP_RESULTS_PKL}")
    else:
        print(
            f"Cached results in {TOP_RESULTS_PKL} do not match the current ensemble "
            f"(cached: {sorted(cached_results)}); refitting."
        )

if top_results is None:
    top_results = {}
    for est_name in available_names:
        est = lib_pert_none[est_name]
        top_results[est_name] = CATEEstimatorResults(
            train_indices, val_indices, est, save_metalearner=True
        )
    joblib.dump(top_results, TOP_RESULTS_PKL)
    print(f"Saved {len(top_results)} top estimator results to {TOP_RESULTS_PKL}")

# One row of ITEs per estimator; the ensemble ITE is the per-unit mean.
# With no estimators the average falls back to all zeros.
tau_stack_tv = np.vstack([res.tau for res in top_results.values()]) if len(top_results) > 0 else np.empty((0, n_samples_tv))
tau_avg_tv = tau_stack_tv.mean(axis=0) if tau_stack_tv.size > 0 else np.zeros(n_samples_tv)
print(f"Fitted {len(top_results)} top estimators on all trainval units.")

# Helper to predict on new data with fold-free fitted result
def _predict_on_new(result, estimator_wrapper, X_new):
    """Predict treatment effects for new rows with a fold-free fitted result.

    Parameters
    ----------
    result : object
        Fitted result exposing ``meta_learner`` (with a ``predict`` method) and,
        optionally, a ``_selector`` with a ``transform`` method.
    estimator_wrapper : object
        Wrapper the result was fitted from. ``estimator_wrapper.X`` supplies the
        training feature count; for ``XLearnerWrapper`` instances
        ``estimator_wrapper.t`` supplies the treatment vector.
    X_new : array-like
        Feature matrix to score; converted to a float ndarray.

    Returns
    -------
    numpy.ndarray
        Squeezed array of predictions.
    """
    Xn = np.asarray(X_new, dtype=float)
    selector = getattr(result, "_selector", None)
    if selector is not None:
        # The selector is fed a frame with positional column names x_0..x_{k-1},
        # where k is the training feature count.
        n_features = estimator_wrapper.X.shape[1]
        col_names = [f"x_{i}" for i in range(n_features)]
        try:
            X_df = pd.DataFrame(Xn, columns=pd.Index(col_names))
            # np.asarray accepts both DataFrame and ndarray transform outputs;
            # calling `.values` on an ndarray raised and skipped the selection.
            Xn = np.asarray(selector.transform(X_df), dtype=float)
        except Exception as exc:
            # Best effort is kept (fall back to the unselected matrix), but the
            # failure is surfaced rather than swallowed.
            warnings.warn(
                f"Feature selector could not be applied ({exc!r}); "
                "predicting on the unselected feature matrix.",
                stacklevel=2,
            )
    if isinstance(estimator_wrapper, XLearnerWrapper):
        # Constant propensity: the mean of the wrapper's treatment vector.
        p = np.mean(estimator_wrapper.t) * np.ones(Xn.shape[0])
        pred = result.meta_learner.predict(Xn, p=p)
    else:
        pred = result.meta_learner.predict(Xn)
    return np.asarray(pred).squeeze()


Failed to import duecredit due to No module named 'duecredit'
configuration generated by an older version of XGBoost, please export the model by calling
`Booster.save_model` from that version first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/stable/tutorials/saving_model.html

for more details about differences between saving model and serializing.

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Loaded optimal ensemble from 04b: ['x_xgb', 'causal_tree_1', 'x_rf', 't_rf', 'x_logistic']
  Configuration: Q*=Q3, k*=10
Loaded 5 top estimator results from output/analysis/fausebal/fausebal_05b_top_results.pkl
Fitted 5 top estimators on all trainval units.


In [2]:
# Build [0.9, 1] subgroup by ranking averaged ITEs on trainval
# dir_neg=False selects largest effects; True selects smallest effects
dir_neg = False
q_bot, q_top = (0.9, 1.0)

n_samples = len(tau_avg_tv)

# Load aligned trainval data to get y and t
trainval_df = pd.read_csv(ANALYSIS_DIR / "trainval_data.csv")
if len(trainval_df) != n_samples:
    # The boolean mask built from tau_avg_tv is applied to y and t row by row.
    raise ValueError(
        f"trainval_data.csv has {len(trainval_df)} rows but tau_avg_tv has {n_samples} entries"
    )
y = trainval_df[outcome].values
if 'TREATED' in trainval_df.columns:
    t = trainval_df['TREATED'].values.astype(int)
elif treatment_var in trainval_df.columns:
    t = trainval_df[treatment_var].values.astype(int)
else:
    raise KeyError("trainval data must include 'TREATED' or treatment_var column")

# Threshold by quantiles of tau_avg_tv.
# Largest effects use the band [q_bot, q_top]; smallest effects use the mirrored
# band [1 - q_top, 1 - q_bot]. (Thresholding dir_neg at quantile(q_top) selected
# every unit whenever q_top was 1.0.)
q_lo, q_hi = (1.0 - q_top, 1.0 - q_bot) if dir_neg else (q_bot, q_top)
thr_lo_tv = np.quantile(tau_avg_tv, q_lo)
thr_hi_tv = np.quantile(tau_avg_tv, q_hi)
subgroup_indicator = (tau_avg_tv >= thr_lo_tv) & (tau_avg_tv <= thr_hi_tv)
# Inner edge of the selected tail, kept under its original name.
thr_tv = thr_hi_tv if dir_neg else thr_lo_tv

ATE = get_Neyman_ATE(y[subgroup_indicator], t[subgroup_indicator]) if subgroup_indicator.any() else np.nan
# t-stat vs zero to align sign with ATE; requires both treated and control
# units inside the subgroup.
has_both_arms = (
    subgroup_indicator.any()
    and (t[subgroup_indicator].sum() > 0)
    and ((1 - t[subgroup_indicator]).sum() > 0)
)
if has_both_arms:
    # NOTE(review): get_subgroup_CATE_std is presumably the Neyman standard error
    # of the subgroup ATE — confirm in methods.causal_functions.
    CATE_std = get_subgroup_CATE_std(y, t, subgroup_indicator)
    t_stat = ATE / CATE_std if (np.isfinite(CATE_std) and CATE_std > 0) else np.nan
    # Two-sided p-value under a standard normal reference distribution.
    p_value = 2 * (1 - norm.cdf(abs(t_stat))) if np.isfinite(t_stat) else np.nan
else:
    t_stat, p_value = np.nan, np.nan

print({
    "n_samples": int(n_samples),
    "subgroup_size": int(np.asarray(subgroup_indicator, dtype=bool).sum()),
    "ATE": float(ATE) if np.isfinite(ATE) else None,
    "t_stat": float(t_stat) if np.isfinite(t_stat) else None,
})


{'n_samples': 28830, 'subgroup_size': 2883, 'ATE': 0.13813965710413545, 't_stat': 7.798796551007339}


In [3]:
# Holdout subgroup via averaged predictions from fold-free top estimators
holdout_df = pd.read_csv(ANALYSIS_DIR / "holdout_data.csv")
# Non-numeric entries are coerced to NaN and every NaN is replaced by 0.0.
# NOTE(review): confirm 0.0 matches the imputation applied to the trainval features.
X_hold = holdout_df[features].apply(pd.to_numeric, errors='coerce').fillna(0.0).values
y_hold = holdout_df[outcome].values
if 'TREATED' in holdout_df.columns:
    t_hold = holdout_df['TREATED'].values.astype(int)
elif treatment_var in holdout_df.columns:
    t_hold = holdout_df[treatment_var].values.astype(int)
else:
    raise KeyError("Holdout data must include 'TREATED' or treatment_var column")

# Predict ITEs on holdout for each fitted top estimator and average
if len(top_results) > 0:
    tau_hold_stack = [
        _predict_on_new(res, lib_pert_none[est_name], X_hold)
        for est_name, res in top_results.items()
    ]
    tau_avg_hold = np.mean(np.vstack(tau_hold_stack), axis=0)
else:
    tau_avg_hold = np.zeros(len(holdout_df))

# Threshold by quantiles of tau_avg_hold (separate holdout ranking).
# Largest effects use the band [q_bot, q_top]; smallest effects use the mirrored
# band [1 - q_top, 1 - q_bot]. (Thresholding dir_neg at quantile(q_top) selected
# every unit whenever q_top was 1.0.)
q_lo_hold, q_hi_hold = (1.0 - q_top, 1.0 - q_bot) if dir_neg else (q_bot, q_top)
thr_lo_hold = np.quantile(tau_avg_hold, q_lo_hold)
thr_hi_hold = np.quantile(tau_avg_hold, q_hi_hold)
subgroup_hold = (tau_avg_hold >= thr_lo_hold) & (tau_avg_hold <= thr_hi_hold)
# Inner edge of the selected tail, kept under its original name.
thr_hold = thr_hi_hold if dir_neg else thr_lo_hold

ATE_hold = get_Neyman_ATE(y_hold[subgroup_hold], t_hold[subgroup_hold]) if subgroup_hold.any() else np.nan
# t-stat vs zero to align sign with ATE; requires both treated and control
# units inside the subgroup.
has_both_arms_hold = (
    subgroup_hold.any()
    and (t_hold[subgroup_hold].sum() > 0)
    and ((1 - t_hold[subgroup_hold]).sum() > 0)
)
if has_both_arms_hold:
    CATE_std_hold = get_subgroup_CATE_std(y_hold, t_hold, subgroup_hold)
    t_stat_hold = ATE_hold / CATE_std_hold if (np.isfinite(CATE_std_hold) and CATE_std_hold > 0) else np.nan
    # Two-sided p-value under a standard normal reference distribution.
    p_value_hold = 2 * (1 - norm.cdf(abs(t_stat_hold))) if np.isfinite(t_stat_hold) else np.nan
else:
    t_stat_hold, p_value_hold = np.nan, np.nan

print({
    "n_holdout": int(len(holdout_df)),
    "subgroup_size_holdout": int(np.asarray(subgroup_hold, dtype=bool).sum()),
    "ATE_holdout": float(ATE_hold) if np.isfinite(ATE_hold) else None,
    "t_stat_holdout": float(t_stat_hold) if np.isfinite(t_stat_hold) else None,
})


{'n_holdout': 7208, 'subgroup_size_holdout': 721, 'ATE_holdout': -0.017937944408532636, 't_stat_holdout': -0.503470527552398}


  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


In [4]:
# Pooled stats (trainval + holdout)
# NOTE(review): the trainval part of the subgroup was selected from ITEs of
# estimators that appear to be fit on those same trainval units, so the pooled
# statistic mixes in-sample selection with the holdout check. Treat the
# holdout-only numbers as the out-of-sample evidence.
y_all = np.asarray(trainval_df[outcome].values, dtype=float)
t_all = np.asarray(
    trainval_df['TREATED'].values.astype(int) if 'TREATED' in trainval_df.columns
    else trainval_df[treatment_var].values.astype(int),
    dtype=int,
)

# Masks, outcomes and treatments are concatenated in the same order
# (trainval first, then holdout) so rows stay aligned.
subgroup_all = np.concatenate([
    np.asarray(subgroup_indicator, dtype=bool),
    np.asarray(subgroup_hold, dtype=bool),
])
y_all_pool = np.concatenate([y_all, np.asarray(y_hold, dtype=float)])
t_all_pool = np.concatenate([t_all, np.asarray(t_hold, dtype=int)])

ATE_all = get_Neyman_ATE(y_all_pool[subgroup_all], t_all_pool[subgroup_all]) if subgroup_all.any() else np.nan
# t-stat vs zero to align sign with ATE; requires both treated and control
# units inside the pooled subgroup.
has_both_arms_all = (
    subgroup_all.any()
    and (t_all_pool[subgroup_all].sum() > 0)
    and ((1 - t_all_pool[subgroup_all]).sum() > 0)
)
if has_both_arms_all:
    CATE_std_all = get_subgroup_CATE_std(y_all_pool, t_all_pool, subgroup_all)
    t_stat_all = ATE_all / CATE_std_all if (np.isfinite(CATE_std_all) and CATE_std_all > 0) else np.nan
    # Two-sided p-value under a standard normal reference distribution.
    p_value_all = 2 * (1 - norm.cdf(abs(t_stat_all))) if np.isfinite(t_stat_all) else np.nan
else:
    t_stat_all, p_value_all = np.nan, np.nan

# Non-finite values are reported as None, matching the trainval and holdout cells.
print({
    "n_all": int(len(y_all_pool)),
    "subgroup_size_all": int(np.sum(subgroup_all)),
    "ATE_all": float(ATE_all) if np.isfinite(ATE_all) else None,
    "t_stat_all": float(t_stat_all) if np.isfinite(t_stat_all) else None,
})


{'n_all': 36038, 'subgroup_size_all': 3604, 'ATE_all': 0.10690676263113891, 't_stat_all': 6.7277390592105935}
