# 02 Tune CATE estimators

In this notebook, we will tune the hyperparemeters for our CATE methods.

### Contents:
1. Description of estimator library  
2. Setting up  
3. Actual tuning

## 1. Description of estimator library

We will consider the following estimators:

1. S-learner:  
A. RF  
B. XGB
2. T-learner:  
A. Lasso  
B. logistic  
C. RF  
D. XGB
3. X-learner:  
A. Outcome_learner: lasso, effect_learner: lasso  
B. Outcome_learner: logistic, effect_learner: lasso  
C. Outcome_learner: RF, effect_learner: lasso  
D. Outcome_learner: XGB, effect_learner: lasso
4. R-learner:  
A. Outcome_learner: lasso, effect_learner: lasso  
B. Outcome_learner: lasso, effect_learner: XGB  
C. Outcome_learner: RF, effect_learner: lasso  
E. Outcome_learner: RF, effect_learner: RF

R-learner base learner types were chosen independently at random from {lasso, RF, XGB}

We will tune the models for the 4 outcomes: GI, cardio, hypertension, severe GI, without perturbations.

## 2. Setting up

In [1]:
# Standard imports
import numpy as np
import pandas as pd
import sys
import copy
import random
import joblib
import pickle

# Import sklearn methods
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.model_selection import StratifiedKFold
from joblib import Parallel, delayed

# Import own methods
from methods.data_processing import prepare_df, separate_vars
from methods.cate_estimator_wrappers import (SLearnerWrapper, TLearnerWrapper,
                                             XLearnerWrapper, RLearnerWrapper,
                                             CausalTreeWrapper, CausalForestWrapper)
from methods.cate_estimator_validation import make_estimator_library

Failed to import duecredit due to No module named 'duecredit'


In [2]:
# Cap BLAS/OpenMP threads to avoid oversubscription with parallel CV
import os
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"


In [3]:
# Load pre-saved analysis dataset from Analysis.ipynb
import pandas as pd
import numpy as np
from pathlib import Path

ANALYSIS_CSV = Path("data/analysis/analysis_df.csv")
if not ANALYSIS_CSV.exists():
    raise FileNotFoundError(f"Expected CSV not found at {ANALYSIS_CSV}. Run Analysis.ipynb to generate it.")

trainval_df = pd.read_csv(ANALYSIS_CSV)
print(f"Loaded analysis dataset: {trainval_df.shape[0]} rows, {trainval_df.shape[1]} columns")

# Define outcomes available in this dataset
outcomes = ["fausebal"]

# Choose a binary treatment variable from known candidates if present
candidate_treatments = ["message", "awareness", "message_fa", "fa", "billpayfa", "debitfa"]
treatment_var = "message_fa"
candidate_treatments.remove(treatment_var)

# Build a comprehensive feature set:
strat_vars = [c for c in trainval_df.columns if c.startswith("strat_")]

# low-cardinality categoricals (exclude outcomes/treatment/id and obvious non-features like 'group')
# cat_candidates = ['reminder_freq', 'reminder_infreq', 'camp_short', 'htefa', 'htebal', 'awareness', 'creditcard']
# low_card_cats = [c for c in cat_candidates if trainval_df[c].nunique(dropna=True) <= 50]
# cat_dummies = pd.get_dummies(trainval_df[low_card_cats], drop_first=True, dummy_na=False) if low_card_cats else pd.DataFrame(index=trainval_df.index)

# assemble design matrix (avoid duplicates)
# X_numeric = trainval_df[["assets", "deposits", "paymentmean", "debt", "minbal"]].copy()
X_strat = trainval_df[strat_vars].copy()
X_treat_res = trainval_df[candidate_treatments].copy()
X_design = pd.concat([X_strat, X_treat_res], axis=1)
X_design = X_design.loc[:, ~X_design.columns.duplicated()].copy()

features = list(X_design.columns)
model_df = pd.concat([X_design, trainval_df[[treatment_var] + outcomes]], axis=1)

print(f"Detected treatment_var='{treatment_var}'")
print(f"Feature matrix: {X_design.shape[1]} columns")
print(f"Outcomes to tune: {outcomes}")


Loaded analysis dataset: 108000 rows, 519 columns
Detected treatment_var='message_fa'
Feature matrix: 321 columns
Outcomes to tune: ['fausebal']


In [4]:
# Save dataset and metadata for reuse; also export trainval_data.csv with 'TREATED'
from pathlib import Path

for outcome in outcomes:
    # Paths
    OUTPUT_ANALYSIS_DIR = Path(f"output/analysis/{outcome}")
    OUTPUT_PARAMS_DIR = Path(f"output/params/{outcome}")
    OUTPUT_ANALYSIS_DIR.mkdir(parents=True, exist_ok=True)
    OUTPUT_PARAMS_DIR.mkdir(parents=True, exist_ok=True)

    IMPUTED_CSV = OUTPUT_ANALYSIS_DIR / "trainval_data.csv"
    IMPUTATION_META = OUTPUT_PARAMS_DIR / "analysis_imputation_meta.pkl"
    meta = {
        "features": features,
        "treatment_var": treatment_var,
        "outcomes": outcomes,
    }
    with open(IMPUTATION_META, 'wb') as f:
        pickle.dump(meta, f)
    
    if treatment_var != "TREATED":
        model_df.rename(columns={treatment_var: "TREATED"}, inplace=True)
    model_df.to_csv(IMPUTED_CSV, index=False)
    print(f"✓ Saved imputed dataset -> {IMPUTED_CSV}")
    print(f"✓ Saved imputation metadata -> {IMPUTATION_META}")

✓ Saved imputed dataset -> output\analysis\fausebal\trainval_data.csv
✓ Saved imputation metadata -> output\params\fausebal\analysis_imputation_meta.pkl


### 2.2. Defining parameter grids and base learners

In [5]:
cv = StratifiedKFold(n_splits = 4, shuffle = True, random_state = 405)
lasso_grid = {"alpha" : np.logspace(-5,5,500) }
logistic_grid = {"penalty" : ["l1", "l2"], 
                 "C" : np.logspace(-5,5,500)}
rf_grid = {'min_samples_leaf': [10,50,100,200,300,400,500],
           'max_depth': [3,4,5,6,7,8],
           'bootstrap': [False, True],
           'n_estimators': [100,200,300,400,500]}
xgb_grid = {'max_depth': [5,6,7,8,9,10,11,12],
            'gamma': [0, 0.1, 0.2, 0.3, 0.4],
            'subsample': [0.7, 0.75, 0.8,1],
            'reg_lambda': [100,150,200,250, 300, 350, 400],
            'n_estimators': [200, 300, 400, 500, 600, 700, 800, 900, 1000],
            'min_child_weight': [4,5,6,7,8,9,10],
            'learning_rate': [0.1,0.125,0.15,0.175,0.2,0.225,0.25]}

base_learners = {"lasso" : Lasso(),
                 "logistic" : LogisticRegression(solver = "liblinear", 
                                                 max_iter = 500),
                 "rf" : RandomForestRegressor(),
                 "xgb" : XGBRegressor(objective = "reg:squarederror", n_jobs=1, tree_method="hist")}
param_grids = {"lasso" : lasso_grid,
               "logistic" : logistic_grid,
               "rf" : rf_grid,
               "xgb" : xgb_grid}

## 3. Actual tuning

In [6]:
import os
treatment_var = 'TREATED'
for rv in outcomes:
    print("=== Getting results for " + rv + " ===")
    cols_needed = [c for c in features] + [treatment_var, rv]
    df_subset = model_df.dropna(subset=[rv]).loc[:, cols_needed].copy()
    # Subsample 10% stratified by treatment for faster tuning
    df_subset = df_subset.groupby(treatment_var, group_keys=False).apply(lambda g: g.sample(frac=0.1, random_state=405))
    from methods.data_processing import separate_vars as _separate
    X, t, y = _separate(df_subset, rv, treatment_var)
    res = make_estimator_library(X, t, y, cv, base_learners, param_grids, n_iter=200)
    tuned = {}
    for est_name, est in res.items():
        tuned[est_name] = est.get_params()
    os.makedirs("output/params", exist_ok=True)
    joblib.dump(tuned, f"output/params/{outcome}/{rv}_tuned_params.pkl")

=== Getting results for fausebal ===


  df_subset = df_subset.groupby(treatment_var, group_keys=False).apply(lambda g: g.sample(frac=0.1, random_state=405))


Tuning s_xgb
Tuning s_rf
Tuning t_lasso
Tuning t_logistic
Tuning t_rf
Tuning t_xgb
Tuning x_lasso
Tuning x_logistic
Tuning x_rf
Tuning x_xgb
Tuning r_lassolasso
Tuning r_lassoxgb
Tuning r_lassorf
Tuning r_rflasso
Tuning r_rfrf
Tuning r_rfxgb
Tuning r_xgblasso
Tuning r_xgbrf
Tuning r_xgbxgb


In [7]:
PARAMS_PATH = os.path.join('output', 'params', 'fausebal_tuned_params.pkl')

with open(PARAMS_PATH, 'rb') as f:
    tuned_params = pickle.load(f)

print(f"Loaded tuned params from: {PARAMS_PATH}")
try:
    if isinstance(tuned_params, dict):
        print("Keys:", list(tuned_params.keys())[:20])
    else:
        print("Type:", type(tuned_params))
except Exception as e:
    print("Preview failed:", e)

# tuned_params is now available for use


Loaded tuned params from: output\params\fausebal_tuned_params.pkl
Keys: ['s_xgb', 's_rf', 't_lasso', 't_logistic', 't_rf', 't_xgb', 'x_lasso', 'x_logistic', 'x_rf', 'x_xgb', 'r_lassolasso', 'r_lassoxgb', 'r_lassorf', 'r_rflasso', 'r_rfrf', 'r_rfxgb', 'r_xgblasso', 'r_xgbrf', 'r_xgbxgb']
