In [1]:
import lightgbm as lgb
import numpy as np
import numpy.typing as npt
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
from sklearn.metrics import roc_auc_score

from cate.model.dataset import Dataset, split, to_rank
from cate.utils import get_logger, path_linker

In [2]:
# Load the dataset, hold out a test split, then score every training row with
# out-of-fold LightGBM predictions and turn those scores into a `rank` frame.
SEED = 42  # shared by the train/test split, the fold assignment and LightGBM
N_SPLITS = 5  # number of stratified folds used for the out-of-fold predictions

dataset_name = "test"
pathlinker = path_linker(dataset_name)
logger = get_logger("causalml")
logger.info("load dataset")

ds = Dataset.load(pathlinker.base)
train_ds, test_ds = split(ds, 1 / 3, random_state=SEED)

# Add Bias To Train Dataset Using LightGBM
# Each fold's model is fitted on the other folds and only predicts the rows it
# did not see, so every training row ends up with exactly one prediction.
_pred_dfs = []
skf = StratifiedKFold(N_SPLITS, shuffle=True, random_state=SEED)
# StratifiedKFold only needs the number of rows from its first argument, so a
# zero placeholder is passed instead of the feature matrix.
for train_idx, valid_idx in skf.split(np.zeros(len(train_ds)), train_ds.y):
    train_X = train_ds.X.iloc[train_idx]
    train_y = train_ds.y.iloc[train_idx].to_numpy().reshape(-1)
    valid_X = train_ds.X.iloc[valid_idx]
    valid_y = train_ds.y.iloc[valid_idx].to_numpy().reshape(-1)

    base_classifier = lgb.LGBMClassifier(
        importance_type="gain",
        random_state=SEED,
        force_col_wise=True,
        n_jobs=-1,
        verbosity=0,
    )
    base_classifier.fit(
        train_X, train_y, eval_set=[(valid_X, valid_y)], eval_metric="auc"
    )
    # Annotations on notebook-level names are evaluated at runtime, so the
    # dtype must exist: `np.float_` was removed in NumPy 2.0, `np.float64` is
    # the type it used to alias.
    # Column 1 of predict_proba is the probability of the positive class.
    pred: npt.NDArray[np.float64] = base_classifier.predict_proba(valid_X)[:, 1]  # type: ignore

    # Key the fold's predictions by the original row labels so they can be
    # matched back to the training rows later.
    _pred_dfs.append(
        pd.DataFrame(
            {"index": train_ds.y.index[valid_idx], "pred": pred.reshape(-1)}
        ).set_index("index")
    )
pred_df = pd.concat(_pred_dfs)
# NOTE(review): `to_rank` is project code not shown here; presumably it buckets
# the predictions into the integer `rank` column seen downstream - confirm.
rank = to_rank(pred_df.index.to_series(), pred_df["pred"]).to_frame()

INFO  2024-11-23 06:43:05 [causalml] load dataset


In [3]:
# Attach the rank computed for each training row by matching on the row index;
# merge defaults to an inner join, so only rows present in both frames are kept.
train_df = train_ds.to_pandas().merge(rank, left_index=True, right_index=True)

In [4]:
# Inspect the merged training frame: the dataset columns plus the new `rank` column.
train_df

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,treatment,conversion,visit,exposure,rank
19836,25.521311,10.059654,8.214383,4.679882,10.280525,4.115453,-3.282109,4.833815,3.971858,13.190056,5.300375,-0.168679,1,0,0,0,31
95802,12.616365,10.059654,9.048736,4.679882,10.280525,4.115453,0.294443,4.833815,3.955396,13.190056,5.300375,-0.168679,1,0,0,0,48
41413,24.526921,10.059654,8.214383,4.679882,10.280525,4.115453,-1.288207,4.833815,3.971858,13.190056,5.300375,-0.168679,1,0,0,0,22
62463,25.565241,10.059654,8.214383,4.679882,10.280525,4.115453,-9.065248,4.833815,3.971858,13.190056,5.300375,-0.168679,1,0,0,0,51
2617,12.616365,10.059654,8.934790,4.679882,10.280525,4.115453,0.294443,4.833815,3.955396,13.190056,5.300375,-0.168679,1,0,0,0,70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6265,26.335352,10.059654,8.214383,4.679882,10.280525,3.013064,-10.006574,11.803612,3.971858,13.190056,5.300375,-0.168679,1,0,0,0,27
54886,12.839715,10.059654,8.855592,0.842442,10.280525,4.115453,-13.904126,4.833815,3.899112,13.190056,5.300375,-0.168679,1,0,0,0,66
76820,14.256626,10.059654,8.311264,0.028363,11.561050,4.115453,-11.589224,4.833815,3.801757,32.395545,6.026625,-0.168679,1,0,1,1,96
860,12.616365,10.059654,8.752797,4.679882,10.280525,4.115453,0.294443,4.833815,3.915574,13.190056,5.300375,-0.168679,1,0,0,0,71


In [None]:
# Build one deliberately biased training set per rank threshold (1..100).
# For a given threshold the set keeps only
#   * rows whose treatment indicator equals 1 and whose rank is <= threshold, and
#   * rows whose treatment indicator differs from 1 and whose rank is > threshold,
# then shuffles them with a fixed seed.

# The treatment mask does not depend on the threshold, so compute it once.
# Wrapping in pd.DataFrame and reducing with .all(axis=1) yields a single
# boolean Series per row whether `w_columns` is one label or a list of labels.
# NOTE(review): Dataset's definition is not visible here - confirm which it is.
treated_flg = pd.DataFrame(train_df[ds.w_columns]).eq(1).all(axis=1)

train_ds_list: list[Dataset] = []
# The loop variable is intentionally not called `rank`: that name already holds
# the rank DataFrame merged into `train_df`, and rebinding it to an int would
# break re-running that merge cell.
for rank_threshold in range(1, 101):
    rank_flg = train_df["rank"] <= rank_threshold
    tg_train_df = train_df.loc[rank_flg & treated_flg]
    cg_train_df = train_df.loc[~rank_flg & ~treated_flg]
    localized_train_df = pd.concat([tg_train_df, cg_train_df]).sample(
        frac=1, random_state=42
    )
    localized_train_ds = Dataset(
        localized_train_df, train_ds.x_columns, train_ds.y_columns, train_ds.w_columns
    )
    train_ds_list.append(localized_train_ds)