In [2]:
import pandas as pd
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
from causalml.inference.meta import BaseDRLearner, BaseRLearner, BaseXLearner, BaseSLearner, BaseTLearner
from causalml.inference.torch import CEVAE

from cate.dataset import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def to_rank(
    primary_key: pd.Series, score: pd.Series, ascending: bool = True
) -> pd.Series:
    """Bucket rows into percentile ranks (1-100) by their position in score order.

    The two series are combined into one frame (aligned on their index),
    sorted by ``score`` and each row is assigned
    ``ceil(position / n_rows * 100)`` where ``position`` is its 1-based place
    in the sorted order.

    Args:
        primary_key: Row identifiers; becomes the index of the result. Its
            ``name`` must be set and differ from ``score.name``.
        score: Values to order by. Its ``name`` must be set.
        ascending: Sort direction. With ``True`` the smallest score receives
            the lowest rank.

    Returns:
        An integer series named ``"rank"``, indexed by ``primary_key`` and
        ordered by ``score``. Values lie in ``1..100``; an empty input yields
        an empty series.
    """
    df = pd.DataFrame({primary_key.name: primary_key, score.name: score}).set_index(
        primary_key.name, drop=True
    )
    df = df.sort_values(by=score.name, ascending=ascending)  # type: ignore
    n_rows = len(df)
    # 1-based positions keep the first row out of a stray bucket 0.
    positions = np.arange(1, n_rows + 1, dtype=np.int64)
    # Integer ceiling division: the float form `i / n * 100` can overshoot an
    # exact integer (e.g. 7 / 100 * 100 == 7.000000000000001) and then round
    # up into the wrong bucket. max(..., 1) only guards the empty-frame case.
    df["rank"] = ((positions * 100 + n_rows - 1) // max(n_rows, 1)).astype(int)
    return df["rank"]

In [4]:
# Location of the raw CSV, kept in one named place so it is easy to repoint.
CRITEO_CSV_PATH = "/workspace/data/uplift-modeling/criteo-uplift-v2.1.csv"
# Read the whole file into memory with pandas' default parsing options.
base_df = pd.read_csv(CRITEO_CSV_PATH)

In [5]:
# Column names "f0" .. "f11", generated rather than listed by hand.
feature_columns = [f"f{i}" for i in range(12)]
# Wrap the raw frame in the project's Dataset container.
# NOTE(review): the three positional lists are presumably the feature,
# outcome and treatment columns (later cells read ds.X, ds.y and ds.w) —
# confirm against cate.dataset.Dataset.
ds = Dataset(base_df, feature_columns, ["conversion"], ["treatment"])

In [6]:
# Single LightGBM classifier instance passed to every meta-learner below.
# NOTE(review): the same estimator type is also supplied in the effect-learner
# slots; confirm a classifier is appropriate there for these causalml classes.
base_model = lgb.LGBMClassifier(importance_type="gain")

# Keep each label next to its estimator so the two lists cannot drift apart.
# Insertion order is preserved, so `names` and `models` stay index-aligned.
learners_by_name = {
    "dr_learner": BaseDRLearner(base_model, base_model, base_model, base_model),
    "rlearner": BaseRLearner(base_model, base_model, base_model),
    "xlearner": BaseXLearner(base_model, base_model, base_model),
    "slearner": BaseSLearner(base_model),
    "tlearner": BaseTLearner(base_model, base_model, base_model),
    "cevae": CEVAE(),
}
names = list(learners_by_name.keys())
models = list(learners_by_name.values())

In [None]:
# Out-of-fold predictions for every learner.
# pred_dfs maps learner name -> list with one DataFrame per fold, each holding
# the positional row index ("index") and the predicted effect ("pred").
pred_dfs = {}
skf = StratifiedKFold(5, shuffle=True, random_state=42)
for name, model in zip(names, models):
    _pred_dfs = []
    # skf.split() is a generator without a length, so pass total= explicitly;
    # otherwise tqdm can only show an iteration counter ("0it"), not progress.
    for train_idx, valid_idx in tqdm(
        skf.split(np.zeros(len(ds)), ds.y), total=skf.get_n_splits(), desc=name
    ):
        train_X = ds.X.iloc[train_idx]
        train_y = ds.y.iloc[train_idx].to_numpy().reshape(-1)
        train_w = ds.w.iloc[train_idx].to_numpy().reshape(-1)
        valid_X = ds.X.iloc[valid_idx]
        # Hold-out outcome/treatment are not consumed in this loop; they are
        # still materialised so the names remain available after the cell.
        valid_y = ds.y.iloc[valid_idx].to_numpy().reshape(-1)
        valid_w = ds.w.iloc[valid_idx].to_numpy().reshape(-1)
        # Argument order is (X, treatment, y).
        model.fit(train_X, train_w, train_y)
        pred = np.asarray(model.predict(valid_X))
        # The previous `pred[:, 1]` selected a second column. With a single
        # binary treatment the meta-learners are expected to return exactly one
        # effect column, so index 1 is out of bounds. Normalise to 2-D (a 1-D
        # result becomes one column) and take the first column.
        # NOTE(review): confirm the predict() output shape for the installed
        # causalml version, in particular for CEVAE.
        cate = pred.reshape(len(valid_idx), -1)[:, 0]
        _pred_dfs.append(pd.DataFrame({"index": valid_idx, "pred": cate}))
    pred_dfs[name] = _pred_dfs

0it [00:00, ?it/s]

In [1]:
# Display the per-learner fold predictions. `pred_dfs` is only created by the
# cross-validation cell, so that cell must have finished in the current kernel
# session; on a fresh kernel this raises NameError.
pred_dfs

NameError: name 'pred_dfs' is not defined