In [10]:
import pandas as pd
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
from causalml.inference.meta import BaseDRLearner, BaseRLearner, BaseXLearner, BaseSLearner, BaseTLearner
from causalml.inference.torch import CEVAE

from cate.dataset import Dataset

In [4]:
def to_rank(
    primary_key: pd.Series, score: pd.Series, ascending: bool = True
) -> pd.Series:
    df = pd.DataFrame({primary_key.name: primary_key, score.name: score}).set_index(
        primary_key.name, drop=True
    )
    df = df.sort_values(by=score.name, ascending=ascending)  # type: ignore
    df["rank"] = np.ceil(np.arange(len(df)) / len(df) * 100).astype(int)
    return df["rank"]

In [5]:
base_df = pd.read_csv("/workspace/data/uplift-modeling/criteo-uplift-v2.1.csv")

In [6]:
ds = Dataset(
    base_df,
    ["f0", "f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8", "f9", "f10", "f11"],
    ["conversion"],
    ["treatment"],
)

In [9]:
base_model = lgb.LGBMClassifier(importance_type="gain")
names = [
    "dr_learner",
    "rlearner",
    "xlearner",
    "slearner",
    "tlearner",
    "cevae"
]
models = [
    BaseDRLearner(base_model, base_model, base_model, base_model),
    BaseRLearner(base_model, base_model, base_model),
    BaseXLearner(base_model, base_model, base_model),
    BaseSLearner(base_model),
    BaseTLearner(base_model, base_model, base_model),
    CEVAE()   
]

In [11]:
pred_dfs = {}
skf = StratifiedKFold(5, shuffle=True, random_state=42)
for name, model in zip(names, models):
    _pred_dfs = []
    for train_idx, valid_idx in tqdm(skf.split(np.zeros(len(ds)), ds.y)):
        train_X = ds.X.iloc[train_idx]
        train_y = ds.y.iloc[train_idx].to_numpy().reshape(-1)
        train_w = ds.w.iloc[train_idx]
        valid_X = ds.X.iloc[valid_idx]
        valid_y = ds.y.iloc[valid_idx].to_numpy().reshape(-1)
        valid_w = ds.w.iloc[valid_idx]
        model.fit(train_X, train_y, eval_set=[(valid_X, valid_y)])
        pred = model.predict(valid_X)
        _pred_dfs.append(pd.DataFrame({"index": valid_idx, "pred": pred[:, 1]}))  # type: ignore
    pred_dfs[name] = _pred_dfs

0it [00:00, ?it/s]

[LightGBM] [Info] Number of positive: 32619, number of negative: 11151054
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.433962 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1700
[LightGBM] [Info] Number of data points in the train set: 11183673, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.002917 -> initscore=-5.834394
[LightGBM] [Info] Start training from score -5.834394


1it [00:42, 42.69s/it]

[LightGBM] [Info] Number of positive: 32619, number of negative: 11151054
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.361926 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1708
[LightGBM] [Info] Number of data points in the train set: 11183673, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.002917 -> initscore=-5.834394
[LightGBM] [Info] Start training from score -5.834394


2it [01:05, 30.88s/it]

[LightGBM] [Info] Number of positive: 32620, number of negative: 11151054
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.526025 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1707
[LightGBM] [Info] Number of data points in the train set: 11183674, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.002917 -> initscore=-5.834364
[LightGBM] [Info] Start training from score -5.834364


3it [01:29, 27.84s/it]

[LightGBM] [Info] Number of positive: 32619, number of negative: 11151055
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.437893 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1707
[LightGBM] [Info] Number of data points in the train set: 11183674, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.002917 -> initscore=-5.834394
[LightGBM] [Info] Start training from score -5.834394


4it [01:52, 26.04s/it]

[LightGBM] [Info] Number of positive: 32619, number of negative: 11151055
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.417894 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1703
[LightGBM] [Info] Number of data points in the train set: 11183674, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.002917 -> initscore=-5.834394
[LightGBM] [Info] Start training from score -5.834394


5it [02:16, 27.36s/it]
