# Import

In [23]:
import pandas as pd
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm

from cate.dataset import Dataset
from cate.utils import PathLinker, Timer

In [24]:
# Shared timer; the cross-validation loop records its fit/predict events on it.
timer = Timer()
# Path helper for the Criteo data (presumably resolves dataset locations — confirm in cate.utils).
pathlinker = PathLinker().data.criteo

# Functions

In [25]:
def to_rank(
    primary_key: pd.Series, score: pd.Series, ascending: bool = True
) -> pd.Series:
    """Assign each row a percentile bucket (1..100) based on its score.

    Rows are sorted by ``score`` and split into 100 buckets of (near-)equal
    size: the first 1% of the sorted rows get rank 1, the last 1% get rank 100.

    Args:
        primary_key: Row identifiers; becomes the index of the result.
        score: Values to sort by. Paired with ``primary_key`` through the
            usual pandas index alignment of the two series.
        ascending: Sort direction. With ``False`` the highest scores receive
            the lowest rank numbers.

    Returns:
        Integer series named ``"rank"``, indexed by ``primary_key`` and
        ordered by the sorted score.
    """
    df = pd.DataFrame({primary_key.name: primary_key, score.name: score}).set_index(
        primary_key.name, drop=True
    )
    df = df.sort_values(by=score.name, ascending=ascending)  # type: ignore
    n_rows = len(df)
    # 1-based positions so that the first row lands in bucket 1 (not in a
    # one-row bucket 0) and the last row lands in bucket 100.
    positions = np.arange(1, n_rows + 1)
    # Integer ceiling of positions * 100 / n_rows; avoids float round-off at
    # exact bucket boundaries. max(..., 1) only guards the empty-input case.
    df["rank"] = ((positions * 100 + n_rows - 1) // max(n_rows, 1)).astype(int)
    return df["rank"]

# Read Data

In [26]:
# Load the dataset stored at the path linker's base location.
base_path = pathlinker.base
ds = Dataset.load(base_path)

In [27]:
# Out-of-fold conversion predictions from a 5-fold stratified CV.
# Each fold trains a LightGBM classifier on the other four folds and scores
# the held-out rows; the per-fold predictions are collected in pred_dfs.
pred_dfs = []
skf = StratifiedKFold(5, shuffle=True, random_state=42)
# Only the row count and the labels matter for the split, hence the dummy X.
fold_splits = skf.split(np.zeros(len(ds)), ds.y)
# tqdm wraps the split iterator (not enumerate) and is given the fold count,
# so the progress bar can show a total instead of a bare iteration counter.
for i, (train_idx, valid_idx) in enumerate(
    tqdm(fold_splits, total=skf.get_n_splits(), desc="cv fold")
):
    # ds.w is not passed to the model in this cell, so it is not sliced here.
    train_X = ds.X.iloc[train_idx]
    train_y = ds.y.iloc[train_idx].to_numpy().reshape(-1)
    valid_X = ds.X.iloc[valid_idx]
    valid_y = ds.y.iloc[valid_idx].to_numpy().reshape(-1)

    model = lgb.LGBMClassifier(importance_type="gain")
    timer.start(f"fit_{i}")
    model.fit(train_X, train_y, eval_set=[(valid_X, valid_y)])
    timer.stop(f"fit_{i}")
    timer.start(f"predict_{i}")
    pred = model.predict_proba(valid_X)
    timer.stop(f"predict_{i}")
    # Keep the positional row index next to the positive-class probability so
    # the folds can be stitched back together afterwards.
    pred_dfs.append(pd.DataFrame({"index": valid_idx, "pred": pred[:, 1]}))  # type: ignore

0it [00:00, ?it/s]

[LightGBM] [Info] Number of positive: 349, number of negative: 111487
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005626 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1680
[LightGBM] [Info] Number of data points in the train set: 111836, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.003121 -> initscore=-5.766591
[LightGBM] [Info] Start training from score -5.766591


1it [00:00,  1.46it/s]

[LightGBM] [Info] Number of positive: 350, number of negative: 111487
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004422 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1681
[LightGBM] [Info] Number of data points in the train set: 111837, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.003130 -> initscore=-5.763730
[LightGBM] [Info] Start training from score -5.763730


2it [00:01,  1.70it/s]

[LightGBM] [Info] Number of positive: 350, number of negative: 111487
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005372 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1682
[LightGBM] [Info] Number of data points in the train set: 111837, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.003130 -> initscore=-5.763730
[LightGBM] [Info] Start training from score -5.763730


3it [00:01,  1.92it/s]

[LightGBM] [Info] Number of positive: 350, number of negative: 111487
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004292 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1683
[LightGBM] [Info] Number of data points in the train set: 111837, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.003130 -> initscore=-5.763730
[LightGBM] [Info] Start training from score -5.763730


4it [00:02,  2.04it/s]

[LightGBM] [Info] Number of positive: 349, number of negative: 111488
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004223 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1685
[LightGBM] [Info] Number of data points in the train set: 111837, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.003121 -> initscore=-5.766600
[LightGBM] [Info] Start training from score -5.766600


5it [00:02,  1.96it/s]


In [28]:
# Stack the per-fold predictions into a single frame.
pred_df = pd.concat(pred_dfs, axis=0)
# Outcome and treatment columns side by side, joined on the dataset index.
output_df = pd.merge(ds.y.copy(), ds.w.copy(), left_index=True, right_index=True)

# Percentile rank of every prediction; highest scores get the lowest ranks.
rank = to_rank(pred_df["index"], pred_df["pred"], ascending=False)
pred_df = pred_df.merge(rank, left_on="index", right_index=True)
pred_df = pred_df.set_index("index", drop=True)
# Attach prediction and rank to the outcome/treatment rows.
output_df = output_df.merge(pred_df, left_index=True, right_index=True)

In [29]:
# For each rank threshold, compare the mean conversion of rows with
# treatment == 1 against the remaining rows, restricted to rows whose rank is
# at or below the threshold. A threshold that leaves either group empty
# produces NaN.
cv_list = []
tg_flg = output_df["treatment"] == 1  # does not depend on the threshold
for rank in range(100):
    rank_flg = output_df["rank"] <= rank
    treated_mean = output_df.loc[rank_flg & tg_flg, "conversion"].mean()
    other_mean = output_df.loc[rank_flg & ~tg_flg, "conversion"].mean()
    cv = treated_mean - other_mean
    cv_list.append(cv)

In [30]:
# One-column frame holding the conversion differences, one row per rank threshold.
default_df = pd.DataFrame({"default": cv_list})

In [31]:
# Disabled comparison step: load previously saved results from meta_learner.csv.
# cv_df = pd.read_csv("/workspace/outputs/meta_learner.csv", index_col = 0)

In [32]:
# cv_df["default"] = default_df

In [33]:
# cv_df.plot()

In [34]:
# Show every tenth rank threshold (positions 0, 10, ..., 90).
default_df.iloc[list(range(0, 100, 10))]

Unnamed: 0,default
0,
10,0.011357
20,0.007767
30,0.005377
40,0.004138
50,0.003407
60,0.002869
70,0.002477
80,0.002187
90,0.001954


In [37]:
# Turn the timer's event log into one duration (in seconds) per event name.
time_df = pd.DataFrame(timer.events)
category_flg = time_df["category"] == "start"
duration_df = (
    # Pair every "start" event with the non-start event of the same name.
    pd.merge(
        time_df.loc[category_flg],
        time_df.loc[~category_flg],
        on="name",
        suffixes=("_start", "_stop"),
    )
    # NOTE(review): the first paired row is discarded; the reason is not
    # visible in this notebook — confirm this is intended.
    .iloc[1:]
    # Vectorised timedelta -> seconds conversion; .assign also avoids
    # writing a new column onto an .iloc slice.
    .assign(
        duration=lambda df: (df["time_stop"] - df["time_start"]).dt.total_seconds()
    )
    .loc[:, ["name", "duration"]]
)

In [38]:
# Event names look like "<phase>_<iteration>"; split them into two columns.
name_parts = duration_df["name"].str.split("_", expand=True)
parsed_index = name_parts.rename(columns={0: "phase", 1: "iter_num"})
# Swap the raw name column for the parsed phase / iteration columns.
duration_df = duration_df.drop(columns="name").join(parsed_index, how="inner")

In [39]:
# Average duration in seconds for each phase across the recorded iterations.
duration_df = duration_df.groupby("phase")[["duration"]].mean()
duration_df

Unnamed: 0_level_0,duration
phase,Unnamed: 1_level_1
fit,0.41125
predict,0.019287
