# Import

In [3]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm

from cate.model.dataset import Dataset
from cate.utils import path_linker

In [4]:
pathlinker = path_linker("criteo")

# Functions

In [5]:
def to_rank(
    primary_key: pd.Series, score: pd.Series, ascending: bool = True
) -> pd.Series:
    df = pd.DataFrame({primary_key.name: primary_key, score.name: score}).set_index(
        primary_key.name, drop=True
    )
    df = df.sort_values(by=score.name, ascending=ascending)  # type: ignore
    df["rank"] = np.ceil(np.arange(len(df)) / len(df) * 100).astype(int)
    return df["rank"]

# Read Data

In [6]:
ds = Dataset.load(pathlinker.base)

In [7]:
pred_dfs = []
skf = StratifiedKFold(5, shuffle=True, random_state=42)
for train_idx, valid_idx in tqdm(skf.split(np.zeros(len(ds)), ds.y)):
    train_X = ds.X.iloc[train_idx]
    train_y = ds.y.iloc[train_idx].to_numpy().reshape(-1)
    train_w = ds.w.iloc[train_idx]
    valid_X = ds.X.iloc[valid_idx]
    valid_y = ds.y.iloc[valid_idx].to_numpy().reshape(-1)
    valid_w = ds.w.iloc[valid_idx]

    model = LogisticRegression()
    model.fit(train_X, train_y)
    pred = model.predict_proba(valid_X)
    pred_dfs.append(pd.DataFrame({"index": valid_idx, "pred": pred[:, 1]}))  # type: ignore

0it [00:00, ?it/s]

5it [03:49, 45.95s/it]


In [8]:
pred_df = pd.concat(pred_dfs, axis=0)
output_df = pd.merge(ds.y.copy(), ds.w.copy(), left_index=True, right_index=True)

rank = to_rank(pred_df["index"], pred_df["pred"], ascending=False)
pred_df = pd.merge(pred_df, rank, left_on="index", right_index=True).set_index(
    "index", drop=True
)
output_df = pd.merge(output_df, pred_df, left_index=True, right_index=True)

In [9]:
cv_list = []
for rank in range(100):
    rank_flg = output_df["rank"] <= rank
    tg_flg = output_df["treatment"] == 1
    cv = (
        output_df.loc[rank_flg & tg_flg, "conversion"].mean()
        - output_df.loc[rank_flg & ~tg_flg, "conversion"].mean()
    )
    cv_list.append(cv)

In [10]:
default_df = pd.Series(cv_list).to_frame()
default_df.columns = ["default"]

In [None]:
# cv_df = pd.read_csv("/workspace/outputs/meta_learner.csv", index_col = 0)

In [None]:
# cv_df["default"] = default_df

In [None]:
# cv_df.plot()

In [11]:
default_df.iloc[range(0, 100, 10), :]

Unnamed: 0,default
0,
10,0.008372
20,0.004777
30,0.003403
40,0.002648
50,0.002169
60,0.001833
70,0.001588
80,0.001403
90,0.001261


In [13]:
roc_auc_score(output_df["treatment"], output_df["pred"])

0.5065633528951979

In [14]:
roc_auc_score(output_df["conversion"], output_df["pred"])

0.942236719542004

In [17]:
for rank in range(100):
    rank_flg = output_df["rank"] <= rank

    print(f"rank {rank}: {output_df[rank_flg]['conversion'].mean()}")

rank 0: 0.0
rank 1: 0.14291539099831183
rank 2: 0.09024578671778878
rank 3: 0.06751504573330663


rank 4: 0.05428266903201808
rank 5: 0.04539471801768291
rank 6: 0.03915705742653581
rank 7: 0.03445837403890567
rank 8: 0.03076178860625483
rank 9: 0.02781036494447465
rank 10: 0.025375547225957824
rank 11: 0.023348307533835017
rank 12: 0.0216154253340582
rank 13: 0.020110633797508125
rank 14: 0.018799852642346522
rank 15: 0.017670518789530836
rank 16: 0.016659997719890823
rank 17: 0.015763312155406346
rank 18: 0.014949567365449721
rank 19: 0.014216585602398683
rank 20: 0.013557259706021526
rank 21: 0.012950507797929976
rank 22: 0.012393387635420585
rank 23: 0.011884401707208675
rank 24: 0.011416639071885424
rank 25: 0.01098171935726898
rank 26: 0.010579157420129453
rank 27: 0.010205617182718726
rank 28: 0.009857736506734561
rank 29: 0.00953335428341114
rank 30: 0.009232028208736639
rank 31: 0.008947142770776021
rank 32: 0.008684757023071575
rank 33: 0.008431120165193163
rank 34: 0.008195138207749026
rank 35: 0.007969779625732037
rank 36: 0.007757537076858453
rank 37: 0.007555993774710