- [Reference](https://causalml.readthedocs.io/en/latest/examples/uplift_trees_with_synthetic_data.html)

In [140]:
import numpy as np
import pandas as pd

from causalml.dataset import make_uplift_classification
from causalml.inference.tree import UpliftRandomForestClassifier, UpliftTreeClassifier
from causalml.inference.meta import BaseSClassifier, BaseTClassifier, BaseXClassifier, BaseRClassifier
from causalml.inference.meta import BaseSRegressor, BaseTRegressor, BaseXRegressor, BaseRRegressor
from xgboost import XGBClassifier, XGBRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from causalml.metrics import plot_gain

from sklearn.model_selection import train_test_split
from collections import Counter

from sklift.metrics.metrics import uplift_at_k, uplift_auc_score
from causalml.feature_selection.filters import FilterSelect

In [115]:
import importlib
print(importlib.metadata.version('causalml') )

0.14.1


In [116]:
def eval_uplift_metrics(df):
    metrics = {}
    for k in [0.01, 0.05, 0.1, 0.25]:
        res = uplift_at_k(df_res['label'].values, df_res['uplift_score'].values, df_res['treatment'].values, k=k, strategy='overall')
        metrics[f'uplift at {k}'] = [res]
    metrics['AUUC'] = [uplift_auc_score(df_res['label'].values, df_res['uplift_score'].values, df_res['treatment'].values)]
    print(metrics)
        
    return pd.DataFrame.from_dict(metrics)

In [117]:
df, x_names = make_uplift_classification(
    n_samples=600000,
    treatment_name=["control", "treatment1"],
    positive_class_proportion=0.02
)
df

Unnamed: 0,treatment_group_key,x1_informative,x2_informative,x3_informative,x4_informative,x5_informative,x6_irrelevant,x7_irrelevant,x8_irrelevant,x9_irrelevant,x10_irrelevant,x11_uplift_increase,x12_uplift_increase,x13_increase_mix,conversion,treatment_effect
0,control,2.722964,-1.419725,3.902724,-0.384170,1.420031,-0.145567,0.511521,-1.209821,-0.937366,-0.554441,-1.019075,-1.272189,-0.188358,0,0
1,treatment1,0.886757,-0.363093,2.156697,-1.201412,1.773541,-2.123047,0.470222,0.778719,-0.333533,-0.332421,-1.297720,-0.804984,-0.327228,1,0
2,control,-0.005157,-0.364372,0.818738,-1.643242,0.795895,-1.330291,0.171316,0.399428,0.557229,-2.185931,-1.312991,-0.940672,-0.641473,0,0
3,control,1.828774,-0.690822,1.071026,0.222805,0.047887,0.667757,-0.062486,2.004395,-0.146797,0.782772,-0.907016,-0.511760,0.106698,0,0
4,control,0.863603,-2.447405,-0.379929,-1.077776,-2.428298,0.468931,-1.082411,-0.662870,-1.020725,0.052052,-1.420372,-1.386674,-0.728867,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1199995,control,0.462160,-0.790999,0.345238,-2.510470,0.096276,-0.421138,0.080578,-2.074361,1.384840,-1.107157,0.137803,-0.250261,-0.055338,0,0
1199996,treatment1,0.998424,-2.505187,0.463547,-1.539096,-0.847681,-0.431783,0.144410,-0.425668,1.503657,0.020205,-0.765569,-1.076597,-0.484296,0,0
1199997,control,1.585948,-0.727498,1.904890,0.278774,-0.174085,0.342130,-0.268581,-0.419666,-0.382512,-0.768071,-1.696507,-1.583233,-0.682925,0,0
1199998,treatment1,0.462929,-0.091441,0.752130,-1.668240,-1.902938,-0.911969,0.109936,0.577629,-0.013583,0.538694,-0.253276,-0.291328,-0.083095,0,0


In [118]:
feature_list = x_names
label_name = 'conversion'

In [119]:
# Look at the conversion rate and sample size in each group
df.pivot_table(values='conversion',
               index='treatment_group_key',
               aggfunc=[np.mean, np.size],
               margins=True)

Unnamed: 0_level_0,mean,size
Unnamed: 0_level_1,conversion,conversion
treatment_group_key,Unnamed: 1_level_2,Unnamed: 2_level_2
control,0.024677,600000
treatment1,0.049385,600000
All,0.037031,1200000


In [120]:
filter_f = FilterSelect()
method = 'F'
f_imp = filter_f.get_importance(df, feature_list, label_name, method,
                      treatment_group='treatment1')
f_imp

Unnamed: 0,method,feature,rank,score,p_value,misc
0,F filter,x12_uplift_increase,1.0,58491.561612,0.0,"df_num: 1.0, df_denom: 1199996.0, order:1"
0,F filter,x13_increase_mix,2.0,35964.895568,0.0,"df_num: 1.0, df_denom: 1199996.0, order:1"
0,F filter,x1_informative,3.0,2.456965,0.117005,"df_num: 1.0, df_denom: 1199996.0, order:1"
0,F filter,x6_irrelevant,4.0,0.980282,0.322129,"df_num: 1.0, df_denom: 1199996.0, order:1"
0,F filter,x7_irrelevant,5.0,0.926216,0.335848,"df_num: 1.0, df_denom: 1199996.0, order:1"
0,F filter,x2_informative,6.0,0.506607,0.476612,"df_num: 1.0, df_denom: 1199996.0, order:1"
0,F filter,x4_informative,7.0,0.366739,0.544787,"df_num: 1.0, df_denom: 1199996.0, order:1"
0,F filter,x8_irrelevant,8.0,0.301958,0.582658,"df_num: 1.0, df_denom: 1199996.0, order:1"
0,F filter,x11_uplift_increase,9.0,0.234764,0.628013,"df_num: 1.0, df_denom: 1199996.0, order:1"
0,F filter,x10_irrelevant,10.0,0.227394,0.633463,"df_num: 1.0, df_denom: 1199996.0, order:1"


In [121]:
# Split data to training and testing samples for model validation (next section)
df_train, df_test = train_test_split(df, test_size=0.2, random_state=111)

# Uplift tree

In [122]:
clf = UpliftTreeClassifier(control_name='control')
clf.fit(df_train[feature_list].values,
         treatment=df_train['treatment_group_key'].values,
         y=df_train['conversion'].values)
p = clf.predict(df_test[feature_list].values)

In [123]:
df_res = pd.DataFrame(p, columns=clf.classes_)
df_res.head()

Unnamed: 0,control,treatment1
0,0.041462,0.047549
1,0.041462,0.047549
2,0.007737,0.012729
3,0.041462,0.047549
4,0.041462,0.047549


In [124]:
df_res['uplift_score'] = df_res['treatment1'] - df_res['control']
df_res['label'] = df['conversion']
df_res['treatment'] = (df['treatment_group_key'] != 'control').astype(int)

In [125]:
eval_uplift_metrics(df_res)

{'uplift at 0.01': [0.024723785699395453], 'uplift at 0.05': [0.023399651313170112], 'uplift at 0.1': [0.02437042275927236], 'uplift at 0.25': [0.023516883276439065], 'AUUC': [-0.00044765350934257816]}


Unnamed: 0,uplift at 0.01,uplift at 0.05,uplift at 0.1,uplift at 0.25,AUUC
0,0.024724,0.0234,0.02437,0.023517,-0.000448


# Uplift random forest

In [126]:
uplift_model = UpliftRandomForestClassifier(control_name='control')

In [127]:
uplift_model.fit(df_train[feature_list].values,
                 treatment=df_train['treatment_group_key'].values,
                 y=df_train['conversion'].values)

In [128]:
df_res = uplift_model.predict(df_test[feature_list].values, full_output=True)
df_res['uplift_score'] = df_res['delta_treatment1']
df_res['label'] = df['conversion']
df_res['treatment'] = (df['treatment_group_key'] != 'control').astype(int)

In [129]:
eval_uplift_metrics(df_res)

{'uplift at 0.01': [0.016288808182750313], 'uplift at 0.05': [0.02361904457137981], 'uplift at 0.1': [0.025138659973829566], 'uplift at 0.25': [0.02546807003985208], 'AUUC': [-2.0255753617408314e-05]}


Unnamed: 0,uplift at 0.01,uplift at 0.05,uplift at 0.1,uplift at 0.25,AUUC
0,0.016289,0.023619,0.025139,0.025468,-2e-05


# X-Learner

In [130]:
learner_x = BaseXClassifier(
    control_outcome_learner=XGBClassifier(),
    treatment_outcome_learner=XGBClassifier(),
    control_effect_learner=XGBRegressor(),
    treatment_effect_learner=XGBRegressor(),
    control_name='control')

In [141]:
# Using linear model
learner_x = BaseXClassifier(
    control_outcome_learner=LogisticRegression(),
    treatment_outcome_learner=LogisticRegression(),
    control_effect_learner=LinearRegression(),
    treatment_effect_learner=LinearRegression(),
    control_name='control')

In [142]:
learner_x.fit(df_train[feature_list].values,
                 treatment=df_train['treatment_group_key'].values,
                 y=df_train['conversion'].values) 

In [132]:
df_res = df_test.copy()
df_res['uplift_score'] = learner_x.predict(df_test[feature_list].values)
df_res['label'] = df['conversion']
df_res['treatment'] = (df['treatment_group_key'] != 'control').astype(int)

In [133]:
eval_uplift_metrics(df_res)

{'uplift at 0.01': [0.9859544317866075], 'uplift at 0.05': [0.4059591862522429], 'uplift at 0.1': [0.20431218979507265], 'uplift at 0.25': [0.08326120567460112], 'AUUC': [0.03829576416188043]}


Unnamed: 0,uplift at 0.01,uplift at 0.05,uplift at 0.1,uplift at 0.25,AUUC
0,0.985954,0.405959,0.204312,0.083261,0.038296


# T-Learner

In [134]:
from sklearn.linear_model import LinearRegression, LogisticRegression
learner_t = BaseTClassifier(XGBClassifier(), control_name='control')
learner_t.fit(df_train[feature_list].values,
                 treatment=df_train['treatment_group_key'].values,
                 y=df_train['conversion'].values) 

In [135]:
df_res = df_test.copy()
df_res['uplift_score'] = learner_t.predict(df_test[feature_list].values)
df_res['label'] = df['conversion']
df_res['treatment'] = (df['treatment_group_key'] != 'control').astype(int)

In [136]:
eval_uplift_metrics(df_res)

{'uplift at 0.01': [0.9900354388159266], 'uplift at 0.05': [0.3969662539635865], 'uplift at 0.1': [0.20235265985415213], 'uplift at 0.25': [0.0836722876326308], 'AUUC': [0.03745400699687713]}


Unnamed: 0,uplift at 0.01,uplift at 0.05,uplift at 0.1,uplift at 0.25,AUUC
0,0.990035,0.396966,0.202353,0.083672,0.037454


# S-Learner

In [137]:
learner_s = BaseSClassifier(XGBClassifier(), control_name='control')
learner_s.fit(df_train[feature_list].values,
                 treatment=df_train['treatment_group_key'].values,
                 y=df_train['conversion'].values) 

In [138]:
df_res = df_test.copy()
df_res['uplift_score'] = learner_s.predict(df_test[feature_list].values)
df_res['label'] = df['conversion']
df_res['treatment'] = (df['treatment_group_key'] != 'control').astype(int)

In [139]:
eval_uplift_metrics(df_res)

{'uplift at 0.01': [0.9884024620797612], 'uplift at 0.05': [0.4053435360704811], 'uplift at 0.1': [0.20665957438355356], 'uplift at 0.25': [0.08495129449485789], 'AUUC': [0.038380473062911535]}


Unnamed: 0,uplift at 0.01,uplift at 0.05,uplift at 0.1,uplift at 0.25,AUUC
0,0.988402,0.405344,0.20666,0.084951,0.03838
