In [1]:
import pandas as pd 
from sklearn.impute import SimpleImputer
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn import preprocessing
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
from sklift.models import ClassTransformation, TwoModels

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
%matplotlib inline

In [9]:
def age_range(values):
    """Map an age to an ordinal age-group code.

    Buckets (upper bound inclusive):
        (0, 17]  -> 0
        (17, 30] -> 1
        (30, 45] -> 2
        (45, 60] -> 3
        (60, 90] -> 4
    Every other value (non-positive, above 90, or NaN) gets the
    catch-all code 5.
    """
    # Non-positive ages and NaN (any comparison with NaN is False) fall
    # through to the catch-all bucket.
    if not values > 0:
        return 5
    for group, upper_bound in enumerate((17, 30, 45, 60, 90)):
        if values <= upper_bound:
            return group
    return 5
    
def summary_f1_uplift(X_val,X_test,y_val,y_test,model):
    """Print binary F1 scores for an uplift model on the validation and test splits.

    The model's ``predict`` output is binarised first: values ``<= 0``
    become class 0, strictly positive values become class 1.  The
    resulting labels are scored against ``y_val`` / ``y_test`` with
    ``f1_score(..., average='binary')``.  Nothing is returned.

    NOTE(review): this compares the sign of the predicted uplift with the
    raw outcome, which is not a standard uplift metric -- consider
    reporting uplift@k or Qini AUC as well.
    """
    # Both prediction passes run before anything is printed.
    binarised = [
        np.where(model.predict(features) <= 0, 0, 1)
        for features in (X_val, X_test)
    ]
    reports = (
        ('f1 score for validation set {}', y_val, binarised[0]),
        ('f1 score for test set {}', y_test, binarised[1]),
    )
    for template, truth, predicted in reports:
        print(template.format(f1_score(truth, predicted, average='binary')))
    
def summary_f1(X_val,X_test,y_val,y_test,model):
    """Print binary F1 scores for a classifier on the validation and test splits.

    ``model.predict`` is called on ``X_val`` and ``X_test`` and its output
    is scored as-is against ``y_val`` / ``y_test`` with
    ``f1_score(..., average='binary')``.  Nothing is returned.
    """
    # Both prediction passes run before anything is printed.
    val_labels, test_labels = (model.predict(features) for features in (X_val, X_test))

    val_score = f1_score(y_val, val_labels, average='binary')
    print('f1 score for validation set {}'.format(val_score))

    test_score = f1_score(y_test, test_labels, average='binary')
    print('f1 score for test set {}'.format(test_score))
    
def data_preprocessing(df_feature, df_products, df_store):
    """Impute missing values and label-encode the id / gender columns.

    Steps, in order:
      1. float64 columns: missing values replaced by the column median.
      2. object columns (all but the first one): missing values replaced
         by the most frequent value.
      3. ``gender``: the value ``'U'`` is treated as missing and replaced
         by the most frequent value.
      4. Product-id columns, store-id columns and ``gender`` are replaced
         by integer codes from ``LabelEncoder``.

    Parameters
    ----------
    df_feature : pandas.DataFrame
        Feature table.  Must contain ``gender`` and the product / store id
        columns listed in the body.
    df_products : pandas.DataFrame
        Its ``product_id`` column defines the vocabulary used to encode
        the product-id feature columns.
    df_store : pandas.DataFrame
        Its ``store_id`` column defines the vocabulary used to encode the
        store-id feature columns.

    Returns
    -------
    pandas.DataFrame
        A processed copy of ``df_feature``; the input frame is not
        modified.

    Raises
    ------
    KeyError
        If an expected column is absent.
    ValueError
        Presumably raised by ``LabelEncoder.transform`` when a feature
        column holds an id missing from the product / store vocabulary
        -- see the scikit-learn docs.
    """
    # Work on a copy so the caller's frame is left untouched.
    df_feature = df_feature.copy()

    # Split columns by dtype.  The first object column is skipped --
    # presumably the ``client_id`` key, which must not be imputed
    # (TODO confirm it is always the first object column).
    categorical_columns = list(df_feature.select_dtypes(include=['object']).columns)[1:]
    numeric_columns = list(df_feature.select_dtypes(include=['float64']).columns)

    # Numeric (float64) gaps -> column median.  Skipped when there is no
    # such column, because the imputer rejects an empty selection.
    if numeric_columns:
        imp = SimpleImputer(missing_values=np.nan, strategy='median')
        df_feature[numeric_columns] = imp.fit_transform(df_feature[numeric_columns])

    # Categorical gaps -> most frequent value of the column.
    if categorical_columns:
        imp2 = SimpleImputer(strategy="most_frequent")
        df_feature[categorical_columns] = imp2.fit_transform(df_feature[categorical_columns])

    # Gender 'U' is treated as missing -> most frequent gender.
    # The imputer returns an (n, 1) array; flatten it before assigning it
    # to a single column.
    imp3 = SimpleImputer(strategy="most_frequent", missing_values='U')
    df_feature['gender'] = imp3.fit_transform(
        df_feature['gender'].values.reshape(-1, 1)
    ).ravel()

    # Label encoders.  The product vocabulary comes from the
    # ``df_products`` argument (the original read a notebook-level global
    # instead, leaving the parameter unused).
    le_prod = preprocessing.LabelEncoder().fit(df_products['product_id'])
    le_store = preprocessing.LabelEncoder().fit(df_store['store_id'])
    le_gender = preprocessing.LabelEncoder().fit(df_feature['gender'])

    # Product-id columns share the product vocabulary.
    product_columns = [
        'm_1_prd_qty_val', 'm_11_prd_qty_val', 'm_12_prd_qty_val', 'm_2_prd_qty_val', 'm_3_prd_qty_val',
        'm_1_prd_pur', 'm_11_prd_pur', 'm_12_prd_pur', 'm_2_prd_pur', 'm_3_prd_pur',
    ]
    for col in product_columns:
        df_feature[col] = le_prod.transform(df_feature[col])

    # Store-id columns share the store vocabulary.
    store_columns = [
        'm_1_str_top_qty', 'm_11_str_top_qty', 'm_12_str_top_qty', 'm_2_str_top_qty', 'm_3_str_top_qty',
        'm_1_str_top_pur', 'm_11_str_top_pur', 'm_12_str_top_pur', 'm_2_str_top_pur', 'm_3_str_top_pur',
    ]
    for col in store_columns:
        df_feature[col] = le_store.transform(df_feature[col])

    df_feature['gender'] = le_gender.transform(df_feature['gender'])

    return df_feature

In [3]:
# Feature table: read it, drop the two date columns, then bucket the raw
# age into the ordinal groups defined by age_range.
df_feature = pd.read_csv(
    '../../data/feature_stg1.csv/part-00000-13c76568-33bb-4aaa-bb62-1bbeaf144bcb-c000.csv'
).drop(columns=['first_issue_date', 'first_redeem_date'])
df_feature['age_group'] = df_feature['age'].apply(age_range)

# Lookup tables: store list and product catalogue.
df_store = pd.read_csv('../../data/unique_store.csv/part-00000-d7453c53-4a48-4ff3-85f3-1e665ea60e07-c000.csv')
df_product = pd.read_csv('../../data/data/products.csv')

In [4]:
# Impute and label-encode the feature table.  The result is bound back to
# df_feature, so re-running this cell on its own feeds already-encoded
# data into the encoders again -- re-run the loading cell first.
df_feature = data_preprocessing(
    df_feature=df_feature,
    df_products=df_product,
    df_store=df_store,
)

In [5]:
# Preview the first rows of the preprocessed feature table.
df_feature.head()

Unnamed: 0,client_id,treatment_flg,target,age,gender,m_1_rpr,m_11_rpr,m_12_rpr,m_2_rpr,m_3_rpr,m_1_eps,m_11_eps,m_12_eps,m_2_eps,m_3_eps,m_1_ps,m_11_ps,m_12_ps,m_2_ps,m_3_ps,m_1_rps,m_11_rps,m_12_rps,m_2_rps,m_3_rps,m_1_prd_qty_val,m_11_prd_qty_val,m_12_prd_qty_val,m_2_prd_qty_val,m_3_prd_qty_val,m_1_prd_pur,m_11_prd_pur,m_12_prd_pur,m_2_prd_pur,m_3_prd_pur,m_1_prd_pur_qty,m_11_prd_pur_qty,m_12_prd_pur_qty,m_2_prd_pur_qty,m_3_prd_pur_qty,m_1_prd_pur_sum,m_11_prd_pur_sum,m_12_prd_pur_sum,m_2_prd_pur_sum,m_3_prd_pur_sum,m_1_str_top_qty,m_11_str_top_qty,m_12_str_top_qty,m_2_str_top_qty,m_3_str_top_qty,m_1_str_top_pur,m_11_str_top_pur,m_12_str_top_pur,m_2_str_top_pur,m_3_str_top_pur,m_1_str_top_qty_val,m_11_str_top_qty_val,m_12_str_top_qty_val,m_2_str_top_qty_val,m_3_str_top_qty_val,m_1_str_top_pur_val,m_11_str_top_pur_val,m_12_str_top_pur_val,m_2_str_top_pur_val,m_3_str_top_pur_val,age_group
0,00f6cab0d9,0,1,48,0,17.6,4.4,26.5,11.1,13.2,0.0,0.0,0.0,0.0,0.0,2533.37,698.865,2844.54,1360.99,1732.33,0.0,0.0,0.0,0.0,0.0,10810,10810,10810,10810,10810,10810,10810,10810,10810,10810,5.0,1.0,4.0,3.0,3.0,2533.37,537.63,2844.54,1360.99,1601.13,6022,12871,6022,6022,6022,6022,12871,6022,6022,6022,83.0,57.0,83.0,83.0,83.0,1811.24,618.27,2844.54,1360.99,1732.33,3
1,010c5002de,1,1,27,1,3.2,3.0,8.6,5.4,3.5,-30.0,0.0,0.0,-30.0,0.0,970.41,307.56,1537.7,1163.03,728.48,-24.0,0.0,0.0,-6.0,0.0,29983,13763,37180,13783,25083,29983,13763,26353,13783,17964,1.0,1.0,2.0,2.0,1.0,429.96,307.56,588.62,619.36,318.85,6960,6960,6960,6960,6960,6960,6960,6960,6960,6960,40.0,40.0,40.0,40.0,40.0,749.84,307.56,1537.7,764.03,728.48,1
2,018253c9e4,0,0,78,0,16.4,0.0,28.6,12.5,0.6,0.0,0.0,0.0,0.0,0.0,1651.0,309.0,3077.0,1706.0,223.0,0.0,-309.0,0.0,0.0,-100.0,36778,4261,36258,14501,42383,38892,19639,36258,14501,42383,1.0,1.0,2.0,2.0,1.0,972.0,179.0,2661.0,893.0,223.0,11440,11440,11440,11440,11809,11809,11440,11440,11440,11809,39.0,39.0,39.0,39.0,28.0,972.0,309.0,2196.0,813.0,223.0,4
3,02429418df,0,0,23,1,11.3,4.4,13.4,15.9,8.0,0.0,0.0,0.0,0.0,0.0,1618.0,698.865,1349.0,1876.7,931.07,0.0,0.0,0.0,0.0,0.0,10810,10810,13091,10810,27821,10810,10810,13091,10810,12454,2.0,1.0,1.0,2.0,1.0,1002.405,537.63,1349.0,1841.7,582.07,11336,12871,11379,2524,5653,11336,12871,11379,2524,5653,53.0,57.0,10.0,21.0,10.0,1285.905,618.27,1349.0,1337.7,582.07,1
4,026466cca4,1,1,26,0,14.1,4.4,17.3,5.0,8.4,0.0,0.0,-30.0,0.0,0.0,2046.9,698.865,2317.45,1029.14,1163.165,0.0,0.0,-100.0,0.0,0.0,16313,10810,42851,40393,10810,30166,10810,42286,40393,10810,2.0,1.0,2.0,1.0,2.0,832.97,537.63,1100.48,495.53,801.0,11822,12871,11822,11822,11336,11822,12871,11822,11822,11336,69.0,57.0,69.0,69.0,46.0,1529.2,618.27,2317.45,916.48,967.835,1


In [6]:
# Split the data 60 / 20 / 20 into train, validation and test.
# Features, treatment flag and target are split together so their rows
# stay aligned.

# Step 1: keep 60 % for training, hold out 40 %.  The id, the two label
# columns and the raw age (age_group is kept instead) are removed from
# the feature matrix.
X_train, X_val, trmnt_train, trmnt_val, y_train, y_val = train_test_split(
    df_feature.drop(columns=['client_id', 'treatment_flg', 'target', 'age']),
    df_feature['treatment_flg'],
    df_feature['target'],
    test_size=0.4,
    random_state=42,
)

# Step 2: cut the held-out 40 % in half -> validation and test.
X_val, X_test, trmnt_val, trmnt_test, y_val, y_test = train_test_split(
    X_val,
    trmnt_val,
    y_val,
    test_size=0.5,
    random_state=42,
)


In [7]:
# Sanity check: (rows, columns) of the train, validation and test feature matrices.
X_train.shape, X_val.shape, X_test.shape

((120023, 62), (40008, 62), (40008, 62))

# Conventional model

In [10]:
%%time

# Conventional (non-uplift) CatBoost classifier.
# The model is NOT trained in this cell: a previously saved model is loaded
# from disk.  The commented-out lines below record the fit / save calls.
# NOTE(review): the GPU variant references `categorical_columns`, which is
# only a local variable inside data_preprocessing -- it would have to be
# defined at notebook level before this variant can be re-enabled.
# model_cat_conv = CatBoostClassifier(iterations=1500,task_type="GPU",cat_features=categorical_columns, verbose = 0, border_count = 150, l2_leaf_reg=5, depth =4,
#                            devices='0:1', learning_rate=0.03)
# model_cat_conv.fit(X_train,y_train, eval_set=(X_val, y_val))
model_cat_conv = CatBoostClassifier(learning_rate=0.03, verbose = 0)
# model_cat_conv.fit(X_train,y_train, eval_set=(X_val, y_val))
# model_cat_conv.save_model('../../data/data/dft_catboost_feat_stg1',
#            format="cbm",
#            export_parameters=None,
#            pool=None)
# NOTE(review): presumably the loaded model carries its own training
# settings, making the constructor arguments above irrelevant -- confirm.
# load_model is the last expression, so the classifier is echoed as output.
model_cat_conv.load_model('../../data/data/dft_catboost_feat_stg1')

Wall time: 52 ms


<catboost.core.CatBoostClassifier at 0x1dc1d8af3c8>

In [17]:
# Baseline: F1 of the conventional CatBoost classifier on the validation and test splits.
print('CatBoost F1 ')
summary_f1(X_val,X_test,y_val,y_test,model_cat_conv)

CatBoost F1 
f1 score for validation set 0.7768655992236779
f1 score for test set 0.775350161913308


In [22]:
# del model_cat_conv

In [21]:
# # save model 
# model_cat_conv.save_model('../../data/data/dft_catboost_feat_stg1',
#            format="cbm",
#            export_parameters=None,
#            pool=None)

In [12]:
%%time
# Conventional (non-uplift) XGBoost classifier.  It is not refit here: a
# previously saved JSON model is loaded from disk.  The commented-out
# lines are the fit / save calls that produced that file.
model_xgb_conv = XGBClassifier(use_label_encoder=False, eval_metric='error', learning_rate = 0.03)
# model_xgb_conv = model_xgb_conv.fit(X_train, y_train)
# model_xgb_conv.save_model("../../data/data/dft_xgb_feat_stg1.json")
model_xgb_conv.load_model('../../data/data/dft_xgb_feat_stg1.json')

Wall time: 89 ms


In [16]:
# Baseline: F1 of the conventional XGBoost classifier on the validation and test splits.
print('XGB F1 ')
summary_f1(X_val,X_test,y_val,y_test,model_xgb_conv)

XGB F1 
f1 score for validation set 0.7758248063498135
f1 score for test set 0.7767441860465116


In [26]:
# model_xgb_conv.save_model("../../data/data/dft_xgb_feat_stg1.json")

# Uplift XGBoost

In [18]:
%%time
# Class-transformation uplift model: a single XGBoost classifier wrapped by
# sklift's ClassTransformation and fitted on features, outcome and
# treatment flag of the training split.
ct_xgb = ClassTransformation(
    XGBClassifier(random_state=777, eval_metric='error')
).fit(X_train, y_train, trmnt_train)



Wall time: 22.2 s


In [20]:
# Earlier inline version of the evaluation, kept for reference.  Note that it
# binarised with `< 0`, whereas summary_f1_uplift uses `<= 0`.
# pred_val_up_ct_xgb =  np.where(ct_xgb.predict(X_val) < 0, 0, 1)
# pred_test_up_ct_xgb =  np.where(ct_xgb.predict(X_test) < 0, 0, 1)
# print('f1 score for validation set {}'.format(f1_score(y_val, pred_val_up_ct_xgb, average='binary')))
# print('f1 score for test set {}'.format(f1_score(y_test, pred_test_up_ct_xgb, average='binary')))

# F1 of the class-transformation XGBoost uplift model on the validation and test splits.
summary_f1_uplift(X_val,X_test,y_val,y_test,ct_xgb)

f1 score for validation set 0.6101492537313433
f1 score for test set 0.6079131666633413


In [23]:
%%time
# Two-model uplift approach with XGBoost: one classifier for the treatment
# group and one for the control group, combined by sklift's TwoModels
# using the 'ddr_control' method.
estimator_trmnt_xgb = XGBClassifier(random_state=777, eval_metric='error')
estimator_ctrl_xgb = XGBClassifier(random_state=777, eval_metric='error')

# Build the wrapper and fit it on the training split in one step.
tm_ctrl_xgb = TwoModels(
    estimator_trmnt=estimator_trmnt_xgb,
    estimator_ctrl=estimator_ctrl_xgb,
    method='ddr_control',
).fit(X_train, y_train, trmnt_train)



Wall time: 32 s


In [24]:
# F1 of the two-model XGBoost uplift model on the validation and test splits.
summary_f1_uplift(X_val,X_test,y_val,y_test,tm_ctrl_xgb)

f1 score for validation set 0.615915013028663
f1 score for test set 0.6195515192734692


# Uplift CatBoost

In [26]:
%%time
# Class-transformation uplift model: a single CatBoost classifier wrapped by
# sklift's ClassTransformation and fitted on features, outcome and
# treatment flag of the training split.
ct_cat = ClassTransformation(
    CatBoostClassifier(verbose=0, random_state=777)
).fit(X_train, y_train, trmnt_train)

Wall time: 40.2 s


In [27]:
# F1 of the class-transformation CatBoost uplift model on the validation and test splits.
summary_f1_uplift(X_val,X_test,y_val,y_test,ct_cat)

f1 score for validation set 0.6177533754078822
f1 score for test set 0.6201559454191033


In [29]:
%%time
# Two-model uplift approach with CatBoost: one classifier for the treatment
# group and one for the control group, combined by sklift's TwoModels
# using the 'ddr_control' method.
estimator_trmnt_cat = CatBoostClassifier(silent=True, thread_count=2, random_state=42)
estimator_ctrl_cat = CatBoostClassifier(silent=True, thread_count=2, random_state=42)

# Build the wrapper and fit it on the training split in one step.
tm_ctrl_cat = TwoModels(
    estimator_trmnt=estimator_trmnt_cat,
    estimator_ctrl=estimator_ctrl_cat,
    method='ddr_control',
).fit(X_train, y_train, trmnt_train)

Wall time: 1min 22s


In [30]:
# F1 of the two-model CatBoost uplift model on the validation and test splits.
summary_f1_uplift(X_val,X_test,y_val,y_test,tm_ctrl_cat)

f1 score for validation set 0.6690188172043011
f1 score for test set 0.6662422052021498
