In [1]:
import pandas as pd
import numpy as np
import catboost
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from IPython.display import clear_output

# SK-Model

In [181]:
# Load the training table and discard the columns that are not passed to the
# models below; the result is bound to `data` in a single chained expression.
data = (
    pd.read_csv('artifacts/data_train.csv')
    .drop(columns=[
        'card_own', 'from_redeem', 'regularR', 'regularS', 'expresR', 'trademark',
        'popular', 'n_purchases', 'alcohol', 'netto', 'frequency', 'last_p',
        'popular_store', 'popular_brand', 'popular_lvl3', 'popular_vr',
        'trn_red', 'trn_iss', 'popular_cnt', 'receipt',
        'full_sum', 'max_sum', 'min_sum',
    ])
)
# Preview the first rows of the reduced frame.
data.head()

Unnamed: 0,client_id,treatment_flg,purchased,age,gender,first_issue_date,first_redeem_date,issue_redeem_delay,expresS,mean_sum,receipt_cnt
0,ad6561e2d8,1,1,50,F,1512322431,1527102000.0,14779510.0,0.0,270.03525,7
1,7c1ccbf93f,1,1,24,F,1510331629,1519326000.0,8994516.0,0.0,425.5,1
2,b58fadcab6,1,1,36,U,1509657465,1537045000.0,27387871.0,0.0,429.447576,15
3,e99e6fabb9,0,0,79,F,1526466080,1527713000.0,1247284.0,0.0,220.923077,0
4,27fb6f8520,1,1,34,F,1500320188,1513332000.0,13012137.0,-50.0,329.540333,8


In [182]:
# Hold out 10% of the rows for validation, stratified on the purchase outcome.
# The treatment flag is deliberately left inside the feature frames for the
# split so that it is shuffled together with the rows it belongs to.
X_train, X_valid, y_train, y_valid = train_test_split(
    data.drop(columns=['purchased', 'client_id']),
    data['purchased'],
    test_size=0.1,
    shuffle=True,
    stratify=data['purchased'],
    random_state=42,
)

# Separate the treatment indicator from the model inputs.
treat_train = X_train['treatment_flg']
treat_valid = X_valid['treatment_flg']
X_train = X_train.drop(columns='treatment_flg')
X_valid = X_valid.drop(columns='treatment_flg')

# Columns handed to CatBoost as categorical features.
cat_features = ['gender']

In [183]:
from sklift.models import ClassTransformation
from sklift.metrics import qini_auc_score
from sklift.metrics import uplift_auc_score


def score(model, X_valid, y_valid, treat_valid):
    """Print three validation metrics for an uplift model.

    Args:
        model: object exposing ``predict(X)``; its output is used as the
            uplift score.
        X_valid: validation features passed to ``model.predict``.
        y_valid: observed outcomes for the validation rows.
        treat_valid: treatment flags for the validation rows.

    Prints, in order, lines labelled ``Gini``, ``Qini`` and ``UASC``.
    Returns ``None``.
    """
    predicted_uplift = model.predict(X_valid)

    # Gini is derived from the ROC AUC of the "outcome equals treatment flag"
    # indicator ranked by the predicted uplift.
    outcome_matches_treatment = y_valid == treat_valid
    roc_auc = roc_auc_score(outcome_matches_treatment, predicted_uplift)
    print('Gini:', roc_auc * 2 - 1)

    qini = qini_auc_score(y_true=y_valid, uplift=predicted_uplift, treatment=treat_valid)
    print('Qini:', qini)

    uplift_auc = uplift_auc_score(y_true=y_valid, uplift=predicted_uplift, treatment=treat_valid)
    print('UASC:', uplift_auc)

    
# Metrics recorded by the author for this configuration (presumably from an
# earlier run of this cell):
#   Gini: 0.07661709179349696
#   Qini: 0.04018105835527764
#   UASC: 0.057844425558613854
estimator = catboost.CatBoostClassifier(iterations=1024, task_type='GPU', random_state=42)
model = ClassTransformation(estimator=estimator)
model.fit(
    X=X_train,
    y=y_train,
    treatment=treat_train,
    estimator_fit_params=dict(cat_features=cat_features),
)
# Clear whatever the fit wrote to the cell output before printing the metrics.
clear_output(wait=True)
score(model, X_valid, y_valid, treat_valid)

Gini: 0.07661929723890482
Qini: 0.04018266863198764
UASC: 0.05784724702096063


In [121]:
# Tabulate the fitted estimator's feature importances, most important first.
(
    pd.DataFrame(dict(
        feature_name=model.estimator.feature_names_,
        feature_score=model.estimator.feature_importances_,
    ))
    .sort_values(by='feature_score', ascending=False)
    .reset_index(drop=True)
)

Unnamed: 0,feature_name,feature_score
0,first_redeem_date,27.362541
1,age,13.967593
2,mean_sum,13.150289
3,first_issue_date,12.180607
4,receipt_cnt,11.573199
5,issue_redeem_delay,11.222169
6,expresS,6.443398
7,gender,4.100204


In [141]:
# Refit the class-transformation model on every available row (no hold-out).
estimator = catboost.CatBoostClassifier(iterations=1024, task_type='GPU', random_state=42)
model = ClassTransformation(estimator=estimator)
model.fit(
    X=data.drop(columns=['purchased', 'client_id', 'treatment_flg']),
    y=data['purchased'],
    treatment=data['treatment_flg'],
    estimator_fit_params=dict(cat_features=cat_features),
)
# Clear whatever the fit wrote to the cell output.
clear_output()

In [142]:
import pickle


# Serialize the current `model` object to disk with pickle.
with open('artifacts/model.ml', mode='wb') as model_file:
    pickle.dump(model, model_file)

# DLC::Research

## SK-TwoModels (ddr_treatment)

In [10]:
from sklift.models import TwoModels


def score(model, X_test, y_test):
    """Print the Gini coefficient (``2 * ROC AUC - 1``) of the model's predictions.

    Args:
        model: object exposing ``predict(X)``; its output is used as the score.
        X_test: features passed to ``model.predict``.
        y_test: binary labels the predictions are ranked against.
    """
    predicted_uplift = model.predict(X_test)
    roc_auc = roc_auc_score(y_test, predicted_uplift)
    print('Gini:', roc_auc * 2 - 1)

    
# Two identically configured classifiers: one for the treatment arm (T) and
# one for the control arm (C).
estimatorT, estimatorC = (
    catboost.CatBoostClassifier(iterations=2**10, task_type="GPU", random_state=42)
    for _arm in ('treatment', 'control')
)
model = TwoModels(
    estimator_trmnt=estimatorT,
    estimator_ctrl=estimatorC,
    method='ddr_treatment',
)
model.fit(
    X=X_train,
    y=y_train,
    treatment=treat_train,
    estimator_trmnt_fit_params=dict(cat_features=cat_features),
    estimator_ctrl_fit_params=dict(cat_features=cat_features),
)
# Clear whatever the fit wrote to the cell output before printing the metric.
clear_output(wait=True)
# The label is the "outcome equals treatment flag" indicator.
score(model, X_valid, y_valid == treat_valid)

Gini: 0.041325145957602505


## SK-TwoModels (ddr_control)

In [9]:
from sklift.models import TwoModels


def score(model, X_test, y_test):
    """Report the Gini coefficient of ``model.predict(X_test)`` against ``y_test``.

    The value printed after the ``Gini:`` label is ``roc_auc_score * 2 - 1``.
    Nothing is returned.
    """
    scores = model.predict(X_test)
    auc_value = roc_auc_score(y_test, scores)
    print('Gini:', auc_value * 2 - 1)

    
# Separate classifiers for the treatment (T) and control (C) arms, built with
# the same hyper-parameters.
estimatorT = catboost.CatBoostClassifier(iterations=1024, task_type='GPU', random_state=42)
estimatorC = catboost.CatBoostClassifier(iterations=1024, task_type='GPU', random_state=42)

model = TwoModels(estimator_trmnt=estimatorT, estimator_ctrl=estimatorC, method='ddr_control')
model.fit(
    X=X_train,
    y=y_train,
    treatment=treat_train,
    estimator_trmnt_fit_params={'cat_features': cat_features},
    estimator_ctrl_fit_params={'cat_features': cat_features},
)

# Clear whatever the fit wrote to the cell output before printing the metric.
clear_output(wait=True)
# The label is the "outcome equals treatment flag" indicator.
score(model, X_valid, y_valid == treat_valid)

Gini: 0.05819659912149744


## SK-TwoModels (vanilla)

In [8]:
from sklift.models import TwoModels


def score(model, X_test, y_test):
    """Print ``Gini: <value>`` where value is ``2 * ROC AUC - 1``.

    Args:
        model: fitted object with a ``predict`` method.
        X_test: inputs forwarded to ``model.predict``.
        y_test: binary labels used as the ROC AUC ground truth.
    """
    uplift_scores = model.predict(X_test)
    area_under_roc = roc_auc_score(y_test, uplift_scores)
    print('Gini:', area_under_roc * 2 - 1)

    
# One classifier per arm (T = treatment, C = control), same configuration.
estimatorT, estimatorC = (
    catboost.CatBoostClassifier(iterations=2**10, task_type="GPU", random_state=42)
    for _arm in ('treatment', 'control')
)
model = TwoModels(
    estimator_trmnt=estimatorT,
    estimator_ctrl=estimatorC,
    method='vanilla',
)
model.fit(
    X=X_train,
    y=y_train,
    treatment=treat_train,
    estimator_trmnt_fit_params=dict(cat_features=cat_features),
    estimator_ctrl_fit_params=dict(cat_features=cat_features),
)
# Clear whatever the fit wrote to the cell output before printing the metric.
clear_output(wait=True)
# The label is the "outcome equals treatment flag" indicator.
score(model, X_valid, y_valid == treat_valid)

Gini: 0.054259674860168694


## SK-SoloModel

In [5]:
from sklift.models import SoloModel


def score(model, X_test, y_test):
    """Print the Gini coefficient of the model's predictions on ``X_test``.

    Gini is computed as ``roc_auc_score(y_test, predictions) * 2 - 1`` and
    written to stdout after the ``Gini:`` label; the function returns ``None``.
    """
    predictions = model.predict(X_test)
    auc = roc_auc_score(y_test, predictions)
    print('Gini:', auc * 2 - 1)

    
# Single-classifier uplift approach wrapping one CatBoost estimator.
estimator = catboost.CatBoostClassifier(iterations=1024, task_type='GPU', random_state=42)
model = SoloModel(estimator=estimator)
model.fit(
    X=X_train,
    y=y_train,
    treatment=treat_train,
    estimator_fit_params=dict(cat_features=cat_features),
)
# Clear whatever the fit wrote to the cell output before printing the metric.
clear_output(wait=True)
# The label is the "outcome equals treatment flag" indicator.
score(model, X_valid, y_valid == treat_valid)

Gini: 0.03914802418638463


## MyModel

In [16]:
# Split the rows by treatment arm (T: flag == 1, C: flag == 0); the flag itself
# is dropped because it is constant within each arm.
dataT = data.loc[data['treatment_flg'] == 1].drop(columns='treatment_flg')
dataC = data.loc[data['treatment_flg'] == 0].drop(columns='treatment_flg')


# Independent 90/10 hold-out for the treatment arm, stratified on the outcome.
X_trainT, X_validT, y_trainT, y_validT = train_test_split(
    dataT.drop(columns=['purchased', 'client_id']),
    dataT['purchased'],
    test_size=0.1,
    shuffle=True,
    stratify=dataT['purchased'],
    random_state=42,
)


# Same procedure for the control arm.
X_trainC, X_validC, y_trainC, y_validC = train_test_split(
    dataC.drop(columns=['purchased', 'client_id']),
    dataC['purchased'],
    test_size=0.1,
    shuffle=True,
    stratify=dataC['purchased'],
    random_state=42,
)

In [17]:
def score(model, X_valid, y_valid):
    """Print a classification report for the model's predictions.

    Args:
        model: object exposing ``predict(X)``.
        X_valid: validation features passed to ``model.predict``.
        y_valid: true labels the predictions are compared with.
    """
    predicted_labels = model.predict(X_valid)
    report = classification_report(y_valid, predicted_labels)
    print(report)
    

# Outcome classifier trained on the control-arm training split only.
modelC = catboost.CatBoostClassifier(iterations=1024, task_type='GPU', random_state=42)
modelC.fit(X_trainC, y_trainC, cat_features=cat_features)
# Clear whatever the fit wrote to the cell output before printing the report.
clear_output(wait=True)
score(modelC, X_validC, y_validC)

              precision    recall  f1-score   support

           0       0.58      0.18      0.28      2779
           1       0.63      0.91      0.74      4218

    accuracy                           0.62      6997
   macro avg       0.60      0.55      0.51      6997
weighted avg       0.61      0.62      0.56      6997



In [18]:
def score(model, X_valid, y_valid):
    """Write ``classification_report(y_valid, model.predict(X_valid))`` to stdout.

    Returns ``None``; the report text is only printed.
    """
    y_pred = model.predict(X_valid)
    report_text = classification_report(y_valid, y_pred)
    print(report_text)
    
    
# Outcome classifier trained on the treatment-arm training split only.
modelT = catboost.CatBoostClassifier(iterations=1024, task_type='GPU', random_state=42)
modelT.fit(X_trainT, y_trainT, cat_features=cat_features)
# Clear whatever the fit wrote to the cell output before printing the report.
clear_output(wait=True)
score(modelT, X_validT, y_validT)

              precision    recall  f1-score   support

           0       0.63      0.20      0.31      2547
           1       0.67      0.93      0.78      4459

    accuracy                           0.67      7006
   macro avg       0.65      0.57      0.54      7006
weighted avg       0.66      0.67      0.61      7006



In [19]:
def score(modelT, modelC, X_test, y_test):
    """Print the Gini coefficient of a two-model uplift estimate.

    The uplift is the difference between the second ``predict_proba`` column
    of ``modelT`` and that of ``modelC``, both evaluated on ``X_test``.

    Args:
        modelT: model exposing ``predict_proba`` (minuend of the difference).
        modelC: model exposing ``predict_proba`` (subtrahend of the difference).
        X_test: features passed to both models.
        y_test: binary labels the uplift ranking is scored against.
    """
    proba_control = modelC.predict_proba(X_test)
    proba_treated = modelT.predict_proba(X_test)
    uplift_estimate = proba_treated[:, 1] - proba_control[:, 1]
    roc_auc = roc_auc_score(y_test, uplift_estimate)
    print('Gini:', roc_auc * 2 - 1)
    

# Best Gini recorded by the author for this approach: 0.0437
#
# Stack the treatment-arm and control-arm validation folds into one test set.
# pd.concat is used instead of np.concatenate so the result stays a DataFrame:
# column names and per-column dtypes are preserved, whereas np.concatenate
# would collapse the mixed numeric/string columns into an unnamed object array
# even though both models were fitted on named DataFrames with a categorical
# 'gender' column.
X_test = pd.concat([X_validT, X_validC], ignore_index=True)
# Label is 1 when (treated and purchased) or (control and not purchased);
# the order matches the row order of X_test above.
y_test = np.concatenate((y_validT == 1, y_validC == 0))
score(modelT, modelC, X_test, y_test)

Gini: 0.01846283483307598
