# Train

## Data

In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('data/train.csv')
purchase = pd.read_csv('data/train_purch_hist.csv')
clients = pd.read_csv('data/clients.csv').drop(columns=['client_id.1'])
products = pd.read_csv('data/products.csv')
data.head()

Unnamed: 0,client_id,treatment_flg,purchased
0,ad6561e2d8,1,1
1,7c1ccbf93f,1,1
2,b58fadcab6,1,1
3,e99e6fabb9,0,0
4,27fb6f8520,1,1


In [3]:
data = data.merge(clients[['client_id', 'age']], on='client_id', how='left')
data.head()

Unnamed: 0,client_id,treatment_flg,purchased,age
0,ad6561e2d8,1,1,50
1,7c1ccbf93f,1,1,24
2,b58fadcab6,1,1,36
3,e99e6fabb9,0,0,79
4,27fb6f8520,1,1,34


In [4]:
data = data.merge(clients[['client_id', 'gender']], on='client_id', how='left')
data.head()

Unnamed: 0,client_id,treatment_flg,purchased,age,gender
0,ad6561e2d8,1,1,50,F
1,7c1ccbf93f,1,1,24,F
2,b58fadcab6,1,1,36,U
3,e99e6fabb9,0,0,79,F
4,27fb6f8520,1,1,34,F


In [5]:
data = data.merge(clients[['client_id', 'first_issue_date']], on='client_id', how='left')
data['first_issue_date'] = \
    (pd.to_datetime(data['first_issue_date'])- pd.Timestamp('1970-01-01')) // pd.Timedelta('1s')
data.head()

Unnamed: 0,client_id,treatment_flg,purchased,age,gender,first_issue_date
0,ad6561e2d8,1,1,50,F,1512322431
1,7c1ccbf93f,1,1,24,F,1510331629
2,b58fadcab6,1,1,36,U,1509657465
3,e99e6fabb9,0,0,79,F,1526466080
4,27fb6f8520,1,1,34,F,1500320188


In [6]:
data = data.merge(clients[['client_id', 'first_redeem_date']], on='client_id', how='left')
data['first_redeem_date'] = \
    (pd.to_datetime(data['first_redeem_date'])- pd.Timestamp('1970-01-01')) // pd.Timedelta('1s')
data.head()

Unnamed: 0,client_id,treatment_flg,purchased,age,gender,first_issue_date,first_redeem_date
0,ad6561e2d8,1,1,50,F,1512322431,1527102000.0
1,7c1ccbf93f,1,1,24,F,1510331629,1519326000.0
2,b58fadcab6,1,1,36,U,1509657465,1537045000.0
3,e99e6fabb9,0,0,79,F,1526466080,1527713000.0
4,27fb6f8520,1,1,34,F,1500320188,1513332000.0


In [7]:
data['issue_redeem_delay'] = data['first_redeem_date'] - data['first_issue_date']
data.head()

Unnamed: 0,client_id,treatment_flg,purchased,age,gender,first_issue_date,first_redeem_date,issue_redeem_delay
0,ad6561e2d8,1,1,50,F,1512322431,1527102000.0,14779510.0
1,7c1ccbf93f,1,1,24,F,1510331629,1519326000.0,8994516.0
2,b58fadcab6,1,1,36,U,1509657465,1537045000.0,27387871.0
3,e99e6fabb9,0,0,79,F,1526466080,1527713000.0,1247284.0
4,27fb6f8520,1,1,34,F,1500320188,1513332000.0,13012137.0


In [8]:
df = purchase[['client_id', 
               'transaction_id', 
               'express_points_spent']].drop_duplicates('transaction_id').set_index('client_id')

data['expresS'] = data['client_id'].apply(lambda client: df.loc[client, 'express_points_spent'].sum())
data.head()

Unnamed: 0,client_id,treatment_flg,purchased,age,gender,first_issue_date,first_redeem_date,issue_redeem_delay,expresS
0,ad6561e2d8,1,1,50,F,1512322431,1527102000.0,14779510.0,0.0
1,7c1ccbf93f,1,1,24,F,1510331629,1519326000.0,8994516.0,0.0
2,b58fadcab6,1,1,36,U,1509657465,1537045000.0,27387871.0,0.0
3,e99e6fabb9,0,0,79,F,1526466080,1527713000.0,1247284.0,0.0
4,27fb6f8520,1,1,34,F,1500320188,1513332000.0,13012137.0,-50.0


In [9]:
df = purchase[['client_id', 
               'transaction_id', 
               'purchase_sum']].drop_duplicates('transaction_id').drop(columns=['transaction_id'])
df = df.groupby(by=['client_id']).mean()

data['mean_sum'] = data['client_id'].apply(lambda client: df.loc[client])
data.head()

Unnamed: 0,client_id,treatment_flg,purchased,age,gender,first_issue_date,first_redeem_date,issue_redeem_delay,expresS,mean_sum
0,ad6561e2d8,1,1,50,F,1512322431,1527102000.0,14779510.0,0.0,270.03525
1,7c1ccbf93f,1,1,24,F,1510331629,1519326000.0,8994516.0,0.0,425.5
2,b58fadcab6,1,1,36,U,1509657465,1537045000.0,27387871.0,0.0,429.447576
3,e99e6fabb9,0,0,79,F,1526466080,1527713000.0,1247284.0,0.0,220.923077
4,27fb6f8520,1,1,34,F,1500320188,1513332000.0,13012137.0,-50.0,329.540333


In [10]:
df = purchase[['client_id', 'transaction_id', 'purchase_sum']].drop_duplicates('transaction_id')
df = df.set_index('client_id')
mean = df.purchase_sum.mean()

data['receipt_cnt'] = data['client_id'].apply(lambda client: (df.loc[client, 'purchase_sum'] > mean).sum())
data.head()

Unnamed: 0,client_id,treatment_flg,purchased,age,gender,first_issue_date,first_redeem_date,issue_redeem_delay,expresS,mean_sum,receipt_cnt
0,ad6561e2d8,1,1,50,F,1512322431,1527102000.0,14779510.0,0.0,270.03525,7
1,7c1ccbf93f,1,1,24,F,1510331629,1519326000.0,8994516.0,0.0,425.5,1
2,b58fadcab6,1,1,36,U,1509657465,1537045000.0,27387871.0,0.0,429.447576,15
3,e99e6fabb9,0,0,79,F,1526466080,1527713000.0,1247284.0,0.0,220.923077,0
4,27fb6f8520,1,1,34,F,1500320188,1513332000.0,13012137.0,-50.0,329.540333,8


## Model

In [12]:
from sklearn.model_selection import train_test_split


X_train, X_valid, y_train, y_valid = train_test_split(data.drop(columns=['purchased', 'client_id']),
                                                      data['purchased'],
                                                      test_size=0.1,
                                                      shuffle=True,
                                                      stratify=data['purchased'],
                                                      random_state=42)

treat_train = X_train.treatment_flg
treat_valid = X_valid.treatment_flg
X_train = X_train.drop(columns=['treatment_flg'])
X_valid = X_valid.drop(columns=['treatment_flg'])
cat_features = ['gender']

In [16]:
import catboost
from sklift.models import ClassTransformation
from sklift.metrics import qini_auc_score
from sklift.metrics import uplift_auc_score
from sklearn.metrics import roc_auc_score
from IPython.display import clear_output


def score(model, X_valid, y_valid, treat_valid):
    uplift = model.predict(X_valid)
    print('Gini:', roc_auc_score(y_valid == treat_valid, uplift) * 2 - 1)
    print('Qini:', qini_auc_score(y_true=y_valid, uplift=uplift, treatment=treat_valid))
    print('UASC:', uplift_auc_score(y_true=y_valid, uplift=uplift, treatment=treat_valid))

    
'''
Gini: 0.07661709179349696
Qini: 0.04018105835527764
UASC: 0.057844425558613854
'''
estimator = catboost.CatBoostClassifier(iterations=2**10, task_type="GPU", random_state=42)
model = ClassTransformation(estimator=estimator)
model.fit(
    X=X_train,
    y=y_train,
    treatment=treat_train,
    estimator_fit_params={'cat_features': cat_features}
)
clear_output(wait=True)
score(model, X_valid, y_valid, treat_valid)

Gini: 0.07746136896897471
Qini: 0.040675556885961014
UASC: 0.058557428441803644


In [17]:
estimator = catboost.CatBoostClassifier(iterations=2**10, task_type="GPU", random_state=42)
model = ClassTransformation(estimator=estimator)
model.fit(
    X=data.drop(columns=['purchased', 'client_id', 'treatment_flg']),
    y=data['purchased'],
    treatment=data['treatment_flg'],
    estimator_fit_params={'cat_features': cat_features}
)
clear_output()

# Test

## Data

In [19]:
data = pd.read_csv('data/test.csv')
purchase = pd.read_csv('data/test_purch_hist.csv')

data = data.merge(clients[['client_id', 'age']], on='client_id', how='left')

data = data.merge(clients[['client_id', 'gender']], on='client_id', how='left')

data = data.merge(clients[['client_id', 'first_issue_date']], on='client_id', how='left')
data['first_issue_date'] = \
    (pd.to_datetime(data['first_issue_date'])- pd.Timestamp('1970-01-01')) // pd.Timedelta('1s')

data = data.merge(clients[['client_id', 'first_redeem_date']], on='client_id', how='left')
data['first_redeem_date'] = \
    (pd.to_datetime(data['first_redeem_date'])- pd.Timestamp('1970-01-01')) // pd.Timedelta('1s')

data['issue_redeem_delay'] = data['first_redeem_date'] - data['first_issue_date']

df = purchase[['client_id', 
               'transaction_id', 
               'express_points_spent']].drop_duplicates('transaction_id').set_index('client_id')
data['expresS'] = data['client_id'].apply(lambda client: df.loc[client, 'express_points_spent'].sum())
data.head()

df = purchase[['client_id', 
               'transaction_id', 
               'purchase_sum']].drop_duplicates('transaction_id').drop(columns=['transaction_id'])
df = df.groupby(by=['client_id']).mean()
data['mean_sum'] = data['client_id'].apply(lambda client: df.loc[client])


df = purchase[['client_id', 'transaction_id', 'purchase_sum']].drop_duplicates('transaction_id')
df = df.set_index('client_id')
mean = df.purchase_sum.mean()
data['receipt_cnt'] = data['client_id'].apply(lambda client: (df.loc[client, 'purchase_sum'] > mean).sum())

data.head()

Unnamed: 0,client_id,age,gender,first_issue_date,first_redeem_date,issue_redeem_delay,expresS,mean_sum,receipt_cnt
0,a9a604ed6e,36,F,1536860201,,,0.0,629.506,4
1,ebd7360016,63,F,1499101932,1504283000.0,5180580.0,0.0,183.244194,0
2,908cd9b8e8,49,F,1522429280,1531502000.0,9072272.0,0.0,364.296471,7
3,dceb8ce861,46,U,1507309602,1534013000.0,26703436.0,0.0,1377.543333,6
4,f4f0ac6b06,119,U,1530135581,1550262000.0,20126096.0,0.0,746.666667,9


## Predict

In [20]:
idx = np.array(data['client_id'])
X_test = data.drop(columns=['client_id'])
uplift = model.predict(X_test)

In [21]:
solution = pd.DataFrame({'client_id': idx, 'pred': uplift})
solution.to_csv('artifacts/submission.csv', index=False)
solution.head()

Unnamed: 0,client_id,pred
0,a9a604ed6e,-0.009301
1,ebd7360016,0.096475
2,908cd9b8e8,0.036781
3,dceb8ce861,0.029435
4,f4f0ac6b06,0.065164
