In [1]:
import pandas as pd
import numpy as np

# Feature Selection and Preprocessing

In [2]:
train = pd.read_csv('buyer_train.csv', index_col=0)
test = pd.read_csv('buyer_test.csv', index_col=0)

In [3]:
features = ['purchase', 'purchase_last_m', 'click', 'click_last_m', 
            'click_recent', 'cart', 'cart_last_m', 'cart_recent', 
            'cart_buy_ratio', 'click_buy_ratio_last_m',
            'cart_buy_ratio_last_m', 'click_buy_ratio_recent',
            'cart_buy_ratio_recent', 'last_buy', 'last_cart', 'outcome']

In [4]:
train = train[features]
test = test[features]

In [5]:
X_train = train.iloc[:, :-1].values
y_train = train.iloc[:, -1].values
X_test = test.iloc[:, :-1].values
y_test = test.iloc[:, -1].values

In [6]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# random forest

In [20]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, roc_auc_score

In [31]:
from sklearn.ensemble import RandomForestClassifier
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(3, 20)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [32]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 200, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=200,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [3, 3, 3, 4, 4, 4, 5, 5, 5,
                                                      6, 6, 6, 7, 7, 7, 8, 8, 8,
                                                      9, 9, 9, 10, 10, 10, 11,
                                                      11, 12, 12, 12, 13, ...],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [100, 200, 300, 400,
                                                         500, 600, 700, 800,
                                                         900, 1000]},
                   random_state=42, verbose=2)

In [33]:
rf_random.best_params_

{'n_estimators': 500,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 8,
 'bootstrap': True}

In [49]:
rf = rf_random.best_estimator_
rf.fit(X_train, y_train)
threshold = 0.46
predicted_proba = rf.predict_proba(X_test)
y_pred = (predicted_proba [:,1] >= threshold).astype('int')

In [45]:
for threshold in np.linspace(0.31, 0.5, 20):
    predicted_proba = rf.predict_proba(X_test)
    y_pred = (predicted_proba [:,1] >= threshold).astype('int')
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred)
    print(threshold, f1+auc)

0.31 1.3714000995664772
0.32 1.3774874719060919
0.33 1.3859706360990764
0.33999999999999997 1.386826880350033
0.35 1.392081371282264
0.36 1.3959011138703716
0.37 1.3984175251281465
0.38 1.4024531016499566
0.39 1.4023998695203392
0.4 1.4069364739152834
0.41000000000000003 1.413807187835117
0.42 1.4090547332614434
0.43 1.415164694331892
0.44 1.4177725412458142
0.45 1.421610593218924
0.45999999999999996 1.4245249232792165
0.47 1.4198061672630864
0.48 1.4229174867402277
0.49 1.4220057764388527
0.5 1.4193484514030321
[CV] END bootstrap=True, max_depth=9, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=800; total time=   3.6s
[CV] END bootstrap=False, max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=900; total time=   3.3s
[CV] END bootstrap=True, max_depth=8, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=500; total time=   2.0s
[CV] END bootstrap=True, max_depth=8, max_features=sqrt, min_samples_leaf=2

[CV] END bootstrap=False, max_depth=14, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=700; total time=   5.3s
[CV] END bootstrap=False, max_depth=9, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=800; total time=   4.4s
[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=500; total time=   3.4s
[CV] END bootstrap=True, max_depth=8, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=900; total time=   3.6s
[CV] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=600; total time=   3.1s
[CV] END bootstrap=True, max_depth=12, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=   1.5s
[CV] END bootstrap=True, max_depth=16, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=800; total time=   4.1s
[CV] END bootstrap=False, max_depth=9, max_fe

[CV] END bootstrap=True, max_depth=7, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=900; total time=   3.4s
[CV] END bootstrap=True, max_depth=3, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=400; total time=   0.9s
[CV] END bootstrap=False, max_depth=9, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=800; total time=   4.4s
[CV] END bootstrap=True, max_depth=8, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=900; total time=   3.5s
[CV] END bootstrap=True, max_depth=7, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=400; total time=   1.4s
[CV] END bootstrap=True, max_depth=5, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=   1.2s
[CV] END bootstrap=False, max_depth=4, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   0.6s
[CV] END bootstrap=False, max_depth=4, max_features=

[CV] END bootstrap=True, max_depth=9, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=800; total time=   3.5s
[CV] END bootstrap=True, max_depth=3, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=400; total time=   0.9s
[CV] END bootstrap=False, max_depth=9, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=800; total time=   4.5s
[CV] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   1.2s
[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=500; total time=   3.3s
[CV] END bootstrap=True, max_depth=5, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=   1.2s
[CV] END bootstrap=True, max_depth=14, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=400; total time=   2.2s
[CV] END bootstrap=True, max_depth=3, max_featur

[CV] END bootstrap=True, max_depth=7, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=900; total time=   3.4s
[CV] END bootstrap=False, max_depth=14, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=700; total time=   5.1s
[CV] END bootstrap=False, max_depth=7, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   1.3s
[CV] END bootstrap=False, max_depth=15, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=500; total time=   3.4s
[CV] END bootstrap=True, max_depth=8, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=900; total time=   3.6s
[CV] END bootstrap=True, max_depth=3, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=800; total time=   1.9s
[CV] END bootstrap=False, max_depth=17, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=1000; total time=   7.7s
[CV] END bootstrap=True, max_depth=13, max_fea

In [50]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, roc_auc_score
cm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)
print(acc, f1, auc)

0.6928327645051194 0.7588152327221439 0.6647158305236437


In [51]:
cm

array([[1167, 1063],
       [ 647, 2690]])

# xgboost

In [10]:
import xgboost as xgb

In [13]:
random_grid = [
  {
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': [0.01, 0.1, 0.2, 0.3],
    'max_depth': range(3, 10),
    'min_child_weight': range(11),
    'gamma': [i/10.0 for i in range(4)],
    'subsample': [i/10 for i in range(5, 9)],
    'colsample_bytree': [i/10.0 for i in range(1, 9)]
  }
]

In [16]:
xg = xgb.XGBClassifier(objective='binary:logistic')
xg_random = RandomizedSearchCV(estimator = xg, param_distributions = random_grid, n_iter = 500, cv = 3, verbose=2, random_state=42, n_jobs = -1)
xg_random.fit(X_train, y_train)

Fitting 3 folds for each of 500 candidates, totalling 1500 fits
[CV] END colsample_bytree=0.8, gamma=0.1, learning_rate=0.3, max_depth=4, min_child_weight=5, n_estimators=50, subsample=0.8; total time=   0.2s
[CV] END colsample_bytree=0.6, gamma=0.1, learning_rate=0.01, max_depth=4, min_child_weight=1, n_estimators=200, subsample=0.7; total time=   0.6s
[CV] END colsample_bytree=0.6, gamma=0.2, learning_rate=0.1, max_depth=6, min_child_weight=5, n_estimators=200, subsample=0.5; total time=   0.7s
[CV] END colsample_bytree=0.7, gamma=0.1, learning_rate=0.2, max_depth=6, min_child_weight=8, n_estimators=100, subsample=0.5; total time=   0.4s
[CV] END colsample_bytree=0.4, gamma=0.1, learning_rate=0.01, max_depth=7, min_child_weight=3, n_estimators=100, subsample=0.5; total time=   0.4s
[CV] END colsample_bytree=0.1, gamma=0.1, learning_rate=0.01, max_depth=5, min_child_weight=1, n_estimators=200, subsample=0.8; total time=   0.2s
[CV] END colsample_bytree=0.6, gamma=0.0, learning_rate=0.

[CV] END colsample_bytree=0.8, gamma=0.1, learning_rate=0.3, max_depth=4, min_child_weight=5, n_estimators=50, subsample=0.8; total time=   0.2s
[CV] END colsample_bytree=0.7, gamma=0.2, learning_rate=0.3, max_depth=3, min_child_weight=6, n_estimators=200, subsample=0.5; total time=   0.4s
[CV] END colsample_bytree=0.6, gamma=0.2, learning_rate=0.1, max_depth=6, min_child_weight=5, n_estimators=200, subsample=0.5; total time=   0.8s
[CV] END colsample_bytree=0.6, gamma=0.2, learning_rate=0.3, max_depth=6, min_child_weight=5, n_estimators=100, subsample=0.8; total time=   0.4s
[CV] END colsample_bytree=0.3, gamma=0.0, learning_rate=0.1, max_depth=5, min_child_weight=5, n_estimators=50, subsample=0.7; total time=   0.1s
[CV] END colsample_bytree=0.4, gamma=0.1, learning_rate=0.01, max_depth=7, min_child_weight=3, n_estimators=100, subsample=0.5; total time=   0.4s
[CV] END colsample_bytree=0.1, gamma=0.1, learning_rate=0.01, max_depth=5, min_child_weight=1, n_estimators=200, subsample=0.

[CV] END colsample_bytree=0.7, gamma=0.0, learning_rate=0.2, max_depth=9, min_child_weight=10, n_estimators=100, subsample=0.7; total time=   0.6s
[CV] END colsample_bytree=0.7, gamma=0.0, learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=100, subsample=0.8; total time=   0.3s
[CV] END colsample_bytree=0.3, gamma=0.3, learning_rate=0.01, max_depth=6, min_child_weight=9, n_estimators=100, subsample=0.7; total time=   0.3s
[CV] END colsample_bytree=0.5, gamma=0.1, learning_rate=0.3, max_depth=3, min_child_weight=1, n_estimators=150, subsample=0.7; total time=   0.3s
[CV] END colsample_bytree=0.6, gamma=0.2, learning_rate=0.3, max_depth=6, min_child_weight=5, n_estimators=100, subsample=0.8; total time=   0.4s
[CV] END colsample_bytree=0.4, gamma=0.1, learning_rate=0.2, max_depth=6, min_child_weight=10, n_estimators=100, subsample=0.6; total time=   0.3s
[CV] END colsample_bytree=0.4, gamma=0.0, learning_rate=0.01, max_depth=6, min_child_weight=4, n_estimators=100, subsampl

[CV] END colsample_bytree=0.8, gamma=0.1, learning_rate=0.3, max_depth=4, min_child_weight=5, n_estimators=50, subsample=0.8; total time=   0.2s
[CV] END colsample_bytree=0.6, gamma=0.1, learning_rate=0.01, max_depth=4, min_child_weight=1, n_estimators=200, subsample=0.7; total time=   0.6s
[CV] END colsample_bytree=0.3, gamma=0.3, learning_rate=0.01, max_depth=6, min_child_weight=9, n_estimators=100, subsample=0.7; total time=   0.3s
[CV] END colsample_bytree=0.7, gamma=0.3, learning_rate=0.3, max_depth=6, min_child_weight=3, n_estimators=150, subsample=0.6; total time=   0.6s
[CV] END colsample_bytree=0.1, gamma=0.3, learning_rate=0.1, max_depth=3, min_child_weight=0, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.1, gamma=0.3, learning_rate=0.1, max_depth=3, min_child_weight=0, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.4, gamma=0.1, learning_rate=0.2, max_depth=6, min_child_weight=10, n_estimators=100, subsample=

[CV] END colsample_bytree=0.7, gamma=0.2, learning_rate=0.3, max_depth=3, min_child_weight=6, n_estimators=200, subsample=0.5; total time=   0.5s
[CV] END colsample_bytree=0.7, gamma=0.0, learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=100, subsample=0.8; total time=   0.4s
[CV] END colsample_bytree=0.3, gamma=0.3, learning_rate=0.01, max_depth=6, min_child_weight=9, n_estimators=100, subsample=0.7; total time=   0.2s
[CV] END colsample_bytree=0.5, gamma=0.1, learning_rate=0.3, max_depth=3, min_child_weight=1, n_estimators=150, subsample=0.7; total time=   0.3s
[CV] END colsample_bytree=0.5, gamma=0.1, learning_rate=0.3, max_depth=3, min_child_weight=1, n_estimators=150, subsample=0.7; total time=   0.3s
[CV] END colsample_bytree=0.7, gamma=0.1, learning_rate=0.2, max_depth=6, min_child_weight=8, n_estimators=100, subsample=0.5; total time=   0.4s
[CV] END colsample_bytree=0.1, gamma=0.0, learning_rate=0.01, max_depth=7, min_child_weight=4, n_estimators=50, subsample=0

[CV] END colsample_bytree=0.7, gamma=0.0, learning_rate=0.2, max_depth=9, min_child_weight=10, n_estimators=100, subsample=0.7; total time=   0.6s
[CV] END colsample_bytree=0.7, gamma=0.0, learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=100, subsample=0.8; total time=   0.4s
[CV] END colsample_bytree=0.7, gamma=0.3, learning_rate=0.3, max_depth=6, min_child_weight=3, n_estimators=150, subsample=0.6; total time=   0.6s
[CV] END colsample_bytree=0.7, gamma=0.1, learning_rate=0.2, max_depth=6, min_child_weight=8, n_estimators=100, subsample=0.5; total time=   0.4s
[CV] END colsample_bytree=0.1, gamma=0.0, learning_rate=0.01, max_depth=7, min_child_weight=4, n_estimators=50, subsample=0.6; total time=   0.1s
[CV] END colsample_bytree=0.1, gamma=0.0, learning_rate=0.01, max_depth=7, min_child_weight=4, n_estimators=50, subsample=0.6; total time=   0.1s
[CV] END colsample_bytree=0.4, gamma=0.0, learning_rate=0.01, max_depth=6, min_child_weight=4, n_estimators=100, subsample=

[CV] END colsample_bytree=0.7, gamma=0.0, learning_rate=0.2, max_depth=9, min_child_weight=10, n_estimators=100, subsample=0.7; total time=   0.6s
[CV] END colsample_bytree=0.6, gamma=0.2, learning_rate=0.1, max_depth=6, min_child_weight=5, n_estimators=200, subsample=0.5; total time=   0.7s
[CV] END colsample_bytree=0.6, gamma=0.2, learning_rate=0.3, max_depth=6, min_child_weight=5, n_estimators=100, subsample=0.8; total time=   0.4s
[CV] END colsample_bytree=0.3, gamma=0.0, learning_rate=0.1, max_depth=5, min_child_weight=5, n_estimators=50, subsample=0.7; total time=   0.1s
[CV] END colsample_bytree=0.4, gamma=0.1, learning_rate=0.01, max_depth=7, min_child_weight=3, n_estimators=100, subsample=0.5; total time=   0.3s
[CV] END colsample_bytree=0.4, gamma=0.1, learning_rate=0.01, max_depth=7, min_child_weight=9, n_estimators=200, subsample=0.6; total time=   0.7s
[CV] END colsample_bytree=0.7, gamma=0.2, learning_rate=0.2, max_depth=3, min_child_weight=1, n_estimators=50, subsample=0

[CV] END colsample_bytree=0.7, gamma=0.2, learning_rate=0.3, max_depth=3, min_child_weight=6, n_estimators=200, subsample=0.5; total time=   0.5s
[CV] END colsample_bytree=0.6, gamma=0.1, learning_rate=0.01, max_depth=4, min_child_weight=1, n_estimators=200, subsample=0.7; total time=   0.5s
[CV] END colsample_bytree=0.7, gamma=0.3, learning_rate=0.3, max_depth=6, min_child_weight=3, n_estimators=150, subsample=0.6; total time=   0.6s
[CV] END colsample_bytree=0.1, gamma=0.3, learning_rate=0.1, max_depth=3, min_child_weight=0, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.3, gamma=0.0, learning_rate=0.1, max_depth=5, min_child_weight=5, n_estimators=50, subsample=0.7; total time=   0.1s
[CV] END colsample_bytree=0.4, gamma=0.1, learning_rate=0.2, max_depth=6, min_child_weight=10, n_estimators=100, subsample=0.6; total time=   0.3s
[CV] END colsample_bytree=0.4, gamma=0.1, learning_rate=0.01, max_depth=7, min_child_weight=9, n_estimators=200, subsample=

RandomizedSearchCV(cv=3,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           callbacks=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None,
                                           early_stopping_rounds=None,
                                           enable_categorical=False,
                                           eval_metric=None, feature_types=None,
                                           gamma=None, gpu_id=None,
                                           grow_policy=None,
                                           importance_type=None,
                                           interaction_constraints=None,
                                           learning_rate...
                                           num_parallel_tree=None,
                            

In [17]:
xg_random.best_params_

{'subsample': 0.5,
 'n_estimators': 50,
 'min_child_weight': 4,
 'max_depth': 5,
 'learning_rate': 0.1,
 'gamma': 0.2,
 'colsample_bytree': 0.4}

In [46]:
xg = xg_random.best_estimator_
xg.fit(X_train, y_train)
threshold = 0.41
predicted_proba = xg.predict_proba(X_test)
y_pred = (predicted_proba [:,1] >= threshold).astype('int')

In [21]:
for threshold in np.linspace(0.31, 0.5, 20):
    predicted_proba = xg.predict_proba(X_test)
    y_pred = (predicted_proba [:,1] >= threshold).astype('int')
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred)
    print(threshold, f1+auc)

0.31 1.3836329370668772
0.32 1.393284199589056
0.33 1.3985280795350628
0.33999999999999997 1.4040305238919584
0.35 1.4071333606397523
0.36 1.4085582607315725
0.37 1.414877377801742
0.38 1.419164410558477
0.39 1.4217777301465178
0.4 1.4199251695958384
0.41000000000000003 1.4235058674090233
0.42 1.4195117039738787
0.43 1.4214050833091965
0.44 1.4176749058575187
0.45 1.4178431660509223
0.45999999999999996 1.415296424588262
0.47 1.4170083781224871
0.48 1.4153418485336604
0.49 1.4132758606868032
0.5 1.4098030627599716


In [47]:
cm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)
print(acc, f1, auc)

0.6931920244296749 0.7647382920110193 0.6587675753980039


In [48]:
cm

array([[1083, 1147],
       [ 561, 2776]])

In [10]:
# export result for second stage
test['predict'] = y_pred
user_id = test[test.predict == 1].index
%store user_id

Stored 'user_id' (Index)


Stored 'user_id' (Index)
