In [12]:
import gc
import operator
import os

from glob import glob

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import xgbfir

from hyperopt import hp, fmin, tpe, space_eval, STATUS_OK, Trials
from pandas.core.categorical import Categorical
from scipy.sparse import csr_matrix, hstack
from sklearn.metrics import roc_auc_score

In [2]:
# Column dtypes passed to pd.read_csv when the CSV files are loaded
dtypes = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
)

# Features used in training
categorical_features = ['app', 'device', 'os', 'channel']

# Two aggregates (attribution rate and click count) for every
# (entity, time window) pair; the nesting order below fixes the column order.
numerical_features = [
    f"{aggregate}_{entity}_{window}"
    for entity in ('app', 'device', 'os', 'channel')
    for window in ('1day', '1hour', '3hours')
    for aggregate in ('PERCENT_TRUE(clicks.is_attributed)', 'COUNT(clicks)')
]

# Input files: the first list is used for training, the second for validation
train_files = ['../data/processed/train_2017-11-08_1600.csv']
validation_files = ['../data/processed/test_2017-11-09_0600.csv']

In [3]:
def sparse_dummies(df, column):
    """Return a sparse one-hot encoding of ``df[column]`` plus its column labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to encode.
    column : str
        Name of the categorical column.

    Returns
    -------
    matrix : scipy.sparse.csr_matrix
        Shape ``(len(df), n_categories)``; row ``r`` has a single ``1.0`` in
        the column given by the category code of ``df[column].iloc[r]``.
        Rows whose value is missing are left all-zero.
    column_names : numpy.ndarray
        Labels ``"{column}_{code}"``, one per matrix column. Note the suffix
        is the category *code* (its position in the categorical's category
        index), not the original value.
    """
    # pd.Categorical is the public entry point; the private
    # pandas.core.categorical path is not available in newer pandas releases.
    categories = pd.Categorical(df[column])
    n_rows = len(categories)
    n_categories = len(categories.categories)
    column_names = np.array([f"{column}_{i}" for i in range(n_categories)])

    # Missing values are encoded as -1, which is not a valid column index,
    # so those rows are excluded and end up as all-zero rows.
    codes = np.asarray(categories.codes)
    present = codes >= 0
    row_numbers = np.arange(n_rows)[present]
    ones = np.ones(int(present.sum()))

    # An explicit shape keeps the width tied to the number of categories
    # instead of being inferred from the largest index that happens to occur.
    matrix = csr_matrix(
        (ones, (row_numbers, codes[present])),
        shape=(n_rows, n_categories),
    )
    return matrix, column_names

In [4]:
# Read the training files first so the train/validation boundary is known
# before any validation rows are appended.
frames = [pd.read_csv(f, dtype=dtypes) for f in train_files]
train_size = sum(len(frame) for frame in frames)
print(f"train_size: {train_size}")

# Validation rows are stacked after the training rows; `train_size` is the
# row offset at which they start in the combined frame.
frames.extend(pd.read_csv(f, dtype=dtypes) for f in validation_files)

# Concatenate once, then drop the list so the per-file frames are not kept
# alive by a leftover reference after df_train itself is deleted later on.
df_train = pd.concat(frames, ignore_index=True)
del frames

train_size: 30313848


In [5]:
# Display the column labels of the combined (training + validation) frame
df_train.columns

Index(['app', 'device', 'os', 'channel', 'is_attributed',
       'PERCENT_TRUE(clicks.is_attributed)_app_1day', 'COUNT(clicks)_app_1day',
       'PERCENT_TRUE(clicks.is_attributed)_app_1hour',
       'COUNT(clicks)_app_1hour',
       'PERCENT_TRUE(clicks.is_attributed)_app_3hours',
       'COUNT(clicks)_app_3hours',
       'PERCENT_TRUE(clicks.is_attributed)_device_1day',
       'COUNT(clicks)_device_1day',
       'PERCENT_TRUE(clicks.is_attributed)_device_1hour',
       'COUNT(clicks)_device_1hour',
       'PERCENT_TRUE(clicks.is_attributed)_device_3hours',
       'COUNT(clicks)_device_3hours',
       'PERCENT_TRUE(clicks.is_attributed)_os_1day', 'COUNT(clicks)_os_1day',
       'PERCENT_TRUE(clicks.is_attributed)_os_1hour', 'COUNT(clicks)_os_1hour',
       'PERCENT_TRUE(clicks.is_attributed)_os_3hours',
       'COUNT(clicks)_os_3hours',
       'PERCENT_TRUE(clicks.is_attributed)_channel_1day',
       'COUNT(clicks)_channel_1day',
       'PERCENT_TRUE(clicks.is_attributed)_channel_1hou

In [6]:
# One sparse one-hot block (and its column labels) per categorical feature
matrices, all_column_names = [], []
for feature in categorical_features:
    encoded, labels = sparse_dummies(df_train, feature)
    matrices.append(encoded)
    all_column_names.append(labels)

# The numerical features follow as a single block, one column per feature
numeric_frame = df_train[numerical_features]
matrices.append(csr_matrix(numeric_frame.values, dtype=float))
all_column_names.append(numeric_frame.columns.values)

# Stack the blocks side by side; the labels are joined in the same order
train_sparse = hstack(matrices, format="csr")
feature_names = np.concatenate(all_column_names)
del matrices, all_column_names, numeric_frame

# Design matrix and target used by the following cells
X = train_sparse
y = df_train['is_attributed']

# The source frame is no longer needed once X and y exist
del df_train
gc.collect()

19

In [7]:
# Write the training and validation splits to XGBoost binary files.
# Rows before `train_size` are the training set, the rest the validation set.
train_rows = slice(None, train_size)
train_dmatrix = xgb.DMatrix(X[train_rows], y.iloc[train_rows], feature_names=feature_names)
train_dmatrix.save_binary('../data/cache/train.bin')
del train_dmatrix
gc.collect()

valid_rows = slice(train_size, None)
valid_dmatrix = xgb.DMatrix(X[valid_rows], y.iloc[valid_rows], feature_names=feature_names)
valid_dmatrix.save_binary('../data/cache/validate.bin')
# The in-memory matrices are released here as well; later cells read the
# cached files instead.
del valid_dmatrix, X, y
gc.collect()

41

In [8]:
# Example XGBoost parameter set (not produced by the hyperparameter search)
params = dict(
    eta=0.3,
    # Histogram-based, leaf-wise tree growth
    tree_method="hist",
    grow_policy="lossguide",
    max_leaves=10000,
    max_depth=0,  # per the XGBoost docs, 0 means no depth limit
    # Row subsampling and L1 regularisation
    subsample=0.5,
    alpha=4,
    # Binary classification; positives are up-weighted and AUC is tracked
    objective='binary:logistic',
    scale_pos_weight=50,
    eval_metric='auc',
    # Runtime settings
    nthread=24,
    silent=1,
)

In [10]:
# Reload both splits from the cached binary files, train first, then validation
dmtrain, dmvalid = (
    xgb.DMatrix(cache_path, feature_names=feature_names)
    for cache_path in ('../data/cache/train.bin', '../data/cache/validate.bin')
)

In [17]:
def objective(params):
    """Hyperopt objective: train XGBoost with ``params`` and score it on validation data.

    Parameters
    ----------
    params : dict
        Sampled hyperparameters. Must contain ``'n_estimators'`` (used as the
        number of boosting rounds); every other entry is passed to
        ``xgb.train`` as a booster parameter. The dict is not modified.

    Returns
    -------
    dict
        ``{'loss': 1 - auc, 'status': STATUS_OK}``, where ``auc`` is the ROC AUC
        of the predictions on ``dmvalid``.

    Notes
    -----
    Reads the module-level ``dmtrain`` and ``dmvalid`` matrices.
    """
    # Work on a copy: removing 'n_estimators' from the caller's dict would
    # make a second call with the same dict fail with KeyError.
    params = dict(params)
    # 'n_estimators' is the round budget for xgb.train, not a booster parameter
    num_round = int(params.pop('n_estimators'))
    watchlist = [(dmtrain, 'train'), (dmvalid, 'valid')]
    model = xgb.train(params, dmtrain, num_round, watchlist, maximize=True, early_stopping_rounds=20, verbose_eval=5)
    # Predict with the trees up to the best early-stopping iteration only
    pred = model.predict(dmvalid, ntree_limit=model.best_ntree_limit)
    auc = roc_auc_score(dmvalid.get_label(), pred)
    # Release the booster and predictions before the next trial starts
    del model, pred
    gc.collect()
    print(params)
    print(f"SCORE: {auc}")
    print("\n\n\n")
    # fmin minimises the loss, so the AUC is turned into 1 - AUC
    return { 'loss': 1-auc, 'status': STATUS_OK }

# Search space handed to hyperopt. Entries built with hp.* are sampled per
# trial; plain values are passed through to every trial unchanged.
space = {
    'n_estimators': hp.quniform('n_estimators', 200, 600, 50),
    'eta': hp.quniform('eta', 0.025, 0.25, 0.025),
    'max_depth': hp.choice('max_depth', np.arange(1, 14, dtype=int)),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
    'subsample': hp.quniform('subsample', 0.7, 1, 0.05),
    'gamma': hp.quniform('gamma', 0.5, 1, 0.05),
    'colsample_bytree': hp.quniform('colsample_bytree', 0.7, 1, 0.05),
    'alpha' : hp.quniform('alpha', 0, 10, 1),
    'lambda': hp.quniform('lambda', 1, 2, 0.1),
    'scale_pos_weight': hp.quniform('scale_pos_weight', 50, 200, 10),
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': "hist",
    'grow_policy': "lossguide",
    'booster': 'gbtree',
    'nthread': 24,
    'silent': 1
}

# `trials` records every evaluation made by fmin
trials = Trials()
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials
)

# For hp.choice parameters (max_depth here) fmin reports the *index* of the
# selected option, not the option itself. space_eval maps the result back onto
# the search space so the printed values are the actual hyperparameters.
best_params = space_eval(space, best)

print("\n\n\n The best hyperparameters:")
print(best_params)

[0]	train-auc:0.866869	valid-auc:0.884338
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 20 rounds.
[5]	train-auc:0.935923	valid-auc:0.933066
[10]	train-auc:0.941264	valid-auc:0.933464
[15]	train-auc:0.947676	valid-auc:0.93616
[20]	train-auc:0.950397	valid-auc:0.93785
[25]	train-auc:0.953062	valid-auc:0.939817
[30]	train-auc:0.954569	valid-auc:0.940253
[35]	train-auc:0.955557	valid-auc:0.940355
[40]	train-auc:0.956972	valid-auc:0.940332
[45]	train-auc:0.957779	valid-auc:0.940325
[50]	train-auc:0.959248	valid-auc:0.940303
[55]	train-auc:0.959611	valid-auc:0.939851
[60]	train-auc:0.960216	valid-auc:0.939031
Stopping. Best iteration:
[42]	train-auc:0.957163	valid-auc:0.940942

{'alpha': 3.0, 'booster': 'gbtree', 'colsample_bytree': 0.7000000000000001, 'eta': 0.25, 'eval_metric': 'auc', 'gamma': 0.7000000000000001, 'grow_policy': 'lossguide', 'lambda': 1.3, 'max_depth': 3, 'min_child_weight': 4.0, 'nthread

[15]	train-auc:0.957365	valid-auc:0.930291
[20]	train-auc:0.959333	valid-auc:0.929557
[25]	train-auc:0.961875	valid-auc:0.931708
[30]	train-auc:0.962731	valid-auc:0.932211
[35]	train-auc:0.96363	valid-auc:0.934375
[40]	train-auc:0.965189	valid-auc:0.935494
[45]	train-auc:0.966962	valid-auc:0.936216
[50]	train-auc:0.969051	valid-auc:0.935132
[55]	train-auc:0.970244	valid-auc:0.933764
[60]	train-auc:0.971147	valid-auc:0.933114
[65]	train-auc:0.971818	valid-auc:0.930558
Stopping. Best iteration:
[46]	train-auc:0.967144	valid-auc:0.936417

{'alpha': 6.0, 'booster': 'gbtree', 'colsample_bytree': 0.9500000000000001, 'eta': 0.05, 'eval_metric': 'auc', 'gamma': 0.5, 'grow_policy': 'lossguide', 'lambda': 1.1, 'max_depth': 9, 'min_child_weight': 5.0, 'nthread': 24, 'objective': 'binary:logistic', 'scale_pos_weight': 180.0, 'silent': 1, 'subsample': 0.9500000000000001, 'tree_method': 'hist'}
SCORE: 0.9364166355054073




[0]	train-auc:0.851113	valid-auc:0.881034
Multiple eval metrics have been pa

[25]	train-auc:0.957889	valid-auc:0.937133
[30]	train-auc:0.958926	valid-auc:0.937605
[35]	train-auc:0.959367	valid-auc:0.937831
[40]	train-auc:0.96052	valid-auc:0.938268
[45]	train-auc:0.961168	valid-auc:0.939291
[50]	train-auc:0.9614	valid-auc:0.939164
[55]	train-auc:0.962049	valid-auc:0.939865
[60]	train-auc:0.962453	valid-auc:0.939671
[65]	train-auc:0.96366	valid-auc:0.93934
[70]	train-auc:0.964116	valid-auc:0.93943
[75]	train-auc:0.964559	valid-auc:0.939381


KeyboardInterrupt: 