In [10]:
import gc
import operator
import os

from glob import glob

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import xgbfir

from hyperopt import STATUS_OK, Trials, fmin, hp, space_eval, tpe
from pandas.core.categorical import Categorical
from scipy.sparse import csr_matrix, hstack
from scipy.stats import rankdata

In [2]:
# Compact integer dtypes passed to pd.read_csv for the click columns.
dtypes = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
)

# Features used in training
categorical_features = ['app', 'device', 'os', 'channel']

# PERCENT_TRUE(clicks.is_attributed) columns: one per (entity, window) suffix,
# grouped by entity, with the windows in the order 1day, 1hour, 3hours.
numerical_features = [
    f'PERCENT_TRUE(clicks.is_attributed)_{entity}_{window}'
    for entity in ('app', 'device', 'os', 'channel')
    for window in ('1day', '1hour', '3hours')
]

# Training files for 2017-11-08, suffixes 0000 through 0400.
train_files = [
    f'../data/interim/combined/train_2017-11-08_{hour:02d}00.csv'
    for hour in range(5)
]

# Validation uses a single file dated 2017-11-09.
validation_files = ['../data/interim/combined/train_2017-11-09_0400.csv']

In [3]:
def sparse_dummies(df, column):
    """Return a sparse one-hot encoding of ``df[column]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to encode.
    column : str
        Name of the column to encode.

    Returns
    -------
    matrix : scipy.sparse.csr_matrix
        Matrix of shape ``(len(df), n_categories)`` with a 1.0 in the column
        of each row's category. Rows whose value is missing are left all-zero.
    column_names : numpy.ndarray
        Names ``"{column}_{i}"``, where ``i`` is the category *code* (its
        position in ``Categorical.categories``), not the raw value.
    """
    # Public pandas entry point; the private pandas.core.categorical path is
    # not available in current pandas releases.
    categories = pd.Categorical(df[column])
    codes = np.asarray(categories.codes)
    n_rows = len(codes)
    n_categories = len(categories.categories)
    column_names = np.array([f"{column}_{i}" for i in range(n_categories)])

    # Code -1 marks a missing value; csr_matrix rejects negative column
    # indices, so those rows simply get no entry.
    present = codes >= 0
    # np.intp replaces the removed np.int alias.
    row_numbers = np.arange(n_rows, dtype=np.intp)[present]
    ones = np.ones((row_numbers.shape[0],))

    # Explicit shape keeps the matrix width equal to len(column_names) even
    # when the highest category codes never occur in the data.
    matrix = csr_matrix(
        (ones, (row_numbers, codes[present])),
        shape=(n_rows, n_categories),
    )
    return matrix, column_names

In [4]:
# Read the training files and record how many rows they contribute; the
# validation rows are appended after them, so the two splits can later be
# separated again by position (rows [:train_size] vs. rows [train_size:]).
train_frames = [pd.read_csv(f, dtype=dtypes) for f in train_files]
train_size = sum(len(frame) for frame in train_frames)
print(f"train_size: {train_size}")

validation_frames = [pd.read_csv(f, dtype=dtypes) for f in validation_files]

# Concatenate once: building the training frame first and then concatenating
# it again with the validation frame copies every training row twice.
df_train = pd.concat(train_frames + validation_frames, ignore_index=True)
del train_frames, validation_frames

train_size: 17468845


In [5]:
# Display the column index of the combined (train + validation) frame.
df_train.columns

Index(['app', 'device', 'os', 'channel', 'is_attributed',
       'PERCENT_TRUE(clicks.is_attributed)_app_1day',
       'PERCENT_TRUE(clicks.is_attributed)_app_1hour',
       'PERCENT_TRUE(clicks.is_attributed)_app_3hours',
       'PERCENT_TRUE(clicks.is_attributed)_device_1day',
       'PERCENT_TRUE(clicks.is_attributed)_device_1hour',
       'PERCENT_TRUE(clicks.is_attributed)_device_3hours',
       'PERCENT_TRUE(clicks.is_attributed)_os_1day',
       'PERCENT_TRUE(clicks.is_attributed)_os_1hour',
       'PERCENT_TRUE(clicks.is_attributed)_os_3hours',
       'PERCENT_TRUE(clicks.is_attributed)_channel_1day',
       'PERCENT_TRUE(clicks.is_attributed)_channel_1hour',
       'PERCENT_TRUE(clicks.is_attributed)_channel_3hours'],
      dtype='object')

In [6]:
# One-hot encode every categorical column into its own sparse block,
# keeping the generated column names alongside.
encoded = [sparse_dummies(df_train, name) for name in categorical_features]
blocks = [block for block, _ in encoded]
name_groups = [names for _, names in encoded]

# The numerical features form one more block, one column per feature.
blocks.append(csr_matrix(df_train[numerical_features].values, dtype=float))
name_groups.append(df_train[numerical_features].columns.values)

# Stack the blocks side by side; the flattened names follow the same order.
train_sparse = hstack(blocks, format="csr")
feature_names = np.concatenate(name_groups)
del encoded, blocks, name_groups

X = train_sparse
y = df_train['is_attributed']

# The dataframe is no longer needed once the features and target are extracted.
del df_train
gc.collect()

78

In [7]:
# Rows before train_size are the training split, the remainder is validation.
# Each split is converted to a DMatrix, written to the binary cache and
# released before the next one is built.
splits = (
    (slice(None, train_size), '../data/cache/train.bin'),
    (slice(train_size, None), '../data/cache/validate.bin'),
)
for rows, cache_path in splits:
    part = xgb.DMatrix(X[rows], y.iloc[rows], feature_names=feature_names)
    part.save_binary(cache_path)
    del part
    gc.collect()

del X, y
gc.collect()

28

In [8]:
# Reload both splits from the binary caches, training first, then validation.
dmtrain, dmvalid = (
    xgb.DMatrix(cache_path, feature_names=feature_names)
    for cache_path in ('../data/cache/train.bin', '../data/cache/validate.bin')
)

In [11]:
def _roc_auc(labels, scores):
    """Area under the ROC curve via the rank-sum (Mann-Whitney U) identity.

    Tied scores receive average ranks, so a tied positive/negative pair
    counts as half a correctly ordered pair.

    Raises
    ------
    ValueError
        If ``labels`` does not contain both classes (AUC is undefined).
    """
    positives = np.asarray(labels) == 1
    n_pos = int(positives.sum())
    n_neg = int(positives.size) - n_pos
    if n_pos == 0 or n_neg == 0:
        raise ValueError("ROC AUC is undefined when only one class is present")
    ranks = rankdata(scores)
    rank_sum = float(ranks[positives].sum())
    return (rank_sum - n_pos * (n_pos + 1) / 2.0) / (n_pos * n_neg)


def objective(params):
    """Hyperopt objective: train one XGBoost model and score it on validation.

    Parameters
    ----------
    params : dict
        One sample from the search space. ``n_estimators`` is used as the
        maximum number of boosting rounds; every other entry is passed to
        ``xgb.train`` as a booster parameter.

    Returns
    -------
    dict
        ``{'loss': 1 - auc, 'status': STATUS_OK}``, where ``auc`` is the ROC
        AUC of the model's predictions on ``dmvalid``.
    """
    # Work on a copy so the caller's dict keeps its 'n_estimators' entry.
    params = dict(params)
    num_round = int(params.pop('n_estimators'))

    # The validation set is listed last, so it drives early stopping.
    watchlist = [(dmtrain, 'train'), (dmvalid, 'valid')]
    model = xgb.train(
        params,
        dmtrain,
        num_round,
        watchlist,
        maximize=True,  # AUC: higher is better
        early_stopping_rounds=20,
        verbose_eval=1,
    )
    pred = model.predict(dmvalid)
    auc = _roc_auc(dmvalid.get_label(), pred)

    # Free the booster and predictions before the next trial starts.
    del model, pred
    gc.collect()
    print(f"SCORE: {auc}")
    return { 'loss': 1-auc, 'status': STATUS_OK }

# Search space: sampled hyperparameters plus the fixed booster settings that
# are passed through to xgb.train unchanged.
space = {
    'n_estimators': hp.quniform('n_estimators', 200, 600, 50),
    'eta': hp.quniform('eta', 0.025, 0.25, 0.025),
    'max_depth': hp.choice('max_depth', np.arange(1, 14, dtype=int)),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
    'subsample': hp.quniform('subsample', 0.7, 1, 0.05),
    'gamma': hp.quniform('gamma', 0.5, 1, 0.05),
    'colsample_bytree': hp.quniform('colsample_bytree', 0.7, 1, 0.05),
    'alpha' : hp.quniform('alpha', 0, 10, 1),
    'lambda': hp.quniform('lambda', 1, 2, 0.1),
    'scale_pos_weight': hp.quniform('scale_pos_weight', 50, 200, 10),
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': "hist",
    'booster': 'gbtree',
    'nthread': 24,
    'silent': 1
}

trials = Trials()
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials
)

# fmin reports hp.choice parameters (here max_depth) as an index into the
# list of options, not as the chosen value. Resolve the result against the
# space so the printed hyperparameters are the ones actually used.
print(space_eval(space, best))

[0]	train-auc:0.957747	valid-auc:0.913551
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 20 rounds.
[1]	train-auc:0.959127	valid-auc:0.948722
[2]	train-auc:0.95925	valid-auc:0.954286
[3]	train-auc:0.959466	valid-auc:0.955844
[4]	train-auc:0.961319	valid-auc:0.956698
[5]	train-auc:0.961735	valid-auc:0.957563
[6]	train-auc:0.962606	valid-auc:0.958516
[7]	train-auc:0.963788	valid-auc:0.959453
[8]	train-auc:0.964279	valid-auc:0.959822
[9]	train-auc:0.964574	valid-auc:0.960295
[10]	train-auc:0.965383	valid-auc:0.960551
[11]	train-auc:0.966956	valid-auc:0.961485
[12]	train-auc:0.968428	valid-auc:0.962826
[13]	train-auc:0.968662	valid-auc:0.963252
[14]	train-auc:0.969412	valid-auc:0.963471
[15]	train-auc:0.969988	valid-auc:0.96344
[16]	train-auc:0.97055	valid-auc:0.963915
[17]	train-auc:0.97098	valid-auc:0.963796
[18]	train-auc:0.971239	valid-auc:0.964064
[19]	train-auc:0.971847	valid-auc:0.964259
[20]	train-

KeyboardInterrupt: 