In [1]:
import gc
import math
import time
from contextlib import contextmanager
from glob import glob
from itertools import combinations
from pathlib import Path

import lightgbm as lgb
import numpy as np
import pandas as pd
from hyperopt import STATUS_OK, Trials, fmin, hp, space_eval, tpe
from sklearn.metrics import roc_auc_score


In [2]:
# Unsigned integer dtype for each click column. split() casts the key columns
# it parses out of feature-file indexes to these types.
dtypes = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
)

In [3]:
# Columns that features are keyed on, individually and in pairs.
target_entities_init = ['app', 'device', 'os', 'channel', 'ip']

# Single columns stay plain strings; pairs become two-element lists.
single_entities = [combo[0] for combo in combinations(target_entities_init, 1)]
paired_entities = [list(combo) for combo in combinations(target_entities_init, 2)]
target_entities = single_entities + paired_entities
# 3- and 4-way combinations are not generated; split() only parses two-part keys.

# The channel/ip pair is excluded from the entity list.
target_entities.remove(['channel', 'ip'])

In [4]:
def split(df, target_entity):
    """Expose a feature frame's index as regular key column(s).

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame. It is modified in place and also returned.
    target_entity : str or sequence of two str
        A single column name, or the two column names encoded in the index
        as ``"<first>_<second>"``.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the key column(s) added. For a pair, each column is cast
        to the dtype registered for it in the module-level ``dtypes`` mapping.
    """
    if isinstance(target_entity, str):
        df[target_entity] = df.index
        return df

    # Split on the first underscore only. `n` is passed by keyword and the
    # pieces come back as columns 0 and 1, which avoids unpacking the `.str`
    # accessor (rejected by current pandas releases).
    parts = df.index.to_series().str.split('_', n=1, expand=True)
    for position in (0, 1):
        column = target_entity[position]
        # to_numpy() makes the assignment positional rather than index-aligned.
        df[column] = parts[position].astype(dtypes[column]).to_numpy()
    return df

def combine_features(df, features_prefix, feature_suffix):
    """Left-join every stored feature table onto ``df``.

    For each entry of the module-level ``target_entities`` list, the feature
    files under ``../data/interim/features/<entity>/`` whose names match
    ``<features_prefix>*<feature_suffix>.hdf.compress`` are read in sorted
    order, their index is turned into key column(s) by ``split``, and the
    result is merged into ``df`` on those key column(s).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to enrich; it must contain the entity columns.
    features_prefix : str
        Leading part of the feature file names.
    feature_suffix : str
        Trailing part of the feature file names (before ``.hdf.compress``).

    Returns
    -------
    pandas.DataFrame
        The merged frame.

    Raises
    ------
    AssertionError
        If an entity has no matching feature file.
    """
    for entity in target_entities:
        # Pair entities live in a directory named "<first>_<second>".
        entity_dir = entity if type(entity) == str else "_".join(entity)
        pattern = f"../data/interim/features/{entity_dir}/{features_prefix}*{feature_suffix}.hdf.compress"
        paths = glob(pattern)
        paths.sort()
        assert len(paths) > 0
        for path in paths:
            entity_features = split(pd.read_hdf(path), entity)
            df = pd.merge(df, entity_features, how='left', left_on=entity, right_on=entity)
            # Release the feature table before loading the next one.
            del entity_features
            gc.collect()
    return df

In [5]:
def load_or_build_cached(cache_path, label, start_day_hour, stop_day_hour, features_prefix, force=False):
    """Return one merged click frame, building and caching it when needed.

    Parameters
    ----------
    cache_path : str
        HDF file that holds (or will hold) the merged frame.
    label : str
        Word used in the progress message, e.g. ``"Train"``.
    start_day_hour, stop_day_hour : tuple of (day, hour)
        Rows of ``../data/interim/day_hour_train.csv`` whose ``start`` and
        ``end`` values bound the slice read from the raw train file.
    features_prefix : str
        File-name prefix passed on to ``combine_features``.
    force : bool, default False
        Rebuild the frame even if the cache file already exists.

    Returns
    -------
    pandas.DataFrame
        The cached frame, or the freshly built one.
    """
    if Path(cache_path).exists() and not force:
        return pd.read_hdf(cache_path)

    print(f"{label} cache doesn't exist, creating")
    summary = pd.read_csv('../data/interim/day_hour_train.csv')

    def boundary(day_hour, column):
        # Look up the row offset recorded for one (day, hour) bucket.
        day, hour = day_hour
        in_bucket = (summary['day'] == day) & (summary['hour'] == hour)
        return summary[in_bucket][column].values[0]

    df = pd.read_hdf(
        '../data/raw/train.hdf.compress',
        start=boundary(start_day_hour, 'start'),
        stop=boundary(stop_day_hour, 'end'),
    )
    df = combine_features(df, features_prefix, 'attributed2')

    # to_hdf cannot create missing directories, so make sure the cache dir exists.
    Path(cache_path).parent.mkdir(parents=True, exist_ok=True)
    # Both caches are stored under the key 'train'; each file holds a single
    # object, so read_hdf finds it without being given a key.
    df.to_hdf(cache_path, key='train', mode='w', complib='blosc', fletcher32=True, complevel=9)
    return df


# Set to True to rebuild both caches even if the files exist.
force = False

cache_train = '../data/cache/train_lgbm.hdf.compress'
df_train = load_or_build_cached(
    cache_train, 'Train', (7, 17), (8, 4), 'features_2017-11-07_1700', force=force
)

cache_val = '../data/cache/validate_lgbm.hdf.compress'
df_val = load_or_build_cached(
    cache_val, 'Validation', (9, 4), (9, 4), 'features_2017-11-08_1700', force=force
)

In [27]:
# Print every training column whose name starts with 'COUN', one per line,
# quoted and comma-terminated.
count_columns = [name for name in df_train.columns if name.startswith('COUN')]
for name in count_columns:
    print(f"'{name}',")

'COUNT(clicks)_app_1day',
'COUNT(clicks WHERE is_attributed = True)_app_1day',
'COUNT(clicks)_device_1day',
'COUNT(clicks WHERE is_attributed = True)_device_1day',
'COUNT(clicks)_os_1day',
'COUNT(clicks WHERE is_attributed = True)_os_1day',
'COUNT(clicks)_channel_1day',
'COUNT(clicks WHERE is_attributed = True)_channel_1day',
'COUNT(clicks)_ip_1day',
'COUNT(clicks WHERE is_attributed = True)_ip_1day',
'COUNT(clicks)_app_device_1day',
'COUNT(clicks WHERE is_attributed = True)_app_device_1day',
'COUNT(clicks)_app_os_1day',
'COUNT(clicks WHERE is_attributed = True)_app_os_1day',
'COUNT(clicks)_app_channel_1day',
'COUNT(clicks WHERE is_attributed = True)_app_channel_1day',
'COUNT(clicks)_app_ip_1day',
'COUNT(clicks WHERE is_attributed = True)_app_ip_1day',
'COUNT(clicks)_device_os_1day',
'COUNT(clicks WHERE is_attributed = True)_device_os_1day',
'COUNT(clicks)_device_channel_1day',
'COUNT(clicks WHERE is_attributed = True)_device_channel_1day',
'COUNT(clicks)_device_ip_1day',
'COUNT(clicks

In [17]:
# Entity keys the features are computed over, in the order the columns are listed.
predictor_entities = [
    'app', 'device', 'os', 'channel', 'ip',
    'app_device', 'app_os', 'app_channel', 'app_ip',
    'device_os', 'device_channel', 'device_ip',
    'os_channel', 'os_ip',
]

# Aggregations used as model inputs; each one exists for every entity key.
predictor_aggregations = [
    'PERCENT_TRUE(clicks.is_attributed)',
    'AVG_TIME_BETWEEN(clicks.click_time WHERE is_attributed = True)',
    'COUNT(clicks WHERE is_attributed = True)',
]

# Column names follow "<aggregation>_<entity>_1day", grouped by aggregation.
predictors = [
    f"{aggregation}_{entity}_1day"
    for aggregation in predictor_aggregations
    for entity in predictor_entities
]

In [18]:
def _build_lgb_dataset(frame, reference=None):
    """Wrap the predictor columns and the ``is_attributed`` label of ``frame``
    in a LightGBM Dataset, optionally tied to a reference Dataset."""
    return lgb.Dataset(
        frame[predictors].values,
        label=frame['is_attributed'].values,
        feature_name=predictors,
        reference=reference,
    )


dtrain = _build_lgb_dataset(df_train)
# The validation set is created with the training set as its reference.
dvalid = _build_lgb_dataset(df_val, reference=dtrain)

In [20]:
def objective(params):
    """Train LightGBM with one sampled parameter set and score it for hyperopt.

    Trains on the module-level ``dtrain`` for up to 1000 rounds, evaluating on
    ``dtrain`` and ``dvalid`` with early stopping after 30 rounds and a
    progress line every 20 rounds.

    Parameters
    ----------
    params : dict
        LightGBM parameters sampled from the search space.

    Returns
    -------
    dict
        ``{'loss': 1 - validation AUC, 'status': STATUS_OK}``; hyperopt
        minimises ``loss``.

    Notes
    -----
    NOTE(review): recent LightGBM releases appear to disable early stopping
    when ``boosting_type`` is ``'dart'``; such trials would then run all 1000
    rounds and report the final-round AUC. Confirm against the installed
    version.
    """
    # Early stopping and logging go through callbacks: the
    # `early_stopping_rounds` / `verbose_eval` keyword arguments were removed
    # from lgb.train in LightGBM 4. `log_evaluation` is the newer name of
    # `print_evaluation`, so use whichever the installed release provides.
    make_log_callback = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation
    lgb_model = lgb.train(
        params,
        dtrain,
        num_boost_round=1000,
        valid_sets=[dtrain, dvalid],
        valid_names=['train', 'valid'],
        callbacks=[lgb.early_stopping(30), make_log_callback(20)],
    )
    auc = lgb_model.best_score['valid']['auc']
    print(f"Params: {params}")
    print(f"SCORE: {auc}")
    return {'loss': 1 - auc, 'status': STATUS_OK}

# Candidate values for the integer-valued dimensions of the search.
num_leaves_options = np.arange(10, 200, 10, dtype=int)
min_child_samples_options = np.arange(100, 1000, 100, dtype=int)

# Hyperopt search space: `hp.*` entries are sampled per trial, everything else
# is passed to LightGBM unchanged.
# Not set here, so LightGBM defaults apply: max_bin, subsample, subsample_freq,
# colsample_bytree, subsample_for_bin, min_split_gain, reg_alpha, reg_lambda,
# nthread, scale_pos_weight.
space = dict(
    boosting_type=hp.choice('boosting_type', ['gbdt', 'dart']),
    objective='binary',
    metric='auc',
    learning_rate=hp.quniform('learning_rate', 0.05, 0.5, 0.05),
    num_leaves=hp.choice('num_leaves', num_leaves_options),
    max_depth=-1,
    min_child_samples=hp.choice('min_child_samples', min_child_samples_options),
    min_child_weight=5,
    verbose=0,
    is_unbalance=True,
)

# Run the TPE search; `trials` records every evaluation.
trials = Trials()
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials
)

# For hp.choice dimensions fmin reports the *index* of the chosen option
# (e.g. num_leaves=5 means the sixth candidate), so map the result back
# through the search space to get the parameters LightGBM actually received.
best_params = space_eval(space, best)

print("\n\n\nThe best parameters:")
print(best_params)

Training until validation scores don't improve for 30 rounds.
[20]	train's auc: 0.979966	valid's auc: 0.958814
[40]	train's auc: 0.981522	valid's auc: 0.955886
Early stopping, best iteration is:
[12]	train's auc: 0.977553	valid's auc: 0.960202
Params: {'boosting_type': 'dart', 'is_unbalance': True, 'learning_rate': 0.45, 'max_depth': -1, 'metric': 'auc', 'min_child_samples': 200, 'min_child_weight': 5, 'num_leaves': 90, 'objective': 'binary', 'verbose': 0}
SCORE: 0.9602022871111795
Training until validation scores don't improve for 30 rounds.
[20]	train's auc: 0.977548	valid's auc: 0.961103
[40]	train's auc: 0.981482	valid's auc: 0.961014
Early stopping, best iteration is:
[18]	train's auc: 0.976872	valid's auc: 0.961377
Params: {'boosting_type': 'dart', 'is_unbalance': True, 'learning_rate': 0.30000000000000004, 'max_depth': -1, 'metric': 'auc', 'min_child_samples': 900, 'min_child_weight': 5, 'num_leaves': 60, 'objective': 'binary', 'verbose': 0}
SCORE: 0.9613769821411782
Training un

Training until validation scores don't improve for 30 rounds.
[20]	train's auc: 0.973103	valid's auc: 0.959355
[40]	train's auc: 0.975388	valid's auc: 0.960481
[60]	train's auc: 0.976256	valid's auc: 0.960803
[80]	train's auc: 0.977044	valid's auc: 0.961251
[100]	train's auc: 0.977346	valid's auc: 0.961105
Early stopping, best iteration is:
[80]	train's auc: 0.977044	valid's auc: 0.961251
Params: {'boosting_type': 'dart', 'is_unbalance': True, 'learning_rate': 0.1, 'max_depth': -1, 'metric': 'auc', 'min_child_samples': 500, 'min_child_weight': 5, 'num_leaves': 60, 'objective': 'binary', 'verbose': 0}
SCORE: 0.961250993823829
Training until validation scores don't improve for 30 rounds.
[20]	train's auc: 0.985326	valid's auc: 0.9576
Early stopping, best iteration is:
[8]	train's auc: 0.978542	valid's auc: 0.959573
Params: {'boosting_type': 'gbdt', 'is_unbalance': True, 'learning_rate': 0.35000000000000003, 'max_depth': -1, 'metric': 'auc', 'min_child_samples': 100, 'min_child_weight': 5

KeyboardInterrupt: 