In [1]:
import math
import time
import gc

import numpy as np
import pandas as pd
import lightgbm as lgb

from contextlib import contextmanager
from glob import glob
from itertools import combinations
from pathlib import Path

from sklearn.metrics import roc_auc_score

In [2]:
# Compact unsigned integer dtype for each raw click column.
# `split` looks entity names up in this mapping when casting key columns.
dtypes = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
)

In [3]:
# Feature-name prefixes, one per aggregation applied to the clicks table.
_feature_primitives = (
    'PERCENT_TRUE(clicks.is_attributed)',
    'AVG_TIME_BETWEEN(clicks.click_time WHERE is_attributed = True)',
)

# Entity groups the features are keyed on: every single entity first, then
# every unordered pair except channel/ip, joined with an underscore.
_feature_entities = ('app', 'device', 'os', 'channel', 'ip')
_feature_groups = list(_feature_entities) + [
    '_'.join(pair)
    for pair in combinations(_feature_entities, 2)
    if pair != ('channel', 'ip')
]

# Model input columns: for each primitive, one "<primitive>_<group>_1day"
# column per entity group, in the group order built above.
predictors = [
    f'{primitive}_{group}_1day'
    for primitive in _feature_primitives
    for group in _feature_groups
]

In [4]:
target_entities_init = ['app', 'device', 'os', 'channel', 'ip']

# Single entities are kept as plain strings; pairs are stored as two-element
# lists. The channel/ip pair is left out. Combinations of three or four
# entities are not generated.
target_entities = list(target_entities_init)
target_entities.extend(
    list(pair)
    for pair in combinations(target_entities_init, 2)
    if pair != ('channel', 'ip')
)

In [5]:
def remove(df, columns):
    """Keep only the requested columns of ``df`` and cast them to float32.

    The frame is modified in place and also returned, so callers may either
    rely on the mutation or use the return value.

    Args:
        df: DataFrame to prune.
        columns: Iterable of column names to keep. Names that are not present
            in ``df`` are ignored.

    Returns:
        The same ``df`` object, holding only the kept columns as float32.
    """
    # Honour the ``columns`` argument rather than a module-level list, and use
    # a set so the membership test is O(1) per column.
    keep = set(columns)
    to_drop = [c for c in df.columns if c not in keep]
    df.drop(columns=to_drop, inplace=True)
    for c in df.columns:
        df[c] = df[c].astype(np.float32)
    return df

def split(df, target_entity):
    """Materialise the index of ``df`` as one key column per entity.

    Args:
        df: DataFrame whose index identifies the entity (or entity group) of
            each row. Modified in place.
        target_entity: Either a single entity name (``str``), in which case the
            index is copied into a column of that name unchanged, or a
            sequence of two or more entity names, in which case each index
            label is expected to be the entity values joined by ``'_'``.

    Returns:
        The same ``df`` object with the key column(s) added. For a sequence of
        entities each new column is cast to the dtype registered for that
        entity in the module-level ``dtypes`` mapping.
    """
    if isinstance(target_entity, str):
        df[target_entity] = df.index
        return df

    names = list(target_entity)
    # Split each label at most len(names) - 1 times so that it yields exactly
    # one piece per entity. Plain str.split is used instead of unpacking the
    # pandas ``.str`` accessor, which newer pandas releases no longer allow.
    pieces = [label.split('_', len(names) - 1) for label in df.index]
    for position, name in enumerate(names):
        values = pd.Series([piece[position] for piece in pieces], dtype=object)
        # Assign positionally (as an array) so no index alignment takes place.
        df[name] = values.astype(dtypes[name]).values
    return df

def combine_features(df, features_prefix, feature_suffix):
    """Left-join every matching precomputed feature file onto ``df``.

    For each entry of the module-level ``target_entities`` the feature files
    under ``../data/interim/features/<entity name>/`` whose names start with
    ``features_prefix`` and end with ``feature_suffix`` + ``.hdf.compress``
    are read in sorted order, reduced to the ``predictors`` columns, given
    key columns via ``split`` and merged onto ``df`` on those keys.

    Args:
        df: Frame to enrich; it must contain the entity key columns.
        features_prefix: Leading part of the feature file names.
        feature_suffix: Trailing part of the feature file names, before the
            ``.hdf.compress`` extension.

    Returns:
        The frame produced by the successive left merges.

    Raises:
        AssertionError: If no feature file matches for some entity.
    """
    merged = df
    for entity in target_entities:
        # Pairs are stored as lists; their directory name is the members
        # joined with an underscore.
        if type(entity) is str:
            entity_dir = entity
        else:
            entity_dir = "_".join(entity)

        pattern = f"../data/interim/features/{entity_dir}/{features_prefix}*{feature_suffix}.hdf.compress"
        paths = sorted(glob(pattern))
        assert len(paths) > 0

        for path in paths:
            feature_frame = split(remove(pd.read_hdf(path), predictors), entity)
            merged = pd.merge(
                merged,
                feature_frame,
                how='left',
                left_on=entity,
                right_on=entity,
            )
            # Drop the per-file frame before the next read to keep peak memory down.
            del feature_frame
            gc.collect()
    return merged

In [None]:
# Build the training frame (raw clicks plus merged features) once and cache it
# on disk; set `force = True` to rebuild even when the cache file exists.
force = False
cache_train = '../data/cache/train_lgbm_2017-11-09.hdf.compress'
if force or not Path(cache_train).exists():
    print("Train cache doesn't exist, creating")
    # Make sure the cache directory exists *before* the expensive merge, so a
    # missing folder cannot throw the work away when writing the result.
    Path(cache_train).parent.mkdir(parents=True, exist_ok=True)

    # Row range of the raw training file: from the first row of day 8 / hour 17
    # up to the 'end' row recorded for day 9 / hour 16 in the summary table.
    train_summary = pd.read_csv('../data/interim/day_hour_train.csv')
    start_row = train_summary[(train_summary['day'] == 8) & (train_summary['hour'] == 17)]['start'].values[0]
    stop_row = train_summary[(train_summary['day'] == 9) & (train_summary['hour'] == 16)]['end'].values[0]
    df_train = pd.read_hdf('../data/raw/train.hdf.compress', start=start_row, stop=stop_row)
#     df_train = pd.read_hdf('../data/interim/downsampled/train_2017-11-07_1700_08_1600_0.hdf.compress')
    df_train = combine_features(df_train, 'features_2017-11-08_1700', 'attributed2')
    # `key` is passed by keyword: newer pandas releases deprecate passing it positionally.
    df_train.to_hdf(cache_train, key='train', mode='w', complib='blosc', fletcher32=True, complevel=9)
else:
    df_train = pd.read_hdf(cache_train)

# cache_val = '../data/cache/validate_lgbm.hdf.compress'
# if not Path(cache_val).exists() or force:
#     print("Validation cache doesn't exist, creating")
#     train_summary = pd.read_csv('../data/interim/day_hour_train.csv')
#     start_row = train_summary[(train_summary['day'] == 9) & (train_summary['hour'] == 4)]['start'].values[0]
#     stop_row = train_summary[(train_summary['day'] == 9) & (train_summary['hour'] == 6)]['end'].values[0]
#     df_val = pd.read_hdf('../data/raw/train.hdf.compress', start=start_row, stop=stop_row)
#     df_val = combine_features(df_val, 'features_2017-11-08_1700', 'attributed2')
#     df_val.to_hdf(cache_val, 'train', mode='w', complib='blosc', fletcher32=True, complevel=9)
# else:
#     df_val = pd.read_hdf(cache_val)

Train cache doesn't exist, creating


In [13]:
# LightGBM training parameters passed to `lgb.train`.
params = dict(
    boosting_type='dart',
    objective='binary',
    metric='auc',
    learning_rate=0.1,
    num_leaves=100,
    max_depth=-1,
    min_child_samples=100,
    verbose=0,
    is_unbalance=True,
)
# Tuning knobs that are currently disabled (kept for reference):
#     max_bin=16, subsample=0.7, subsample_freq=1, colsample_bytree=0.7,
#     min_child_weight=5, subsample_for_bin=200000, min_split_gain=0,
#     reg_alpha=0.01, reg_lambda=0.01, nthread=8, scale_pos_weight=10

In [18]:
# Training set for LightGBM: the predictor columns as a plain array, with the
# `is_attributed` column as the label.
dtrain = lgb.Dataset(
    data=df_train[predictors].values,
    label=df_train['is_attributed'].values,
    feature_name=predictors,
)

# dvalid = lgb.Dataset(
#     df_val[predictors].values,
#     label=df_val['is_attributed'].values,
#     feature_name=predictors,
#     reference=dtrain,
# )

In [19]:
# `dvalid` is only defined when a validation-set cell has been run; on a fresh
# kernel that cell is commented out, so referencing it unconditionally raises
# NameError. Evaluate on the validation set only when it actually exists, and
# enable early stopping only in that case, since it needs held-out scores.
has_valid = 'dvalid' in globals()
if not has_valid:
    print("No validation set (dvalid) defined; training without early stopping")

lgb_model = lgb.train(
    params,
    dtrain,
    valid_sets=[dtrain, dvalid] if has_valid else [dtrain],
    valid_names=['train', 'valid'] if has_valid else ['train'],
    num_boost_round=100,
    early_stopping_rounds=30 if has_valid else None,
    verbose_eval=10,
    feval=None
)

Training until validation scores don't improve for 30 rounds.
[10]	train's auc: 0.973122	valid's auc: 0.959908
[20]	train's auc: 0.974009	valid's auc: 0.960488
[30]	train's auc: 0.975213	valid's auc: 0.9614
[40]	train's auc: 0.975885	valid's auc: 0.96193
[50]	train's auc: 0.976249	valid's auc: 0.962057
[60]	train's auc: 0.976675	valid's auc: 0.962072
[70]	train's auc: 0.976925	valid's auc: 0.962177


KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt

# Plot the importance of every predictor on a tall figure so that all feature
# names remain readable.
fig, ax = plt.subplots(figsize=(7, 10))
lgb.plot_importance(lgb_model, ax=ax, max_num_features=len(predictors))
ax.set_title("Light GBM Feature Importance")