In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
from plotly import tools
import json
init_notebook_mode(connected=True)

In [2]:
import sys
import os
sys.path.append(os.path.join(os.getcwd(), '../../'))
%load_ext autoreload
%autoreload 2
import tools4kaggle

# Data Description

## データ読み込み

In [3]:
%time train_df = pd.read_parquet('../input/train.parquet')
train_df.loc[5244810, 'AvSigVersion'] = '1.273.1144.0'
%time sample_df = pd.read_parquet('../input/sample.parquet')
%time test_df = pd.read_parquet('../input/test.parquet')

CPU times: user 28.7 s, sys: 50.6 s, total: 1min 19s
Wall time: 20.3 s
CPU times: user 10.4 s, sys: 6.07 s, total: 16.5 s
Wall time: 3.18 s
CPU times: user 28 s, sys: 40.7 s, total: 1min 8s
Wall time: 18 s


## データタイプで分類

In [4]:
with open('../input/new_types.json','r') as f :
    new_types = json.load(f)
train_df = train_df.astype(new_types)
sample_df = sample_df.astype(new_types)
new_types.pop('HasDetections')
test_df = test_df.astype(new_types)

In [7]:
type_names = [str(a) for a in train_df.dtypes.values]
category_mask = [type_name == 'category' for type_name in type_names]
int_mask   = ['int' in type_name for type_name in type_names]
float_mask = ['float' in type_name for type_name in type_names]

# 特徴量エンジニアリング

基本的にはすべての特徴量を頻度エンコーディングする

In [60]:
from sklearn.pipeline import Pipeline, make_union, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
from lightgbm import LGBMClassifier
import optuna
import pickle

In [6]:
version_cols = ['AppVersion','EngineVersion','AvSigVersion','Census_OSVersion','OsVer']
features = train_df.columns.drop('MachineIdentifier').drop('HasDetections')

columnTransformer を使う方法
output が dataframe でなくて困るが，両方とも nparray だった．

In [7]:
column_trans = ColumnTransformer( transformers=[
    ('freq_encode', tools4kaggle.transformer.FreqEncoder(), features),
    ('version_encode', tools4kaggle.transformer.VersionEncoder(), version_cols)],
    remainder='drop')

In [55]:
%%time

ve = tools4kaggle.transformer.VersionEncoder()
column_sample = column_trans.fit_transform(sample_df)
column_sample

CPU times: user 10min, sys: 1min 16s, total: 11min 17s
Wall time: 33 s


### Train Validation Split

AvSigVersin で分ける場合と EngineVersion で分ける場合の二通りがある．

In [8]:
ve = tools4kaggle.transformer.VersionEncoder()
version_df = ve.fit_transform(train_df.loc[:,version_cols])

In [9]:
version_df.dtypes

AppVersion          float64
AvSigVersion        float64
Census_OSVersion    float64
EngineVersion       float64
OsVer               float64
dtype: object

In [10]:
lower_avsig_mask = version_df.AvSigVersion < 127400000
lower_avsig_mask.sum() / len(lower_avsig_mask)

0.5538020977005729

In [11]:
lower_engine_mask = version_df.EngineVersion < 11151500
lower_engine_mask.sum() / len(lower_engine_mask)

0.5478242798870995

In [12]:
np.logical_and(lower_avsig_mask, lower_engine_mask).sum() / len(lower_engine_mask)

0.5476729597534401

# 学習

### データ読み込み

In [13]:
%%time

encoded_df = column_trans.fit_transform(train_df)

CPU times: user 2h 12min 58s, sys: 1h 14min 43s, total: 3h 27min 41s
Wall time: 8min 18s


### optuna の最適化

In [67]:
def val_auc(trial):
    max_depth = trial.suggest_int('max_depth', 6,15)
    learning_rate = trial.suggest_loguniform('learning_rate', 0.001, 0.3)
    min_child_weight = trial.suggest_loguniform('min_child_weight', 0.0001, 0.01)
    min_child_samples = trial.suggest_int('min_child_samples', 10,30)

    mean_train_auc = 0.0
    mean_eval_auc = 0.0
    train_masks = [lower_avsig_mask, lower_engine_mask]
    for mask in train_masks:
        train_split_mat    = encoded_df[mask]
        train_split_target = train_df[mask].HasDetections
        eval_split_mat     = encoded_df[~mask]
        eval_split_target  = train_df[~mask].HasDetections
        lgbm = LGBMClassifier(
            objective='binary',
            n_estimators=1000,
            max_depth=max_depth,
            learning_rate=learning_rate,
            min_child_weight = min_child_weight,
            min_child_samples = min_child_samples,
            colsample_bytree=0.7
        )
        lgbm.fit(
            train_split_mat,
            train_split_target,
            eval_set = \
                [(eval_split_mat, eval_split_target)],
            eval_metric = 'auc',
            early_stopping_rounds = 50
        )
        train_proba = lgbm.predict_proba(train_split_mat)
        train_auc = roc_auc_score(train_split_target, train_proba[:,1])
        eval_auc = lgbm.best_score_['valid_0']['auc']
        mean_train_auc += train_auc / len(train_masks)
        mean_eval_auc += eval_auc / len(train_masks)
    trial.set_user_attr('mean_train_auc', mean_train_auc)
    trial.set_user_attr('mean_eval_auc', mean_eval_auc)
    trial.set_user_attr('best_iteration_', lgbm.best_iteration_)
    return 1 - mean_eval_auc

### lightgbm の単一モデル学習で param search

In [None]:
study = optuna.create_study()
study.optimize(val_auc, n_trials=10)

[1]	valid_0's auc: 0.630681	valid_0's binary_logloss: 0.691917
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's auc: 0.635265	valid_0's binary_logloss: 0.691313
[3]	valid_0's auc: 0.638661	valid_0's binary_logloss: 0.690007
[4]	valid_0's auc: 0.643529	valid_0's binary_logloss: 0.689037
[5]	valid_0's auc: 0.64525	valid_0's binary_logloss: 0.687692
[6]	valid_0's auc: 0.646154	valid_0's binary_logloss: 0.686392
[7]	valid_0's auc: 0.645527	valid_0's binary_logloss: 0.685293
[8]	valid_0's auc: 0.645276	valid_0's binary_logloss: 0.684099
[9]	valid_0's auc: 0.645889	valid_0's binary_logloss: 0.683072
[10]	valid_0's auc: 0.645976	valid_0's binary_logloss: 0.682645
[11]	valid_0's auc: 0.646846	valid_0's binary_logloss: 0.682238
[12]	valid_0's auc: 0.646485	valid_0's binary_logloss: 0.681307
[13]	valid_0's auc: 0.647259	valid_0's binary_logloss: 0.680294
[14]	valid_0's auc: 0.648539	valid_0's binary_logloss: 0.679448
[15]	valid_0's auc: 0.648717	valid_0's binary_loglos

[254]	valid_0's auc: 0.668685	valid_0's binary_logloss: 0.648265
[255]	valid_0's auc: 0.668721	valid_0's binary_logloss: 0.648238
[256]	valid_0's auc: 0.668749	valid_0's binary_logloss: 0.648227
[257]	valid_0's auc: 0.668818	valid_0's binary_logloss: 0.648193
[258]	valid_0's auc: 0.668887	valid_0's binary_logloss: 0.648159
[259]	valid_0's auc: 0.66894	valid_0's binary_logloss: 0.648135
[260]	valid_0's auc: 0.668968	valid_0's binary_logloss: 0.648121
[261]	valid_0's auc: 0.669029	valid_0's binary_logloss: 0.648094
[262]	valid_0's auc: 0.66907	valid_0's binary_logloss: 0.648071
[263]	valid_0's auc: 0.66906	valid_0's binary_logloss: 0.648056
[264]	valid_0's auc: 0.669141	valid_0's binary_logloss: 0.648004
[265]	valid_0's auc: 0.6692	valid_0's binary_logloss: 0.647975
[266]	valid_0's auc: 0.66922	valid_0's binary_logloss: 0.647958
[267]	valid_0's auc: 0.669267	valid_0's binary_logloss: 0.647939
[268]	valid_0's auc: 0.669332	valid_0's binary_logloss: 0.647908
[269]	valid_0's auc: 0.669351	v

[506]	valid_0's auc: 0.677561	valid_0's binary_logloss: 0.643047
[507]	valid_0's auc: 0.677577	valid_0's binary_logloss: 0.643034
[508]	valid_0's auc: 0.677619	valid_0's binary_logloss: 0.643016
[509]	valid_0's auc: 0.677636	valid_0's binary_logloss: 0.643005
[510]	valid_0's auc: 0.67765	valid_0's binary_logloss: 0.642997
[511]	valid_0's auc: 0.677673	valid_0's binary_logloss: 0.642983
[512]	valid_0's auc: 0.677682	valid_0's binary_logloss: 0.642969
[513]	valid_0's auc: 0.677691	valid_0's binary_logloss: 0.642963
[514]	valid_0's auc: 0.677694	valid_0's binary_logloss: 0.642961
[515]	valid_0's auc: 0.677692	valid_0's binary_logloss: 0.642968
[516]	valid_0's auc: 0.677628	valid_0's binary_logloss: 0.642912
[517]	valid_0's auc: 0.677652	valid_0's binary_logloss: 0.642901
[518]	valid_0's auc: 0.67767	valid_0's binary_logloss: 0.642889
[519]	valid_0's auc: 0.677692	valid_0's binary_logloss: 0.642881
[520]	valid_0's auc: 0.677784	valid_0's binary_logloss: 0.64279
[521]	valid_0's auc: 0.67779

[758]	valid_0's auc: 0.68146	valid_0's binary_logloss: 0.640558
[759]	valid_0's auc: 0.681483	valid_0's binary_logloss: 0.640549
[760]	valid_0's auc: 0.681501	valid_0's binary_logloss: 0.64052
[761]	valid_0's auc: 0.68153	valid_0's binary_logloss: 0.6405
[762]	valid_0's auc: 0.68154	valid_0's binary_logloss: 0.640495
[763]	valid_0's auc: 0.68155	valid_0's binary_logloss: 0.640489
[764]	valid_0's auc: 0.681547	valid_0's binary_logloss: 0.640489
[765]	valid_0's auc: 0.681545	valid_0's binary_logloss: 0.640492
[766]	valid_0's auc: 0.681581	valid_0's binary_logloss: 0.640468
[767]	valid_0's auc: 0.681589	valid_0's binary_logloss: 0.64046
[768]	valid_0's auc: 0.681602	valid_0's binary_logloss: 0.64045
[769]	valid_0's auc: 0.681608	valid_0's binary_logloss: 0.640447
[770]	valid_0's auc: 0.681618	valid_0's binary_logloss: 0.640439
[771]	valid_0's auc: 0.681633	valid_0's binary_logloss: 0.640428
[772]	valid_0's auc: 0.68165	valid_0's binary_logloss: 0.640435
[773]	valid_0's auc: 0.681675	valid

[8]	valid_0's auc: 0.647475	valid_0's binary_logloss: 0.683413
[9]	valid_0's auc: 0.648593	valid_0's binary_logloss: 0.682978
[10]	valid_0's auc: 0.649159	valid_0's binary_logloss: 0.681866
[11]	valid_0's auc: 0.649356	valid_0's binary_logloss: 0.680923
[12]	valid_0's auc: 0.649494	valid_0's binary_logloss: 0.679894
[13]	valid_0's auc: 0.649272	valid_0's binary_logloss: 0.678919
[14]	valid_0's auc: 0.649793	valid_0's binary_logloss: 0.678196
[15]	valid_0's auc: 0.650229	valid_0's binary_logloss: 0.677265
[16]	valid_0's auc: 0.650988	valid_0's binary_logloss: 0.676527
[17]	valid_0's auc: 0.651301	valid_0's binary_logloss: 0.675701
[18]	valid_0's auc: 0.651779	valid_0's binary_logloss: 0.675364
[19]	valid_0's auc: 0.652783	valid_0's binary_logloss: 0.675005
[20]	valid_0's auc: 0.652571	valid_0's binary_logloss: 0.674232
[21]	valid_0's auc: 0.652506	valid_0's binary_logloss: 0.673942
[22]	valid_0's auc: 0.65281	valid_0's binary_logloss: 0.673174
[23]	valid_0's auc: 0.652989	valid_0's bina

[261]	valid_0's auc: 0.668392	valid_0's binary_logloss: 0.648086
[262]	valid_0's auc: 0.668484	valid_0's binary_logloss: 0.648041
[263]	valid_0's auc: 0.668549	valid_0's binary_logloss: 0.648016
[264]	valid_0's auc: 0.668588	valid_0's binary_logloss: 0.647995
[265]	valid_0's auc: 0.668622	valid_0's binary_logloss: 0.647974
[266]	valid_0's auc: 0.668677	valid_0's binary_logloss: 0.647947
[267]	valid_0's auc: 0.668721	valid_0's binary_logloss: 0.647953
[268]	valid_0's auc: 0.668748	valid_0's binary_logloss: 0.647948
[269]	valid_0's auc: 0.668804	valid_0's binary_logloss: 0.647928
[270]	valid_0's auc: 0.668895	valid_0's binary_logloss: 0.647878
[271]	valid_0's auc: 0.668946	valid_0's binary_logloss: 0.647855
[272]	valid_0's auc: 0.669009	valid_0's binary_logloss: 0.647821
[273]	valid_0's auc: 0.669052	valid_0's binary_logloss: 0.647795
[274]	valid_0's auc: 0.669084	valid_0's binary_logloss: 0.647782
[275]	valid_0's auc: 0.669123	valid_0's binary_logloss: 0.647757
[276]	valid_0's auc: 0.66

[513]	valid_0's auc: 0.677067	valid_0's binary_logloss: 0.640806
[514]	valid_0's auc: 0.677097	valid_0's binary_logloss: 0.640787
[515]	valid_0's auc: 0.677117	valid_0's binary_logloss: 0.640788
[516]	valid_0's auc: 0.677123	valid_0's binary_logloss: 0.640783
[517]	valid_0's auc: 0.677156	valid_0's binary_logloss: 0.640736
[518]	valid_0's auc: 0.677179	valid_0's binary_logloss: 0.640725
[519]	valid_0's auc: 0.67721	valid_0's binary_logloss: 0.64071
[520]	valid_0's auc: 0.677228	valid_0's binary_logloss: 0.640701
[521]	valid_0's auc: 0.677244	valid_0's binary_logloss: 0.640693
[522]	valid_0's auc: 0.677265	valid_0's binary_logloss: 0.640685
[523]	valid_0's auc: 0.677282	valid_0's binary_logloss: 0.640672
[524]	valid_0's auc: 0.677308	valid_0's binary_logloss: 0.640666
[525]	valid_0's auc: 0.677356	valid_0's binary_logloss: 0.640638
[526]	valid_0's auc: 0.677372	valid_0's binary_logloss: 0.640631
[527]	valid_0's auc: 0.677369	valid_0's binary_logloss: 0.640624
[528]	valid_0's auc: 0.6773

[765]	valid_0's auc: 0.680669	valid_0's binary_logloss: 0.63734
[766]	valid_0's auc: 0.68068	valid_0's binary_logloss: 0.637335
[767]	valid_0's auc: 0.680695	valid_0's binary_logloss: 0.637331
[768]	valid_0's auc: 0.680698	valid_0's binary_logloss: 0.637329
[769]	valid_0's auc: 0.680708	valid_0's binary_logloss: 0.637324
[770]	valid_0's auc: 0.680712	valid_0's binary_logloss: 0.637321
[771]	valid_0's auc: 0.680732	valid_0's binary_logloss: 0.637311
[772]	valid_0's auc: 0.680745	valid_0's binary_logloss: 0.637305
[773]	valid_0's auc: 0.680739	valid_0's binary_logloss: 0.637308
[774]	valid_0's auc: 0.680748	valid_0's binary_logloss: 0.637292
[775]	valid_0's auc: 0.680761	valid_0's binary_logloss: 0.637285
[776]	valid_0's auc: 0.680774	valid_0's binary_logloss: 0.637278
[777]	valid_0's auc: 0.680794	valid_0's binary_logloss: 0.637269
[778]	valid_0's auc: 0.680807	valid_0's binary_logloss: 0.637262
[779]	valid_0's auc: 0.680817	valid_0's binary_logloss: 0.637256
[780]	valid_0's auc: 0.6808

[I 2019-02-06 10:59:45,040] Finished a trial resulted in value: 0.31620137656217295. Current best value is 0.31620137656217295 with parameters: {'max_depth': 4, 'learning_rate': 0.02099095036849272, 'min_child_samples': 29, 'min_child_weight': 0.0014260960168705932}.


[1]	valid_0's auc: 0.650219	valid_0's binary_logloss: 0.692955
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's auc: 0.652986	valid_0's binary_logloss: 0.692711
[3]	valid_0's auc: 0.656277	valid_0's binary_logloss: 0.692264
[4]	valid_0's auc: 0.6559	valid_0's binary_logloss: 0.69193
[5]	valid_0's auc: 0.658128	valid_0's binary_logloss: 0.691453
[6]	valid_0's auc: 0.658371	valid_0's binary_logloss: 0.690983
[7]	valid_0's auc: 0.658683	valid_0's binary_logloss: 0.690546
[8]	valid_0's auc: 0.658628	valid_0's binary_logloss: 0.690085
[9]	valid_0's auc: 0.658338	valid_0's binary_logloss: 0.689664
[10]	valid_0's auc: 0.658789	valid_0's binary_logloss: 0.689431
[11]	valid_0's auc: 0.659283	valid_0's binary_logloss: 0.689202
[12]	valid_0's auc: 0.659179	valid_0's binary_logloss: 0.688812
[13]	valid_0's auc: 0.659256	valid_0's binary_logloss: 0.688371
[14]	valid_0's auc: 0.65978	valid_0's binary_logloss: 0.68803
[15]	valid_0's auc: 0.659958	valid_0's binary_logloss: 0

[254]	valid_0's auc: 0.665739	valid_0's binary_logloss: 0.652133
[255]	valid_0's auc: 0.665757	valid_0's binary_logloss: 0.652076
[256]	valid_0's auc: 0.665755	valid_0's binary_logloss: 0.652025
[257]	valid_0's auc: 0.665772	valid_0's binary_logloss: 0.651963
[258]	valid_0's auc: 0.665781	valid_0's binary_logloss: 0.651911
[259]	valid_0's auc: 0.665782	valid_0's binary_logloss: 0.65186
[260]	valid_0's auc: 0.665811	valid_0's binary_logloss: 0.651808
[261]	valid_0's auc: 0.665831	valid_0's binary_logloss: 0.65178
[262]	valid_0's auc: 0.665848	valid_0's binary_logloss: 0.651727
[263]	valid_0's auc: 0.665866	valid_0's binary_logloss: 0.651669
[264]	valid_0's auc: 0.665897	valid_0's binary_logloss: 0.651604
[265]	valid_0's auc: 0.66591	valid_0's binary_logloss: 0.651547
[266]	valid_0's auc: 0.665937	valid_0's binary_logloss: 0.651495
[267]	valid_0's auc: 0.665971	valid_0's binary_logloss: 0.65143
[268]	valid_0's auc: 0.666014	valid_0's binary_logloss: 0.651408
[269]	valid_0's auc: 0.666053

[506]	valid_0's auc: 0.671587	valid_0's binary_logloss: 0.645909
[507]	valid_0's auc: 0.6716	valid_0's binary_logloss: 0.645898
[508]	valid_0's auc: 0.671618	valid_0's binary_logloss: 0.645893
[509]	valid_0's auc: 0.671648	valid_0's binary_logloss: 0.645877
[510]	valid_0's auc: 0.671661	valid_0's binary_logloss: 0.64587
[511]	valid_0's auc: 0.671671	valid_0's binary_logloss: 0.64586
[512]	valid_0's auc: 0.671699	valid_0's binary_logloss: 0.645843
[513]	valid_0's auc: 0.671728	valid_0's binary_logloss: 0.64583
[514]	valid_0's auc: 0.671745	valid_0's binary_logloss: 0.645818
[515]	valid_0's auc: 0.671757	valid_0's binary_logloss: 0.64581
[516]	valid_0's auc: 0.671779	valid_0's binary_logloss: 0.645802
[517]	valid_0's auc: 0.671806	valid_0's binary_logloss: 0.645791
[518]	valid_0's auc: 0.671825	valid_0's binary_logloss: 0.645785
[519]	valid_0's auc: 0.671851	valid_0's binary_logloss: 0.645768
[520]	valid_0's auc: 0.67187	valid_0's binary_logloss: 0.645759
[521]	valid_0's auc: 0.671897	va

[758]	valid_0's auc: 0.676234	valid_0's binary_logloss: 0.643686
[759]	valid_0's auc: 0.676246	valid_0's binary_logloss: 0.643681
[760]	valid_0's auc: 0.676256	valid_0's binary_logloss: 0.643676
[761]	valid_0's auc: 0.676265	valid_0's binary_logloss: 0.64367
[762]	valid_0's auc: 0.676279	valid_0's binary_logloss: 0.643663
[763]	valid_0's auc: 0.67629	valid_0's binary_logloss: 0.643662
[764]	valid_0's auc: 0.676311	valid_0's binary_logloss: 0.643653
[765]	valid_0's auc: 0.676338	valid_0's binary_logloss: 0.643639
[766]	valid_0's auc: 0.676351	valid_0's binary_logloss: 0.643631
[767]	valid_0's auc: 0.676366	valid_0's binary_logloss: 0.643622
[768]	valid_0's auc: 0.676376	valid_0's binary_logloss: 0.643618
[769]	valid_0's auc: 0.676407	valid_0's binary_logloss: 0.643601
[770]	valid_0's auc: 0.676429	valid_0's binary_logloss: 0.64359
[771]	valid_0's auc: 0.676451	valid_0's binary_logloss: 0.643578
[772]	valid_0's auc: 0.67647	valid_0's binary_logloss: 0.643567
[773]	valid_0's auc: 0.67649	

[7]	valid_0's auc: 0.658287	valid_0's binary_logloss: 0.690261
[8]	valid_0's auc: 0.658461	valid_0's binary_logloss: 0.689796
[9]	valid_0's auc: 0.659926	valid_0's binary_logloss: 0.689555
[10]	valid_0's auc: 0.659692	valid_0's binary_logloss: 0.689107
[11]	valid_0's auc: 0.659369	valid_0's binary_logloss: 0.688696
[12]	valid_0's auc: 0.659171	valid_0's binary_logloss: 0.688255
[13]	valid_0's auc: 0.659331	valid_0's binary_logloss: 0.687813
[14]	valid_0's auc: 0.659709	valid_0's binary_logloss: 0.687513
[15]	valid_0's auc: 0.659788	valid_0's binary_logloss: 0.687089
[16]	valid_0's auc: 0.660348	valid_0's binary_logloss: 0.686755
[17]	valid_0's auc: 0.660254	valid_0's binary_logloss: 0.686343
[18]	valid_0's auc: 0.660301	valid_0's binary_logloss: 0.686139
[19]	valid_0's auc: 0.660285	valid_0's binary_logloss: 0.685942
[20]	valid_0's auc: 0.660522	valid_0's binary_logloss: 0.685535
[21]	valid_0's auc: 0.660497	valid_0's binary_logloss: 0.685353
[22]	valid_0's auc: 0.660443	valid_0's bina

[260]	valid_0's auc: 0.665407	valid_0's binary_logloss: 0.652248
[261]	valid_0's auc: 0.66542	valid_0's binary_logloss: 0.652189
[262]	valid_0's auc: 0.665458	valid_0's binary_logloss: 0.652143
[263]	valid_0's auc: 0.665476	valid_0's binary_logloss: 0.652073
[264]	valid_0's auc: 0.665493	valid_0's binary_logloss: 0.652015
[265]	valid_0's auc: 0.665504	valid_0's binary_logloss: 0.651999
[266]	valid_0's auc: 0.665515	valid_0's binary_logloss: 0.651939
[267]	valid_0's auc: 0.66551	valid_0's binary_logloss: 0.651885
[268]	valid_0's auc: 0.665519	valid_0's binary_logloss: 0.651837
[269]	valid_0's auc: 0.665549	valid_0's binary_logloss: 0.651808
[270]	valid_0's auc: 0.665591	valid_0's binary_logloss: 0.651742
[271]	valid_0's auc: 0.665627	valid_0's binary_logloss: 0.651671
[272]	valid_0's auc: 0.665647	valid_0's binary_logloss: 0.651618
[273]	valid_0's auc: 0.665663	valid_0's binary_logloss: 0.651565
[274]	valid_0's auc: 0.665676	valid_0's binary_logloss: 0.651537
[275]	valid_0's auc: 0.6656

[512]	valid_0's auc: 0.67096	valid_0's binary_logloss: 0.645842
[513]	valid_0's auc: 0.670983	valid_0's binary_logloss: 0.645829
[514]	valid_0's auc: 0.670999	valid_0's binary_logloss: 0.645819
[515]	valid_0's auc: 0.671017	valid_0's binary_logloss: 0.645807
[516]	valid_0's auc: 0.67104	valid_0's binary_logloss: 0.645794
[517]	valid_0's auc: 0.671066	valid_0's binary_logloss: 0.645788
[518]	valid_0's auc: 0.671082	valid_0's binary_logloss: 0.645777
[519]	valid_0's auc: 0.671105	valid_0's binary_logloss: 0.645761
[520]	valid_0's auc: 0.671122	valid_0's binary_logloss: 0.645748
[521]	valid_0's auc: 0.671145	valid_0's binary_logloss: 0.645733
[522]	valid_0's auc: 0.671166	valid_0's binary_logloss: 0.645721
[523]	valid_0's auc: 0.671179	valid_0's binary_logloss: 0.645713
[524]	valid_0's auc: 0.671193	valid_0's binary_logloss: 0.645702
[525]	valid_0's auc: 0.67121	valid_0's binary_logloss: 0.645692
[526]	valid_0's auc: 0.671226	valid_0's binary_logloss: 0.645684
[527]	valid_0's auc: 0.67124

[764]	valid_0's auc: 0.675609	valid_0's binary_logloss: 0.643545
[765]	valid_0's auc: 0.675629	valid_0's binary_logloss: 0.643539
[766]	valid_0's auc: 0.675642	valid_0's binary_logloss: 0.643532
[767]	valid_0's auc: 0.675661	valid_0's binary_logloss: 0.64352
[768]	valid_0's auc: 0.675677	valid_0's binary_logloss: 0.643511
[769]	valid_0's auc: 0.675694	valid_0's binary_logloss: 0.643498
[770]	valid_0's auc: 0.675706	valid_0's binary_logloss: 0.643491
[771]	valid_0's auc: 0.675733	valid_0's binary_logloss: 0.643479
[772]	valid_0's auc: 0.675757	valid_0's binary_logloss: 0.643462
[773]	valid_0's auc: 0.675775	valid_0's binary_logloss: 0.643453
[774]	valid_0's auc: 0.675789	valid_0's binary_logloss: 0.643447
[775]	valid_0's auc: 0.675809	valid_0's binary_logloss: 0.643435
[776]	valid_0's auc: 0.675829	valid_0's binary_logloss: 0.643421
[777]	valid_0's auc: 0.675842	valid_0's binary_logloss: 0.643414
[778]	valid_0's auc: 0.675861	valid_0's binary_logloss: 0.643406
[779]	valid_0's auc: 0.675

[I 2019-02-06 11:18:42,197] Finished a trial resulted in value: 0.32063338325917734. Current best value is 0.31620137656217295 with parameters: {'max_depth': 4, 'learning_rate': 0.02099095036849272, 'min_child_samples': 29, 'min_child_weight': 0.0014260960168705932}.


[1]	valid_0's auc: 0.650219	valid_0's binary_logloss: 0.692571
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's auc: 0.652711	valid_0's binary_logloss: 0.692139
[3]	valid_0's auc: 0.656066	valid_0's binary_logloss: 0.69136
[4]	valid_0's auc: 0.656454	valid_0's binary_logloss: 0.690789
[5]	valid_0's auc: 0.658426	valid_0's binary_logloss: 0.689964
[6]	valid_0's auc: 0.658719	valid_0's binary_logloss: 0.689165
[7]	valid_0's auc: 0.658612	valid_0's binary_logloss: 0.688431
[8]	valid_0's auc: 0.658742	valid_0's binary_logloss: 0.687662
[9]	valid_0's auc: 0.658445	valid_0's binary_logloss: 0.686962
[10]	valid_0's auc: 0.658868	valid_0's binary_logloss: 0.686582
[11]	valid_0's auc: 0.659522	valid_0's binary_logloss: 0.686211
[12]	valid_0's auc: 0.659578	valid_0's binary_logloss: 0.68557
[13]	valid_0's auc: 0.659734	valid_0's binary_logloss: 0.684854
[14]	valid_0's auc: 0.660354	valid_0's binary_logloss: 0.684297
[15]	valid_0's auc: 0.66067	valid_0's binary_logloss:

[254]	valid_0's auc: 0.670529	valid_0's binary_logloss: 0.646546
[255]	valid_0's auc: 0.67055	valid_0's binary_logloss: 0.646522
[256]	valid_0's auc: 0.670606	valid_0's binary_logloss: 0.646488
[257]	valid_0's auc: 0.670639	valid_0's binary_logloss: 0.646464
[258]	valid_0's auc: 0.670689	valid_0's binary_logloss: 0.64643
[259]	valid_0's auc: 0.67073	valid_0's binary_logloss: 0.646407
[260]	valid_0's auc: 0.670775	valid_0's binary_logloss: 0.646383
[261]	valid_0's auc: 0.670803	valid_0's binary_logloss: 0.646355
[262]	valid_0's auc: 0.670864	valid_0's binary_logloss: 0.64632
[263]	valid_0's auc: 0.670894	valid_0's binary_logloss: 0.646294
[264]	valid_0's auc: 0.670922	valid_0's binary_logloss: 0.646273
[265]	valid_0's auc: 0.670959	valid_0's binary_logloss: 0.646255
[266]	valid_0's auc: 0.671016	valid_0's binary_logloss: 0.646226
[267]	valid_0's auc: 0.671047	valid_0's binary_logloss: 0.646204
[268]	valid_0's auc: 0.671094	valid_0's binary_logloss: 0.646191
[269]	valid_0's auc: 0.671144

[506]	valid_0's auc: 0.678635	valid_0's binary_logloss: 0.642518
[507]	valid_0's auc: 0.678662	valid_0's binary_logloss: 0.642501
[508]	valid_0's auc: 0.678685	valid_0's binary_logloss: 0.642487
[509]	valid_0's auc: 0.678696	valid_0's binary_logloss: 0.642478
[510]	valid_0's auc: 0.678717	valid_0's binary_logloss: 0.642465
[511]	valid_0's auc: 0.678754	valid_0's binary_logloss: 0.642448
[512]	valid_0's auc: 0.678774	valid_0's binary_logloss: 0.642432
[513]	valid_0's auc: 0.678807	valid_0's binary_logloss: 0.642417
[514]	valid_0's auc: 0.678837	valid_0's binary_logloss: 0.642399
[515]	valid_0's auc: 0.678855	valid_0's binary_logloss: 0.642377
[516]	valid_0's auc: 0.678871	valid_0's binary_logloss: 0.642369
[517]	valid_0's auc: 0.678893	valid_0's binary_logloss: 0.64236
[518]	valid_0's auc: 0.678916	valid_0's binary_logloss: 0.642352
[519]	valid_0's auc: 0.678939	valid_0's binary_logloss: 0.642336
[520]	valid_0's auc: 0.678963	valid_0's binary_logloss: 0.642323
[521]	valid_0's auc: 0.678

[758]	valid_0's auc: 0.683223	valid_0's binary_logloss: 0.639735
[759]	valid_0's auc: 0.683232	valid_0's binary_logloss: 0.639747
[760]	valid_0's auc: 0.683298	valid_0's binary_logloss: 0.639702
[761]	valid_0's auc: 0.683308	valid_0's binary_logloss: 0.63969
[762]	valid_0's auc: 0.683321	valid_0's binary_logloss: 0.639682
[763]	valid_0's auc: 0.683348	valid_0's binary_logloss: 0.639669
[764]	valid_0's auc: 0.683368	valid_0's binary_logloss: 0.639661
[765]	valid_0's auc: 0.683381	valid_0's binary_logloss: 0.639653
[766]	valid_0's auc: 0.683418	valid_0's binary_logloss: 0.639651
[767]	valid_0's auc: 0.683433	valid_0's binary_logloss: 0.639643
[768]	valid_0's auc: 0.683444	valid_0's binary_logloss: 0.639608
[769]	valid_0's auc: 0.683468	valid_0's binary_logloss: 0.639597
[770]	valid_0's auc: 0.683477	valid_0's binary_logloss: 0.639591
[771]	valid_0's auc: 0.683491	valid_0's binary_logloss: 0.639586
[772]	valid_0's auc: 0.683513	valid_0's binary_logloss: 0.639565
[773]	valid_0's auc: 0.683

In [71]:
study.best_trial

FrozenTrial(trial_id=9, state=<TrialState.COMPLETE: 1>, value=0.3095201338236717, datetime_start=datetime.datetime(2019, 2, 6, 12, 20, 43, 758251), datetime_complete=datetime.datetime(2019, 2, 6, 12, 26, 45, 485441), params={'max_depth': 10, 'learning_rate': 0.07217484586809854, 'min_child_samples': 15, 'min_child_weight': 0.006493073242727629}, user_attrs={'mean_train_auc': 0.7348527426618463, 'mean_eval_auc': 0.6904798661763283, 'best_iteration_': 174}, system_attrs={}, intermediate_values={}, params_in_internal_repr={'max_depth': 10, 'learning_rate': 0.07217484586809854, 'min_child_samples': 15, 'min_child_weight': 0.006493073242727629})

### Grid search CV 結果

In [142]:
study.best_trial

ValueError: No trials are completed yet.

In [110]:
print('best_params is :\n',study.best_trial.params,'\n')
print('best_params is :\n',study.best_trial.params,'\n')
print('best_iteration is :\n',study.best_trial.user_attrs['best_iteration_'],'\n')
print('best_iteration is :\n',study.best_trial)

best_params is :
 {'min_child_weight': 0.0007750748346154407, 'min_child_samples': 15, 'learning_rate': 0.10566509680676393, 'max_depth': 10} 

best_iteration is :
 270 

best_iteration is :
 FrozenTrial(trial_id=14, state=<TrialState.COMPLETE: 1>, value=0.3050038874723374, datetime_start=datetime.datetime(2019, 1, 30, 17, 58, 27, 535904), datetime_complete=datetime.datetime(2019, 1, 30, 18, 0, 58, 144087), params={'min_child_weight': 0.0007750748346154407, 'min_child_samples': 15, 'learning_rate': 0.10566509680676393, 'max_depth': 10}, user_attrs={'best_iteration_': 270}, system_attrs={}, intermediate_values={}, params_in_internal_repr={'min_child_weight': 0.0007750748346154407, 'min_child_samples': 15, 'learning_rate': 0.10566509680676393, 'max_depth': 10})


### 全学習設定で学習

### AvSigVersion validation split

- CV validation score AUC : 0.693322
- CV validation rounds : 273
- LB score AUC : 

In [111]:
%%time

best_param = study.best_trial.params
best_iteration = study.best_trial.user_attrs['best_iteration_']
lgbm = LGBMClassifier(n_estimators=best_iteration, **best_param)

lgbm.fit(
    encoded_df,
    train_df.HasDetections
)

with open('../tmp/tmp_pp.pkl','wb') as f:
    pickle.dump(pipeline, f)

CPU times: user 1h 7min 6s, sys: 2min 33s, total: 1h 9min 39s
Wall time: 1min 52s


## 予測

In [52]:
encoded_test = column_trans.transform(test_df)

In [114]:
pred_score = lgbm.predict_proba(encoded_test)

In [115]:
pred_score

array([[0.53208084, 0.46791916],
       [0.47727142, 0.52272858],
       [0.62393838, 0.37606162],
       ...,
       [0.73213963, 0.26786037],
       [0.5329717 , 0.4670283 ],
       [0.65240437, 0.34759563]])

In [116]:
submission = test_df.iloc[:,[0]].copy()
submission['HasDetections'] = pred_score[:,1]
submission.to_csv('../result/avsig_validate_HPS_lgbm.csv',index=False)