In [1]:
%%HTML
<style>
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [1]:
import sys
sys.path.append("../..") # Adds higher directory to python modules path.
from utilities import aggregate_feature_calculators
from utilities import aggregate_feature_calculators_setting as aggcal
from utilities.parallel import Parallel
from utilities.dfdb import DFDB

from utilities.process.pqueue import *
from utilities.process.pnode import *
from utilities.process.putilities import *

In [2]:
import numpy as np
import pandas as pd
import os
import time
import datetime
import json
import copy
import gc
import warnings
from tqdm import tqdm_notebook, tqdm

import optuna

import lightgbm as lgb
import xgboost as xgb

from catboost import CatBoostRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold,TimeSeriesSplit, GroupKFold
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.svm import NuSVR, SVR
from sklearn.metrics import mean_absolute_error
from sklearn import preprocessing

import eli5
from eli5.sklearn import PermutationImportance

import networkx as nx

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
csv_file_folder =  '../../data/input'
os.listdir(csv_file_folder)

['test.csv',
 'structures',
 'sample_submission.csv',
 'magnetic_shielding_tensors.csv',
 'mulliken_charges.csv',
 'potential_energy.csv',
 'scalar_coupling_contributions.csv',
 'dipole_moments.csv',
 'structures.csv',
 'train.csv']

In [4]:
file_folder =  '../../data/feature'
[f for f in os.listdir(file_folder) if (f.endswith('.pkl')) and (not f.startswith('.'))]

['giba-r-data-table-simple-features-1-17-lb_train.pkl',
 'eem_train.pkl',
 'coulomb-interaction-speed-up_train.pkl',
 'dataset-with-number-of-bonds-between-atoms_test.pkl',
 'keras-neural-net-for-champs_train.pkl',
 'brute-force-feature-engineering_test.pkl',
 'angles-and-distances_test.pkl',
 'giba-r-data-table-simple-features-0-991-lb_test.pkl',
 'keras-neural-net-for-champs_test.pkl',
 'brute-force-feature-engineering-mini_train.pkl',
 'laplaction_matrix_test.pkl',
 'dataset-with-number-of-bonds-between-atoms_train.pkl',
 'coulomb-interaction-speed-up_test.pkl',
 'bonds-from-structure-data_train.pkl',
 'laplaction_matrix_train.pkl',
 'giba-r-data-table-simple-features-0-991-lb_train.pkl',
 'eem_test.pkl',
 'brute-force-feature-engineering_train.pkl',
 'molecule-with-openbabel_train.pkl',
 'molecular-properties-eda-and-models_train.pkl',
 'brute-force-feature-engineering-mini_test.pkl',
 'eachtype_train.pkl',
 'molecular-properties-eda-and-models_test.pkl',
 'bonds-from-structure-dat

In [5]:
df_train=pd.read_pickle(f'{file_folder}/df_train.gzde', compression='gzip')
df_train = df_train.rename(columns={'group':'molecule_name', 'type':'group'})

In [6]:
df_test=pd.read_pickle(f'{file_folder}/df_test.gzde', compression='gzip')
df_test = df_test.rename(columns={'group':'molecule_name', 'type':'group'})

In [6]:
oof_file_folder =  '../../data/oof'
[f for f in os.listdir(oof_file_folder) if f.startswith('lgbm')]

['lgbm_sd_test.pkl',
 'lgbm_fc_train.pkl',
 'lgbm_pso_train.pkl',
 'lgbm_pso_test.pkl',
 'lgbm_fc_test.pkl',
 'lgbm_sd_train.pkl']

In [7]:
df_oof_train = pd.DataFrame()
# df_oof_test = pd.DataFrame()
for f in [f for f in os.listdir(oof_file_folder) if f.startswith('lgbm')]:
    feat_name = f.split('_')[1]
    if 'train' in f:
        df_oof_i = pd.read_pickle(f'{oof_file_folder}/{f}')[['id', feat_name]].rename(columns={'id':'index', feat_name:f'oof_{feat_name}'})
        df_oof_train = pd.concat([df_oof_train, df_oof_i], axis=1)
#     if 'test' in f:
#         df_oof_i = pd.read_pickle(f'{oof_file_folder}/{f}')[['id', feat_name]].rename(columns={'id':'index', feat_name:f'oof_{feat_name}'})
#         df_oof_test = pd.concat([df_oof_test, df_oof_i], axis=1)

In [8]:
df_train = pd.concat([df_train, df_oof_train[['oof_sd','oof_fc','oof_pso']]], axis=1)
# df_test = pd.concat([df_test, df_oof_test[['oof_sd','oof_fc','oof_pso']]], axis=1)

In [9]:
df_train = df_train.sample(500000).reset_index(drop=True)

In [9]:
df_train['y'] = df_train['scalar_coupling_constant']

In [12]:
mytrial = []

In [12]:
param = {
    'columns': df_train.columns.drop(['index', 'y','group', 'scalar_coupling_constant', 'fc', 'sd','pso','dso']).tolist(),
    'cv': {
        'cls': 'GroupKFold',
        'init':{
            'n_splits': 3,
#             'shuffle': True,
#             'random_state': 42,
        },
    },
    'scaler': {'cls': 'StandardScaler', 'init': {}, 'fit': {}},
    'model': {
        'cls': 'lgb.LGBMRegressor',
        'init': {
            'learning_rate': 0.2833769330240482,
            'feature_fraction': 0.8818248470204605,
            'bagging_fraction': 0.8205197060908092,
            'min_data_in_leaf': 202,
            'lambda_l1': 0.017039063121824582,
            'lambda_l2': 0.8318702431636841,
            'max_bin': 100,
            'num_leaves': 255,
            'random_state': 3895,
            'n_jobs': 16
        },
        'fit': {}
    },
    'metric': 'mean_absolute_error'
}



process_queue = PQueue(df_train, None, param, mytrial)
sort_features = SortFeatureSelectTopNProcess(**{'top_n':200})
select_topn = RFESelectTopNProcess(**{'n_features_remain':20, 'n_features_to_remove':10})
remove_useless = RFERemoveUselessFeaturesProcess(**{})
process_queue.insert_node(sort_features)
process_queue.insert_node(select_topn)
process_queue.insert_node(remove_useless)

try:
    result = process_queue.run()
except Exception as e:
    print(e.__str__())
print(len(process_queue.trial))
print(process_queue.param)


40
{'columns': ['tertiary_distance_0', 'oof_fc', 'dist_to_type_mean', 'dist_C_0_x', 'oof_pso', 'tertiary_atom_0', 'dist_O_0_y', 'dist_H_1_y', 'molecule_name.1', 'z_1', 'dist_N_3_x', 'dist_F_0_y', 'dist_N_3_y', 'dist_O_3_y', 'dist_F_0_x', 'dist_N_2_y', 'tertiary_atom_5', 'dist_N_4_y', 'adH4', 'vander_F.y'], 'cv': {'cls': 'GroupKFold', 'init': {'n_splits': 3}}, 'scaler': {'cls': 'StandardScaler', 'init': {}, 'fit': {}}, 'model': {'cls': 'lgb.LGBMRegressor', 'init': {'learning_rate': 0.2833769330240482, 'feature_fraction': 0.8818248470204605, 'bagging_fraction': 0.8205197060908092, 'min_data_in_leaf': 202, 'lambda_l1': 0.017039063121824582, 'lambda_l2': 0.8318702431636841, 'max_bin': 100, 'num_leaves': 255, 'random_state': 3895, 'n_jobs': 16}, 'fit': {}}, 'metric': 'mean_absolute_error'}


In [13]:
param = {'columns': ['tertiary_distance_0', 'oof_fc', 'dist_to_type_mean', 'dist_C_0_x', 'oof_pso', 'tertiary_atom_0', 'dist_O_0_y', 'dist_H_1_y', 'molecule_name.1', 'z_1', 'dist_N_3_x', 'dist_F_0_y', 'dist_N_3_y', 'dist_O_3_y', 'dist_F_0_x', 'dist_N_2_y', 'tertiary_atom_5', 'dist_N_4_y', 'adH4', 'vander_F.y'], 'cv': {'cls': 'GroupKFold', 'init': {'n_splits': 3}}, 'scaler': {'cls': 'StandardScaler', 'init': {}, 'fit': {}}, 'model': {'cls': 'lgb.LGBMRegressor', 'init': {'learning_rate': 0.2833769330240482, 'feature_fraction': 0.8818248470204605, 'bagging_fraction': 0.8205197060908092, 'min_data_in_leaf': 202, 'lambda_l1': 0.017039063121824582, 'lambda_l2': 0.8318702431636841, 'max_bin': 100, 'num_leaves': 255, 'random_state': 3895, 'n_jobs': 16}, 'fit': {}}, 'metric': 'mean_absolute_error'}

In [14]:
def objective(trial):
        
    learning_rate = trial.suggest_uniform('learning_rate', .01, .5)
    feature_fraction = trial.suggest_uniform('feature_fraction', .6, 1)
    bagging_fraction = trial.suggest_uniform('bagging_fraction', 0.6, 1)
    min_data_in_leaf = trial.suggest_int('min_data_in_leaf', 200, 800)
    lambda_l1 = trial.suggest_loguniform('lambda_l1', 1e-6, 1e2)
    lambda_l2 = trial.suggest_loguniform('lambda_l2', 1e-6, 1e2)
    max_bin = trial.suggest_int('max_bin', 10, 100)
    num_leaves = trial.suggest_int('num_leaves', 4, 512)
        
    args = param.copy()
    args['model']['init']={
                'learning_rate':learning_rate,
                'feature_fraction':feature_fraction,
                'bagging_fraction':bagging_fraction,
                'min_data_in_leaf':min_data_in_leaf,
                'lambda_l1':lambda_l1,
                'lambda_l2':lambda_l2,
                'max_bin':max_bin,
                'num_leaves':num_leaves,
                'n_jobs':16
    }
    
    df_his, df_feature_importances, df_valid_pred, df_test_pred =  sk_process(df_train, args, f'tune hyperparam', df_test=None, trial=mytrial, is_output_feature_importance=False, trial_level=0)
    val_metric_mean = np.mean(df_his.valid)
    return val_metric_mean



In [15]:
study = optuna.create_study()
study.optimize(objective, n_trials=200)

[I 2019-07-22 23:54:13,478] Finished trial#0 resulted in value: 10.344044083148583. Current best value is 10.344044083148583 with parameters: {'learning_rate': 0.027604722875908642, 'feature_fraction': 0.8720963602405887, 'bagging_fraction': 0.9229079820645048, 'min_data_in_leaf': 218, 'lambda_l1': 0.0007284359432875013, 'lambda_l2': 0.003445565607273236, 'max_bin': 57, 'num_leaves': 217}.
[I 2019-07-22 23:54:52,470] Finished trial#1 resulted in value: 9.298108553798427. Current best value is 9.298108553798427 with parameters: {'learning_rate': 0.4301914812079682, 'feature_fraction': 0.7078286975782451, 'bagging_fraction': 0.8649072188015634, 'min_data_in_leaf': 369, 'lambda_l1': 0.34935543061458213, 'lambda_l2': 0.891421045949713, 'max_bin': 74, 'num_leaves': 216}.
[I 2019-07-22 23:55:36,562] Finished trial#2 resulted in value: 8.99956804490527. Current best value is 8.99956804490527 with parameters: {'learning_rate': 0.3378080858764837, 'feature_fraction': 0.6881289259294576, 'baggin

[I 2019-07-23 00:27:12,130] Finished trial#41 resulted in value: 9.563715960536292. Current best value is 7.610252321847802 with parameters: {'learning_rate': 0.3318819525666389, 'feature_fraction': 0.7670828565729676, 'bagging_fraction': 0.8953774408763282, 'min_data_in_leaf': 202, 'lambda_l1': 1.0908256382181142e-05, 'lambda_l2': 3.523205970365862e-05, 'max_bin': 88, 'num_leaves': 468}.
[I 2019-07-23 00:28:11,668] Finished trial#42 resulted in value: 7.563506380185266. Current best value is 7.563506380185266 with parameters: {'learning_rate': 0.15925795882187083, 'feature_fraction': 0.8920602146506871, 'bagging_fraction': 0.6485951194464459, 'min_data_in_leaf': 755, 'lambda_l1': 0.8488923280457591, 'lambda_l2': 1.1028265879651968e-05, 'max_bin': 100, 'num_leaves': 378}.
[I 2019-07-23 00:29:09,844] Finished trial#43 resulted in value: 7.9768727596340865. Current best value is 7.563506380185266 with parameters: {'learning_rate': 0.15925795882187083, 'feature_fraction': 0.89206021465068

[I 2019-07-23 01:04:11,536] Finished trial#83 resulted in value: 8.989435380483407. Current best value is 7.563506380185266 with parameters: {'learning_rate': 0.15925795882187083, 'feature_fraction': 0.8920602146506871, 'bagging_fraction': 0.6485951194464459, 'min_data_in_leaf': 755, 'lambda_l1': 0.8488923280457591, 'lambda_l2': 1.1028265879651968e-05, 'max_bin': 100, 'num_leaves': 378}.
[I 2019-07-23 01:05:03,473] Finished trial#84 resulted in value: 8.07921167217753. Current best value is 7.563506380185266 with parameters: {'learning_rate': 0.15925795882187083, 'feature_fraction': 0.8920602146506871, 'bagging_fraction': 0.6485951194464459, 'min_data_in_leaf': 755, 'lambda_l1': 0.8488923280457591, 'lambda_l2': 1.1028265879651968e-05, 'max_bin': 100, 'num_leaves': 378}.
[I 2019-07-23 01:06:03,115] Finished trial#85 resulted in value: 8.365237171490573. Current best value is 7.563506380185266 with parameters: {'learning_rate': 0.15925795882187083, 'feature_fraction': 0.8920602146506871,

[I 2019-07-23 01:33:20,945] Finished trial#123 resulted in value: 8.197555549388303. Current best value is 7.450919464201985 with parameters: {'learning_rate': 0.3622587705966408, 'feature_fraction': 0.7854483962772717, 'bagging_fraction': 0.9546342593989241, 'min_data_in_leaf': 350, 'lambda_l1': 7.642900510828487e-06, 'lambda_l2': 0.0002861596844630391, 'max_bin': 88, 'num_leaves': 228}.
[I 2019-07-23 01:34:01,959] Finished trial#124 resulted in value: 8.8471682719883. Current best value is 7.450919464201985 with parameters: {'learning_rate': 0.3622587705966408, 'feature_fraction': 0.7854483962772717, 'bagging_fraction': 0.9546342593989241, 'min_data_in_leaf': 350, 'lambda_l1': 7.642900510828487e-06, 'lambda_l2': 0.0002861596844630391, 'max_bin': 88, 'num_leaves': 228}.
[I 2019-07-23 01:34:34,877] Finished trial#125 resulted in value: 14.590755305438753. Current best value is 7.450919464201985 with parameters: {'learning_rate': 0.3622587705966408, 'feature_fraction': 0.785448396277271

[I 2019-07-23 02:07:17,141] Finished trial#165 resulted in value: 12.334919976665377. Current best value is 7.401316586325016 with parameters: {'learning_rate': 0.4822173858081333, 'feature_fraction': 0.7695179153572054, 'bagging_fraction': 0.9414942329313656, 'min_data_in_leaf': 528, 'lambda_l1': 3.971942589250556, 'lambda_l2': 0.16474371358640466, 'max_bin': 100, 'num_leaves': 353}.
[I 2019-07-23 02:08:09,348] Finished trial#166 resulted in value: 8.280690333115475. Current best value is 7.401316586325016 with parameters: {'learning_rate': 0.4822173858081333, 'feature_fraction': 0.7695179153572054, 'bagging_fraction': 0.9414942329313656, 'min_data_in_leaf': 528, 'lambda_l1': 3.971942589250556, 'lambda_l2': 0.16474371358640466, 'max_bin': 100, 'num_leaves': 353}.
[I 2019-07-23 02:09:07,668] Finished trial#167 resulted in value: 9.620217353941772. Current best value is 7.401316586325016 with parameters: {'learning_rate': 0.4822173858081333, 'feature_fraction': 0.7695179153572054, 'bagg

In [16]:
df_trial = pd.DataFrame(mytrial)
df_trial[df_trial['message']=='tune hyperparam'][['datetime', 'message', 'nfeatures', 'train_metric_mean', 'val_metric_mean', 'trn_val_metric_diff']].sort_values(by=['val_metric_mean']).head()

Unnamed: 0,datetime,message,nfeatures,train_metric_mean,val_metric_mean,trn_val_metric_diff
135,2019-07-23 01:42:42.622060,tune hyperparam,20,0.529697,7.401317,6.87162
137,2019-07-23 01:44:13.223997,tune hyperparam,20,0.544194,7.431582,6.887389
98,2019-07-23 01:16:30.136841,tune hyperparam,20,0.518375,7.450919,6.932544
42,2019-07-23 00:28:11.572068,tune hyperparam,20,0.487404,7.563506,7.076102
148,2019-07-23 01:52:52.592693,tune hyperparam,20,0.5428,7.577832,7.035031
