In [1]:
%%HTML
<style>
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [1]:
import sys
sys.path.append("../..") # Adds higher directory to python modules path.
from utilities import aggregate_feature_calculators
from utilities import aggregate_feature_calculators_setting as aggcal
from utilities.parallel import Parallel
from utilities.dfdb import DFDB

from utilities.process.pqueue import *
from utilities.process.pnode import *
from utilities.process.putilities import *

In [2]:
import numpy as np
import pandas as pd
import os
import time
import datetime
import json
import copy
import gc
import warnings
from tqdm import tqdm_notebook, tqdm

import optuna

import lightgbm as lgb
import xgboost as xgb

from catboost import CatBoostRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold,TimeSeriesSplit, GroupKFold
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.svm import NuSVR, SVR
from sklearn.metrics import mean_absolute_error
from sklearn import preprocessing

import eli5
from eli5.sklearn import PermutationImportance

import networkx as nx

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Directory holding the raw competition CSV files; the bare last expression
# displays its contents.
csv_file_folder = '../../data/input'
os.listdir(path=csv_file_folder)

['test.csv',
 'structures',
 'sample_submission.csv',
 'magnetic_shielding_tensors.csv',
 'mulliken_charges.csv',
 'potential_energy.csv',
 'scalar_coupling_contributions.csv',
 'dipole_moments.csv',
 'structures.csv',
 'train.csv']

In [4]:
# Directory holding the pickled feature tables; prepare_data() reads this
# notebook-level variable when it loads the individual pickle files.
file_folder = '../../data/feature'
os.listdir(path=file_folder)

['giba-r-data-table-simple-features-1-17-lb_train.pkl',
 'eem_train.pkl',
 'dataset-with-number-of-bonds-between-atoms_test.pkl',
 'keras-neural-net-for-champs_train.pkl',
 'brute-force-feature-engineering_test.pkl',
 'angles-and-distances_test.pkl',
 'giba-r-data-table-simple-features-0-991-lb_test.pkl',
 'keras-neural-net-for-champs_test.pkl',
 'brute-force-feature-engineering-mini_train.pkl',
 'dataset-with-number-of-bonds-between-atoms_train.pkl',
 'giba-r-data-table-simple-features-0-991-lb_train.pkl',
 'eem_test.pkl',
 'brute-force-feature-engineering_train.pkl',
 'molecule-with-openbabel_train.pkl',
 'molecular-properties-eda-and-models_train.pkl',
 'brute-force-feature-engineering-mini_test.pkl',
 'eachtype_train.pkl',
 'molecular-properties-eda-and-models_test.pkl',
 '.ipynb_checkpoints',
 'molecule-with-openbabel_test.pkl',
 'eachtype_test.pkl',
 'giba-r-data-table-simple-features-1-17-lb_test.pkl',
 'angles-and-distances_train.pkl']

In [5]:
# Hand-picked subset of the feature pickles found in `file_folder`.
# The order is significant: prepare_data() merges the files in list order and
# only takes columns that are not already present, so the first file that
# provides a column wins.
file_list = [
    'giba-r-data-table-simple-features-1-17-lb_train.pkl',
    'eem_train.pkl',
    'dataset-with-number-of-bonds-between-atoms_test.pkl',
    'keras-neural-net-for-champs_train.pkl',
    'brute-force-feature-engineering-mini_test.pkl',
    'angles-and-distances_test.pkl',
    'keras-neural-net-for-champs_test.pkl',
    'dataset-with-number-of-bonds-between-atoms_train.pkl',
    'eem_test.pkl',
    'brute-force-feature-engineering-mini_train.pkl',
    'molecule-with-openbabel_train.pkl',
    'molecular-properties-eda-and-models_train.pkl',
    'eachtype_train.pkl',
    'molecular-properties-eda-and-models_test.pkl',
    'molecule-with-openbabel_test.pkl',
    'eachtype_test.pkl',
    'giba-r-data-table-simple-features-1-17-lb_test.pkl',
    'angles-and-distances_train.pkl',
]

In [6]:
def _merge_new_feature_columns(df_base, feature_path):
    """Join onto ``df_base`` the columns of a pickled feature table it lacks.

    Rows are matched on the ``id`` column. Returns ``(merged_frame,
    feature_shape)`` where ``feature_shape`` is the shape of the loaded table.
    """
    # NOTE(review): unpickling can execute arbitrary code; only load feature
    # files that were produced by trusted code.
    df_feature = pd.read_pickle(feature_path)
    known = set(df_base.columns)
    # Keep the feature file's own column order. A plain set difference could
    # yield a different merged column order from one interpreter run to the next.
    new_columns = [c for c in dict.fromkeys(df_feature.columns) if c not in known]
    # NOTE(review): pd.merge defaults to an inner join, so a feature table with
    # missing or repeated ids drops or duplicates rows of df_base.
    merged = pd.merge(df_base, df_feature[new_columns + ['id']], on='id')
    return merged, df_feature.shape


def prepare_data(feature_folder='../data/feature', csv_file_folder='../data/input', feature_dir=None):
    """Load train/test CSVs, merge pickled feature tables into them and clean up.

    Parameters
    ----------
    feature_folder : str or list of str
        Either a directory (every ``*train.pkl`` / ``*test.pkl`` file in it is
        merged) or an explicit list of pickle file names, merged in list order.
    csv_file_folder : str
        Directory containing ``train.csv`` and ``test.csv``.
    feature_dir : str, optional
        Directory the pickle files are read from. When omitted it is
        ``feature_folder`` if that is a directory path; if ``feature_folder`` is
        a list, the notebook-level ``file_folder`` variable is used (the
        behaviour existing callers rely on).

    Returns
    -------
    (df_train, df_test) : tuple of pandas.DataFrame
        Merged frames with non-numeric columns label-encoded, +/-inf and NaN
        replaced by 0, ``id`` renamed to ``index`` and (train only)
        ``scalar_coupling_constant`` renamed to ``y``.

    Raises
    ------
    TypeError
        If ``feature_folder`` is neither a ``str`` nor a ``list``.
    """
    df_train = pd.read_csv(f"{csv_file_folder}/train.csv")
    df_test = pd.read_csv(f"{csv_file_folder}/test.csv")

    if isinstance(feature_folder, list):
        feature_file_list = feature_folder
        # A list only carries file names, so the directory must come from
        # `feature_dir` or, for backward compatibility, the global `file_folder`.
        pickle_dir = file_folder if feature_dir is None else feature_dir
    elif isinstance(feature_folder, str):
        feature_file_list = os.listdir(feature_folder)
        # Read from the directory that was just listed (previously the global
        # `file_folder` was used here, ignoring the argument).
        pickle_dir = feature_folder if feature_dir is None else feature_dir
    else:
        raise TypeError(f'{feature_folder} format error')

    for f in feature_file_list:
        # Skip non-pickles and hidden entries such as '.ipynb_checkpoints'.
        if not f.endswith('.pkl') or f.startswith('.'):
            continue
        stem = f[:-4]
        feature_path = os.path.join(pickle_dir, f)
        if stem.endswith('train'):
            df_train, feature_shape = _merge_new_feature_columns(df_train, feature_path)
            print('train add', f, feature_shape)
        if stem.endswith('test'):
            df_test, feature_shape = _merge_new_feature_columns(df_test, feature_path)
            print('test add', f, feature_shape)

    # Label-encode every non-numeric train column, fitting on train + test so
    # both frames share one mapping.
    numerics = ['int16', 'int8', 'int32', 'int64', 'float16', 'float32', 'float64']
    for col in df_train.columns:
        if df_train[col].dtypes in numerics:
            continue
        print(col, df_train[col].unique())
        le = LabelEncoder()
        le.fit(list(df_train[col].values) + list(df_test[col].values))
        df_train[col] = le.transform(list(df_train[col].values))
        df_test[col] = le.transform(list(df_test[col].values))
        print(le.classes_)

    # Infinite values are mapped to NaN first so that fillna(0) covers both.
    df_train = (
        df_train.replace([np.inf, -np.inf], np.nan)
        .fillna(0)
        .rename(columns={'id': 'index', 'scalar_coupling_constant': 'y'})
        .reset_index(drop=True)
    )
    df_test = (
        df_test.replace([np.inf, -np.inf], np.nan)
        .fillna(0)
        .rename(columns={'id': 'index'})
        .reset_index(drop=True)
    )

    return df_train, df_test


In [7]:
# Build the modelling frames from the hand-picked pickles in `file_list`.
df_train, df_test = prepare_data(
    feature_folder=file_list,
    csv_file_folder='../../data/input',
)
print(df_train.shape, df_test.shape)

train add giba-r-data-table-simple-features-1-17-lb_train.pkl (4658147, 96)
train add eem_train.pkl (4658147, 3)
test add dataset-with-number-of-bonds-between-atoms_test.pkl (2505542, 12)
train add keras-neural-net-for-champs_train.pkl (4658147, 56)
test add brute-force-feature-engineering-mini_test.pkl (2505542, 49)
test add angles-and-distances_test.pkl (2505542, 2)
test add keras-neural-net-for-champs_test.pkl (2505542, 56)
train add dataset-with-number-of-bonds-between-atoms_train.pkl (4658147, 12)
test add eem_test.pkl (2505542, 3)
train add brute-force-feature-engineering-mini_train.pkl (4658147, 49)
train add molecule-with-openbabel_train.pkl (4658147, 81)
train add molecular-properties-eda-and-models_train.pkl (4658147, 19)
train add eachtype_train.pkl (5090786, 92)
test add molecular-properties-eda-and-models_test.pkl (2505542, 19)
test add molecule-with-openbabel_test.pkl (2505542, 81)
test add eachtype_test.pkl (2737775, 92)
test add giba-r-data-table-simple-features-1-17-lb

In [8]:
# (rows, columns) of the merged train and test frames.
(df_train.shape, df_test.shape)

((5090786, 268), (2737775, 267))

In [9]:
# Configuration dictionary handed to PQueue (CV splitter, scaler, model, metric).
param = {
    # Every merged column except the row id and the target.
    'columns': df_train.columns.drop(['index', 'y']).tolist(),
    'cv': {
        'cls': 'KFold',
        'init': {'n_splits': 5, 'shuffle': True, 'random_state': 42},
    },
    'scaler': {
        'cls': 'StandardScaler',
        'init': {},
        'fit': {},
    },
    'model': {
        'cls': 'lgb.LGBMRegressor',
        'init': {
            'learning_rate': 0.35395923077843333,
            'feature_fraction': 0.8840483697334669,
            'bagging_fraction': 0.7017457378676857,
            'min_data_in_leaf': 616,
            'lambda_l1': 0.00013058988949929333,
            'lambda_l2': 0.004991992636437704,
            'max_bin': 74,
            'num_leaves': 255,
            'random_state': 2928,
            'n_jobs': 16,
        },
        'fit': {},
    },
    'metric': 'mean_absolute_error',
}

# Shared log list: it is passed to PQueue here and read back later through
# pd.DataFrame(mytrial).
mytrial = []

# The pipeline runs on a subsample of the training rows. The sample is seeded so
# that a re-run works on the same rows, and capped at the frame length so a
# smaller frame does not raise.
process_queue = PQueue(
    df_train.sample(min(500000, len(df_train)), random_state=42),
    df_test,
    param,
    mytrial,
)

# Processing nodes, run in insertion order.
sort_features = SortFeatureSelectTopNProcess(top_n=200)
select_topn = RFESelectTopNProcess(n_features_remain=20, n_features_to_remove=10)
remove_useless = RFERemoveUselessFeaturesProcess()
for node in (sort_features, select_topn, remove_useless):
    process_queue.insert_node(node)

In [None]:
# Best-effort run: a failure must not hide the trials that were already logged,
# so the exception is reported rather than re-raised.
result = None  # stays None when the pipeline aborts, instead of undefined/stale
try:
    result = process_queue.run()
except Exception as e:
    # Include the exception type: str(e) alone is ambiguous (e.g. a KeyError
    # prints only the missing key).
    print(f'{type(e).__name__}: {e}')
print(len(process_queue.trial))
print(process_queue.param)

In [None]:
# Tabulate the trial records collected in `mytrial`.
df_trial = pd.DataFrame(data=mytrial)

In [None]:
# Metric summary of the trials logged as 'RFESelectTopNProcess to 20 features'.
df_trial.loc[
    df_trial['message'] == 'RFESelectTopNProcess to 20 features',
    ['datetime', 'message', 'nfeatures', 'train_metric_mean', 'val_metric_mean', 'trn_val_metric_diff'],
]

In [None]:
# Five 'RFERemoveUselessFeaturesProcess' trials with the lowest mean validation metric.
(
    df_trial.loc[
        df_trial['message'] == 'RFERemoveUselessFeaturesProcess',
        ['datetime', 'message', 'nfeatures', 'train_metric_mean', 'val_metric_mean', 'trn_val_metric_diff'],
    ]
    .sort_values(by=['val_metric_mean'])
    .head(5)
)

In [22]:
def objective(trial):
    """Optuna objective: score one sampled LightGBM configuration.

    Parameters
    ----------
    trial : optuna trial object
        Supplies the sampled hyperparameter values.

    Returns
    -------
    float
        Mean of ``df_his.valid`` returned by ``sk_process`` (presumably the
        per-fold validation metric -- confirm against ``sk_process``).

    Notes
    -----
    Reads the notebook-level ``process_queue``, ``df_train`` and ``mytrial``.
    """
    # Sampled in the same order as before so parameter names/ranges are unchanged.
    model_init = {
        'learning_rate': trial.suggest_uniform('learning_rate', .01, .5),
        'feature_fraction': trial.suggest_uniform('feature_fraction', .6, 1),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.6, 1),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 200, 800),
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-6, 1e2),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-6, 1e2),
        'max_bin': trial.suggest_int('max_bin', 10, 100),
        'num_leaves': trial.suggest_int('num_leaves', 4, 512),
        'random_state': trial.suggest_int('random_state', 1, 9999),
        'n_jobs': 16,
    }

    args = {
        # Feature list currently held by the pipeline object.
        'columns': process_queue.param['columns'],
        'cv': {
            'cls': 'KFold',
            'init': {
                'n_splits': 5,
                'shuffle': True,
                'random_state': 42,
            },
        },
        'scaler': {
            'cls': 'StandardScaler',
            'init': {},
            'fit': {},
        },
        'model': {
            'cls': 'lgb.LGBMRegressor',
            'init': model_init,
            'fit': {},
        },
        'metric': 'mean_absolute_error',
    }

    # Score every trial on the SAME seeded subsample. An unseeded sample gave
    # each trial different rows, so differences between trial values mixed
    # hyperparameter effects with sampling noise. The size is capped so a
    # smaller frame does not raise.
    df_sample = df_train.sample(min(1000000, len(df_train)), random_state=42)

    # Only the history frame is needed; the other three return values are unused.
    df_his, _, _, _ = sk_process(
        df_sample,
        args,
        'tune hyperparam',
        trial=mytrial,
        is_output_feature_importance=False,
        trial_level=0,
    )
    return np.mean(df_his.valid)

# Study with default settings; evaluate `objective` 200 times.
study = optuna.create_study()
study.optimize(
    objective,
    n_trials=200,
)

[I 2019-07-08 12:43:35,887] Finished trial#0 resulted in value: 1.9479356066616114. Current best value is 1.9479356066616114 with parameters: {'learning_rate': 0.2557043291872395, 'feature_fraction': 0.8402360940571069, 'bagging_fraction': 0.6471727275908338, 'min_data_in_leaf': 762, 'lambda_l1': 0.001260157167891001, 'lambda_l2': 1.8109804288577593, 'max_bin': 49, 'num_leaves': 6, 'random_state': 8920}.
[I 2019-07-08 12:44:05,257] Finished trial#1 resulted in value: 1.0277087290901945. Current best value is 1.0277087290901945 with parameters: {'learning_rate': 0.467907656066654, 'feature_fraction': 0.9296597716002054, 'bagging_fraction': 0.88353122869927, 'min_data_in_leaf': 583, 'lambda_l1': 0.4844715238697431, 'lambda_l2': 8.647503479325444, 'max_bin': 78, 'num_leaves': 143, 'random_state': 2998}.
[I 2019-07-08 12:44:20,441] Finished trial#2 resulted in value: 1.5398453432881518. Current best value is 1.0277087290901945 with parameters: {'learning_rate': 0.467907656066654, 'feature_

[I 2019-07-08 13:01:35,485] Finished trial#38 resulted in value: 2.0455410292427043. Current best value is 0.9445406374765357 with parameters: {'learning_rate': 0.3003750010473155, 'feature_fraction': 0.7954806411666246, 'bagging_fraction': 0.825122687823091, 'min_data_in_leaf': 201, 'lambda_l1': 0.00010051426394331587, 'lambda_l2': 3.2239054643990496e-06, 'max_bin': 72, 'num_leaves': 234, 'random_state': 2988}.
[I 2019-07-08 13:02:07,136] Finished trial#39 resulted in value: 0.976087813882492. Current best value is 0.9445406374765357 with parameters: {'learning_rate': 0.3003750010473155, 'feature_fraction': 0.7954806411666246, 'bagging_fraction': 0.825122687823091, 'min_data_in_leaf': 201, 'lambda_l1': 0.00010051426394331587, 'lambda_l2': 3.2239054643990496e-06, 'max_bin': 72, 'num_leaves': 234, 'random_state': 2988}.
[I 2019-07-08 13:02:28,519] Finished trial#40 resulted in value: 1.0830722403332604. Current best value is 0.9445406374765357 with parameters: {'learning_rate': 0.300375

[I 2019-07-08 13:20:35,936] Finished trial#76 resulted in value: 0.9479237571940843. Current best value is 0.9445406374765357 with parameters: {'learning_rate': 0.3003750010473155, 'feature_fraction': 0.7954806411666246, 'bagging_fraction': 0.825122687823091, 'min_data_in_leaf': 201, 'lambda_l1': 0.00010051426394331587, 'lambda_l2': 3.2239054643990496e-06, 'max_bin': 72, 'num_leaves': 234, 'random_state': 2988}.
[I 2019-07-08 13:20:57,442] Finished trial#77 resulted in value: 1.244288848337931. Current best value is 0.9445406374765357 with parameters: {'learning_rate': 0.3003750010473155, 'feature_fraction': 0.7954806411666246, 'bagging_fraction': 0.825122687823091, 'min_data_in_leaf': 201, 'lambda_l1': 0.00010051426394331587, 'lambda_l2': 3.2239054643990496e-06, 'max_bin': 72, 'num_leaves': 234, 'random_state': 2988}.
[I 2019-07-08 13:21:31,993] Finished trial#78 resulted in value: 0.9763902372769845. Current best value is 0.9445406374765357 with parameters: {'learning_rate': 0.300375

[I 2019-07-08 13:39:40,905] Finished trial#114 resulted in value: 1.0492230333398693. Current best value is 0.9368553118374205 with parameters: {'learning_rate': 0.3166712257724028, 'feature_fraction': 0.9071277735212887, 'bagging_fraction': 0.7422147877975603, 'min_data_in_leaf': 416, 'lambda_l1': 1.5701746597674077e-06, 'lambda_l2': 1.7903507490427843e-05, 'max_bin': 98, 'num_leaves': 253, 'random_state': 6406}.
[I 2019-07-08 13:40:13,419] Finished trial#115 resulted in value: 0.9610546104863019. Current best value is 0.9368553118374205 with parameters: {'learning_rate': 0.3166712257724028, 'feature_fraction': 0.9071277735212887, 'bagging_fraction': 0.7422147877975603, 'min_data_in_leaf': 416, 'lambda_l1': 1.5701746597674077e-06, 'lambda_l2': 1.7903507490427843e-05, 'max_bin': 98, 'num_leaves': 253, 'random_state': 6406}.
[I 2019-07-08 13:40:40,681] Finished trial#116 resulted in value: 1.015126817371264. Current best value is 0.9368553118374205 with parameters: {'learning_rate': 0.3

[I 2019-07-08 13:59:47,959] Finished trial#152 resulted in value: 0.951212713884505. Current best value is 0.9368553118374205 with parameters: {'learning_rate': 0.3166712257724028, 'feature_fraction': 0.9071277735212887, 'bagging_fraction': 0.7422147877975603, 'min_data_in_leaf': 416, 'lambda_l1': 1.5701746597674077e-06, 'lambda_l2': 1.7903507490427843e-05, 'max_bin': 98, 'num_leaves': 253, 'random_state': 6406}.
[I 2019-07-08 14:00:26,436] Finished trial#153 resulted in value: 0.9329697749105812. Current best value is 0.9329697749105812 with parameters: {'learning_rate': 0.3414884085893545, 'feature_fraction': 0.9968942693539474, 'bagging_fraction': 0.6304650536121073, 'min_data_in_leaf': 391, 'lambda_l1': 1.0451097740407883e-06, 'lambda_l2': 0.039471193246930845, 'max_bin': 98, 'num_leaves': 256, 'random_state': 8004}.
[I 2019-07-08 14:01:03,998] Finished trial#154 resulted in value: 0.9415126520731034. Current best value is 0.9329697749105812 with parameters: {'learning_rate': 0.341

[I 2019-07-08 14:21:20,485] Finished trial#190 resulted in value: 1.0305378560799183. Current best value is 0.9300692164665992 with parameters: {'learning_rate': 0.29426859872688016, 'feature_fraction': 0.9991242137638949, 'bagging_fraction': 0.6352351289978297, 'min_data_in_leaf': 431, 'lambda_l1': 6.313437511621462e-06, 'lambda_l2': 0.016604298397879098, 'max_bin': 100, 'num_leaves': 255, 'random_state': 9214}.
[I 2019-07-08 14:21:52,453] Finished trial#191 resulted in value: 0.9697525930426156. Current best value is 0.9300692164665992 with parameters: {'learning_rate': 0.29426859872688016, 'feature_fraction': 0.9991242137638949, 'bagging_fraction': 0.6352351289978297, 'min_data_in_leaf': 431, 'lambda_l1': 6.313437511621462e-06, 'lambda_l2': 0.016604298397879098, 'max_bin': 100, 'num_leaves': 255, 'random_state': 9214}.
[I 2019-07-08 14:22:24,841] Finished trial#192 resulted in value: 0.9465400774373991. Current best value is 0.9300692164665992 with parameters: {'learning_rate': 0.29

In [23]:
# Rebuild the trial table, then show the five 'tune hyperparam' trials with the
# lowest mean validation metric.
df_trial = pd.DataFrame(data=mytrial)
(
    df_trial.loc[
        df_trial['message'] == 'tune hyperparam',
        ['datetime', 'message', 'nfeatures', 'train_metric_mean', 'val_metric_mean', 'trn_val_metric_diff'],
    ]
    .sort_values(by=['val_metric_mean'])
    .head(5)
)

Unnamed: 0,datetime,message,nfeatures,train_metric_mean,val_metric_mean,trn_val_metric_diff
212,2019-07-08 14:06:29.634383,tune hyperparam,38,0.863236,0.930069,0.066833
215,2019-07-08 14:08:20.456876,tune hyperparam,38,0.862214,0.930458,0.068244
234,2019-07-08 14:18:47.324132,tune hyperparam,38,0.840667,0.932579,0.091912
202,2019-07-08 14:00:26.322634,tune hyperparam,38,0.855637,0.93297,0.077333
145,2019-07-08 13:30:42.870039,tune hyperparam,38,0.86663,0.936855,0.070225


In [14]:
# Feature list stored with the 'RFERemoveUselessFeaturesProcess' trial that has
# the lowest mean validation metric.
columns = (
    df_trial[df_trial['message'] == 'RFERemoveUselessFeaturesProcess']
    .sort_values(by=['val_metric_mean'])
    .head(1)['param']
    .iloc[0]['columns']
)

In [15]:
# Final training configuration built on the selected `columns`.
# NOTE(review): the best trial in the recorded Optuna log reports num_leaves=255
# and random_state=9214; this cell uses num_leaves=512 and sets no random_state
# -- confirm that is intentional.
param = {
    'columns': columns,
    'cv': {
        'cls': 'KFold',
        'init': {
            'n_splits': 5,
            'shuffle': True,
            'random_state': 42,
        },
    },
    'scaler': {
        'cls': 'StandardScaler',
        'init': {},
        'fit': {},
    },
    'model': {
        'cls': 'lgb.LGBMRegressor',
        'init': {
            'learning_rate': 0.29426859872688016,
            'feature_fraction': 0.9991242137638949,
            'bagging_fraction': 0.6352351289978297,
            'min_data_in_leaf': 431,
            'lambda_l1': 6.313437511621462e-06,
            'lambda_l2': 0.016604298397879098,
            'max_bin': 100,
            'num_leaves': 512,
            'n_jobs': 16,
        },
        'fit': {},
    },
    'metric': 'mean_absolute_error',
}

In [9]:
# Give the suffixed test columns the names the train frame uses for them, so
# both frames expose the same feature names.
df_test = df_test.rename(
    columns={
        'cycle_size_mean_x': 'atom_index_0_cycle_size_mean',
        'cycle_size_mean_y': 'atom_index_1_cycle_size_mean',
        'n_cycle_x': 'atom_index_0_n_cycle',
        'n_cycle_y': 'atom_index_1_n_cycle',
    }
)

In [16]:
# Columns present in test but missing from train.
set(df_test.columns).difference(df_train.columns)

{'cycle_size_mean_x', 'cycle_size_mean_y', 'n_cycle_x', 'n_cycle_y'}

In [17]:
# Columns present in train but missing from test.
set(df_train.columns).difference(df_test.columns)

{'atom_index_0_cycle_size_mean',
 'atom_index_0_n_cycle',
 'atom_index_1_cycle_size_mean',
 'atom_index_1_n_cycle',
 'y'}

In [10]:
# Display the current configuration dictionary.
# NOTE(review): the saved output of this cell is a NameError, i.e. it was last
# executed in a kernel where `param` had not been defined yet.
param

NameError: name 'param' is not defined

In [None]:
# Start a fresh trial log, then run the final configuration on the full train
# frame with the test frame supplied.
mytrial = []
df_his, df_feature_importances, df_valid_pred, df_test_pred = sk_process(
    df_train,
    param,
    'modeling',
    df_test=df_test,
    trial=mytrial,
    is_output_feature_importance=False,
    trial_level=1,
)

In [None]:
# Metric summary of the most recently logged trial.
df_trial = pd.DataFrame(data=mytrial)
df_trial.loc[
    :,
    ['datetime', 'message', 'nfeatures', 'train_metric_mean', 'val_metric_mean', 'trn_val_metric_diff'],
].tail(1)

In [36]:
# Write a submission file from the test predictions stored with one trial row.
# NOTE(review): idx is hard-coded, while `mytrial` is reset two cells above;
# confirm that row 311 exists in `df_trial` when the notebook is run top to bottom.
idx = 311
df_test_pred = df_trial.loc[idx, 'df_test_pred']

# Row-wise mean over every prediction column (all columns except 'index').
prediction_matrix = df_test_pred.drop(columns=['index']).values
df_submit = pd.DataFrame()
df_submit['scalar_coupling_constant'] = prediction_matrix.mean(axis=1)
df_submit['id'] = df_test_pred['index']

df_submit.to_csv('../../data/submission/submission_lgbm_{}.csv'.format(idx), index=False)