In [1]:
%%HTML
<!-- Override the widths of the notebook container, the menubar container and the main toolbar container. -->
<style>
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [1]:
import sys
sys.path.append("../..") # Adds higher directory to python modules path.
from utilities import aggregate_feature_calculators
from utilities import aggregate_feature_calculators_setting as aggcal
from utilities.parallel import Parallel
from utilities.dfdb import DFDB

from utilities.process.pqueue import *
from utilities.process.pnode import *
from utilities.process.putilities import *

In [2]:
import numpy as np
import pandas as pd
import os
import time
import datetime
import json
import copy
import gc
import warnings
from tqdm import tqdm_notebook, tqdm

import optuna

import lightgbm as lgb
import xgboost as xgb

from catboost import CatBoostRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold,TimeSeriesSplit, GroupKFold
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.svm import NuSVR, SVR
from sklearn.metrics import mean_absolute_error
from sklearn import preprocessing

import eli5
from eli5.sklearn import PermutationImportance

import networkx as nx

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Folder holding the raw CSV inputs (train.csv, test.csv, ...); list it to see what is available.
csv_file_folder =  '../../data/input'
os.listdir(csv_file_folder)

['structures',
 'magnetic_shielding_parameters.csv',
 'train.csv',
 'dipole_moments.csv',
 'magnetic_shielding_tensors.csv',
 'mulliken_charges.csv',
 'potential_energy.csv',
 'sample_submission.csv',
 'scalar_coupling_contributions.csv',
 'structures.csv',
 'test.csv']

In [4]:
# Folder holding the pre-computed feature pickles; show only non-hidden '*.pkl' files.
file_folder =  '../../data/feature'
[f for f in os.listdir(file_folder) if (f.endswith('.pkl')) and (not f.startswith('.'))]

['angles-and-distances_test.pkl',
 'angles-and-distances_train.pkl',
 'brute-force-feature-engineering_test.pkl',
 'eem_test.pkl',
 'giba-r-data-table-simple-features-0-991-lb_test.pkl',
 'giba-r-data-table-simple-features-0-991-lb_train.pkl',
 'brute-force-feature-engineering_train.pkl',
 'keras-neural-net-for-champs_test.pkl',
 'keras-neural-net-for-champs_train.pkl',
 'eem_train.pkl',
 'molecular-properties-eda-and-models_test.pkl',
 'molecular-properties-eda-and-models_train.pkl',
 'molecule-with-openbabel_test.pkl',
 'molecule-with-openbabel_train.pkl',
 'dataset-with-number-of-bonds-between-atoms_test.pkl',
 'brute-force-feature-engineering-mini_test.pkl',
 'eachtype_train.pkl',
 'eachtype_test.pkl',
 'giba-r-data-table-simple-features-1-17-lb_test.pkl',
 'dataset-with-number-of-bonds-between-atoms_train.pkl',
 'brute-force-feature-engineering-mini_train.pkl',
 'giba-r-data-table-simple-features-1-17-lb_train.pkl',
 'bonds-from-structure-data_train.pkl',
 'coulomb-interaction-spe

In [5]:
# Explicit list of the feature pickles to merge (both *_train.pkl and *_test.pkl entries).
# prepare_data() processes the files in exactly this order.
file_list = ['giba-r-data-table-simple-features-1-17-lb_train.pkl',
 'eem_train.pkl',
 'coulomb-interaction-speed-up_train.pkl',
 'dataset-with-number-of-bonds-between-atoms_test.pkl',
 'keras-neural-net-for-champs_train.pkl',
 'angles-and-distances_test.pkl',
 'keras-neural-net-for-champs_test.pkl',
 'brute-force-feature-engineering-mini_train.pkl',
 'dataset-with-number-of-bonds-between-atoms_train.pkl',
 'coulomb-interaction-speed-up_test.pkl',
 'bonds-from-structure-data_train.pkl',
 'eem_test.pkl',
 'molecule-with-openbabel_train.pkl',
 'molecular-properties-eda-and-models_train.pkl',
 'brute-force-feature-engineering-mini_test.pkl',
 'molecular-properties-eda-and-models_test.pkl',
 'bonds-from-structure-data_test.pkl',
 'molecule-with-openbabel_test.pkl',
 'giba-r-data-table-simple-features-1-17-lb_test.pkl',
 'angles-and-distances_train.pkl']
# Sanity check on the number of selected files.
print(len(file_list))

20


In [6]:
# Module-level log of detected train/test-vs-feature mismatches. It is bound as the
# default value of ``prepare_data(trail=...)`` so later cells can inspect it.
trail = []


def _merge_feature_file(df_base, feature_path, base_key, trail):
    """Merge the not-yet-present columns of one feature pickle into ``df_base``.

    Parameters
    ----------
    df_base : pd.DataFrame
        Frame being extended; must contain an ``id`` column.
    feature_path : str
        Path of the feature pickle; the pickled frame must contain an ``id`` column.
    base_key : str
        Key under which ``df_base`` is stored in a ``trail`` entry
        (``'df_train'`` or ``'df_test'``).
    trail : list
        Receives ``{'df_feature_i': ..., base_key: ...}`` for every shared column
        whose values disagree with ``df_base``.

    Returns
    -------
    (pd.DataFrame, list)
        The merged frame (sorted by ``id``, index reset) and the list of added
        column names, in the order they appear in the feature file.
    """
    df_feature_i = pd.read_pickle(feature_path).sort_values(by=['id'])
    df_feature_i = df_feature_i.reset_index(drop=True)
    columns_i = df_feature_i.columns.tolist()
    existing_columns = set(df_base.columns.tolist())

    # Keep the feature file's own column order. Iterating a set of strings gives an
    # order that changes between interpreter runs, which made the resulting feature
    # order (and anything depending on it) non-reproducible.
    new_columns = list(dict.fromkeys(col for col in columns_i if col not in existing_columns))
    duplicates_columns = [col for col in columns_i if col in existing_columns]

    # Columns present on both sides are compared position by position (both frames
    # are expected to be ordered by id) and disagreements are reported, not fixed.
    for col in duplicates_columns:
        try:
            feature_values = df_feature_i[col].values
            base_values = df_base[col].values
            error = np.where(feature_values != base_values)[0]
            if error.shape[0] > 0:
                # NaN != NaN always holds, so only count mismatches where the
                # feature value is an actual number.
                if np.where(~np.isnan(feature_values[error]))[0].shape[0] > 0:
                    if str not in [type(base_values[0]), type(feature_values[0])]:
                        trail.append({'df_feature_i': df_feature_i, base_key: df_base})
                        print(col, error, [f'{v1}:{v2}' for v1, v2 in zip(feature_values[error], base_values[error])])
                        print(col)
        except Exception:
            # Best effort: the comparison itself can fail (e.g. np.isnan on
            # non-numeric values); report the column name and carry on.
            print(col)

    df_base = pd.merge(df_base, df_feature_i[new_columns + ['id']], on='id')
    df_base = df_base.sort_values(by=['id'])
    df_base = df_base.reset_index(drop=True)
    return df_base, new_columns


def prepare_data(feature_folder='../../data/feature', csv_file_folder='../../data/input', feature_file_list=None, trail=trail):
    """Build the modelling frames from the raw CSVs plus pre-computed feature pickles.

    Steps: load ``train.csv`` / ``test.csv``, left-join
    ``scalar_coupling_contributions.csv`` onto train, merge every ``*train.pkl`` /
    ``*test.pkl`` feature file on ``id``, label-encode non-numeric columns,
    replace +/-inf and NaN with 0, and rename ``id`` -> ``index`` and
    ``molecule_name`` -> ``group``.

    If both ``{feature_folder}/df_train`` and ``{feature_folder}/df_test`` exist
    they are loaded and returned directly. This function never writes them.

    Parameters
    ----------
    feature_folder : str
        Folder with the feature pickles (and the optional cached frames).
    csv_file_folder : str
        Folder with ``train.csv``, ``test.csv`` and
        ``scalar_coupling_contributions.csv``.
    feature_file_list : list of str, optional
        File names (relative to ``feature_folder``) to merge, in order.
        ``None`` means every entry returned by ``os.listdir(feature_folder)``.
    trail : list
        Collects mismatch records; defaults to the module-level ``trail`` list.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        ``df_train`` and ``df_test``.
    """
    cached_train = f'{feature_folder}/df_train'
    cached_test = f'{feature_folder}/df_test'
    # Require BOTH cache files: an interrupted save can leave only df_train behind,
    # in which case the frames are rebuilt instead of failing on the missing df_test.
    if os.path.exists(cached_train) and os.path.exists(cached_test):
        print(f'=========================load from temp===============================')
        df_train = pd.read_pickle(cached_train)
        df_test = pd.read_pickle(cached_test)
        return df_train, df_test

    df_train = pd.read_csv(f"{csv_file_folder}/train.csv")
    df_test = pd.read_csv(f"{csv_file_folder}/test.csv")
    scalar_coupling_contributions = pd.read_csv(f'{csv_file_folder}/scalar_coupling_contributions.csv')

    # Adds the contribution columns of scalar_coupling_contributions.csv to train only.
    pair_keys = ['molecule_name', 'atom_index_0', 'atom_index_1', 'type']
    df_train = pd.merge(df_train, scalar_coupling_contributions, how='left',
                        left_on=pair_keys, right_on=pair_keys)

    print(df_train.shape, df_test.shape)

    if feature_file_list is None:
        feature_file_list = os.listdir(feature_folder)

    for f in feature_file_list:
        print(f'========================={f}===============================')
        if (f.endswith('.pkl')) and (not f.startswith('.')):
            stem = f[:-4]
            if stem.endswith('train'):
                df_train, new_columns = _merge_feature_file(df_train, f'{feature_folder}/{f}', 'df_train', trail)
                print('train add', f, set(new_columns))
            elif stem.endswith('test'):
                df_test, new_columns = _merge_feature_file(df_test, f'{feature_folder}/{f}', 'df_test', trail)
                print('test add', f, set(new_columns))

    print(f'=========================encode label===============================')
    numerics = ['int16', 'int8', 'int32', 'int64', 'float16', 'float32', 'float64']
    for col in df_train.columns:
        col_type = df_train[col].dtypes
        if not col_type in numerics:
            print(col, df_train[col].unique())
            le = LabelEncoder()
            # Fit on train and test together so both share one code table. A
            # non-numeric column missing from test is encoded from train alone
            # instead of raising KeyError.
            in_test = col in df_test.columns
            test_values = list(df_test[col].values) if in_test else []
            le.fit(list(df_train[col].values) + test_values)
            df_train[col] = le.transform(list(df_train[col].values))
            if in_test:
                df_test[col] = le.transform(test_values)
            print(le.classes_)

    print(f'=========================fill nan inf===============================')
    df_train = df_train.replace([np.inf, -np.inf], np.nan)
    df_train = df_train.fillna(0)
    df_test = df_test.replace([np.inf, -np.inf], np.nan)
    df_test = df_test.fillna(0)

    print(f'=========================rename===============================')
    df_train = df_train.rename(columns={'id': 'index'}) #'scalar_coupling_constant': 'y'
    df_test = df_test.rename(columns={'id': 'index'})
    df_train = df_train.rename(columns={'molecule_name':'group'})
    df_test = df_test.rename(columns={'molecule_name':'group'})
    # Only the test frame is renamed here, mapping the *_x / *_y cycle columns onto
    # the atom_index_0_* / atom_index_1_* names.
    df_test = df_test.rename(columns={'cycle_size_mean_x':'atom_index_0_cycle_size_mean',
                            'cycle_size_mean_y':'atom_index_1_cycle_size_mean',
                           'n_cycle_x':'atom_index_0_n_cycle',
                           'n_cycle_y':'atom_index_1_n_cycle'})

    df_train = df_train.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)

    # NOTE: the frames are not written back to the cache paths here; the cache
    # files read at the top of this function must be created separately.
    return df_train, df_test

In [7]:
# Build (or load from the cache files, if present) the train/test frames using the explicit file list.
df_train, df_test = prepare_data(feature_file_list=file_list)

(4658147, 10) (2505542, 5)
train add giba-r-data-table-simple-features-1-17-lb_train.pkl {'NH', 'inv_dist1E', 'linkM1', 'ID', 'adN2', 'yukawa_H.y', 'coulomb_H.x', 'distN0', 'inv_dist1R', 'distC0', 'yukawa_C.y', 'atom_index_1.1', 'structure_z_1', 'E0', 'structure_x_0', 'coulomb_N.y', 'vander_O.y', 'adC4', 'structure_y_1', 'structure_z_0', 'typei', 'coulomb_C.x', 'yukawa_O.x', 'inv_distP', 'vander_H.x', 'structure_atom_0', 'sd_molecule_atom_1_dist_xyz', 'E1', 'vander_N.y', 'linkN', 'structure_x_1', 'structure_atom_1', 'coulomb_F.y', 'inv_dist1', 'inv_dist0R', 'link0', 'inv_dist0', 'inv_distPR', 'mean_molecule_atom_0_dist_xyz', 'distH0', 'pos', 'linkM0', 'yukawa_N.x', 'structure_y_0', 'adH2', 'yukawa_H.x', 'vander_C.x', 'vander_F.x', 'adH3', 'R0', 'sd_molecule_atom_0_dist_xyz', 'coulomb_O.x', 'coulomb_O.y', 'NN', 'N2', 'vander_N.x', 'NF', 'yukawa_C.x', 'adC1', 'inv_dist0E', 'yukawa_N.y', 'adH4', 'adN4', 'max_molecule_atom_0_dist_xyz', 'adC3', 'adN1', 'max_molecule_atom_1_dist_xyz', 'link1

train add dataset-with-number-of-bonds-between-atoms_train.pkl {'nbond', 'error', 'atom_index_0_charge', 'atom_index_0_n_cycle', 'atom_index_1_charge', 'atom_index_0_cycle_size_mean', 'L2dist', 'bond_type', 'atom_index_1_n_cycle', 'atom_index_1_cycle_size_mean', 'is_found_bond'}
test add coulomb-interaction-speed-up_test.pkl {'dist_C_2_y', 'dist_H_2_y', 'dist_C_2_x', 'dist_C_3_x', 'dist_O_2_y', 'dist_H_1_x', 'dist_F_0_y', 'dist_O_3_x', 'dist_H_2_x', 'dist_O_1_y', 'dist_O_0_x', 'dist_F_4_y', 'dist_H_3_x', 'dist_C_4_x', 'dist_N_1_x', 'dist_C_1_y', 'dist_N_4_x', 'dist_F_2_x', 'dist_C_4_y', 'dist_N_3_x', 'dist_N_4_y', 'dist_H_4_y', 'dist_F_1_x', 'dist_C_0_x', 'dist_H_4_x', 'dist_F_4_x', 'dist_N_1_y', 'dist_O_1_x', 'dist_F_0_x', 'dist_C_0_y', 'dist_C_3_y', 'dist_F_2_y', 'dist_N_0_y', 'dist_N_0_x', 'dist_H_0_y', 'dist_N_2_x', 'dist_F_3_x', 'dist_O_4_y', 'dist_H_0_x', 'dist_N_3_y', 'dist_H_3_y', 'dist_O_3_y', 'dist_O_4_x', 'dist_H_1_y', 'dist_F_1_y', 'dist_N_2_y', 'dist_O_0_y', 'dist_F_3_y', 

molecule_name ['dsgdb9nsd_000001' 'dsgdb9nsd_000002' 'dsgdb9nsd_000003' ...
 'dsgdb9nsd_133881' 'dsgdb9nsd_133882' 'dsgdb9nsd_133884']
['dsgdb9nsd_000001' 'dsgdb9nsd_000002' 'dsgdb9nsd_000003' ...
 'dsgdb9nsd_133883' 'dsgdb9nsd_133884' 'dsgdb9nsd_133885']
type ['1JHC' '2JHH' '1JHN' '2JHN' '2JHC' '3JHH' '3JHC' '3JHN']
['1JHC' '1JHN' '2JHC' '2JHH' '2JHN' '3JHC' '3JHH' '3JHN']
structure_atom_0 ['H']
['H']
structure_atom_1 ['C' 'H' 'N']
['C' 'H' 'N']
molecule_name.1 ['dsgdb9nsd_000001' 'dsgdb9nsd_000002' 'dsgdb9nsd_000003' ...
 'dsgdb9nsd_133881' 'dsgdb9nsd_133882' 'dsgdb9nsd_133884']
['dsgdb9nsd_000001' 'dsgdb9nsd_000002' 'dsgdb9nsd_000003' ...
 'dsgdb9nsd_133883' 'dsgdb9nsd_133884' 'dsgdb9nsd_133885']
atom_1 ['C' 'H' 'N']
['C' 'H' 'N']
atom_0 ['H']
['H']
bond_type ['1.0CH' 'none' '1.0HN']
['1.0CH' '1.0HN' 'none']
type_1 ['JHC' 'JHH' 'JHN']
['JHC' 'JHH' 'JHN']
type_0 ['1' '2' '3']
['1' '2' '3']
tertiary_atom_17 [nan 'H' 'HO' 'C3' 'O3' 'O2' 'C1' 'N1' 'C2' 'Nam' 'N3' 'N2' 'Car' 'Nar'
 'Npl'

In [8]:
# Row/column counts of the merged frames.
df_train.shape, df_test.shape

((4658147, 417), (2505542, 412))

In [9]:
# Columns present in train but missing from test.
set(df_train.columns) -set(df_test.columns)

{'dso', 'fc', 'pso', 'scalar_coupling_constant', 'sd'}

In [13]:
# Persist the merged train frame as a gzip-compressed pickle.
df_train.to_pickle(f'{file_folder}/df_train.gzde', compression='gzip')

In [14]:
# Persist the merged test frame as a gzip-compressed pickle.
df_test.to_pickle(f'{file_folder}/df_test.gzde', compression='gzip')

In [12]:
# Write the uncompressed pickles at the paths prepare_data() checks for its cache.
# NOTE(review): the recorded run of this cell failed with "OSError: [Errno 22] Invalid argument";
# if the first write fails part-way, a df_train file may exist without a df_test one — verify
# before relying on the cache.
df_train.to_pickle(f'{file_folder}/df_train')
df_test.to_pickle(f'{file_folder}/df_test')

OSError: [Errno 22] Invalid argument

In [20]:
# Debugging: pull the first mismatch record logged by prepare_data() and count the rows
# where the feature file and the test frame disagree on one column.
# NOTE: this rebinds the global df_test to the snapshot stored in the trail entry.
df_feature_i = trail[0]['df_feature_i']
df_test = trail[0]['df_test']
col = 'atom_index_closest_1'
np.where(df_feature_i[col].values!=df_test[col].values)[0].shape

(2199748,)

In [21]:
# Among the mismatching rows, keep those whose feature-file value is not NaN.
# NOTE(review): these are positions within the mismatch subset, not row positions of the
# full frame, yet they are used with .loc on the full frames below — confirm this is intended.
index_list = np.where(~np.isnan(df_feature_i[col].values[np.where(df_feature_i[col].values!=df_test[col].values)[0]]))[0]

In [22]:
# Same string-type test prepare_data() applies before logging a mismatch.
str in [type(df_test[col].values[0]),type(df_feature_i[col].values[0])]

False

In [28]:
# Columns of the feature frame captured in the mismatch record.
df_feature_i.columns

Index(['id', 'atom_index_x', 'atom_0', 'x_0', 'y_0', 'z_0', 'EN_x', 'rad_x',
       'n_bonds_x', 'bond_lengths_mean_x', 'bond_lengths_std_x',
       'atom_index_y', 'atom_1', 'x_1', 'y_1', 'z_1', 'EN_y', 'rad_y',
       'n_bonds_y', 'bond_lengths_mean_y', 'bond_lengths_std_y', 'dist',
       'dist_x', 'dist_y', 'dist_z', 'type_0', 'molecule_couples',
       'molecule_dist_mean', 'molecule_dist_min', 'molecule_dist_max',
       'atom_0_couples_count', 'atom_1_couples_count',
       'molecule_atom_index_0_x_1_std', 'molecule_atom_index_0_y_1_mean',
       'molecule_atom_index_0_y_1_mean_diff',
       'molecule_atom_index_0_y_1_mean_div', 'molecule_atom_index_0_y_1_max',
       'molecule_atom_index_0_y_1_max_diff', 'molecule_atom_index_0_y_1_std',
       'molecule_atom_index_0_z_1_std', 'molecule_atom_index_0_dist_mean',
       'molecule_atom_index_0_dist_mean_diff',
       'molecule_atom_index_0_dist_mean_div', 'molecule_atom_index_0_dist_max',
       'molecule_atom_index_0_dist_max_diff

In [25]:
# Feature-file values of the inspected column at the selected rows.
df_feature_i.loc[index_list][['id','atom_index_closest_1']].head()

Unnamed: 0,id,atom_index_closest_1
43,4658190,
46,4658193,
49,4658196,3.0
53,4658200,3.0
56,4658203,4.0


In [26]:
# Test-frame values of the same column at the same rows, for side-by-side comparison.
df_test.loc[index_list][['id','atom_index_closest_1']].head()

Unnamed: 0,id,atom_index_closest_1
43,4658190,2
46,4658193,8
49,4658196,3
53,4658200,3
56,4658203,5


In [7]:
# Feature selection per target and per coupling type.
#
# Changes from the previous version of this cell:
#   * prepare_data() is called once instead of once per (target, type) pair;
#     every call returned the same frames.
#   * the loop variable `y` is now actually used: the current target column is
#     renamed to 'y' and the other train-only target columns are removed, so they
#     cannot end up in the feature list (they do not exist in the test frame).
# NOTE(review): this assumes PQueue reads the regression target from a column
# named 'y' (the feature list below excludes 'y') — confirm against PQueue.
# NOTE(review): `mytrial` must already exist when this cell runs.
target_columns = ['scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso']

df_train_all, df_test_all = prepare_data(feature_file_list=file_list)

for y in target_columns:
    print('target', y)
    other_targets = [c for c in target_columns if c != y and c in df_train_all.columns]
    df_train_y = df_train_all.drop(columns=other_targets).rename(columns={y: 'y'})

    for t in np.arange(8):
        print('type', t)
        # As before, df_train / df_test are rebound to the current type's subset.
        df_train = df_train_y[df_train_y['type']==t].reset_index(drop=True)
        df_test = df_test_all[df_test_all['type']==t].reset_index(drop=True)

        param = {
            # Everything except the row id, the target and the grouping key is a candidate feature.
            'columns': df_train.columns.drop(['index', 'y','group']).tolist(),
            'cv': {
                # Alternative: 'cls': 'GroupKFold', 'init': {'n_splits': 5}
                'cls': 'KFold',
                'init':{
                    'n_splits': 5,
                    'shuffle': True,
                    'random_state': 42,
                },
            },
            'scaler': {'cls': 'StandardScaler', 'init': {}, 'fit': {}},
            'model': {
                'cls': 'lgb.LGBMRegressor',
                'init': {
                    'learning_rate': 0.2833769330240482,
                    'feature_fraction': 0.8818248470204605,
                    'bagging_fraction': 0.8205197060908092,
                    'min_data_in_leaf': 202,
                    'lambda_l1': 0.017039063121824582,
                    'lambda_l2': 0.8318702431636841,
                    'max_bin': 100,
                    'num_leaves': 255,
                    'random_state': 3895,
                    'n_jobs': 16
                },
                'fit': {}
            },
            'metric': 'mean_absolute_error'
        }

        # Three queued stages: keep the top 200 features, run the RFE top-N stage
        # (configured to remain at 20, removing 10 at a time), then remove useless features.
        process_queue = PQueue(df_train, df_test, param, mytrial)
        sort_features = SortFeatureSelectTopNProcess(**{'top_n':200})
        select_topn = RFESelectTopNProcess(**{'n_features_remain':20, 'n_features_to_remove':10})
        remove_useless = RFERemoveUselessFeaturesProcess(**{})
        process_queue.insert_node(sort_features)
        process_queue.insert_node(select_topn)
        process_queue.insert_node(remove_useless)

        # Best effort: a failure for one (target, type) pair is reported and the loop continues.
        try:
            result = process_queue.run()
        except Exception as e:
            print(e.__str__())
        print(len(process_queue.trial))
        print(process_queue.param)
    

(4658147, 6) (2505542, 5)


KeyboardInterrupt: 

In [13]:


# Single feature-selection run on whatever df_train / df_test currently hold.

# LightGBM constructor arguments used during selection.
lgbm_init = {
    'learning_rate': 0.2833769330240482,
    'feature_fraction': 0.8818248470204605,
    'bagging_fraction': 0.8205197060908092,
    'min_data_in_leaf': 202,
    'lambda_l1': 0.017039063121824582,
    'lambda_l2': 0.8318702431636841,
    'max_bin': 100,
    'num_leaves': 255,
    'random_state': 3895,
    'n_jobs': 16,
}

# Shuffled 5-fold split (alternative kept for reference: 'cls': 'GroupKFold', 'init': {'n_splits': 5}).
cv_spec = {
    'cls': 'KFold',
    'init': {'n_splits': 5, 'shuffle': True, 'random_state': 42},
}

param = {
    # Candidate features: every column except the row id, the target and the grouping key.
    'columns': df_train.columns.drop(['index', 'y','group']).tolist(),
    'cv': cv_spec,
    'scaler': {'cls': 'StandardScaler', 'init': {}, 'fit': {}},
    'model': {'cls': 'lgb.LGBMRegressor', 'init': lgbm_init, 'fit': {}},
    'metric': 'mean_absolute_error',
}

process_queue = PQueue(df_train, df_test, param, mytrial)

# Stages are queued in this order.
sort_features = SortFeatureSelectTopNProcess(top_n=200)
select_topn = RFESelectTopNProcess(n_features_remain=20, n_features_to_remove=10)
remove_useless = RFERemoveUselessFeaturesProcess()
for stage in (sort_features, select_topn, remove_useless):
    process_queue.insert_node(stage)

# A failing run is reported but does not stop the cell; the summary is printed either way.
try:
    result = process_queue.run()
except Exception as e:
    print(e.__str__())
print(len(process_queue.trial))
print(process_queue.param)

294
{'columns': ['dist_O_1_x', 'dist_C_0_y', 'atom_index_1_cycle_size_mean', 'molecule_type_dist_mean_div', 'cos_f0', 'dist_N_0_y', 'dist_to_type_std', 'molecule_atom_index_0_dist_min_diff', 'cos_center0_center1', 'adC3', 'dist_to_type_mean', 'adC2', 'inv_dist1R', 'molecule_atom_1_dist_min_diff', 'atom_index_1_n_cycle', 'min_molecule_atom_0_dist_xyz', 'molecule_atom_index_1_dist_max_div', 'molecule_atom_index_0_dist_min_div', 'molecule_atom_index_1_dist_std_diff', 'dist_H_0_y', 'cos_c0', 'dist_xyz', 'dist_C_1_y', 'dist_O_1_y', 'molecule_atom_index_0_dist_max_diff', 'dist_to_type_0_mean', 'mean_molecule_atom_1_dist_xyz', 'dist_H_0_x', 'mean_molecule_atom_0_dist_xyz', 'vander_O.y', 'dist_H_2_y', 'dist_H_2_x', 'molecule_atom_index_0_dist_max_div', 'cos_f1', 'atom_1_bond_lengths_mean', 'dist_C_2_y', 'molecule_atom_index_1_dist_min_div', 'cos_c1', 'max_molecule_atom_0_dist_xyz', 'dist_C_1_x', 'dist_O_0_x', 'distance_f0', 'atom_1_bond_lengths_max', 'atom_1_n_bonds', 'dist_C_0_x', 'dist_N_0_x

In [14]:
# Turn the accumulated trial records into a DataFrame for inspection.
df_trial = pd.DataFrame(mytrial)

In [15]:
# Metric history of the trials logged with the RFE top-N stage's message.
df_trial[df_trial['message']=='RFESelectTopNProcess to 20 features'][['datetime', 'message', 'nfeatures', 'train_metric_mean', 'val_metric_mean', 'trn_val_metric_diff']]

Unnamed: 0,datetime,message,nfeatures,train_metric_mean,val_metric_mean,trn_val_metric_diff
1,2019-07-13 12:40:20.546274,RFESelectTopNProcess to 20 features,200,0.42474,0.517648,0.092908
2,2019-07-13 12:51:12.084660,RFESelectTopNProcess to 20 features,190,0.4237,0.516118,0.092418
3,2019-07-13 13:01:23.232649,RFESelectTopNProcess to 20 features,180,0.423037,0.515225,0.092187
4,2019-07-13 13:11:00.076312,RFESelectTopNProcess to 20 features,170,0.423694,0.514258,0.090564
5,2019-07-13 13:20:00.772390,RFESelectTopNProcess to 20 features,160,0.427667,0.518147,0.09048
6,2019-07-13 13:28:21.264657,RFESelectTopNProcess to 20 features,150,0.423064,0.511754,0.08869
7,2019-07-13 13:36:06.755523,RFESelectTopNProcess to 20 features,140,0.424167,0.511186,0.087019
8,2019-07-13 13:43:11.926027,RFESelectTopNProcess to 20 features,130,0.422329,0.507644,0.085315
9,2019-07-13 13:49:37.474184,RFESelectTopNProcess to 20 features,120,0.427895,0.51384,0.085946
10,2019-07-13 13:55:29.032337,RFESelectTopNProcess to 20 features,110,0.42602,0.509787,0.083767


In [16]:
# Five trials of the remove-useless-features stage with the lowest mean validation metric.
df_trial[df_trial['message']=='RFERemoveUselessFeaturesProcess'][['datetime', 'message', 'nfeatures', 'train_metric_mean', 'val_metric_mean', 'trn_val_metric_diff']].sort_values(by=['val_metric_mean']).head()

Unnamed: 0,datetime,message,nfeatures,train_metric_mean,val_metric_mean,trn_val_metric_diff
174,2019-07-13 15:41:22.115034,RFERemoveUselessFeaturesProcess,67,0.419057,0.49112,0.072063
184,2019-07-13 15:46:20.891670,RFERemoveUselessFeaturesProcess,67,0.420238,0.492366,0.072128
291,2019-07-13 16:39:10.899668,RFERemoveUselessFeaturesProcess,66,0.419673,0.492434,0.072761
171,2019-07-13 15:39:52.129241,RFERemoveUselessFeaturesProcess,67,0.421021,0.492508,0.071487
224,2019-07-13 16:06:18.882925,RFERemoveUselessFeaturesProcess,67,0.419932,0.492779,0.072847


In [22]:
def objective(trial):
    """Optuna objective: mean validation metric of LightGBM on a train subsample.

    Hyper-parameters are drawn from ``trial``; the feature list comes from the
    global ``process_queue.param['columns']`` and the data from the global
    ``df_train``. Each evaluation is appended to the global ``mytrial`` by
    ``sk_process`` under the message ``'tune hyperparam'``.

    Every trial is scored on the same seeded subsample of at most 1,000,000
    rows. Previously the sample was redrawn unseeded for each trial, so
    differences between trials were partly sampling noise, and the call raised
    when ``df_train`` had fewer than 1,000,000 rows.

    Parameters
    ----------
    trial : optuna trial object
        Source of the suggested hyper-parameter values.

    Returns
    -------
    float
        Mean of the ``valid`` column of the history frame returned by ``sk_process``.
    """
    learning_rate = trial.suggest_uniform('learning_rate', .01, .5)
    feature_fraction = trial.suggest_uniform('feature_fraction', .6, 1)
    bagging_fraction = trial.suggest_uniform('bagging_fraction', 0.6, 1)
    min_data_in_leaf = trial.suggest_int('min_data_in_leaf', 200, 800)
    lambda_l1 = trial.suggest_loguniform('lambda_l1', 1e-6, 1e2)
    lambda_l2 = trial.suggest_loguniform('lambda_l2', 1e-6, 1e2)
    max_bin = trial.suggest_int('max_bin', 10, 100)
    num_leaves = trial.suggest_int('num_leaves', 4, 512)
    random_state = trial.suggest_int('random_state', 1, 9999)

    args={
        'columns':process_queue.param['columns'],
        'cv': {
            'cls': 'KFold',
            'init':{
                'n_splits': 5,
                'shuffle': True,
                'random_state': 42,
            },
        },
        'scaler': {
            'cls': 'StandardScaler',
            'init':{},
            'fit':{},
        },
        'model': {
            'cls': 'lgb.LGBMRegressor',
            'init': {
                'learning_rate':learning_rate,
                'feature_fraction':feature_fraction,
                'bagging_fraction':bagging_fraction,
                'min_data_in_leaf':min_data_in_leaf,
                'lambda_l1':lambda_l1,
                'lambda_l2':lambda_l2,
                'max_bin':max_bin,
                'num_leaves':num_leaves,
                'random_state':random_state,
                'n_jobs':16
            },
            'fit': {
            },
        },
        'metric':'mean_absolute_error',
    }

    # Cap the sample at the frame size and fix the seed so all trials see the same rows.
    n_rows = min(1000000, len(df_train))
    df_sample = df_train.sample(n_rows, random_state=42)

    df_his, df_feature_importances, df_valid_pred, df_test_pred =  sk_process(df_sample, args, 'tune hyperparam', trial=mytrial, is_output_feature_importance=False, trial_level=0)
    val_metric_mean = np.mean(df_his.valid)
    return val_metric_mean

# Create a study with optuna's defaults and evaluate the objective for 200 trials.
study = optuna.create_study()
study.optimize(objective, n_trials=200)

[I 2019-07-08 12:43:35,887] Finished trial#0 resulted in value: 1.9479356066616114. Current best value is 1.9479356066616114 with parameters: {'learning_rate': 0.2557043291872395, 'feature_fraction': 0.8402360940571069, 'bagging_fraction': 0.6471727275908338, 'min_data_in_leaf': 762, 'lambda_l1': 0.001260157167891001, 'lambda_l2': 1.8109804288577593, 'max_bin': 49, 'num_leaves': 6, 'random_state': 8920}.
[I 2019-07-08 12:44:05,257] Finished trial#1 resulted in value: 1.0277087290901945. Current best value is 1.0277087290901945 with parameters: {'learning_rate': 0.467907656066654, 'feature_fraction': 0.9296597716002054, 'bagging_fraction': 0.88353122869927, 'min_data_in_leaf': 583, 'lambda_l1': 0.4844715238697431, 'lambda_l2': 8.647503479325444, 'max_bin': 78, 'num_leaves': 143, 'random_state': 2998}.
[I 2019-07-08 12:44:20,441] Finished trial#2 resulted in value: 1.5398453432881518. Current best value is 1.0277087290901945 with parameters: {'learning_rate': 0.467907656066654, 'feature_

[I 2019-07-08 13:01:35,485] Finished trial#38 resulted in value: 2.0455410292427043. Current best value is 0.9445406374765357 with parameters: {'learning_rate': 0.3003750010473155, 'feature_fraction': 0.7954806411666246, 'bagging_fraction': 0.825122687823091, 'min_data_in_leaf': 201, 'lambda_l1': 0.00010051426394331587, 'lambda_l2': 3.2239054643990496e-06, 'max_bin': 72, 'num_leaves': 234, 'random_state': 2988}.
[I 2019-07-08 13:02:07,136] Finished trial#39 resulted in value: 0.976087813882492. Current best value is 0.9445406374765357 with parameters: {'learning_rate': 0.3003750010473155, 'feature_fraction': 0.7954806411666246, 'bagging_fraction': 0.825122687823091, 'min_data_in_leaf': 201, 'lambda_l1': 0.00010051426394331587, 'lambda_l2': 3.2239054643990496e-06, 'max_bin': 72, 'num_leaves': 234, 'random_state': 2988}.
[I 2019-07-08 13:02:28,519] Finished trial#40 resulted in value: 1.0830722403332604. Current best value is 0.9445406374765357 with parameters: {'learning_rate': 0.300375

[I 2019-07-08 13:20:35,936] Finished trial#76 resulted in value: 0.9479237571940843. Current best value is 0.9445406374765357 with parameters: {'learning_rate': 0.3003750010473155, 'feature_fraction': 0.7954806411666246, 'bagging_fraction': 0.825122687823091, 'min_data_in_leaf': 201, 'lambda_l1': 0.00010051426394331587, 'lambda_l2': 3.2239054643990496e-06, 'max_bin': 72, 'num_leaves': 234, 'random_state': 2988}.
[I 2019-07-08 13:20:57,442] Finished trial#77 resulted in value: 1.244288848337931. Current best value is 0.9445406374765357 with parameters: {'learning_rate': 0.3003750010473155, 'feature_fraction': 0.7954806411666246, 'bagging_fraction': 0.825122687823091, 'min_data_in_leaf': 201, 'lambda_l1': 0.00010051426394331587, 'lambda_l2': 3.2239054643990496e-06, 'max_bin': 72, 'num_leaves': 234, 'random_state': 2988}.
[I 2019-07-08 13:21:31,993] Finished trial#78 resulted in value: 0.9763902372769845. Current best value is 0.9445406374765357 with parameters: {'learning_rate': 0.300375

[I 2019-07-08 13:39:40,905] Finished trial#114 resulted in value: 1.0492230333398693. Current best value is 0.9368553118374205 with parameters: {'learning_rate': 0.3166712257724028, 'feature_fraction': 0.9071277735212887, 'bagging_fraction': 0.7422147877975603, 'min_data_in_leaf': 416, 'lambda_l1': 1.5701746597674077e-06, 'lambda_l2': 1.7903507490427843e-05, 'max_bin': 98, 'num_leaves': 253, 'random_state': 6406}.
[I 2019-07-08 13:40:13,419] Finished trial#115 resulted in value: 0.9610546104863019. Current best value is 0.9368553118374205 with parameters: {'learning_rate': 0.3166712257724028, 'feature_fraction': 0.9071277735212887, 'bagging_fraction': 0.7422147877975603, 'min_data_in_leaf': 416, 'lambda_l1': 1.5701746597674077e-06, 'lambda_l2': 1.7903507490427843e-05, 'max_bin': 98, 'num_leaves': 253, 'random_state': 6406}.
[I 2019-07-08 13:40:40,681] Finished trial#116 resulted in value: 1.015126817371264. Current best value is 0.9368553118374205 with parameters: {'learning_rate': 0.3

[I 2019-07-08 13:59:47,959] Finished trial#152 resulted in value: 0.951212713884505. Current best value is 0.9368553118374205 with parameters: {'learning_rate': 0.3166712257724028, 'feature_fraction': 0.9071277735212887, 'bagging_fraction': 0.7422147877975603, 'min_data_in_leaf': 416, 'lambda_l1': 1.5701746597674077e-06, 'lambda_l2': 1.7903507490427843e-05, 'max_bin': 98, 'num_leaves': 253, 'random_state': 6406}.
[I 2019-07-08 14:00:26,436] Finished trial#153 resulted in value: 0.9329697749105812. Current best value is 0.9329697749105812 with parameters: {'learning_rate': 0.3414884085893545, 'feature_fraction': 0.9968942693539474, 'bagging_fraction': 0.6304650536121073, 'min_data_in_leaf': 391, 'lambda_l1': 1.0451097740407883e-06, 'lambda_l2': 0.039471193246930845, 'max_bin': 98, 'num_leaves': 256, 'random_state': 8004}.
[I 2019-07-08 14:01:03,998] Finished trial#154 resulted in value: 0.9415126520731034. Current best value is 0.9329697749105812 with parameters: {'learning_rate': 0.341

[I 2019-07-08 14:21:20,485] Finished trial#190 resulted in value: 1.0305378560799183. Current best value is 0.9300692164665992 with parameters: {'learning_rate': 0.29426859872688016, 'feature_fraction': 0.9991242137638949, 'bagging_fraction': 0.6352351289978297, 'min_data_in_leaf': 431, 'lambda_l1': 6.313437511621462e-06, 'lambda_l2': 0.016604298397879098, 'max_bin': 100, 'num_leaves': 255, 'random_state': 9214}.
[I 2019-07-08 14:21:52,453] Finished trial#191 resulted in value: 0.9697525930426156. Current best value is 0.9300692164665992 with parameters: {'learning_rate': 0.29426859872688016, 'feature_fraction': 0.9991242137638949, 'bagging_fraction': 0.6352351289978297, 'min_data_in_leaf': 431, 'lambda_l1': 6.313437511621462e-06, 'lambda_l2': 0.016604298397879098, 'max_bin': 100, 'num_leaves': 255, 'random_state': 9214}.
[I 2019-07-08 14:22:24,841] Finished trial#192 resulted in value: 0.9465400774373991. Current best value is 0.9300692164665992 with parameters: {'learning_rate': 0.29

In [23]:
# Rebuild the trial table and show the five tuning runs with the lowest mean validation metric.
df_trial = pd.DataFrame(mytrial)
df_trial[df_trial['message']=='tune hyperparam'][['datetime', 'message', 'nfeatures', 'train_metric_mean', 'val_metric_mean', 'trn_val_metric_diff']].sort_values(by=['val_metric_mean']).head()

Unnamed: 0,datetime,message,nfeatures,train_metric_mean,val_metric_mean,trn_val_metric_diff
212,2019-07-08 14:06:29.634383,tune hyperparam,38,0.863236,0.930069,0.066833
215,2019-07-08 14:08:20.456876,tune hyperparam,38,0.862214,0.930458,0.068244
234,2019-07-08 14:18:47.324132,tune hyperparam,38,0.840667,0.932579,0.091912
202,2019-07-08 14:00:26.322634,tune hyperparam,38,0.855637,0.93297,0.077333
145,2019-07-08 13:30:42.870039,tune hyperparam,38,0.86663,0.936855,0.070225


In [11]:
# Frozen modelling configuration: a hard-coded feature list plus fixed LightGBM
# hyper-parameters, 5-fold shuffled KFold, StandardScaler and MAE as the metric.
# NOTE(review): presumably pasted from an earlier selection/tuning run — confirm provenance.
param = {'columns': ['yukawa_H.y', 'bond_lengths_mean_y', 'adC1', 'bond_lengths_mean_x', 'eem_0', 'inv_dist1R', 'eem_1', 'dist_to_type_std', 'molecule_atom_index_0_dist_min_div', 'cos_c0', 'cos_f0', 'max_distance_x', 'atom_index_1_cycle_size_mean', 'type', 'molecule_atom_index_0_dist_max_div', 'cos_c1', 'yukawa_H.x', 'distance', 'vander_O.y', 'nbond', 'L2dist', 'coulomb_O.y', 'min_molecule_atom_1_dist_xyz', 'cos_center1', 'vander_C.x', 'inv_dist0R', 'vander_N.y', 'distC0', 'adN1', 'molecule_atom_index_0_dist_max_diff', 'distC1', 'min_molecule_atom_0_dist_xyz', 'distance_closest_1', 'dist_to_type_1_mean', 'yukawa_N.x', 'molecule_atom_index_0_dist_mean_diff', 'n_bonds_y', 'molecule_atom_index_0_dist_std_div', 'vander_H.x', 'dist_to_type_mean', 'molecule_atom_index_1_dist_max_diff', 'cos_c0_c1', 'inv_distP', 'molecule_atom_index_0_dist_std_diff', 'vander_C.y', 'dist_to_type_0_mean', 'inv_dist1E', 'inv_dist1', 'adC2', 'vander_O.x', 'yukawa_O.y', 'inv_distPR', 'bond_lengths_std_y', 'molecule_atom_index_0_dist_min_diff', 'yukawa_O.x', 'adC3'], 'cv': {'cls': 'KFold', 'init': {'n_splits': 5, 'shuffle': True, 'random_state': 42}}, 'scaler': {'cls': 'StandardScaler', 'init': {}, 'fit': {}}, 'model': {'cls': 'lgb.LGBMRegressor', 'init': {'learning_rate': 0.35395923077843333, 'feature_fraction': 0.8840483697334669, 'bagging_fraction': 0.7017457378676857, 'min_data_in_leaf': 616, 'lambda_l1': 0.00013058988949929333, 'lambda_l2': 0.004991992636437704, 'max_bin': 74, 'num_leaves': 255, 'random_state': 2928, 'n_jobs': 16}, 'fit': {}}, 'metric': 'mean_absolute_error'}

In [13]:
# Train one model per coupling type with the frozen `param`, then stack the
# per-type results. The trial log is reset so it only contains these runs.
mytrial=[]

# Collect the per-type frames and concatenate once at the end, instead of growing
# four DataFrames inside the loop (repeated copying, and concatenating onto an
# initially empty frame).
his_parts, importance_parts, valid_parts, test_parts = [], [], [], []
for t in  df_train.type.unique().tolist():
    df_his_i, df_feature_importances_i, df_valid_pred_i, df_test_pred_i =  sk_process(df_train[df_train['type']==t].reset_index(drop=True), param, 'modeling', df_test=df_test[df_test['type']==t].reset_index(drop=True), trial=mytrial, is_output_feature_importance=False, trial_level=1)
    his_parts.append(df_his_i)
    importance_parts.append(df_feature_importances_i)
    valid_parts.append(df_valid_pred_i)
    test_parts.append(df_test_pred_i)


def _stack_parts(parts):
    """Row-wise concat of the non-None parts; an empty DataFrame when there are none."""
    kept = [part for part in parts if part is not None]
    return pd.concat(kept, axis=0) if kept else pd.DataFrame()


df_his = _stack_parts(his_parts)
df_feature_importances = _stack_parts(importance_parts)
df_valid_pred = _stack_parts(valid_parts)
df_test_pred = _stack_parts(test_parts)

In [14]:
# Show the last eight logged runs (one per coupling type when the per-type loop ran last).
df_trial = pd.DataFrame(mytrial)
df_trial[['datetime', 'message', 'nfeatures', 'train_metric_mean', 'val_metric_mean', 'trn_val_metric_diff']].tail(8)

Unnamed: 0,datetime,message,nfeatures,train_metric_mean,val_metric_mean,trn_val_metric_diff
0,2019-07-12 13:20:09.719245,modeling,56,1.004764,1.114777,0.110013
1,2019-07-12 13:20:24.793253,modeling,56,0.237386,0.278988,0.041602
2,2019-07-12 13:20:26.203270,modeling,56,0.505092,0.584718,0.079626
3,2019-07-12 13:20:30.523850,modeling,56,0.258565,0.309427,0.050862
4,2019-07-12 13:21:22.323074,modeling,56,0.484731,0.524041,0.039311
5,2019-07-12 13:21:48.145118,modeling,56,0.252262,0.283812,0.03155
6,2019-07-12 13:22:59.999031,modeling,56,0.488842,0.518543,0.029701
7,2019-07-12 13:23:06.462814,modeling,56,0.161942,0.196213,0.034271


In [24]:
# Out-of-fold MAE: first over all rows, then separately for every coupling type.
# Predictions are ordered by 'index' and compared positionally with df_train,
# which is therefore expected to already be ordered by 'index'.
overall_pred = df_valid_pred.sort_values(by=['index']).reset_index(drop=True)
overall_true = df_train.reset_index(drop=True)
print(mean_absolute_error(overall_pred.predict.values, overall_true.y.values))

for t in df_train.type.unique().tolist():
    # Row ids belonging to the current type.
    index = df_train[df_train['type']==t]['index'].values
    pred_rows = df_valid_pred[df_valid_pred['index'].isin(index)]
    pred_rows = pred_rows.sort_values(by=['index']).reset_index(drop=True)
    true_rows = df_train[df_train['index'].isin(index)].reset_index(drop=True)
    print(t, mean_absolute_error(pred_rows.predict.values, true_rows.y.values))

0.5452372461926065
0 1.1147768568040992
3 0.27898778247588285
1 0.5847177038306108
4 0.3094267042353605
2 0.5240411403290354
6 0.2838115643936403
5 0.5185433461387731
7 0.19621299730712535


In [20]:
# Put the stacked per-type test predictions back into 'index' order with a fresh 0..n-1 row index.
df_test_pred = df_test_pred.sort_values(by=['index'])
df_test_pred = df_test_pred.reset_index(drop=True)

In [21]:
# Peek at the last rows: one 'index' column plus one prediction column per fold (0..4 in the recorded output).
df_test_pred.tail()

Unnamed: 0,index,0,1,2,3,4
2505537,7163684,0.680527,0.433437,0.668406,1.115294,0.385468
2505538,7163685,4.17544,2.263447,3.028679,4.096824,3.920649
2505539,7163686,2.660563,3.140089,2.5821,3.866638,2.077487
2505540,7163687,4.250692,5.84945,3.293861,4.557056,5.493906
2505541,7163688,114.278557,110.288347,116.321,118.92149,118.534586


In [None]:
# Alternative to the per-type loop: fit one model on all rows with the same `param`.
# Resets the trial log and overwrites df_his / df_valid_pred / df_test_pred.
mytrial=[]
df_his, df_feature_importances, df_valid_pred, df_test_pred =  sk_process(df_train, param, 'modeling', df_test=df_test, trial=mytrial, is_output_feature_importance=False, trial_level=1)

In [None]:
# Show the most recent logged run.
df_trial = pd.DataFrame(mytrial)
df_trial[['datetime', 'message', 'nfeatures', 'train_metric_mean', 'val_metric_mean', 'trn_val_metric_diff']].tail(1)

In [22]:
# Write the submission file: the prediction is the row-wise mean over all
# non-'index' columns of df_test_pred.
# `idx` is only a tag used in the output file name.
idx=712
# df_test_pred = df_trial.loc[idx]['df_test_pred']

# Make sure the target folder exists so to_csv does not fail on a fresh checkout.
os.makedirs('../../data/submission', exist_ok=True)

# Pair ids and averaged predictions by position. Assigning the 'index' Series to a
# frame with a fresh RangeIndex aligned it by label, which would scramble ids or
# insert NaN whenever df_test_pred had not been re-indexed to 0..n-1 beforehand.
df_submit = pd.DataFrame({
    'scalar_coupling_constant': np.mean(df_test_pred.drop(columns=['index']).values, axis=1),
    'id': df_test_pred['index'].values,
})
df_submit.to_csv('../../data/submission/submission_lgbm_{}.csv'.format(idx), index=False)