In [1]:
%%HTML
<style>
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [2]:
# import sys
# !{sys.executable} -m pip install optuna

In [1]:
import sys
sys.path.append("../..") # Adds higher directory to python modules path.
from utilities import aggregate_feature_calculators
from utilities import aggregate_feature_calculators_setting as aggcal
from utilities.parallel import Parallel
from utilities.dfdb import DFDB

from utilities.process.pqueue import *
from utilities.process.pnode import *
from utilities.process.putilities import *

Using TensorFlow backend.


In [2]:
import numpy as np
import pandas as pd
import os
import time
import datetime
import json
import copy
import gc
import warnings
from tqdm import tqdm_notebook, tqdm

import optuna

import lightgbm as lgb
import xgboost as xgb

from catboost import CatBoostRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold,TimeSeriesSplit, GroupKFold
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.svm import NuSVR, SVR
from sklearn.metrics import mean_absolute_error
from sklearn import preprocessing

import eli5
from eli5.sklearn import PermutationImportance

import networkx as nx

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
pd.set_option("display.max_rows", 500)

In [4]:
csv_file_folder =  '../../data/input'
file_folder =  '../../data/feature'

In [5]:
LGB_PARAMS = {
    'objective': 'regression',
    'metric': 'mae',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    'learning_rate': 0.2,
    'num_leaves': 256,
    'min_child_samples': 79,
    'max_depth': 9,
    'subsample_freq': 1,
    'subsample': 0.9,
    'bagging_seed': 11,
    'reg_alpha': 0.1,
    'reg_lambda': 0.3,
    'colsample_bytree': 1.0
}

In [None]:
for i in [0,1,2,3,4,5,6,7]:
    df_train2 = pd.read_pickle(f'{file_folder}/df_train2.gzde', compression='gzip')
    df_train2_plus = pd.read_pickle(f'{file_folder}/df_train2_plus.gzde', compression='gzip')
    df_train2_plus = df_train2_plus.rename(columns={'id':'index'})

    df_train2 = df_train2[df_train2['type']==i]
    df_train2 = pd.merge(df_train2, df_train2_plus, how='left', on='index')
    df_train2 = df_train2.fillna(0)
    
    columns = df_train2.columns.drop(['index', 'group', 'scalar_coupling_constant', 'fc', 'sd','pso','dso'])

    X_data = df_train2.drop(['index', 'group', 'scalar_coupling_constant', 'fc', 'sd','pso','dso'], axis=1).values.astype('float32')
    y_data = df_train2['fc'].values.astype('float32')

    group_kfold = GroupKFold(n_splits=8)
    splits = group_kfold.split(X_data, y_data, df_train2.group)
    for train_index, test_index in splits:
        break
    X_train, X_val, y_train, y_val = X_data[train_index], X_data[test_index], y_data[train_index], y_data[test_index]

    model = lgb.LGBMRegressor(**LGB_PARAMS, n_estimators=1500, n_jobs = -1)
    model.fit(X_train, y_train,  eval_set=[(X_train, y_train), (X_val, y_val)], eval_metric='mae', verbose=False, early_stopping_rounds=200)
    y_pred = model.predict(X_val)
    y_pred_train = model.predict(X_train)
    error = np.log(mean_absolute_error(y_val, y_pred))
    error_train = np.log(mean_absolute_error(y_train, y_pred_train))
    print('init', i, error, error_train)

    perm = PermutationImportance(model, random_state=42).fit(X_val, y_val)
    df_feature_importances_i2 = eli5.explain_weights_dfs(perm)['feature_importances']
    df_feature_importances_i2['feature2'] = df_feature_importances_i2['feature'].apply(lambda x : columns[int(x.replace('x',''))])
    df_feature_importances_i2 = df_feature_importances_i2.sort_values(by=['weight'], ascending=False)
    df_feature_importances_i2 = df_feature_importances_i2.reset_index(drop=True)
    
    columns_comb =[]
    best_error = 999
    his = []
    for col in df_feature_importances_i2.feature2.tolist():

        X_data = df_train2[[col]+columns_comb].values.astype('float32')
        y_data = df_train2['scalar_coupling_constant'].values.astype('float32')

        group_kfold = GroupKFold(n_splits=8)
        splits = group_kfold.split(X_data, y_data, df_train2.group)
        for train_index, test_index in splits:
            break
        X_train, X_val, y_train, y_val = X_data[train_index], X_data[test_index], y_data[train_index], y_data[test_index]

        model = lgb.LGBMRegressor(**LGB_PARAMS, n_estimators=1500, n_jobs = -1)
        model.fit(X_train, y_train,  eval_set=[(X_train, y_train), (X_val, y_val)], eval_metric='mae', verbose=False, early_stopping_rounds=200)
        y_pred = model.predict(X_val)
        y_pred_train = model.predict(X_train)
        error = np.log(mean_absolute_error(y_val, y_pred))
        error_train = np.log(mean_absolute_error(y_train, y_pred_train))
        print('col', col, len(columns_comb), 'error', error, 'error_train', error_train)
        his.append({'col':[col]+columns_comb, 'error':error, 'error_train':error_train})

        if error < best_error:
            best_error = error
            columns_comb += [col]
            j = 0
        else:
            j += 1

        if (len(columns_comb)>=70) or (j>=100):
            print(columns_comb)
            break
    

init 0 -0.4240007342043722 -1.4316231473415328
col tertiary_angle_0 0 error 1.6222798164456236 error_train 1.6293130461726788
col d_4_3 1 error 1.329692197086896 error_train 1.3072353888549098
col mulliken_atom_0 2 error 1.0360023559196712 error_train 0.9468516903837791
col inv_dist1R 3 error 0.7542507970282034 error_train 0.5686045118602273
col dist_O_0_x 4 error 0.4855272695484993 error_train 0.2539744908432718
col dist_to_type_mean 5 error 0.4204102140319862 error_train 0.18178968582507657
col yukawa_H.y 6 error 0.35563944722699947 error_train 0.07446116073946782
col dist_to_type_1_mean 7 error 0.3484981044670654 error_train 0.06472108212540229
col atom_1_n_bonds 8 error 0.3427815815836598 error_train 0.059656515812919794
col atom_index_1_ hybridization 9 error 0.3395066154759798 error_train 0.05476354614365818
col dist_to_type_std 10 error 0.3415586307520738 error_train 0.05484443087407
col atom_1_bond_lengths_mean 10 error 0.20972002643579354 error_train -0.06102203404063144
col d

col molecule_atom_0_dist_std_diff 45 error -1.174437371344144 error_train -2.5242502029061153
col gap 45 error -1.209492482562665 error_train -2.4381282544734177
col molecule_type_dist_max 45 error -1.2157004442634993 error_train -2.4934057349880243
col dist_H_2_y 45 error -1.1975016349436596 error_train -2.4595040786372975
col dist_pos_1 45 error -1.1952168778201815 error_train -2.481719992748679
col atom_index_1_dist_q.01 45 error -1.1945340876442305 error_train -2.4887673425560815
col max_molecule_atom_0_dist_xyz 45 error -1.203317157398645 error_train -2.4952977248073314
col tertiary_atom_2 45 error -1.218890166370688 error_train -2.4364789065866668
col adH1 45 error -1.2181782992490824 error_train -2.4460351618625005
col atom_index_0_sv_1 45 error -1.2026472126606924 error_train -2.4965584717584908
col adC4 45 error -1.194026077444577 error_train -2.4767968307914416
col molecule_atom_index_0_dist_mean_div 45 error -1.215630967644945 error_train -2.5084752835230835
col dist_no_bond

In [None]:
df_train2 = pd.read_pickle(f'{file_folder}/df_train2.gzde', compression='gzip')
df_train2_plus = pd.read_pickle(f'{file_folder}/df_train2_plus.gzde', compression='gzip')
df_train2_plus = df_train2_plus.rename(columns={'id':'index'})
df_train2 = pd.merge(df_train2, df_train2_plus, how='left', on='index')
df_train2 = df_train2.fillna(0)

In [None]:
df_test2 = pd.read_pickle(f'{file_folder}/df_test2.gzde', compression='gzip')
df_test2_plus = pd.read_pickle(f'{file_folder}/df_test2_plus.gzde', compression='gzip')
df_test2_plus = df_test2_plus.rename(columns={'id':'index'})
df_test2 = pd.merge(df_test2, df_test2_plus, how='left', on='index')
df_test2 = df_test2.fillna(0)

In [None]:
df_train = pd.DataFrame()
df_test = pd.DataFrame()

for f in [f for f in os.listdir('./') if f.startswith('oof')][:]:
    
    if 'train' in f:
        df_i = pd.read_pickle(f, compression='gzip')
        df_train[f'{f}'.replace('_train', '')] = df_i.predict
        if 'index' not in df_train.columns:
            df_train['index'] = df_i['index']
        
    if 'test' in f:
        df_i = pd.read_pickle(f, compression='gzip')
        df_test[f'{f}'.replace('_test', '')] = df_i.drop(columns=['index']).mean(axis=1)
        if 'index' not in df_test.columns:
            df_test['index'] = df_i['index']

In [47]:
all_cols = []
for cols in type_columns:
     all_cols += cols
all_cols = list(set(all_cols))

In [50]:
#'scalar_coupling_constant', 'fc', 'sd','pso','dso'

df_train = pd.merge(df_train2[['index', 'type', 'group', 'scalar_coupling_constant', 'fc', 'sd','pso','dso']+all_cols], df_train, how='left', on='index')
df_train['y'] = df_train2['scalar_coupling_constant']
df_test = pd.merge(df_test2[['index', 'type', 'group']+all_cols], df_test, how='left', on='index')

In [51]:
base_param = {'columns': [], 'cv': {'cls': 'GroupKFold', 'init': {'n_splits': 8}}, 'scaler': {'cls': 'StandardScaler', 'init': {}, 'fit': {}}, 'model': {'cls': 'lgb.LGBMRegressor', 'init': {
        'learning_rate': 0.2,
    'num_leaves': 128,
    'min_child_samples': 79,
    'max_depth': 9,
    'subsample_freq': 1,
    'subsample': 0.9,
    'bagging_seed': 11,
    'reg_alpha': 0.1,
    'reg_lambda': 0.3,
    'colsample_bytree': 1.0, 'n_estimators':3000, 'n_jobs': 16}, 'fit': {}}, 'metric': 'mean_absolute_error'}

In [54]:
oof_cols = ['oof_dso', 'oof_fc', 'oof_pso', 'oof_scalar_coupling_constant', 'oof_sd']

In [55]:
type_columns = [oof_cols+['tertiary_angle_0', 'inv_dist1R', 'd_4_3', 'yukawa_H.y', 'mulliken_atom_0', 'dist_to_type_mean', 'dist_O_0_x', 'atom_1_n_bonds', 'dist_to_type_1_mean', 'atom_1_bond_lengths_mean', 'dist_xyz', 'dist_C_0_y', 'd_3_2', 'atom_index_1_ hybridization', 'atom_index_1_cycle_size_mean', 'dist_O_0_y', 'eem_1', 'inv_distPE', 'd_4_2', 'inv_distPR', 'dist_no_bond_min_y', 'dist_H_2_x', 'dist_H_1_x', 'tertiary_distance_2', 'dist_C_3_x', 'dist_O_1_x', 'atom_1_bond_lengths_std', 'dist_C_2_y', 'dist_C_2_x', 'mulliken_atom_1', 'cos_center1', 'dist_O_1_y', 'tertiary_angle_3', 'dist_H_2_y', 'dist_N_0_y', 'dist_C_1_y', 'inv_dist1E', 'distance_y', 'tertiary_angle_2', 'dist_N_0_x', 'd_2_1', 'molecule_atom_index_0_dist_max_div', 'adC1', 'adN1', 'd_4_0', 'dist_C_3_y', 'atom_3', 'distC0', 'tertiary_distance_4', 'tertiary_angle_5', 'd_5_1', 'molecule_atom_index_1_dist_min_diff', 'dist_C_4_y', 'dist_H_0_y', 'cos_f0', 'd_5_0', 'tertiary_distance_3', 'd_5_2', 'tertiary_atom_1', 'dist_C_4_x', 'cos_c0_f0', 'atom_index_0_sv_3', 'rc_C', 'cos_f1', 'tertiary_angle_8', 'dist_O_2_y', 'max_molecule_atom_1_dist_xyz', 'dist_F_0_y', 'atom_index_1_ aromatic', 'tertiary_angle_26', 'type_0'],
oof_cols+['dist_H_0_y', 'd_3_2', 'dist_C_0_y', 'atom_index_1_ aromatic', 'atom_1_bond_lengths_mean', 'bond_atom', 'inv_dist1R', 'd_3_1', 'mulliken_atom_0', 'dist_H_0_x', 'dist_O_0_y', 'dist_C_1_x', 'tertiary_angle_0', 'dist_C_1_y', 'vander_C.y', 'dist_H_1_y', 'mulliken_atom_1', 'inv_dist0R', 'd_1_0', 'tertiary_distance_0', 'tertiary_angle_2', 'atom_index_1_explicit_valence', 'dist_N_0_y', 'inv_distPR', 'dist_C_2_x', 'vander_H.x', 'd_4_2', 'atom_index_0_eigv_max', 'tertiary_distance_2', 'dist_H_1_x', 'dist_N_1_x', 'dist_C_3_x', 'cos_f0', 'atom_index_1_sv_2', 'max_molecule_atom_0_dist_xyz', 'd_2_1'],
oof_cols+['tertiary_atom_0', 'inv_dist0', 'dist_no_bond_min_x', 'atom_index_1_ hybridization', 'tertiary_angle_0', 'tertiary_angle_1', 'dist_O_0_x', 'cos_c0', 'd_5_2', 'tertiary_atom_1', 'cos_f0', 'dist_H_0_x', 'd_3_1', 'atom_index_1_degree', 'dist_C_0_y', 'adC2', 'dist_C_3_x', 'vander_O.y', 'mulliken_atom_1', 'atom_7', 'tertiary_angle_2', 'd_2_1', 'atom_3', 'd_5_1', 'd_6_2', 'd_4_1', 'tertiary_atom_2', 'molecule_atom_index_1_dist_min_diff', 'd_4_2', 'dist_C_2_x', 'cos_c0_f0', 'd_6_0', 'dist_O_0_y', 'd_4_3', 'd_3_0', 'd_7_0', 'd_3_2', 'inv_dist0R', 'atom_8', 'dist_C_1_x', 'd_6_1', 'd_2_0', 'd_8_1', 'mulliken_atom_0', 'dist_N_0_x', 'atom_4', 'tertiary_distance_2', 'd_7_2', 'dist_C_0_x', 'atom_1_bond_lengths_mean', 'dist_C_1_y', 'bond_atom', 'd_7_1', 'd_4_0', 'distC0', 'atom_index_1_cycle_size_mean', 'cos_c0_c1', 'tertiary_angle_3', 'dist_O_1_x', 'atom_index_1_n_cycle', 'max_molecule_atom_0_dist_xyz', 'molecule_atom_index_0_dist_max_div', 'atom_5', 'gap', 'cos_c1', 'dist_N_0_y', 'd_6_3', 'dist_C_3_y', 'inv_distP', 'dist_C_4_y'],
oof_cols+['cos_c0', 'd_4_3', 'cos_c0_c1', 'molecule_atom_index_0_dist_min_diff', 'tertiary_atom_1', 'd_3_2', 'd_1_0', 'dist_H_0_y', 'mulliken_atom_0', 'mulliken_atom_1', 'dist_N_0_x', 'link0', 'tertiary_atom_2', 'dist_C_1_y', 'dist_C_1_x', 'cos_f0', 'dist_C_0_y', 'cos_f1', 'd_3_1', 'tertiary_distance_1', 'dist_O_0_y', 'cos_f0_f1', 'adC1', 'd_5_3', 'inv_distP', 'edge_4', 'd_6_2', 'dist_N_0_y', 'tertiary_distance_2', 'dist_O_0_x', 'cos_c1_f1', 'd_3_0', 'd_5_2', 'dist_C_0_x', 'adN1', 'cos_c0_f0', 'd_4_1', 'max_distance_y', 'dist_C_2_y', 'atom_5', 'adC3', 'dist_to_type_1_mean', 'vander_H.x', 'dist_C_3_y', 'dist_H_3_x', 'molecule_atom_index_0_dist_max_div', 'atom_7', 'dist_C_3_x', 'd_5_1', 'dist_H_3_y', 'atom_index_0_eigv_max', 'atom_6', 'dist_H_2_x', 'atom_index_1_sv_0', 'molecule_atom_index_1_dist_std_div', 'link1'],
oof_cols+['d_3_1', 'dist_H_1_x', 'd_5_0', 'd_4_0', 'yukawa_H.x', 'inv_dist0', 'd_6_0', 'd_4_1', 'cos_c0', 'atom_3', 'dist_C_0_y', 'molecule_atom_index_0_dist_std_div', 'cos_c0_c1', 'd_4_2', 'min_molecule_atom_0_dist_xyz', 'sd_molecule_atom_0_dist_xyz', 'd_2_1', 'adC2', 'd_3_0', 'dist_C_1_y', 'd_4_3', 'dist_H_0_x', 'vander_C.x', 'd_5_3', 'dist_H_1_y', 'tertiary_distance_3', 'd_2_0', 'dist_O_0_x', 'd_5_1', 'dist_O_0_y', 'adC3', 'inv_dist0R', 'dist_C_3_y', 'atom_index_1_ hybridization', 'cos_f0', 'dist_C_2_x', 'd_5_2', 'd_6_1', 'dist_C_0_x', 'atom_1_bond_lengths_min', 'mulliken_atom_1', 'distance_farthest_0', 'tertiary_distance_1', 'min_molecule_atom_1_dist_xyz', 'yukawa_O.y', 'atom_0_bond_lengths_max'],
oof_cols+['tertiary_angle_0', 'd_2_1', 'cos_c0', 'atom_1_bond_lengths_mean', 'd_3_1', 'd_2_0', 'tertiary_distance_1', 'd_3_2', 'tertiary_angle_1', 'cos_f0', 'tertiary_distance_2', 'dist_C_0_x', 'dist_H_0_x', 'dist_C_2_x', 'dist_O_0_y', 'd_4_1', 'd_4_3', 'atom_index_1_cycle_size_mean', 'molecule_atom_index_0_dist_min_diff', 'tertiary_atom_2', 'atom_4', 'cos_c0_f0', 'tertiary_distance_3', 'd_3_0', 'dist_median_bond_y', 'd_5_2', 'adC3', 'atom_5', 'dist_H_1_x', 'molecule_atom_index_0_dist_min_div', 'gap', 'molecule_atom_index_1_dist_min_div', 'dist_O_0_x', 'cos_c1', 'dist_C_0_y', 'd_5_1', 'dist_N_0_y', 'dist_C_3_y', 'dist_no_bond_min_y', 'd_4_0', 'dist_N_0_x', 'd_4_2', 'max_molecule_atom_0_dist_xyz', 'cos_c0_c1', 'adC2', 'atom_index_1_n_cycle', 'd_5_0', 'd_6_1', 'dist_C_4_y', 'dist_O_1_y', 'd_7_2', 'tertiary_angle_2', 'd_6_2', 'mulliken_atom_1', 'atom_6', 'd_7_3', 'dist_O_1_x'],
oof_cols+['cos_c0_c1', 'atom_4', 'atom_5', 'molecule_atom_index_0_dist_min_diff', 'cos_c1', 'max_molecule_atom_1_dist_xyz', 'dist_to_type_std', 'd_3_2', 'cos_c0', 'dist_O_0_x', 'd_4_3', 'atom_6', 'dist_O_0_y', 'tertiary_atom_1', 'dist_C_2_y', 'd_4_2', 'dist_C_1_y', 'atom_7', 'tertiary_angle_1', 'dist_H_0_y', 'dist_no_bond_min_y', 'distance_c1', 'dist_C_2_x', 'linkM0', 'd_6_2', 'dist_C_0_y', 'd_5_2', 'd_7_2', 'dist_C_3_y', 'd_6_0', 'dihedral', 'max_molecule_atom_0_dist_xyz', 'd_7_3', 'd_6_1', 'dist_H_1_y', 'tertiary_atom_2', 'd_4_0', 'tertiary_atom_0', 'tertiary_angle_3', 'dist_C_0_x', 'dist_to_type_0_mean', 'dist_N_0_y', 'd_4_1', 'cos_c1_f1', 'cos_f0', 'dist_xyz', 'adC2', 'd_5_3', 'cos_f0_f1', 'gap', 'd_7_0', 'cos_f1', 'tertiary_distance_1', 'molecule_atom_index_0_dist_max_diff', 'd_2_1'],
oof_cols+['cos_c0', 'tertiary_distance_1', 'cos_c1', 'd_3_2', 'tertiary_angle_1', 'tertiary_angle_0', 'atom_1_n_bonds', 'tertiary_distance_2', 'd_2_1', 'tertiary_angle_2', 'd_4_0', 'molecule_atom_index_0_dist_min_div', 'd_2_0', 'dist_H_0_x', 'd_3_1', 'cos_c0_c1', 'mulliken_atom_1', 'd_8_3', 'd_4_1', 'dist_C_0_y', 'd_3_0', 'atom_index_1_cycle_size_mean', 'dist_C_1_x', 'dist_C_2_x', 'adC2', 'adC1', 'atom_1_bond_lengths_std', 'atom_index_1_n_cycle', 'd_4_2', 'cos_f0', 'd_5_2', 'dist_to_type_0_mean', 'dist_O_0_x', 'molecule_atom_index_0_dist_std_diff', 'd_5_1', 'tertiary_angle_3', 'd_6_2', 'd_7_3']]

In [None]:
mytrial=[]
df_his, df_feature_importances, df_valid_pred, df_test_pred = pd.DataFrame(), pd.DataFrame(),pd.DataFrame(),pd.DataFrame()
for t in  df_train.type.unique().tolist():
    
    param = base_param.copy()
    param['columns'] = type_columns[t]
    
    df_his_i, df_feature_importances_i, df_valid_pred_i, df_test_pred_i =  sk_process(df_train[df_train['type']==t].reset_index(drop=True), param, f'modeling for {t}', df_test=df_test[df_test['type']==t].reset_index(drop=True), trial=mytrial, is_output_feature_importance=False, trial_level=1)
    df_his = pd.concat([df_his, df_his_i], axis=0)
    df_feature_importances = pd.concat([df_feature_importances, df_feature_importances_i], axis=0)
    df_valid_pred = pd.concat([df_valid_pred, df_valid_pred_i], axis=0)
    df_test_pred = pd.concat([df_test_pred, df_test_pred_i], axis=0)
    

df_valid_pred = df_valid_pred.sort_values(by=['index']).reset_index(drop=True)
df_test_pred = df_test_pred.sort_values(by=['index']).reset_index(drop=True)

In [None]:
#oof
df_trial = pd.DataFrame(mytrial)
df_trial['trn_val_metric_diff_rate'] = df_trial['trn_val_metric_diff'] / df_trial['train_metric_mean']
df_trial['log_val_mae'] = df_trial['val_metric_mean'].apply(lambda x : np.log(x))
print(mean_absolute_error(df_valid_pred.sort_values(by=['index']).reset_index(drop=True).predict.values, df_train.reset_index(drop=True).y.values))
df_trial[['datetime', 'message', 'nfeatures', 'train_metric_mean', 'val_metric_mean', 'trn_val_metric_diff', 'trn_val_metric_diff_rate', 'message', 'log_val_mae']]

In [None]:
#dso
df_trial = pd.DataFrame(mytrial)
df_trial['trn_val_metric_diff_rate'] = df_trial['trn_val_metric_diff'] / df_trial['train_metric_mean']
df_trial['log_val_mae'] = df_trial['val_metric_mean'].apply(lambda x : np.log(x))
print(mean_absolute_error(df_valid_pred.sort_values(by=['index']).reset_index(drop=True).predict.values, df_train.reset_index(drop=True).y.values))
df_trial[['datetime', 'message', 'nfeatures', 'train_metric_mean', 'val_metric_mean', 'trn_val_metric_diff', 'trn_val_metric_diff_rate', 'message', 'log_val_mae']]

0.009578546187112865


Unnamed: 0,datetime,message,nfeatures,train_metric_mean,val_metric_mean,trn_val_metric_diff,trn_val_metric_diff_rate,message.1,log_val_mae
0,2019-08-22 23:05:23.638616,modeling for 0,71,0.002443,0.005019,0.002576,1.054147,modeling for 0,-5.294546
1,2019-08-22 23:15:29.229261,modeling for 3,56,0.006739,0.019578,0.012839,1.905245,modeling for 3,-3.933369
2,2019-08-22 23:16:34.822741,modeling for 1,36,0.000984,0.002029,0.001045,1.062602,modeling for 1,-6.200042
3,2019-08-22 23:20:31.127484,modeling for 4,46,0.000964,0.002097,0.001133,1.17455,modeling for 4,-6.16718
4,2019-08-22 23:52:02.889524,modeling for 2,70,0.004229,0.006896,0.002666,0.630461,modeling for 2,-4.976877
5,2019-08-23 00:06:18.011585,modeling for 6,55,0.011482,0.025552,0.01407,1.225338,modeling for 6,-3.667049
6,2019-08-23 01:07:02.009842,modeling for 5,57,0.004459,0.00661,0.002151,0.482295,modeling for 5,-5.019133
7,2019-08-23 01:12:36.323113,modeling for 7,38,0.001101,0.002271,0.00117,1.062423,modeling for 7,-6.087522


In [14]:
df_valid_pred.to_pickle('oof_dso_train', compression='gzip')
df_test_pred.to_pickle('oof_dso_test', compression='gzip')

In [81]:
#pso
df_trial = pd.DataFrame(mytrial)
df_trial['trn_val_metric_diff_rate'] = df_trial['trn_val_metric_diff'] / df_trial['train_metric_mean']
df_trial['log_val_mae'] = df_trial['val_metric_mean'].apply(lambda x : np.log(x))
print(mean_absolute_error(df_valid_pred.sort_values(by=['index']).reset_index(drop=True).predict.values, df_train.reset_index(drop=True).y.values))
df_trial[['datetime', 'message', 'nfeatures', 'train_metric_mean', 'val_metric_mean', 'trn_val_metric_diff', 'trn_val_metric_diff_rate', 'message', 'log_val_mae']]

0.011634202386808


Unnamed: 0,datetime,message,nfeatures,train_metric_mean,val_metric_mean,trn_val_metric_diff,trn_val_metric_diff_rate,message.1,log_val_mae
0,2019-08-22 11:05:06.608505,modeling for 0,71,0.005293,0.011852,0.006558,1.238966,modeling for 0,-4.435266
1,2019-08-22 11:14:44.376738,modeling for 3,56,0.006104,0.015228,0.009124,1.494666,modeling for 3,-4.184588
2,2019-08-22 11:17:04.068932,modeling for 1,36,0.002169,0.016369,0.0142,6.547536,modeling for 1,-4.112381
3,2019-08-22 11:21:24.964886,modeling for 4,46,0.002049,0.007385,0.005336,2.603808,modeling for 4,-4.908289
4,2019-08-22 11:51:17.266337,modeling for 2,70,0.006749,0.011612,0.004863,0.720579,modeling for 2,-4.455688
5,2019-08-22 12:05:40.948671,modeling for 6,55,0.009028,0.019892,0.010864,1.203308,modeling for 6,-3.917415
6,2019-08-22 12:37:59.231928,modeling for 5,57,0.005449,0.008289,0.00284,0.521266,modeling for 5,-4.792837
7,2019-08-22 12:43:58.550884,modeling for 7,38,0.001751,0.005555,0.003805,2.173226,modeling for 7,-5.192982


In [82]:
df_valid_pred.to_pickle('oof_pso_train', compression='gzip')
df_test_pred.to_pickle('oof_pso_test', compression='gzip')

In [75]:
#sd
df_trial = pd.DataFrame(mytrial)
df_trial['trn_val_metric_diff_rate'] = df_trial['trn_val_metric_diff'] / df_trial['train_metric_mean']
df_trial['log_val_mae'] = df_trial['val_metric_mean'].apply(lambda x : np.log(x))
print(mean_absolute_error(df_valid_pred.sort_values(by=['index']).reset_index(drop=True).predict.values, df_train.reset_index(drop=True).y.values))
df_trial[['datetime', 'message', 'nfeatures', 'train_metric_mean', 'val_metric_mean', 'trn_val_metric_diff', 'trn_val_metric_diff_rate', 'message', 'log_val_mae']]

0.0028518626968526977


Unnamed: 0,datetime,message,nfeatures,train_metric_mean,val_metric_mean,trn_val_metric_diff,trn_val_metric_diff_rate,message.1,log_val_mae
0,2019-08-22 08:21:13.910285,modeling for 0,71,0.001555,0.003055,0.0015,0.964784,modeling for 0,-5.790894
1,2019-08-22 08:32:59.340831,modeling for 3,56,0.000712,0.001354,0.000642,0.901471,modeling for 3,-6.604683
2,2019-08-22 08:34:32.338182,modeling for 1,36,0.001088,0.003027,0.001939,1.781626,modeling for 1,-5.800066
3,2019-08-22 08:38:34.840061,modeling for 4,46,0.000958,0.002333,0.001374,1.433867,modeling for 4,-6.060733
4,2019-08-22 09:09:57.985913,modeling for 2,70,0.002305,0.00395,0.001645,0.713786,modeling for 2,-5.533958
5,2019-08-22 09:26:40.087047,modeling for 6,55,0.000909,0.001593,0.000684,0.752539,modeling for 6,-6.442167
6,2019-08-22 10:00:30.385933,modeling for 5,57,0.001885,0.002905,0.00102,0.54121,modeling for 5,-5.841431
7,2019-08-22 10:06:01.179879,modeling for 7,38,0.000923,0.002173,0.00125,1.353585,modeling for 7,-6.131662


In [76]:
df_valid_pred.to_pickle('oof_sd_train', compression='gzip')
df_test_pred.to_pickle('oof_sd_test', compression='gzip')

In [68]:
#fc
df_trial = pd.DataFrame(mytrial)
df_trial['trn_val_metric_diff_rate'] = df_trial['trn_val_metric_diff'] / df_trial['train_metric_mean']
df_trial['log_val_mae'] = df_trial['val_metric_mean'].apply(lambda x : np.log(x))
print(mean_absolute_error(df_valid_pred.sort_values(by=['index']).reset_index(drop=True).predict.values, df_train.reset_index(drop=True).y.values))
df_trial[['datetime', 'message', 'nfeatures', 'train_metric_mean', 'val_metric_mean', 'trn_val_metric_diff', 'trn_val_metric_diff_rate', 'message', 'log_val_mae']]

0.2642068948259309


Unnamed: 0,datetime,message,nfeatures,train_metric_mean,val_metric_mean,trn_val_metric_diff,trn_val_metric_diff_rate,message.1,log_val_mae
0,2019-08-22 05:56:13.671091,modeling for 0,71,0.249396,0.580054,0.330657,1.325831,modeling for 0,-0.544634
1,2019-08-22 06:06:05.635389,modeling for 3,56,0.036385,0.121778,0.085393,2.346925,modeling for 3,-2.105557
2,2019-08-22 06:07:45.058639,modeling for 1,36,0.029229,0.295462,0.266233,9.108602,modeling for 1,-1.219215
3,2019-08-22 06:11:35.146473,modeling for 4,46,0.022341,0.140886,0.118545,5.306092,modeling for 4,-1.959806
4,2019-08-22 06:41:24.878131,modeling for 2,70,0.129866,0.235691,0.105825,0.81488,modeling for 2,-1.445235
5,2019-08-22 06:56:02.611189,modeling for 6,55,0.055249,0.137021,0.081772,1.480057,modeling for 6,-1.987623
6,2019-08-22 07:28:25.500155,modeling for 5,57,0.157494,0.249594,0.0921,0.584784,modeling for 5,-1.387919
7,2019-08-22 07:33:32.002615,modeling for 7,38,0.018684,0.101022,0.082337,4.406766,modeling for 7,-2.292422


In [69]:
df_valid_pred.to_pickle('oof_fc_train', compression='gzip')
df_test_pred.to_pickle('oof_fc_test', compression='gzip')

In [57]:
# idx=0.26
# # df_test_pred = df_trial.loc[idx]['df_test_pred']
# df_submit = pd.DataFrame()
# df_submit['scalar_coupling_constant'] = np.mean(df_test_pred.drop(columns=['index']).values, axis=1)
# df_submit['id'] = df_test_pred['index']
# df_submit.to_csv('../../data/submission/submission_lgbm_{}.csv'.format(idx), index=False)

In [24]:
df_trial = pd.DataFrame(mytrial)
df_trial['trn_val_metric_diff_rate'] = df_trial['trn_val_metric_diff'] / df_trial['train_metric_mean']
df_trial['log_val_mae'] = df_trial['val_metric_mean'].apply(lambda x : np.log(x))
print(mean_absolute_error(df_valid_pred.sort_values(by=['index']).reset_index(drop=True).predict.values, df_train.reset_index(drop=True).y.values))
df_trial[['datetime', 'message', 'nfeatures', 'train_metric_mean', 'val_metric_mean', 'trn_val_metric_diff', 'trn_val_metric_diff_rate', 'message', 'log_val_mae']]

0.2762095859176849


Unnamed: 0,datetime,message,nfeatures,train_metric_mean,val_metric_mean,trn_val_metric_diff,trn_val_metric_diff_rate,message.1,log_val_mae
0,2019-08-21 23:29:24.455487,modeling for 0,71,0.330755,0.607331,0.276576,0.836197,modeling for 0,-0.498681
1,2019-08-21 23:34:08.670000,modeling for 3,56,0.059156,0.128804,0.069648,1.177351,modeling for 3,-2.049465
2,2019-08-21 23:34:48.456152,modeling for 1,36,0.090381,0.303577,0.213197,2.35887,modeling for 1,-1.192119
3,2019-08-21 23:36:35.320959,modeling for 4,46,0.047605,0.147608,0.100003,2.100688,modeling for 4,-1.913197
4,2019-08-21 23:54:11.117995,modeling for 2,70,0.156557,0.246093,0.089536,0.571904,modeling for 2,-1.402047
5,2019-08-22 00:02:07.567450,modeling for 6,55,0.075828,0.144828,0.069,0.909958,modeling for 6,-1.932209
6,2019-08-22 00:22:32.144051,modeling for 5,57,0.178458,0.259814,0.081356,0.455885,modeling for 5,-1.34779
7,2019-08-22 00:24:53.931302,modeling for 7,38,0.036759,0.106056,0.069297,1.885185,modeling for 7,-2.243784


In [38]:
[f for f in os.listdir('./') if f.startswith('oof')]

['oof_dso_train',
 'oof_sd_test',
 'oof_fc_test',
 'oof_pso_test',
 'oof_fc_train',
 'oof_pso_train',
 'oof_scalar_coupling_constant_test',
 'oof_scalar_coupling_constant_train',
 'oof_sd_train',
 'oof_dso_test']

In [39]:
df_train = pd.DataFrame()
df_test = pd.DataFrame()

for f in [f for f in os.listdir('./') if f.startswith('oof')][:]:
    
    if 'train' in f:
        df_i = pd.read_pickle(f, compression='gzip')
        df_train[f'{f}'.replace('_train', '')] = df_i.predict
        if 'index' not in df_train.columns:
            df_train['index'] = df_i['index']
        
    if 'test' in f:
        df_i = pd.read_pickle(f, compression='gzip')
        df_test[f'{f}'.replace('_test', '')] = df_i.drop(columns=['index']).mean(axis=1)
        if 'index' not in df_test.columns:
            df_test['index'] = df_i['index']

In [42]:
df_train.columns, df_train.shape

(Index(['oof_dso', 'index', 'oof_fc', 'oof_pso', 'oof_scalar_coupling_constant',
        'oof_sd'],
       dtype='object'), (4658147, 6))

In [41]:
df_test.columns, df_test.shape

(Index(['oof_sd', 'index', 'oof_fc', 'oof_pso', 'oof_scalar_coupling_constant',
        'oof_dso'],
       dtype='object'), (2505542, 6))