In [1]:
%%HTML
<!-- Layout tweak only: overrides the width of the notebook container, the menubar container and the main toolbar container. -->
<style>
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [1]:
import sys
sys.path.append("../..") # Adds higher directory to python modules path.
from utilities import aggregate_feature_calculators
from utilities import aggregate_feature_calculators_setting as aggcal
from utilities.parallel import Parallel
from utilities.dfdb import DFDB

In [2]:
# import sys
# !{sys.executable} -m pip install seaborn

In [2]:
import numpy as np
import pandas as pd
import os
import time
import datetime
import json
import copy
import gc
import warnings
from tqdm import tqdm_notebook, tqdm

import optuna

import lightgbm as lgb
import xgboost as xgb

from catboost import CatBoostRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold,TimeSeriesSplit, GroupKFold
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.svm import NuSVR, SVR
from sklearn.metrics import mean_absolute_error
from sklearn import preprocessing

import eli5
from eli5.sklearn import PermutationImportance

import networkx as nx

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to the narrowest dtype covering their range.

    Integer columns are converted to the first of int8/int16/int32/int64 whose
    inclusive [min, max] range contains the column's observed minimum and
    maximum.  Float columns are converted to float16 or float32 on the same
    range test and are otherwise stored as float64.  Columns whose dtype is not
    listed in ``numerics`` (for example object columns) are left untouched.

    NOTE: the float test only looks at the value range, so narrowing to
    float16/float32 can round the stored values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; its columns are reassigned in place.
    verbose : bool, default True
        When true, print the final memory usage and the relative reduction.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a column holding exactly 127 or -128 still fits int8.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NaN or infinite extremes fail every comparison and fall through to float64.
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

# def mem_check(scope):
#     list_ = []
#     for var_name in scope:
#         if not var_name.startswith("_"):
#             mem_usage = sys.getsizeof(eval(var_name))/1e6
#             list_.append({'variable_name':var_name, 'memory_usage(M)':mem_usage, 'address':id(var_name), 'refcount':sys.getrefcount(eval(var_name))})
#     df_ = pd.DataFrame(list_)
#     return df_[df_['memory_usage(M)']>1]
# mem_check(dir())

In [5]:
# Folder holding the input CSV files (train.csv / test.csv are read from here further down);
# the listing is shown as the cell output.
csv_file_folder =  '../../data/input'
os.listdir(csv_file_folder)

['test.csv',
 'structures',
 'sample_submission.csv',
 'magnetic_shielding_tensors.csv',
 'mulliken_charges.csv',
 'potential_energy.csv',
 'scalar_coupling_contributions.csv',
 'dipole_moments.csv',
 'structures.csv',
 'train.csv']

In [6]:
# Folder holding the pre-computed feature files; the '*_train.pkl' / '*_test.pkl' files listed
# here are merged into df_train / df_test by the loading cell.
file_folder =  '../../data/feature'
os.listdir(file_folder)

['keras-neural-net-for-champs_train.pkl',
 'giba-r-data-table-simple-features-0-991-lb_test.pkl',
 'keras-neural-net-for-champs_test.pkl',
 'giba-r-data-table-simple-features-0-991-lb_train.pkl',
 'giba-r-data-table-simple-features-0-991-lb.r.csv']

In [7]:
df_train = pd.read_csv(f"{csv_file_folder}/train.csv")
df_test = pd.read_csv(f"{csv_file_folder}/test.csv")


def _add_new_features(df_base, df_feature):
    """Merge into ``df_base`` (on 'id') the columns of ``df_feature`` it does not have yet.

    New columns keep the order they have in ``df_feature`` so the resulting
    feature order is the same on every run.  ``pd.merge`` is used with its
    default inner join, so rows whose 'id' is missing from ``df_feature``
    are dropped.
    """
    existing = set(df_base.columns)
    new_columns = [c for c in df_feature.columns if c not in existing]
    return pd.merge(df_base, df_feature[new_columns + ['id']], on='id')


# os.listdir returns entries in arbitrary order; sorting makes the merge order (and therefore
# the column order of df_train / df_test) reproducible.
# NOTE: read_pickle can execute code stored in the file, so only load trusted feature files.
for f in sorted(os.listdir(file_folder)):
    if not f.endswith('.pkl'):
        continue
    stem = f[:-4]
    if stem.endswith('train'):
        df_feature_i = pd.read_pickle(f'{file_folder}/{f}')
        df_train = _add_new_features(df_train, df_feature_i)
        print('train add', f, df_feature_i.shape)
    if stem.endswith('test'):
        df_feature_i = pd.read_pickle(f'{file_folder}/{f}')
        df_test = _add_new_features(df_test, df_feature_i)
        print('test add', f, df_feature_i.shape)

train add keras-neural-net-for-champs_train.pkl (4658147, 56)
test add giba-r-data-table-simple-features-0-991-lb_test.pkl (2505542, 49)
test add keras-neural-net-for-champs_test.pkl (2505542, 56)
train add giba-r-data-table-simple-features-0-991-lb_train.pkl (4658147, 49)


In [8]:
# Label-encode every df_train column whose dtype is not listed in `numerics`.  The encoder is
# fitted on the train and test values together so both frames share one code per category.
numerics = ['int16', 'int8', 'int32', 'int64', 'float16', 'float32', 'float64']
for col in df_train.columns:
    col_type = df_train[col].dtypes
    if col_type in numerics:
        continue  # already numeric, nothing to encode
    print(col, df_train[col].unique())
    le = LabelEncoder().fit(list(df_train[col].values) + list(df_test[col].values))
    df_train[col] = le.transform(list(df_train[col].values))
    df_test[col] = le.transform(list(df_test[col].values))
    print(le.classes_)

molecule_name ['dsgdb9nsd_000001' 'dsgdb9nsd_000002' 'dsgdb9nsd_000003' ...
 'dsgdb9nsd_133881' 'dsgdb9nsd_133882' 'dsgdb9nsd_133884']
['dsgdb9nsd_000001' 'dsgdb9nsd_000002' 'dsgdb9nsd_000003' ...
 'dsgdb9nsd_133883' 'dsgdb9nsd_133884' 'dsgdb9nsd_133885']
type ['1JHC' '2JHH' '1JHN' '2JHN' '2JHC' '3JHH' '3JHC' '3JHN']
['1JHC' '1JHN' '2JHC' '2JHH' '2JHN' '3JHC' '3JHH' '3JHN']
atom_0 ['H']
['H']
atom_1 ['C' 'H' 'N']
['C' 'H' 'N']


In [9]:
# Replace +/-inf and NaN with 0, rename the id column to 'index' (and the target to 'y' for the
# training frame), then give both frames a fresh positional index.
df_train = (
    df_train
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
    .rename(columns={'id': 'index', 'scalar_coupling_constant': 'y'})
    .reset_index(drop=True)
)
df_test = (
    df_test
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
    .rename(columns={'id': 'index'})
    .reset_index(drop=True)
)

In [14]:
# Downcast the numeric columns of both frames (reduce_mem_usage modifies the frame it is given
# and returns it); the printed lines report the resulting memory footprint.
df_train = reduce_mem_usage(df_train)
df_test = reduce_mem_usage(df_test)

Mem. usage decreased to 928.45 Mb (37.0% reduction)
Mem. usage decreased to 494.62 Mb (36.1% reduction)


In [15]:
# nullcolumns = []   
# for col, nullcount in zip(df_train.columns, df_train.isnull().sum()):
#     if nullcount!=0:
#         series_ = df_train[col].replace([np.inf, -np.inf], np.nan).dropna()
#         nullcolumns.append({'feature':col, 'nullcount':nullcount, 'min':series_.min(), 'max':series_.max()})
# df_nullcolumns = pd.DataFrame(nullcolumns)

In [16]:
# for col in df_nullcolumns['feature']:
    
#     series_ = df_train[col].replace([np.inf, -np.inf], np.nan).dropna()
#     max_  = series_.max()
#     min_ = series_.min()
#     mean_  = series_.mean()
#     std_ = series_.std()
#     df_train[col] = df_train[col].replace([np.inf, -np.inf], np.nan).fillna(0)
#     print(col, min_, max_, mean_, std_)
#     break


In [17]:
def _str2class(s):
    """Resolve a name such as ``'KFold'`` or ``'lgb.LGBMRegressor'`` to the object it denotes.

    Parameters
    ----------
    s : str
        A global name, or any Python expression (typically a dotted attribute path).

    Returns
    -------
    object or None
        The class or other callable that ``s`` evaluates to, or ``None`` when
        the result is not callable.
    """
    if s in globals() and isinstance(globals()[s], type):
        return globals()[s]
    # SECURITY: eval() executes arbitrary code, so `s` must only come from trusted configuration.
    # The expression is evaluated exactly once so that any side effect it has cannot repeat.
    resolved = eval(s)
    if callable(resolved):  # classes are callable too, so this covers both cases
        return resolved
    return None

def sk_process(df_train, param, message, df_test=None, trial=None, is_output_feature_importance=False, trial_level=0):
    """Cross-validate one model configuration and collect metrics and predictions.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Must contain 'index' (row id), 'y' (target) and every feature column;
        'label' / 'group' are additionally required for StratifiedKFold / GroupKFold.
    param : dict
        Configuration read as follows:
        ``param['columns']``  feature column names;
        ``param['cv']``       ``{'cls': name, 'init': kwargs}`` or a ready-made
                              ``'splits'`` list of (train_index, valid_index);
        ``param['model']``    ``{'cls': name, 'init': kwargs, 'fit': kwargs}``;
        ``param['scaler']``   optional ``{'cls': name, 'init': kwargs}``;
        ``param['metric']``   name of a ``metric(y_true, y_pred)`` callable.
        Names are resolved with ``_str2class``.
    message : str
        Free-text tag stored with the trial record.
    df_test : pandas.DataFrame, optional
        Frame with 'index' and the feature columns; predicted once per fold.
    trial : list, optional
        When given, one summary dict for this run is appended to it.
    is_output_feature_importance : bool
        When true, collect ``model.feature_importances_`` and eli5 permutation
        importances for every fold.
    trial_level : int
        When > 0 the trial record also stores the per-fold history and the
        validation / test prediction frames.

    Returns
    -------
    tuple
        ``(df_his, df_feature_importances, df_valid_pred, df_test_pred)``:
        per-fold train/valid metric, merged per-fold importances (or None),
        out-of-fold predictions sorted by 'index', and test predictions with
        one column per fold sorted by 'index' (or None without ``df_test``).
    """
    columns = param['columns']

    assert 'y' in df_train.columns.tolist(), 'y is not in df_train'
    assert 'index' in df_train.columns.tolist(), 'index is not in df_train'
    assert 'index' not in param['columns'], 'index is in features'
    assert 'y' not in param['columns'], 'y is in features'
    assert 'label' not in param['columns'], 'label is in features'
    assert 'group' not in param['columns'], 'group is in features'
    assert trial is None or isinstance(trial, list), 'trial is neither list nor none'
    assert len(columns) != 0, 'columns size is 0'

    has_test = isinstance(df_test, pd.DataFrame)
    df_test_pred = None
    if has_test:
        assert 'index' in df_test.columns.tolist(), 'index is not in df_test'
        # Per-fold predictions are appended column-wise with a 0..n-1 index; resetting here
        # keeps the rows aligned even when df_test carries some other index.
        df_test_pred = df_test[['index']].reset_index(drop=True)

    CV = _str2class(param['cv']['cls'])
    MODEL = _str2class(param['model']['cls'])
    use_scaler = 'scaler' in param
    if use_scaler:
        SCALER = _str2class(param['scaler']['cls'])
    metric = _str2class(param['metric'])

    history = []
    valid_pred_parts = []
    df_feature_importances_i_list = []

    # StratifiedKFold, KFold, RepeatedKFold,TimeSeriesSplit, GroupKFold
    if 'splits' in param['cv']:
        splits = param['cv']['splits']
    else:
        cv = CV(**param['cv']['init'])
        if param['cv']['cls'] == 'StratifiedKFold':
            assert 'label' in df_train.columns.tolist(), 'label is not in df_train'
            splits = list(cv.split(df_train, df_train['label']))
        elif param['cv']['cls'] == 'GroupKFold':
            assert 'group' in df_train.columns.tolist(), 'group is not in df_train'
            splits = list(cv.split(df_train, groups=df_train['group']))
        else:
            splits = list(cv.split(df_train))

    # Build the training arrays once instead of re-extracting them from the frame in every fold.
    X_all = df_train[columns].values
    y_all = df_train['y'].values
    index_all = df_train['index'].values

    for fold_n, (train_index, valid_index) in enumerate(splits):

        X_train, X_valid = X_all[train_index, :], X_all[valid_index, :]
        y_train, y_valid = y_all[train_index], y_all[valid_index]

        if use_scaler:
            # The scaler is fitted on the training fold only and reused for valid / test data.
            scaler = SCALER(**param['scaler']['init'])
            X_train = scaler.fit_transform(X_train)
            X_valid = scaler.transform(X_valid)

        model = MODEL(**param['model']['init'])
        model.fit(X_train, y_train, **param['model']['fit'])

        y_valid_pred = model.predict(X_valid)
        y_train_pred = model.predict(X_train)

        original_index = index_all[valid_index]
        df_valid_pred_i = pd.DataFrame({'index': original_index, 'predict': y_valid_pred, 'fold_n': np.zeros(y_valid_pred.shape[0]) + fold_n})
        valid_pred_parts.append(df_valid_pred_i)

        if is_output_feature_importance:
            df_feature_importances_i = pd.DataFrame({'feature': columns, 'model_weight': model.feature_importances_})
            df_feature_importances_i = df_feature_importances_i.sort_values(by=['feature'])
            df_feature_importances_i = df_feature_importances_i.reset_index(drop=True)
            perm = PermutationImportance(model, random_state=42).fit(X_valid, y_valid)
            df_feature_importances_i2 = eli5.explain_weights_dfs(perm, feature_names=columns, top=len(columns))['feature_importances']
            df_feature_importances_i2 = df_feature_importances_i2.sort_values(by=['feature'])
            df_feature_importances_i2 = df_feature_importances_i2.reset_index(drop=True)
            df_feature_importances_i = pd.merge(df_feature_importances_i, df_feature_importances_i2, on='feature')
            df_feature_importances_i_list.append(df_feature_importances_i)

        if has_test:
            X_test = df_test[columns].values
            if use_scaler:
                X_test = scaler.transform(X_test)
            y_test_pred = model.predict(X_test)
            df_test_pred_i = pd.DataFrame({fold_n: y_test_pred})
            df_test_pred = pd.concat([df_test_pred, df_test_pred_i], axis=1)

        history.append({'fold_n': fold_n, 'train': metric(y_train, y_train_pred), 'valid': metric(y_valid, y_valid_pred)})

    df_his = pd.DataFrame(history)

    df_feature_importances = None
    if is_output_feature_importance:
        # Fold 0 keeps the plain column names; fold k (k >= 1) gets the suffix str(k).
        df_feature_importances = df_feature_importances_i_list[0]
        for idx, df_feature_importances_i in enumerate(df_feature_importances_i_list[1:]):
            df_feature_importances = pd.merge(df_feature_importances, df_feature_importances_i, on='feature', suffixes=('', str(idx + 1)))

    df_valid_pred = pd.concat(valid_pred_parts, axis=0) if valid_pred_parts else pd.DataFrame()
    df_valid_pred = df_valid_pred.sort_values(by=['index'])
    df_valid_pred = df_valid_pred.reset_index(drop=True)

    if has_test:
        df_test_pred = df_test_pred.sort_values(by=['index'])
        df_test_pred = df_test_pred.reset_index(drop=True)

    if isinstance(trial, list):
        datetime_ = datetime.datetime.now()
        val_metric_mean = np.mean(df_his.valid)
        val_metric_std = np.std(df_his.valid)
        train_metric_mean = np.mean(df_his.train)
        train_metric_std = np.std(df_his.train)

        trial_i_d_ = {'datetime': datetime_, 'message': message, 'val_metric_mean': val_metric_mean,
                  'train_metric_mean': train_metric_mean, 'val_metric_std': val_metric_std, 'train_metric_std': train_metric_std,
                  'trn_val_metric_diff': val_metric_mean - train_metric_mean,
                  'df_feature_importances': df_feature_importances,'param': param.copy(),
                  'nfeatures': len(columns)}
        if trial_level > 0:
            trial_i_d_ = {'df_his': df_his, 'df_valid_pred': df_valid_pred, 'df_test_pred': df_test_pred, **trial_i_d_}
        trial.append(trial_i_d_)

    return df_his, df_feature_importances, df_valid_pred, df_test_pred

def evaluate(df_feature_importances, key='average_model_weight'):
    """Rank features by an averaged importance column, best first.

    Adds two columns to ``df_feature_importances`` (the frame passed in is modified):
    'average_permutation_weight', the row mean of every column whose name contains
    'weight' but not 'model', and 'average_model_weight', the row mean of every
    column whose name contains 'model_weight'.

    Returns the 'feature' values as a list, sorted by ``key`` in descending order.
    """
    permutation_columns = [
        name for name in df_feature_importances.columns.tolist()
        if 'weight' in name and 'model' not in name
    ]
    df_feature_importances['average_permutation_weight'] = (
        df_feature_importances[permutation_columns].mean(axis=1)
    )

    model_columns = [
        name for name in df_feature_importances.columns.tolist()
        if 'model_weight' in name
    ]
    df_feature_importances['average_model_weight'] = (
        df_feature_importances[model_columns].mean(axis=1)
    )

    ranked = df_feature_importances.sort_values(by=[key], ascending=False)
    return ranked['feature'].tolist()

def select_features_(df_train, param, trial, message, df_test=None, nfeats_best=10, nfeats_removed_per_try=10, key='average_model_weight'):
    """Repeatedly cross-validate and drop the lowest-ranked features.

    Each round runs ``sk_process`` with feature importances enabled (which logs
    the run into ``trial``), ranks the features with ``evaluate`` on ``key`` and
    removes the last ``nfeats_removed_per_try`` of them.  The loop ends once at
    most ``nfeats_best`` features were evaluated or another removal would leave
    none.  Works on a shallow copy of ``param``; returns None.
    """
    working_param = param.copy()
    finished = False
    while not finished:
        _, importances, _, _ = sk_process(df_train, working_param, df_test=df_test, trial=trial, is_output_feature_importance=True, message=message)
        ranked_columns = evaluate(importances, key)
        n_ranked = len(ranked_columns)
        if n_ranked <= nfeats_best or n_ranked - nfeats_removed_per_try < 1:
            finished = True
        else:
            working_param['columns'] = ranked_columns[:-nfeats_removed_per_try]
    return

def width_frist_rfe(df_train, param, trial, score, message, df_test=None):
    """Breadth-first backward elimination driven by the mean validation metric.

    Every feature in ``param['columns']`` is left out once and the remaining
    set is cross-validated with ``sk_process`` (each run is logged to ``trial``).
    If the best of these runs beats ``score`` (lower is better), the search
    recurses from that reduced feature set.  Nothing is returned; results are
    read back from ``trial``.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Forwarded to ``sk_process``.
    param : dict
        Configuration for ``sk_process``; deep-copied, never modified.
    trial : list
        Trial log that ``sk_process`` appends to.
    score : float
        Mean validation metric of the current feature set.
    message : str
        Tag stored with every logged run.
    """
    param_ = copy.deepcopy(param)
    columns_ = param_['columns']
    if len(columns_) <= 1:
        # Removing the only feature would hand sk_process an empty feature list.
        return

    best_score = score
    best_param = param_
    for col in columns_:
        # Order-preserving removal: a set difference would reorder the features arbitrarily
        # from run to run, which makes otherwise identical trials hard to reproduce.
        param_['columns'] = [c for c in columns_ if c != col]
        df_his, df_feature_importances, df_valid_pred, df_test_pred = sk_process(df_train, param_, df_test=df_test, trial=trial, is_output_feature_importance=False, message=message)
        val_mae_mean = np.mean(df_his.valid)
        if val_mae_mean<best_score:
            best_score = val_mae_mean
            best_param = copy.deepcopy(param_)

    if best_score < score:
        width_frist_rfe(df_train, best_param, trial, best_score, message, df_test)

    return

def revert_rfe(df_train, param, sorted_columns, df_test, trial, start_columns, limit=None, remark=None):
    """Forward feature selection: start from ``start_columns`` and add candidates one by one.

    The base set is scored first.  Each column of ``sorted_columns`` is then
    tried together with the current selection and kept only when the mean
    validation metric drops below the best value so far.  The search stops
    early once ``limit`` features are selected (default: ``len(sorted_columns)``).

    NOTE(review): the runs go through ``EP.process(..., remark=...)``; ``EP`` is not
    defined in the visible part of this notebook — confirm it is available before use.

    Returns the list of selected column names.
    """
    # Work on a copy so the caller's start_columns list is never modified.
    chosen = copy.deepcopy(start_columns)
    if limit is None:
        limit = len(sorted_columns)

    # Baseline: score of the starting feature set alone.
    base_args = copy.deepcopy(param)
    base_args['columns'] = chosen
    base_his, _, _, _ = EP.process(df_train, base_args, df_test = df_test, trial=trial, remark=remark)
    best_score = np.mean(base_his.valid)

    # Try each candidate on top of the current selection; keep it only if the score improves.
    for candidate in sorted_columns:
        candidate_args = copy.deepcopy(param)
        candidate_args['columns'] = list(set(chosen + [candidate]))
        candidate_his, _, _, _ = EP.process(df_train, candidate_args, df_test = df_test, trial=trial, remark=remark)
        candidate_score = np.mean(candidate_his.valid)
        if candidate_score < best_score:
            chosen.append(candidate)
            best_score = candidate_score
        if len(chosen) >= limit:
            break

    return chosen

def blacklist_merge(df, columns=None, base_correlation_coefficient=.9):
    """Greedily group strongly correlated columns of ``df``.

    Columns are visited in order.  A column that has not yet been assigned to
    a group becomes a group key; its members are all columns whose correlation
    with it (computed on the standardised data) is >= the threshold in absolute
    value.  Members are blacklisted so they do not start a group of their own.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to analyse.
    columns : list, optional
        Columns to consider; defaults to all columns of ``df``.
    base_correlation_coefficient : float
        Absolute correlation threshold.

    Returns
    -------
    dict
        Maps each group key to the list of column names in its group.
    """
    if columns is None:
        columns = df.columns.tolist()
    bcc_ = base_correlation_coefficient
    # Use the frame passed in (previously this read the notebook-global df_train).
    X = df[columns].values
    X = StandardScaler().fit_transform(X)
    df_norm = pd.DataFrame(X, columns=columns)
    df_corr = df_norm.corr()

    blacklisted = set()
    group = {}
    for col in columns:
        if col in blacklisted:
            continue
        group[col] = list(df_corr[(df_corr[col]>=bcc_)|(df_corr[col]<=-bcc_)].index)
        blacklisted.update(group[col])
    return group

def bubble_merge(df, columns=None, base_correlation_coefficient=.9, coverage_rate=.9):
    """Group correlated columns, then merge groups that largely overlap.

    Every column starts with a group holding all columns whose correlation
    with it (computed on the standardised data) is >= ``base_correlation_coefficient``
    in absolute value.  Two groups are then merged whenever their common
    members cover at least ``coverage_rate`` of either group, and this repeats
    until no pair qualifies.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to analyse.
    columns : list, optional
        Columns to consider; defaults to all columns of ``df``.
    base_correlation_coefficient : float
        Absolute correlation threshold used to build the initial groups.
    coverage_rate : float
        Minimum shared fraction (relative to either group) that triggers a merge.

    Returns
    -------
    dict
        Maps each surviving group key to the list of column names in the group.
    """

    def is_similar(group1, group2):
        assert type(group1)==list, 'group1 should be a list'
        assert type(group2)==list, 'group2 should be a list'
        # A group can be empty (a constant column has NaN correlations, even with itself);
        # it shares nothing with any other group and must not cause a division by zero.
        if not group1 or not group2:
            return False
        n_common = len(set(group1) & set(group2))
        return (n_common / len(group1) >= coverage_rate) or (n_common / len(group2) >= coverage_rate)

    def merge_group(original_group):
        # Iterative on purpose: one merge per pass, restarting the scan from the first group,
        # so long merge chains cannot run into the recursion limit.
        group = original_group.copy()
        while True:
            items = list(group.items())
            merged = False
            for position, (k1, v1) in enumerate(items[:-1]):
                for k2, v2 in items[position + 1:]:
                    if is_similar(v1, v2):
                        # dict.fromkeys removes duplicates while keeping a stable order.
                        group[k1] = list(dict.fromkeys(v1 + v2))
                        del group[k2]
                        merged = True
                        break
                if merged:
                    break
            if not merged:
                return group

    if columns is None:
        columns = df.columns.tolist()
    bcc_ = base_correlation_coefficient
    X = df[columns].values
    X = StandardScaler().fit_transform(X)
    df_norm = pd.DataFrame(X, columns=columns)
    df_corr = df_norm.corr()

    group = {}
    for col in columns:
        group[col] = list(df_corr[(df_corr[col]>=bcc_)|(df_corr[col]<=-bcc_)].index)

    return merge_group(group)


def group_mean_log_mae(y_true, y_pred, groups, floor=1e-9):
    """Mean over groups of log(MAE within the group).

    ``y_true`` and ``y_pred`` are subtracted as pandas objects, the absolute
    errors are averaged per value of ``groups``, each group MAE is raised to at
    least ``floor`` before taking the logarithm, and the logs are averaged.
    """
    absolute_error = (y_true - y_pred).abs()
    mae_per_group = absolute_error.groupby(groups).mean()
    floored_mae = mae_per_group.map(lambda value: max(value, floor))
    return np.log(floored_mae).mean()

In [37]:
# Candidate features: every df_train column except the row id, the target and molecule_name.
columns = df_train.drop(columns=['index','y','molecule_name']).columns.tolist()

In [101]:
# Configuration consumed by sk_process: 'cls' entries are names resolved with _str2class,
# 'init' holds constructor kwargs and model['fit'] is forwarded to model.fit.
# (sk_process reads only 'cls' and 'init' from the 'scaler' entry.)
param = {
    'columns': columns,
    'cv': {
        'cls': 'KFold',
        'init': {'n_splits': 5, 'shuffle': True, 'random_state': 42}
    },
    'scaler': {
        'cls': 'StandardScaler', 'init': {}, 'fit': {}
    },
    'model': {
        'cls': 'lgb.LGBMRegressor',
        'init': {
            'learning_rate': 0.35395923077843333,
             'feature_fraction': 0.8840483697334669,
            'bagging_fraction': 0.7017457378676857,
            'min_data_in_leaf': 616,
            'lambda_l1': 0.00013058988949929333,
            'lambda_l2': 0.004991992636437704,
            'max_bin': 74,
            'num_leaves': 64,
            'random_state': 2928,
            'n_jobs': 16
        },
        'fit': {}
    },
    'metric': 'mean_absolute_error'
}

In [102]:
# Fixed-seed subsample used for feature selection and tuning, so that re-running the notebook
# evaluates the same rows; the size is capped at the number of available rows.
df_train_sample = df_train.sample(n=min(105542, len(df_train)), random_state=42)

In [104]:
# Tag used in the message of the trial records written by the following cells.
message = 'try 100k samples'

In [103]:
# Create the shared trial log when it does not exist yet (fresh kernel); an existing log from
# earlier cells or runs is kept so results continue to accumulate.
try:
    mytrial
except NameError:
    mytrial = []

df_his, df_feature_importances, df_valid_pred, df_test_pred = sk_process(df_train_sample, param, f'sort columns {message}', trial=mytrial, is_output_feature_importance=True, trial_level=0)

# Keep at most the 200 features with the highest average permutation importance.
sorted_columns = evaluate(df_feature_importances, 'average_permutation_weight')
param["columns"] = sorted_columns[:200]

In [43]:
# One tag for both logging and selection: the runs were logged as 'rfe cv5' while the lookup
# filtered on 'rfe', so it could only ever match records from other experiments.
RFE_MESSAGE = 'rfe cv5'

warnings.filterwarnings('once')
select_features_(df_train_sample, param, mytrial, message=RFE_MESSAGE, key='average_permutation_weight', nfeats_best=20, nfeats_removed_per_try=20)

# Best (lowest mean validation metric) elimination round that kept fewer than 100 features.
df_trial = pd.DataFrame(mytrial)
df_trial_top1 = df_trial[(df_trial['message']==RFE_MESSAGE)&(df_trial['nfeatures']<100)].sort_values(by=['val_metric_mean'], ascending=True).head(1)
param = df_trial_top1['param'].tolist()[0]
score = df_trial_top1['val_metric_mean'].tolist()[0]

In [84]:
# Leave-one-feature-out search starting from the selected parameter set; every run is logged
# to mytrial under the message 'width_frist_rfe cv5'.
width_frist_rfe(df_train_sample, param, mytrial, score=score, message='width_frist_rfe cv5')

# Feature list of the logged 'width_frist_rfe cv5' run with the lowest mean validation metric.
df_trial = pd.DataFrame(mytrial)
columns = df_trial[df_trial['message']=='width_frist_rfe cv5'].sort_values(by=['val_metric_mean'], ascending=True)['param'].tolist()[0]['columns']

In [85]:
# Number of features kept after the leave-one-out search.
len(columns)

44

In [87]:
def objective(trial):
    """Optuna objective: mean validation metric of one sampled LightGBM configuration.

    Samples the model hyper-parameters from ``trial``, cross-validates them with
    ``sk_process`` on ``df_train_sample`` using the notebook-level ``columns``
    (5-fold shuffled KFold, StandardScaler, mean_absolute_error), logs the run to
    ``mytrial`` under 'tune hyperparam cv5' and returns the mean of the per-fold
    validation metric.
    """
    # The suggest_* calls run top to bottom, i.e. in the same order as the dict entries.
    model_init = {
        'learning_rate': trial.suggest_uniform('learning_rate', .01, .5),
        'feature_fraction': trial.suggest_uniform('feature_fraction', .6, 1),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.6, 1),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 200, 800),
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-6, 1e2),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-6, 1e2),
        'max_bin': trial.suggest_int('max_bin', 10, 100),
        'num_leaves': trial.suggest_int('num_leaves', 4, 64),
        'random_state': trial.suggest_int('random_state', 1, 9999),
        'n_jobs': 16,
    }

    args = {
        'columns': columns,
        'cv': {'cls': 'KFold', 'init': {'n_splits': 5, 'shuffle': True, 'random_state': 42}},
        'scaler': {'cls': 'StandardScaler', 'init': {}, 'fit': {}},
        'model': {'cls': 'lgb.LGBMRegressor', 'init': model_init, 'fit': {}},
        'metric': 'mean_absolute_error',
    }

    fold_history, _, _, _ = sk_process(df_train_sample, args, 'tune hyperparam cv5', trial=mytrial, is_output_feature_importance=False, trial_level=0)
    return np.mean(fold_history.valid)

# Run 200 trials of `objective`; the log below reports the lowest value found as the
# "Current best value".
# NOTE(review): no sampler seed is passed here, so the sequence of sampled configurations is
# presumably different on every run — confirm if reproducibility matters.
study = optuna.create_study()
study.optimize(objective, n_trials=200)

[I 2019-07-07 06:49:40,198] Finished trial#0 resulted in value: 2.2464974465032133. Current best value is 2.2464974465032133 with parameters: {'learning_rate': 0.03345061774680737, 'feature_fraction': 0.7657916327974188, 'bagging_fraction': 0.9126895055700575, 'min_data_in_leaf': 258, 'lambda_l1': 2.432893812323033, 'lambda_l2': 0.3756929098183104, 'max_bin': 96, 'num_leaves': 16, 'random_state': 9212}.
[I 2019-07-07 06:49:57,653] Finished trial#1 resulted in value: 1.1866145983505556. Current best value is 1.1866145983505556 with parameters: {'learning_rate': 0.34437045825140017, 'feature_fraction': 0.7933808302372306, 'bagging_fraction': 0.7035904375480084, 'min_data_in_leaf': 501, 'lambda_l1': 0.10400098987620247, 'lambda_l2': 0.001207552558055291, 'max_bin': 45, 'num_leaves': 35, 'random_state': 6610}.
[I 2019-07-07 06:50:11,761] Finished trial#2 resulted in value: 1.6381721515838457. Current best value is 1.1866145983505556 with parameters: {'learning_rate': 0.34437045825140017, '

[I 2019-07-07 07:01:25,890] Finished trial#38 resulted in value: 1.3540938536879825. Current best value is 1.083789678356766 with parameters: {'learning_rate': 0.23923650301504132, 'feature_fraction': 0.9394550263211469, 'bagging_fraction': 0.89297706834366, 'min_data_in_leaf': 515, 'lambda_l1': 6.724238209952969, 'lambda_l2': 0.00014253833222585238, 'max_bin': 88, 'num_leaves': 62, 'random_state': 328}.
[I 2019-07-07 07:01:52,317] Finished trial#39 resulted in value: 4.526874246341155. Current best value is 1.083789678356766 with parameters: {'learning_rate': 0.23923650301504132, 'feature_fraction': 0.9394550263211469, 'bagging_fraction': 0.89297706834366, 'min_data_in_leaf': 515, 'lambda_l1': 6.724238209952969, 'lambda_l2': 0.00014253833222585238, 'max_bin': 88, 'num_leaves': 62, 'random_state': 328}.
[I 2019-07-07 07:02:15,855] Finished trial#40 resulted in value: 1.0782333610947283. Current best value is 1.0782333610947283 with parameters: {'learning_rate': 0.24907935711542986, 'fe

[I 2019-07-07 07:14:18,632] Finished trial#76 resulted in value: 1.120215611713565. Current best value is 1.0456198145537663 with parameters: {'learning_rate': 0.4035014679207284, 'feature_fraction': 0.9869954461080779, 'bagging_fraction': 0.9884821281676851, 'min_data_in_leaf': 201, 'lambda_l1': 0.002784639396262083, 'lambda_l2': 1.1764696585587135, 'max_bin': 69, 'num_leaves': 64, 'random_state': 6633}.
[I 2019-07-07 07:14:36,054] Finished trial#77 resulted in value: 1.2504917881472604. Current best value is 1.0456198145537663 with parameters: {'learning_rate': 0.4035014679207284, 'feature_fraction': 0.9869954461080779, 'bagging_fraction': 0.9884821281676851, 'min_data_in_leaf': 201, 'lambda_l1': 0.002784639396262083, 'lambda_l2': 1.1764696585587135, 'max_bin': 69, 'num_leaves': 64, 'random_state': 6633}.
[I 2019-07-07 07:14:55,723] Finished trial#78 resulted in value: 1.0904686031581896. Current best value is 1.0456198145537663 with parameters: {'learning_rate': 0.4035014679207284, 

[I 2019-07-07 07:26:48,349] Finished trial#114 resulted in value: 1.3120718276641847. Current best value is 1.0456198145537663 with parameters: {'learning_rate': 0.4035014679207284, 'feature_fraction': 0.9869954461080779, 'bagging_fraction': 0.9884821281676851, 'min_data_in_leaf': 201, 'lambda_l1': 0.002784639396262083, 'lambda_l2': 1.1764696585587135, 'max_bin': 69, 'num_leaves': 64, 'random_state': 6633}.
[I 2019-07-07 07:27:08,665] Finished trial#115 resulted in value: 1.082852551031705. Current best value is 1.0456198145537663 with parameters: {'learning_rate': 0.4035014679207284, 'feature_fraction': 0.9869954461080779, 'bagging_fraction': 0.9884821281676851, 'min_data_in_leaf': 201, 'lambda_l1': 0.002784639396262083, 'lambda_l2': 1.1764696585587135, 'max_bin': 69, 'num_leaves': 64, 'random_state': 6633}.
[I 2019-07-07 07:27:31,710] Finished trial#116 resulted in value: 1.062745786098675. Current best value is 1.0456198145537663 with parameters: {'learning_rate': 0.4035014679207284

[I 2019-07-07 07:40:01,888] Finished trial#152 resulted in value: 1.075502326594658. Current best value is 1.0440839872585042 with parameters: {'learning_rate': 0.45834436387727595, 'feature_fraction': 0.9884278635419517, 'bagging_fraction': 0.7664243176720027, 'min_data_in_leaf': 438, 'lambda_l1': 0.04963072572218125, 'lambda_l2': 0.46776469467041915, 'max_bin': 84, 'num_leaves': 64, 'random_state': 5281}.
[I 2019-07-07 07:40:23,696] Finished trial#153 resulted in value: 1.0721126124772666. Current best value is 1.0440839872585042 with parameters: {'learning_rate': 0.45834436387727595, 'feature_fraction': 0.9884278635419517, 'bagging_fraction': 0.7664243176720027, 'min_data_in_leaf': 438, 'lambda_l1': 0.04963072572218125, 'lambda_l2': 0.46776469467041915, 'max_bin': 84, 'num_leaves': 64, 'random_state': 5281}.
[I 2019-07-07 07:40:45,133] Finished trial#154 resulted in value: 1.078618709455188. Current best value is 1.0440839872585042 with parameters: {'learning_rate': 0.45834436387727

[I 2019-07-07 07:54:26,508] Finished trial#194 resulted in value: 1.3716220647630661. Current best value is 1.0439918941575732 with parameters: {'learning_rate': 0.42518789717977856, 'feature_fraction': 0.8950777905177992, 'bagging_fraction': 0.6760716081410235, 'min_data_in_leaf': 315, 'lambda_l1': 0.02984794013041554, 'lambda_l2': 0.008873656843196034, 'max_bin': 86, 'num_leaves': 64, 'random_state': 1772}.
[I 2019-07-07 07:54:46,940] Finished trial#195 resulted in value: 1.0741941632498888. Current best value is 1.0439918941575732 with parameters: {'learning_rate': 0.42518789717977856, 'feature_fraction': 0.8950777905177992, 'bagging_fraction': 0.6760716081410235, 'min_data_in_leaf': 315, 'lambda_l1': 0.02984794013041554, 'lambda_l2': 0.008873656843196034, 'max_bin': 86, 'num_leaves': 64, 'random_state': 1772}.
[I 2019-07-07 07:55:07,021] Finished trial#196 resulted in value: 1.1338977016676253. Current best value is 1.0439918941575732 with parameters: {'learning_rate': 0.4251878971

In [90]:
# Take the parameter set of the 'tune hyperparam cv5' record with the lowest mean validation
# metric.  The lookup covers every such record present in mytrial, not only the latest study.
df_trial = pd.DataFrame(mytrial)
param = df_trial[(df_trial['message']=='tune hyperparam cv5')].sort_values(by=['val_metric_mean'], ascending=True)['param'].tolist()[0]

In [91]:
# Number of features in the selected parameter set.
len(param['columns'])

44

In [89]:
# Ten 'tune hyperparam cv5' records with the lowest mean validation metric.
df_trial[(df_trial['message']=='tune hyperparam cv5')].sort_values(by=['val_metric_mean'], ascending=True).head(10)

Unnamed: 0,datetime,df_feature_importances,df_his,df_test_pred,df_valid_pred,message,nfeatures,param,train_metric_mean,train_metric_std,trn_val_metric_diff,val_metric_mean,val_metric_std
1141,2019-07-07 07:52:24.644752,,,,,tune hyperparam cv5,44,"{'columns': ['distance_c0', 'cos_center0_cente...",1.011857,0.002656,0.032135,1.043992,0.001257
1101,2019-07-07 07:38:29.308360,,,,,tune hyperparam cv5,44,"{'columns': ['distance_c0', 'cos_center0_cente...",1.011289,0.003449,0.032795,1.044084,0.003057
1008,2019-07-07 07:07:12.754866,,,,,tune hyperparam cv5,44,"{'columns': ['distance_c0', 'cos_center0_cente...",1.013154,0.00156,0.032466,1.04562,0.004736
716,2019-07-07 03:24:25.743022,,,,,tune hyperparam cv5,43,"{'columns': ['cos_center0_center1', 'distance'...",1.0165,0.003093,0.031919,1.048419,0.00579
719,2019-07-07 03:25:40.934811,,,,,tune hyperparam cv5,43,"{'columns': ['cos_center0_center1', 'distance'...",1.022806,0.001774,0.027477,1.050283,0.00551
717,2019-07-07 03:24:50.670912,,,,,tune hyperparam cv5,43,"{'columns': ['cos_center0_center1', 'distance'...",1.023751,0.00171,0.027573,1.051324,0.002018
1009,2019-07-07 07:07:35.226099,,,,,tune hyperparam cv5,44,"{'columns': ['distance_c0', 'cos_center0_cente...",1.01532,0.00332,0.036011,1.051331,0.002211
1126,2019-07-07 07:47:22.303356,,,,,tune hyperparam cv5,44,"{'columns': ['distance_c0', 'cos_center0_cente...",1.020443,0.003447,0.031182,1.051625,0.003643
1102,2019-07-07 07:38:52.419883,,,,,tune hyperparam cv5,44,"{'columns': ['distance_c0', 'cos_center0_cente...",1.018959,0.004278,0.032951,1.05191,0.002785
718,2019-07-07 03:25:15.499352,,,,,tune hyperparam cv5,43,"{'columns': ['cos_center0_center1', 'distance'...",1.023137,0.002259,0.029043,1.05218,0.003197


In [92]:
# Cross-validate the selected parameter set on the full df_train and predict df_test in every
# fold; trial_level=1 also stores the fold history and prediction frames in the trial record.
df_his, df_feature_importances, df_valid_pred, df_test_pred = sk_process(df_train, param, 'try tuned param cv5', df_test=df_test, trial=mytrial, is_output_feature_importance=False, trial_level=1)

In [93]:
# NOTE(review): `db` is only created by the commented-out DFDB(...) line below, so on a fresh
# kernel this cell depends on a `db` object from an earlier session — confirm before Run All.
# NOTE(review): the trial tables shown further down list identical records under several row
# labels (e.g. 1137 / 1889 / 2842); presumably each insert of the whole `mytrial` list stores
# earlier records again — verify against DFDB.insert.
# db = DFDB('../../data/trial/lgbm_trial.pkl')
db.insert(mytrial)
# db.commit()
df_trial = db.select()

In [82]:
# Five 'width_frist_rfe cv5' records with the lowest mean validation metric.
df_trial[df_trial['message'] == 'width_frist_rfe cv5'][['datetime', 'message', 'nfeatures', 'train_metric_mean', 'val_metric_mean', 'trn_val_metric_diff']].sort_values(by=['val_metric_mean']).head()

Unnamed: 0,datetime,message,nfeatures,train_metric_mean,val_metric_mean,trn_val_metric_diff
1603,2019-07-07 02:05:40.763631,width_frist_rfe cv5,44,1.376339,1.385574,0.009235
851,2019-07-07 02:05:40.763631,width_frist_rfe cv5,44,1.376339,1.385574,0.009235
785,2019-07-07 01:52:53.036896,width_frist_rfe cv5,45,1.377695,1.386632,0.008938
1537,2019-07-07 01:52:53.036896,width_frist_rfe cv5,45,1.377695,1.386632,0.008938
882,2019-07-07 02:11:32.328712,width_frist_rfe cv5,43,1.378766,1.387437,0.008671


In [94]:
# Summary columns of every stored 'try tuned param cv5' record.
df_trial[df_trial['message'] == 'try tuned param cv5'][['datetime', 'message', 'nfeatures', 'train_metric_mean', 'val_metric_mean', 'trn_val_metric_diff']]

Unnamed: 0,datetime,message,nfeatures,train_metric_mean,val_metric_mean,trn_val_metric_diff
1137,2019-07-07 03:38:04.106983,try tuned param cv5,43,1.030377,1.038493,0.008116
1889,2019-07-07 03:38:04.106983,try tuned param cv5,43,1.030377,1.038493,0.008116
2842,2019-07-07 03:38:04.106983,try tuned param cv5,43,1.030377,1.038493,0.008116
3244,2019-07-07 08:18:21.110466,try tuned param cv5,44,1.01626,1.02409,0.007829


In [96]:
# Summary columns of every stored 'try tuned param cv5 add depth range' record.
# NOTE(review): no visible cell logs a run under this message — presumably these records come
# from an earlier session stored in the trial database.
df_trial[df_trial['message'] == 'try tuned param cv5 add depth range'][['datetime', 'message', 'nfeatures', 'train_metric_mean', 'val_metric_mean', 'trn_val_metric_diff']]

Unnamed: 0,datetime,message,nfeatures,train_metric_mean,val_metric_mean,trn_val_metric_diff
2090,2019-07-07 06:42:43.053457,try tuned param cv5 add depth range,43,0.799803,0.819977,0.020174
3043,2019-07-07 06:42:43.053457,try tuned param cv5 add depth range,43,0.799803,0.819977,0.020174


In [95]:
# Number of stored trial records per message tag.
df_trial.message.value_counts()

tune hyperparam                        869
tune hyperparam cv5                    800
width_frist_rfe                        712
width_frist_rfe cv5                    405
tune hyperparam cv5 add depth range    400
rfe                                     24
rfe cv5                                 18
try tuned param                          4
first try                                4
try tuned param cv5                      4
first try cv5                            3
try tuned param cv5 add depth range      2
Name: message, dtype: int64

In [97]:
# Side-by-side summary of four hand-picked records; the row labels are hard-coded and refer to
# the contents of df_trial at the time this cell was run.
#[df_trial['message'] == 'try tuned param cv5']
df_trial.sort_values(by=['val_metric_mean'])[['datetime', 'message', 'nfeatures', 'train_metric_mean', 'val_metric_mean', 'trn_val_metric_diff']].loc[[385, 1137, 2090, 3043]]

Unnamed: 0,datetime,message,nfeatures,train_metric_mean,val_metric_mean,trn_val_metric_diff
385,2019-07-07 00:39:52.447019,try tuned param,43,1.030377,1.038493,0.008116
1137,2019-07-07 03:38:04.106983,try tuned param cv5,43,1.030377,1.038493,0.008116
2090,2019-07-07 06:42:43.053457,try tuned param cv5 add depth range,43,0.799803,0.819977,0.020174
3043,2019-07-07 06:42:43.053457,try tuned param cv5 add depth range,43,0.799803,0.819977,0.020174


In [59]:
# Features used by record 385 that record 1137 does not use (an empty set means there are none).
set(df_trial.loc[385]['param']['columns']) - set(df_trial.loc[1137]['param']['columns']) 

set()

In [52]:
# NOTE: this repeats, line for line, the group_mean_log_mae definition from the helper cell
# earlier in the notebook; running this cell simply rebinds the same name.
def group_mean_log_mae(y_true, y_pred, groups, floor=1e-9):
    """Mean over groups of log(MAE within the group), with each group MAE floored at ``floor``."""
    maes = (y_true-y_pred).abs().groupby(groups).mean()
    return np.log(maes.map(lambda x: max(x, floor))).mean()

In [53]:
# Group-wise log-MAE of the out-of-fold predictions, grouped by molecule_name.
# NOTE(review): the subtraction inside group_mean_log_mae aligns the two Series on their index
# labels, so this assumes df_valid_pred (sorted by 'index', positional index) lists the rows in
# the same order as df_train — confirm, e.g. by merging on 'index' first.
# NOTE(review): grouping is per molecule here; confirm this is the intended grouping column.
gmlogmae = group_mean_log_mae(df_train.y, df_valid_pred.predict, df_train.molecule_name)

In [54]:
# Display the score computed in the previous cell.
gmlogmae

0.04762889832183193

In [80]:
# NOTE(review): presumably persists the inserted trial records to the DFDB file — confirm
# against the DFDB implementation.
db.commit()

In [98]:
# Build a submission from one stored trial record: average its per-fold test predictions
# (every column except 'index') row-wise and write them next to the original ids.
# The row label is hard-coded and refers to df_trial as it was when this cell was run.
idx=3043
df_test_pred = df_trial.loc[idx]['df_test_pred']
df_submit = pd.DataFrame()
df_submit['scalar_coupling_constant'] = np.mean(df_test_pred.drop(columns=['index']).values, axis=1)
df_submit['id'] = df_test_pred['index']
df_submit.to_csv('../../data/submission/submission_lgbm_{}.csv'.format(idx), index=False)

In [100]:
# Last rows of the per-fold test predictions (one column per fold, labelled by fold number).
df_test_pred.tail()

Unnamed: 0,index,0,1,2,3,4
2505537,7163684,0.327024,0.734119,0.987375,1.401807,0.915071
2505538,7163685,2.334262,1.679179,1.996939,1.310548,3.637714
2505539,7163686,3.469656,4.735478,5.64426,4.280331,3.540652
2505540,7163687,3.777367,3.884637,4.108416,4.695168,3.362576
2505541,7163688,120.11741,122.06681,120.087277,119.986718,120.531373
