In [1]:
import numpy as np
import pandas as pd
import os
import time
import datetime
import matplotlib.pyplot as plt
%matplotlib inline

from tqdm import tqdm_notebook

from scipy import fftpack
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn import metrics

from sklearn.svm import NuSVR, SVR
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold
from sklearn.linear_model import LinearRegression

import gc
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

from scipy.signal import hilbert
from scipy.signal import hann
from scipy.signal import convolve
from scipy import stats
from sklearn.kernel_ridge import KernelRidge
from itertools import product

from tsfresh.feature_extraction import feature_calculators

import timeit
import multiprocessing as mp

from IPython.display import HTML
from bayes_opt import BayesianOptimization

In [2]:
def printLog(string):
    os.system('echo ' + str(string))
    print(string)

PATH = '../input/'

if os.path.isdir('../input/train.csv') == False:
    PATH = '../input/LANL-Earthquake-Prediction'
    
FOLDER_PATH_TEST = os.path.join(PATH, 'test/')
FILE_PATH_TRAIN = os.path.join(PATH, 'train.csv')
FILE_PATH_SUBMISSION = os.path.join(PATH, 'sample_submission.csv')
printLog('file/folder check ' + PATH + '  : ' + str(os.path.isdir(PATH)))
printLog('file/folder check ' + FOLDER_PATH_TEST + '  : ' + str(os.path.isdir(FOLDER_PATH_TEST)))
printLog('file/folder check ' + FILE_PATH_TRAIN + '  : ' + str(os.path.isfile(FILE_PATH_TRAIN)))
printLog('file/folder check ' + FILE_PATH_SUBMISSION + '  : ' + str(os.path.isfile(FILE_PATH_SUBMISSION)))
pd.options.display.precision = 20
segment_size = 150000

file/folder check ../input/LANL-Earthquake-Prediction  : True
file/folder check ../input/LANL-Earthquake-Prediction/test/  : True
file/folder check ../input/LANL-Earthquake-Prediction/train.csv  : True
file/folder check ../input/LANL-Earthquake-Prediction/sample_submission.csv  : True


In [3]:
train_data = pd.read_csv('../input/0520test/train_data_v15.csv')
train_data.drop(['Unnamed: 0'], inplace=True, axis=1)
test_data = pd.read_csv('../input/0520test/test_data_v15.csv')
test_data.drop(['Unnamed: 0'], inplace=True, axis=1)
printLog('testing data done preparing.')

X = train_data.drop(['target', 'seg_id'], axis=1)
X_test = test_data.drop(['target', 'seg_id'], axis=1)
test_segs = test_data.seg_id
y = train_data.target

del train_data, test_data

means_dict = {}
for col in X.columns:
    if X[col].isnull().any():
        print(col)
        mean_value = X.loc[X[col] != -np.inf, col].mean()
        X.loc[X[col] == -np.inf, col] = mean_value
        X[col] = X[col].fillna(mean_value)
        means_dict[col] = mean_value
        
for col in X_test.columns:
    if X_test[col].isnull().any():
        X_test.loc[X_test[col] == -np.inf, col] = means_dict[col]
        X_test[col] = X_test[col].fillna(means_dict[col])

printLog('total ' + str(X.values[0,:].size) + ' of features are used.')

testing data done preparing.
total 1056 of features are used.


In [4]:
n_fold = 5
folds = KFold(n_splits=n_fold, shuffle=True, random_state=11)

In [5]:
def LGB_CV(max_depth, num_leaves, min_data_in_leaf, feature_fraction, bagging_fraction, lambda_l1, is_unbalance):
#     folds = KFold(n_splits=5, shuffle=True, random_state=15)
    oof = np.zeros(len(X))
    scores = []
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
        printLog('Fold '+ str(fold_n) + ', started at '+ time.ctime())
        if type(X) == np.ndarray:
            X_train, X_valid = X[train_index], X[valid_index]
            y_train, y_valid = y[train_index], y[valid_index]
        else:
            X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    
        param = {'num_leaves': int(num_leaves),
                 'min_data_in_leaf': int(min_data_in_leaf), 
                 'objective':'gamma',
                 'max_depth': int(max_depth),
                 'learning_rate': 0.01,
                 "boosting": "gbdt",
                 "feature_fraction": feature_fraction,
                 "bagging_freq": 1,
                 "bagging_fraction": bagging_fraction ,
                 "bagging_seed": 11,
                 "metric": 'mae',
                 "lambda_l1": lambda_l1,
                 "verbosity": -1,
                 'is_unbalance': bool(is_unbalance >= 0.5)
                }
        
        model = lgb.LGBMRegressor(**param, n_estimators=50000, n_jobs = -1)
        model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric='mae', verbose=-1, early_stopping_rounds=200)

        y_pred_valid = model.predict(X_valid)
        y_pred = model.predict(X_test, num_iteration=model.best_iteration_)
            
        oof[valid_index] = y_pred_valid.reshape(-1,)
        scores.append(mean_absolute_error(y_valid, y_pred_valid))
        score = np.mean(scores)
        
    return score

#     for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, Y_train)):
#         print("fold n°{}".format(fold_))
#         trn_data = lgb.Dataset(X_train[trn_idx], label=Y_train[trn_idx])
#         val_data = lgb.Dataset(X_train[val_id], label=Y_train[val_idx])
    
#         param = {'num_leaves': int(num_leaves),
#                  'min_data_in_leaf': int(min_data_in_leaf), 
#                  'objective':'regression',
#                  'max_depth': int(max_depth),
#                  'learning_rate': 0.01,
#                  "boosting": "gbdt",
#                  "feature_fraction": feature_fraction,
#                  "bagging_freq": 1,
#                  "bagging_fraction": bagging_fraction ,
#                  "bagging_seed": 11,
#                  "metric": 'mae',
#                  "lambda_l1": lambda_l1,
#                  "verbosity": -1
#                 }
    
#         clf = lgb.train(param, trn_data, 5000, valid_sets = [trn_data, val_data], verbose_eval=500001, early_stopping_rounds=500)
        
#         oof[val_idx] = clf.predict(X_train[val_idx], num_iteration=clf.best_iteration)
        
#         del clf, trn_idx, val_idx
        
#     return metrics.r2_score(oof, Y_train)

printLog('BayesianOptimization starting ...')

LGB_BO = BayesianOptimization(LGB_CV, {
    'max_depth': (-1, 100),
    'num_leaves': (5, 300),
    'min_data_in_leaf': (5, 200),
    'feature_fraction': (0.2, 1.0),
    'bagging_fraction': (0.2, 1.0),
    'lambda_l1': (0, 27),
    'is_unbalance': (0, 1)
    })

LGB_BO.maximize(init_points=2, n_iter=36)

BayesianOptimization starting ...
|   iter    |  target   | baggin... | featur... | is_unb... | lambda_l1 | max_depth | min_da... | num_le... |
-------------------------------------------------------------------------------------------------------------
Fold 0, started at Fri May 31 00:29:12 2019
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[815]	training's l1: 1.82019	valid_1's l1: 2.14151
Fold 1, started at Fri May 31 00:29:29 2019
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[888]	training's l1: 1.84636	valid_1's l1: 1.95654
Fold 2, started at Fri May 31 00:29:48 2019
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[609]	training's l1: 1.87204	valid_1's l1: 2.11638
Fold 3, started at Fri May 31 00:30:02 2019
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[558]	training's l1: 1.92

In [6]:
colname = ''
colname += 'score'.center(7, ' ')
for key in LGB_BO.res[0]['params'].keys():
    name = key.center(25, ' ')
    colname += name
print(colname)
for idx, item in enumerate(LGB_BO.res):
    msg = ''
    msg += '{0:7.5f}'.format(item['target'])
    for key in item['params'].keys():
        val = item['params'][key]
        msg += ' {0:24.20f}'.format(val)
    print(msg)

 score      bagging_fraction         feature_fraction           is_unbalance              lambda_l1                max_depth             min_data_in_leaf            num_leaves       
2.03828   0.58086936437085423535   0.74442790399391456369   0.10531407426330630095   8.54574772276745342481  70.70400504168678423866 179.18043157943583310043 207.51382441248193799765
2.04317   0.90351576601473371220   0.54545978800396777153   0.88959837658401785809  24.60295220415360617494  82.10588870462549948570 177.24651627047239799140 166.39136810322483484015
2.04854   0.20000000000000001110   0.20000000000000001110   0.00000000000000000000   0.00000000000000000000  -1.00000000000000000000   5.00000000000000000000   5.00000000000000000000
2.05475   0.86604058114301873950   0.77240844609726022618   0.74968302957642452000   8.45424590360210359563   1.92119511694320399542 198.63510401184532838670   6.74271162798787937476
2.04982   0.26041248561036722808   0.20815882539179630939   0.88854869086583820881   