# LANL Earhquake prediction

In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
import numpy as np
import pandas as pd

pd.options.display.float_format = '{:,.10f}'.format

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)

# Visualizations
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
sns.set()

In [3]:
FEATURES_BASE = '../data/processed/train/features_base.csv'
FEATURES_BASE_DENOISE = '../data/processed/train/features_base_denoise.csv'
FEATURES_FOLDS_DENOISE = '../data/processed/train/features_folds_denoise.csv'
FEATURES_TSFRESH = '../data/processed/train/features_tsfresh.csv'
FEATURES_SIGNAL = '../data/processed/train/features_signal.csv'
FEATURES_WAVELET = '../data/processed/train/features_wavelet.csv'

In [4]:
%%time
features_base = pd.read_csv(FEATURES_BASE,index_col='id').add_prefix('b_')
features_base_denoise = pd.read_csv(FEATURES_BASE_DENOISE,index_col='id').add_prefix('bd_')
features_folds_denoise = pd.read_csv(FEATURES_FOLDS_DENOISE,index_col='id').add_prefix('fd_')
# features_tsfresh = pd.read_csv(FEATURES_TSFRESH,index_col='id').add_prefix('ts_')
features_signal = pd.read_csv(FEATURES_SIGNAL,index_col='id').add_prefix('sg_')
features_wavelet = pd.read_csv(FEATURES_WAVELET,index_col='id').add_prefix('wv_')


Columns (0) have mixed types. Specify dtype option on import or set low_memory=False.



CPU times: user 8.71 s, sys: 389 ms, total: 9.1 s
Wall time: 9.1 s


In [5]:
y_all = features_base['b_y']
df_all = features_base.join(features_base_denoise).join(features_folds_denoise).join(features_signal).join(features_wavelet)
df = df_all.drop(['bd_y','fd_y','sg_y','wv_y'],axis=1)
X_all = df_all.drop(['b_y','bd_y','fd_y','sg_y','wv_y'],axis=1)

In [6]:
print('features_base.shape:',features_base.shape)
print('features_base_denoise.shape:',features_base_denoise.shape)
print('features_folds_denoise.shape:',features_folds_denoise.shape)
# print('features_tsfresh.shape:',features_tsfresh.shape)
print('features_signal.shape:',features_signal.shape)
print('features_wavelet.shape:',features_wavelet.shape)


print('df_all.shape:',df_all.shape)
print('df.shape:',df.shape)
print('X_all.shape:',X_all.shape)

features_base.shape: (19194, 116)
features_base_denoise.shape: (19194, 116)
features_folds_denoise.shape: (19194, 1151)
features_signal.shape: (19194, 866)
features_wavelet.shape: (19194, 25)
df_all.shape: (19194, 2274)
df.shape: (19194, 2270)
X_all.shape: (19194, 2269)


## Cleaning from NaN,infinity or too large values

In [7]:
if np.any(np.isnan(X_all)):
    X_all.fillna(0,inplace=True)

## Outer Cross-Validation split

In [8]:
from sklearn.model_selection import train_test_split
from src.config.common import RANDOM_STATE
X, X_cross, y, y_cross = train_test_split(X_all, y_all, test_size=0.15, random_state=RANDOM_STATE)

In [9]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

var_selector = VarianceThreshold().fit(X)
print('before',X.shape)
X = X[X.columns[var_selector.get_support(indices=True)]]
print('after',X.shape)

X_cross = X_cross[X_cross.columns[var_selector.get_support(indices=True)]]

before (16314, 2269)
after (16314, 2265)


## Submission dataset

In [10]:
%%time

FEATURES_BASE = '../data/processed/test/features_base.csv'
FEATURES_BASE_DENOISE = '../data/processed/test/features_base_denoise.csv'
FEATURES_FOLDS_DENOISE = '../data/processed/test/features_folds_denoise.csv'
FEATURES_TSFRESH = '../data/processed/test/features_tsfresh.csv'
FEATURES_SIGNAL = '../data/processed/test/features_signal.csv'
FEATURES_WAVELET = '../data/processed/test/features_wavelet.csv'


features_base = pd.read_csv(FEATURES_BASE,index_col='id').add_prefix('b_')
features_base_denoise = pd.read_csv(FEATURES_BASE_DENOISE,index_col='id').add_prefix('bd_')
features_folds_denoise = pd.read_csv(FEATURES_FOLDS_DENOISE,index_col='id').add_prefix('fd_')
# features_tsfresh = pd.read_csv(FEATURES_TSFRESH,index_col='id').add_prefix('ts_')
features_signal = pd.read_csv(FEATURES_SIGNAL,index_col='id').add_prefix('sg_')
features_wavelet = pd.read_csv(FEATURES_WAVELET,index_col='id').add_prefix('wv_')

X_submit = features_base.join(features_base_denoise).join(features_folds_denoise).join(features_signal).join(features_wavelet)

if np.any(np.isnan(X_submit)):
    X_submit.fillna(0,inplace=True)

print('before',X_submit.shape)
X_submit = X_submit[X_submit.columns[var_selector.get_support(indices=True)]]
print('after',X_submit.shape)

before (2624, 2269)
after (2624, 2265)
CPU times: user 1.21 s, sys: 87.8 ms, total: 1.3 s
Wall time: 1.3 s


## LGBM

In [11]:
params = {'num_leaves': 21,
         'min_data_in_leaf': 20,
         'objective':'regression',
         'max_depth': 108,
         'learning_rate': 0.001,
         "boosting": "gbdt",
         "feature_fraction": 0.91,
         "bagging_freq": 1,
         "bagging_fraction": 0.91,
         "bagging_seed": 42,
         "metric": 'mae',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "random_state": 42}

In [12]:
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
import lightgbm as lgb

maes = []
rmses = []
tr_maes = []
tr_rmses = []

submission_preds = np.zeros(len(X_submit))
predictions = np.zeros(len(X_cross))

n_fold = 6
folds = KFold(n_splits=n_fold, shuffle=False, random_state=42)

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
    print('working fold %d' % fold_)
    strLog = "fold {}".format(fold_)
    print(strLog)

    X_train, X_test = X.iloc[trn_idx], X.iloc[val_idx]
    y_train, y_test = y.iloc[trn_idx], y.iloc[val_idx]
    
    scaler = StandardScaler()

    scaler = scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    X_cross_scaled = scaler.transform(X_cross)
    X_submit_scaled = scaler.transform(X_submit)
    
    model = lgb.LGBMRegressor(**params, n_estimators=60000, n_jobs=-1)
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_test, y_test)], 
              eval_metric='mae',
              verbose=1000, 
              early_stopping_rounds=200)

    # predictions
    preds_cross = model.predict(X_cross_scaled)  #, num_iteration=model.best_iteration_)
    predictions += preds_cross / folds.n_splits
    
    preds_submit = model.predict(X_submit_scaled)  #, num_iteration=model.best_iteration_)
    submission_preds += preds_submit / folds.n_splits

    # mean absolute error
    mae = mean_absolute_error(y_cross, preds_cross)
    print('MAE: %.6f' % mae)
    maes.append(mae)

    # root mean squared error
    rmse = mean_squared_error(y_cross, preds_cross)
    print('RMSE: %.6f' % rmse)
    rmses.append(rmse)

    # training for over fit
    preds = model.predict(X_train)  #, num_iteration=model.best_iteration_)

    mae = mean_absolute_error(y_train, preds)
    print('Tr MAE: %.6f' % mae)
    tr_maes.append(mae)

    rmse = mean_squared_error(y_train, preds)
    print('Tr RMSE: %.6f' % rmse)
    tr_rmses.append(rmse)

print('MAEs', maes)
print('MAE mean: %.6f' % np.mean(maes))
print('RMSEs', rmses)
print('RMSE mean: %.6f' % np.mean(rmses))

print('Tr MAEs', tr_maes)
print('Tr MAE mean: %.6f' % np.mean(tr_maes))
print('Tr RMSEs', rmses)
print('Tr RMSE mean: %.6f' % np.mean(tr_rmses))

working fold 0
fold 0
Training until validation scores don't improve for 200 rounds.
[1000]	training's l1: 2.23028	valid_1's l1: 2.33067
[2000]	training's l1: 2.04613	valid_1's l1: 2.18009
[3000]	training's l1: 1.96625	valid_1's l1: 2.13806
[4000]	training's l1: 1.90878	valid_1's l1: 2.12037
[5000]	training's l1: 1.86037	valid_1's l1: 2.1124
[6000]	training's l1: 1.81676	valid_1's l1: 2.10758
[7000]	training's l1: 1.77601	valid_1's l1: 2.1037
[8000]	training's l1: 1.73741	valid_1's l1: 2.10083
[9000]	training's l1: 1.70073	valid_1's l1: 2.09808
[10000]	training's l1: 1.6654	valid_1's l1: 2.09569
[11000]	training's l1: 1.63122	valid_1's l1: 2.09393
[12000]	training's l1: 1.59848	valid_1's l1: 2.09205
[13000]	training's l1: 1.56658	valid_1's l1: 2.09015
[14000]	training's l1: 1.53581	valid_1's l1: 2.08818
[15000]	training's l1: 1.50585	valid_1's l1: 2.08653
[16000]	training's l1: 1.47691	valid_1's l1: 2.08479
[17000]	training's l1: 1.4486	valid_1's l1: 2.08327
[18000]	training's l1: 1.42

[49000]	training's l1: 0.81667	valid_1's l1: 2.03013
[50000]	training's l1: 0.80294	valid_1's l1: 2.02954
[51000]	training's l1: 0.789461	valid_1's l1: 2.02904
[52000]	training's l1: 0.776207	valid_1's l1: 2.02864
[53000]	training's l1: 0.763215	valid_1's l1: 2.02816
[54000]	training's l1: 0.750444	valid_1's l1: 2.02771
[55000]	training's l1: 0.737845	valid_1's l1: 2.02719
[56000]	training's l1: 0.725538	valid_1's l1: 2.02671
Early stopping, best iteration is:
[56185]	training's l1: 0.723258	valid_1's l1: 2.02658
MAE: 1.983583
RMSE: 6.669230
Tr MAE: 0.723258
Tr RMSE: 0.918912
working fold 4
fold 4
Training until validation scores don't improve for 200 rounds.
[1000]	training's l1: 2.24137	valid_1's l1: 2.27005
[2000]	training's l1: 2.0548	valid_1's l1: 2.12858
[3000]	training's l1: 1.97272	valid_1's l1: 2.09279
[4000]	training's l1: 1.91462	valid_1's l1: 2.07866
[5000]	training's l1: 1.86593	valid_1's l1: 2.07091
[6000]	training's l1: 1.82207	valid_1's l1: 2.06589
[7000]	training's l1:

## Submission

In [13]:
X_submit

Unnamed: 0_level_0,b_ave,b_std,b_max,b_min,b_q90,b_q95,b_q99,b_q05,b_q10,b_q01,...,wv_pk_idx_7,wv_pk_val_7,wv_pk_idx_8,wv_pk_val_8,wv_pk_idx_9,wv_pk_val_9,wv_pk_idx_10,wv_pk_val_10,wv_pk_idx_11,wv_pk_val_11
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00030f,4.4917800000,4.8936733747,115.0000000000,-75.0000000000,9.0000000000,11.0000000000,18.0000000000,-2.0000000000,0.0000000000,-8.0000000000,...,15653.0000000000,43.1077252632,78519.0000000000,42.5439614293,15840.0000000000,42.1633338627,934.0000000000,42.1493925944,15679.0000000000,41.4619936084
0012b5,4.1711533333,5.9228197004,152.0000000000,-140.0000000000,9.0000000000,11.0000000000,20.0000000000,-2.0000000000,-1.0000000000,-12.0000000000,...,90197.0000000000,46.5625994908,26763.0000000000,46.3244587671,26964.0000000000,45.5342574615,132314.0000000000,45.3599362610,132258.0000000000,41.9284128458
00184e,4.6102600000,6.9469669208,248.0000000000,-193.0000000000,9.0000000000,11.0000000000,20.0000000000,-2.0000000000,0.0000000000,-11.0000000000,...,26571.0000000000,71.6021789632,104556.0000000000,61.7208995915,104693.0000000000,46.4601718572,104857.0000000000,45.1017787632,22195.0000000000,44.1262408118
003339,4.5314733333,4.1141328891,85.0000000000,-93.0000000000,8.0000000000,10.0000000000,14.0000000000,-1.0000000000,1.0000000000,-5.0000000000,...,53157.0000000000,51.6508326762,72241.0000000000,46.4223129393,72257.0000000000,45.9156931754,72449.0000000000,43.7962507857,72334.0000000000,40.1087096743
0042cc,4.1283400000,5.7971443123,177.0000000000,-147.0000000000,9.0000000000,10.0000000000,19.0000000000,-2.0000000000,0.0000000000,-10.0000000000,...,52604.0000000000,53.4068827278,141835.0000000000,52.6046876628,44300.0000000000,51.5017829693,44330.0000000000,49.1002929017,6633.0000000000,49.0007103602
004314,4.1486066667,24.7826862156,671.0000000000,-675.0000000000,13.0000000000,20.0000000000,58.0000000000,-12.0000000000,-5.0000000000,-52.0000000000,...,21094.0000000000,311.5413582539,20395.0000000000,273.1508418453,19294.0000000000,254.7264063695,20845.0000000000,221.1018272782,21504.0000000000,212.7345806167
004cd2,4.1139866667,4.7071343412,125.0000000000,-107.0000000000,8.0000000000,10.0000000000,15.0000000000,-2.0000000000,0.0000000000,-7.0000000000,...,25034.0000000000,34.2572289896,74343.0000000000,32.4855186752,64204.0000000000,30.2661235586,41921.0000000000,28.8579630744,41906.0000000000,28.4187335404
004ee5,4.3283800000,5.9644234068,120.0000000000,-120.0000000000,9.0000000000,12.0000000000,21.0000000000,-3.0000000000,-1.0000000000,-13.0000000000,...,97564.0000000000,64.5935819062,68070.0000000000,52.1296690031,32454.0000000000,50.6438132246,104446.0000000000,50.0337619509,62104.0000000000,49.1408638277
004f1f,4.0007333333,5.8744497157,118.0000000000,-114.0000000000,9.0000000000,11.0000000000,20.0000000000,-3.0000000000,-1.0000000000,-12.0000000000,...,8380.0000000000,48.7035956934,62935.0000000000,48.1958990226,27177.0000000000,45.9154351868,8637.0000000000,45.4029489335,8211.0000000000,44.7906960373
00648a,4.4588000000,8.9264466181,281.0000000000,-258.0000000000,10.0000000000,12.0000000000,26.0000000000,-3.0000000000,-1.0000000000,-17.0000000000,...,121832.0000000000,98.0435573113,83844.0000000000,93.9986037384,102112.0000000000,90.2373270048,102020.0000000000,73.7944874923,122827.0000000000,64.8001019045


In [14]:
submission = pd.DataFrame()
submission['seg_id'] = X_submit.index
submission['seg_id'] = submission['seg_id'].apply(lambda seg_id: 'seg_'+seg_id)
submission['time_to_failure'] = submission_preds

In [15]:
submission_filename = 'submission_lgbm.csv'
submission.to_csv(submission_filename,index=False)
!kaggle competitions submit -f $submission_filename -m lgbm0.2_sm_segments LANL-Earthquake-Prediction

100%|██████████████████████████████████████| 75.0k/75.0k [00:03<00:00, 19.3kB/s]
Successfully submitted to LANL Earthquake Prediction