# LANL Earthquake prediction

In [1]:
import os
import sys
# Resolve the notebook's parent directory and register it on sys.path (only once),
# so packages living there can be imported from this notebook.
module_path = os.path.abspath(os.pardir)
already_registered = module_path in sys.path
if not already_registered:
    sys.path.append(module_path)

In [2]:
import numpy as np
import pandas as pd

# Render floats in DataFrame output with thousands separators and 10 decimal places.
pd.options.display.float_format = '{:,.10f}'.format

import warnings
# Suppress Future/User/Runtime warnings for the whole session.
# NOTE(review): ignoring RuntimeWarning also hides numerical warnings raised while
# computing or scaling features — confirm this is intended.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)

# Visualizations
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
# Apply seaborn's default theme to subsequent matplotlib figures.
sns.set()

In [3]:
# Locations of the pre-computed training feature tables (one CSV per feature family).
_TRAIN_DIR = '../data/processed/train'

FEATURES_BASE = _TRAIN_DIR + '/features_base.csv'
FEATURES_BASE_DENOISE = _TRAIN_DIR + '/features_base_denoise.csv'
FEATURES_FOLDS_DENOISE = _TRAIN_DIR + '/features_folds_denoise.csv'
FEATURES_TSFRESH = _TRAIN_DIR + '/features_tsfresh.csv'
FEATURES_SIGNAL = _TRAIN_DIR + '/features_signal.csv'
FEATURES_WAVELET = _TRAIN_DIR + '/features_wavelet.csv'

In [4]:
%%time
def _read_prefixed(csv_path, prefix):
    """Read a feature CSV indexed by its 'id' column and prepend `prefix` to every column name."""
    table = pd.read_csv(csv_path, index_col='id')
    return table.add_prefix(prefix)


# One frame per feature family; the prefix keeps column names unique once the frames are joined.
features_base = _read_prefixed(FEATURES_BASE, 'b_')
features_base_denoise = _read_prefixed(FEATURES_BASE_DENOISE, 'bd_')
features_folds_denoise = _read_prefixed(FEATURES_FOLDS_DENOISE, 'fd_')
features_tsfresh = _read_prefixed(FEATURES_TSFRESH, 'ts_')
features_signal = _read_prefixed(FEATURES_SIGNAL, 'sg_')
features_wavelet = _read_prefixed(FEATURES_WAVELET, 'wv_')

CPU times: user 2.59 s, sys: 111 ms, total: 2.7 s
Wall time: 2.7 s


In [5]:
# Target: the 'y' column of the base feature table (prefixed to 'b_y' on load).
y_all = features_base['b_y']

# Left-join every other feature family onto the base table, one after another, on the shared index.
df_all = features_base
for extra_features in (features_base_denoise,
                       features_folds_denoise,
                       features_tsfresh,
                       features_signal,
                       features_wavelet):
    df_all = df_all.join(extra_features)

# df keeps a single copy of the target ('b_y'); X_all holds features only.
df = df_all.drop(columns=['bd_y', 'fd_y', 'ts_y', 'sg_y', 'wv_y'])
X_all = df.drop(columns=['b_y'])

In [6]:
# Report the shape of every loaded feature table, then of the combined frames.
loaded_tables = (
    ('features_base', features_base),
    ('features_base_denoise', features_base_denoise),
    ('features_folds_denoise', features_folds_denoise),
    ('features_tsfresh', features_tsfresh),
    ('features_signal', features_signal),
    ('features_wavelet', features_wavelet),
)
for table_name, table in loaded_tables:
    print(table_name + '.shape:', table.shape)

combined_tables = (('df_all', df_all), ('df', df), ('X_all', X_all))
for table_name, table in combined_tables:
    print(table_name + '.shape:', table.shape)

features_base.shape: (4194, 116)
features_base_denoise.shape: (4194, 116)
features_folds_denoise.shape: (4194, 1151)
features_tsfresh.shape: (4194, 789)
features_signal.shape: (4194, 866)
features_wavelet.shape: (4194, 25)
df_all.shape: (4194, 3063)
df.shape: (4194, 3058)
X_all.shape: (4194, 3057)


## Cleaning NaN, infinity and too-large values

In [7]:
# Zero-fill every non-finite entry of the feature matrix.
# fillna() alone leaves +/-inf untouched, so infinities are mapped to NaN first and a
# single fillna(0) then covers both cases. The result is rebound to X_all instead of
# mutating in place, so re-running this cell is harmless.
X_all = X_all.replace([np.inf, -np.inf], np.nan).fillna(0)

## Outer Cross-Validation split

In [8]:
from sklearn.model_selection import train_test_split
from src.config.common import RANDOM_STATE

# Hold out 15% of the rows (X_cross / y_cross) as an outer evaluation set;
# the remaining 85% (X / y) is what the K-fold loop further down trains on.
X, X_cross, y, y_cross = train_test_split(
    X_all,
    y_all,
    test_size=0.15,
    random_state=RANDOM_STATE,
)

In [9]:
from sklearn.feature_selection import SelectKBest, VarianceThreshold, f_regression

# Fit the zero-variance filter on the training part only, then keep the surviving
# columns in both the training part and the outer hold-out.
var_selector = VarianceThreshold().fit(X)
kept_positions = var_selector.get_support(indices=True)

print('before',X.shape)
X = X[X.columns[kept_positions]]
print('after',X.shape)

X_cross = X_cross[X_cross.columns[kept_positions]]

before (3564, 3057)
after (3564, 2916)


## Submission dataset

In [10]:
%%time

# Test-set feature files. They get their own TEST_* / *_test names so the training
# paths and training frames defined earlier are not overwritten; that keeps the
# earlier cells re-runnable after this one has executed.
TEST_FEATURES_BASE = '../data/processed/test/features_base.csv'
TEST_FEATURES_BASE_DENOISE = '../data/processed/test/features_base_denoise.csv'
TEST_FEATURES_FOLDS_DENOISE = '../data/processed/test/features_folds_denoise.csv'
TEST_FEATURES_TSFRESH = '../data/processed/test/features_tsfresh.csv'
TEST_FEATURES_SIGNAL = '../data/processed/test/features_signal.csv'
TEST_FEATURES_WAVELET = '../data/processed/test/features_wavelet.csv'

# Same column prefixes as the training tables so feature names line up.
features_base_test = pd.read_csv(TEST_FEATURES_BASE, index_col='id').add_prefix('b_')
features_base_denoise_test = pd.read_csv(TEST_FEATURES_BASE_DENOISE, index_col='id').add_prefix('bd_')
features_folds_denoise_test = pd.read_csv(TEST_FEATURES_FOLDS_DENOISE, index_col='id').add_prefix('fd_')
features_tsfresh_test = pd.read_csv(TEST_FEATURES_TSFRESH, index_col='id').add_prefix('ts_')
features_signal_test = pd.read_csv(TEST_FEATURES_SIGNAL, index_col='id').add_prefix('sg_')
features_wavelet_test = pd.read_csv(TEST_FEATURES_WAVELET, index_col='id').add_prefix('wv_')

X_submit = (
    features_base_test
    .join(features_base_denoise_test)
    .join(features_folds_denoise_test)
    .join(features_tsfresh_test)
    .join(features_signal_test)
    .join(features_wavelet_test)
)

# Zero-fill NaN and +/-inf (fillna alone would leave infinities in place).
X_submit = X_submit.replace([np.inf, -np.inf], np.nan).fillna(0)

print('before',X_submit.shape)
# Pick exactly the columns kept for training, by name and in training order.
# Selecting by label raises a KeyError if a feature is missing, instead of silently
# picking the wrong columns when the test files are ordered differently.
X_submit = X_submit[X.columns]
print('after',X_submit.shape)

before (2624, 3057)
after (2624, 2916)
CPU times: user 1.68 s, sys: 118 ms, total: 1.8 s
Wall time: 1.8 s


## LGBM

In [11]:
# LightGBM hyper-parameters; the training cell unpacks them into lgb.LGBMRegressor(**params).
params = dict(
    # tree shape
    num_leaves=21,
    min_data_in_leaf=20,
    objective='regression',
    max_depth=108,
    # boosting
    learning_rate=0.001,
    boosting='gbdt',
    # column / row subsampling
    feature_fraction=0.91,
    bagging_freq=1,
    bagging_fraction=0.91,
    bagging_seed=42,
    # evaluation, regularisation, logging, seeding
    metric='mae',
    lambda_l1=0.1,
    verbosity=-1,
    random_state=42,
)

In [None]:
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
import lightgbm as lgb

# Per-fold scores: on the outer hold-out (X_cross) and on each fold's own training rows.
maes = []
rmses = []
tr_maes = []
tr_rmses = []

# Fold-averaged predictions for the submission set and for the outer hold-out.
submission_preds = np.zeros(len(X_submit))
predictions = np.zeros(len(X_cross))

n_fold = 6
# random_state only has an effect when shuffle=True; recent scikit-learn versions
# raise a ValueError if it is set while shuffle=False, so it is not passed here.
folds = KFold(n_splits=n_fold, shuffle=False)

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
    print('working fold %d' % fold_)

    X_train, X_test = X.iloc[trn_idx], X.iloc[val_idx]
    y_train, y_test = y.iloc[trn_idx], y.iloc[val_idx]

    # Fit the scaler on this fold's training rows only, then apply it to every other split.
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    X_cross_scaled = scaler.transform(X_cross)
    X_submit_scaled = scaler.transform(X_submit)

    # Large n_estimators acts as an upper bound; early stopping (200 rounds without
    # improvement) decides the actual number of boosting rounds.
    model = lgb.LGBMRegressor(**params, n_estimators=60000, n_jobs=-1)
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_test, y_test)],
              eval_metric='mae',
              verbose=1000,
              early_stopping_rounds=200)

    # Each fold contributes 1/n_splits of the averaged predictions.
    preds_cross = model.predict(X_cross_scaled)
    predictions += preds_cross / folds.n_splits

    preds_submit = model.predict(X_submit_scaled)
    submission_preds += preds_submit / folds.n_splits

    # Hold-out mean absolute error
    mae = mean_absolute_error(y_cross, preds_cross)
    print('MAE: %.6f' % mae)
    maes.append(mae)

    # Hold-out root mean squared error (square root of the MSE)
    rmse = np.sqrt(mean_squared_error(y_cross, preds_cross))
    print('RMSE: %.6f' % rmse)
    rmses.append(rmse)

    # Scores on the fold's training rows, to gauge over-fitting
    preds = model.predict(X_train)

    mae = mean_absolute_error(y_train, preds)
    print('Tr MAE: %.6f' % mae)
    tr_maes.append(mae)

    rmse = np.sqrt(mean_squared_error(y_train, preds))
    print('Tr RMSE: %.6f' % rmse)
    tr_rmses.append(rmse)

print('MAEs', maes)
print('MAE mean: %.6f' % np.mean(maes))
print('RMSEs', rmses)
print('RMSE mean: %.6f' % np.mean(rmses))

print('Tr MAEs', tr_maes)
print('Tr MAE mean: %.6f' % np.mean(tr_maes))
print('Tr RMSEs', tr_rmses)
print('Tr RMSE mean: %.6f' % np.mean(tr_rmses))

working fold 0
fold 0
Training until validation scores don't improve for 200 rounds.


## Submission

In [None]:
# Display the submission feature matrix as a visual check before building the submission file.
X_submit

In [None]:
# Assemble the submission table in one step: segment ids get the 'seg_' prefix back
# and are paired with the fold-averaged predictions.
submission = pd.DataFrame({
    'seg_id': ['seg_' + seg_id for seg_id in X_submit.index],
    'time_to_failure': submission_preds,
})

In [None]:
# Write the submission CSV (without the index column) and upload it with the Kaggle CLI.
submission_filename = 'submission_lgbm.csv'
submission.to_csv(submission_filename,index=False)
# NOTE(review): the submission message is "catboost0.2" although this notebook trains a
# LightGBM model and writes submission_lgbm.csv — looks like a copy-paste leftover; confirm.
!kaggle competitions submit -f $submission_filename -m catboost0.2 LANL-Earthquake-Prediction