# LANL Earhquake prediction

In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
import numpy as np
import pandas as pd

pd.options.display.float_format = '{:,.10f}'.format

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)

# Visualizations
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
sns.set()

In [3]:
FEATURES_BASE = '../data/processed/train/features_base.csv'
FEATURES_BASE_DENOISE = '../data/processed/train/features_base_denoise.csv'
FEATURES_FOLDS_DENOISE = '../data/processed/train/features_folds_denoise.csv'
FEATURES_TSFRESH = '../data/processed/train/features_tsfresh.csv'

In [4]:
features_base = pd.read_csv(FEATURES_BASE,index_col='id')
features_base_denoise = pd.read_csv(FEATURES_BASE_DENOISE,index_col='id')
features_folds_denoise = pd.read_csv(FEATURES_FOLDS_DENOISE,index_col='id')
features_tsfresh = pd.read_csv(FEATURES_TSFRESH,index_col='id')

In [5]:
features_tsfresh.head(2)

Unnamed: 0_level_0,x__abs_energy,x__absolute_sum_of_changes,"x__agg_autocorrelation__f_agg_""mean""__maxlag_40","x__agg_autocorrelation__f_agg_""median""__maxlag_40","x__agg_autocorrelation__f_agg_""var""__maxlag_40","x__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""intercept""","x__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""rvalue""","x__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""slope""","x__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""stderr""","x__agg_linear_trend__f_agg_""max""__chunk_len_50__attr_""intercept""",...,x__symmetry_looking__r_0.9500000000000001,x__time_reversal_asymmetry_statistic__lag_1,x__time_reversal_asymmetry_statistic__lag_2,x__time_reversal_asymmetry_statistic__lag_3,x__value_count__value_-1,x__value_count__value_0,x__value_count__value_1,x__variance,x__variance_larger_than_standard_deviation,y
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,10247.0,391980.0,-0.0196169172,-0.0292105689,0.0771727933,11.4728051641,-0.1604799308,-0.0001705898,8.5674e-06,15.5494230812,...,1.0,0.2592567901,3.8303954772,-4.1183647346,3622.0,5741.0,8406.0,26.0211102805,1.0,1.4307971859
1,31377.0,405226.0,-0.0296423639,-0.0473553483,0.0950035692,10.8740597116,-0.0420251683,-6.53812e-05,1.26924e-05,14.9988208375,...,1.0,-6.4017320231,-0.2259793594,2.8093990426,3888.0,5853.0,8557.0,43.4123094122,1.0,1.3914988931


In [6]:
y_all = features_base['y']
X_all = features_base.drop('y',axis=1).join(features_base_denoise.drop('y',axis=1),rsuffix='bd_').join(features_folds_denoise.drop('y',axis=1),rsuffix='fdd_').join(features_tsfresh.drop('y',axis=1),rsuffix='ts_')
# X_all = features_folds_denoise.drop('y',axis=1).join(feature.drop('y',axis=1))
# X_all = data_s.drop('y',axis=1)
# X_all = data_d_f.drop('y',axis=1).join(data_s.drop('y',axis=1),rsuffix='s_').join(data_f.drop('y',axis=1),rsuffix='f_')
# X_all = data_f.drop('y',axis=1).join(data_s.drop('y',axis=1),rsuffix='s_')

In [7]:
print('features_base.shape:',features_base.shape)
print('features_base_denoise.shape:',features_base_denoise.shape)
print('features_folds_denoise.shape:',features_folds_denoise.shape)
print('features_tsfresh.shape:',features_tsfresh.shape)
print('X_all.shape:',X_all.shape)

features_base.shape: (4194, 116)
features_base_denoise.shape: (4194, 116)
features_folds_denoise.shape: (4194, 1151)
features_tsfresh.shape: (4194, 789)
X_all.shape: (4194, 2168)


## Cleaning from NaN,infinity or too large values

In [8]:
if np.any(np.isnan(X_all)):
    X_all.fillna(0,inplace=True)

## Outer Cross-Validation split

In [9]:
from sklearn.model_selection import train_test_split
from src.config.common import RANDOM_STATE
X, X_cross, y, y_cross = train_test_split(X_all, y_all, test_size=0.15, random_state=RANDOM_STATE)

In [10]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

var_selector = VarianceThreshold().fit(X)
print('before',X.shape)
X = X[X.columns[var_selector.get_support(indices=True)]]
print('after',X.shape)

before (3564, 2168)
after (3564, 2076)


In [None]:
kbest_selector = SelectKBest(f_regression, k=100)
kbest_selector = kbest_selector.fit(X, y)
print('before',X.shape)
X = X[X.columns[kbest_selector.get_support(indices=True)]]
print('after',X.shape)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=RANDOM_STATE)

## Model

In [21]:
import xgboost as xgb

params = {
    "objective":"reg:linear",
    'colsample_bytree': 0.3,
    'learning_rate': 0.1,
    'max_depth': 9, 
    'alpha': 3,
    'gpu_id':0,
    'max_bin':32,
    'tree_method':'gpu_hist',
}

In [22]:
dtrain = xgb.DMatrix(X, label=y)

In [23]:
cv_results = xgb.cv(dtrain=dtrain, 
                    params=params, 
                    nfold=3,
                    num_boost_round=500,
                    early_stopping_rounds=100,
                    metrics="mae", 
                    as_pandas=True, 
                    seed=42)
cv_results.head()

Unnamed: 0,train-mae-mean,train-mae-std,test-mae-mean,test-mae-std
0,4.7330963333,0.026172289,4.781314,0.0636717274
1,4.3052666667,0.0269991135,4.3968666667,0.072079412
2,3.918151,0.0251578879,4.057804,0.0838863066
3,3.5683346667,0.0265232677,3.760559,0.0930752768
4,3.2560313333,0.024886294,3.5022263333,0.0981706518


In [16]:
print('before',X_cross.shape)
X_cross = X_cross[X_cross.columns[var_selector.get_support(indices=True)]]
print('after',X_cross.shape)

before (630, 2168)
after (630, 2076)


In [17]:
from sklearn.metrics import mean_absolute_error
prediction = model.predict(X_cross)
mae = mean_absolute_error(y_cross, prediction)
print(mae)

2.0733071282495583


In [19]:
model.save_model('catboost.cbm', 
           format="cbm", 
           export_parameters=None,
           pool=None)

## Submission

In [20]:
FEATURES_BASE = '../data/processed/test/features_base.csv'
FEATURES_BASE_DENOISE = '../data/processed/test/features_base_denoise.csv'
FEATURES_FOLDS_DENOISE = '../data/processed/test/features_folds_denoise.csv'
FEATURES_TSFRESH = '../data/processed/test/features_tsfresh.csv'

In [21]:
features_base = pd.read_csv(FEATURES_BASE,index_col='id')
features_base_denoise = pd.read_csv(FEATURES_BASE_DENOISE,index_col='id')
features_folds_denoise = pd.read_csv(FEATURES_FOLDS_DENOISE,index_col='id')
features_tsfresh = pd.read_csv(FEATURES_TSFRESH,index_col='id')

In [22]:
X_all = features_base.join(features_base_denoise,rsuffix='bd_').join(features_folds_denoise,rsuffix='fdd_').join(features_tsfresh,rsuffix='ts_')

In [23]:
print('features_base.shape:',features_base.shape)
print('features_base_denoise.shape:',features_base_denoise.shape)
print('features_folds_denoise.shape:',features_folds_denoise.shape)
print('features_tsfresh.shape:',features_tsfresh.shape)
print('X_all.shape:',X_all.shape)

features_base.shape: (2624, 115)
features_base_denoise.shape: (2624, 115)
features_folds_denoise.shape: (2624, 1150)
features_tsfresh.shape: (2624, 788)
X_all.shape: (2624, 2168)


In [25]:
print('before',X.shape)
X_all = X_all[X_all.columns[var_selector.get_support(indices=True)]]
print('after',X.shape)

before (3564, 2076)
after (3564, 2076)


In [26]:
predictions = model.predict(X_all)

In [39]:
submission = pd.DataFrame()
submission['seg_id'] = X_all.index.values
submission['seg_id'] = submission['seg_id'].apply(lambda seg_id: 'seg_'+seg_id)
submission['time_to_failure'] = predictions

Unnamed: 0,seg_id,time_to_failure
0,seg_00030f,2.810558949
1,seg_0012b5,4.7626902832


In [40]:
submission.to_csv('submission.csv',index=False)
!kaggle competitions submit -f submission.csv -m catboost0.1 LANL-Earthquake-Prediction