In [3]:
import os
import gc
import numpy as np
from numpy.fft import *
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns 
from statsmodels.robust import mad
from statsmodels.tsa.stattools import acf
import scipy
from scipy import signal
from scipy.signal import butter, deconvolve
import warnings
# Suppress every Python warning for the rest of the kernel session.
# NOTE(review): this also hides deprecation warnings from pandas/scipy/tqdm,
# which can mask the cause of breakage after a library upgrade.
warnings.filterwarnings('ignore')

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
from matplotlib.pylab import rcParams
import seaborn as sns
from scipy import stats
from tqdm import tqdm_notebook

from IPython.core.interactiveshell import InteractiveShell
# Ask IPython to display the value of every top-level expression in a cell,
# not only the last one.
InteractiveShell.ast_node_interactivity = "all"

%matplotlib inline

In [5]:
# Default size (width, height) for every matplotlib figure created below.
rcParams.update({'figure.figsize': (10, 4)})

In [6]:
# Render floats with 20 digits after the decimal point when pandas displays them.
pd.options.display.precision = 20

In [10]:
%%time
# Load the full training set into `train`.
#
# The directory holding train.csv is read from the LANL_DATA_DIR environment
# variable so the notebook can run on machines other than the author's; when the
# variable is unset, the original hard-coded location is used.
data_dir = os.environ.get('LANL_DATA_DIR', 'D:\\LANL-Earthquake-Prediction')
train_path = os.path.join(data_dir, 'train.csv')
if not os.path.isfile(train_path):
    # Same exception type pandas would raise, but with a hint on how to fix it.
    raise FileNotFoundError(
        f"Training data not found at {train_path!r}; "
        "set the LANL_DATA_DIR environment variable to the folder containing train.csv"
    )
train = pd.read_csv(train_path)

FileNotFoundError: File b'D:\\LANL-Earthquake-Prediction\\train.csv' does not exist

In [2]:
# Keep handy references to the two columns of the training frame and its row count.
acoustic_data = train['acoustic_data']
time_to_failure = train['time_to_failure']
data_len = train.shape[0]
# `train` is deliberately not deleted here: the feature-extraction cell further
# down still slices it.
gc.collect()

NameError: name 'train' is not defined

# Feature Engineering / Extraction

For this project, feature engineering and extraction will be key, since we are given only one input (independent) variable: the seismic signal. Based on our research, the best approach, given our minimal domain knowledge, is to create statistically significant features that we can feed into our models. Our features will focus on two areas:
1. Aggregations
2. Indicators

If time and opportunity allow, adding external features drawn from research could also prove helpful. This would have to wait until after the initial modeling.

# Aggregation Features

In [141]:
# Build one row of summary features (X_train) and one target value (y_train)
# for every non-overlapping window of `rows` consecutive samples in `train`.
#
# NOTE(review): `fc` is not imported in any cell visible here. The
# "tsfresh aggregations" section below presumably expects
# `tsfresh.feature_extraction.feature_calculators` to be bound to `fc` --
# confirm and add that import to the imports cell.

rows = 150_000  # samples per segment
segments = int(np.floor(train.shape[0] / rows))  # a trailing partial window is dropped

# Parameter grids for the parameterised features. They are defined once, outside
# the loop, and are also used to derive the column names so the two cannot drift
# apart (previously lag 100 was computed but had no declared column).
lags = [1, 10, 100, 1000, 10000]
bins = [1, 10, 30, 50, 70, 90, 99]
params = ['centroid', 'variance', 'skew', 'kurtosis']
neighbor = [10, 100, 500]

feature_columns = (
    ['mean', 'std', 'max', 'min', 'sum', 'med',
     'abs_mean', 'abs_std', 'abs_max', 'abs_min', 'abs_med',
     'gmean', 'hmean', 'max_to_min_diff',
     'unique_signals', 'unique_mean', 'unique_std', 'unique_med', 'pct_unique',
     'exp_ma_5000_mean', 'exp_ma_5000_std', 'ma_5000_mean', 'ma_5000_std',
     'abs_energy', 'abs_sum_change']
    + [f'ac_{lag}' for lag in lags]
    + [f'bin_ent_{n_bins}' for n_bins in bins]
    + ['complex_0', 'complex_1']
    + [f'ft_{param}' for param in params]
    + ['mean_abs_chng', 'mean_chng']
    + [f'num_peaks_{n}' for n in neighbor]
    + ['skew', 'kurtosis', 'variance', 'var_larger_std']
)


def _segment_features(x):
    """Return a dict mapping feature name -> value for one 1-D signal window.

    Parameters
    ----------
    x : numpy.ndarray
        The `acoustic_data` values of a single segment.

    Returns
    -------
    dict
        One entry per name in `feature_columns`.
    """
    # Intermediates that several features share; computed once per segment.
    abs_x = np.abs(x)
    nonzero_abs = abs_x[x != 0]  # zeros are excluded before the geometric/harmonic means
    unique_x = np.unique(x)
    series = pd.Series(x)
    # `skipna` is not a parameter of the EWM mean (rejected by current pandas),
    # so it is no longer passed here.
    ewm_mean = series.ewm(span=5000).mean()
    roll_mean = series.rolling(5000).mean().dropna().values

    feats = {
        # plain aggregations
        'mean': x.mean(),
        'std': x.std(),
        'max': x.max(),
        'min': x.min(),
        'sum': x.sum(),  # vectorised; the builtin sum() iterates element by element
        'med': fc.median(x),

        # aggregations of the absolute signal
        'abs_mean': abs_x.mean(),
        'abs_std': abs_x.std(),
        'abs_max': abs_x.max(),
        'abs_min': abs_x.min(),
        'abs_med': fc.median(abs_x),

        'gmean': stats.gmean(nonzero_abs),
        'hmean': stats.hmean(nonzero_abs),

        'max_to_min_diff': x.max() - x.min(),

        # aggregations over the distinct values
        'unique_signals': len(unique_x),
        'unique_mean': unique_x.mean(),
        'unique_std': unique_x.std(),
        'unique_med': fc.median(unique_x),
        # NOTE(review): despite the column name this calls the *reoccurring*
        # values calculator -- confirm that is the intended feature.
        'pct_unique': fc.percentage_of_reoccurring_values_to_all_values(x),

        # mean / std of the exponentially weighted moving average (span 5000)
        'exp_ma_5000_mean': ewm_mean.mean(skipna=True),
        'exp_ma_5000_std': ewm_mean.std(skipna=True),

        # mean / std of the 5000-sample rolling *mean* (warm-up NaNs dropped)
        'ma_5000_mean': roll_mean.mean(),
        'ma_5000_std': roll_mean.std(),

        # tsfresh aggregations
        'abs_energy': fc.abs_energy(x),
        'abs_sum_change': fc.absolute_sum_of_changes(x),

        # 0/1 indicates normalization
        'complex_0': fc.cid_ce(x, 0),
        'complex_1': fc.cid_ce(x, 1),

        'mean_abs_chng': fc.mean_abs_change(x),
        'mean_chng': fc.mean_change(x),

        'skew': fc.skewness(x),
        'kurtosis': fc.kurtosis(x),
        'variance': fc.variance(x),
        # stored as 1.0 / 0.0, as it was when written into the float64 frame
        'var_larger_std': float(fc.variance_larger_than_standard_deviation(x)),
    }

    for lag in lags:
        feats[f'ac_{lag}'] = fc.autocorrelation(x, lag)

    for n_bins in bins:
        feats[f'bin_ent_{n_bins}'] = fc.binned_entropy(x, n_bins)

    for param in params:
        # fft_aggregated yields (name, value) pairs; keep the value of the single request
        feats[f'ft_{param}'] = list(fc.fft_aggregated(x, [{'aggtype': param}]))[0][1]

    for n in neighbor:
        feats[f'num_peaks_{n}'] = fc.number_peaks(x, n)

    return feats


# Collect plain Python records and build the frames once at the end, instead of
# writing into a pre-allocated DataFrame one cell at a time.
feature_records = []
targets = []

# `tqdm_notebook` is the progress-bar name actually imported by this notebook.
for segment in tqdm_notebook(range(segments)):
    seg = train.iloc[segment * rows:(segment + 1) * rows]
    x = seg['acoustic_data'].values
    y = seg['time_to_failure'].values[-1]  # target = time to failure at the end of the window

    feature_records.append(_segment_features(x))
    targets.append(y)

if feature_records:
    # Guard against a feature being computed without a column (or vice versa).
    mismatched = set(feature_records[0]) ^ set(feature_columns)
    assert not mismatched, f"feature/column mismatch: {sorted(mismatched)}"

X_train = pd.DataFrame(feature_records, index=range(segments),
                       columns=feature_columns, dtype=np.float64)

y_train = pd.DataFrame(targets, index=range(segments),
                       columns=['time_to_failure'], dtype=np.float64)
    
    
    