In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
import gc
import os

import numpy as np
import pandas as pd
import scipy as sp
import xgboost as xgb
from joblib import Parallel, delayed
from scipy import signal, stats
from sklearn import metrics
from sklearn.model_selection import KFold
from tqdm import tqdm_notebook as tqdm
from tsfresh.feature_extraction import feature_calculators
# Alias for the unbound Series.ewm method: ewma(series, span=n) is series.ewm(span=n).
ewma = pd.Series.ewm
import os
# Print the names of the entries in the input directory.
print(os.listdir("../input"))
# Multiplier applied to the rolling standard deviation when the MA_*_BB_high/low
# band features are built in FeatureGenerator.features.
no_of_std = 3
# Any results you write to the current directory are saved as output.

In [None]:
%%time
train      = pd.read_csv('../input/train.csv', dtype={'acoustic_data': np.int16, 'time_to_failure': np.float32})
submission = pd.read_csv('../input/sample_submission.csv', index_col='seg_id')

In [None]:
# Display the first rows of the sample submission (indexed by seg_id).
submission.head()

In [None]:
# Number of complete 150,000-row segments contained in the training frame;
# any remainder shorter than one segment is not counted.
rows = 150_000
segments = train.shape[0] // rows

In [None]:
# Load a single test segment to experiment with.
seg = pd.read_csv('../input/test/seg_00030f.csv')

In [None]:
# Per-column average of the 700-row rolling mean; the leading NaN rows of the
# rolling window are skipped.
seg_rolling_mean_700 = seg.rolling(window=700).mean()
seg_rolling_mean_700.mean(skipna=True)

In [None]:
class FeatureGenerator(object):
    """Build one row of hand-crafted features per acoustic segment.

    In ``'train'`` mode ``../input/train.csv`` is streamed in chunks of
    ``chunk_size`` rows and the last ``time_to_failure`` value of each chunk is
    used as the target. In any other mode every segment listed in
    ``../input/sample_submission.csv`` is read from ``../input/test/`` and the
    target is the placeholder ``-999``.

    Parameters
    ----------
    dtype : str
        ``'train'`` selects the training file; anything else selects the test
        segments.
    n_jobs : int, default 1
        Number of joblib worker threads used by :meth:`generate`.
    chunk_size : int or None
        Rows per segment. Required (positive) in train mode. In test mode the
        last ``chunk_size`` rows of each file are used, or the whole file when
        it is ``None``.
    """

    # Hard-coded size of train.csv, used only to estimate the progress-bar total.
    # NOTE(review): presumably the file's line count (header included) - confirm.
    TRAIN_LINE_COUNT = 629145481

    # Leading / trailing slices that are summarised separately from the whole segment.
    _EDGE_WINDOWS = (
        ('first_50000', slice(None, 50000)),
        ('last_50000', slice(-50000, None)),
        ('first_10000', slice(None, 10000)),
        ('last_10000', slice(-10000, None)),
    )

    def __init__(self, dtype, n_jobs=1, chunk_size=None):
        self.chunk_size = chunk_size
        self.dtype = dtype
        self.filename = None
        self.n_jobs = n_jobs
        self.test_files = []

        if self.dtype == 'train':
            # The training file can only be streamed with an explicit chunk
            # length; fail early instead of with an opaque TypeError.
            if self.chunk_size is None or self.chunk_size <= 0:
                raise ValueError("chunk_size must be a positive integer when dtype == 'train'")
            self.filename = '../input/train.csv'
            self.total_data = int(self.TRAIN_LINE_COUNT // self.chunk_size)
        else:
            submission = pd.read_csv('../input/sample_submission.csv')
            for seg_id in submission.seg_id.values:
                self.test_files.append((seg_id, '../input/test/' + seg_id + '.csv'))
            self.total_data = int(len(submission))

    def read_chunks(self):
        """Yield ``(seg_id, x, y)`` for every segment.

        ``x`` is a float64 ndarray of acoustic samples. ``y`` is the last
        ``time_to_failure`` of the chunk in train mode and ``-999`` otherwise.
        """
        if self.dtype == 'train':
            iter_df = pd.read_csv(self.filename, iterator=True, chunksize=self.chunk_size,
                                  dtype={'acoustic_data': np.float64, 'time_to_failure': np.float64})
            for counter, df in enumerate(iter_df):
                # Skip an incomplete trailing chunk: the fixed-size window
                # features below assume a full segment, and total_data already
                # counts whole chunks only.
                if len(df) < self.chunk_size:
                    continue
                x = df.acoustic_data.values
                y = df.time_to_failure.values[-1]
                seg_id = 'train_' + str(counter)
                del df
                yield seg_id, x, y
        else:
            for seg_id, f in self.test_files:
                df = pd.read_csv(f, dtype={'acoustic_data': np.float64})
                x = df.acoustic_data.values
                if self.chunk_size is not None:
                    x = x[-self.chunk_size:]
                del df
                yield seg_id, x, -999

    def add_trend_feature(self, x, abs_values=False):
        """Return the least-squares slope of ``x`` against its sample index.

        With ``abs_values=True`` the slope of ``|x|`` is returned. Inputs with
        fewer than two samples have no defined slope and give ``0.0``.
        """
        values = np.asarray(x, dtype=np.float64)
        if abs_values:
            values = np.abs(values)
        if values.size < 2:
            return 0.0
        # Closed-form simple linear regression: cov(idx, x) / var(idx).
        idx = np.arange(values.size, dtype=np.float64)
        idx_centered = idx - idx.mean()
        return float(np.dot(idx_centered, values - values.mean()) / np.dot(idx_centered, idx_centered))

    def classic_sta_lta(self, x, length_sta, length_lta):
        """Return the STA/LTA ratio of ``x`` as an array of the same length.

        The short- and long-term averages are running means of ``x ** 2`` over
        ``length_sta`` and ``length_lta`` samples, taken from a cumulative sum.
        The first ``length_lta - 1`` ratios are forced to zero.
        """
        sta = np.cumsum(np.asarray(x, dtype=np.float64) ** 2)
        # Copy for LTA
        lta = sta.copy()
        # Differences of the cumulative sum give windowed sums.
        sta[length_sta:] = sta[length_sta:] - sta[:-length_sta]
        sta /= length_sta
        lta[length_lta:] = lta[length_lta:] - lta[:-length_lta]
        lta /= length_lta
        # Pad zeros
        sta[:length_lta - 1] = 0
        # Avoid division by zero by setting zero values to tiny float
        dtiny = np.finfo(0.0).tiny
        lta[lta < dtiny] = dtiny
        return sta / lta

    def calc_change_rate(self, x):
        """Return the mean relative change between consecutive samples of ``x``.

        Accepts an ndarray or a pandas Series. Zero, NaN and infinite ratios
        are ignored; ``nan`` is returned when nothing is left.
        """
        values = np.asarray(x, dtype=np.float64)
        # Zero denominators are expected; the resulting inf/nan are filtered below.
        with np.errstate(divide='ignore', invalid='ignore'):
            change = np.diff(values) / values[:-1]
        change = change[(change != 0) & np.isfinite(change)]
        if change.size == 0:
            return np.nan
        return np.mean(change)

    def features(self, x, y, seg_id):
        """Compute the feature dictionary for one segment.

        Parameters
        ----------
        x : array-like
            Acoustic samples of the segment.
        y : float
            Target stored under ``'target'``.
        seg_id : str
            Identifier stored under ``'seg_id'``.

        Returns
        -------
        dict
            Feature name -> value, with keys inserted in a fixed order.
        """
        values = np.asarray(x, dtype=np.float64)
        # pandas-only operations (rolling, ewm, median) need a Series view.
        series = pd.Series(values)

        feature_dict = dict()
        feature_dict['target'] = y
        feature_dict['seg_id'] = seg_id

        # Helper order defines the column order of the resulting frame.
        self._add_distribution_features(feature_dict, values)
        self._add_tsfresh_features(feature_dict, values)
        self._add_amplitude_features(feature_dict, values, series)
        self._add_signal_features(feature_dict, values, series)
        self._add_rolling_features(feature_dict, series)
        return feature_dict

    def _add_distribution_features(self, feature_dict, x):
        """Add numpy / scipy summary statistics of the whole segment."""
        feature_dict['mean'] = np.mean(x)
        feature_dict['max'] = np.max(x)
        feature_dict['min'] = np.min(x)
        feature_dict['std'] = np.std(x)
        feature_dict['var'] = np.var(x)
        feature_dict['ptp'] = np.ptp(x)
        for q in range(10, 100, 10):
            feature_dict['percentile_%d' % q] = np.percentile(x, q)

        feature_dict['skew'] = stats.skew(x)
        feature_dict['kurtosis'] = stats.kurtosis(x)
        for order in (1, 2, 3, 4):
            feature_dict['kstat_%d' % order] = stats.kstat(x, order)
        for order in (1, 2, 3, 4):
            feature_dict['moment_%d' % order] = stats.moment(x, order)

    def _add_tsfresh_features(self, feature_dict, x):
        """Add the features computed with tsfresh's feature calculators."""
        fc = feature_calculators

        feature_dict['abs_energy'] = fc.abs_energy(x)
        feature_dict['abs_sum_of_changes'] = fc.absolute_sum_of_changes(x)
        feature_dict['count_above_mean'] = fc.count_above_mean(x)
        feature_dict['count_below_mean'] = fc.count_below_mean(x)
        feature_dict['mean_abs_change'] = fc.mean_abs_change(x)
        feature_dict['mean_change'] = fc.mean_change(x)
        feature_dict['var_larger_than_std_dev'] = fc.variance_larger_than_standard_deviation(x)

        range_bins = (
            ('range_minf_m4000', -np.inf, -4000),
            ('range_m4000_m3000', -4000, -3000),
            ('range_m3000_m2000', -3000, -2000),
            ('range_m2000_m1000', -2000, -1000),
            ('range_m1000_0', -1000, 0),
            ('range_0_p1000', 0, 1000),
            ('range_p1000_p2000', 1000, 2000),
            ('range_p2000_p3000', 2000, 3000),
            ('range_p3000_p4000', 3000, 4000),
            ('range_p4000_pinf', 4000, np.inf),
        )
        for name, low, high in range_bins:
            feature_dict[name] = fc.range_count(x, low, high)

        feature_dict['ratio_unique_values'] = fc.ratio_value_number_to_time_series_length(x)
        feature_dict['first_loc_min'] = fc.first_location_of_minimum(x)
        feature_dict['first_loc_max'] = fc.first_location_of_maximum(x)
        feature_dict['last_loc_min'] = fc.last_location_of_minimum(x)
        feature_dict['last_loc_max'] = fc.last_location_of_maximum(x)
        for lag in (10, 100, 1000):
            feature_dict['time_rev_asym_stat_%d' % lag] = fc.time_reversal_asymmetry_statistic(x, lag)
        for lag in (5, 10, 50, 100, 1000):
            feature_dict['autocorrelation_%d' % lag] = fc.autocorrelation(x, lag)
        for lag in (5, 10, 100):
            feature_dict['c3_%d' % lag] = fc.c3(x, lag)

        # fft_coefficient yields (name, value) pairs; keep only the value.
        for coeff in (1, 2, 3):
            for attr, suffix in (('real', 'real'), ('imag', 'imag'), ('angle', 'ang')):
                pairs = list(fc.fft_coefficient(x, [{'coeff': coeff, 'attr': attr}]))
                feature_dict['fft_%d_%s' % (coeff, suffix)] = pairs[0][1]

        feature_dict['long_strk_above_mean'] = fc.longest_strike_above_mean(x)
        feature_dict['long_strk_below_mean'] = fc.longest_strike_below_mean(x)
        feature_dict['cid_ce_0'] = fc.cid_ce(x, 0)
        feature_dict['cid_ce_1'] = fc.cid_ce(x, 1)
        for bins in (5, 10, 20, 50, 80, 100):
            feature_dict['binned_entropy_%d' % bins] = fc.binned_entropy(x, bins)

        feature_dict['num_crossing_0'] = fc.number_crossing_m(x, 0)
        for support in (10, 50, 100, 500):
            feature_dict['num_peaks_%d' % support] = fc.number_peaks(x, support)

        for coeff in (1, 10, 50, 100):
            pairs = list(fc.spkt_welch_density(x, [{'coeff': coeff}]))
            feature_dict['spkt_welch_density_%d' % coeff] = pairs[0][1]

        # Lags 10 and 100 were already computed above; only lag 1 is new.
        feature_dict['time_rev_asym_stat_1'] = fc.time_reversal_asymmetry_statistic(x, 1)

    def _add_amplitude_features(self, feature_dict, x, series):
        """Add amplitude, edge-window, quantile and trend features."""
        abs_x = np.abs(x)

        feature_dict['mean_change_rate'] = self.calc_change_rate(x)
        feature_dict['abs_max'] = abs_x.max()
        feature_dict['abs_min'] = abs_x.min()

        # std_/avg_/min_/max_ of the first/last 50000 and 10000 samples.
        for stat_name, stat_func in (('std', np.std), ('avg', np.mean), ('min', np.min), ('max', np.max)):
            for label, window in self._EDGE_WINDOWS:
                feature_dict['%s_%s' % (stat_name, label)] = stat_func(x[window])

        feature_dict['max_to_min'] = x.max() / np.abs(x.min())
        feature_dict['max_to_min_diff'] = x.max() - np.abs(x.min())
        feature_dict['count_big'] = len(x[abs_x > 500])
        feature_dict['sum'] = x.sum()

        for label, window in self._EDGE_WINDOWS:
            feature_dict['mean_change_rate_' + label] = self.calc_change_rate(x[window])

        feature_dict['abs_q95'] = np.quantile(abs_x, 0.95)
        feature_dict['abs_q99'] = np.quantile(abs_x, 0.99)
        feature_dict['abs_q05'] = np.quantile(abs_x, 0.05)
        feature_dict['abs_q01'] = np.quantile(abs_x, 0.01)

        feature_dict['trend'] = self.add_trend_feature(x)
        feature_dict['abs_trend'] = self.add_trend_feature(x, abs_values=True)
        feature_dict['abs_mean'] = abs_x.mean()
        feature_dict['abs_std'] = abs_x.std()

        # Mean absolute deviation written out explicitly, so no Series.mad is needed.
        feature_dict['mad'] = (series - series.mean()).abs().mean()
        feature_dict['med'] = series.median()

    def _add_signal_features(self, feature_dict, x, series):
        """Add envelope, STA/LTA, moving-average and robust-location features."""
        feature_dict['Hilbert_mean'] = np.abs(signal.hilbert(x)).mean()
        hann_window = signal.windows.hann(150)
        feature_dict['Hann_window_mean'] = (signal.convolve(x, hann_window, mode='same') / hann_window.sum()).mean()

        sta_lta_windows = ((500, 10000), (5000, 100000), (3333, 6666), (10000, 25000),
                           (50, 1000), (100, 5000), (333, 666), (4000, 10000))
        for number, (length_sta, length_lta) in enumerate(sta_lta_windows, start=1):
            feature_dict['classic_sta_lta%d_mean' % number] = self.classic_sta_lta(x, length_sta, length_lta).mean()

        feature_dict['Moving_average_700_mean'] = series.rolling(window=700).mean().mean(skipna=True)
        for span in (300, 3000, 30000):
            feature_dict['exp_Moving_average_%d_mean' % span] = series.ewm(span=span).mean().mean(skipna=True)

        ma_700 = feature_dict['Moving_average_700_mean']
        feature_dict['MA_700MA_std_mean'] = series.rolling(window=700).std().mean()
        feature_dict['MA_700MA_BB_high_mean'] = ma_700 + no_of_std * feature_dict['MA_700MA_std_mean']
        feature_dict['MA_700MA_BB_low_mean'] = ma_700 - no_of_std * feature_dict['MA_700MA_std_mean']
        feature_dict['MA_400MA_std_mean'] = series.rolling(window=400).std().mean()
        # NOTE(review): the 400-sample bands are centred on the 700-sample mean,
        # exactly as before; confirm whether a 400-sample mean was intended.
        feature_dict['MA_400MA_BB_high_mean'] = ma_700 + no_of_std * feature_dict['MA_400MA_std_mean']
        feature_dict['MA_400MA_BB_low_mean'] = ma_700 - no_of_std * feature_dict['MA_400MA_std_mean']
        feature_dict['MA_1000MA_std_mean'] = series.rolling(window=1000).std().mean()

        feature_dict['iqr'] = np.subtract(*np.percentile(x, [75, 25]))
        feature_dict['q999'] = np.quantile(x, 0.999)
        feature_dict['q001'] = np.quantile(x, 0.001)
        feature_dict['ave10'] = stats.trim_mean(x, 0.1)

    def _add_rolling_features(self, feature_dict, series):
        """Add summaries of the rolling std and rolling mean for several window sizes."""
        for windows in [10, 100, 1000]:
            rolling = series.rolling(windows)
            rolled_by_kind = (
                ('std', rolling.std().dropna().values),
                ('mean', rolling.mean().dropna().values),
            )
            for kind, rolled in rolled_by_kind:
                suffix = '_roll_%s_%d' % (kind, windows)
                feature_dict['ave' + suffix] = rolled.mean()
                feature_dict['std' + suffix] = rolled.std()
                feature_dict['max' + suffix] = rolled.max()
                feature_dict['min' + suffix] = rolled.min()
                feature_dict['q01' + suffix] = np.quantile(rolled, 0.01)
                feature_dict['q05' + suffix] = np.quantile(rolled, 0.05)
                feature_dict['q95' + suffix] = np.quantile(rolled, 0.95)
                feature_dict['q99' + suffix] = np.quantile(rolled, 0.99)
                feature_dict['av_change_abs' + suffix] = np.mean(np.diff(rolled))
                # Average the change-rate values themselves, not the positions
                # that np.nonzero reports for them.
                feature_dict['av_change_rate' + suffix] = self.calc_change_rate(rolled)
                feature_dict['abs_max' + suffix] = np.abs(rolled).max()

    def generate(self):
        """Compute the features of every segment and return them as a DataFrame.

        Segments are processed by ``n_jobs`` joblib threads; each row of the
        result is the dictionary returned by :meth:`features`.
        """
        res = Parallel(n_jobs=self.n_jobs,
                       backend='threading')(delayed(self.features)(x, y, s)
                                            for s, x, y in tqdm(self.read_chunks(), total=self.total_data))
        return pd.DataFrame(list(res))


# Extract one feature row per 150,000-row chunk of the training file.
training_fg = FeatureGenerator('train', n_jobs=10, chunk_size=150_000)
training_data = training_fg.generate()

# Run the same extraction over the test segments.
test_fg = FeatureGenerator('test', n_jobs=10, chunk_size=150_000)
test_data = test_fg.generate()