# Prepare data

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.signal import hann, hilbert
from scipy.stats import trim_mean
from sklearn.linear_model import LinearRegression
from tqdm.notebook import tqdm

In [2]:
%load_ext nb_black

<IPython.core.display.Javascript object>

## Feature generation

* https://www.kaggle.com/code/artgor/earthquakes-fe-more-features-and-samples#Feature-generation
* https://www.kaggle.com/code/artgor/seismic-data-eda-and-baseline

```python

X_tr.loc[segment, "mean"] = x.mean()
X_tr.loc[segment, "std"] = x.std()
X_tr.loc[segment, "max"] = x.max()
X_tr.loc[segment, "min"] = x.min()

X_tr.loc[segment, "mean_change_abs"] = np.mean(np.diff(x))
X_tr.loc[segment, "mean_change_rate"] = calc_change_rate(x)
X_tr.loc[segment, "abs_max"] = np.abs(x).max()
X_tr.loc[segment, "abs_min"] = np.abs(x).min()

X_tr.loc[segment, "std_first_50000"] = x[:50000].std()
X_tr.loc[segment, "std_last_50000"] = x[-50000:].std()
X_tr.loc[segment, "std_first_10000"] = x[:10000].std()
X_tr.loc[segment, "std_last_10000"] = x[-10000:].std()

X_tr.loc[segment, "avg_first_50000"] = x[:50000].mean()
X_tr.loc[segment, "avg_last_50000"] = x[-50000:].mean()
X_tr.loc[segment, "avg_first_10000"] = x[:10000].mean()
X_tr.loc[segment, "avg_last_10000"] = x[-10000:].mean()

X_tr.loc[segment, "min_first_50000"] = x[:50000].min()
X_tr.loc[segment, "min_last_50000"] = x[-50000:].min()
X_tr.loc[segment, "min_first_10000"] = x[:10000].min()
X_tr.loc[segment, "min_last_10000"] = x[-10000:].min()

X_tr.loc[segment, "max_first_50000"] = x[:50000].max()
X_tr.loc[segment, "max_last_50000"] = x[-50000:].max()
X_tr.loc[segment, "max_first_10000"] = x[:10000].max()
X_tr.loc[segment, "max_last_10000"] = x[-10000:].max()

X_tr.loc[segment, "max_to_min"] = x.max() / np.abs(x.min())
X_tr.loc[segment, "max_to_min_diff"] = x.max() - np.abs(x.min())
X_tr.loc[segment, "count_big"] = len(x[np.abs(x) > 500])
X_tr.loc[segment, "sum"] = x.sum()

X_tr.loc[segment, "mean_change_rate_first_50000"] = calc_change_rate(x[:50000])
X_tr.loc[segment, "mean_change_rate_last_50000"] = calc_change_rate(x[-50000:])
X_tr.loc[segment, "mean_change_rate_first_10000"] = calc_change_rate(x[:10000])
X_tr.loc[segment, "mean_change_rate_last_10000"] = calc_change_rate(x[-10000:])

X_tr.loc[segment, "q95"] = np.quantile(x, 0.95)
X_tr.loc[segment, "q99"] = np.quantile(x, 0.99)
X_tr.loc[segment, "q05"] = np.quantile(x, 0.05)
X_tr.loc[segment, "q01"] = np.quantile(x, 0.01)

X_tr.loc[segment, "abs_q95"] = np.quantile(np.abs(x), 0.95)
X_tr.loc[segment, "abs_q99"] = np.quantile(np.abs(x), 0.99)
X_tr.loc[segment, "abs_q05"] = np.quantile(np.abs(x), 0.05)
X_tr.loc[segment, "abs_q01"] = np.quantile(np.abs(x), 0.01)

X_tr.loc[segment, "trend"] = add_trend_feature(x)
X_tr.loc[segment, "abs_trend"] = add_trend_feature(x, abs_values=True)
X_tr.loc[segment, "abs_mean"] = np.abs(x).mean()
X_tr.loc[segment, "abs_std"] = np.abs(x).std()

X_tr.loc[segment, "mad"] = x.mad()
X_tr.loc[segment, "kurt"] = x.kurtosis()
X_tr.loc[segment, "skew"] = x.skew()
X_tr.loc[segment, "med"] = x.median()

X_tr.loc[segment, "Hilbert_mean"] = np.abs(hilbert(x)).mean()
X_tr.loc[segment, "Hann_window_mean"] = (
    convolve(x, hann(150), mode="same") / sum(hann(150))
).mean()
X_tr.loc[segment, "classic_sta_lta1_mean"] = classic_sta_lta(x, 500, 10000).mean()
X_tr.loc[segment, "classic_sta_lta2_mean"] = classic_sta_lta(x, 5000, 100000).mean()
X_tr.loc[segment, "classic_sta_lta3_mean"] = classic_sta_lta(x, 3333, 6666).mean()
X_tr.loc[segment, "classic_sta_lta4_mean"] = classic_sta_lta(x, 10000, 25000).mean()
X_tr.loc[segment, "classic_sta_lta5_mean"] = classic_sta_lta(x, 50, 1000).mean()
X_tr.loc[segment, "classic_sta_lta6_mean"] = classic_sta_lta(x, 100, 5000).mean()
X_tr.loc[segment, "classic_sta_lta7_mean"] = classic_sta_lta(x, 333, 666).mean()
X_tr.loc[segment, "classic_sta_lta8_mean"] = classic_sta_lta(x, 4000, 10000).mean()
X_tr.loc[segment, "Moving_average_700_mean"] = (
    x.rolling(window=700).mean().mean(skipna=True)
)
ewma = pd.Series.ewm
X_tr.loc[segment, "exp_Moving_average_300_mean"] = (ewma(x, span=300).mean()).mean(
    skipna=True
)
X_tr.loc[segment, "exp_Moving_average_3000_mean"] = (
    ewma(x, span=3000).mean().mean(skipna=True)
)
X_tr.loc[segment, "exp_Moving_average_30000_mean"] = (
    ewma(x, span=30000).mean().mean(skipna=True)
)
no_of_std = 3
X_tr.loc[segment, "MA_700MA_std_mean"] = x.rolling(window=700).std().mean()
X_tr.loc[segment, "MA_700MA_BB_high_mean"] = (
    X_tr.loc[segment, "Moving_average_700_mean"]
    + no_of_std * X_tr.loc[segment, "MA_700MA_std_mean"]
).mean()
X_tr.loc[segment, "MA_700MA_BB_low_mean"] = (
    X_tr.loc[segment, "Moving_average_700_mean"]
    - no_of_std * X_tr.loc[segment, "MA_700MA_std_mean"]
).mean()
X_tr.loc[segment, "MA_400MA_std_mean"] = x.rolling(window=400).std().mean()
X_tr.loc[segment, "MA_400MA_BB_high_mean"] = (
    X_tr.loc[segment, "Moving_average_700_mean"]
    + no_of_std * X_tr.loc[segment, "MA_400MA_std_mean"]
).mean()
X_tr.loc[segment, "MA_400MA_BB_low_mean"] = (
    X_tr.loc[segment, "Moving_average_700_mean"]
    - no_of_std * X_tr.loc[segment, "MA_400MA_std_mean"]
).mean()
X_tr.loc[segment, "MA_1000MA_std_mean"] = x.rolling(window=1000).std().mean()
X_tr.drop("Moving_average_700_mean", axis=1, inplace=True)

X_tr.loc[segment, "iqr"] = np.subtract(*np.percentile(x, [75, 25]))
X_tr.loc[segment, "q999"] = np.quantile(x, 0.999)
X_tr.loc[segment, "q001"] = np.quantile(x, 0.001)
X_tr.loc[segment, "ave10"] = stats.trim_mean(x, 0.1)

for windows in [10, 100, 1000]:
    x_roll_std = x.rolling(windows).std().dropna().values
    x_roll_mean = x.rolling(windows).mean().dropna().values

    X_tr.loc[segment, "ave_roll_std_" + str(windows)] = x_roll_std.mean()
    X_tr.loc[segment, "std_roll_std_" + str(windows)] = x_roll_std.std()
    X_tr.loc[segment, "max_roll_std_" + str(windows)] = x_roll_std.max()
    X_tr.loc[segment, "min_roll_std_" + str(windows)] = x_roll_std.min()
    X_tr.loc[segment, "q01_roll_std_" + str(windows)] = np.quantile(
        x_roll_std, 0.01
    )
    X_tr.loc[segment, "q05_roll_std_" + str(windows)] = np.quantile(
        x_roll_std, 0.05
    )
    X_tr.loc[segment, "q95_roll_std_" + str(windows)] = np.quantile(
        x_roll_std, 0.95
    )
    X_tr.loc[segment, "q99_roll_std_" + str(windows)] = np.quantile(
        x_roll_std, 0.99
    )
    X_tr.loc[segment, "av_change_abs_roll_std_" + str(windows)] = np.mean(
        np.diff(x_roll_std)
    )
    X_tr.loc[segment, "av_change_rate_roll_std_" + str(windows)] = np.mean(
        np.nonzero((np.diff(x_roll_std) / x_roll_std[:-1]))[0]
    )
    X_tr.loc[segment, "abs_max_roll_std_" + str(windows)] = np.abs(x_roll_std).max()

    X_tr.loc[segment, "ave_roll_mean_" + str(windows)] = x_roll_mean.mean()
    X_tr.loc[segment, "std_roll_mean_" + str(windows)] = x_roll_mean.std()
    X_tr.loc[segment, "max_roll_mean_" + str(windows)] = x_roll_mean.max()
    X_tr.loc[segment, "min_roll_mean_" + str(windows)] = x_roll_mean.min()
    X_tr.loc[segment, "q01_roll_mean_" + str(windows)] = np.quantile(
        x_roll_mean, 0.01
    )
    X_tr.loc[segment, "q05_roll_mean_" + str(windows)] = np.quantile(
        x_roll_mean, 0.05
    )
    X_tr.loc[segment, "q95_roll_mean_" + str(windows)] = np.quantile(
        x_roll_mean, 0.95
    )
    X_tr.loc[segment, "q99_roll_mean_" + str(windows)] = np.quantile(
        x_roll_mean, 0.99
    )
    X_tr.loc[segment, "av_change_abs_roll_mean_" + str(windows)] = np.mean(
        np.diff(x_roll_mean)
    )
    X_tr.loc[segment, "av_change_rate_roll_mean_" + str(windows)] = np.mean(
        np.nonzero((np.diff(x_roll_mean) / x_roll_mean[:-1]))[0]
    )
    X_tr.loc[segment, "abs_max_roll_mean_" + str(windows)] = np.abs(
        x_roll_mean
    ).max()
        
```

In [3]:
def calc_change_rate(x):
    change = (np.diff(x) / x.iloc[:-1]).values
    change = change[np.nonzero(change)[0]]
    change = change[~np.isnan(change)]
    change = change[change != -np.inf]
    change = change[change != np.inf]

    return np.mean(change)


def add_trend_feature(arr, abs_values=False):
    idx = np.array(range(len(arr)))
    if abs_values:
        arr = np.abs(arr)
    lr = LinearRegression()
    lr.fit(idx.reshape(-1, 1), arr)

    return lr.coef_[0]


def classic_sta_lta(x, length_sta, length_lta):

    sta = np.cumsum(x**2)

    # Convert to float
    sta = np.require(sta, dtype=float)

    # Copy for LTA
    lta = sta.copy()

    # Compute the STA and the LTA
    sta[length_sta:] = sta[length_sta:] - sta[:-length_sta]
    sta /= length_sta
    lta[length_lta:] = lta[length_lta:] - lta[:-length_lta]
    lta /= length_lta

    # Pad zeros
    sta[: length_lta - 1] = 0

    # Avoid division by zero by setting zero values to tiny float
    dtiny = np.finfo(0.0).tiny
    idx = lta < dtiny
    lta[idx] = dtiny

    return sta / lta

<IPython.core.display.Javascript object>

In [4]:
def make_features(seq_df: pd.DataFrame):
    X_seq = dict()

    data = seq_df["acoustic_data"]

    X_seq["mean"] = data.mean()
    X_seq["std"] = data.std()
    X_seq["max"] = data.max()
    X_seq["min"] = data.min()

    X_seq["mean_change_abs"] = np.mean(np.diff(data))
    X_seq["mean_change_rate"] = calc_change_rate(data)
    X_seq["abs_max"] = np.abs(data).max()
    X_seq["abs_min"] = np.abs(data).min()

    X_seq["std_first_50000"] = data[:50000].std()
    X_seq["std_last_50000"] = data[-50000:].std()
    X_seq["std_first_10000"] = data[:10000].std()
    X_seq["std_last_10000"] = data[-10000:].std()

    X_seq["avg_first_50000"] = data[:50000].mean()
    X_seq["avg_last_50000"] = data[-50000:].mean()
    X_seq["avg_first_10000"] = data[:10000].mean()
    X_seq["avg_last_10000"] = data[-10000:].mean()

    X_seq["min_first_50000"] = data[:50000].min()
    X_seq["min_last_50000"] = data[-50000:].min()
    X_seq["min_first_10000"] = data[:10000].min()
    X_seq["min_last_10000"] = data[-10000:].min()

    X_seq["max_first_50000"] = data[:50000].max()
    X_seq["max_last_50000"] = data[-50000:].max()
    X_seq["max_first_10000"] = data[:10000].max()
    X_seq["max_last_10000"] = data[-10000:].max()

    X_seq["max_to_min"] = data.max() / np.abs(data.min())
    X_seq["max_to_min_diff"] = data.max() - np.abs(data.min())
    X_seq["count_big"] = len(data[np.abs(data) > 500])
    X_seq["sum"] = data.sum()

    X_seq["mean_change_rate_first_50000"] = calc_change_rate(data[:50000])
    X_seq["mean_change_rate_last_50000"] = calc_change_rate(data[-50000:])
    X_seq["mean_change_rate_first_10000"] = calc_change_rate(data[:10000])
    X_seq["mean_change_rate_last_10000"] = calc_change_rate(data[-10000:])

    X_seq["q95"] = np.quantile(data, 0.95)
    X_seq["q99"] = np.quantile(data, 0.99)
    X_seq["q05"] = np.quantile(data, 0.05)
    X_seq["q01"] = np.quantile(data, 0.01)

    X_seq["abs_q95"] = np.quantile(np.abs(data), 0.95)
    X_seq["abs_q99"] = np.quantile(np.abs(data), 0.99)
    X_seq["abs_q05"] = np.quantile(np.abs(data), 0.05)
    X_seq["abs_q01"] = np.quantile(np.abs(data), 0.01)

    X_seq["trend"] = add_trend_feature(data)
    X_seq["abs_trend"] = add_trend_feature(data, abs_values=True)
    X_seq["abs_mean"] = np.abs(data).mean()
    X_seq["abs_std"] = np.abs(data).std()

    X_seq["mad"] = (data - data.mean()).abs().mean()  # data.mad()
    X_seq["kurt"] = data.kurtosis()
    X_seq["skew"] = data.skew()
    X_seq["med"] = data.median()

    X_seq["Hilbert_mean"] = np.abs(hilbert(data)).mean()
    X_seq["Hann_window_mean"] = (
        np.convolve(data, hann(150), mode="same") / sum(hann(150))
    ).mean()
    X_seq["classic_sta_lta1_mean"] = classic_sta_lta(data, 500, 10000).mean()
    X_seq["classic_sta_lta2_mean"] = classic_sta_lta(data, 5000, 100000).mean()
    X_seq["classic_sta_lta3_mean"] = classic_sta_lta(data, 3333, 6666).mean()
    X_seq["classic_sta_lta4_mean"] = classic_sta_lta(data, 10000, 25000).mean()
    X_seq["classic_sta_lta5_mean"] = classic_sta_lta(data, 50, 1000).mean()
    X_seq["classic_sta_lta6_mean"] = classic_sta_lta(data, 100, 5000).mean()
    X_seq["classic_sta_lta7_mean"] = classic_sta_lta(data, 333, 666).mean()
    X_seq["classic_sta_lta8_mean"] = classic_sta_lta(data, 4000, 10000).mean()
    X_seq["Moving_average_700_mean"] = data.rolling(window=700).mean().mean(skipna=True)

    ewma = pd.Series.ewm
    X_seq["exp_Moving_average_300_mean"] = (ewma(data, span=300).mean()).mean(
        skipna=True
    )
    X_seq["exp_Moving_average_3000_mean"] = (
        ewma(data, span=3000).mean().mean(skipna=True)
    )
    X_seq["exp_Moving_average_30000_mean"] = (
        ewma(data, span=30000).mean().mean(skipna=True)
    )

    no_of_std = 3
    X_seq["MA_700MA_std_mean"] = data.rolling(window=700).std().mean()
    X_seq["MA_700MA_BB_high_mean"] = (
        X_seq["Moving_average_700_mean"] + no_of_std * X_seq["MA_700MA_std_mean"]
    ).mean()
    X_seq["MA_700MA_BB_low_mean"] = (
        X_seq["Moving_average_700_mean"] - no_of_std * X_seq["MA_700MA_std_mean"]
    ).mean()
    X_seq["MA_400MA_std_mean"] = data.rolling(window=400).std().mean()
    X_seq["MA_400MA_BB_high_mean"] = (
        X_seq["Moving_average_700_mean"] + no_of_std * X_seq["MA_400MA_std_mean"]
    ).mean()
    X_seq["MA_400MA_BB_low_mean"] = (
        X_seq["Moving_average_700_mean"] - no_of_std * X_seq["MA_400MA_std_mean"]
    ).mean()
    X_seq["MA_1000MA_std_mean"] = data.rolling(window=1000).std().mean()

    del X_seq["Moving_average_700_mean"]

    X_seq["iqr"] = np.subtract(*np.percentile(data, [75, 25]))
    X_seq["q999"] = np.quantile(data, 0.999)
    X_seq["q001"] = np.quantile(data, 0.001)
    X_seq["ave10"] = trim_mean(data, 0.1)

    for windows in [10, 100, 1000]:
        x_roll_std = data.rolling(windows).std().dropna().values
        x_roll_mean = data.rolling(windows).mean().dropna().values

        X_seq["ave_roll_std_" + str(windows)] = x_roll_std.mean()
        X_seq["std_roll_std_" + str(windows)] = x_roll_std.std()
        X_seq["max_roll_std_" + str(windows)] = x_roll_std.max()
        X_seq["min_roll_std_" + str(windows)] = x_roll_std.min()
        X_seq["q01_roll_std_" + str(windows)] = np.quantile(x_roll_std, 0.01)
        X_seq["q05_roll_std_" + str(windows)] = np.quantile(x_roll_std, 0.05)
        X_seq["q95_roll_std_" + str(windows)] = np.quantile(x_roll_std, 0.95)
        X_seq["q99_roll_std_" + str(windows)] = np.quantile(x_roll_std, 0.99)
        X_seq["av_change_abs_roll_std_" + str(windows)] = np.mean(np.diff(x_roll_std))
        X_seq["abs_max_roll_std_" + str(windows)] = np.abs(x_roll_std).max()

        X_seq["ave_roll_mean_" + str(windows)] = x_roll_mean.mean()
        X_seq["std_roll_mean_" + str(windows)] = x_roll_mean.std()
        X_seq["max_roll_mean_" + str(windows)] = x_roll_mean.max()
        X_seq["min_roll_mean_" + str(windows)] = x_roll_mean.min()
        X_seq["q01_roll_mean_" + str(windows)] = np.quantile(x_roll_mean, 0.01)
        X_seq["q05_roll_mean_" + str(windows)] = np.quantile(x_roll_mean, 0.05)
        X_seq["q95_roll_mean_" + str(windows)] = np.quantile(x_roll_mean, 0.95)
        X_seq["q99_roll_mean_" + str(windows)] = np.quantile(x_roll_mean, 0.99)
        X_seq["av_change_abs_roll_mean_" + str(windows)] = np.mean(np.diff(x_roll_mean))
        X_seq["abs_max_roll_mean_" + str(windows)] = np.abs(x_roll_mean).max()

        with np.errstate(divide="ignore", invalid="ignore"):
            X_seq["av_change_rate_roll_std_" + str(windows)] = np.mean(
                np.nonzero((np.diff(x_roll_std) / x_roll_std[:-1]))[0]
            )
            X_seq["av_change_rate_roll_mean_" + str(windows)] = np.mean(
                np.nonzero((np.diff(x_roll_mean) / x_roll_mean[:-1]))[0]
            )

    return X_seq

<IPython.core.display.Javascript object>

# Make train features

Тестовые данные разбиты на блоки по 150.000, для подготовки обучающего набора нужно так же разбить его на блоки по 150тыс. и сгруппировать/подготовить доп. колонки

In [5]:
# cd ../../data/LANL-Earthquake-Prediction/train && cat ../train.csv | parallel --header : --pipe -N150000 'cat >seq_{#}.csv'

<IPython.core.display.Javascript object>

In [6]:
train_seq_file_list = [
    p
    for p in Path("../../data/LANL-Earthquake-Prediction/train/").iterdir()
    if p.is_file()
]
len(train_seq_file_list)

4195

<IPython.core.display.Javascript object>

In [7]:
feature_list = []

for filename in tqdm(train_seq_file_list):
    seq_df = pd.read_csv(filename)
    seq_features = make_features(seq_df)
    seq_features["seg_id"] = Path(filename).stem
    seq_features["target"] = seq_df["time_to_failure"].values[-1]

    feature_list.append(seq_features)

train_df = pd.DataFrame(feature_list).set_index("seg_id")
train_df

  0%|          | 0/4195 [00:00<?, ?it/s]

Unnamed: 0_level_0,mean,std,max,min,mean_change_abs,mean_change_rate,abs_max,abs_min,std_first_50000,std_last_50000,...,min_roll_mean_1000,q01_roll_mean_1000,q05_roll_mean_1000,q95_roll_mean_1000,q99_roll_mean_1000,av_change_abs_roll_mean_1000,abs_max_roll_mean_1000,av_change_rate_roll_std_1000,av_change_rate_roll_mean_1000,target
seg_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
seq_1630,4.107287,6.480113,145,-136,0.000000,-0.027335,145,0,7.836271,5.689157,...,3.368,3.500,3.755,4.467,4.606,-7.651007e-07,4.780,74636.781485,74636.614967,0.345198
seq_2808,4.466447,3.153520,68,-65,0.000020,0.089663,68,0,3.147644,2.916550,...,3.896,4.029,4.145,4.816,4.944,-6.040268e-08,5.125,74323.679238,74323.590862,10.548500
seq_3552,4.657553,5.043771,240,-201,0.000047,0.086331,240,0,2.820486,3.875171,...,3.543,4.079,4.282,5.036,5.195,1.053691e-06,5.281,74685.306689,74682.808668,13.706999
seq_845,4.884513,7.607336,149,-135,0.000053,0.030698,149,0,8.977789,8.621918,...,3.870,4.292,4.467,5.340,5.527,1.483221e-06,5.850,74541.657276,74540.147038,3.123896
seq_3337,4.760460,4.649864,143,-96,0.000033,0.082509,143,0,3.548759,6.118750,...,4.125,4.271,4.413,5.154,5.404,9.060403e-07,5.582,74609.219499,74607.875394,7.332897
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
seq_1250,4.607953,3.419499,73,-67,0.000000,0.086249,73,0,3.075176,3.372700,...,3.927,4.087,4.195,5.005,5.186,-4.496644e-07,5.466,74584.023781,74583.720903,0.036798
seq_3550,4.643567,4.430010,161,-115,0.000013,0.082237,161,0,3.713703,3.154168,...,3.761,3.948,4.060,5.130,5.233,4.093960e-07,5.380,74337.677991,74337.647333,13.785696
seq_1279,4.633420,4.032830,126,-95,-0.000020,0.075549,126,0,3.350287,5.080004,...,3.962,4.138,4.245,5.018,5.182,-5.637584e-07,5.313,74514.158415,74514.362947,6.962798
seq_814,4.907873,4.094302,64,-52,-0.000033,0.065538,64,0,4.043301,4.273086,...,4.029,4.276,4.518,5.276,5.408,1.697987e-06,5.558,74491.330173,74493.512766,4.331597


<IPython.core.display.Javascript object>

In [8]:
train_df.to_csv("../../data/LANL-Earthquake-Prediction/train_prepared.csv")

<IPython.core.display.Javascript object>

# Make test features

In [9]:
test_seq_file_list = [
    p
    for p in Path("../../data/LANL-Earthquake-Prediction/test/").iterdir()
    if p.is_file()
]
len(test_seq_file_list)

2624

<IPython.core.display.Javascript object>

In [10]:
feature_list = []

for filename in tqdm(test_seq_file_list):
    seq_df = pd.read_csv(filename)
    seq_features = make_features(seq_df)
    seq_features["seg_id"] = Path(filename).stem

    feature_list.append(seq_features)

test_df = pd.DataFrame(feature_list).set_index("seg_id")
test_df

  0%|          | 0/2624 [00:00<?, ?it/s]

Unnamed: 0_level_0,mean,std,max,min,mean_change_abs,mean_change_rate,abs_max,abs_min,std_first_50000,std_last_50000,...,max_roll_mean_1000,min_roll_mean_1000,q01_roll_mean_1000,q05_roll_mean_1000,q95_roll_mean_1000,q99_roll_mean_1000,av_change_abs_roll_mean_1000,abs_max_roll_mean_1000,av_change_rate_roll_std_1000,av_change_rate_roll_mean_1000
seg_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
seg_e5c033,3.942853,5.259674,95,-82,-0.000073,-0.037545,95,0,5.449322,4.589241,...,4.682,3.229,3.467,3.598,4.319,4.451,1.389262e-06,4.682,74523.248722,74521.949339
seg_74537f,3.854553,5.112639,83,-78,-0.000020,-0.047260,83,0,5.199925,3.726957,...,4.648,3.275,3.390,3.537,4.203,4.380,1.268456e-06,4.648,74346.990413,74348.891220
seg_5009d9,4.370267,7.194114,149,-139,0.000067,-0.021931,149,0,5.491165,5.823179,...,5.314,3.466,3.656,3.967,4.814,4.987,-1.718121e-06,5.314,74746.926510,74747.193733
seg_cc7a19,3.799813,7.241903,170,-138,-0.000033,-0.077529,170,0,8.823961,4.709930,...,4.772,2.810,3.355,3.458,4.179,4.276,-2.550336e-07,4.772,74327.728054,74328.160760
seg_abb03a,4.182333,5.515830,141,-138,0.000000,-0.023226,141,0,6.163469,5.194700,...,5.055,3.132,3.348,3.697,4.620,4.721,2.604027e-06,5.055,74597.028976,74597.458084
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
seg_06d7ba,4.327347,8.357528,245,-280,0.000013,-0.014522,280,0,10.606502,7.690559,...,5.597,3.378,3.783,3.931,4.720,4.836,7.986577e-07,5.597,74633.246043,74632.616225
seg_1d980f,3.942400,6.597127,158,-141,-0.000040,-0.053942,158,0,7.324298,6.860940,...,5.232,2.931,3.265,3.366,4.625,4.921,-6.395973e-06,5.232,74523.117383,74522.521642
seg_217eed,4.096700,10.373600,360,-251,0.000027,-0.055808,360,0,5.974448,6.722152,...,6.879,2.314,3.464,3.667,4.490,4.637,-1.208054e-06,6.879,74595.332978,74594.434906
seg_b08e9d,4.252653,3.199164,69,-59,0.000007,0.075376,69,0,3.079637,2.811069,...,4.888,3.494,3.754,3.902,4.614,4.755,4.322148e-06,4.888,74422.349095,74423.027285


<IPython.core.display.Javascript object>

In [11]:
test_df.to_csv("../../data/LANL-Earthquake-Prediction/test_prepared.csv")

<IPython.core.display.Javascript object>