In [3]:
import glob

import pandas as pd
import numpy as np

# Let long Series/DataFrame outputs show up to 1000 rows before pandas truncates them
pd.set_option("display.max_rows", 1000)

# Read Data

#### Description of the Data

Data used in this notebook is from the Kaggle competition ["INGV - Volcanic Eruption Prediction"](https://www.kaggle.com/c/predict-volcanic-eruptions-ingv-oe).

We will explore a bunch of files under the `train` and `test` directories. Each file contains ten minutes of logs from ten different sensors placed around a volcano. There are 4432 data files under the train directory and 4521 files under the test directory. Each of these files consists of 60K lines. On disk, the files under the train and test directories take up about 30G in total (15G + 15G).

In [4]:
# Directory holding the raw training segments (one CSV file per segment)
DATA_DIR = "/opt/vssexclude/personal/kaggle/volcano/data/raw/train"

# Column -> dtype mapping handed to `pd.read_csv`: all ten sensor columns
# (sensor_1 ... sensor_10) are read as 32-bit floats
data_types = {f"sensor_{sensor_no}": np.float32 for sensor_no in range(1, 11)}

### What's next?
- Read multiple CSV files using Pandas. Pandas needs the data to fit in the available memory (RAM).
- For simplicity, read first 1000 lines of each file.
- Append the content of each CSV file to one DataFrame
- Add a column named `segment_id` representing the time segment of each CSV file
- Add a column named `time` with ascending values starting from 0 representing time

In [5]:
# Load the first 1000 rows of every training file whose name starts with "140".
# The per-file frames are collected in a list and concatenated once at the end:
# `DataFrame.append` was removed in pandas 2.0, and calling it inside a loop
# re-copies all previously accumulated rows on every iteration.
segment_frames = []
for name in glob.glob(f"{DATA_DIR}/140*"):
    print(f"Reading {name}")
    temp_df = pd.read_csv(name, dtype=data_types, nrows=1000)

    # The segment id is the file name without directory and extension.
    # Strip the directory first so a "." inside a directory name cannot
    # truncate the path before the file name is reached.
    segment_id = int(name.rsplit("/", 1)[-1].split(".", 1)[0])
    temp_df["segment_id"] = segment_id

    # Row position within the file serves as the time axis (0, 1, 2, ...)
    temp_df["time"] = range(len(temp_df))

    segment_frames.append(temp_df)

# Each file keeps its own 0..n-1 row index, as with the former repeated appends.
# Fall back to an empty frame when no file matched, since pd.concat rejects [].
df = pd.concat(segment_frames) if segment_frames else pd.DataFrame()
print("\n")
print(f"Shape of the dataframe consisting of all data from above files: {df.shape}")

Reading /opt/vssexclude/personal/kaggle/volcano/data/raw/train/1407094442.csv
Reading /opt/vssexclude/personal/kaggle/volcano/data/raw/train/1405189645.csv
Reading /opt/vssexclude/personal/kaggle/volcano/data/raw/train/1403440092.csv
Reading /opt/vssexclude/personal/kaggle/volcano/data/raw/train/1405443107.csv
Reading /opt/vssexclude/personal/kaggle/volcano/data/raw/train/1400929225.csv
Reading /opt/vssexclude/personal/kaggle/volcano/data/raw/train/1400727315.csv
Reading /opt/vssexclude/personal/kaggle/volcano/data/raw/train/1403947680.csv
Reading /opt/vssexclude/personal/kaggle/volcano/data/raw/train/1400253000.csv
Reading /opt/vssexclude/personal/kaggle/volcano/data/raw/train/140031872.csv
Reading /opt/vssexclude/personal/kaggle/volcano/data/raw/train/1409167039.csv
Reading /opt/vssexclude/personal/kaggle/volcano/data/raw/train/1408663387.csv
Reading /opt/vssexclude/personal/kaggle/volcano/data/raw/train/1406456924.csv
Reading /opt/vssexclude/personal/kaggle/volcano/data/raw/train/14

In [6]:
# Number of rows loaded per segment
df["segment_id"].value_counts()

140851065     1000
1406456924    1000
1404122310    1000
1404179874    1000
1403947680    1000
140348256     1000
1407084157    1000
1405189645    1000
1403222059    1000
1403005697    1000
1408663387    1000
1408285202    1000
1403440092    1000
1404322654    1000
1400929225    1000
140031872     1000
1400727315    1000
1408645616    1000
1402556914    1000
1406938061    1000
1405443107    1000
1406234149    1000
1407261706    1000
1409167039    1000
1407094442    1000
1402914692    1000
1400253000    1000
1406626451    1000
1402674973    1000
1404502479    1000
1403244730    1000
Name: segment_id, dtype: int64

For simplicity, we will use the data from only 2 sensors (`sensor_1` and `sensor_4`)

In [9]:
# Keep the two bookkeeping columns plus the two sensors used from here on
columns_to_keep = ["segment_id", "time", "sensor_1", "sensor_4"]
df = df[columns_to_keep]

In [10]:
# Preview the first five rows of the reduced frame
df.head(5)

Unnamed: 0,segment_id,time,sensor_1,sensor_4
0,1407094442,0,-580.0,-458.0
1,1407094442,1,-598.0,-432.0
2,1407094442,2,-615.0,-342.0
3,1407094442,3,-592.0,-204.0
4,1407094442,4,-536.0,-14.0


In [11]:
# Count missing values per column
df.isnull().sum()

segment_id    0
time          0
sensor_1      0
sensor_4      0
dtype: int64

# Extract Time Series Features using tsfresh

- Generate features for every sensor time series kept in `df` (here `sensor_1` and `sensor_4`), one row of features per `segment_id`
- For each time series, `tsfresh` generates 750+ features.
- The set of features generated by `tsfresh` can be controlled by the parameter `default_fc_parameters`. It accepts various values  from `tsfresh.feature_extraction.settings`. For example
    - `ComprehensiveFCParameters`
    - `EfficientFCParameters`
    - `MinimalFCParameters`
- While preparing and debugging the data pipeline, use `MinimalFCParameters`

In [12]:
from tsfresh import extract_features
from tsfresh.feature_extraction.settings import ComprehensiveFCParameters

In [13]:
# Generate features for sensor_1 for each segment_id
extracted_features_df = extract_features(
    timeseries_container=df,
    column_id="segment_id",
    column_sort="time",
    default_fc_parameters=ComprehensiveFCParameters(),
    disable_progressbar=False,
    n_jobs=10,
)
extracted_features_df = extracted_features_df.rename_axis("segment_id")

Feature Extraction: 100%|██████████| 31/31 [00:15<00:00,  2.06it/s]


In [14]:
# Preview the feature rows of the first three segments
extracted_features_df.iloc[:3]

Unnamed: 0_level_0,sensor_1__variance_larger_than_standard_deviation,sensor_1__has_duplicate_max,sensor_1__has_duplicate_min,sensor_1__has_duplicate,sensor_1__sum_values,sensor_1__abs_energy,sensor_1__mean_abs_change,sensor_1__mean_change,sensor_1__mean_second_derivative_central,sensor_1__median,...,sensor_4__fourier_entropy__bins_2,sensor_4__fourier_entropy__bins_3,sensor_4__fourier_entropy__bins_5,sensor_4__fourier_entropy__bins_10,sensor_4__fourier_entropy__bins_100,sensor_4__permutation_entropy__dimension_3__tau_1,sensor_4__permutation_entropy__dimension_4__tau_1,sensor_4__permutation_entropy__dimension_5__tau_1,sensor_4__permutation_entropy__dimension_6__tau_1,sensor_4__permutation_entropy__dimension_7__tau_1
segment_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
140031872,1.0,0.0,0.0,1.0,-54490.0,47882792.0,25.139139,0.024024,-0.001503,-43.0,...,0.079983,0.217718,0.37659,0.604455,1.128716,1.282814,1.941725,2.624954,3.290594,3.948798
140348256,1.0,0.0,0.0,1.0,-12608.0,64050948.0,26.994995,-0.346346,-0.014028,4.5,...,0.188113,0.341424,0.500208,0.65161,1.306434,1.511587,2.467225,3.460273,4.373737,5.122356
140851065,1.0,1.0,0.0,1.0,-3568.0,306786560.0,61.619621,-1.369369,0.037074,11.0,...,0.110453,0.220352,0.249958,0.446547,0.804609,1.065453,1.467375,1.898141,2.319963,2.742632


In [15]:
# (number of segments, number of generated feature columns)
extracted_features_df.shape

(31, 1558)

In [16]:
# Names of all generated features as a plain Python list
list(extracted_features_df.columns)

['sensor_1__variance_larger_than_standard_deviation',
 'sensor_1__has_duplicate_max',
 'sensor_1__has_duplicate_min',
 'sensor_1__has_duplicate',
 'sensor_1__sum_values',
 'sensor_1__abs_energy',
 'sensor_1__mean_abs_change',
 'sensor_1__mean_change',
 'sensor_1__mean_second_derivative_central',
 'sensor_1__median',
 'sensor_1__mean',
 'sensor_1__length',
 'sensor_1__standard_deviation',
 'sensor_1__variation_coefficient',
 'sensor_1__variance',
 'sensor_1__skewness',
 'sensor_1__kurtosis',
 'sensor_1__absolute_sum_of_changes',
 'sensor_1__longest_strike_below_mean',
 'sensor_1__longest_strike_above_mean',
 'sensor_1__count_above_mean',
 'sensor_1__count_below_mean',
 'sensor_1__last_location_of_maximum',
 'sensor_1__first_location_of_maximum',
 'sensor_1__last_location_of_minimum',
 'sensor_1__first_location_of_minimum',
 'sensor_1__percentage_of_reoccurring_values_to_all_values',
 'sensor_1__percentage_of_reoccurring_datapoints_to_all_datapoints',
 'sensor_1__sum_of_reoccurring_value

# Feature Selection based on Hypothesis Testing
- Based on the value of the target variable, `tsfresh` can select relevant features
- For this volcano dataset, the target variable is "time_to_eruption", associated with each segment.

#### Load target variable ("time_to_eruption") to be used for feature selection

In [17]:
# train.csv sits one level above the segment files and maps each
# segment_id to its time_to_eruption
time_to_errupt_df = pd.read_csv(f"{DATA_DIR}/../train.csv")

# Segments for which features were extracted
selected_segment_ids = extracted_features_df.index

# Restrict the targets to exactly those segments
is_selected = time_to_errupt_df["segment_id"].isin(selected_segment_ids)
y = time_to_errupt_df.loc[is_selected]

# `tsfresh` expects the target as a pandas Series; index it by segment_id
y_series = pd.Series(y["time_to_eruption"].to_numpy(), index=y["segment_id"])

#### Impute missing values of generated features (if any)

In [18]:
from tsfresh.utilities.dataframe_functions import impute

# Fill in missing values of the generated features; the imputed frame is bound
# back to the same name
extracted_features_df = impute(extracted_features_df)

Take a look at the data

In [19]:
# Compare the number of targets with the number of feature rows
y_series.shape, extracted_features_df.shape

((31,), (31, 1558))

In [20]:
# First five targets, indexed by segment_id
y_series.head(5)

segment_id
1400253000    22532237
1403440092     4524342
1406626451    24133615
1403005697    35260256
1408645616    22407847
dtype: int64

In [21]:
# First five rows of the imputed feature matrix
extracted_features_df.head(5)

Unnamed: 0_level_0,sensor_1__variance_larger_than_standard_deviation,sensor_1__has_duplicate_max,sensor_1__has_duplicate_min,sensor_1__has_duplicate,sensor_1__sum_values,sensor_1__abs_energy,sensor_1__mean_abs_change,sensor_1__mean_change,sensor_1__mean_second_derivative_central,sensor_1__median,...,sensor_4__fourier_entropy__bins_2,sensor_4__fourier_entropy__bins_3,sensor_4__fourier_entropy__bins_5,sensor_4__fourier_entropy__bins_10,sensor_4__fourier_entropy__bins_100,sensor_4__permutation_entropy__dimension_3__tau_1,sensor_4__permutation_entropy__dimension_4__tau_1,sensor_4__permutation_entropy__dimension_5__tau_1,sensor_4__permutation_entropy__dimension_6__tau_1,sensor_4__permutation_entropy__dimension_7__tau_1
segment_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
140031872,1.0,0.0,0.0,1.0,-54490.0,47882790.0,25.139139,0.024024,-0.001503,-43.0,...,0.079983,0.217718,0.37659,0.604455,1.128716,1.282814,1.941725,2.624954,3.290594,3.948798
140348256,1.0,0.0,0.0,1.0,-12608.0,64050950.0,26.994995,-0.346346,-0.014028,4.5,...,0.188113,0.341424,0.500208,0.65161,1.306434,1.511587,2.467225,3.460273,4.373737,5.122356
140851065,1.0,1.0,0.0,1.0,-3568.0,306786600.0,61.619621,-1.369369,0.037074,11.0,...,0.110453,0.220352,0.249958,0.446547,0.804609,1.065453,1.467375,1.898141,2.319963,2.742632
1400253000,1.0,0.0,0.0,1.0,-5286.0,137339900.0,42.240242,-0.014014,0.055611,19.0,...,0.138228,0.138228,0.235155,0.32514,1.109209,1.41459,2.238319,3.114445,3.982191,4.805546
1400727315,1.0,1.0,1.0,1.0,354441.0,144066400000.0,954.457458,10.207207,-0.476954,976.5,...,0.110453,0.125256,0.245901,0.32514,0.642095,0.994445,1.309224,1.640863,2.002486,2.372683


#### Select relevant features

In [22]:
from tsfresh import select_features

# Keep only the features judged relevant for the continuous target (regression).
# NOTE(review): the recorded run kept none of the 1558 features (shape (31, 0));
# presumably 31 segments are too few for any feature to test as significant - confirm.
features_filtered_direct = select_features(
    extracted_features_df,
    y_series,
    ml_task="regression",
    n_jobs=10,
)

In [23]:
# (number of segments, number of features that survived the selection)
features_filtered_direct.shape

(31, 0)

# Rolling Features

In [24]:
# Reminder of the long-format input that will be rolled
df.head(5)

Unnamed: 0,segment_id,time,sensor_1,sensor_4
0,1407094442,0,-580.0,-458.0
1,1407094442,1,-598.0,-432.0
2,1407094442,2,-615.0,-342.0
3,1407094442,3,-592.0,-204.0
4,1407094442,4,-536.0,-14.0


#### Create Rolling Data

In [39]:
from tsfresh.utilities.dataframe_functions import roll_time_series
# Expand every segment into rolling windows. As the preview of `df_rolled`
# shows, each window gets an `id` of the form (segment_id, last time step) and
# contains the samples up to that time step; max_timeshift bounds how far back
# a window reaches.
window_shift = 9
df_rolled = roll_time_series(
    df,
    column_id="segment_id",
    column_sort="time",
    max_timeshift=window_shift,
    n_jobs=10,
)

Rolling: 100%|██████████| 50/50 [00:04<00:00, 10.35it/s]


In [40]:
# Size before and after rolling: rolling repeats rows across windows and adds the `id` column
df.shape, df_rolled.shape

((31000, 4), (308605, 5))

In [41]:
# Inspect the first 100 rolled rows to see how the windows grow
df_rolled.iloc[:100]

Unnamed: 0,segment_id,time,sensor_1,sensor_4,id
12400,140031872,0,-167.0,307.0,"(140031872, 0)"
12431,140031872,0,-167.0,307.0,"(140031872, 1)"
12432,140031872,1,-172.0,319.0,"(140031872, 1)"
12493,140031872,0,-167.0,307.0,"(140031872, 2)"
12494,140031872,1,-172.0,319.0,"(140031872, 2)"
12495,140031872,2,-169.0,276.0,"(140031872, 2)"
12586,140031872,0,-167.0,307.0,"(140031872, 3)"
12587,140031872,1,-172.0,319.0,"(140031872, 3)"
12588,140031872,2,-169.0,276.0,"(140031872, 3)"
12589,140031872,3,-167.0,252.0,"(140031872, 3)"


In [42]:
# The window `id` already carries the segment, so the plain segment_id column is
# dropped (presumably so that feature extraction does not treat it as another
# value column - confirm). errors="ignore" keeps the cell safe to re-run: without
# it a second execution raises KeyError because the column is already gone.
df_rolled = df_rolled.drop(columns="segment_id", errors="ignore")

### Generate features on the rolling data

In [44]:
from tsfresh import extract_features
from tsfresh.feature_extraction.settings import MinimalFCParameters

# Compute the minimal tsfresh feature set for every rolled window, for both
# sensor_1 and sensor_4. The `id` and `time` columns created/kept by the rolling
# step must be passed as id and sort columns.
extracted_rolling_features_df = extract_features(
    timeseries_container=df_rolled,
    column_id="id",
    column_sort="time",
    default_fc_parameters=MinimalFCParameters(),
    disable_progressbar=False,
    n_jobs=10,
)

Feature Extraction: 100%|██████████| 50/50 [00:16<00:00,  3.11it/s]


In [45]:
# (number of rolled windows, number of generated feature columns)
extracted_rolling_features_df.shape

(31000, 16)

In [46]:
# First five windows; the index has two levels (segment id, window end time)
extracted_rolling_features_df.head(5)

Unnamed: 0,Unnamed: 1,sensor_1__sum_values,sensor_1__median,sensor_1__mean,sensor_1__length,sensor_1__standard_deviation,sensor_1__variance,sensor_1__maximum,sensor_1__minimum,sensor_4__sum_values,sensor_4__median,sensor_4__mean,sensor_4__length,sensor_4__standard_deviation,sensor_4__variance,sensor_4__maximum,sensor_4__minimum
140031872,0,-167.0,-167.0,-167.0,1.0,0.0,0.0,-167.0,-167.0,307.0,307.0,307.0,1.0,0.0,0.0,307.0,307.0
140031872,1,-339.0,-169.5,-169.5,2.0,2.5,6.25,-167.0,-172.0,626.0,313.0,313.0,2.0,6.0,36.0,319.0,307.0
140031872,2,-508.0,-169.0,-169.333328,3.0,2.054805,4.222222,-167.0,-172.0,902.0,307.0,300.666656,3.0,18.116905,328.222229,319.0,276.0
140031872,3,-675.0,-168.0,-168.75,4.0,2.046338,4.1875,-167.0,-172.0,1154.0,291.5,288.5,4.0,26.27261,690.25,319.0,252.0
140031872,4,-833.0,-167.0,-166.600006,5.0,4.673328,21.84,-158.0,-172.0,1399.0,276.0,279.799988,5.0,29.2397,854.960083,319.0,245.0


In [47]:
# Flatten the two-level window index: discard the second (time) level, name the
# remaining level `segment_id`, and turn it into an ordinary column
extracted_rolling_features_df = (
    extracted_rolling_features_df
    .droplevel(level=1, axis=0)
    .rename_axis(["segment_id"])
    .reset_index()
)

In [48]:
# Number of feature rows (windows) per segment
extracted_rolling_features_df["segment_id"].value_counts()

140851065     1000
1406456924    1000
1404122310    1000
1404179874    1000
1403947680    1000
140348256     1000
1407084157    1000
1405189645    1000
1403222059    1000
1403005697    1000
1408663387    1000
1408285202    1000
1403440092    1000
1404322654    1000
1400929225    1000
140031872     1000
1400727315    1000
1408645616    1000
1402556914    1000
1406938061    1000
1405443107    1000
1406234149    1000
1407261706    1000
1409167039    1000
1407094442    1000
1402914692    1000
1400253000    1000
1406626451    1000
1402674973    1000
1404502479    1000
1403244730    1000
Name: segment_id, dtype: int64

In [49]:
# First five rows after flattening the index
extracted_rolling_features_df.head(5)

Unnamed: 0,segment_id,sensor_1__sum_values,sensor_1__median,sensor_1__mean,sensor_1__length,sensor_1__standard_deviation,sensor_1__variance,sensor_1__maximum,sensor_1__minimum,sensor_4__sum_values,sensor_4__median,sensor_4__mean,sensor_4__length,sensor_4__standard_deviation,sensor_4__variance,sensor_4__maximum,sensor_4__minimum
0,140031872,-167.0,-167.0,-167.0,1.0,0.0,0.0,-167.0,-167.0,307.0,307.0,307.0,1.0,0.0,0.0,307.0,307.0
1,140031872,-339.0,-169.5,-169.5,2.0,2.5,6.25,-167.0,-172.0,626.0,313.0,313.0,2.0,6.0,36.0,319.0,307.0
2,140031872,-508.0,-169.0,-169.333328,3.0,2.054805,4.222222,-167.0,-172.0,902.0,307.0,300.666656,3.0,18.116905,328.222229,319.0,276.0
3,140031872,-675.0,-168.0,-168.75,4.0,2.046338,4.1875,-167.0,-172.0,1154.0,291.5,288.5,4.0,26.27261,690.25,319.0,252.0
4,140031872,-833.0,-167.0,-166.600006,5.0,4.673328,21.84,-158.0,-172.0,1399.0,276.0,279.799988,5.0,29.2397,854.960083,319.0,245.0
