# Install tsfresh, removing the installed versions of some libraries to avoid dependency collisions (this doesn't work in tiny/free Colab environments; this notebook is meant to be run in a custom Jupyter environment)

In [None]:
#Try to install tsfresh
!pip install --upgrade pip
!pip uninstall numpy pandas --yes
!pip install tsfresh gdown
!pip install sktime

In [None]:
#Update tsfresh (to avoid some problems in the prev step in some colab envs)
!pip install --upgrade --force-reinstall tsfresh

In [None]:
#Install dependencies (to have some tools that in colab are installed by default) to use it in custom jupyter env
!apt-get install zip unzip
!pip install --upgrade pip
!pip install gdown

# Download the cleaned files from Google Drive

In [None]:
# Remove unused files: delete the `sample_data` folder (presumably the sample folder Colab creates by default).
# -f keeps this silent when the folder does not exist.
!rm -rf sample_data

In [None]:
# Upload (or Download files from drive) cleaned data from EDA step
!gdown --id xxxxxx

Downloading...
From: https://drive.google.com/uc?id=16PUjgobp9eNNNDz3gvun7T9VfllJNP9B
To: /tf/cleanData.zip
100%|██████████████████████████████████████| 2.44G/2.44G [00:57<00:00, 42.6MB/s]


In [None]:
# Unzup all of the data
!unzip -q -o *.zip -d tempData/

In [None]:
# Delete the zip files in the working directory once extracted, to avoid running out of disk space on Colab
!rm -rf *.zip

In [None]:
# Move the cleaned data out of the extraction folder into a top-level `data/` folder, to handle it better.
# NOTE(review): the `tempData/content/cleanData` layout presumably mirrors the paths stored inside the zip — confirm.

import os
os.rename("tempData/content/cleanData", "data")

# Remove the extraction folder and anything still left inside it
!rm -rf tempData

# Load libs

In [None]:
# Import the used python libs
import numpy as np
import pandas as pd
import glob
import math
import os
import gc
import sys

# Second mini EDA: join the CSVs selected in the previous EDA step and process them with tsfresh, to get more columns (features) while reducing the number of rows

In [None]:
# Read the train index CSV into a DataFrame and report how many rows it has
trainDF = pd.read_csv("data/train.csv")
print(f"Train Size: {len(trainDF)}")

Train Size: 4431
Test Size: 4520


In [None]:
# Keep only the train segments whose cleaned sensor file exists under data/train/
existingSegmentTrain = [
  segment
  for segment in trainDF.segment_id
  if os.path.exists('data/train/' + str(segment) + '.csv')
]

# Keep the filtered frame under both names: `trainDF` is what the later cells read,
# and `trainDF_EDA` (same naming as `testDF_EDA`) is the frame that gets written to disk.
trainDF_EDA = trainDF.query("segment_id in @existingSegmentTrain")
trainDF = trainDF_EDA
print("New Train Size: " + str(len(trainDF)))
trainDF_EDA.to_csv("data/trainClean.csv")

New Train Size: 1211


In [None]:
# Read the sample-submission CSV (used here as the test index) into a DataFrame and report how many rows it has
testDF = pd.read_csv("data/sample_submission.csv")
print(f"Test Size: {len(testDF)}")

In [None]:
# Keep only the test segments whose cleaned sensor file exists under data/test/.
# The ids go into their own list so the list of existing train segments is not overwritten.
existingSegmentTest = [
  segment
  for segment in testDF.segment_id
  if os.path.exists('data/test/' + str(segment) + '.csv')
]

testDF_EDA = testDF.query("segment_id in @existingSegmentTest")
print("New Test Size: " + str(len(testDF_EDA)))
testDF_EDA.to_csv("data/sample_submissionClean.csv")

New Test Size: 812


In [None]:
# Preview the first 10 rows of the filtered test frame
testDF_EDA.head(10)

In [None]:
# Collect the path of every file inside data/train/ and report how many were found
train_frags = glob.glob("data/train/*")
print(f"Train Size: {len(train_frags)}")

In [None]:
# Join all of the separate per-segment CSVs into a single DataFrame,
# tagging every row of a segment with that segment's time to eruption.

trainDFsArray = []

for flag in train_frags:
  if os.path.exists(flag):
    sensorDF = pd.read_csv(flag)
    # Take the segment id from the file name itself (no directory, no extension),
    # so it does not depend on the exact prefix or path separator returned by glob.
    segment_id = os.path.splitext(os.path.basename(flag))[0]
    df = trainDF[trainDF.segment_id.eq(int(segment_id))]
    if df.empty:
      # No label row for this file: report and skip it instead of failing on df.iloc[0]
      print("Segment without time_to_eruption, skipped: " + flag)
      continue
    sensorDF['time_to_eruption'] = df.iloc[0].time_to_eruption
    trainDFsArray.append(sensorDF)
  else:
    print("Flag not found: " + flag)

trainDFs = pd.concat(trainDFsArray, ignore_index=True)
# Ask the garbage collector to run now that the per-segment frames have been concatenated
gc.collect()

In [None]:
from tsfresh.feature_extraction import ComprehensiveFCParameters
from sktime.transformations.panel.tsfresh import TSFreshFeatureExtractor
from tsfresh import extract_features
from tsfresh.feature_extraction import MinimalFCParameters
from tsfresh.feature_extraction import EfficientFCParameters

In [None]:
# Mini test to check the processing time on the last processed segment and see if it is viable.
# `sensorDF` is not created in this cell: it is left over from an earlier cell's loop over the segment files.
t = TSFreshFeatureExtractor(default_fc_parameters="minimal", show_warnings=False)
# The raw `.values` array is passed, so the DataFrame's column names are not forwarded to the extractor
df2 = t.fit_transform(sensorDF.values)
df2.head()

Feature Extraction: 100%|██████████| 60001/60001 [00:19<00:00, 3146.80it/s]


Unnamed: 0,var_0__sum_values,var_0__median,var_0__mean,var_0__length,var_0__standard_deviation,var_0__variance,var_0__root_mean_square,var_0__maximum,var_0__absolute_maximum,var_0__minimum
0,578.0,11.5,57.8,10.0,191.891011,36822.16,200.407086,482.0,482.0,-153.0
1,609.0,32.0,60.9,10.0,160.609744,25795.49,171.768158,424.0,424.0,-145.0
2,443.0,64.5,44.3,10.0,126.989803,16126.41,134.494981,298.0,298.0,-132.0
3,576.0,35.5,57.6,10.0,133.394303,17794.04,145.299002,275.0,275.0,-113.0
4,411.0,-67.5,41.1,10.0,192.96707,37236.29,197.295464,445.0,445.0,-127.0


In [None]:
# Check the volume of data generated for that single segment: (rows, columns) of the extracted frame
df2.shape

(60001, 783)

In [None]:
# Print every entry of tsfresh's "efficient" parameter preset, to pick only the ones needed
extraction_settings = EfficientFCParameters()
print(extraction_settings)

{'variance_larger_than_standard_deviation': None, 'has_duplicate_max': None, 'has_duplicate_min': None, 'has_duplicate': None, 'sum_values': None, 'abs_energy': None, 'mean_abs_change': None, 'mean_change': None, 'mean_second_derivative_central': None, 'median': None, 'mean': None, 'length': None, 'standard_deviation': None, 'variation_coefficient': None, 'variance': None, 'skewness': None, 'kurtosis': None, 'root_mean_square': None, 'absolute_sum_of_changes': None, 'longest_strike_below_mean': None, 'longest_strike_above_mean': None, 'count_above_mean': None, 'count_below_mean': None, 'last_location_of_maximum': None, 'first_location_of_maximum': None, 'last_location_of_minimum': None, 'first_location_of_minimum': None, 'percentage_of_reoccurring_values_to_all_values': None, 'percentage_of_reoccurring_datapoints_to_all_datapoints': None, 'sum_of_reoccurring_values': None, 'sum_of_reoccurring_data_points': None, 'ratio_value_number_to_time_series_length': None, 'maximum': None, 'absolu

In [None]:
# As seen in the tests with tsfresh, extracting every feature defined by default leaves a dataset
# that is too large and takes too long to process.
# `params` lists only the features to compute: each key is a tsfresh feature calculator name and each
# value is either None (calculator takes no parameters) or the list of parameter sets to evaluate.
# Numeric literals are written out explicitly (not computed) so their values and int/float types stay exact.
params = {
    # Calculators without parameters
    **dict.fromkeys([
        'variance_larger_than_standard_deviation',
        'sum_values',
        'abs_energy',
        'mean_abs_change',
        'mean_change',
        'mean_second_derivative_central',
        'median',
        'mean',
        'length',
        'standard_deviation',
        'variation_coefficient',
        'variance',
        'skewness',
        'kurtosis',
        'root_mean_square',
        'absolute_sum_of_changes',
        'longest_strike_below_mean',
        'longest_strike_above_mean',
        'count_above_mean',
        'count_below_mean',
        'last_location_of_maximum',
        'first_location_of_maximum',
        'last_location_of_minimum',
        'first_location_of_minimum',
        'ratio_value_number_to_time_series_length',
        'maximum',
        'minimum',
        'benford_correlation',
    ]),
    # Calculators evaluated once per parameter set
    'time_reversal_asymmetry_statistic': [{'lag': lag} for lag in (1, 2, 3)],
    'c3': [{'lag': lag} for lag in (1, 2, 3)],
    'cid_ce': [{'normalize': norm} for norm in (True, False)],
    'large_standard_deviation': [
        {'r': r} for r in (0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)
    ],
    # 0.5 is left out on purpose here: the median is already requested above
    'quantile': [{'q': q} for q in (0.1, 0.2, 0.3, 0.4, 0.6, 0.7, 0.8, 0.9)],
    'spkt_welch_density': [{'coeff': coeff} for coeff in (2, 5, 8)],
    'ar_coefficient': [{'coeff': coeff, 'k': 10} for coeff in range(11)],
    'fft_coefficient': [{'coeff': coeff, 'attr': 'real'} for coeff in range(5)],
    'fft_aggregated': [
        {'aggtype': agg} for agg in ('centroid', 'variance', 'skew', 'kurtosis')
    ],
    'max_langevin_fixed_point': [{'m': 3, 'r': 30}],
    'linear_trend': [
        {'attr': attr} for attr in ('pvalue', 'rvalue', 'intercept', 'slope', 'stderr')
    ],
    'energy_ratio_by_chunks': [
        {'num_segments': 10, 'segment_focus': focus} for focus in range(10)
    ],
    'ratio_beyond_r_sigma': [
        {'r': r} for r in (0.5, 1, 1.5, 2, 2.5, 3, 5, 6, 7, 10)
    ],
    'linear_trend_timewise': [
        {'attr': attr} for attr in ('pvalue', 'rvalue', 'intercept', 'slope', 'stderr')
    ],
    'lempel_ziv_complexity': [{'bins': bins} for bins in (2, 3, 5)],
    'fourier_entropy': [{'bins': bins} for bins in (2, 3, 5)],
    'permutation_entropy': [{'tau': 1, 'dimension': dim} for dim in (3, 4, 5)],
    'query_similarity_count': [{'query': None, 'threshold': 0.0}],
    'mean_n_absolute_max': [{'number_of_maxima': 7}],
}

{'variance_larger_than_standard_deviation': None, 'sum_values': None, 'abs_energy': None, 'mean_abs_change': None, 'mean_change': None, 'mean_second_derivative_central': None, 'median': None, 'mean': None, 'length': None, 'standard_deviation': None, 'variation_coefficient': None, 'variance': None, 'skewness': None, 'kurtosis': None, 'root_mean_square': None, 'absolute_sum_of_changes': None, 'longest_strike_below_mean': None, 'longest_strike_above_mean': None, 'count_above_mean': None, 'count_below_mean': None, 'last_location_of_maximum': None, 'first_location_of_maximum': None, 'last_location_of_minimum': None, 'first_location_of_minimum': None, 'ratio_value_number_to_time_series_length': None, 'maximum': None, 'minimum': None, 'benford_correlation': None, 'time_reversal_asymmetry_statistic': [{'lag': 1}, {'lag': 2}, {'lag': 3}], 'c3': [{'lag': 1}, {'lag': 2}, {'lag': 3}], 'cid_ce': [{'normalize': True}, {'normalize': False}], 'large_standard_deviation': [{'r': 0.05}, {'r': 0.1}, {'r':

In [None]:
# Extract the selected tsfresh features for every train segment and save them as one CSV per
# segment under segments/, together with the segment id and its time to eruption.

# Create the output folder; unlike `!mkdir`, this does not complain when the cell is re-run
os.makedirs("segments", exist_ok=True)

for flag in train_frags:
  if os.path.exists(flag):
    # Take the segment id from the file name itself (no directory, no extension),
    # so it does not depend on the exact prefix or path separator returned by glob.
    segment_id = os.path.splitext(os.path.basename(flag))[0]
    # Look the label up before the costly extraction, so a segment without a label is skipped cheaply
    df = trainDF[trainDF.segment_id.eq(int(segment_id))]
    if df.empty:
      print("Segment without time_to_eruption, skipped: " + flag)
      continue
    # Read CSV into DataFrame
    sensorDF = pd.read_csv(flag)
    # Add a 1-based row counter that tsfresh uses as the sort column
    sensorDF.insert(0, 'time', range(1, 1+len(sensorDF)))
    # Add the segment id so tsfresh treats the whole file as a single time series
    sensorDF.insert(0, 'id', segment_id)
    X = extract_features(sensorDF, column_id='id', column_sort='time', default_fc_parameters=params)
    X.insert(0, 'segment_id', segment_id)
    # Add time to eruption to the resultant DataFrame
    X['time_to_eruption'] = df.iloc[0].time_to_eruption
    # Save the dataframe
    X.to_csv('segments/' + segment_id + '.csv')

In [None]:
# Join all of the processed train data into one CSV (the resulting file has no header row):
# only the last line of each per-segment file is kept, and -q stops tail from printing file-name banners.
!tail -q -n-1 segments/*.csv > segments.csv

In [None]:
# Get all of the paths of the files of the test segments data
test_frags = glob.glob("data/test/*")
print("Test Size: " + str(len(test_frags)))

In [None]:
# Extract the selected tsfresh features for every test segment and save them as one CSV per
# segment under segments_test/, together with the segment id and the `time_to_eruption`
# value found for it in `testDF`.

# Create the output folder; unlike `!mkdir`, this does not complain when the cell is re-run
os.makedirs("segments_test", exist_ok=True)

for flag in test_frags:
  if os.path.exists(flag):
    # Take the segment id from the file name itself (no directory, no extension),
    # so it does not depend on the exact prefix or path separator returned by glob.
    segment_id = os.path.splitext(os.path.basename(flag))[0]
    # Look the row up before the costly extraction, so a segment missing from testDF is skipped cheaply
    df = testDF[testDF.segment_id.eq(int(segment_id))]
    if df.empty:
      print("Segment without time_to_eruption, skipped: " + flag)
      continue
    sensorDF = pd.read_csv(flag)
    # Add a 1-based row counter that tsfresh uses as the sort column
    sensorDF.insert(0, 'time', range(1, 1+len(sensorDF)))
    # Add the segment id so tsfresh treats the whole file as a single time series
    sensorDF.insert(0, 'id', segment_id)
    X = extract_features(sensorDF, column_id='id', column_sort='time', default_fc_parameters=params)
    X.insert(0, 'segment_id', segment_id)
    X['time_to_eruption'] = df.iloc[0].time_to_eruption
    X.to_csv('segments_test/' + segment_id + '.csv')

In [None]:
# Join all of the processed test data into one CSV (the resulting file has no header row):
# only the last line of each per-segment file is kept, and -q stops tail from printing file-name banners.
!tail -q -n-1 segments_test/*.csv > segments_test.csv