# Basic exampe of the RelevantFeatureAugmenter in sklearn pipeline

In [2]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

from tsfresh.examples.robot_execution_failures import download_robot_execution_failures
from tsfresh.examples import load_robot_execution_failures
from tsfresh.transformers import RelevantFeatureAugmenter
from tsfresh.utilities.dataframe_functions import impute

In [3]:
# Download the dataset if you haven't already
download_robot_execution_failures() 
# Load data
df_ts, y = load_robot_execution_failures()

In [4]:
# We create an empty feature matrix that has the proper index
X = pd.DataFrame(index=y.index)

In [5]:
# Split data into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [6]:
# We have a pipeline that consists of a feature extraction step with a subsequent Random Forest Classifier 
ppl = Pipeline([('fresh', RelevantFeatureAugmenter(column_id='id', column_sort='time')),
                ('clf', RandomForestClassifier())])

In [7]:
# Here comes the tricky part, due to limitations of the sklearn pipeline API, we can not pass the dataframe
# containing the time series dataframe but instead have to use the set_params method
# In this case, df_ts contains the time series of both train and test set, if you have different dataframes for 
# train and test set, you have to call set_params two times (see the notebook pipeline_with_two_datasets.ipynb)
ppl.set_params(fresh__timeseries_container=df_ts)

Pipeline(steps=[('fresh', RelevantFeatureAugmenter(chunksize=None, column_id=None, column_kind=None,
             column_sort=None, column_value=None,
             default_fc_parameters=None, disable_progressbar=None,
             fdr_level=None, filter_only_tsfresh_features=True,
             hypotheses_ind...imators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))])

In [8]:
# We fit the pipeline
ppl.fit(X_train, y_train)

Feature Extraction: 100%|██████████| 6/6 [00:16<00:00,  2.79s/it]
 'a__friedrich_coefficients__m_3__r_30__coeff_3'
 'a__friedrich_coefficients__m_3__r_30__coeff_2'
 'a__friedrich_coefficients__m_3__r_30__coeff_1'
 'a__friedrich_coefficients__m_3__r_30__coeff_0'
 'a__spkt_welch_density__coeff_8' 'c__spkt_welch_density__coeff_8'
 'b__spkt_welch_density__coeff_8' 'e__spkt_welch_density__coeff_8'
 'd__spkt_welch_density__coeff_8' 'f__spkt_welch_density__coeff_8'] did not have any finite values. Filling with zeros.
Feature Extraction: 100%|██████████| 6/6 [00:04<00:00,  1.28it/s]


Pipeline(steps=[('fresh', RelevantFeatureAugmenter(chunksize=None, column_id=None, column_kind=None,
             column_sort=None, column_value=None,
             default_fc_parameters=None, disable_progressbar=None,
             fdr_level=None, filter_only_tsfresh_features=True,
             hypotheses_ind...imators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))])

In [9]:
# Predicting works as well
y_pred = ppl.predict(X_test)

Feature Extraction: 100%|██████████| 6/6 [00:01<00:00,  2.99it/s]


In [10]:
# So, finally we inspect the performance
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00         5
          1       1.00      1.00      1.00        17

avg / total       1.00      1.00      1.00        22

