# Using the RelevantFeatureAugmenter with separate datasets for train and test data

This notebook illustrates the RelevantFeatureAugmenter in pipelines where you have first train on samples from dataset `df_train` but then want to test using samples from another `df_test`.
(Here `df_train` and `df_test` refer to the dataframes that contain the time series data)

Due to limitations in the sklearn pipeline API one has to use the `ppl.set_params(fresh__timeseries_container=df)` method for those two dataframes between train and test run.

In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from tsfresh.examples.robot_execution_failures import download_robot_execution_failures
from tsfresh.examples import load_robot_execution_failures
from tsfresh.feature_extraction.settings import MinimalFCParameters
from tsfresh.transformers import RelevantFeatureAugmenter

  from pandas.core import datetools


In [2]:
download_robot_execution_failures()
df, y = load_robot_execution_failures()
df.shape

(1320, 8)

In [3]:
# Here, df contains the time series of both train and test set. 
# We will split it into a train df_train and a test set  df_test:
y_train, y_test = train_test_split(y)
df_train = df.loc[df.id.isin(y_train.index)]
df_test = df.loc[df.id.isin(y_test.index)]
X_train = pd.DataFrame(index=y_train.index)
X_test = pd.DataFrame(index=y_test.index)
df_train.shape, df_test.shape

((990, 8), (330, 8))

In [4]:
ppl = Pipeline([('fresh', RelevantFeatureAugmenter(column_id='id', column_sort='time', 
                                                   default_fc_parameters=MinimalFCParameters())),
                ('clf', RandomForestClassifier())])

In [5]:
# for the fit on the train test set, we set the fresh__timeseries_container to `df_train`
ppl.set_params(fresh__timeseries_container=df_train)
ppl.fit(X_train, y_train)

Feature Extraction: 100%|██████████| 396/396 [00:00<00:00, 37120.22it/s]
Feature Extraction: 100%|██████████| 396/396 [00:00<00:00, 197496.36it/s]


Pipeline(steps=[('fresh', RelevantFeatureAugmenter(chunksize=None, column_id=None, column_kind=None,
             column_sort=None, column_value=None,
             default_fc_parameters=None, disable_progressbar=None,
             fdr_level=None, filter_only_tsfresh_features=True,
             hypotheses_ind...imators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))])

In [6]:
# for the predict on the test test set, we set the fresh__timeseries_container to `df_test`
ppl.set_params(fresh__timeseries_container=df_test)
y_pred = ppl.predict(X_test)

Feature Extraction: 100%|██████████| 132/132 [00:00<00:00, 449025.25it/s]


In [7]:
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00         5
          1       1.00      1.00      1.00        17

avg / total       1.00      1.00      1.00        22

