# Using the RelevantFeatureAugmenter with separate datasets for train and test data

This notebook illustrates how to use the RelevantFeatureAugmenter in a pipeline where you first train on samples from one dataset, `df_train`, and then want to test on samples from another dataset, `df_test`.
(Here `df_train` and `df_test` refer to the dataframes that contain the time series data.)

Due to limitations in the sklearn pipeline API, one has to call `ppl.set_params(fresh__timeseries_container=df)` to switch between those two dataframes: once with `df_train` before the train run and once with `df_test` before the test run.

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from tsfresh.examples import load_robot_execution_failures
from tsfresh.examples.robot_execution_failures import download_robot_execution_failures
from tsfresh.transformers import RelevantFeatureAugmenter

# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# the legacy sklearn.cross_validation module was removed in 0.20.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [None]:
# The download function has to be called; the bare name on its own is an
# expression statement that does nothing.
download_robot_execution_failures()
# df: time series of all samples, y: the target used for train and test below.
df, y = load_robot_execution_failures()
df.shape

In [None]:
# Here, df contains the time series of both the train and the test set.
# We split the target y first and then select the matching time series rows
# by id, which gives a train container df_train and a test container df_test.
# A fixed random_state makes the split identical on every run.
y_train, y_test = train_test_split(y, random_state=42)
df_train = df.loc[df["id"].isin(y_train.index)]
df_test = df.loc[df["id"].isin(y_test.index)]
# X_train / X_test have no columns; they only carry the sample ids as index.
X_train = pd.DataFrame(index=y_train.index)
X_test = pd.DataFrame(index=y_test.index)
df_train.shape, df_test.shape

In [None]:
# Two-step pipeline: the tsfresh augmenter ('fresh') followed by a random
# forest classifier ('clf'). The augmenter's timeseries_container is not set
# here; it is supplied through ppl.set_params(...) before fit and predict.
ppl = Pipeline([
    ('fresh', RelevantFeatureAugmenter(column_id='id', column_sort='time')),
    # Fixed seed so the fitted forest is the same on every run.
    ('clf', RandomForestClassifier(random_state=42)),
])

In [None]:
# Fit on the train set: point the augmenter at `df_train` first, then fit.
# set_params returns the pipeline itself, so the two calls can be chained.
ppl.set_params(fresh__timeseries_container=df_train).fit(X_train, y_train)

In [None]:
# Predict on the test set: point the augmenter at `df_test` first, then predict.
# set_params returns the pipeline itself, so the two calls can be chained.
y_pred = ppl.set_params(fresh__timeseries_container=df_test).predict(X_test)

In [None]:
# Compare the predictions with the held-out test labels.
print(classification_report(y_true=y_test, y_pred=y_pred))