In [1]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd


class PCAForPandas(PCA):
    """Small wrapper around sklearn's PCA estimator that z-normalizes the input and works on pandas DataFrames.

    The input columns are brought into sorted order before they are handed to the scaler and to the PCA,
    so frames that carry the same columns in a different order are treated identically. The caller's
    DataFrame is never modified.
    """

    def __init__(self, **kwargs):
        """Create the estimator; all keyword arguments are forwarded to the PCA base class."""
        self._z_scaler = StandardScaler()
        super().__init__(**kwargs)

        # Sorted column names seen by the last fit; used to validate later inputs.
        self._X_columns = None

    def fit(self, X, y=None):
        """Normalize X and call the fit method of the base class with numpy arrays instead of pandas data frames.

        Fitting always starts from scratch: the remembered column layout is replaced by the one of X,
        so the estimator can be re-fitted on data with different columns.

        :param X: pandas DataFrame with the training data
        :param y: passed through to the base class fit
        :return: the result of the base class fit
        :raise AttributeError: if X is not a pandas DataFrame
        """
        X = self._prepare(X, reset=True)

        # The scaler is fitted on exactly the data the PCA is fitted on.
        z_data = self._z_scaler.fit_transform(X.values)

        return super().fit(z_data, y)

    def fit_transform(self, X, y=None):
        """Call the fit and the transform method of this class.

        :param X: pandas DataFrame with the training data
        :param y: passed through to fit and transform
        :return: pandas DataFrame with the principal components of X
        :raise AttributeError: if X is not a pandas DataFrame
        """
        # fit() and transform() both validate and sort X themselves.
        self.fit(X, y)
        return self.transform(X, y)

    def transform(self, X, y=None):
        """Normalize X and call the transform method of the base class with numpy arrays instead of pandas data frames.

        :param X: pandas DataFrame with the same columns as the data used for fitting
        :param y: accepted for API compatibility, not used
        :return: pandas DataFrame with columns ``pca_0``, ``pca_1``, ... and a fresh 0-based index;
                 the row order of X is kept, the index of X is not carried over
        :raise AttributeError: if X is not a pandas DataFrame or its columns differ from the fitted ones
        """
        X = self._prepare(X)

        # Only the data is passed on: a second positional argument of the scaler's
        # transform is not a target (it is the ``copy`` flag in current scikit-learn).
        z_data = self._z_scaler.transform(X.values)

        transformed_ndarray = super().transform(z_data)

        column_names = ["pca_{}".format(i) for i in range(transformed_ndarray.shape[1])]
        return pd.DataFrame(transformed_ndarray, columns=column_names)

    def _prepare(self, X, reset=False):
        """Check that the data is a pandas DataFrame and return a copy with sorted column names.

        :param X: the data to check
        :param reset: if True, remember the columns of X as the new reference instead of comparing
                      them with the previously remembered ones
        :return: a new DataFrame with the columns of X in sorted order; X itself is left untouched
        :raise AttributeError: if X is not a pandas DataFrame or the columns of the new X are not
                               compatible with the columns from the previous X data
        """
        if not isinstance(X, pd.DataFrame):
            raise AttributeError("X is not a pandas DataFrame")

        # sort_index without inplace returns a new frame, so the caller's column order is preserved.
        X = X.sort_index(axis=1)
        columns = list(X.columns)

        if reset or self._X_columns is None:
            self._X_columns = columns
        elif self._X_columns != columns:
            raise AttributeError("The columns of the new X is not compatible with the columns from the previous X data")

        return X

## Load robot failure example

Split the data set into a past set (1 <= id <= 87) and a future set (87 <= id <= 88). Feature selection is assumed to have been done on the past data, and the same features must then be determined for future data. Id 87 is deliberately part of both sets so that the correctness of the procedure can be checked easily.

In [2]:
from tsfresh.examples.robot_execution_failures import download_robot_execution_failures, load_robot_execution_failures
from tsfresh.feature_extraction import extract_features
from tsfresh.feature_selection import select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import ComprehensiveFCParameters, MinimalFCParameters, settings

download_robot_execution_failures()
df, y = load_robot_execution_failures()

# Past: every row with id <= 87 and all target values except the last one.
df_past = df[(df["id"] <= 87).values]
y_past = y[:-1]

# Future: every row with id >= 87 and the last two target values (id 87 is in both sets).
df_future = df[(df["id"] >= 87).values]
y_future = y[-2:]

df.head()

  from pandas.core import datetools


Unnamed: 0,id,time,a,b,c,d,e,f
0,1,0,-1,-1,63,-3,-1,0
1,1,1,0,0,62,-3,-1,0
2,1,2,-1,-1,61,-3,0,0
3,1,3,-1,-1,63,-2,-1,0
4,1,4,-1,-1,63,-3,-1,0


# Past

## Extract past features

In [3]:
# Extract the MinimalFCParameters feature set per id from the past data, ordered by time.
X_past = extract_features(
    df_past,
    column_id='id',
    column_sort='time',
    default_fc_parameters=MinimalFCParameters(),
    impute_function=impute,
)

Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 81.18it/s]


In [4]:
# Preview the first rows of the extracted past feature matrix.
X_past.head()

Unnamed: 0_level_0,e__sum_values,e__median,e__mean,e__length,e__standard_deviation,e__variance,e__maximum,e__minimum,f__sum_values,f__median,...,a__maximum,a__minimum,c__sum_values,c__median,c__mean,c__length,c__standard_deviation,c__variance,c__maximum,c__minimum
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-10.0,-1.0,-0.666667,15.0,0.471405,0.222222,0.0,-1.0,0.0,0.0,...,0.0,-1.0,938.0,63.0,62.533333,15.0,1.203698,1.448889,64.0,60.0
2,-20.0,-1.0,-1.333333,15.0,2.054805,4.222222,4.0,-5.0,-4.0,0.0,...,0.0,-3.0,932.0,63.0,62.133333,15.0,4.333846,18.782222,70.0,53.0
3,-29.0,-2.0,-1.933333,15.0,1.768867,3.128889,1.0,-5.0,-4.0,0.0,...,1.0,-1.0,917.0,61.0,61.133333,15.0,4.616877,21.315556,68.0,51.0
4,-16.0,-1.0,-1.066667,15.0,2.669998,7.128889,4.0,-6.0,-5.0,0.0,...,1.0,-2.0,933.0,63.0,62.2,15.0,3.833188,14.693333,70.0,56.0
5,-42.0,-3.0,-2.8,15.0,2.039608,4.16,3.0,-5.0,-2.0,0.0,...,2.0,-2.0,909.0,59.0,60.6,15.0,4.841487,23.44,73.0,56.0


## Select past features

In [5]:
# Reduce the past feature matrix with tsfresh's select_features, using the past targets.
X_past_filtered = select_features(X_past, y_past)
# Show the last rows of the filtered feature matrix.
X_past_filtered.tail()



Unnamed: 0_level_0,e__standard_deviation,e__variance,c__standard_deviation,c__variance,a__standard_deviation,a__variance,d__variance,d__standard_deviation,b__standard_deviation,b__variance,...,c__mean,c__median,b__maximum,a__minimum,a__maximum,d__minimum,f__minimum,e__minimum,f__maximum,c__maximum
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
83,7.190735,51.706667,51.26645,2628.248889,5.329165,28.4,1058.728889,32.538114,2.205045,4.862222,...,-73.533333,-53.0,-8.0,-28.0,-14.0,70.0,-16.0,-23.0,-10.0,-24.0
84,39.541483,1563.528889,291.988082,85257.04,36.585729,1338.515556,6875.848889,82.920739,33.816498,1143.555556,...,-711.4,-912.0,83.0,-110.0,-25.0,180.0,-28.0,12.0,0.0,-208.0
85,3.841296,14.755556,14.501494,210.293333,4.616877,21.315556,40.995556,6.402777,2.844097,8.088889,...,28.2,32.0,15.0,4.0,19.0,-46.0,-7.0,-1.0,0.0,50.0
86,52.807154,2788.595556,121.420189,14742.862222,38.235179,1461.928889,202.426667,14.227673,16.041058,257.315556,...,-147.733333,-110.0,69.0,21.0,148.0,-95.0,-10.0,14.0,8.0,-14.0
87,80.098162,6415.715556,204.966621,42011.315556,57.753268,3335.44,70.995556,8.425886,23.75673,564.382222,...,-942.466667,-1036.0,162.0,171.0,342.0,-142.0,13.0,222.0,44.0,-486.0


## Principal Component Analysis of past features

In [6]:
pca_past_filtered = PCAForPandas(n_components=4)
X_past_pca_filtered = pca_past_filtered.fit_transform(X_past_filtered)

# PCAForPandas returns the rows in input order but with a fresh 0-based index, so
# re-attach the ids of the input frame instead of assuming they are consecutive from 1.
X_past_pca_filtered.index = X_past_filtered.index

X_past_pca_filtered.tail()

Unnamed: 0,pca_0,pca_1,pca_2,pca_3
83,-1.828246,0.510962,0.070269,-0.102048
84,3.74134,3.661448,1.263409,-0.115073
85,-2.361277,-0.105445,-0.078477,0.292859
86,0.261576,0.129725,1.586737,1.390926
87,4.337937,3.201585,1.248812,4.419234


# Future

## Extract future features

Only the selected features from the past data are extracted.

In [7]:
# Build the extraction settings from the selected past columns so that only
# those features are computed for the future data.
future_fc_parameters = settings.from_columns(X_past_filtered.columns)
X_future_filtered = extract_features(
    df_future,
    column_id='id',
    column_sort='time',
    kind_to_fc_parameters=future_fc_parameters,
    impute_function=impute,
)

Feature Extraction: 100%|██████████| 6/6 [00:00<00:00, 240.75it/s]


In [8]:
# Display the features extracted for the future ids.
X_future_filtered

Unnamed: 0_level_0,a__maximum,a__minimum,a__standard_deviation,a__variance,e__minimum,e__standard_deviation,e__variance,f__maximum,f__minimum,f__standard_deviation,...,b__maximum,b__standard_deviation,b__variance,c__maximum,c__mean,c__median,c__minimum,c__standard_deviation,c__sum_values,c__variance
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
87,342.0,171.0,57.753268,3335.44,222.0,80.098162,6415.715556,44.0,13.0,9.903983,...,162.0,23.75673,564.382222,-486.0,-942.466667,-1036.0,-1145.0,204.966621,-14137.0,42011.315556
88,-6.0,-13.0,2.061283,4.248889,-27.0,2.628054,6.906667,6.0,3.0,0.884433,...,5.0,1.203698,1.448889,53.0,40.0,42.0,15.0,10.62701,600.0,112.933333


## Principal Component Analysis of future features

The PCA components for id 87 are the same as those computed in the past PCA above.

In [9]:
X_future_pca_filtered = pca_past_filtered.transform(X_future_filtered)

# PCAForPandas returns the rows in input order but with a fresh 0-based index, so
# re-attach the ids of the input frame instead of hard-coding them.
X_future_pca_filtered.index = X_future_filtered.index

X_future_pca_filtered

Unnamed: 0,pca_0,pca_1,pca_2,pca_3
87,4.337937,3.201585,1.248812,4.419234
88,-2.51065,-0.139986,-0.469103,0.243084
