*tsfresh* returns a great number of features. Depending on the dynamics of the inspected time series, some of them may be highly correlated.

A common technique to deal with such highly correlated features is to apply a transformation such as a principal component analysis (PCA). This notebook shows you how to perform a PCA on the extracted features.

In [1]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd


class PCAForPandas(PCA):
    """PCA estimator that takes and returns pandas DataFrames.

    Small wrapper around the PCA estimator of sklearn: the columns of the
    input are brought into sorted order, z-normalized with a
    ``StandardScaler`` and handed to the base class as plain numpy arrays.
    The column names seen on the first call are remembered and every later
    input has to provide exactly the same columns.
    """

    def __init__(self, **kwargs):
        """Create the internal scaler and forward all keyword arguments to ``PCA``."""
        self._z_scaler = StandardScaler()
        # Name the class explicitly: super(self.__class__, self) resolves to
        # the subclass again and recurses endlessly once this class is
        # subclassed.
        super(PCAForPandas, self).__init__(**kwargs)

        # Sorted column names of the first DataFrame seen (None until then).
        self._X_columns = None

    def fit(self, X, y=None):
        """Normalize X and fit the base class on the resulting numpy array.

        :param X: pandas DataFrame holding the features
        :param y: forwarded to the ``fit`` method of the base class
        :return: whatever the ``fit`` method of the base class returns
        :raise AttributeError: see :meth:`_prepare`
        """
        X = self._prepare(X)

        # The scaler is unsupervised, so y is not handed to it.
        self._z_scaler.fit(X.values)
        z_data = self._z_scaler.transform(X.values)

        return super(PCAForPandas, self).fit(z_data, y)

    def fit_transform(self, X, y=None):
        """Call the fit and then the transform method of this class on X.

        :param X: pandas DataFrame holding the features
        :param y: forwarded to ``fit`` and ``transform``
        :return: pandas DataFrame as returned by :meth:`transform`
        :raise AttributeError: see :meth:`_prepare`
        """
        X = self._prepare(X)

        self.fit(X, y)
        return self.transform(X, y)

    def transform(self, X, y=None):
        """Normalize X with the fitted scaler and project it with the base class.

        :param X: pandas DataFrame holding the features
        :param y: accepted for signature compatibility only, not used
        :return: pandas DataFrame with the columns ``pca_0``, ``pca_1``, ...
                 The index of X is not carried over; the result has the
                 default integer index.
        :raise AttributeError: see :meth:`_prepare`
        """
        X = self._prepare(X)

        # Do not pass y on: the second positional parameter of
        # StandardScaler.transform is ``copy`` in current sklearn, not y.
        z_data = self._z_scaler.transform(X.values)

        transformed_ndarray = super(PCAForPandas, self).transform(z_data)

        column_names = ["pca_{}".format(i) for i in range(transformed_ndarray.shape[1])]
        return pd.DataFrame(transformed_ndarray, columns=column_names)

    def _prepare(self, X):
        """Check that X is a pandas DataFrame and return it with sorted columns.

        The DataFrame passed in is left untouched; a column-sorted copy is
        returned. On the first call the sorted column names are stored, on
        every later call they are compared against the stored ones.

        :param X: the object to check
        :return: X with its columns sorted by name
        :raise AttributeError: if X is not a DataFrame or the columns of the
                               new X are not the same as the columns of the
                               previous X data
        """
        if not isinstance(X, pd.DataFrame):
            raise AttributeError("X is not a pandas DataFrame")

        # Sort into a new object; an in-place sort would silently reorder
        # the columns of the caller's DataFrame.
        X = X.sort_index(axis=1)

        columns = list(X.columns)
        if self._X_columns is None:
            self._X_columns = columns
        elif self._X_columns != columns:
            raise AttributeError("The columns of the new X is not compatible with the columns from the previous X data")

        return X

## Load robot failure example

Split the data set into a train set (1 <= id <= 87) and a test set (87 <= id <= 88). The assumption is that the feature selection was done in the past on the train data, and that the same features now have to be determined for future (test) data. The id 87 is part of both sets so that the correctness of the procedure can be easily shown.

In [2]:
from tsfresh.examples.robot_execution_failures import download_robot_execution_failures, load_robot_execution_failures
from tsfresh.feature_extraction import extract_features
from tsfresh.feature_selection import select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import ComprehensiveFCParameters, MinimalFCParameters, settings

# Fetch the robot execution failures data set and load it.
download_robot_execution_failures()
df, y = load_robot_execution_failures()

# Train part: every row with id up to and including 87, all labels but the last one.
train_rows = df.id <= 87
df_train = df[train_rows]
y_train = y[:-1]

# Test part: the rows with id 87 and above, together with the last two labels.
test_rows = df.id >= 87
df_test = df[test_rows]
y_test = y[-2:]

df.head()

  from pandas.core import datetools


Unnamed: 0,id,time,F_x,F_y,F_z,T_x,T_y,T_z
0,1,0,-1,-1,63,-3,-1,0
1,1,1,0,0,62,-3,-1,0
2,1,2,-1,-1,61,-3,0,0
3,1,3,-1,-1,63,-2,-1,0
4,1,4,-1,-1,63,-3,-1,0


# Train

## Extract train features

In [3]:
# Extract the features described by MinimalFCParameters for every id of the
# train data and run the result through impute.
minimal_fc_parameters = MinimalFCParameters()
X_train = extract_features(
    df_train,
    column_id='id',
    column_sort='time',
    default_fc_parameters=minimal_fc_parameters,
    impute_function=impute,
)

Feature Extraction: 100%|██████████| 522/522 [00:00<00:00, 4771.52it/s]


In [4]:
# Show the first rows of the extracted train features.
X_train.head()

variable,F_x__length,F_x__maximum,F_x__mean,F_x__median,F_x__minimum,F_x__standard_deviation,F_x__sum_values,F_x__variance,F_y__length,F_y__maximum,...,T_y__sum_values,T_y__variance,T_z__length,T_z__maximum,T_z__mean,T_z__median,T_z__minimum,T_z__standard_deviation,T_z__sum_values,T_z__variance
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,15.0,0.0,-0.933333,-1.0,-1.0,0.249444,-14.0,0.062222,15.0,0.0,...,-10.0,0.222222,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,15.0,0.0,-0.866667,-1.0,-3.0,0.956847,-13.0,0.915556,15.0,3.0,...,-20.0,4.222222,15.0,0.0,-0.266667,0.0,-1.0,0.442217,-4.0,0.195556
3,15.0,1.0,-0.666667,-1.0,-1.0,0.596285,-10.0,0.355556,15.0,2.0,...,-29.0,3.128889,15.0,0.0,-0.266667,0.0,-1.0,0.442217,-4.0,0.195556
4,15.0,1.0,-0.4,0.0,-2.0,0.95219,-6.0,0.906667,15.0,5.0,...,-16.0,7.128889,15.0,1.0,-0.333333,0.0,-1.0,0.596285,-5.0,0.355556
5,15.0,2.0,-0.6,-1.0,-2.0,0.879394,-9.0,0.773333,15.0,3.0,...,-42.0,4.16,15.0,1.0,-0.133333,0.0,-1.0,0.618241,-2.0,0.382222


## Select train features

In [5]:
# Reduce the train features with tsfresh's select_features, using y_train as
# the target, and show the last rows of the result.
X_train_filtered = select_features(X_train, y_train)
X_train_filtered.tail()



variable,T_y__variance,T_y__standard_deviation,F_z__standard_deviation,F_z__variance,F_x__standard_deviation,F_x__variance,T_x__variance,T_x__standard_deviation,F_y__variance,F_y__standard_deviation,...,F_z__sum_values,F_z__median,F_y__maximum,F_x__minimum,F_x__maximum,T_x__minimum,T_z__minimum,T_y__minimum,T_z__maximum,F_z__maximum
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
83,51.706667,7.190735,51.26645,2628.248889,5.329165,28.4,1058.728889,32.538114,4.862222,2.205045,...,-1103.0,-53.0,-8.0,-28.0,-14.0,70.0,-16.0,-23.0,-10.0,-24.0
84,1563.528889,39.541483,291.988082,85257.04,36.585729,1338.515556,6875.848889,82.920739,1143.555556,33.816498,...,-10671.0,-912.0,83.0,-110.0,-25.0,180.0,-28.0,12.0,0.0,-208.0
85,14.755556,3.841296,14.501494,210.293333,4.616877,21.315556,40.995556,6.402777,8.088889,2.844097,...,423.0,32.0,15.0,4.0,19.0,-46.0,-7.0,-1.0,0.0,50.0
86,2788.595556,52.807154,121.420189,14742.862222,38.235179,1461.928889,202.426667,14.227673,257.315556,16.041058,...,-2216.0,-110.0,69.0,21.0,148.0,-95.0,-10.0,14.0,8.0,-14.0
87,6415.715556,80.098162,204.966621,42011.315556,57.753268,3335.44,70.995556,8.425886,564.382222,23.75673,...,-14137.0,-1036.0,162.0,171.0,342.0,-142.0,13.0,222.0,44.0,-486.0


## Principal Component Analysis on train features

In [6]:
# Fit a PCA with four components on the selected train features.
pca_train = PCAForPandas(n_components=4)
X_train_pca = pca_train.fit_transform(X_train_filtered)

# The PCA result comes back with a fresh 0-based index. Label its rows with the
# ids of the input rows instead of assuming that the ids are position + 1.
X_train_pca.index = X_train_filtered.index.values

X_train_pca.tail()

Unnamed: 0,pca_0,pca_1,pca_2,pca_3
83,-1.828246,0.510962,0.070269,-0.102048
84,3.74134,3.661448,1.263409,-0.115073
85,-2.361277,-0.105445,-0.078477,0.292859
86,0.261576,0.129725,1.586737,1.390926
87,4.337937,3.201585,1.248812,4.419234


# Test

## Extract test features

Only the selected features from the train data are extracted.

In [7]:
# Build the extraction settings from the column names of the selected train
# features and extract only those features for the test data.
selected_fc_parameters = settings.from_columns(X_train_filtered.columns)
X_test_filtered = extract_features(
    df_test,
    column_id='id',
    column_sort='time',
    kind_to_fc_parameters=selected_fc_parameters,
    impute_function=impute,
)

Feature Extraction: 100%|██████████| 12/12 [00:00<00:00, 2301.09it/s]


In [8]:
# Show the features extracted for the test ids.
X_test_filtered

variable,F_x__maximum,F_x__minimum,F_x__standard_deviation,F_x__variance,F_y__maximum,F_y__standard_deviation,F_y__variance,F_z__maximum,F_z__mean,F_z__median,...,T_x__minimum,T_x__standard_deviation,T_x__variance,T_y__minimum,T_y__standard_deviation,T_y__variance,T_z__maximum,T_z__minimum,T_z__standard_deviation,T_z__variance
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
87,342.0,171.0,57.753268,3335.44,162.0,23.75673,564.382222,-486.0,-942.466667,-1036.0,...,-142.0,8.425886,70.995556,222.0,80.098162,6415.715556,44.0,13.0,9.903983,98.088889
88,-6.0,-13.0,2.061283,4.248889,5.0,1.203698,1.448889,53.0,40.0,42.0,...,-29.0,4.057366,16.462222,-27.0,2.628054,6.906667,6.0,3.0,0.884433,0.782222


## Principal Component Analysis on test features

The PCA components of the id 87 are the same as in the previous train PCA.

In [9]:
# Project the test features with the PCA that was fitted on the train data.
X_test_pca = pca_train.transform(X_test_filtered)

# The PCA result comes back with a fresh 0-based index. Label its rows with the
# ids of the input rows instead of hard-coding them.
X_test_pca.index = X_test_filtered.index.values

X_test_pca

Unnamed: 0,pca_0,pca_1,pca_2,pca_3
87,4.337937,3.201585,1.248812,4.419234
88,-2.51065,-0.139986,-0.469103,0.243084
