In [1]:
import pandas as pd
from tsfresh.examples.robot_execution_failures import download_robot_execution_failures, load_robot_execution_failures
from tsfresh.feature_extraction import extract_features, extract_features_on_sub_features
from tsfresh.feature_selection import select_features
from tsfresh.feature_extraction.settings import MinimalFCParameters 

from tsfresh.feature_extraction.gen_features_dicts_function import derive_features_dictionaries

import matplotlib.pyplot as plt
import json

# ....

In [2]:
# Fetch the robot execution failures example dataset (download step),
# then load the time series container together with the target vector y.
download_robot_execution_failures()
timeseries, y = load_robot_execution_failures()

# Preview the first five rows of the time series container.
print(timeseries.head(5))

   id  time  F_x  F_y  F_z  T_x  T_y  T_z
0   1     0   -1   -1   63   -3   -1    0
1   1     1    0    0   62   -3   -1    0
2   1     2   -1   -1   61   -3    0    0
3   1     3   -1   -1   63   -2   -1    0
4   1     4   -1   -1   63   -3   -1    0


# Extract features from the Time Series
Let us start by demonstrating how a simple set of time series features (mean, median, max, variance, ...) is calculated from an example time series.

In [5]:
# Restrict the extraction to the minimal feature set (mean, median, maximum,
# variance, ...) so that this cell computes the "simple set" of features
# described above and uses the same settings as the sub-feature extraction
# later in the notebook. Without `default_fc_parameters`, the library's
# default feature set is used instead.
extracted_features = extract_features(
    timeseries,
    column_id="id",
    column_sort="time",
    default_fc_parameters=MinimalFCParameters(),
)
print(extracted_features.head())

Feature Extraction:   0%|          | 0/20 [00:00<?, ?it/s]
Process ForkPoolWorker-1:
Process ForkPoolWorker-2:
Process ForkPoolWorker-3:
Process ForkPoolWorker-4:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/scott/miniconda3/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/home/scott/miniconda3/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/home/scott/miniconda3/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/home/scott/miniconda3/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/home/scott/miniconda3/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/scott/miniconda3/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(

# How to extract features from an existing feature matrix
Should we find that these features are not sufficiently informative by themselves, we can repeat the same feature extraction process on the extracted features using the `extract_features_on_sub_features` function.

In principle this works as such:

1. The input time series *X* is windowed and the chosen set of N features is extracted from each window. This returns a new matrix *M* where each column represents a particular **feature time series**.

2. For each feature in the resulting output, step 1 is repeated on the corresponding feature time series. Each new column generated can be referred to as a **sub-feature** or **feature-dynamic**.
    
3. Repeat for each column in *M*.

## Differences to `extract_features`
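`extract_features_on_sub_features` shares most of the same parameters as `extract_features`. In the example below it is additionally given `sub_feature_split` (the window size) and `sub_default_fc_parameters`.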

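**Note:** the output of this operation can contain a very large number of columns, because the growth is multiplicative. For instance, if the input has 1 time series and we extract N features, followed by M sub-features from each of the resulting feature time series, the output has N × M columns.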

Below, the algorithm is demonstrated on the same robot execution failures dataset.

!!!!DIAGRAM FROM P4P

In [15]:
# Extract sub-features (feature dynamics) from the robot failure time series,
# using the minimal feature set for both fc_parameters arguments.
# TODO: check whether, if only one of the two fc_parameters arguments is
#       specified, the other one is used in its place.
extracted_sub_features = extract_features_on_sub_features(
    timeseries_container=timeseries,
    sub_feature_split=11,  # window size
    column_id="id",
    column_sort="time",
    default_fc_parameters=MinimalFCParameters(),
    sub_default_fc_parameters=MinimalFCParameters(),
)
print(extracted_sub_features.head(5))

Feature Extraction: 100%|██████████| 20/20 [00:00<00:00, 68.36it/s]
Feature Extraction: 100%|██████████| 20/20 [00:01<00:00, 11.75it/s]

   F_x||length__sum_values  F_x||length__median  F_x||length__mean  \
1                     15.0                  7.5                7.5   
2                     15.0                  7.5                7.5   
3                     15.0                  7.5                7.5   
4                     15.0                  7.5                7.5   
5                     15.0                  7.5                7.5   

   F_x||length__length  F_x||length__standard_deviation  \
1                  2.0                              3.5   
2                  2.0                              3.5   
3                  2.0                              3.5   
4                  2.0                              3.5   
5                  2.0                              3.5   

   F_x||length__variance  F_x||length__maximum  F_x||length__minimum  \
1                  12.25                  11.0                   4.0   
2                  12.25                  11.0                   4.0   
3       




# Interpreting the results

As can be seen, running `extract_features_on_sub_features` results in significantly more columns than `extract_features`.

## Decomposing the column names
`"F_x||length__sum_values"`

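Each column name has the form `<time series>||<feature>__<sub-feature>`. For example, `F_x||length__sum_values` is the `sum_values` sub-feature computed on the `length` feature time series extracted from `F_x`.

This is demonstrated below.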

In [16]:
# Only the first 120 columns are decoded, which keeps the printed output short.
sub_feature_names = extracted_sub_features.columns[:120].tolist()

# Split the column names into two dictionaries:
#   f  - printed below under the "original time series" heading
#   ff - printed in the following cell
f, ff = derive_features_dictionaries(sub_feature_names)

print("The set f features calculated on the original time series:\n")
print(json.dumps(f, indent=4, sort_keys=True))


The set f features calculated on the original time series:

{
    "F_x": {
        "length": null,
        "maximum": null,
        "mean": null,
        "median": null,
        "minimum": null,
        "standard_deviation": null,
        "sum_values": null,
        "variance": null
    },
    "F_y": {
        "length": null,
        "maximum": null,
        "mean": null,
        "median": null,
        "minimum": null,
        "standard_deviation": null,
        "sum_values": null
    }
}


**talk about how to interpret this**

In [17]:
# The set of feature-dynamics / sub-features generated on the feature time series,
# pretty-printed with keys in sorted order.
print(json.dumps(ff, indent=4, sort_keys=True))

{
    "F_x||length": {
        "length": null,
        "maximum": null,
        "mean": null,
        "median": null,
        "minimum": null,
        "standard_deviation": null,
        "sum_values": null,
        "variance": null
    },
    "F_x||maximum": {
        "length": null,
        "maximum": null,
        "mean": null,
        "median": null,
        "minimum": null,
        "standard_deviation": null,
        "sum_values": null,
        "variance": null
    },
    "F_x||mean": {
        "length": null,
        "maximum": null,
        "mean": null,
        "median": null,
        "minimum": null,
        "standard_deviation": null,
        "sum_values": null,
        "variance": null
    },
    "F_x||median": {
        "length": null,
        "maximum": null,
        "mean": null,
        "median": null,
        "minimum": null,
        "standard_deviation": null,
        "sum_values": null,
        "variance": null
    },
    "F_x||minimum": {
        "length": null,
     

## Select the most relevant time series features from both of these datasets

In [18]:
# Typical feature selection: run select_features on the plain extracted
# features against the target y, then preview the first five rows.
selected_features = select_features(extracted_features, y)
print(selected_features.head(5))

   T_y__standard_deviation  T_y__variance  F_z__standard_deviation  \
1                 0.471405       0.222222                 1.203698   
2                 2.054805       4.222222                 4.333846   
3                 1.768867       3.128889                 4.616877   
4                 2.669998       7.128889                 3.833188   
5                 2.039608       4.160000                 4.841487   

   F_z__variance  F_x__standard_deviation  F_x__variance  \
1       1.448889                 0.249444       0.062222   
2      18.782222                 0.956847       0.915556   
3      21.315556                 0.596285       0.355556   
4      14.693333                 0.952190       0.906667   
5      23.440000                 0.879394       0.773333   

   T_x__standard_deviation  T_x__variance  F_y__variance  \
1                 0.339935       0.115556       0.115556   
2                 3.422799      11.715556       4.622222   
3                 2.633122       6.933

In [19]:
# Same selection step, applied to the sub-feature matrix instead of the
# plain extracted features; preview the first five rows.
selected_sub_features = select_features(extracted_sub_features, y)
print(selected_sub_features.head(5))

   F_z||variance__maximum  F_z||standard_deviation__maximum  \
1                3.000000                          1.732051   
2               19.107438                          4.371206   
3               22.250000                          4.716991   
4               16.975207                          4.120098   
5               44.750000                          6.689544   

   T_y||variance__maximum  T_y||standard_deviation__maximum  \
1                0.231405                          0.481046   
2                4.628099                          2.151302   
3                2.975207                          1.724879   
4                7.107438                          2.665978   
5                4.561983                          2.135880   

   F_z||variance__median  F_z||variance__sum_values  F_z||variance__mean  \
1               1.888430                   3.776860             1.888430   
2              14.678719                  29.357438            14.678719   
3             

In [20]:
# Open a 15x9 inch figure. NOTE(review): nothing is drawn on it yet, so the
# rendered figure has no axes.
plt.figure(figsize=(15, 9))


<Figure size 1080x648 with 0 Axes>

<Figure size 1080x648 with 0 Axes>

# Given this new set of subfeatures - we can decompose this into the useful features...

In [None]:
## TODO: run the code with the smaller feature set
##       (placeholder cell - no code has been written here yet)

# Generating new time series
potentially move this to its own notebook