# TSFRESH Robot Failure Example
This example show shows how to use [tsfresh](https://tsfresh.readthedocs.io/) to exctract useful features from multiple timeseries and use them to improve classification performance.

In [9]:
%matplotlib inline
import matplotlib.pylab as plt
import seaborn as sns
from tsfresh.examples.robot_execution_failures import download_robot_execution_failures, load_robot_execution_failures
from tsfresh import extract_features, extract_relevant_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import ComprehensiveFCParameters
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report

In [10]:
# We set the logger to Error level
# This is not recommend for normal use as you can oversee important Warning messages
import logging
logging.basicConfig(level=logging.ERROR)

## Load and visualize data
The data set documents 88 robot executions (`id` 1 - 88), which are a subset of the [Robot Execution Failures Data Set](https://archive.ics.uci.edu/ml/datasets/Robot+Execution+Failures). For the purpose of simplicity we are only differentiating between successfull and failed executions (`y`).
For each execution 15 force (F) and torque (T) samples are given, which were measured at regular time intervals for the spatial dimensions x, y, and z. Therefore each row of the data frame references a specific execution (`id`), a time index (`index`) and documents the respective measurements of 6 sensors (`F_x`, `F_y`, `F_z`, `T_x`, `T_y`, `T_z`).

In [21]:
jog = pd.read_csv('jog_sub_1.csv', index_col = 0)[:1500]
sit = pd.read_csv('sit_sub_1.csv', index_col = 0)[:1500]

In [23]:
jog.head()

Unnamed: 0,attitude.roll,attitude.pitch,attitude.yaw,gravity.x,gravity.y,gravity.z,rotationRate.x,rotationRate.y,rotationRate.z,userAcceleration.x,userAcceleration.y,userAcceleration.z
0,3.058304,-1.227988,2.570999,0.027964,0.941814,0.334969,0.160508,-1.386834,-0.749713,0.204199,0.172657,-0.801048
1,3.075964,-1.225818,2.615277,0.022178,0.941083,0.337448,-0.217198,-0.612402,-0.682841,0.089974,-0.373914,-0.506332
2,3.103364,-1.235013,2.651791,0.012594,0.944152,0.329269,0.663253,-0.498534,-0.620223,0.260127,-0.364364,-0.781249
3,3.109208,-1.244901,2.678484,0.010366,0.947364,0.319989,0.4581,-1.202168,-0.304561,0.584253,-0.922813,-0.285169
4,3.074214,-1.263514,2.661371,0.020364,0.953159,0.301783,1.347809,-0.550578,0.610944,0.626501,-1.045978,-0.063884


In [24]:
sit.head()

Unnamed: 0,attitude.roll,attitude.pitch,attitude.yaw,gravity.x,gravity.y,gravity.z,rotationRate.x,rotationRate.y,rotationRate.z,userAcceleration.x,userAcceleration.y,userAcceleration.z
0,0.351138,-0.416302,-0.098023,0.314588,0.404381,-0.858784,-0.003655,0.009577,-0.000234,9.4e-05,0.000252,-0.006954
1,0.351219,-0.416372,-0.098025,0.314649,0.404445,-0.858732,-0.007937,0.007457,0.000807,-0.001309,0.002262,-0.010348
2,0.351145,-0.416497,-0.098045,0.314567,0.40456,-0.858708,-0.00796,-0.001053,-0.002387,-0.002174,1.2e-05,-0.006664
3,0.350849,-0.416695,-0.098148,0.314286,0.40474,-0.858726,-0.000521,-0.006411,-0.001261,0.001693,0.009216,-0.011575
4,0.350859,-0.416863,-0.098022,0.314271,0.404894,-0.858658,-0.001613,-0.004306,0.006206,0.005767,0.00749,-0.008926


In [41]:
# df['attitude.roll'] = df['attitude.roll'] * 10
# df['attitude.roll'] = df['attitude.roll']
# df['attitude.roll'] = df['attitude.roll'] * 10
# df['attitude.roll'] = df['attitude.roll'] * 10
# df['attitude.roll'] = df['attitude.roll'] * 10

In [44]:
df.head(20)

Unnamed: 0,attitude.roll,attitude.pitch,attitude.yaw,gravity.x,gravity.y,gravity.z,rotationRate.x,rotationRate.y,rotationRate.z,userAcceleration.x,userAcceleration.y,userAcceleration.z,id,time
0,30583.04,-1.227988,2.570999,0.027964,0.941814,0.334969,0.160508,-1.386834,-0.749713,0.204199,0.172657,-0.801048,0,0
1,30759.64,-1.225818,2.615277,0.022178,0.941083,0.337448,-0.217198,-0.612402,-0.682841,0.089974,-0.373914,-0.506332,0,1
2,31033.64,-1.235013,2.651791,0.012594,0.944152,0.329269,0.663253,-0.498534,-0.620223,0.260127,-0.364364,-0.781249,0,2
3,31092.08,-1.244901,2.678484,0.010366,0.947364,0.319989,0.4581,-1.202168,-0.304561,0.584253,-0.922813,-0.285169,0,3
4,30742.14,-1.263514,2.661371,0.020364,0.953159,0.301783,1.347809,-0.550578,0.610944,0.626501,-1.045978,-0.063884,0,4
5,29944.83,-1.301926,2.567805,0.038938,0.964072,0.262773,2.414809,1.069776,1.2658,0.580035,-1.048346,-0.200593,0,5
6,28777.05,-1.355284,2.426982,0.055779,0.976867,0.206445,3.261337,0.707698,1.0361,0.023216,-0.850921,-0.867426,0,6
7,25952.29,-1.423334,2.142191,0.076342,0.989147,0.125539,4.630348,-0.697815,1.018621,-0.063067,-0.196087,-0.033894,0,7
8,19431.8,-1.467603,1.509766,0.09595,0.99468,0.037479,4.045324,-1.028207,0.766063,0.011884,0.306009,0.265301,0,8
9,11396.27,-1.447025,0.703775,0.112156,0.99235,-0.051596,5.01208,-0.081093,0.972427,-0.363179,0.773397,0.642828,0,9


In [40]:
df.dtypes

attitude.roll         float64
attitude.pitch        float64
attitude.yaw          float64
gravity.x             float64
gravity.y             float64
gravity.z             float64
rotationRate.x        float64
rotationRate.y        float64
rotationRate.z        float64
userAcceleration.x    float64
userAcceleration.y    float64
userAcceleration.z    float64
id                      int64
time                    int64
dtype: object

In [36]:
jog['label'] = 1
sit['label'] = 0
y1 = pd.Series(data[['id', 'label']].drop_duplicates()['label'])
y1.reset_index(inplace=True, drop=True)
df = pd.concat([jog, sit], axis = 0)
df.reset_index(inplace = True, drop = True)
y.reset_index(inplace = True, drop = True)
df['id'] = df.index//15
df['time'] = df.index%15
y = df['label']
df = df.drop('label', axis = 1)
extraction_settings = ComprehensiveFCParameters()
X = extract_features(df,
                    column_id='id', column_sort='time',
                    default_fc_parameters=extraction_settings,
                    impute_function= impute)



Feature Extraction:   0%|          | 0/10 [00:00<?, ?it/s][A[A

Feature Extraction:  10%|█         | 1/10 [00:29<04:24, 29.35s/it][A[A

Feature Extraction:  20%|██        | 2/10 [00:29<01:58, 14.82s/it][A[A

Feature Extraction:  30%|███       | 3/10 [00:58<02:17, 19.63s/it][A[A

Feature Extraction:  40%|████      | 4/10 [00:58<01:28, 14.73s/it][A[A

Feature Extraction:  50%|█████     | 5/10 [01:28<01:28, 17.65s/it][A[A

Feature Extraction:  60%|██████    | 6/10 [01:29<00:59, 14.87s/it][A[A

Feature Extraction:  70%|███████   | 7/10 [01:57<00:50, 16.84s/it][A[A

Feature Extraction:  80%|████████  | 8/10 [01:58<00:29, 14.83s/it][A[A

Feature Extraction:  90%|█████████ | 9/10 [02:27<00:16, 16.35s/it][A[A

Feature Extraction: 100%|██████████| 10/10 [02:27<00:00, 14.80s/it][A[A

[A[A

In [37]:
X_filtered = extract_relevant_features(df, y,
                                      column_id='id', column_sort='time',
                                      default_fc_parameters=extraction_settings)



Feature Extraction:   0%|          | 0/10 [00:00<?, ?it/s][A[A

Feature Extraction:  10%|█         | 1/10 [00:29<04:25, 29.54s/it][A[A

Feature Extraction:  20%|██        | 2/10 [00:29<01:58, 14.80s/it][A[A

Feature Extraction:  30%|███       | 3/10 [00:58<02:15, 19.37s/it][A[A

Feature Extraction:  40%|████      | 4/10 [00:58<01:27, 14.62s/it][A[A

Feature Extraction:  50%|█████     | 5/10 [01:26<01:26, 17.34s/it][A[A

Feature Extraction:  60%|██████    | 6/10 [01:27<00:58, 14.60s/it][A[A

Feature Extraction:  70%|███████   | 7/10 [01:56<00:49, 16.60s/it][A[A

Feature Extraction:  80%|████████  | 8/10 [01:56<00:29, 14.58s/it][A[A

Feature Extraction:  90%|█████████ | 9/10 [02:25<00:16, 16.21s/it][A[A

Feature Extraction: 100%|██████████| 10/10 [02:26<00:00, 14.63s/it][A[A

  z = (bigu - meanrank) / sd
  z = (bigu - meanrank) / sd


IndexingError: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match

## Extract Features

In [13]:
extraction_settings = ComprehensiveFCParameters()

In [8]:
X = extract_features(df, 
                     column_id='id', column_sort='time',
                     default_fc_parameters=extraction_settings,
                     impute_function= impute)

Feature Extraction: 100%|██████████| 10/10 [00:31<00:00,  3.11s/it]


In [9]:
X.head()

variable,F_x__abs_energy,F_x__absolute_sum_of_changes,"F_x__agg_autocorrelation__f_agg_""mean""","F_x__agg_autocorrelation__f_agg_""median""","F_x__agg_autocorrelation__f_agg_""var""","F_x__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""intercept""","F_x__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""rvalue""","F_x__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""slope""","F_x__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""stderr""","F_x__agg_linear_trend__f_agg_""max""__chunk_len_50__attr_""intercept""",...,T_z__time_reversal_asymmetry_statistic__lag_1,T_z__time_reversal_asymmetry_statistic__lag_2,T_z__time_reversal_asymmetry_statistic__lag_3,T_z__value_count__value_-inf,T_z__value_count__value_0,T_z__value_count__value_1,T_z__value_count__value_inf,T_z__value_count__value_nan,T_z__variance,T_z__variance_larger_than_standard_deviation
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,14.0,2.0,-0.106351,-0.07206633,0.016879,0.0,-1.0,-1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,15.0,0.0,0.0,0.0,0.0,0.0
2,25.0,14.0,-0.039098,-0.04935275,0.08879,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,11.0,0.0,0.0,0.0,0.195556,0.0
3,12.0,10.0,-0.029815,2.6020850000000003e-17,0.105435,1.0,-1.0,-2.0,0.0,0.0,...,0.0,-0.090909,0.0,0.0,11.0,0.0,0.0,0.0,0.195556,0.0
4,16.0,17.0,-0.049773,-0.06417112,0.14358,1.0,-1.0,-1.0,0.0,0.0,...,0.0,-0.181818,0.0,0.0,8.0,1.0,0.0,0.0,0.355556,0.0
5,17.0,13.0,-0.061467,-0.05172414,0.052642,2.0,-1.0,-2.0,0.0,0.0,...,-0.076923,-0.090909,-0.222222,0.0,9.0,2.0,0.0,0.0,0.382222,0.0


In [10]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 88 entries, 1 to 88
Columns: 4764 entries, F_x__abs_energy to T_z__variance_larger_than_standard_deviation
dtypes: float64(4764)
memory usage: 3.2 MB


In [None]:
X_filtered = extract_relevant_features(df, y, 
                                       column_id='id', column_sort='time', 
                                       default_fc_parameters=extraction_settings)

Feature Extraction:  80%|████████  | 8/10 [00:24<00:06,  3.09s/it]

In [None]:
X_filtered.head()

In [None]:
X_filtered.info()

## Train and evaluate classifier

In [None]:
X_train, X_test, X_filtered_train, X_filtered_test, y_train, y_test = train_test_split(X, X_filtered, y, test_size=.4)

In [None]:
cl = DecisionTreeClassifier()
cl.fit(X_train, y_train)
print(classification_report(y_test, cl.predict(X_test)))

In [None]:
cl.n_features_

In [None]:
cl2 = DecisionTreeClassifier()
cl2.fit(X_filtered_train, y_train)
print(classification_report(y_test, cl2.predict(X_filtered_test)))

In [None]:
cl2.n_features_

Compared to using all features (`cl.n_features_`), using only the relevant features (`cl2.n_features_`) achieves better classification performance with less data.

# Extraction + filtering is the same as filtered extraction

Above, we performed two feature extractions runs. A filtered one and a non filtered one. However, the results of the filtered is equal to just extracting all features and then filtering them.

In [None]:
X_filtered_2 = select_features(X, y)

In [None]:
(X_filtered.columns == X_filtered_2.columns).all()