# Features ft. tsfresh 

Based on this [article](https://otus.ru/nest/post/1024/).

In [1]:
import numpy as np
import pandas as pd
from tsfresh import extract_relevant_features
from tsfresh.feature_extraction import settings

from project.utils.data import load_extended_data

### Load data

In [2]:
# Load the project's extended dataset through the shared helper imported above.
data = load_extended_data()

In [3]:
# Sanity check: (rows, columns) of the loaded frame.
data.shape

(1537, 51)

In [4]:
# Preview the first rows to see which columns are available.
data.head()

Unnamed: 0_level_0,income,outcome,balance,income - outcome,non_working_day,non_working_day_usd,isholiday,key_rate,mosprime_rub_1m,mosprime_rub_1w,...,March,April,May,June,July,August,September,October,November,December
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-01-15,0.0,0.0,0.0,0.0,1,1,0,10.0,10.51,10.4,...,0,0,0,0,0,0,0,0,0,0
2017-01-16,1.082341,1.024345,0.057996,0.057996,0,1,0,10.0,10.47,10.42,...,0,0,0,0,0,0,0,0,0,0
2017-01-17,0.896334,1.448364,-0.548519,-0.55203,0,0,0,10.0,10.46,10.42,...,0,0,0,0,0,0,0,0,0,0
2017-01-18,0.754759,0.87956,-0.124801,-0.124801,0,0,0,10.0,10.46,10.42,...,0,0,0,0,0,0,0,0,0,0
2017-01-19,0.949877,0.955268,-0.005391,-0.005391,0,0,0,10.0,10.46,10.48,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Split the target from the predictors.
# The guard keeps the cell safe to re-run: once `balance` has been dropped, a
# second execution is a no-op instead of raising KeyError.
TARGET_COLUMN = 'balance'
if TARGET_COLUMN in data.columns:
    # Copy so that later in-place changes to `y` (its index is replaced further
    # down) cannot affect the frame the column was taken from.
    y = data[TARGET_COLUMN].copy()
    data = data.drop(columns=TARGET_COLUMN)
# NOTE(review): `income - outcome` stays among the predictors and equals
# `balance` in most of the rows shown by data.head() — possible target
# leakage; confirm before trusting the selected features.

### Setup config

In [6]:
# Smallest of the tsfresh presets inspected in this notebook.
settings_minimal = settings.MinimalFCParameters()
# len() counts the entries of the settings mapping (one per calculator name).
print(f'N_features: {len(settings_minimal)}')
# Display the mapping itself: calculator name -> parameters (None = no parameters).
settings_minimal

N_features: 10


{'sum_values': None, 'median': None, 'mean': None, 'length': None, 'standard_deviation': None, 'variance': None, 'root_mean_square': None, 'maximum': None, 'absolute_maximum': None, 'minimum': None}

In [7]:
# Time-based preset; shown for reference only — `ts_settings` below is set to
# the comprehensive preset, not to this one.
settings_time = settings.TimeBasedFCParameters()
# len() counts mapping entries, not output features: the single entry displayed
# below carries five `attr` parameter sets.
print(f'N_features: {len(settings_time)}')
settings_time

N_features: 1


{'linear_trend_timewise': [{'attr': 'pvalue'}, {'attr': 'rvalue'}, {'attr': 'intercept'}, {'attr': 'slope'}, {'attr': 'stderr'}]}

In [8]:
# "Efficient" preset: two entries fewer than the comprehensive preset below
# (presumably the calculators tsfresh marks as computationally expensive — see tsfresh docs).
settings_efficient = settings.EfficientFCParameters()
# len() counts the entries of the settings mapping (one per calculator name).
print(f'N_features: {len(settings_efficient)}')
settings_efficient

N_features: 73


{'variance_larger_than_standard_deviation': None, 'has_duplicate_max': None, 'has_duplicate_min': None, 'has_duplicate': None, 'sum_values': None, 'abs_energy': None, 'mean_abs_change': None, 'mean_change': None, 'mean_second_derivative_central': None, 'median': None, 'mean': None, 'length': None, 'standard_deviation': None, 'variation_coefficient': None, 'variance': None, 'skewness': None, 'kurtosis': None, 'root_mean_square': None, 'absolute_sum_of_changes': None, 'longest_strike_below_mean': None, 'longest_strike_above_mean': None, 'count_above_mean': None, 'count_below_mean': None, 'last_location_of_maximum': None, 'first_location_of_maximum': None, 'last_location_of_minimum': None, 'first_location_of_minimum': None, 'percentage_of_reoccurring_values_to_all_values': None, 'percentage_of_reoccurring_datapoints_to_all_datapoints': None, 'sum_of_reoccurring_values': None, 'sum_of_reoccurring_data_points': None, 'ratio_value_number_to_time_series_length': None, 'maximum': None, 'absolu

In [9]:
# Largest preset inspected here; this is the one used for the extraction below.
settings_comprehensive = settings.ComprehensiveFCParameters()
# len() counts the entries of the settings mapping (one per calculator name);
# parameterized entries can expand into several output features.
print(f'N_features: {len(settings_comprehensive)}')
settings_comprehensive

N_features: 75


{'variance_larger_than_standard_deviation': None, 'has_duplicate_max': None, 'has_duplicate_min': None, 'has_duplicate': None, 'sum_values': None, 'abs_energy': None, 'mean_abs_change': None, 'mean_change': None, 'mean_second_derivative_central': None, 'median': None, 'mean': None, 'length': None, 'standard_deviation': None, 'variation_coefficient': None, 'variance': None, 'skewness': None, 'kurtosis': None, 'root_mean_square': None, 'absolute_sum_of_changes': None, 'longest_strike_below_mean': None, 'longest_strike_above_mean': None, 'count_above_mean': None, 'count_below_mean': None, 'last_location_of_maximum': None, 'first_location_of_maximum': None, 'last_location_of_minimum': None, 'first_location_of_minimum': None, 'percentage_of_reoccurring_values_to_all_values': None, 'percentage_of_reoccurring_datapoints_to_all_datapoints': None, 'sum_of_reoccurring_values': None, 'sum_of_reoccurring_data_points': None, 'ratio_value_number_to_time_series_length': None, 'sample_entropy': None, 

In [10]:
# Preset passed to the extraction below; point this at another preset defined
# above to change which calculators are run.
ts_settings = settings_comprehensive

### Generate features

In [11]:
# Reshape the wide frame into the long (value, id) layout expected by tsfresh:
# every row of `data` becomes one series whose points are that row's columns,
# in column order, and whose id is the row's position (0..n_rows-1).
# NOTE(review): the sequence axis here is the feature columns, not time —
# confirm this is the intended notion of a "series".
data_long = pd.DataFrame(
    {
        # Row-major flattening keeps each row's values contiguous.
        "values": data.to_numpy().flatten(),
        # Repeat each row position once per column so ids line up with values.
        "id": np.repeat(np.arange(len(data)), data.shape[1]),
    }
)

In [12]:
# Re-label the target with 0..n_rows-1 so its index matches the `id` values in
# data_long. This replaces y's previous index in place and relies on `y` and
# `data` sharing the same row order.
y.index = np.arange(data.shape[0])

In [13]:
# Expect (rows of data * columns of data, 2).
data_long.shape

(76850, 2)

In [14]:
# Preview: the first points all belong to id 0, i.e. the first row of `data`.
data_long.head()

Unnamed: 0,values,id
0,0.0,0
1,0.0,0
2,0.0,0
3,1.0,0
4,1.0,0


In [15]:
# Compute tsfresh features for every `id` group of data_long and keep only the
# ones tsfresh judges relevant for `y`; y is matched to the groups through its
# index, which was set to the same 0..n_rows-1 ids.
# No sort or value column is named: `values` is the only non-id column, and the
# resulting feature names carry the `values__` prefix.
# NOTE(review): relevance is assessed on all rows at once; if these features
# feed a model evaluated on a hold-out period, run the selection on the
# training part only — confirm.
features_df = extract_relevant_features(
    data_long, 
    y,
    column_id="id",
    default_fc_parameters=ts_settings,
)

Feature Extraction: 100%|██████████| 20/20 [00:07<00:00,  2.68it/s]


In [16]:
# (number of ids, number of features that survived the relevance filter).
features_df.shape

(1537, 354)

In [17]:
# Preview the selected features; rows are indexed by id, not by date.
features_df.head()

Unnamed: 0,values__count_above__t_0,"values__change_quantiles__f_agg_""mean""__isabs_False__qh_1.0__ql_0.2","values__change_quantiles__f_agg_""mean""__isabs_False__qh_1.0__ql_0.4","values__change_quantiles__f_agg_""var""__isabs_False__qh_0.6__ql_0.0","values__change_quantiles__f_agg_""mean""__isabs_False__qh_0.2__ql_0.0","values__change_quantiles__f_agg_""mean""__isabs_False__qh_0.6__ql_0.0","values__change_quantiles__f_agg_""var""__isabs_True__qh_0.4__ql_0.0","values__change_quantiles__f_agg_""mean""__isabs_True__qh_0.4__ql_0.0","values__change_quantiles__f_agg_""var""__isabs_False__qh_0.4__ql_0.0","values__change_quantiles__f_agg_""mean""__isabs_False__qh_0.4__ql_0.0",...,"values__cwt_coefficients__coeff_11__w_10__widths_(2, 5, 10, 20)",values__energy_ratio_by_chunks__num_segments_10__segment_focus_9,values__approximate_entropy__m_2__r_0.3,values__mean_change,"values__change_quantiles__f_agg_""mean""__isabs_False__qh_1.0__ql_0.0",values__value_count__value_1,values__time_reversal_asymmetry_statistic__lag_2,"values__linear_trend__attr_""slope""","values__cwt_coefficients__coeff_4__w_20__widths_(2, 5, 10, 20)","values__cwt_coefficients__coeff_1__w_20__widths_(2, 5, 10, 20)"
0,1.0,-1.450087e-16,-1.450087e-16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,62.51454,0.0,0.236442,0.0,-1.450087e-16,5.0,1337.489959,-0.348512,40.816336,26.685215
1,1.0,-0.0220886,-0.0220886,0.000134,0.0,-0.002417,0.0,0.0,0.0,0.0,...,62.275306,0.0,0.236442,-0.022089,-0.0220886,4.0,1314.638655,-0.350933,40.898842,26.808125
2,0.98,0.01174533,0.01174533,0.01127,0.021232,0.021232,0.01127,0.021232,0.01127,0.021232,...,62.146776,0.0,0.236442,-0.018293,-0.01829252,3.0,1322.135734,-0.348216,40.640725,26.554636
3,0.98,0.002655347,0.002655347,0.000576,0.0048,0.0048,0.000576,0.0048,0.000576,0.0048,...,62.082452,0.0,0.236442,-0.015403,-0.01540324,3.0,1314.431133,-0.347083,40.53763,26.519678
4,0.98,0.0001146958,0.0001146958,1e-06,0.000207,0.000207,1e-06,0.000207,1e-06,0.000207,...,61.940945,0.0,0.236442,-0.019385,-0.01938525,3.0,1316.129906,-0.347107,40.511242,26.55135


In [18]:
# List the selected feature names (value column, calculator name, parameters).
features_df.columns

Index(['values__count_above__t_0',
       'values__change_quantiles__f_agg_"mean"__isabs_False__qh_1.0__ql_0.2',
       'values__change_quantiles__f_agg_"mean"__isabs_False__qh_1.0__ql_0.4',
       'values__change_quantiles__f_agg_"var"__isabs_False__qh_0.6__ql_0.0',
       'values__change_quantiles__f_agg_"mean"__isabs_False__qh_0.2__ql_0.0',
       'values__change_quantiles__f_agg_"mean"__isabs_False__qh_0.6__ql_0.0',
       'values__change_quantiles__f_agg_"var"__isabs_True__qh_0.4__ql_0.0',
       'values__change_quantiles__f_agg_"mean"__isabs_True__qh_0.4__ql_0.0',
       'values__change_quantiles__f_agg_"var"__isabs_False__qh_0.4__ql_0.0',
       'values__change_quantiles__f_agg_"mean"__isabs_False__qh_0.4__ql_0.0',
       ...
       'values__cwt_coefficients__coeff_11__w_10__widths_(2, 5, 10, 20)',
       'values__energy_ratio_by_chunks__num_segments_10__segment_focus_9',
       'values__approximate_entropy__m_2__r_0.3', 'values__mean_change',
       'values__change_quantiles__f

Получили 354 значимых признака вместо 50 исходных (51 столбец без целевого `balance`).