### Import libraries

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import random
import statistics
import time
import glob
import os
import feather
from sklearn.model_selection import train_test_split
sns.set(style="whitegrid")

In [2]:
# feature extractors

import pycatch22
from kats.tsfeatures.tsfeatures import TsFeatures
from kats.consts import TimeSeriesData
# Module-level Kats feature-extractor instance.
# NOTE(review): computeKats builds its own TsFeatures() on every call, so this
# instance is not referenced by any of the visible cells - confirm it is still needed.
tsFeatures = TsFeatures()

from tsfeatures import tsfeatures
from tsfresh import extract_features
import tsfel



--------------------------------------

### Functions to calculate the features for _catch22_, _Kats_, _tsfeatures_, _tsfel_ and _tsfresh_

In [3]:
# takes the columns of a dataframe as the rows
def getFeatureRows(features, i, group, extractor):
    """Reshape a wide feature table into long format, one row per feature value.

    Parameters
    ----------
    features : pandas.DataFrame
        Wide table whose columns are the feature names.
    i : scalar
        Identifier of the time series; copied into the ``id`` column.
    group : scalar
        Group label of the time series; copied into the ``group`` column.
    extractor : str
        Name of the feature extractor; copied into the ``method`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``group``, ``names``, ``values`` and ``method``. Rows
        follow the column order of ``features``. A ``features`` table without
        columns yields an empty frame that still has these five columns.
    """
    columns = ['id', 'group', 'names', 'values', 'method']

    # Build one small frame per feature and concatenate a single time. Growing
    # a DataFrame with pd.concat inside the loop re-copies all earlier rows on
    # every iteration and concatenates against an initially empty frame.
    frames = [
        pd.DataFrame({'id': i, 'group': group, 'names': column,
                      'values': features[column].values, 'method': extractor})
        for column in features
    ]

    if not frames:
        return pd.DataFrame(columns=columns)

    return pd.concat(frames, ignore_index=True)


In [19]:
#########################################################
# functions to compute according to the feature extractor


def computeCatch22(d, i, time_var, values_var, group):
    """Compute the catch22 features of one time series and return them in long format.

    ``d`` holds the observations of the series, ``values_var`` names its value
    column, and ``i`` / ``group`` are copied into the result rows by
    getFeatureRows. ``time_var`` is accepted so that all compute functions
    share one signature; it is not used here.
    """
    # NOTE(review): dataFrameToList is not defined in the visible notebook cells -
    # confirm it is available before running this on a fresh kernel.
    series = dataFrameToList(d[values_var].to_frame())
    raw = pycatch22.catch22_all(series)

    # one single-row column per feature: feature name -> [value]
    per_feature = {name: [value] for name, value in zip(raw['names'], raw['values'])}

    # wide one-row frame first, then one long-format row per feature
    wide = pd.DataFrame.from_dict(per_feature)
    return getFeatureRows(wide, i, group, 'catch22')
    
    
def computeKats(d, i, time_var, values_var, group):
    """Compute the Kats features of one time series and return them in long format.

    ``i`` and ``group`` are copied into the result rows by getFeatureRows.
    ``time_var`` and ``values_var`` are accepted so that all compute functions
    share one signature; they are not used here.
    """
    # NOTE(review): d is handed to transform() as-is although TimeSeriesData is
    # imported in this notebook - confirm transform accepts this input.
    raw_features = TsFeatures().transform(d)

    # wrap the feature dictionary in a list to get a one-row wide frame,
    # then turn it into one long-format row per feature
    wide = pd.DataFrame.from_dict([raw_features])
    return getFeatureRows(wide, i, group, 'kats')
    

def computeTsfeatures(d, i, time_var, values_var, group):
    """Compute the tsfeatures features of one time series and return them in long format.

    The value column of ``d`` is repackaged as a frame with the columns
    ``unique_id``, ``ds`` and ``y`` before it is passed to tsfeatures.
    ``time_var`` is accepted so that all compute functions share one
    signature; it is not used here.
    """
    n_obs = len(d)

    # value column renamed to 'y'
    panel = d[values_var].to_frame().rename(columns={values_var: "y"})

    # 'ds' is a generated daily date range, not the timestamps stored in d
    panel.insert(0, 'ds', pd.date_range(start='2020/12/01', periods=n_obs))
    # every observation carries the id of the current series
    panel.insert(0, 'unique_id', n_obs * [i])

    wide = tsfeatures(panel)
    return getFeatureRows(wide, i, group, 'tsfeatures')
    
    
def computeTsfel(d, i, time_var, values_var, group):
    """Compute the tsfel features of one time series and return them in long format.

    Only the ``values_var`` column of ``d`` is passed to the extractor.
    ``time_var`` is accepted so that all compute functions share one
    signature; it is not used here.
    """
    # called without an argument, this retrieves all available features
    all_features_cfg = tsfel.get_features_by_domain()

    signal = d[values_var].to_frame()
    wide = tsfel.time_series_features_extractor(all_features_cfg, signal)
    return getFeatureRows(wide, i, group, 'tsfel')
    
    
def computeTsfresh(d, i, time_var, values_var, group):
    """Compute the tsfresh features of one time series and return them in long format.

    ``d`` is passed to tsfresh unchanged, sorted by ``time_var`` and with
    ``values_var`` as the value column. The id column name is fixed to
    ``'id'`` because this function does not receive the id column name.
    """
    extracted = extract_features(
        d,
        column_id='id',
        column_sort=time_var,
        column_value=values_var,
    )
    return getFeatureRows(extracted, i, group, 'tsfresh')


# dispatch table: maps the name of the chosen extractor to its compute function
switch = {
    'catch22': computeCatch22,
    'kats': computeKats,
    'tsfeatures': computeTsfeatures,
    'tsfel': computeTsfel,
    'tsfresh': computeTsfresh,
}


In [20]:
def calculate_features(df, id_var, time_var, values_var, group_var, feature_set):
    """Compute one feature set for every time series contained in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format table holding the observations of all time series.
    id_var : str
        Name of the column that identifies the time series.
    time_var : str
        Name of the time column (forwarded to the compute function).
    values_var : str
        Name of the value column (forwarded to the compute function).
    group_var : str
        Name of the column holding the group label; the first distinct value
        found for a series is used as its group.
    feature_set : str
        Key into ``switch`` selecting the feature extractor.

    Returns
    -------
    pandas.DataFrame
        Long-format features of all series, in order of first appearance of
        each id. The result is not sorted; an input without rows yields an
        empty DataFrame.

    Raises
    ------
    KeyError
        If ``feature_set`` is not a key of ``switch``.
    """
    # look the extractor up once instead of on every iteration
    compute = switch[feature_set]

    per_series = []

    # iterate over the caller-supplied id column (previously hard-coded to 'id',
    # which ignored id_var)
    for i in df[id_var].unique():

        print("Computing features for ", i)
        # d as all the data available for the current time series
        d = df.loc[df[id_var] == i]
        group = d[group_var].unique()[0]
        per_series.append(compute(d, i, time_var, values_var, group))

    if not per_series:
        return pd.DataFrame()

    # concatenate once: growing the result inside the loop re-copies all
    # previously collected rows for every additional series
    return pd.concat(per_series, ignore_index=True)
        

--------------------------------------

### Load the whole data set and calculate features

In [21]:
# read both time-series tables from their feather files
empirical1000 = feather.read_dataframe('data/emp1000.feather')
emp604 =        feather.read_dataframe('data/emp604.feather')
# display the emp604 table (columns: id, timepoint, value, Name, Keywords, Length)
emp604

Unnamed: 0,id,timepoint,value,Name,Keywords,Length
0,1,1.0,-0.101970,eXascale,airq,1000
1,1,2.0,-0.840121,eXascale,airq,1000
2,1,3.0,-0.887480,eXascale,airq,1000
3,1,4.0,-0.899042,eXascale,airq,1000
4,1,5.0,-0.928869,eXascale,airq,1000
...,...,...,...,...,...,...
603995,604,996.0,-0.218750,eXascale,"synthetic, 1M",1000000
603996,604,997.0,-0.293234,eXascale,"synthetic, 1M",1000000
603997,604,998.0,-0.603164,eXascale,"synthetic, 1M",1000000
603998,604,999.0,-0.126021,eXascale,"synthetic, 1M",1000000


In [22]:
# load the time series
#empirical1000 = feather.read_dataframe('data/emp1000.feather')
emp604 = feather.read_dataframe('data/emp604.feather')

# empty placeholders for the extractors whose computation is disabled below
outs_catch22, outs_kats, outs_tsfel, outs_tsfresh = (pd.DataFrame() for _ in range(4))

#df = empirical1000.copy()

# for original data (disabled)
#outs_catch22 = calculate_features(empirical1000, 'id', 'timepoint', 'value', 'Keywords', 'catch22')
#outs_kats    = calculate_features(empirical1000, 'id', 'timepoint', 'value', 'Keywords', 'kats')
#outs_tsfel   = calculate_features(empirical1000, 'id', 'timepoint', 'value', 'Keywords', 'tsfel')
#outs_tsfresh = calculate_features(empirical1000, 'id', 'timepoint', 'value', 'Keywords', 'tsfresh')

# for eXascale data
#outs_catch22    = calculate_features(emp604, 'id', 'timepoint', 'value', 'Keywords', 'catch22')
#outs_kats       = calculate_features(emp604, 'id', 'timepoint', 'value', 'Keywords', 'kats')
#outs_tsfel      = calculate_features(emp604, 'id', 'timepoint', 'value', 'Keywords', 'tsfel')
#outs_tsfresh    = calculate_features(emp604, 'id', 'timepoint', 'value', 'Keywords', 'tsfresh')
outs_tsfeatures = calculate_features(
    emp604, 'id', 'timepoint', 'value', 'Keywords', 'tsfeatures'
)
        

Computing features for  1
Computing features for  2
Computing features for  3
Computing features for  4
Computing features for  5
Computing features for  6
Computing features for  7
Computing features for  8
Computing features for  9
Computing features for  10
Computing features for  11
Computing features for  12
Computing features for  13
Computing features for  14
Computing features for  15
Computing features for  16
Computing features for  17
Computing features for  18
Computing features for  19
Computing features for  20
Computing features for  21
Computing features for  22
Computing features for  23
Computing features for  24
Computing features for  25
Computing features for  26
Computing features for  27
Computing features for  28
Computing features for  29
Computing features for  30
Computing features for  31
Computing features for  32
Computing features for  33
Computing features for  34
Computing features for  35
Computing features for  36
Computing features for  37
Computing 

Computing features for  298
Computing features for  299
Computing features for  300
Computing features for  301
Computing features for  302
Computing features for  303
Computing features for  304
Computing features for  305
Computing features for  306
Computing features for  307
Computing features for  308
Computing features for  309
Computing features for  310
Computing features for  311
Computing features for  312
Computing features for  313
Computing features for  314
Computing features for  315
Computing features for  316
Computing features for  317
Computing features for  318
Computing features for  319
Computing features for  320
Computing features for  321
Computing features for  322
Computing features for  323
Computing features for  324
Computing features for  325
Computing features for  326
Computing features for  327
Computing features for  328
Computing features for  329
Computing features for  330
Computing features for  331
Computing features for  332
Computing features f

Computing features for  591
Computing features for  592
Computing features for  593
Computing features for  594
Computing features for  595
Computing features for  596
Computing features for  597
Computing features for  598
Computing features for  599
Computing features for  600
Computing features for  601
Computing features for  602
Computing features for  603
Computing features for  604


In [31]:
# needs to be sorted: feature name descending, then id ascending
outs_tsfeatures = outs_tsfeatures.sort_values(
    ['names', 'id'], ascending=[False, True], ignore_index=True
)
        

In [32]:
# saved in files so no need to recalculate the features

#outs_catch22.to_feather('data/outs_catch22_exp.feather')
#outs_kats.to_feather('data/outs_kats_exp.feather')
#outs_tsfel.to_feather('data/outs_tsfel_exp.feather')
#outs_tsfresh.to_feather('data/outs_tsfresh_exp.feather')

# tsfeatures gets its own cache file: writing it to 'data/outs_tsfresh_exp.feather'
# would overwrite the tsfresh cache that is read back as outs_tsfresh further down
outs_tsfeatures.to_feather('data/outs_tsfeatures_exp.feather')

In [44]:
# number of distinct feature names (a missing name, if any, counts as one value)
outs_tsfeatures['names'].nunique(dropna=False)

38

In [22]:
# saved in files so no need to recalculate the features
for _frame, _csv_path in (
    (outs_catch22, 'data/outs_catch22.csv'),
    (outs_kats, 'data/outs_kats.csv'),
    (outs_tsfel, 'data/outs_tsfel.csv'),
    (outs_tsfresh, 'data/outs_tsfresh.csv'),
):
    _frame.to_csv(_csv_path)

# loading the CSV files creates an unnecessary column with the row indices -> remove it and sort the dataframes correctly

def sortDF(df):
    """Remove a stray index column and sort a long-format feature table.

    A table written with ``to_csv`` and read back with ``read_csv`` carries
    the old row index as a leading ``'Unnamed: 0'`` column. Only such a
    leading ``Unnamed...`` column is removed; a table that does not have one
    (for example a frame that is still in memory) keeps all of its columns,
    so the ``id`` column needed for sorting is never dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table with at least the columns ``names`` and ``id``.

    Returns
    -------
    pandas.DataFrame
        New frame sorted by ``names`` (descending) and ``id`` (ascending)
        with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If ``names`` or ``id`` is not a column of ``df``.
    """
    has_index_artifact = (
        df.shape[1] > 0 and str(df.columns[0]).startswith('Unnamed')
    )
    if has_index_artifact:
        df = df.iloc[:, 1:]
    return df.sort_values(['names', 'id'], ascending=[False, True], ignore_index=True)

# run every Python feature table through sortDF
outs_catch22, outs_kats, outs_tsfel, outs_tsfresh = (
    sortDF(table) for table in (outs_catch22, outs_kats, outs_tsfel, outs_tsfresh)
)

In [7]:
# load the files again
def _load_sorted(path):
    """Read a cached feature table and sort it by feature name (descending), then id (ascending)."""
    table = feather.read_dataframe(path)
    return table.sort_values(['names', 'id'], ascending=[False, True], ignore_index=True)


outs_catch22 = _load_sorted('data/outs_catch22_exp.feather')
outs_kats = _load_sorted('data/outs_kats_exp.feather')
outs_tsfel = _load_sorted('data/outs_tsfel_exp.feather')
outs_tsfresh = _load_sorted('data/outs_tsfresh_exp.feather')

# stack the four Python extractor tables into one long-format frame
pythonFeatures = pd.concat(
    [outs_catch22, outs_kats, outs_tsfel, outs_tsfresh],
    ignore_index=True,
)

In [29]:
# merge all the feature dataframes from the python extractors and save in file
# NOTE(review): the statements below are disabled by wrapping them in a string
# literal; nothing is merged or written when this cell runs. Being the cell's
# last expression, the string itself may be echoed as the cell output.
'''
pythonFeatures = pd.concat([outs_catch22, outs_kats, outs_tsfel, outs_tsfresh], ignore_index=True)
pythonFeatures.to_feather('data/pythonFeatures.feather')
'''

--------------------------------------

### Load already calculated features from _feasts_ and _tsfeatures_

In [9]:
# eXascale data --> only feasts, tsfeatures missing
rFeatures = feather.read_dataframe('data/Emp604_feasts.feather')

# merge all features (hctsa still missing)
# NOTE: the variable keeps its "Emp1000" name although the files read and
# written in this cell are the Emp604 ones
Emp1000FeatMat = pd.concat(
    [pythonFeatures, rFeatures],
    ignore_index=True,
)
Emp1000FeatMat.to_feather('data/Emp604FeatMat.feather')

In [30]:
# NOTE(review): disabled code kept inside a string literal - rFeatures is not
# loaded from 'data/rFeatures.feather' when this cell runs. Being the cell's
# last expression, the string itself may be echoed as the cell output.
'''
# gets the features calculated in R
rFeatures = feather.read_dataframe('data/rFeatures.feather')
'''

In [33]:
# load old data and add tsfeatures
old = feather.read_dataframe('data/Emp604FeatMat.feather')
Emp1000FeatMat = pd.concat([old, outs_tsfeatures], ignore_index=True)

# remove all feasts: keep every row whose method is not 'feasts'
df = Emp1000FeatMat[Emp1000FeatMat['method'].ne('feasts')]
df

Unnamed: 0,id,group,names,values,method
0,1,airq,SP_Summaries_welch_rect_centroid,0.042951,catch22
1,2,airq,SP_Summaries_welch_rect_centroid,0.042951,catch22
2,3,airq,SP_Summaries_welch_rect_centroid,0.042951,catch22
3,4,airq,SP_Summaries_welch_rect_centroid,0.042951,catch22
4,5,airq,SP_Summaries_welch_rect_centroid,0.042951,catch22
...,...,...,...,...,...
792443,600,"synthetic, 1M",alpha,0.690349,tsfeatures
792444,601,"synthetic, 1M",alpha,0.495007,tsfeatures
792445,602,"synthetic, 1M",alpha,0.436557,tsfeatures
792446,603,"synthetic, 1M",alpha,0.497863,tsfeatures


In [34]:
# list the extractor labels that remain after the 'feasts' rows were filtered out
df['method'].unique()

array(['catch22', 'kats', 'tsfel', 'tsfresh', 'tsfeatures'], dtype=object)

In [35]:
# move the old row labels into an 'index' column and renumber the rows from 0
df = df.reset_index()
df

Unnamed: 0,index,id,group,names,values,method
0,0,1,airq,SP_Summaries_welch_rect_centroid,0.042951,catch22
1,1,2,airq,SP_Summaries_welch_rect_centroid,0.042951,catch22
2,2,3,airq,SP_Summaries_welch_rect_centroid,0.042951,catch22
3,3,4,airq,SP_Summaries_welch_rect_centroid,0.042951,catch22
4,4,5,airq,SP_Summaries_welch_rect_centroid,0.042951,catch22
...,...,...,...,...,...,...
766471,792443,600,"synthetic, 1M",alpha,0.690349,tsfeatures
766472,792444,601,"synthetic, 1M",alpha,0.495007,tsfeatures
766473,792445,602,"synthetic, 1M",alpha,0.436557,tsfeatures
766474,792446,603,"synthetic, 1M",alpha,0.497863,tsfeatures


In [36]:
# keep every column except the first one (the 'index' column that reset_index added)
f = df.iloc[:,1:]
f

Unnamed: 0,id,group,names,values,method
0,1,airq,SP_Summaries_welch_rect_centroid,0.042951,catch22
1,2,airq,SP_Summaries_welch_rect_centroid,0.042951,catch22
2,3,airq,SP_Summaries_welch_rect_centroid,0.042951,catch22
3,4,airq,SP_Summaries_welch_rect_centroid,0.042951,catch22
4,5,airq,SP_Summaries_welch_rect_centroid,0.042951,catch22
...,...,...,...,...,...
766471,600,"synthetic, 1M",alpha,0.690349,tsfeatures
766472,601,"synthetic, 1M",alpha,0.495007,tsfeatures
766473,602,"synthetic, 1M",alpha,0.436557,tsfeatures
766474,603,"synthetic, 1M",alpha,0.497863,tsfeatures


In [37]:
# write the filtered feature table (without the 'feasts' rows) to its feather file
f.to_feather('data/EmpFeatMat.feather')

In [36]:
# merge all features (hctsa still missing)
# NOTE: this rebinds Emp1000FeatMat to the Python + R features only
# ("OhneHCTSA" in the file names is German for "without HCTSA")
Emp1000FeatMat = pd.concat([pythonFeatures, rFeatures], ignore_index=True)
# the same table is written twice: once as CSV and once as feather
Emp1000FeatMat.to_csv('data/Emp1000FeatMatOhneHCTSA.csv')
Emp1000FeatMat.to_feather('data/Emp1000FeatMatOhneHCTSA.feather')

--------------------------------------