### Import libraries

In [1]:
import numpy as np
import pandas as pd
#import statistics
#import time
#import glob
#import os
import feather
import scipy.stats as stats
import itertools

In [2]:
from utilityFuncs import get_problematic_features



--------------------------------------

### Load calculated features

In [3]:
fullFeatMat = feather.read_dataframe('data/EmpFeatMat.feather')
fullFeatMat

Unnamed: 0,id,group,names,values,method
0,1,airq,SP_Summaries_welch_rect_centroid,0.042951,catch22
1,2,airq,SP_Summaries_welch_rect_centroid,0.042951,catch22
2,3,airq,SP_Summaries_welch_rect_centroid,0.042951,catch22
3,4,airq,SP_Summaries_welch_rect_centroid,0.042951,catch22
4,5,airq,SP_Summaries_welch_rect_centroid,0.042951,catch22
...,...,...,...,...,...
766471,604,"synthetic, 1M",beta,0.276522,tsfeatures
766472,604,"synthetic, 1M",arch_r2,0.010751,tsfeatures
766473,604,"synthetic, 1M",arch_lm,0.814753,tsfeatures
766474,604,"synthetic, 1M",arch_acf,0.011818,tsfeatures


In [4]:
for i in fullFeatMat['method'].unique():
    
    df = fullFeatMat[fullFeatMat.method == i]
    
    print(i, ' = ', len(df['names'].unique()))

catch22  =  22
kats  =  40
tsfel  =  390
tsfresh  =  779
tsfeatures  =  38


In [5]:
df = fullFeatMat[fullFeatMat.method == 'tsfeatures']
# order that shit
df[df.names == 'entropy'].isna().sum()

id         0
group      0
names      0
values    78
method     0
dtype: int64

In [6]:
fullFeatMat[fullFeatMat.method == 'catch22'].isna().sum()

id        0
group     0
names     0
values    0
method    0
dtype: int64

In [7]:
fullFeatMat.columns.values[3]

'values'

In [8]:
# Get the list of all column names from headers
column_headers = list(fullFeatMat.columns.values)
print("The Column Header :", column_headers)

The Column Header : ['id', 'group', 'names', 'values', 'method']


--------------------------------------

### Some functions

In [9]:
# checks if the given columns has more than 90% NaN values

def procentNaN(col):
    return (col.isna().sum() / 604) > 0.1

In [10]:
def myZscore(df):
    df['values'] = stats.zscore(df['values'].values, nan_policy='omit')
    return df

In [11]:
# returns a dataframe with all the features that have not more than 10% NaN's

def remove_problematic_features(d):
    
    goodFeats = {}
    
    for i in d['method'].unique():
        
        # the values need to be normed with z-score (grouped by feature name)
        methodFrame = d.loc[d['method'] == i].groupby('names').apply(myZscore)
        
        # rows are time series', columns are features
        tmpFrame = pd.pivot_table(methodFrame, index=['id'], columns=['names'])
        
        # checks if more than 90% are NaN
        l = tmpFrame.apply(procentNaN)
        
        # returns a list with all feature names that have not more than 90% NaN
        goodFeatsList = list(list(zip(*l.index[l == False].tolist()))[1])
        
        # store the names of the good features
        goodFeats[i] = goodFeatsList
        
        
    return goodFeats

In [12]:
goodFeats1 = remove_problematic_features(fullFeatMat)

for method in goodFeats1:
    print(method, len(goodFeats1[method]))

catch22 22
kats 36
tsfel 378
tsfresh 744
tsfeatures 31


In [13]:


goodFeats2 = get_problematic_features(fullFeatMat)

for method in goodFeats2:
    print(method, len(goodFeats2[method]))

catch22 22
kats 37
tsfel 390
tsfresh 779
tsfeatures 34


In [14]:
fullFeatMat.columns.values

array(['id', 'group', 'names', 'values', 'method'], dtype=object)

In [15]:
goodFeats1['tsfeatures']

['alpha',
 'arch_acf',
 'arch_lm',
 'arch_r2',
 'beta',
 'crossing_points',
 'curvature',
 'diff1_acf1',
 'diff1_acf10',
 'diff1x_pacf5',
 'diff2_acf1',
 'diff2_acf10',
 'diff2x_pacf5',
 'e_acf1',
 'e_acf10',
 'flat_spots',
 'garch_acf',
 'garch_r2',
 'hurst',
 'linearity',
 'lumpiness',
 'nonlinearity',
 'spike',
 'stability',
 'trend',
 'unique_id',
 'unitroot_kpss',
 'unitroot_pp',
 'x_acf1',
 'x_acf10',
 'x_pacf5']

In [17]:
goodFeats2['tsfeatures']

['alpha',
 'arch_acf',
 'arch_lm',
 'arch_r2',
 'beta',
 'crossing_points',
 'curvature',
 'diff1_acf1',
 'diff1_acf10',
 'diff1x_pacf5',
 'diff2_acf1',
 'diff2_acf10',
 'diff2x_pacf5',
 'e_acf1',
 'e_acf10',
 'flat_spots',
 'garch_acf',
 'garch_r2',
 'hurst',
 'linearity',
 'lumpiness',
 'nonlinearity',
 'nperiods',
 'seasonal_period',
 'series_length',
 'spike',
 'stability',
 'trend',
 'unique_id',
 'unitroot_kpss',
 'unitroot_pp',
 'x_acf1',
 'x_acf10',
 'x_pacf5']

Remove bad features and bad IDs.

In [25]:
# returns a list with the ID's of all the 'good' time series' and the filtered version of the raw dataset

def remove_problematic_datasets(d):
    
    # returns a dictionary with a list of good features for each method
    goodFeats = remove_problematic_features(d)
    
    # removes infinite values
    infID = d[abs(d['values']) == np.inf]['id'].unique()
    d = d.loc[~d['id'].isin(infID)]

    #---------------------------------------------
    # remove the bad features
   
    filtered = pd.DataFrame()
    
    for method in d['method'].unique():
        
        # then remove all the features that are not mentioned
        
        temp = d.loc[(d['method'] == method) & (d['names'].isin(goodFeats[method]))]
        filtered = pd.concat([filtered, temp], ignore_index=True)
    
    #----------------------------------------------------------------------------------------
    # now that the bad features are removed, we need to remove any time series that has a NaN
    
    badIDs = []
    
    for method in filtered['method'].unique():
        
        # normalizing the values
        methodFrame = filtered.loc[filtered['method'] == method].groupby('names').apply(myZscore)
        
        # rows are time series', columns are features
        tmpFrame = pd.pivot_table(methodFrame, index=['id'], columns=['names'])
        badIDs = badIDs + np.where(tmpFrame.isna().any(axis=1))[0].tolist()
    
    badIDs = list(set(badIDs))
    
    # this will store all the good features and all the good id's in the dataframe 'filtered'
    filtered = filtered.loc[~filtered['id'].isin(badIDs)]
    
    goodIDs = filtered['id'].unique()
    
    return filtered, goodIDs
    

In [26]:
filt1, goodIDs = remove_problematic_datasets(fullFeatMat)

In [27]:
fullFeatMat['id'].unique() in goodIDs

  fullFeatMat['id'].unique() in goodIDs


False

In [28]:
len(goodIDs)

513

In [29]:
# Change the row indexes
filt1.index = list(range(len(filt1)))
filt1

Unnamed: 0,id,group,names,values,method
0,1,airq,SP_Summaries_welch_rect_centroid,0.042951,catch22
1,2,airq,SP_Summaries_welch_rect_centroid,0.042951,catch22
2,3,airq,SP_Summaries_welch_rect_centroid,0.042951,catch22
3,6,airq,SP_Summaries_welch_rect_centroid,1.497165,catch22
4,7,airq,SP_Summaries_welch_rect_centroid,1.503301,catch22
...,...,...,...,...,...
621751,604,"synthetic, 1M",beta,0.276522,tsfeatures
621752,604,"synthetic, 1M",arch_r2,0.010751,tsfeatures
621753,604,"synthetic, 1M",arch_lm,0.814753,tsfeatures
621754,604,"synthetic, 1M",arch_acf,0.011818,tsfeatures


Norm the values by each feature for each method.

In [30]:
filt2 = pd.DataFrame()

for method in filt1['method'].unique():

    methodFrame = filt1[filt1['method'] == method]
    normedFrame = methodFrame.groupby('names').apply(myZscore)
    
    filt2 = pd.concat([filt2, normedFrame], ignore_index=True)
    
filt2

Unnamed: 0,id,group,names,values,method
0,1,airq,SP_Summaries_welch_rect_centroid,-0.459488,catch22
1,2,airq,SP_Summaries_welch_rect_centroid,-0.459488,catch22
2,3,airq,SP_Summaries_welch_rect_centroid,-0.459488,catch22
3,6,airq,SP_Summaries_welch_rect_centroid,2.624249,catch22
4,7,airq,SP_Summaries_welch_rect_centroid,2.637261,catch22
...,...,...,...,...,...
621751,604,"synthetic, 1M",beta,0.305495,tsfeatures
621752,604,"synthetic, 1M",arch_r2,-0.466958,tsfeatures
621753,604,"synthetic, 1M",arch_lm,0.829934,tsfeatures
621754,604,"synthetic, 1M",arch_acf,-0.471774,tsfeatures


In [13]:
# store to use for PCA
#filt2.to_feather('divers/versuch02.feather')
filt2.to_feather('divers/versuch02_exp.feather')

--------------------------------------

Now adding a column called _comb_id_ which stores the method name combined with the feature name.

In [31]:
def combine_method_and_name(row):
    return row['method'] + '_' + row['names']

In [32]:
filt2['comb_id'] = filt2.apply(lambda row: combine_method_and_name(row), axis=1)
filt3 = filt2.copy()
#del filt1, filt2

In [33]:
filt3

Unnamed: 0,id,group,names,values,method,comb_id
0,1,airq,SP_Summaries_welch_rect_centroid,-0.459488,catch22,catch22_SP_Summaries_welch_rect_centroid
1,2,airq,SP_Summaries_welch_rect_centroid,-0.459488,catch22,catch22_SP_Summaries_welch_rect_centroid
2,3,airq,SP_Summaries_welch_rect_centroid,-0.459488,catch22,catch22_SP_Summaries_welch_rect_centroid
3,6,airq,SP_Summaries_welch_rect_centroid,2.624249,catch22,catch22_SP_Summaries_welch_rect_centroid
4,7,airq,SP_Summaries_welch_rect_centroid,2.637261,catch22,catch22_SP_Summaries_welch_rect_centroid
...,...,...,...,...,...,...
621751,604,"synthetic, 1M",beta,0.305495,tsfeatures,tsfeatures_beta
621752,604,"synthetic, 1M",arch_r2,-0.466958,tsfeatures,tsfeatures_arch_r2
621753,604,"synthetic, 1M",arch_lm,0.829934,tsfeatures,tsfeatures_arch_lm
621754,604,"synthetic, 1M",arch_acf,-0.471774,tsfeatures,tsfeatures_arch_acf


--------------------------------------

### Compute correlation

Next step: take every possible combination of 2 methods and calculate the spearman correlation between all features of the two methods.

In [34]:
# create a list with all the methods used and get all the possible combinations
# take every possible combination of 2 methods and calculate the spearman correlation between all features of the two methods
methods = filt3['method'].unique()
methodCombinations = list(itertools.combinations(methods, r=2))

i = 0

# compute the spearman correlation of each method combination
for combination in methodCombinations:
    
    print('Doing: ', combination)
    
    corrDF = pd.DataFrame(columns=['method1', 'method2', 'feat1', 'feat2', 'corr'])
    
    # stores the feature names of method 1
    frame1 = pd.pivot_table(filt3[filt3['method'] == combination[0]], index=['id'], columns=['comb_id'])
    frame2 = pd.pivot_table(filt3[filt3['method'] == combination[1]], index=['id'], columns=['comb_id'])
    
    # stores all the correlations between each pair of features
    allCorr = pd.concat([frame1, frame2], axis=1).corr(method='spearman').filter(frame2.columns).filter(frame1.columns, axis=0)
    
    # having the correlations in on columne, the feature names as indeces
    tmpDF = allCorr.stack()
    
    # rename indeces and column to be able to reset the index
    tmpDF.index.names = [None, 'feat1', 'feat2']
    tmpDF.columns = ['corr']
    corrDF = tmpDF.reset_index().iloc[: , 1:]
    
    # adds the method names
    l = len(corrDF)
    corrDF.insert(loc=0, column='method2', value=l * [combination[1]])
    corrDF.insert(loc=0, column='method1', value=l * [combination[0]])
    
    #corrDF.to_feather('corrMatsNorm/corrMat' + str(i) + '.feather')
    corrDF.to_feather('test/corrMat' + str(i) + '.feather')
    i += 1


Doing:  ('catch22', 'kats')
Doing:  ('catch22', 'tsfel')
Doing:  ('catch22', 'tsfresh')
Doing:  ('catch22', 'tsfeatures')
Doing:  ('kats', 'tsfel')
Doing:  ('kats', 'tsfresh')
Doing:  ('kats', 'tsfeatures')
Doing:  ('tsfel', 'tsfresh')
Doing:  ('tsfel', 'tsfeatures')
Doing:  ('tsfresh', 'tsfeatures')
