In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import statistics
import time
import glob
import os
import feather
import scipy.stats as stats
import itertools
from sklearn.pipeline import make_pipeline
from sktime.datasets import load_from_tsfile_to_dataframe
# Select seaborn's "whitegrid" style for any figures drawn later in the notebook.
sns.set(style="whitegrid")

In [None]:
from featureExtraction import *

--------------------------------------

### LOAD DATA

In [None]:
# Dataset names; each one is read from data/<name>.feather below.
datasets = [
    'GlobalClimate',
    'HumidityHouse',
    'HungaryChickenpox',
    'IstanbulStockExchange',
    'ParkingBirmingham',
    'PedalMe',
]

# Filled by the feature-extraction cells: one array of row ids per extractor.
badIDs = []

# One DataFrame per dataset, in the same order as `datasets`.
plain = [pd.read_feather(f'data/{name}.feather') for name in datasets]

--------------------------------------

### CATCH22

In [None]:
# Build one feature table per dataset and concatenate them a single time at the
# end; growing a DataFrame with pd.concat inside the loop re-copies every
# previously collected row on each iteration.
per_dataset = []

for frame, dataset in zip(plain, datasets):
    print(dataset)

    # The last four columns are split off and relabelled as
    # 'y', 'ind', 'data', 'set'; only the columns before them go to the extractor.
    X = frame.iloc[:, :-4].copy()
    info = frame.iloc[:, -4:].copy()
    info.columns = ['y', 'ind', 'data', 'set']

    # NOTE(review): the axis=1 concat aligns rows by index label, so this
    # assumes Catch22.transform returns rows indexed like `frame` - confirm.
    features = Catch22.transform(X)
    per_dataset.append(pd.concat([features, info], axis=1))

df = pd.concat(per_dataset, ignore_index=True)

# replace infinite values with NaN
df = df.replace([np.inf, -np.inf], np.nan)

# remove features with more than 10% NaN
k = df.shape[1]
nan_share = df.isna().mean()
output_catch22 = df.drop(columns=nan_share[nan_share > 0.1].index)

print('\n')
print('# bad features = ', k, ' - ', output_catch22.shape[1])

# Row labels (0..n-1 after ignore_index) of series that still contain a NaN.
schlechteIDs = output_catch22.index[output_catch22.isna().any(axis=1)].values
badIDs.append(schlechteIDs)

print('Bad IDs: ', schlechteIDs)
output_catch22

### KATS

In [None]:
# Build one feature table per dataset and concatenate them a single time at the
# end; growing a DataFrame with pd.concat inside the loop re-copies every
# previously collected row on each iteration.
per_dataset = []

for frame, dataset in zip(plain, datasets):
    print(dataset)

    # The last four columns are split off and relabelled as
    # 'y', 'ind', 'data', 'set'; only the columns before them go to the extractor.
    X = frame.iloc[:, :-4].copy()
    info = frame.iloc[:, -4:].copy()
    info.columns = ['y', 'ind', 'data', 'set']

    # NOTE(review): the axis=1 concat aligns rows by index label, so this
    # assumes Kats.transform returns rows indexed like `frame` - confirm.
    features = Kats.transform(X)
    features = pd.concat([features, info], axis=1)
    per_dataset.append(features)

df = pd.concat(per_dataset, ignore_index=True)

# replace infinite values with NaN
df = df.replace([np.inf, -np.inf], np.nan)

# remove features with more than 10% NaN
k = df.shape[1]
nan_share = df.isna().mean()
output_kats = df.drop(columns=nan_share[nan_share > 0.1].index)

print('\n')
print('# bad features = ', k, ' - ', output_kats.shape[1])

# Row labels (0..n-1 after ignore_index) of series that still contain a NaN.
schlechteIDs = output_kats.index[output_kats.isna().any(axis=1)].values
badIDs.append(schlechteIDs)

print('Bad IDs: ', schlechteIDs)
output_kats

### TSFEATURES

In [None]:
# Build one feature table per dataset and concatenate them a single time at the
# end; growing a DataFrame with pd.concat inside the loop re-copies every
# previously collected row on each iteration.
per_dataset = []

for frame, dataset in zip(plain, datasets):
    print(dataset)

    # The last four columns are split off and relabelled as
    # 'y', 'ind', 'data', 'set'; only the columns before them go to the extractor.
    X = frame.iloc[:, :-4].copy()
    info = frame.iloc[:, -4:].copy()
    info.columns = ['y', 'ind', 'data', 'set']

    # NOTE(review): the axis=1 concat aligns rows by index label, so this
    # assumes TSFeatures.transform returns rows indexed like `frame` - confirm.
    features = TSFeatures.transform(X)
    features = pd.concat([features, info], axis=1)
    per_dataset.append(features)

df = pd.concat(per_dataset, ignore_index=True)

# replace infinite values with NaN
df = df.replace([np.inf, -np.inf], np.nan)

# remove features with more than 10% NaN
k = df.shape[1]
nan_share = df.isna().mean()
output_tsfeatures = df.drop(columns=nan_share[nan_share > 0.1].index)

print('\n')
print('# bad features = ', k, ' - ', output_tsfeatures.shape[1])

# Row labels (0..n-1 after ignore_index) of series that still contain a NaN.
schlechteIDs = output_tsfeatures.index[output_tsfeatures.isna().any(axis=1)].values
badIDs.append(schlechteIDs)

print('Bad IDs: ', schlechteIDs)
output_tsfeatures

### TSFEL

In [None]:
# Build one feature table per dataset and concatenate them a single time at the
# end; growing a DataFrame with pd.concat inside the loop re-copies every
# previously collected row on each iteration.
per_dataset = []

for frame, dataset in zip(plain, datasets):
    print(dataset)

    # The last four columns are split off and relabelled as
    # 'y', 'ind', 'data', 'set'; only the columns before them go to the extractor.
    X = frame.iloc[:, :-4].copy()
    info = frame.iloc[:, -4:].copy()
    info.columns = ['y', 'ind', 'data', 'set']

    # NOTE(review): the axis=1 concat aligns rows by index label, so this
    # assumes TSFel.transform returns rows indexed like `frame` - confirm.
    features = TSFel.transform(X)
    features = pd.concat([features, info], axis=1)
    per_dataset.append(features)

df = pd.concat(per_dataset, ignore_index=True)

# replace infinite values with NaN
df = df.replace([np.inf, -np.inf], np.nan)

# remove features with more than 10% NaN
k = df.shape[1]
nan_share = df.isna().mean()
output_tsfel = df.drop(columns=nan_share[nan_share > 0.1].index)

print('\n')
print('# bad features = ', k, ' - ', output_tsfel.shape[1])

# Row labels (0..n-1 after ignore_index) of series that still contain a NaN.
schlechteIDs = output_tsfel.index[output_tsfel.isna().any(axis=1)].values
badIDs.append(schlechteIDs)

print('Bad IDs: ', schlechteIDs)
output_tsfel

### TSFRESH

In [None]:
# Build one feature table per dataset and concatenate them a single time at the
# end; growing a DataFrame with pd.concat inside the loop re-copies every
# previously collected row on each iteration.
per_dataset = []

for frame, dataset in zip(plain, datasets):
    print(dataset)

    # The last four columns are split off and relabelled as
    # 'y', 'ind', 'data', 'set'; only the columns before them go to the extractor.
    X = frame.iloc[:, :-4].copy()
    info = frame.iloc[:, -4:].copy()
    info.columns = ['y', 'ind', 'data', 'set']

    # NOTE(review): the axis=1 concat aligns rows by index label, so this
    # assumes TSFresh.transform returns rows indexed like `frame` - confirm.
    features = TSFresh.transform(X)
    features = pd.concat([features, info], axis=1)
    per_dataset.append(features)

df = pd.concat(per_dataset, ignore_index=True)

# replace infinite values with NaN
df = df.replace([np.inf, -np.inf], np.nan)

# remove features with more than 10% NaN
k = df.shape[1]
nan_share = df.isna().mean()
output_tsfresh = df.drop(columns=nan_share[nan_share > 0.1].index)

print('\n')
print('# bad features = ', k, ' - ', output_tsfresh.shape[1])

# Row labels (0..n-1 after ignore_index) of series that still contain a NaN.
schlechteIDs = output_tsfresh.index[output_tsfresh.isna().any(axis=1)].values
badIDs.append(schlechteIDs)

print('Bad IDs: ', schlechteIDs)
output_tsfresh


In [None]:
# The five per-extractor feature tables, gathered so later cells can loop over them.
frames = [
    output_catch22,
    output_kats,
    output_tsfeatures,
    output_tsfel,
    output_tsfresh,
]

In [None]:
# Overwrite every remaining NaN with 0. This mutates the frames themselves, so
# the output_* variables referenced by `frames` see the change as well.
for feature_table in frames:
    feature_table.replace(to_replace=np.nan, value=0, inplace=True)

In [None]:
# Print one boolean per frame: True means at least one missing value is present.
for feature_table in frames:
    has_missing = feature_table.isna().values.any()
    print(has_missing)

In [None]:
# Write each feature table to data/output_<name>.feather.
for suffix, table in (
    ('catch22', output_catch22),
    ('kats', output_kats),
    ('tsfeatures', output_tsfeatures),
    ('tsfel', output_tsfel),
    ('tsfresh', output_tsfresh),
):
    table.to_feather(f'data/output_{suffix}.feather')

--------------------------------------

In [None]:
# list with unique values that contain the indices to be removed
toRemove = list(set([item for sublist in badIDs for item in sublist]))
toRemove.sort()

In [None]:
# dataframe containing information on 'ind', 'data' and 'set' for the series to be removed 
bf = output_catch22.iloc[toRemove][['ind', 'data', 'set']].copy()

for data, frame in zip(datasets, plain):
    
    # first remove the bad ones in the training set
    badTrainSets = bf.loc[(bf['data'] == data) & (bf['set'] == 'train')]['ind'].values
    frame.drop(frame.loc[(frame['ind'].isin(badTrainSets)) & (frame['set'] == 'train')].index, axis=0, inplace=True)
    
    # the remove the bad ones from the testing set
    badTestSets = bf.loc[(bf['data'] == data) & (bf['set'] == 'test')]['ind'].values
    frame.drop(frame.loc[(frame['ind'].isin(badTestSets)) & (frame['set'] == 'test')].index, axis=0, inplace=True)
    
    # reset index
    frame.reset_index(drop=True, inplace=True)
    
    # save file
    frame.to_feather('data/' + data +'_plain.feather')


In [None]:
# `t` is its own list object, but its elements are the very same DataFrame
# objects as output_catch22 ... output_tsfresh - the frames are not copied.
t = [
    output_catch22,
    output_kats,
    output_tsfeatures,
    output_tsfel,
    output_tsfresh,
]

In [None]:
# remove the bad series from each feature frame
featureFrames = [output_catch22,
                 output_kats,
                 output_tsfeatures,
                 output_tsfel,
                 output_tsfresh]

for frame in featureFrames:
    frame.drop(toRemove, axis=0, inplace=True)
    frame.reset_index(drop=True, inplace=True)
    
output_catch22.to_feather('featureFrames/output_catch22.feather')
output_kats.to_feather('featureFrames/output_kats.feather')
output_tsfeatures.to_feather('featureFrames/output_tsfeatures.feather')
output_tsfel.to_feather('featureFrames/output_tsfel.feather')
output_tsfresh.to_feather('featureFrames/output_tsfresh.feather')

--------------------------------------