# Initial feature selection

Calculate features over sliding windows of the artificial time series in order to select the best feature subset for FEDD.

In [1]:
import pandas as pd
import numpy as np
import os

from tsfresh.feature_extraction import feature_calculators
from tsfresh.utilities.dataframe_functions import roll_time_series
from tsfresh.feature_extraction.extraction import extract_features
from tsfresh.feature_extraction.settings import ComprehensiveFCParameters

from sklearn.feature_selection import mutual_info_regression

In [2]:
import warnings

# Suppress every warning category for the remainder of the session.
warnings.simplefilter("ignore")

In [3]:
# Default number of worker processes passed as n_jobs to the tsfresh helpers.
# Capped at 12, but never above the CPU count this machine reports, so that
# smaller hosts are not oversubscribed (falls back to 1 if the count is unknown).
N_CPU = min(12, os.cpu_count() or 1)

### Implementing missing features described in the FEDD paper

In [4]:
@feature_calculators.set_property("fctype", "simple")
def turning_points(x):
    """Count the points where the series switches between rising and falling.

    A turning point is an interior sample whose two neighbouring first
    differences have opposite signs; a zero difference never counts.

    :param x: one-dimensional sequence of numeric observations
    :return: number of turning points, as a float
    """
    steps = np.diff(x)
    before, after = steps[:-1], steps[1:]
    # A negative product means the two consecutive steps point in opposite directions.
    sign_flips = (after * before) < 0
    return float(sign_flips.sum())

@feature_calculators.set_property("fctype", "simple")
def mutual_information(x, lag, random_state=0):
    """Estimate the mutual information between a series and its lagged copy.

    The series is paired with itself shifted by ``lag`` samples and the pair is
    passed to scikit-learn's ``mutual_info_regression``.

    :param x: one-dimensional sequence of numeric observations
    :param lag: positive number of samples separating the two copies
    :param random_state: seed forwarded to ``mutual_info_regression``; the
        fixed default makes the value reproducible between runs
    :return: estimated mutual information, as a float
    :raises ValueError: if ``lag`` is smaller than 1
    """
    if lag < 1:
        # x[:-0] would be empty, so reject non-positive lags explicitly.
        raise ValueError(f"lag must be a positive integer, got {lag!r}")
    values = np.asarray(x)
    past = values[:-lag].reshape(-1, 1)
    future = values[lag:]
    estimate = mutual_info_regression(past, future, random_state=random_state)
    return float(estimate[0])

# Attach both custom calculators to the feature_calculators module as
# attributes named after the functions themselves.
for _calculator in (turning_points, mutual_information):
    setattr(feature_calculators, _calculator.__name__, _calculator)

### Some functions to run calculations

In [5]:
def rolled_time_series(data, window_size, padding, n_jobs=N_CPU):
    """Cut a series into fixed-length windows with tsfresh's roll_time_series.

    :param data: frame holding 'id', 'timestamp' and 'value' columns
    :param window_size: number of samples in every window
    :param padding: forwarded to roll_time_series as ``rolling_direction``
    :param n_jobs: number of worker processes used for the rolling
    :return: frame with columns 'window_end', 'timestamp' and 'value' and a
        fresh integer index
    """
    # Equal min and max timeshift means every window has the same length.
    shift = window_size - 1
    windows = roll_time_series(
        data,
        column_id='id',
        column_sort='timestamp',
        min_timeshift=shift,
        max_timeshift=shift,
        rolling_direction=padding,
        n_jobs=n_jobs,
        disable_progressbar=True,
    )
    # The rolled 'id' entries are pairs; split them into two separate columns.
    id_parts = pd.DataFrame(windows['id'].values.tolist(), index=windows.index)
    windows[['id1', 'window_end']] = id_parts
    selected = windows[['window_end', 'timestamp', 'value']]
    return selected.reset_index(drop=True)

In [6]:
def extract_all_features_for_ts(df_rolled, params, n_jobs=N_CPU):
    """Extract the configured tsfresh features plus bicorrelation per window.

    Bicorrelation is tsfresh's ``c3`` statistic (lags 1 to 3) evaluated on the
    per-window standardized values. The input frame is left untouched.

    :param df_rolled: frame with 'window_end', 'timestamp' and 'value' columns
    :param params: tsfresh feature settings passed as ``default_fc_parameters``
    :param n_jobs: number of worker processes used by tsfresh
    :return: one row per window holding all extracted features, with the
        bicorrelation columns named ``value__bicorrelation__lag_<lag>``
    """
    df_features = extract_features(
        df_rolled, column_id='window_end', column_sort='timestamp',
        default_fc_parameters=params,
        n_jobs=n_jobs,
        disable_progressbar=True
    )

    # bicorrelation: z-score only the 'value' column within each window and
    # work on a copy so the caller's frame keeps its raw values.
    standardized = df_rolled.groupby('window_end')['value'].transform(
        lambda x: (x - x.mean()) / x.std()
    )
    df_standardized = df_rolled.assign(value=standardized)
    df_bicorrelation = extract_features(
        df_standardized, column_id='window_end', column_sort='timestamp',
        default_fc_parameters={
            "c3": [{"lag": lag} for lag in range(1, 4)]
        },
        n_jobs=n_jobs,
        disable_progressbar=True
    )
    # Relabel by name rather than by position so that a different column
    # order can never attach a lag label to the wrong column.
    df_bicorrelation = df_bicorrelation.rename(
        columns=lambda name: name.replace('__c3__', '__bicorrelation__')
    )
    return pd.concat([df_features, df_bicorrelation], axis=1)

### Parameters

In [7]:
# Start from tsfresh's comprehensive settings and add the two custom
# calculators: turning points without parameters, mutual information for
# lags 1 to 3.
params = ComprehensiveFCParameters()
params.update({
    turning_points.__name__: None,
    mutual_information.__name__: [{"lag": lag} for lag in (1, 2, 3)],
})

### Running feature extraction

In [8]:
def find_csv_files(path):
    """Return the paths of the CSV files located directly inside ``path``.

    :param path: directory to scan (not searched recursively)
    :return: list of ``path``-prefixed file paths whose names end in '.csv',
        sorted by file name so that the order is the same on every run;
        directories are skipped even if their name ends in '.csv'
    """
    candidates = (
        os.path.join(path, filename)
        for filename in sorted(os.listdir(path))
        if filename.endswith('.csv')
    )
    return [candidate for candidate in candidates if os.path.isfile(candidate)]

In [9]:
# Raw artificial series to process (one CSV per series).
artificial_ts_files = find_csv_files('../data/raw/2024_04_11_artificial_data')
# Root folder for interim outputs; the ADWIN summary is saved here later on.
root_interim_path = '../data/interim/2024_04_11_artificial_data'
# Per-series feature tables go into the 'features' subfolder.
artificial_interim_path = os.path.join(root_interim_path, 'features')
os.makedirs(artificial_interim_path, exist_ok=True)

In [10]:
window_size = 7 * 24  # something like 7 days * 24 h --> 1 week
padding = 6  # something like 6 hours; passed on as the rolling step

# Roll every raw series into windows, extract the features per window and
# save the table under the same file name in the interim 'features' folder.
for file in artificial_ts_files:
    # All rows of one file belong to a single series, hence the constant id.
    df = pd.read_csv(file, index_col=0).assign(id=0)
    df_rolled = rolled_time_series(df, window_size, padding)
    df_features = extract_all_features_for_ts(df_rolled, params=params)
    df_features.to_csv(os.path.join(artificial_interim_path, os.path.basename(file)))

### Running ADWIN on features

In [11]:
from river.drift import ADWIN
from multiprocessing import Pool

In [12]:
def run_adwin(file, clock=4, grace_period=28):
    """Run an ADWIN drift detector over every feature column of one CSV file.

    The first CSV column is used as the timestamp index. Columns containing
    any missing value are skipped. A fresh detector is created per feature and
    fed the values in file order.

    :param file: path to a feature table written as CSV
    :param clock: ``clock`` argument of the ADWIN detector
    :param grace_period: ``grace_period`` argument of the ADWIN detector
    :return: frame with one row per feature and the columns 'feature' (name
        without the leading 'value__'), 'alarms' (number of detections),
        'drifts' (timestamps of the detections, in the type they have in the
        index), 'ts' (file name) and 'group' (name of the parent folder);
        every row carries the index label 0
    """
    df = pd.read_csv(file, index_col=0)
    df = df.dropna(axis=1, how='any')

    prefix = 'value__'
    # Plain Python scalars keep the timestamps in their original type instead
    # of being upcast together with the feature values.
    timestamps = df.index.tolist()
    records = []

    for feature in df.columns:
        adwin = ADWIN(clock=clock, grace_period=grace_period)
        detected = []

        for timestamp, value in zip(timestamps, df[feature].tolist()):
            adwin.update(value)
            if adwin.drift_detected:
                detected.append(timestamp)

        name = feature[len(prefix):] if feature.startswith(prefix) else feature
        records.append({
            'feature': name,
            'alarms': len(detected),
            'drifts': detected,
        })

    # Build the frame once instead of concatenating inside the loop; the
    # all-zero index matches what the per-feature concatenation produced.
    results = pd.DataFrame(
        records,
        columns=['feature', 'alarms', 'drifts'],
        index=[0] * len(records),
    )
    results['ts'] = os.path.basename(file)
    results['group'] = os.path.basename(os.path.dirname(file))

    return results

In [13]:
# Feature tables written by the extraction step, one CSV per series.
files = find_csv_files(artificial_interim_path)

In [14]:
# Process the feature tables in parallel, one run_adwin call per file;
# Pool() without arguments picks its own number of workers.
with Pool() as workers:
    results = workers.map(run_adwin, files)

In [15]:
# Stack the per-file result frames and save them as a single CSV in the
# interim root folder.
results_concat = pd.concat(results)
results_concat.to_csv(os.path.join(root_interim_path, 'adwin_results.csv'))