In [None]:
# !tar -xvf Yahoo_S5_dataset.tgz

In [1]:
from scipy.signal import savgol_filter

def remove_noise(x, window_length=5, polyorder=2):
    '''
    Smooth a sequence with scipy's Savitzky-Golay filter (scipy.signal.savgol_filter):
    https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.savgol_filter.html
    x: input sequence for noise reduction
    window_length: The length of the filter window (i.e., the number of coefficients)
    polyorder: The order of the polynomial used to fit the samples

    returns the noise reduced sequence produced by savgol_filter
    '''
    # All remaining savgol_filter options are left at their scipy defaults.
    smoothed = savgol_filter(x, window_length, polyorder)
    return smoothed

def normalize(series):
    '''
    Standardizes the input series to zero mean and unit standard deviation
    (z-score): (series - mean) / std
    series: The input pandas series to normalize

    returns normalized series. If the standard deviation is exactly zero
    (a constant series) the centered series is returned unscaled (all zeros)
    instead of dividing by zero.
    '''
    mean = series.mean()
    std = series.std()
    centered = series - mean
    if std == 0:
        # No spread to scale by: dividing here would turn every value into NaN.
        return centered
    return centered / std

def normal_check(series, tol=1e-6):
    '''
    Quick check to see if a series has mean 0 and standard deviation 1
    (within a tolerance), using series.mean() and series.std()
    series: The input pandas series to check
    tol: Maximum allowed absolute deviation of the mean from 0 and of the
         std from 1 (inclusive)

    return: Boolean value for check satisfied or not. A NaN mean or std
            (e.g. empty or all-NaN series) fails the check.
    '''
    # Phrased as positive "within tolerance" tests on purpose: any comparison
    # with NaN is False, so NaN statistics are reported as a failed check
    # rather than slipping through.
    mean_ok = abs(series.mean() - 0) <= tol
    std_ok = abs(series.std() - 1) <= tol
    return bool(mean_ok and std_ok)

In [2]:
import os
import glob 

import pandas as pd

# Preprocess every benchmark CSV: sort by timestamp, smooth and standardize the
# 'value' column into 'value_processed', then write the result under out_data_root
# using the same folder/file layout as the input.
in_data_root = 'ydata-labeled-time-series-anomalies-v1_0'
out_data_root = 'preprocessed_ydata-labeled-time-series-anomalies-v1_0'
folder_list = ['A1Benchmark', 'A2Benchmark', 'A3Benchmark', 'A4Benchmark']

for folder in folder_list:
    # Sorted so the processing order is reproducible across runs and platforms.
    files = sorted(glob.glob(os.path.join(in_data_root, folder, '*.csv')))

    timestamp_col = 'timestamp'
    if folder in ['A3Benchmark', 'A4Benchmark']:
        # These folders use a 'timestamps' column; files with '_all' in their
        # name are skipped. Only the file name is tested so that a directory
        # component containing '_all' cannot filter out every file.
        files = [file for file in files if '_all' not in os.path.basename(file)]
        timestamp_col = 'timestamps'

    print(f'Extracted {len(files)} file names from folder {folder}')

    if not files:
        # Nothing to write for this folder, so do not create an empty output dir.
        continue

    # makedirs (not mkdir) so the missing out_data_root parent is created on a
    # fresh run; exist_ok makes re-running the cell safe.
    out_dir = os.path.join(out_data_root, folder)
    os.makedirs(out_dir, exist_ok=True)

    for file in files:
        df = pd.read_csv(file)
        df = df.sort_values(timestamp_col)
        df = df.reset_index(drop=True)
        df['value_processed'] = remove_noise(df['value'])
        df['value_processed'] = normalize(df['value_processed'])
        if not normal_check(df['value_processed']):
            print(f'Preprocessing incorrect for file {file} in folder {folder}')

        # basename instead of splitting on "/" so Windows-style paths also work.
        file_name = os.path.basename(file)
        out_file_path = os.path.join(out_dir, file_name)
        df.to_csv(out_file_path, index=False)

Extracted 67 file names from folder A1Benchmark
Extracted 100 file names from folder A2Benchmark
Extracted 100 file names from folder A3Benchmark
Extracted 100 file names from folder A4Benchmark


In [4]:
# !tar -zvcf Preprocessed_Yahoo_S5_dataset.gz preprocessed_ydata-labeled-time-series-anomalies-v1_0

a preprocessed_ydata-labeled-time-series-anomalies-v1_0
a preprocessed_ydata-labeled-time-series-anomalies-v1_0/A3Benchmark
a preprocessed_ydata-labeled-time-series-anomalies-v1_0/A2Benchmark
a preprocessed_ydata-labeled-time-series-anomalies-v1_0/A4Benchmark
a preprocessed_ydata-labeled-time-series-anomalies-v1_0/A1Benchmark
a preprocessed_ydata-labeled-time-series-anomalies-v1_0/A1Benchmark/real_59.csv
a preprocessed_ydata-labeled-time-series-anomalies-v1_0/A1Benchmark/real_65.csv
a preprocessed_ydata-labeled-time-series-anomalies-v1_0/A1Benchmark/real_64.csv
a preprocessed_ydata-labeled-time-series-anomalies-v1_0/A1Benchmark/real_58.csv
a preprocessed_ydata-labeled-time-series-anomalies-v1_0/A1Benchmark/real_66.csv
a preprocessed_ydata-labeled-time-series-anomalies-v1_0/A1Benchmark/real_67.csv
a preprocessed_ydata-labeled-time-series-anomalies-v1_0/A1Benchmark/real_63.csv
a preprocessed_ydata-labeled-time-series-anomalies-v1_0/A1Benchmark/real_62.csv
a preprocessed_ydata-labeled-tim