In [2]:
## 0 Imports and Constants
import sys
import os

# Add the directory two levels above the current working directory to sys.path
# (presumably so the project-local `utilities` module imported below can be found — TODO confirm)
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '../../'))
sys.path.insert(0, parent_dir)

import pandas as pd
from pathlib import Path
import numpy as np

# Kept after the sys.path insertion above in case `utilities` is only importable through it
from utilities import split_data_into_sequences, load_sequential_time_series

# Data locations; these are relative paths, so they resolve against the current working directory
DATA_FOLDER = Path("../data")
# Folder the real dataset CSV is read from
REAL_DATA_FOLDER = DATA_FOLDER / "real"
# Folder the jittered / time-warped / TimeGAN / autoencoder CSVs are read from
SYNTHETIC_DATA_FOLDER = DATA_FOLDER / "synthetic"

In [3]:
def split_dataset(data):
    """Cut ``data`` into two consecutive halves along its first axis.

    The shape of ``data`` is printed before splitting. When the number of
    entries is odd, the second half receives the extra entry.

    Args:
        data: Sliceable object that supports ``len()`` and exposes ``shape``
            (the notebook passes NumPy arrays).

    Returns:
        Tuple ``(first_half, second_half)`` obtained by slicing ``data`` at
        ``len(data) // 2``.
    """
    print(f'Shape of data before splitting: {data.shape}')
    midpoint = len(data) // 2
    return data[:midpoint], data[midpoint:]

In [4]:
# Real data kept as a DataFrame (not sequential, not shuffled)
real_data_df = pd.read_csv(
    REAL_DATA_FOLDER / 'metro_interstate_traffic_volume_label_encoded_no_categorical.csv'
)

In [5]:
### Load all kinds of data ###
# Flat tables converted to NumPy arrays: not sequential, not shuffled
real_data = pd.read_csv(
    REAL_DATA_FOLDER / 'metro_interstate_traffic_volume_label_encoded_no_categorical.csv'
).to_numpy()

jitt_01, jitt_02, jitt_005, timewarp = (
    pd.read_csv(SYNTHETIC_DATA_FOLDER / csv_name).to_numpy()
    for csv_name in (
        'mitv_jittered_01.csv',
        'mitv_jittered_02.csv',
        'mitv_jittered_005.csv',
        'mitv_time_warped.csv',
    )
)

# TimeGAN outputs: sequential, shuffled
timegan_gru_seq_shuffled, timegan_lstm_seq_shuffled = (
    load_sequential_time_series(SYNTHETIC_DATA_FOLDER / csv_name, shape=(28499, 12, 5))
    for csv_name in (
        'mitv_28499_12_5_gru_unscaled.csv',
        'mitv_28499_12_5_lstm_unscaled.csv',
    )
)

# Autoencoder output: sequential, not shuffled, ordered by train, val, test
autoencoder_seq = load_sequential_time_series(
    SYNTHETIC_DATA_FOLDER / 'mitv_28478_12_5_autoencoder_unscaled.csv',
    shape=(28478, 12, 5),
)

In [6]:
# split data
# Each flat table is halved; the first half is used as "train", the second as "test".
(
    (jitt_01_train, jitt_01_test),
    (jitt_02_train, jitt_02_test),
    (jitt_005_train, jitt_005_test),
    (timewarp_train, timewarp_test),
) = [split_dataset(flat_table) for flat_table in (jitt_01, jitt_02, jitt_005, timewarp)]

# split sequential data
(
    (timegan_gru_train_seq_shuffled, timegan_gru_test_seq_shuffled),
    (timegan_lstm_train_seq_shuffled, timegan_lstm_test_seq_shuffled),
    (autoencoder_train_seq, autoencoder_test_seq),
) = [
    split_dataset(sequences)
    for sequences in (timegan_gru_seq_shuffled, timegan_lstm_seq_shuffled, autoencoder_seq)
]

Shape of data before splitting: (28511, 5)
Shape of data before splitting: (28511, 5)
Shape of data before splitting: (28511, 5)
Shape of data before splitting: (28511, 5)
Shape of data before splitting: (28499, 12, 5)
Shape of data before splitting: (28499, 12, 5)
Shape of data before splitting: (28478, 12, 5)


In [7]:
# turn into sequences
# The real data is windowed as a whole; every synthetic table is windowed per half.
real_data_seq_shuffled = split_data_into_sequences(real_data, seq_len=12, shuffle_data=True)

(
    jitt_01_train_seq_shuffled,
    jitt_01_test_seq_shuffled,
    jitt_02_train_seq_shuffled,
    jitt_02_test_seq_shuffled,
    jitt_005_train_seq_shuffled,
    jitt_005_test_seq_shuffled,
    timewarp_train_seq_shuffled,
    timewarp_test_seq_shuffled,
) = [
    split_data_into_sequences(flat_half, seq_len=12, shuffle_data=True)
    for flat_half in (
        jitt_01_train,
        jitt_01_test,
        jitt_02_train,
        jitt_02_test,
        jitt_005_train,
        jitt_005_test,
        timewarp_train,
        timewarp_test,
    )
]

Shape of the data after splitting into sequences: (28500, 12, 5)
Shape of the data after splitting into sequences: (14244, 12, 5)
Shape of the data after splitting into sequences: (14245, 12, 5)
Shape of the data after splitting into sequences: (14244, 12, 5)
Shape of the data after splitting into sequences: (14245, 12, 5)
Shape of the data after splitting into sequences: (14244, 12, 5)
Shape of the data after splitting into sequences: (14245, 12, 5)
Shape of the data after splitting into sequences: (14244, 12, 5)
Shape of the data after splitting into sequences: (14245, 12, 5)


In [8]:
# shuffle autoencoder data
# A dedicated, seeded generator makes this shuffle repeatable across kernel restarts;
# the global np.random state starts from fresh entropy on every run unless it is seeded.
AUTOENCODER_SHUFFLE_SEED = 42
autoencoder_rng = np.random.default_rng(AUTOENCODER_SHUFFLE_SEED)

# One permutation per half, so train and test sequences are never mixed with each other
autoencoder_train_indices = autoencoder_rng.permutation(len(autoencoder_train_seq))
autoencoder_test_indices = autoencoder_rng.permutation(len(autoencoder_test_seq))

autoencoder_train_seq_shuffled = autoencoder_train_seq[autoencoder_train_indices]
autoencoder_test_seq_shuffled = autoencoder_test_seq[autoencoder_test_indices]

In [9]:
data_amount = 3000

### Pick right amount of random train and test data
# 7 * data_amount real sequences: seven synthetic train sets are selected below,
# so this presumably balances real vs. synthetic samples — TODO confirm
train_real = real_data_seq_shuffled[:data_amount * 7]

# Every synthetic source contributes the first data_amount sequences of each half
train_jitt_01, test_jitt_01 = (
    jitt_01_train_seq_shuffled[:data_amount],
    jitt_01_test_seq_shuffled[:data_amount],
)
train_jitt_02, test_jitt_02 = (
    jitt_02_train_seq_shuffled[:data_amount],
    jitt_02_test_seq_shuffled[:data_amount],
)
train_jitt_005, test_jitt_005 = (
    jitt_005_train_seq_shuffled[:data_amount],
    jitt_005_test_seq_shuffled[:data_amount],
)
train_timewarp, test_timewarp = (
    timewarp_train_seq_shuffled[:data_amount],
    timewarp_test_seq_shuffled[:data_amount],
)
train_timegan_gru, test_timegan_gru = (
    timegan_gru_train_seq_shuffled[:data_amount],
    timegan_gru_test_seq_shuffled[:data_amount],
)
train_timegan_lstm, test_timegan_lstm = (
    timegan_lstm_train_seq_shuffled[:data_amount],
    timegan_lstm_test_seq_shuffled[:data_amount],
)
train_autoencoder, test_autoencoder = (
    autoencoder_train_seq_shuffled[:data_amount],
    autoencoder_test_seq_shuffled[:data_amount],
)

In [21]:
# Add ones and zeros as classifying labels
REAL_LABEL, SYNTHETIC_LABEL = 1, 0


def _append_label_row(sequences, label):
    """Append one extra timestep filled with ``label`` to every sequence.

    The label is stored as an additional row along axis 1, repeated across all
    feature columns. The feature count is read from ``sequences`` instead of
    being hard-coded.

    Args:
        sequences: 3-D array of shape (n_sequences, seq_len, n_features).
        label: Value written into the appended row (1 for real, 0 for synthetic).

    Returns:
        Array of shape (n_sequences, seq_len + 1, n_features) whose last
        timestep holds ``label`` in every feature column.
    """
    # float64 matches the default dtype of np.ones / np.zeros
    label_row = np.full((sequences.shape[0], 1, sequences.shape[2]), label, dtype=np.float64)
    return np.concatenate((sequences, label_row), axis=1)


train_real_labeled = _append_label_row(train_real, REAL_LABEL)

train_jitt_01_labeled = _append_label_row(train_jitt_01, SYNTHETIC_LABEL)
test_jitt_01_labeled = _append_label_row(test_jitt_01, SYNTHETIC_LABEL)

train_jitt_02_labeled = _append_label_row(train_jitt_02, SYNTHETIC_LABEL)
test_jitt_02_labeled = _append_label_row(test_jitt_02, SYNTHETIC_LABEL)

train_jitt_005_labeled = _append_label_row(train_jitt_005, SYNTHETIC_LABEL)
test_jitt_005_labeled = _append_label_row(test_jitt_005, SYNTHETIC_LABEL)

train_timewarp_labeled = _append_label_row(train_timewarp, SYNTHETIC_LABEL)
test_timewarp_labeled = _append_label_row(test_timewarp, SYNTHETIC_LABEL)

train_timegan_gru_labeled = _append_label_row(train_timegan_gru, SYNTHETIC_LABEL)
test_timegan_gru_labeled = _append_label_row(test_timegan_gru, SYNTHETIC_LABEL)

train_timegan_lstm_labeled = _append_label_row(train_timegan_lstm, SYNTHETIC_LABEL)
test_timegan_lstm_labeled = _append_label_row(test_timegan_lstm, SYNTHETIC_LABEL)

train_autoencoder_labeled = _append_label_row(train_autoencoder, SYNTHETIC_LABEL)
test_autoencoder_labeled = _append_label_row(test_autoencoder, SYNTHETIC_LABEL)

In [23]:
# concatenate all train data and shuffle again
train = np.concatenate(
    (
        train_real_labeled,
        train_jitt_01_labeled,
        train_jitt_02_labeled,
        train_jitt_005_labeled,
        train_timewarp_labeled,
        train_timegan_gru_labeled,
        train_timegan_lstm_labeled,
        train_autoencoder_labeled,
    ),
    axis=0,
)

# Seeded generator so the mixed order of real and synthetic samples is repeatable
# across kernel restarts (the global np.random state is not seeded in this notebook's cells)
TRAIN_SHUFFLE_SEED = 42
train_rng = np.random.default_rng(TRAIN_SHUFFLE_SEED)
permutated_train_indices = train_rng.permutation(len(train))
train = train[permutated_train_indices]

# Display the resulting shape as the cell output
train.shape

(42000, 13, 5)

In [25]:
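# NOTE: saving is currently disabled (every statement in this cell is commented out).
# When enabled, each 3-D array is flattened to 2-D (samples, timesteps * features) first,
# because np.savetxt only accepts 1-D or 2-D arrays.
# # save combined train data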
# train_to_save = train.reshape(train.shape[0], train.shape[1] * train.shape[2])
# np.savetxt(f'discriminative_train_{train.shape[0]}_{train.shape[1]}_{train.shape[2]}.csv', train_to_save, delimiter=',')

# # save single test data
# test_jitt_01_to_save = test_jitt_01_labeled.reshape(test_jitt_01_labeled.shape[0], test_jitt_01_labeled.shape[1] * test_jitt_01_labeled.shape[2])
# np.savetxt(f'discriminative_test_jitt_01_{test_jitt_01_labeled.shape[0]}_{test_jitt_01_labeled.shape[1]}_{test_jitt_01_labeled.shape[2]}.csv', test_jitt_01_to_save, delimiter=',')

# test_jitt_02_to_save = test_jitt_02_labeled.reshape(test_jitt_02_labeled.shape[0], test_jitt_02_labeled.shape[1] * test_jitt_02_labeled.shape[2])
# np.savetxt(f'discriminative_test_jitt_02_{test_jitt_02_labeled.shape[0]}_{test_jitt_02_labeled.shape[1]}_{test_jitt_02_labeled.shape[2]}.csv', test_jitt_02_to_save, delimiter=',')

# test_jitt_005_to_save = test_jitt_005_labeled.reshape(test_jitt_005_labeled.shape[0], test_jitt_005_labeled.shape[1] * test_jitt_005_labeled.shape[2])
# np.savetxt(f'discriminative_test_jitt_005_{test_jitt_005_labeled.shape[0]}_{test_jitt_005_labeled.shape[1]}_{test_jitt_005_labeled.shape[2]}.csv', test_jitt_005_to_save, delimiter=',')

# test_timewarp_to_save = test_timewarp_labeled.reshape(test_timewarp_labeled.shape[0], test_timewarp_labeled.shape[1] * test_timewarp_labeled.shape[2])
# np.savetxt(f'discriminative_test_timewarp_{test_timewarp_labeled.shape[0]}_{test_timewarp_labeled.shape[1]}_{test_timewarp_labeled.shape[2]}.csv', test_timewarp_to_save, delimiter=',')

# test_timegan_gru_to_save = test_timegan_gru_labeled.reshape(test_timegan_gru_labeled.shape[0], test_timegan_gru_labeled.shape[1] * test_timegan_gru_labeled.shape[2])
# np.savetxt(f'discriminative_test_timegan_gru_{test_timegan_gru_labeled.shape[0]}_{test_timegan_gru_labeled.shape[1]}_{test_timegan_gru_labeled.shape[2]}.csv', test_timegan_gru_to_save, delimiter=',')

# test_timegan_lstm_to_save = test_timegan_lstm_labeled.reshape(test_timegan_lstm_labeled.shape[0], test_timegan_lstm_labeled.shape[1] * test_timegan_lstm_labeled.shape[2])
# np.savetxt(f'discriminative_test_timegan_lstm_{test_timegan_lstm_labeled.shape[0]}_{test_timegan_lstm_labeled.shape[1]}_{test_timegan_lstm_labeled.shape[2]}.csv', test_timegan_lstm_to_save, delimiter=',')

# test_autoencoder_to_save = test_autoencoder_labeled.reshape(test_autoencoder_labeled.shape[0], test_autoencoder_labeled.shape[1] * test_autoencoder_labeled.shape[2])
# np.savetxt(f'discriminative_test_autoencoder_{test_autoencoder_labeled.shape[0]}_{test_autoencoder_labeled.shape[1]}_{test_autoencoder_labeled.shape[2]}.csv', test_autoencoder_to_save, delimiter=',')