In [1]:
## 0 Imports and Constants
import sys
import os

# Put the directory two levels above the current working directory at the front
# of sys.path (presumably so the project-level `utilities` import below resolves).
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '../../'))
sys.path.insert(0, parent_dir)

import pandas as pd
from pathlib import Path
import numpy as np

from utilities import split_data_into_sequences, load_sequential_time_series
from dotenv import load_dotenv
load_dotenv()


# Data locations, all relative to the notebook's working directory.
DATA_FOLDER = Path("../../data")
REAL_DATA_FOLDER = DATA_FOLDER.joinpath("real")
SYNTHETIC_DATA_FOLDER = DATA_FOLDER.joinpath("synthetic", "usable", "1y")

# Number of sequences kept per dataset split further down; taken from the
# NO_DISCRIMINATIVE_DATA environment variable and falling back to 1459.
no_discriminative_data = int(os.environ.get('NO_DISCRIMINATIVE_DATA', 1459))

In [2]:
def split_dataset(data):
    """Cut ``data`` into two consecutive halves along its first axis.

    The shape of ``data`` is printed before splitting. When the number of
    rows is odd, the second half receives the extra row.

    Args:
        data: Sliceable object that supports ``len`` and exposes ``shape``.

    Returns:
        Tuple ``(first_half, second_half)``.
    """
    print(f'Shape of data before splitting: {data.shape}')
    midpoint = len(data) // 2
    return data[:midpoint], data[midpoint:]

### Load Data

In [3]:
# Real series kept as a DataFrame (per the original note: not sequential, not shuffled).
real_data_df = pd.read_csv(
    REAL_DATA_FOLDER.joinpath('mitv_prep_1y.csv')
)

In [4]:
# Tabular series (per the original notes: not sequential, not shuffled).
# `real_data_df` was already parsed from 'mitv_prep_1y.csv' in the previous cell,
# so it is reused here instead of reading the same CSV a second time. copy=True
# keeps the array independent of the DataFrame's memory.
real_data = real_data_df.to_numpy(copy=True)
jitter = pd.read_csv(SYNTHETIC_DATA_FOLDER / 'jittered_01.csv').to_numpy()
timewarp = pd.read_csv(SYNTHETIC_DATA_FOLDER / 'time_warped.csv').to_numpy()

# Generator outputs (per the original notes: sequential, shuffled). The `shape`
# passed to the loader matches the leading numbers of each file name.
timegan_gru_seq_shuffled = load_sequential_time_series(SYNTHETIC_DATA_FOLDER / '8747_12_5_timegan_gru.csv', shape=(8747, 12, 5))
timegan_lstm_seq_shuffled = load_sequential_time_series(SYNTHETIC_DATA_FOLDER / '8747_12_5_timegan_lstm.csv', shape=(8747, 12, 5))
vae_seq_shuffled = load_sequential_time_series(SYNTHETIC_DATA_FOLDER / '8759_12_5_fc_vae.csv', shape=(8759, 12, 5))

# Autoencoder output (per the original note: sequential, not shuffled, ordered by train, val, test).
autoencoder_seq = load_sequential_time_series(SYNTHETIC_DATA_FOLDER / '8726_12_5_lstm_autoencoder.csv', shape=(8726, 12, 5))

### Split Data

In [5]:
# Halve the tabular augmentations; the first half is treated as train, the second as test.
(jitter_train, jitter_test), (timewarp_train, timewarp_test) = map(
    split_dataset, (jitter, timewarp)
)

# Halve the datasets that were loaded as sequences, in the same train/test fashion.
(
    (timegan_gru_train_seq_shuffled, timegan_gru_test_seq_shuffled),
    (timegan_lstm_train_seq_shuffled, timegan_lstm_test_seq_shuffled),
    (vae_train_seq_shuffled, vae_test_seq_shuffled),
    (autoencoder_train_seq, autoencoder_test_seq),
) = map(
    split_dataset,
    (timegan_gru_seq_shuffled, timegan_lstm_seq_shuffled, vae_seq_shuffled, autoencoder_seq),
)

Shape of data before splitting: (8759, 5)
Shape of data before splitting: (8759, 5)
Shape of data before splitting: (8747, 12, 5)
Shape of data before splitting: (8747, 12, 5)
Shape of data before splitting: (8759, 12, 5)
Shape of data before splitting: (8726, 12, 5)


### Turn into Sequential Data

In [6]:
# Keyword arguments shared by every windowing call below.
_sequence_options = {"seq_len": 12, "shuffle_data": True}

# Real data is windowed as a whole; it is not split into halves.
real_data_seq_shuffled = split_data_into_sequences(real_data, **_sequence_options)

# Window each train/test half of the tabular augmentations, in the same call
# order as before (jitter train, jitter test, timewarp train, timewarp test).
(
    jitter_train_seq_shuffled,
    jitter_test_seq_shuffled,
    timewarp_train_seq_shuffled,
    timewarp_test_seq_shuffled,
) = (
    split_data_into_sequences(half, **_sequence_options)
    for half in (jitter_train, jitter_test, timewarp_train, timewarp_test)
)

Shape of the data after splitting into sequences: (8748, 12, 5)
Shape of the data after splitting into sequences: (4368, 12, 5)
Shape of the data after splitting into sequences: (4369, 12, 5)
Shape of the data after splitting into sequences: (4368, 12, 5)
Shape of the data after splitting into sequences: (4369, 12, 5)


### Shuffle Autoencoder Data

In [7]:
# Both permutations are drawn first (train, then test) so the random stream is
# consumed in the same order as before; each half is then reordered along its
# first axis.
# NOTE(review): no seed is set in the visible cells, so the order may differ between runs.
autoencoder_train_indices = np.random.permutation(autoencoder_train_seq.shape[0])
autoencoder_test_indices = np.random.permutation(autoencoder_test_seq.shape[0])

autoencoder_train_seq_shuffled, autoencoder_test_seq_shuffled = (
    autoencoder_train_seq[autoencoder_train_indices],
    autoencoder_test_seq[autoencoder_test_indices],
)

In [8]:
# Show how many real sequences are available before truncating in the next step.
real_data_seq_shuffled.shape

(8748, 12, 5)

### Only keep required amount of data

In [9]:
# Window applied to every synthetic split: its first `no_discriminative_data` sequences.
first_n = slice(None, no_discriminative_data)

# Real data is given six such shares, one for each synthetic source combined later.
# NOTE(review): slicing clips silently. With the recorded 8748 real sequences and
# the default of 1459, this requests 8754 rows and returns 8748.
train_real = real_data_seq_shuffled[slice(None, 6 * no_discriminative_data)]

train_jitter, test_jitter = (
    jitter_train_seq_shuffled[first_n],
    jitter_test_seq_shuffled[first_n],
)
train_timewarp, test_timewarp = (
    timewarp_train_seq_shuffled[first_n],
    timewarp_test_seq_shuffled[first_n],
)
train_timegan_gru, test_timegan_gru = (
    timegan_gru_train_seq_shuffled[first_n],
    timegan_gru_test_seq_shuffled[first_n],
)
train_timegan_lstm, test_timegan_lstm = (
    timegan_lstm_train_seq_shuffled[first_n],
    timegan_lstm_test_seq_shuffled[first_n],
)
train_vae, test_vae = (
    vae_train_seq_shuffled[first_n],
    vae_test_seq_shuffled[first_n],
)
train_autoencoder, test_autoencoder = (
    autoencoder_train_seq_shuffled[first_n],
    autoencoder_test_seq_shuffled[first_n],
)

### Add labels for classification

In [10]:
def _append_label_step(sequences, label):
    """Append one extra step filled with ``label`` to every sequence.

    Args:
        sequences: Array of shape (n_sequences, n_steps, n_features).
        label: Class value written into the appended step (1 = real, 0 = synthetic).

    Returns:
        Array of shape (n_sequences, n_steps + 1, n_features) whose last step
        holds ``label`` in every feature position.
    """
    n_sequences, _, n_features = sequences.shape
    # The feature width is taken from the input rather than hard-coded to 5,
    # and float64 matches what np.ones / np.zeros produced before.
    label_step = np.full((n_sequences, 1, n_features), label, dtype=np.float64)
    return np.concatenate((sequences, label_step), axis=1)


### Adding Ones -> Real Data ###
train_real_labeled = _append_label_step(train_real, 1)

### Adding Zeros -> Synthetic Data ###
train_jitter_labeled = _append_label_step(train_jitter, 0)
test_jitter_labeled = _append_label_step(test_jitter, 0)

train_timewarp_labeled = _append_label_step(train_timewarp, 0)
test_timewarp_labeled = _append_label_step(test_timewarp, 0)

train_timegan_gru_labeled = _append_label_step(train_timegan_gru, 0)
test_timegan_gru_labeled = _append_label_step(test_timegan_gru, 0)

train_timegan_lstm_labeled = _append_label_step(train_timegan_lstm, 0)
test_timegan_lstm_labeled = _append_label_step(test_timegan_lstm, 0)

train_vae_labeled = _append_label_step(train_vae, 0)
test_vae_labeled = _append_label_step(test_vae, 0)

train_autoencoder_labeled = _append_label_step(train_autoencoder, 0)
test_autoencoder_labeled = _append_label_step(test_autoencoder, 0)

### Concatenate training data

In [11]:
# Stack the labeled real and synthetic training parts along the first axis,
# then reorder the rows with a random permutation so the sources are mixed.
# NOTE(review): no seed is set in the visible cells, so the order may differ between runs.
train = np.concatenate(
    [
        train_real_labeled,
        train_jitter_labeled,
        train_timewarp_labeled,
        train_timegan_gru_labeled,
        train_timegan_lstm_labeled,
        train_vae_labeled,
        train_autoencoder_labeled,
    ],
    axis=0,
)
permutated_train_indices = np.random.permutation(train.shape[0])
train = train[permutated_train_indices]

train.shape

(17502, 13, 5)

### Save combined training data and individual test data

In [12]:
# Set to True to write the CSV files into the current working directory.
# Left off by default so that re-running the notebook does not overwrite them.
SAVE_DATASETS = False

if SAVE_DATASETS:
    # File-name stem -> labeled array. The combined training set comes first,
    # followed by one test set per synthetic source.
    datasets_to_save = {
        'discriminative_train': train,
        'discriminative_test_jitter': test_jitter_labeled,
        'discriminative_test_timewarp': test_timewarp_labeled,
        'discriminative_test_timegan_gru': test_timegan_gru_labeled,
        'discriminative_test_timegan_lstm': test_timegan_lstm_labeled,
        'discriminative_test_vae': test_vae_labeled,
        'discriminative_test_autoencoder': test_autoencoder_labeled,
    }

    for file_stem, sequences in datasets_to_save.items():
        n_sequences, n_steps, n_features = sequences.shape
        # np.savetxt accepts only 1-D or 2-D input, so each sequence is flattened
        # into a single row; the original shape is encoded in the file name.
        flattened = sequences.reshape(n_sequences, n_steps * n_features)
        np.savetxt(f'{file_stem}_{n_sequences}_{n_steps}_{n_features}.csv', flattened, delimiter=',')