In [None]:
<font size="+3">Time-series Generative Adversarial Network (TimeGAN)</font>

# Imports & Settings

Adapted from the excellent paper by Jinsung Yoon, Daniel Jarrett, and Mihaela van der Schaar:  
[Time-series Generative Adversarial Networks](https://papers.nips.cc/paper/8789-time-series-generative-adversarial-networks),  
Neural Information Processing Systems (NeurIPS), 2019.

- Last updated Date: April 24th 2020
- [Original code](https://bitbucket.org/mvdschaar/mlforhealthlabpub/src/master/alg/timegan/) author: Jinsung Yoon (jsyoon0823@gmail.com)

import warnings
warnings.filterwarnings('ignore')

!pip install tsaug
!pip install fredapi
!pip install sktime

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from pathlib import Path
from tqdm import tqdm
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from tsaug import TimeWarp, Crop, Quantize, Drift, Reverse
from tensorflow.keras import layers, models
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import GRU, Dense, RNN, GRUCell, Input, Dropout, BatchNormalization, LeakyReLU, Reshape, Conv1D, Flatten, Conv1DTranspose
from tensorflow.keras.losses import BinaryCrossentropy, MeanSquaredError, MeanAbsoluteError
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.utils import plot_model
from tensorflow.keras.metrics import AUC
import matplotlib.pyplot as plt
import seaborn as sns
from fredapi import Fred
from sktime.utils.plotting import plot_series
from keras.models import Sequential
from keras.layers import Dense, LSTM

# Let TensorFlow allocate GPU memory on demand instead of reserving it all.
# The memory-growth flag has to be the same on every visible GPU, so it is
# applied to all devices rather than only to the first one.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
if gpu_devices:
    print('Using GPU')
    for gpu_device in gpu_devices:
        tf.config.experimental.set_memory_growth(gpu_device, True)
else:
    print('Using CPU')

sns.set_style('white')

# Experiment Path

# Every artefact of this notebook is written below `results_path`; each run
# gets its own `experiment_XX` directory, which also serves as the log dir.
results_path = Path('time_gan')

experiment = 0

log_dir = results_path / f'experiment_{experiment:02}'
# parents=True creates `results_path` as well, and exist_ok=True makes the cell
# safe to re-run without a separate exists() check.
log_dir.mkdir(parents=True, exist_ok=True)

hdf_store = results_path / 'TimeSeriesGAN.h5'

# Prepare Data

import os

# Never commit credentials to a notebook: read the FRED API key from the
# environment and fail with a clear message when it is missing.
fred_api_key = os.environ.get('FRED_API_KEY')
if not fred_api_key:
    raise RuntimeError('Set the FRED_API_KEY environment variable to your FRED API key.')

fred = Fred(api_key=fred_api_key)
# NOTE(review): `data` is rebound to the list of rolling windows further down
# before it is read, so this download looks unused -- confirm and remove.
data = fred.get_series('SP500')

# Download both FRED series for one shared observation window and align them
# in a single frame (one column per series).
_fred_window = {"observation_start": "1987-05-20", "observation_end": "2020-12-31"}
_fred_series_ids = {"WTI": "DCOILWTICO", "BRENT": "DCOILBRENTEU"}
df = pd.DataFrame({
    column: fred.get_series(series_id, **_fred_window)
    for column, series_id in _fred_series_ids.items()
})
df.head()

df.info()

# Drop every row in which at least one of the two series is missing.
df = df.dropna()
df.info()

df['WTI'].plot();

df['BRENT'].plot();

# Rebase the WTI series so that its first observation equals 1.
close_col = df['WTI']
normalized_close = close_col / close_col.iloc[0]

# Plot the rebased WTI series.
ax = normalized_close.plot(
    figsize=(14, 6),
    title="Normalized Closing Price",
    legend=False,
    color='k',
)
ax.set_xlabel('')

## Plot Series

## Correlation

# Clustered heat map of the pairwise correlations between the columns of `df`.
_correlation_palette = sns.diverging_palette(h_neg=20, h_pos=220)
sns.clustermap(df.corr(), annot=True, fmt='.2f', cmap=_correlation_palette, center=0);

def augment_data(original_data, augmentations):
    """Augment the 'WTI' column of a frame and plot it against the original.

    Parameters
    ----------
    original_data : DataFrame with a 'WTI' column. It is copied, not modified.
    augmentations : object exposing ``augment(values)`` (e.g. a tsaug pipeline).
        It must return as many values as `original_data` has rows, because the
        result is written back into the 'WTI' column.

    Returns
    -------
    A copy of `original_data` whose 'WTI' column holds the augmented values.
    """
    augmented_data = original_data.copy()
    augmented_data['WTI'] = augmentations.augment(original_data['WTI'].values)

    plt.figure(figsize=(10, 6))
    # Plot the frame that was passed in rather than the notebook-level `df`,
    # so the figure is correct for any input.
    plt.plot(original_data.index, original_data['WTI'], label='Original Data', marker='o')
    plt.plot(augmented_data.index, augmented_data['WTI'], label='Augmented Data', linestyle='--', marker='x')
    plt.title('Original vs Augmented WTI Prices')
    plt.xlabel('Date')
    plt.ylabel('Close Price')
    plt.legend()
    plt.show()
    return augmented_data

# `augment_data` writes the augmented values back into a column of `df`, so the
# pipeline has to return exactly len(df) points. Derive the crop size from the
# data instead of hard-coding the current row count.
series_length = len(df)

# Pipeline 1: warp, crop, quantize and drift.
augmenter = (
    TimeWarp()
    + Crop(size=series_length)
    + Quantize(n_levels=[10, 20, 30])
    + Drift(max_drift=(0.1, 0.5))
)

augmented_data = augment_data(df, augmenter)

# Pipeline 2: the same steps followed by Reverse().
augmenter = (
    TimeWarp()
    + Crop(size=series_length)
    + Quantize(n_levels=[10, 20, 30])
    + Drift(max_drift=(0.1, 0.5))
    + Reverse()
)

augmented_data = augment_data(df, augmenter)

## Normalize Data

# Convolutional autoencoder baseline: scale the two price columns to [0, 1],
# train a Conv1D autoencoder to reproduce each row, and map the reconstruction
# back to price units.
columns_for_autoencoder = ['WTI', 'BRENT']
scaler = MinMaxScaler()
original_data_scaled = scaler.fit_transform(df[columns_for_autoencoder])


input_dim = len(columns_for_autoencoder)
encoding_dim = 80
# Despite its name, `timesteps` is the number of rows (samples) in the frame.
timesteps = original_data_scaled.shape[0]

# NOTE(review): this reshape is reverted before fitting (see the second
# np.reshape below), so it has no lasting effect.
original_data_scaled = np.reshape(original_data_scaled, (timesteps, input_dim, 1))

autoencoder = Sequential()

# Encoder
# Each sample has shape (input_dim, 1): the convolution slides over the two
# columns of one row, not over time.
autoencoder.add(Conv1D(filters=64, kernel_size=3, activation='relu', padding='same', input_shape=(input_dim, 1)))
autoencoder.add(BatchNormalization())
autoencoder.add(Dropout(0.6))

autoencoder.add(Conv1D(filters=32, kernel_size=3, activation='relu', padding='same'))
autoencoder.add(BatchNormalization())
autoencoder.add(Dropout(0.6))

autoencoder.add(Flatten())
autoencoder.add(Dense(encoding_dim, activation='relu'))

# Decoder
autoencoder.add(Dense(32 * input_dim, activation='relu'))
autoencoder.add(Reshape((input_dim, 32)))

autoencoder.add(Conv1DTranspose(filters=64, kernel_size=3, activation='relu', padding='same'))
autoencoder.add(BatchNormalization())
autoencoder.add(Dropout(0.6))

# Sigmoid keeps the reconstruction in [0, 1], the range of the scaled inputs.
autoencoder.add(Conv1DTranspose(filters=1, kernel_size=3, activation='sigmoid', padding='same'))

# Back to a 2-D (rows, columns) array; this is what fit/predict receive.
original_data_scaled = np.reshape(original_data_scaled, (timesteps, input_dim))

autoencoder.compile(optimizer=Adam(learning_rate=0.0005), loss='mean_squared_error')

# Input and target are the same array: the model learns to reconstruct rows.
autoencoder.fit(original_data_scaled, original_data_scaled, epochs=20, batch_size=32, shuffle=True, validation_split=0.2)

synthetic_data_scaled = autoencoder.predict(original_data_scaled)
# Undo the min-max scaling and restore the original index and column names.
synthetic_data = pd.DataFrame(data=scaler.inverse_transform(synthetic_data_scaled.reshape(timesteps, input_dim)), index=df.index, columns=columns_for_autoencoder)

# Clip negative reconstructed values to zero.
# NOTE(review): `autoencoder`, `scaler` and `synthetic_data` are rebound to
# different objects in the TimeGAN part of this notebook.
synthetic_data = np.maximum(synthetic_data, 0)

# Put the original and reconstructed columns side by side; reconstructed
# columns get a 'Synthetic ' prefix.
table = pd.concat(
    [
        df[columns_for_autoencoder],
        synthetic_data.rename(columns=lambda name: 'Synthetic ' + name),
    ],
    axis=1,
)

# One figure per series, BRENT first and WTI second.
for _column in ('BRENT', 'WTI'):
    plt.figure(figsize=(10, 6))
    plt.plot(table[_column], label='Original Close', marker='o')
    plt.plot(table['Synthetic ' + _column], label='Synthetic Close', marker='o')
    plt.title('Original vs Synthetic Close')
    plt.xlabel('Date')
    plt.ylabel('Close')
    plt.legend()
    plt.show()

# Time Gan

## Parameters

seq_len = 24      # rows per rolling window
n_seq = 2         # number of series (columns) per window
batch_size = 128

## Create rolling window sequences

# Scale every column of `df` to [0, 1] and store it as float32.
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(df).astype(np.float32)

# Overlapping windows of `seq_len` consecutive rows, one per start offset.
data = [scaled_data[start:start + seq_len]
        for start in range(len(df) - seq_len)]

n_windows = len(data)

scaled_data

## Create tf.data.Dataset

# Shuffle across all windows (buffer covers the whole set), then batch.
real_series = tf.data.Dataset.from_tensor_slices(data)
real_series = real_series.shuffle(buffer_size=n_windows).batch(batch_size)
# Endless iterator over the batched real windows.
real_series_iter = iter(real_series.repeat())

## Set up random series generator

def make_random_data():
    """Yield an endless stream of uniform [0, 1) noise arrays.

    Each array has shape ``(seq_len, n_seq)``, read from the notebook globals
    at the time the value is produced.
    """
    while True:
        noise = np.random.uniform(low=0, high=1, size=(seq_len, n_seq))
        yield noise

We use the Python generator to feed a `tf.data.Dataset` that continues to call the random number generator as long as necessary and produces the desired batch size.

# Wrap the noise generator in a dataset, batch it and iterate over it forever.
_random_dataset = tf.data.Dataset.from_generator(make_random_data,
                                                 output_types=tf.float32)
random_series = iter(_random_dataset.batch(batch_size).repeat())

# TimeGAN Components

The design of the TimeGAN components follows the author's sample code.

##  Network Parameters

hidden_dim, num_layers = 24, 3

## Set up logger

_log_path = log_dir.as_posix()
writer = tf.summary.create_file_writer(_log_path)

## Input place holders

# Both symbolic inputs describe one window of shape (seq_len, n_seq).
_series_shape = [seq_len, n_seq]
X = Input(shape=_series_shape, name='RealData')
Z = Input(shape=_series_shape, name='RandomData')

## RNN block generator

We keep it very simple and use a very similar architecture for all components (embedder, recovery, generator, supervisor, and discriminator). For a real-world application, they should be tailored to the data.

def make_rnn(n_layers, hidden_units, output_units, name):
    """Build a stack of GRU layers followed by a sigmoid Dense output layer.

    Parameters
    ----------
    n_layers : number of GRU layers, named 'GRU_1' ... 'GRU_<n_layers>'.
    hidden_units : units of every GRU layer.
    output_units : units of the final Dense layer ('OUT').
    name : name of the returned Sequential model.

    Returns
    -------
    A Sequential model; every GRU returns the full sequence.
    """
    stack = []
    for layer_no in range(1, n_layers + 1):
        stack.append(GRU(units=hidden_units,
                         return_sequences=True,
                         name=f'GRU_{layer_no}'))
    stack.append(Dense(units=output_units,
                       activation='sigmoid',
                       name='OUT'))
    return Sequential(stack, name=name)

## Embedder & Recovery

# Embedder outputs `hidden_dim` features per step; recovery maps back to the
# `n_seq` data columns.
embedder = make_rnn(3, hidden_dim, hidden_dim, 'Embedder')
recovery = make_rnn(3, hidden_dim, n_seq, 'Recovery')

## Generator & Discriminator

# Generator and supervisor emit `hidden_dim` features per step; the
# discriminator emits a single sigmoid score per step.
generator = make_rnn(3, hidden_dim, hidden_dim, 'Generator')
discriminator = make_rnn(3, hidden_dim, 1, 'Discriminator')
supervisor = make_rnn(2, hidden_dim, hidden_dim, 'Supervisor')

# Define the Encoder
def build_encoder(input_shape, latent_dim):
    """Build the convolutional encoder used for reconstruction-based filtering.

    Parameters
    ----------
    input_shape : shape of one input sample, passed to the InputLayer.
    latent_dim : size of the latent vector produced by the last Dense layer.

    Returns
    -------
    A Sequential model: Conv1D -> MaxPooling1D -> Flatten -> Dense(128) -> Dense(latent_dim).
    """
    return models.Sequential([
        layers.InputLayer(input_shape=input_shape),
        layers.Conv1D(64, kernel_size=5, strides=1, padding='same', activation='relu'),
        layers.MaxPooling1D(pool_size=2),
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        # Linear activation: the latent code is unconstrained.
        layers.Dense(latent_dim, activation=None),
    ])

# Define the Decoder
def build_decoder(output_shape, latent_dim):
    """Build the decoder that maps a latent vector back to one sample.

    Parameters
    ----------
    output_shape : ``(steps, channels)`` of the sample to reconstruct.
    latent_dim : size of the latent input vector.

    Returns
    -------
    A Sequential model whose output has exactly `output_shape`, with values in
    [0, 1] (sigmoid).
    """
    n_steps, n_channels = output_shape[0], output_shape[1]
    model = models.Sequential()
    model.add(layers.InputLayer(input_shape=(latent_dim,)))
    model.add(layers.Dense(128, activation='relu'))
    model.add(layers.Dense(n_steps * n_channels, activation='relu'))
    model.add(layers.Reshape((n_steps, n_channels)))
    model.add(layers.Conv1DTranspose(64, kernel_size=5, strides=1, padding='same', activation='relu'))
    # Emit one channel per input series. With a single output channel the
    # reconstruction error against a multi-channel sample would be computed
    # through silent broadcasting instead of channel by channel.
    model.add(layers.Conv1DTranspose(n_channels, kernel_size=5, strides=1, padding='same', activation='sigmoid'))
    return model

# Instantiate the encoder and decoder
# One sample is a full window; the latent code has 20 dimensions.
input_shape, latent_dim = (seq_len, n_seq), 20
encoder = build_encoder(input_shape=input_shape, latent_dim=latent_dim)
decoder = build_decoder(output_shape=input_shape, latent_dim=latent_dim)

# TimeGAN Training

## Settings

# Steps per training phase, and the weight of the latent-space fake loss in
# the discriminator objective.
train_steps, gamma = 5000, 1

## Generic Loss Functions

mse, bce = MeanSquaredError(), BinaryCrossentropy()

# Phase 1: Autoencoder Training

## Architecture

# Real data -> latent sequence -> reconstruction.
H = embedder(X)
X_tilde = recovery(H)

autoencoder = Model(inputs=X, outputs=X_tilde, name='Autoencoder')

autoencoder.summary()

_autoencoder_plot = (results_path / 'autoencoder.png').as_posix()
plot_model(autoencoder, to_file=_autoencoder_plot, show_shapes=True)

## Autoencoder Optimizer

autoencoder_optimizer = Adam()

## Autoencoder Training Step

@tf.function
def train_autoencoder_init(x):
    """Run one reconstruction step on the embedder and recovery networks.

    The optimised loss is 10 * RMSE between `x` and its reconstruction.

    Returns
    -------
    The (unscaled) reconstruction RMSE of the batch.
    """
    trainable = embedder.trainable_variables + recovery.trainable_variables
    with tf.GradientTape() as tape:
        reconstruction = autoencoder(x)
        reconstruction_mse = mse(x, reconstruction)
        scaled_rmse = 10 * tf.sqrt(reconstruction_mse)

    grads = tape.gradient(scaled_rmse, trainable)
    autoencoder_optimizer.apply_gradients(zip(grads, trainable))
    return tf.sqrt(reconstruction_mse)

## Autoencoder Training Loop

# Phase 1 loop: one reconstruction step per batch, logged to TensorBoard.
with writer.as_default():
    for step in tqdm(range(train_steps)):
        X_ = next(real_series_iter)
        step_e_loss_t0 = train_autoencoder_init(X_)
        tf.summary.scalar(name='Loss Autoencoder Init',
                          data=step_e_loss_t0,
                          step=step)

## Persist model

autoencoder.save(log_dir / 'autoencoder')

# Phase 2: Supervised training

## Define Optimizer

supervisor_optimizer = Adam()

## Train Step

@tf.function
def train_supervisor(x):
    """Run one next-step-prediction step on the supervisor only.

    The supervisor output at step t is compared with the embedding at step
    t + 1 (targets drop the first step, predictions drop the last).

    Returns
    -------
    The supervised MSE of the batch.
    """
    trainable = supervisor.trainable_variables
    with tf.GradientTape() as tape:
        latent = embedder(x)
        predicted = supervisor(latent)
        next_step_targets = latent[:, 1:, :]
        next_step_predictions = predicted[:, :-1, :]
        g_loss_s = mse(next_step_targets, next_step_predictions)

    grads = tape.gradient(g_loss_s, trainable)
    supervisor_optimizer.apply_gradients(zip(grads, trainable))
    return g_loss_s

## Training Loop

# Phase 2 loop: one supervised step per batch, logged to TensorBoard.
with writer.as_default():
    for step in tqdm(range(train_steps)):
        X_ = next(real_series_iter)
        step_g_loss_s = train_supervisor(X_)
        tf.summary.scalar(name='Loss Generator Supervised Init',
                          data=step_g_loss_s,
                          step=step)

## Persist Model

supervisor.save(log_dir / 'supervisor')

# Joint Training

## Generator

### Adversarial Architecture - Supervised

# Wire the generator-side models. All of them share the same layer instances
# (generator, supervisor, discriminator, recovery), so training one of the
# component networks affects every model built here.

# Noise -> generator -> supervisor -> discriminator score.
E_hat = generator(Z)
H_hat = supervisor(E_hat)
Y_fake = discriminator(H_hat)

adversarial_supervised = Model(inputs=Z,
                               outputs=Y_fake,
                               name='AdversarialNetSupervised')

adversarial_supervised.summary()

plot_model(adversarial_supervised, show_shapes=True)

### Adversarial Architecture in Latent Space

# Same discriminator applied directly to the generator output (no supervisor).
Y_fake_e = discriminator(E_hat)

adversarial_emb = Model(inputs=Z,
                    outputs=Y_fake_e,
                    name='AdversarialNet')

adversarial_emb.summary()

plot_model(adversarial_emb, show_shapes=True)

### Mean & Variance Loss

# Noise -> generator -> supervisor -> recovery: the model that emits synthetic
# sequences in data space.
# NOTE(review): this rebinds `synthetic_data`, which held a DataFrame in the
# convolutional-autoencoder section above.
X_hat = recovery(H_hat)
synthetic_data = Model(inputs=Z,
                       outputs=X_hat,
                       name='SyntheticData')

synthetic_data.summary()

plot_model(synthetic_data, show_shapes=True)

def get_generator_moment_loss(y_true, y_pred):
    """Penalise differences in the batch mean and standard deviation.

    Moments are taken over axis 0 of each input. The result is the mean
    absolute difference of the means plus the mean absolute difference of the
    standard deviations (1e-6 is added to each variance before the root).
    """
    eps = 1e-6
    true_mean, true_var = tf.nn.moments(x=y_true, axes=[0])
    pred_mean, pred_var = tf.nn.moments(x=y_pred, axes=[0])

    mean_gap = tf.reduce_mean(tf.abs(true_mean - pred_mean))

    true_std = tf.sqrt(true_var + eps)
    pred_std = tf.sqrt(pred_var + eps)
    std_gap = tf.reduce_mean(tf.abs(true_std - pred_std))

    return mean_gap + std_gap

## Discriminator

### Architecture: Real Data

# Real data -> embedder -> discriminator score.
Y_real = discriminator(H)
discriminator_model = Model(inputs=X, outputs=Y_real, name='DiscriminatorReal')

discriminator_model.summary()

plot_model(discriminator_model, show_shapes=True)

## Optimizers

# One independent Adam instance per jointly trained component.
generator_optimizer, discriminator_optimizer, embedding_optimizer = Adam(), Adam(), Adam()

## Generator Train Step

@tf.function
def train_generator(x, z):
    """Run one optimisation step on the generator and the supervisor.

    Parameters
    ----------
    x : batch of real sequences (for the supervised and moment losses).
    z : batch of random sequences (input of the generator-based models).

    Returns
    -------
    Tuple ``(unsupervised loss on the supervised path, supervised MSE,
    moment loss)``. The latent-space adversarial loss is part of the optimised
    objective but is not returned.
    """
    with tf.GradientTape() as tape:
        # Adversarial terms: the generator wants fakes to be scored as real (1).
        y_fake = adversarial_supervised(z)
        generator_loss_unsupervised = bce(y_true=tf.ones_like(y_fake),
                                          y_pred=y_fake)

        y_fake_e = adversarial_emb(z)
        generator_loss_unsupervised_e = bce(y_true=tf.ones_like(y_fake_e),
                                            y_pred=y_fake_e)

        # Supervised next-step loss: the supervisor output at step t is
        # compared with the embedding at step t + 1. The prediction therefore
        # drops its LAST step, exactly as in `train_supervisor`; comparing
        # identical time slices would reward copying the input.
        h = embedder(x)
        h_hat_supervised = supervisor(h)
        generator_loss_supervised = mse(h[:, 1:, :], h_hat_supervised[:, :-1, :])

        # Match the first two moments of real and generated sequences.
        x_hat = synthetic_data(z)
        generator_moment_loss = get_generator_moment_loss(x, x_hat)

        generator_loss = (generator_loss_unsupervised +
                          generator_loss_unsupervised_e +
                          100 * tf.sqrt(generator_loss_supervised) +
                          100 * generator_moment_loss)

    # Only the generator and supervisor weights are updated here.
    var_list = generator.trainable_variables + supervisor.trainable_variables
    gradients = tape.gradient(generator_loss, var_list)
    generator_optimizer.apply_gradients(zip(gradients, var_list))
    return generator_loss_unsupervised, generator_loss_supervised, generator_moment_loss

## Embedding Train Step

@tf.function
def train_embedder(x):
    """Run one optimisation step on the embedder and recovery networks.

    The objective is 10 * reconstruction RMSE plus 0.1 * the supervised
    next-step MSE in latent space.

    Returns
    -------
    The reconstruction RMSE of the batch.
    """
    with tf.GradientTape() as tape:
        # Supervised next-step loss: prediction at step t vs. embedding at
        # step t + 1 (prediction drops its last step, as in `train_supervisor`).
        h = embedder(x)
        h_hat_supervised = supervisor(h)
        generator_loss_supervised = mse(h[:, 1:, :], h_hat_supervised[:, :-1, :])

        x_tilde = autoencoder(x)
        embedding_loss_t0 = mse(x, x_tilde)
        e_loss = 10 * tf.sqrt(embedding_loss_t0) + 0.1 * generator_loss_supervised

    # The supervisor takes part in the loss but is not updated here.
    var_list = embedder.trainable_variables + recovery.trainable_variables
    gradients = tape.gradient(e_loss, var_list)
    embedding_optimizer.apply_gradients(zip(gradients, var_list))
    return tf.sqrt(embedding_loss_t0)

## Discriminator Train Step

@tf.function
def get_discriminator_loss(x, z):
    """Compute the discriminator loss for one real and one random batch.

    Real sequences are labelled 1; both kinds of generated sequences (with and
    without the supervisor) are labelled 0. The latent-space fake term is
    weighted by `gamma`.
    """
    real_scores = discriminator_model(x)
    fake_scores = adversarial_supervised(z)
    fake_scores_latent = adversarial_emb(z)

    loss_real = bce(y_true=tf.ones_like(real_scores), y_pred=real_scores)
    loss_fake = bce(y_true=tf.zeros_like(fake_scores), y_pred=fake_scores)
    loss_fake_latent = bce(y_true=tf.zeros_like(fake_scores_latent),
                           y_pred=fake_scores_latent)

    return loss_real + loss_fake + gamma * loss_fake_latent

@tf.function
def train_discriminator(x, z):
    """Run one optimisation step on the discriminator and return its loss."""
    trainable = discriminator.trainable_variables
    with tf.GradientTape() as tape:
        discriminator_loss = get_discriminator_loss(x, z)

    grads = tape.gradient(discriminator_loss, trainable)
    discriminator_optimizer.apply_gradients(zip(grads, trainable))
    return discriminator_loss

@tf.function
def train_encoder_decoder(real_data):
    """Run one reconstruction step on the convolutional encoder/decoder pair.

    Uses the notebook-level `reconstruction_loss_fn` and
    `encoder_decoder_optimizer`, which must exist before the first call.

    Returns
    -------
    The reconstruction loss of the batch.
    """
    trainable = encoder.trainable_variables + decoder.trainable_variables
    with tf.GradientTape() as tape:
        latent = encoder(real_data, training=True)
        reconstruction = decoder(latent, training=True)
        reconstruction_loss = reconstruction_loss_fn(real_data, reconstruction)

    grads = tape.gradient(reconstruction_loss, trainable)
    encoder_decoder_optimizer.apply_gradients(zip(grads, trainable))
    return reconstruction_loss

## Training Loop

# Joint training: per step, two generator/embedder/encoder-decoder updates and
# at most one discriminator update, followed by logging.
step_g_loss_u = step_g_loss_s = step_g_loss_v = step_e_loss_t0 = step_d_loss = step_ed_loss = 0

# Define loss function and optimizer for the encoder-decoder training
# (read as globals by `train_encoder_decoder`, so they must exist before the loop)
reconstruction_loss_fn = tf.keras.losses.MeanSquaredError()
encoder_decoder_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)

for step in range(train_steps):
    # Train generator (twice as often as discriminator)
    for kk in range(2):
        # Fresh real and random batches for every inner iteration.
        X_ = next(real_series_iter)
        Z_ = next(random_series)

        # Train generator
        step_g_loss_u, step_g_loss_s, step_g_loss_v = train_generator(X_, Z_)
        # Train embedder
        step_e_loss_t0 = train_embedder(X_)
        # Train encoder-decoder
        step_ed_loss = train_encoder_decoder(X_)

    X_ = next(real_series_iter)
    Z_ = next(random_series)
    # Evaluate the discriminator first and only update it while its loss is
    # above 0.15; below that threshold the reported loss is the evaluated one.
    step_d_loss = get_discriminator_loss(X_, Z_)
    if step_d_loss > 0.15:
        step_d_loss = train_discriminator(X_, Z_)

    # Console progress report every 1000 steps.
    if step % 1000 == 0:
        print(f'{step:6,.0f} | d_loss: {step_d_loss:6.4f} | g_loss_u: {step_g_loss_u:6.4f} | '
              f'g_loss_s: {step_g_loss_s:6.4f} | g_loss_v: {step_g_loss_v:6.4f} | e_loss_t0: {step_e_loss_t0:6.4f} | '
              f'ed_loss: {step_ed_loss:6.4f}')

    # TensorBoard scalars are written on every step.
    with writer.as_default():
        tf.summary.scalar('G Loss S', step_g_loss_s, step=step)
        tf.summary.scalar('G Loss U', step_g_loss_u, step=step)
        tf.summary.scalar('G Loss V', step_g_loss_v, step=step)
        tf.summary.scalar('E Loss T0', step_e_loss_t0, step=step)
        tf.summary.scalar('D Loss', step_d_loss, step=step)
        tf.summary.scalar('ED Loss', step_ed_loss, step=step)

# Rebuild the rolling windows over the whole scaled dataset as one array.
data_sequences = np.array([
    scaled_data[start:start + seq_len]
    for start in range(len(scaled_data) - seq_len)
])

# Make the (windows, seq_len, n_seq) layout explicit.
data_sequences_reshaped = data_sequences.reshape(-1, seq_len, n_seq)

def detect_anomalies(data, encoder, decoder, threshold):
    """Flag samples whose reconstruction error exceeds `threshold`.

    Parameters
    ----------
    data : batch of samples; the error is averaged over axes 1 and 2.
    encoder, decoder : callables accepting ``training=False``.
    threshold : error above which a sample counts as anomalous.

    Returns
    -------
    ``(anomalies, reconstruction_error)``: a boolean tensor and the per-sample
    mean squared reconstruction error.
    """
    latent = encoder(data, training=False)
    reconstruction = decoder(latent, training=False)
    squared_error = tf.square(data - reconstruction)
    reconstruction_error = tf.reduce_mean(squared_error, axis=[1, 2])
    return reconstruction_error > threshold, reconstruction_error

threshold = 0.015  # Set an appropriate threshold based on validation data
anomalies, reconstruction_errors = detect_anomalies(
    data=data_sequences_reshaped,
    encoder=encoder,
    decoder=decoder,
    threshold=threshold,
)

## Persist Synthetic Data Generator

synthetic_data.save(log_dir / 'synthetic_data')

# Generate Synthetic Data

# Generate as many full batches as fit into the number of real windows.
_n_batches = n_windows // batch_size

generated_data_scaled = []
for _ in range(_n_batches):
    Z_ = next(random_series)
    generated_data_scaled.append(synthetic_data(Z_))

# Stack the batches along the sample axis.
generated_data_scaled = np.array(np.vstack(generated_data_scaled))
generated_data_scaled.shape

def filter_data(generated_data_batches, encoder, decoder, threshold):
    """Keep only the generated samples that the encoder/decoder reconstructs well.

    Parameters
    ----------
    generated_data_batches : iterable of array-likes, each shaped
        ``(n_samples, ...)``.
    encoder, decoder : callables accepting ``training=False``; the decoder
        output must be broadcastable against the batch.
    threshold : a sample is kept when its mean squared reconstruction error is
        strictly below this value.

    Returns
    -------
    ndarray of the kept samples, concatenated along axis 0 in their original
    order and with their original per-sample shape. If no batch is supplied,
    an empty ``(0, seq_len, n_seq)`` array (notebook globals) is returned.
    """
    kept_batches = []
    for batch in generated_data_batches:
        batch = np.asarray(batch)

        # One forward pass for the whole batch instead of one per sample.
        encoded = encoder(batch, training=False)
        reconstructed = np.asarray(decoder(encoded, training=False))

        # Per-sample error: average over every axis except the sample axis.
        squared_error = np.square(batch - reconstructed)
        sample_axes = tuple(range(1, squared_error.ndim))
        reconstruction_error = squared_error.mean(axis=sample_axes)

        kept_batches.append(batch[reconstruction_error < threshold])

    if not kept_batches:
        return np.empty((0, seq_len, n_seq))
    return np.concatenate(kept_batches, axis=0)

threshold = 0.015  # Set an appropriate threshold based on validation data

# `filter_data` expects an iterable of batches, so wrap the single array in a list.
generated_data_batches = [generated_data_scaled]

improved_generated_data = filter_data(
    generated_data_batches=generated_data_batches,
    encoder=encoder,
    decoder=decoder,
    threshold=threshold,
)
print(improved_generated_data.shape)

len(generated_data_scaled)

# Persist the unfiltered, still scaled samples.
_generated_path = log_dir / 'generated_data.npy'
np.save(_generated_path, generated_data_scaled)

## Rescale

# Undo the min-max scaling: flatten to (rows, n_seq) for the scaler, then
# restore the (windows, seq_len, n_seq) layout.
generated_data = (scaler.inverse_transform(generated_data_scaled
                                           .reshape(-1, n_seq))
                  .reshape(-1, seq_len, n_seq))
print(generated_data.shape)
generated_ano_data = (scaler.inverse_transform(improved_generated_data
                                               .reshape(-1, n_seq))
                      .reshape(-1, seq_len, n_seq))
print(generated_ano_data.shape)

# Filtering removes whole windows, so the two arrays usually differ in length
# and cannot be compared element by element. Report how many windows were
# dropped; the element-wise comparison is only meaningful for equal shapes.
n_filtered_out = len(generated_data) - len(generated_ano_data)
print(f'{n_filtered_out} of {len(generated_data)} generated windows were filtered out')

differences = indices = None
if generated_data.shape == generated_ano_data.shape:
    differences = generated_data != generated_ano_data
    indices = np.where(differences)
    print(indices)

## Persist Data

# Flatten all generated windows into one long two-column frame and store it.
_synthetic_frame = pd.DataFrame(generated_data.reshape(-1, n_seq),
                                columns=['WTI','BRENT'])
with pd.HDFStore(hdf_store) as store:
    store.put('data/synthetic', _synthetic_frame)

## Plot sample Series

# Pick one generated window at random.
rand = np.random.randint(len(generated_data))
synthetic = generated_data[rand]

# Pick one real window at random (independent of the generated one).
idx = np.random.randint(len(df) - seq_len)
real = df.iloc[idx: idx + seq_len]

# Column 0 of the generated window is the first scaled column of `df` (WTI).
data_to_plot = pd.DataFrame({
    'Real': real['WTI'].to_numpy(),
    'Synthetic': synthetic[:, 0],
})

# The synthetic series gets its own (secondary) y-axis.
fig, ax = plt.subplots(figsize=(14, 7))
data_to_plot.plot(
    ax=ax,
    title='Close',
    secondary_y='Synthetic',
    style=['-', '--'],
    lw=1,
)
sns.despine()
fig.tight_layout()
plt.show()

# Pick one generated window at random.
rand = np.random.randint(len(generated_data))
synthetic = generated_data[rand]

# Pick one real window at random (independent of the generated one).
idx = np.random.randint(len(df) - seq_len)
real = df.iloc[idx: idx + seq_len]

# The generated windows follow the column order of `df` (WTI, BRENT), so the
# BRENT series is column 1. Column 0 would compare real BRENT with synthetic WTI.
brent_column = list(df.columns).index('BRENT')
data_to_plot = pd.DataFrame({
    'Real': real['BRENT'].values,
    'Synthetic': synthetic[:, brent_column]
})

# Plotting
fig, ax = plt.subplots(figsize=(14, 7))
data_to_plot.plot(ax=ax, title='Close', secondary_y='Synthetic', style=['-', '--'], lw=1)
sns.despine()
fig.tight_layout()
plt.show()

# The filtered set can be shorter than `generated_data`, so `rand` may be out
# of range. Wrapping it keeps the same index whenever it is valid (nothing was
# filtered before it) and avoids an IndexError otherwise.
if len(generated_ano_data) == 0:
    raise ValueError('No generated window passed the reconstruction filter.')
synthetic_ano_data = generated_ano_data[rand % len(generated_ano_data)]

# Reuse the real window selected for the previous plot.
real = df.iloc[idx: idx + seq_len]

# Create a DataFrame with both real and filtered synthetic data (WTI column).
data_to_plot = pd.DataFrame({
    'Real': real['WTI'].values,
    'Synthetic Improved': synthetic_ano_data[:, 0]
})

fig, ax = plt.subplots(figsize=(14, 7))
data_to_plot.plot(ax=ax, title='Close', secondary_y='Synthetic Improved', style=['-', '--'], lw=1)
sns.despine()
fig.tight_layout()
plt.show()

experiment = 0

# Persist the real data so the evaluation section can reload it. `key` is
# passed by keyword: positional use is deprecated in recent pandas releases.
df.to_hdf(hdf_store, key='data/real')
df

def get_real_data():
    """Load the stored real data and return it as scaled rolling windows.

    Reads 'data/real' from `hdf_store`, sorts it by index, min-max scales every
    column and cuts it into overlapping windows of `seq_len` rows.

    Returns
    -------
    list of 2-D arrays, one per window start offset.
    """
    frame = pd.read_hdf(hdf_store, 'data/real').sort_index()

    # Preprocess the dataset: fit a fresh scaler on the loaded frame.
    scaled = MinMaxScaler().fit_transform(frame)

    n_starts = len(frame) - seq_len
    return [scaled[start:start + seq_len] for start in range(n_starts)]


# Prepare matched random samples of real and synthetic windows for the
# PCA / t-SNE comparison.
real_data = get_real_data()

n = len(real_data)
n

np.asarray(real_data).shape

# From here on `synthetic_data` is the filtered array of generated windows
# (it previously referred to the Keras generator model).
synthetic_data = improved_generated_data
synthetic_data.shape

# Truncate the real windows to the number of synthetic windows so that the
# same indices can be used for both.
real_data = real_data[:synthetic_data.shape[0]]
len(real_data)

sample_size = 250
# Random subset of window indices, without replacement.
idx = np.random.permutation(len(real_data))[:sample_size]

# Data preprocessing
real_sample = np.asarray(real_data)[idx]
synthetic_sample = np.asarray(synthetic_data)[idx]

# NOTE(review): assuming the samples are (n, seq_len, n_seq), this reshape
# yields n * n_seq rows of length seq_len in which time steps and series are
# interleaved rather than one row per series -- confirm this is intended.
real_sample_2d = real_sample.reshape(-1, seq_len)
synthetic_sample_2d = synthetic_sample.reshape(-1, seq_len)

real_sample_2d.shape, synthetic_sample_2d.shape

# Fit the projection on the real sample only, then apply it to both samples.
pca = PCA(n_components=2)
pca.fit(real_sample_2d)

# Project each sample and tag its rows with the data origin.
pca_real = pd.DataFrame(pca.transform(real_sample_2d)).assign(Data='Real')
pca_synthetic = pd.DataFrame(pca.transform(synthetic_sample_2d)).assign(Data='Synthetic')

# Stack both projections and give the two components readable names.
_component_names = {0: '1st Component', 1: '2nd Component'}
pca_result = pd.concat([pca_real, pca_synthetic]).rename(columns=_component_names)

print(pca_result)


# Real rows first, synthetic rows second.
tsne_data = np.concatenate((real_sample_2d,
                            synthetic_sample_2d), axis=0)

# t-SNE requires perplexity < number of samples; cap it so the cell also works
# when few synthetic windows survive the filter.
tsne = TSNE(n_components=2,
            verbose=1,
            perplexity=min(200, len(tsne_data) - 1))
tsne_result = tsne.fit_transform(tsne_data)

tsne_result = pd.DataFrame(tsne_result, columns=['X', 'Y']).assign(Data='Real')
# The first len(real_sample_2d) rows are real; everything after is synthetic.
# Deriving the boundary from the data keeps the labels aligned with `tsne_data`.
tsne_result.loc[len(real_sample_2d):, 'Data'] = 'Synthetic'

pca_result

fig, axes = plt.subplots(ncols=2, figsize=(14, 5))

# Left panel: PCA result, right panel: t-SNE result.
_panels = (
    (axes[0], pca_result, '1st Component', '2nd Component', 'PCA Result'),
    (axes[1], tsne_result, 'X', 'Y', 't-SNE Result'),
)
for _axis, _frame, _x_col, _y_col, _panel_title in _panels:
    sns.scatterplot(x=_x_col, y=_y_col, data=_frame,
                    hue='Data', style='Data', ax=_axis)
    sns.despine()
    _axis.set_title(_panel_title)

# Set overall title and layout adjustments
fig.suptitle('Assessing Diversity: Qualitative Comparison of Real and Synthetic Data Distributions',
             fontsize=14)
fig.tight_layout()
fig.subplots_adjust(top=.88)

plt.show()

# Reload the real windows and truncate them to the number of synthetic windows.
real_data = np.array(get_real_data())[:len(synthetic_data)]

synthetic_data.shape

n_series = real_data.shape[0]

idx = np.arange(n_series)

# First 80% of the windows for training, the rest for testing (no shuffling).
n_train = int(.8*n_series)
train_idx, test_idx = idx[:n_train], idx[n_train:]

# Real windows first, synthetic windows second, in both splits.
train_data = np.vstack((real_data[train_idx], synthetic_data[train_idx]))
test_data = np.vstack((real_data[test_idx], synthetic_data[test_idx]))

n_train, n_test = len(train_idx), len(test_idx)
# Label 1 = real, label 0 = synthetic, matching the stacking order above.
train_labels = np.concatenate((np.ones(n_train), np.zeros(n_train)))
test_labels = np.concatenate((np.ones(n_test), np.zeros(n_test)))

# Real-vs-synthetic classifier. The input shape follows the window settings
# (`seq_len`, `n_seq`) instead of repeating them as literals.
ts_classifier = Sequential([GRU(2, input_shape=(seq_len, n_seq), name='GRU'),
                            Dense(1, activation='sigmoid', name='OUT')],
                           name='Time_Series_Classifier')

ts_classifier.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=[AUC(name='AUC'), 'accuracy'])

ts_classifier.summary()

test_data

# Train the classifier silently and validate on the held-out windows.
result = ts_classifier.fit(
    train_data,
    train_labels,
    validation_data=(test_data, test_labels),
    epochs=250,
    batch_size=128,
    verbose=0,
)

ts_classifier.evaluate(x=test_data, y=test_labels)

# One row per epoch, one column per logged metric.
history = pd.DataFrame(result.history)
history.info()

from matplotlib.ticker import FuncFormatter

sns.set_style('white')
fig, axes = plt.subplots(ncols=2, figsize=(14,4))

# Right panel: AUC per epoch.
_auc_history = history[['AUC', 'val_AUC']].rename(columns={'AUC': 'Train', 'val_AUC': 'Test'})
_auc_history.plot(ax=axes[1],
                  title='ROC Area under the Curve',
                  style=['-', '--'],
                  xlim=(0, 250))

# Left panel: accuracy per epoch.
_accuracy_history = history[['accuracy', 'val_accuracy']].rename(
    columns={'accuracy': 'Train', 'val_accuracy': 'Test'})
_accuracy_history.plot(ax=axes[0],
                       title='Accuracy',
                       style=['-', '--'],
                       xlim=(0, 250))

for _axis in axes:
    _axis.set_xlabel('Epoch')

# Show accuracy ticks as whole percentages.
axes[0].yaxis.set_major_formatter(FuncFormatter(lambda y, _: f'{y:.0%}'))
axes[0].set_ylabel('Accuracy (%)')
axes[1].set_ylabel('AUC')
sns.despine()
fig.suptitle('Assessing Fidelity: Time Series Classification Performance', fontsize=14)
fig.tight_layout()
fig.subplots_adjust(top=.85);

real_data = get_real_data()
real_data = np.array(real_data)[:len(synthetic_data)]

real_data.shape, synthetic_data.shape

# Features are all steps of a window except the last one; the label is the
# last step. Slicing with -1 instead of the literal 23 keeps this correct for
# any window length.
real_train_data = real_data[train_idx, :-1, :]
real_train_label = real_data[train_idx, -1, :]

real_test_data = real_data[test_idx, :-1, :]
real_test_label = real_data[test_idx, -1, :]

real_train_data.shape, real_train_label.shape, real_test_data.shape, real_test_label.shape

# All synthetic windows are used for training; testing is always on real data.
synthetic_train = synthetic_data[:, :-1, :]
synthetic_label = synthetic_data[:, -1, :]

synthetic_train.shape, synthetic_label.shape

def get_model(input_steps=23, n_features=2):
    """Build and compile the LSTM regressor used for the usefulness check.

    Parameters
    ----------
    input_steps : number of time steps per input sample (default 23, i.e. a
        24-step window without its last step).
    n_features : number of series per step; also the number of outputs
        (default 2).

    Returns
    -------
    A compiled Sequential model (Adam optimiser, mean absolute error loss).
    """
    model = Sequential()
    # return_sequences=False: only the final LSTM state feeds the output layer.
    model.add(LSTM(50, input_shape=(input_steps, n_features), return_sequences=False))
    model.add(Dense(n_features))
    model.compile(optimizer='adam', loss='mean_absolute_error')
    return model

# Both runs share the same validation set (real data) and fit settings.
_fit_settings = dict(
    validation_data=(real_test_data, real_test_label),
    epochs=100,
    batch_size=128,
    verbose=0,
)

# Train on real, test on real.
ts_regression = get_model()
real_result = ts_regression.fit(x=real_train_data,
                                y=real_train_label,
                                **_fit_settings)

# Train a fresh model on synthetic data, test on real.
ts_regression = get_model()
synthetic_result = ts_regression.fit(x=synthetic_train,
                                     y=synthetic_label,
                                     **_fit_settings)

# Turn both training histories into frames with readable column names.
_loss_labels = {'loss': 'Train', 'val_loss': 'Test'}
synthetic_result = pd.DataFrame(synthetic_result.history).rename(columns=_loss_labels)
real_result = pd.DataFrame(real_result.history).rename(columns=_loss_labels)

fig, axes = plt.subplots(ncols=2, figsize=(14, 4), sharey=True)
synthetic_result.plot(ax=axes[0],
                      title='Train on Synthetic, Test on Real',
                      logy=True,
                      xlim=(0, 100))
real_result.plot(ax=axes[1],
                 title='Train on Real, Test on Real',
                 logy=True,
                 xlim=(0, 100))
for _axis in axes:
    _axis.set_xlabel('Epoch')
    _axis.set_ylabel('Mean Absolute Error (log scale)')

sns.despine()
fig.suptitle('Assessing Usefulness: Time Series Prediction Performance', fontsize=14)
fig.tight_layout()
fig.subplots_adjust(top=.85);