In [13]:
import pandas as pd

# Load the CSV file into a DataFrame
file_path = './data_train_log_return.csv'
data_train_log_return = pd.read_csv(file_path)

# Display the first few rows of the DataFrame
data_train_log_return.head()


Unnamed: 0,0,0.01249535315117,0.0111256706670408,0.0032520459252687,0.0066249108779032
0,1,0.011439,0.002691,0.001206,0.006947
1,2,0.000632,0.007277,0.004049,7.4e-05
2,3,0.017828,0.02821,0.007758,0.007382
3,4,0.021115,0.019642,0.009238,0.011499
4,5,0.001177,0.002096,0.001348,0.004966


In [4]:
from sklearn.preprocessing import MinMaxScaler

# Assuming that the first column is an index or identifier and not a feature to be scaled
features = data_train_log_return.columns[1:]  # Selecting the actual feature columns

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Fit the scaler to the data and transform it
scaled_data = scaler.fit_transform(data_train_log_return[features])

# Create a new DataFrame with the scaled data
scaled_data_df = pd.DataFrame(scaled_data, columns=features)

# Display the first few rows of the scaled DataFrame
scaled_data_df.head()

Unnamed: 0,0.01249535315117,0.0111256706670408,0.0032520459252687,0.0066249108779032
0,0.115776,0.029773,0.016556,0.092684
1,0.006277,0.081634,0.056031,9.6e-05
2,0.18051,0.31831,0.107548,0.098551
3,0.213811,0.22144,0.128097,0.154019
4,0.011804,0.023056,0.018521,0.066004


In [39]:
# Import necessary libraries
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LeakyReLU
from tensorflow.keras.optimizers import Adam

# Define the standalone generator model
def define_generator(latent_dim, n_outputs=4):
    model = Sequential()
    model.add(Dense(16, activation='relu', kernel_initializer='he_uniform', input_dim=latent_dim))
    model.add(LeakyReLU(alpha=0.01))
    model.add(Dense(n_outputs, activation='sigmoid'))  # Sigmoid to ensure output is between 0 and 1
    return model

# Define the standalone discriminator model
def define_discriminator(n_inputs=4):
    model = Sequential()
    model.add(Dense(16, activation='relu', kernel_initializer='he_uniform', input_dim=n_inputs))
    model.add(LeakyReLU(alpha=0.01))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Define the combined generator and discriminator model, for updating the generator
def define_gan(generator, discriminator):
    discriminator.trainable = False  # freeze the discriminator's weights when training the gan
    model = Sequential()
    model.add(generator)
    model.add(discriminator)
    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model

# Generate points in latent space as input for the generator
def generate_latent_points(latent_dim, n_samples):
    x_input = np.random.randn(latent_dim * n_samples)
    x_input = x_input.reshape(n_samples, latent_dim)
    return x_input

# Generate n fake examples and class labels
def generate_fake_samples(generator, latent_dim, n_samples):
    x_input = generate_latent_points(latent_dim, n_samples)
    X = generator.predict(x_input)
    y = np.zeros((n_samples, 1))
    return X, y

# Training the GAN
def train_gan(gan, latent_dim, n_epochs=1000, n_batch=128):
    for i in range(n_epochs):
        X_fake, y_fake = generate_fake_samples(generator, latent_dim, n_batch)
        gan.train_on_batch(X_fake, y_fake)

latent_dim = 4
generator = define_generator(latent_dim)
discriminator = define_discriminator()
gan = define_gan(generator, discriminator)

# Training the GAN (This would take some time based on the number of epochs and the speed of your machine)
train_gan(gan, latent_dim)

# Generating new data
n_samples = 745  # Number of samples to generate
latent_points = generate_latent_points(latent_dim, n_samples)
X_generated = generator.predict(latent_points)

# Saving the generated data to a CSV file
generated_data_df = pd.DataFrame(X_generated)
generated_data_df.to_csv('generated_financial_data.csv', index=False)
















In [20]:
#print(generated_data_df.iloc[:1])

          0         1        2         3
0  0.009549  0.935909  0.02366  0.993557


In [8]:
# Assuming 'generator' and 'discriminator' are your trained Keras models

# Get the weights (parameters) of the generator
generator_weights = generator.get_weights()  # This will be a list of numpy arrays

# Get the weights (parameters) of the discriminator
discriminator_weights = discriminator.get_weights()  # This will be a list of numpy arrays

# If you want to see the shape of the weights for each layer of the generator
for i, weight_array in enumerate(generator_weights):
    print(f"Layer {i} weights shape: {weight_array.shape}")

# If you want to see the shape of the weights for each layer of the discriminator
for i, weight_array in enumerate(discriminator_weights):
    print(f"Layer {i} weights shape: {weight_array.shape}")


Layer 0 weights shape: (4, 16)
Layer 1 weights shape: (16,)
Layer 2 weights shape: (16, 4)
Layer 3 weights shape: (4,)
Layer 0 weights shape: (4, 16)
Layer 1 weights shape: (16,)
Layer 2 weights shape: (16, 1)
Layer 3 weights shape: (1,)


In [9]:
generator.save('generator_model.h5')  # Save the generator model
discriminator.save('discriminator_model.h5')  # Save the discriminator model




  saving_api.save_model(


In [43]:
import numpy as np
from scipy.stats import anderson_ksamp
from scipy.stats import kendalltau
import pandas as pd

def anderson_darling_distance(real_data, generated_data):
    """
    Computes the Anderson-Darling distance for comparing the generated data against the real data.
    :param real_data: The real data distribution (as a numpy array).
    :param generated_data: The generated data distribution (as a numpy array).
    :return: Anderson-Darling test statistic for each feature.
    """
    # Ensure that the data are numpy arrays
    if isinstance(real_data, pd.DataFrame):
        real_data = real_data.to_numpy()
    if isinstance(generated_data, pd.DataFrame):
        generated_data = generated_data.to_numpy()

    # Check that the number of columns matches
    assert real_data.shape[1] == generated_data.shape[1], "The number of features must match in both datasets."

    # Apply the Anderson-Darling test for each feature
    ad_statistics = []
    for i in range(real_data.shape[1]):
        # Compute the statistic for the ith feature
        statistic, _, _ = anderson_ksamp([real_data[:, i], generated_data[:, i]])
        ad_statistics.append(statistic)

    return np.array(ad_statistics)

# Function to calculate the Absolute Kendall error
def absolute_kendall_error(real_data, generated_data):
    """
    Computes the Absolute Kendall error between the generated data and the real data.
    :param real_data: The real data distribution (as a pandas DataFrame).
    :param generated_data: The generated data distribution (as a pandas DataFrame).
    :return: Absolute Kendall error for each pair of features.
    """
    num_features = real_data.shape[1]
    kendall_errors = np.zeros((num_features, num_features))
    # Calculate Kendall's tau coefficient for each pair of features
    for i in range(num_features):
        for j in range(num_features):
            if i != j:  # Only calculate for pairs of different features
                tau_real, _ = kendalltau(real_data.iloc[:, i], real_data.iloc[:, j])
                tau_generated, _ = kendalltau(generated_data.iloc[:, i], generated_data.iloc[:, j])
                kendall_errors[i, j] = abs(tau_real - tau_generated)
    return kendall_errors

# Assuming real_data_df and generated_data_df are pandas DataFrames

# Calculate the Anderson-Darling distance
data_train = data_train_log_return.iloc[:, 1:]
ad_distance = anderson_darling_distance(data_train, generated_data_df)
print("Anderson-Darling distance:", ad_distance)

# Calculate the Absolute Kendall error
#kendall_error = absolute_kendall_error(data_train_log_return, generated_data_df)
#print("Absolute Kendall error:", kendall_error)


Anderson-Darling distance: [684.23117051  93.54890017 755.76584338 750.54098039]


  statistic, _, _ = anderson_ksamp([real_data[:, i], generated_data[:, i]])


In [51]:
#print(data_train_log_return.iloc[:, 1])
real_data_numpy = data_train.to_numpy()
print(real_data_numpy[1])

[6.31663784e-04 7.27739554e-03 4.04865224e-03 7.44992466e-05]


In [36]:
data_train_log_return

Unnamed: 0,0,0.01249535315117,0.0111256706670408,0.0032520459252687,0.0066249108779032
0,1,0.011439,0.002691,0.001206,0.006947
1,2,0.000632,0.007277,0.004049,0.000074
2,3,0.017828,0.028210,0.007758,0.007382
3,4,0.021115,0.019642,0.009238,0.011499
4,5,0.001177,0.002096,0.001348,0.004966
...,...,...,...,...,...
740,741,0.001938,0.008833,0.003927,0.005106
741,742,0.005003,0.018943,0.003057,0.001988
742,743,0.007683,0.001958,0.007002,0.006467
743,744,0.003396,0.001280,0.007621,0.001680


In [40]:
generated_data_df

Unnamed: 0,0,1,2,3
0,0.092189,0.013817,0.982724,0.497315
1,0.242307,0.044335,0.987025,0.851398
2,0.064058,0.000497,0.999719,0.628990
3,0.113409,0.104520,0.828490,0.665430
4,0.583694,0.002118,0.996789,0.567849
...,...,...,...,...
740,0.632790,0.015462,0.959669,0.431850
741,0.038275,0.005078,0.971018,0.239328
742,0.101918,0.013436,0.980855,0.785500
743,0.677570,0.024324,0.969635,0.558983
