In [None]:
!pip install git+https://github.com/Ashhad785/synthcity.git

In [None]:
!pip uninstall -y torchaudio torchdata
!pip install pycox
from pycox import datasets
from synthcity.metrics import Metrics
from synthcity.plugins.core.dataloader import SurvivalAnalysisDataLoader
import numpy as np
import pandas as pd
import os
import shutil
from timeit import default_timer as timer
from synthcity.plugins import Plugins

In [None]:
# Name of the generator plugin: passed to Plugins().get(...) in every
# experiment cell and embedded in the names of the CSV files they write.
plugin_name="adsgan"

# Functions

In [None]:
from scipy.stats import gaussian_kde
import numpy as np
import pandas as pd
from timeit import default_timer as timer
from sklearn.mixture import BayesianGaussianMixture


def fit_dpmm_and_sample(data, sample_size, integer_sampling=False, bandwidth='scott'):
    """
    Fit a DPMM (a BayesianGaussianMixture capped at 10 components) to
    one-dimensional data and draw non-negative samples from it.

    Parameters
    ----------
    data : array-like
        Observations to fit. Anything ``np.asarray`` accepts (list, Series,
        ndarray); it is flattened into a single feature column.
    sample_size : int
        Number of samples to return.
    integer_sampling : bool, default=False
        If True, samples are rounded to integers, zeros are discarded and the
        remainder is sub-/re-sampled to exactly ``sample_size`` values.
        If False, the absolute values of the continuous draws are returned.
    bandwidth : str or float, default='scott'
        Not used for DPMM but kept for API consistency.

    Returns
    -------
    numpy.ndarray
        One-dimensional array of ``sample_size`` sampled values.

    Raises
    ------
    ValueError
        If ``integer_sampling`` is True and every draw rounds to zero, so
        there is nothing left to resample from.
    """
    # Accept any array-like (not just ndarrays) and shape it as a single
    # feature column, which is what the mixture model is fitted on.
    data = np.asarray(data).reshape(-1, 1)

    dpmm = BayesianGaussianMixture(
        n_components=10,  # Max number of components
        weight_concentration_prior=1.0,
        random_state=42,
    )
    dpmm.fit(data)

    if not integer_sampling:
        # Continuous case: draw directly and fold negatives onto the positive axis.
        return np.abs(dpmm.sample(sample_size)[0].reshape(-1))

    # Integer case: oversample because rounding to zero removes some draws.
    oversampling_factor = 1.5
    samples = dpmm.sample(int(sample_size * oversampling_factor))[0].reshape(-1)
    samples = np.round(np.abs(samples)).astype(int)
    samples = samples[samples > 0]

    if len(samples) == 0:
        # Without this guard np.random.choice fails on the empty array with a
        # message that gives no hint about the cause.
        raise ValueError(
            "All DPMM draws rounded to zero; cannot produce positive integer samples."
        )

    # Trim or pad to exactly sample_size using NumPy's global RNG.
    if len(samples) > sample_size:
        samples = np.random.choice(samples, size=sample_size, replace=False)
    elif len(samples) < sample_size:
        samples = np.random.choice(samples, size=sample_size, replace=True)

    return samples


def sinusoidal_embedding(values, event_indicators, embedding_dim):
    """
    Encode each value by the rank of that value among the distinct inputs.

    The distinct values are sorted; the value at rank ``r`` gets, for every
    pair index ``j`` in ``range(embedding_dim // 2)``, ``sin`` and ``cos`` of
    ``r / 10000 ** (2 * j / embedding_dim)`` in columns ``2j`` and ``2j + 1``.
    The final column of each returned row holds the matching event indicator.

    Parameters
    ----------
    values : iterable of hashable, sortable items
        Values to embed (one output row per item, in input order).
    event_indicators : iterable
        One indicator per value, written into the last column of its row.
    embedding_dim : int
        Width of the sinusoidal part; rows have ``embedding_dim + 1`` columns.

    Returns
    -------
    numpy.ndarray
        Array with one row per input value.
    """
    ordered = sorted(set(values))
    rank_of = {item: rank for rank, item in enumerate(ordered)}

    # One divisor per sin/cos pair; it does not depend on the rank.
    divisors = [10000 ** (2 * pair / embedding_dim) for pair in range(embedding_dim // 2)]

    # Lookup table: one row per distinct value, last column left for the event flag.
    table = np.zeros((len(ordered), embedding_dim + 1))
    for rank in range(len(ordered)):
        for pair, divisor in enumerate(divisors):
            angle = rank / divisor
            table[rank, 2 * pair] = np.sin(angle)
            table[rank, 2 * pair + 1] = np.cos(angle)

    rows = []
    for item, flag in zip(values, event_indicators):
        row = table[rank_of[item]].copy()
        row[-1] = flag
        rows.append(row)

    return np.array(rows)


from scipy.stats import mannwhitneyu, chi2_contingency,wilcoxon
import matplotlib.pyplot as plt

def identify_variable_types(df, threshold=20):
    """
    Split the columns of ``df`` into continuous and discrete by cardinality.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are classified.
    threshold : int, default=20
        A column with more than ``threshold`` distinct values is treated as
        continuous; otherwise it is treated as discrete.

    Returns
    -------
    tuple of (list, list)
        ``(continuous_columns, discrete_columns)``, each in column order.
    """
    continuous_columns = []
    discrete_columns = []

    for col in df.columns:
        num_unique = len(df[col].unique())
        bucket = continuous_columns if num_unique > threshold else discrete_columns
        bucket.append(col)

    return continuous_columns, discrete_columns

def compare_distributions(real_df, synthetic_df, alpha=0.05):
    """
    Test, column by column, whether real and synthetic covariates differ.

    The ``duration`` and ``event`` columns are dropped from both frames.
    Columns classified as continuous in both frames are compared with the
    Mann-Whitney U (Wilcoxon rank-sum) test; columns classified as discrete in
    both frames are compared with a chi-square test on a table of category
    counts per source (real vs. synthetic).

    Parameters
    ----------
    real_df : pandas.DataFrame
        Real data, including ``duration`` and ``event`` columns.
    synthetic_df : pandas.DataFrame
        Synthetic data, including ``duration`` and ``event`` columns.
    alpha : float, default=0.05
        Not used by the computation; kept so existing callers keep working.

    Returns
    -------
    tuple of (dict, dict)
        ``(p_values_continuous, p_values_discrete)``, each mapping a column
        name to its p-value.
    """
    real_df = real_df.drop(['duration', 'event'], axis=1)
    synthetic_df = synthetic_df.drop(['duration', 'event'], axis=1)

    real_continuous, real_discrete = identify_variable_types(real_df)
    synthetic_continuous, synthetic_discrete = identify_variable_types(synthetic_df)

    # Wilcoxon rank-sum (Mann-Whitney U) test for continuous variables
    p_values_continuous = {}
    for col in real_continuous:
        if col in synthetic_continuous:
            _, p_value = mannwhitneyu(real_df[col], synthetic_df[col])
            p_values_continuous[col] = p_value

    # Chi-square test for discrete variables
    p_values_discrete = {}
    for col in real_discrete:
        if col in synthetic_discrete:
            # Count each category once per source. Cross-tabulating the two
            # columns against each other would pair rows by index and test
            # independence of those arbitrary pairs instead of comparing the
            # two distributions.
            contingency_table = pd.concat(
                [real_df[col].value_counts(), synthetic_df[col].value_counts()],
                axis=1,
                keys=['real', 'synthetic'],
            ).fillna(0)  # a category missing from one source has count 0
            _, p, _, _ = chi2_contingency(contingency_table)
            p_values_discrete[col] = p

    return p_values_continuous, p_values_discrete



# FLCHAIN

In [None]:
# FLCHAIN benchmark: five fit / generate / evaluate rounds, followed by
# aggregation of the metrics, p-values and timings.
dataset="flchain"

metrics_list = []
fit_times = []
generate_times = []
p_values_list = []  # one {column name: p-value} record per round

for i in range(5):
    # Reload every round so each one starts from a fresh copy of the data.
    df = pd.read_csv('/content/drive/MyDrive/Datasets/flchain_final.csv')
    df = df.drop('Unnamed: 0', axis=1)
    df = df[df['duration'] != 0]

    time = df['duration'].to_list()
    event = df['event'].to_list()

    # Conditioning matrix: encoded duration plus the event flag.
    time_event_encoded = sinusoidal_embedding(time, event, 4)
    X = df.drop(['duration', 'event'], axis=1)

    syn_model = Plugins().get(plugin_name)

    # Measure the execution time of the fit function
    # NOTE(review): the NumPy seed is set only after fit, so this call does
    # not seed training - confirm whether that is intended.
    start = timer()
    syn_model.fit(X, cond=time_event_encoded)
    fit_times.append(timer() - start)

    event_0_data = df[df['event'] == 0]['duration'].values
    event_1_data = df[df['event'] == 1]['duration'].values

    random_state = i + 1
    np.random.seed(random_state)

    # Fit a DPMM per event type and draw as many durations as the real data has.
    sample_event_0 = fit_dpmm_and_sample(event_0_data, len(event_0_data), integer_sampling=True)
    sample_event_1 = fit_dpmm_and_sample(event_1_data, len(event_1_data), integer_sampling=True)

    durations_gen = np.concatenate([sample_event_0, sample_event_1])
    events_gen = np.concatenate([np.zeros(len(sample_event_0)), np.ones(len(sample_event_1))])

    time_event_gen_encoded = sinusoidal_embedding(durations_gen, events_gen, 4)

    # Measure the execution time of the generate function
    start = timer()
    X_gen = syn_model.generate(count=len(df), cond=np.array(time_event_gen_encoded)).dataframe()
    generate_times.append(timer() - start)

    X_gen['duration'] = durations_gen
    X_gen['event'] = events_gen
    X_gen = X_gen.sample(frac=1).reset_index(drop=True)

    # Save X_gen as a CSV file
    X_gen.to_csv(f"/content/drive/MyDrive/ICML/{dataset}_{plugin_name}_iteration_{i+1}.csv", index=False)

    loader1 = SurvivalAnalysisDataLoader(df, target_column="event", time_to_event_column="duration")
    loader2 = SurvivalAnalysisDataLoader(X_gen, target_column="event", time_to_event_column="duration")

    met_df = Metrics.evaluate(X_gt=loader1, X_syn=loader2, task_type='survival_analysis', metrics={
        'stats': ['jensenshannon_dist', 'chi_squared_test', 'feature_corr', 'inv_kl_divergence', 'ks_test',
                 'max_mean_discrepancy', 'wasserstein_dist', 'prdc', 'alpha_precision', 'survival_km_distance'],
        'performance': ['linear_model', 'mlp', 'xgb', 'feat_rank_distance']
    }, use_cache=False, random_state=random_state)

    # Keep only the first column of the metrics frame for this round.
    metrics_list.append(met_df.iloc[:, 0])

    # Store p-values keyed by column name so every value keeps its own label
    # even if the set of tested columns differs between rounds.
    p_values_continuous, p_values_discrete = compare_distributions(df, X_gen)
    p_values_list.append({**p_values_continuous, **p_values_discrete})

    workspace_dir = os.path.join(os.getcwd(), 'workspace')
    if os.path.exists(workspace_dir):
        shutil.rmtree(workspace_dir)

per_run_metrics = pd.concat(metrics_list, axis=1)

# Row-wise mean and standard deviation, both computed from the per-round
# columns only, so the Mean column does not leak into Std.
result_df = per_run_metrics.copy()
result_df['Mean'] = per_run_metrics.mean(axis=1)
result_df['Std'] = per_run_metrics.std(axis=1).round(4)

# Create DataFrame for p-values (one row per round, one column per feature)
p_values_df = pd.DataFrame(p_values_list)

# Save result_df and p_values_df as CSV files
result_df.to_csv(f"/content/drive/MyDrive/ICML/{dataset}_{plugin_name}_result_df.csv")
p_values_df.to_csv(f"/content/drive/MyDrive/ICML/{dataset}_{plugin_name}_p_values_df.csv")

avg_fit_time = np.mean(fit_times)
avg_generate_time = np.mean(generate_times)
std_fit_time = np.std(fit_times)
std_generate_time = np.std(generate_times)

print(f"\nAverage Fit Time: {avg_fit_time:.4f} seconds, Standard Deviation: {std_fit_time:.4f} seconds")
print(f"Average Generate Time: {avg_generate_time:.4f} seconds, Standard Deviation: {std_generate_time:.4f} seconds")


In [None]:
result_df

In [None]:
p_values_df

# AIDS

In [None]:
# AIDS benchmark: five fit / generate / evaluate rounds, followed by
# aggregation of the metrics, p-values and timings.
dataset="aids"

metrics_list = []
fit_times = []
generate_times = []
p_values_list = []  # one {column name: p-value} record per round

for i in range(5):
    # Reload every round so each one starts from a fresh copy of the data.
    df = pd.read_csv('/content/drive/MyDrive/Datasets/aids.csv')
    df = df.drop('Unnamed: 0', axis=1)
    df = df[df['duration'] != 0]

    time = df['duration'].to_list()
    event = df['event'].to_list()

    # Conditioning matrix: encoded duration plus the event flag.
    time_event_encoded = sinusoidal_embedding(time, event, 4)
    X = df.drop(['duration', 'event'], axis=1)

    syn_model = Plugins().get(plugin_name)

    # Measure the execution time of the fit function
    # NOTE(review): the NumPy seed is set only after fit, so this call does
    # not seed training - confirm whether that is intended.
    start = timer()
    syn_model.fit(X, cond=time_event_encoded)
    fit_times.append(timer() - start)

    event_0_data = df[df['event'] == 0]['duration'].values
    event_1_data = df[df['event'] == 1]['duration'].values

    random_state = i + 1
    np.random.seed(random_state)

    # Fit a DPMM per event type and draw as many durations as the real data has.
    sample_event_0 = fit_dpmm_and_sample(event_0_data, len(event_0_data), integer_sampling=True)
    sample_event_1 = fit_dpmm_and_sample(event_1_data, len(event_1_data), integer_sampling=True)

    durations_gen = np.concatenate([sample_event_0, sample_event_1])
    events_gen = np.concatenate([np.zeros(len(sample_event_0)), np.ones(len(sample_event_1))])

    time_event_gen_encoded = sinusoidal_embedding(durations_gen, events_gen, 4)

    # Measure the execution time of the generate function
    start = timer()
    X_gen = syn_model.generate(count=len(df), cond=np.array(time_event_gen_encoded)).dataframe()
    generate_times.append(timer() - start)

    X_gen['duration'] = durations_gen
    X_gen['event'] = events_gen
    X_gen = X_gen.sample(frac=1).reset_index(drop=True)

    # Save X_gen as a CSV file
    X_gen.to_csv(f"/content/drive/MyDrive/ICML/{dataset}_{plugin_name}_iteration_{i+1}.csv", index=False)

    loader1 = SurvivalAnalysisDataLoader(df, target_column="event", time_to_event_column="duration")
    loader2 = SurvivalAnalysisDataLoader(X_gen, target_column="event", time_to_event_column="duration")

    met_df = Metrics.evaluate(X_gt=loader1, X_syn=loader2, task_type='survival_analysis', metrics={
        'stats': ['jensenshannon_dist', 'chi_squared_test', 'feature_corr', 'inv_kl_divergence', 'ks_test',
                 'max_mean_discrepancy', 'wasserstein_dist', 'prdc', 'alpha_precision', 'survival_km_distance'],
        'performance': ['linear_model', 'mlp', 'xgb', 'feat_rank_distance']
    }, use_cache=False, random_state=random_state)

    # Keep only the first column of the metrics frame for this round.
    metrics_list.append(met_df.iloc[:, 0])

    # Store p-values keyed by column name so every value keeps its own label
    # even if the set of tested columns differs between rounds.
    p_values_continuous, p_values_discrete = compare_distributions(df, X_gen)
    p_values_list.append({**p_values_continuous, **p_values_discrete})

    workspace_dir = os.path.join(os.getcwd(), 'workspace')
    if os.path.exists(workspace_dir):
        shutil.rmtree(workspace_dir)

per_run_metrics = pd.concat(metrics_list, axis=1)

# Row-wise mean and standard deviation, both computed from the per-round
# columns only, so the Mean column does not leak into Std.
result_df = per_run_metrics.copy()
result_df['Mean'] = per_run_metrics.mean(axis=1)
result_df['Std'] = per_run_metrics.std(axis=1).round(4)

# Create DataFrame for p-values (one row per round, one column per feature)
p_values_df = pd.DataFrame(p_values_list)

# Save result_df and p_values_df as CSV files
result_df.to_csv(f"/content/drive/MyDrive/ICML/{dataset}_{plugin_name}_result_df.csv")
p_values_df.to_csv(f"/content/drive/MyDrive/ICML/{dataset}_{plugin_name}_p_values_df.csv")

avg_fit_time = np.mean(fit_times)
avg_generate_time = np.mean(generate_times)
std_fit_time = np.std(fit_times)
std_generate_time = np.std(generate_times)

print(f"\nAverage Fit Time: {avg_fit_time:.4f} seconds, Standard Deviation: {std_fit_time:.4f} seconds")
print(f"Average Generate Time: {avg_generate_time:.4f} seconds, Standard Deviation: {std_generate_time:.4f} seconds")


In [None]:
result_df

In [None]:
p_values_df

# Metabric

In [None]:
# Metabric benchmark: five fit / generate / evaluate rounds, followed by
# aggregation of the metrics, p-values and timings.
dataset="metabric"

metrics_list = []
fit_times = []
generate_times = []
p_values_list = []  # one {column name: p-value} record per round

for i in range(5):
    # Reload every round so each one starts from a fresh copy of the data.
    df = datasets.metabric.read_df()
    df = df[df['duration'] != 0]

    time = df['duration'].to_list()
    event = df['event'].to_list()

    # Conditioning matrix: encoded duration plus the event flag.
    time_event_encoded = sinusoidal_embedding(time, event, 4)
    X = df.drop(['duration', 'event'], axis=1)

    syn_model = Plugins().get(plugin_name)

    # Measure the execution time of the fit function
    # NOTE(review): the NumPy seed is set only after fit, so this call does
    # not seed training - confirm whether that is intended.
    start = timer()
    syn_model.fit(X, cond=time_event_encoded)
    fit_times.append(timer() - start)

    event_0_data = df[df['event'] == 0]['duration'].values
    event_1_data = df[df['event'] == 1]['duration'].values

    random_state = i + 1
    np.random.seed(random_state)

    # Fit a DPMM per event type and draw as many durations as the real data has.
    sample_event_0 = fit_dpmm_and_sample(event_0_data, len(event_0_data), integer_sampling=False)
    sample_event_1 = fit_dpmm_and_sample(event_1_data, len(event_1_data), integer_sampling=False)

    durations_gen = np.concatenate([sample_event_0, sample_event_1])
    events_gen = np.concatenate([np.zeros(len(sample_event_0)), np.ones(len(sample_event_1))])

    time_event_gen_encoded = sinusoidal_embedding(durations_gen, events_gen, 4)

    # Measure the execution time of the generate function
    start = timer()
    X_gen = syn_model.generate(count=len(df), cond=np.array(time_event_gen_encoded)).dataframe()
    generate_times.append(timer() - start)

    X_gen['duration'] = durations_gen
    X_gen['event'] = events_gen
    X_gen = X_gen.sample(frac=1).reset_index(drop=True)

    # Save X_gen as a CSV file
    X_gen.to_csv(f"/content/drive/MyDrive/ICML/{dataset}_{plugin_name}_iteration_{i+1}.csv", index=False)

    loader1 = SurvivalAnalysisDataLoader(df, target_column="event", time_to_event_column="duration")
    loader2 = SurvivalAnalysisDataLoader(X_gen, target_column="event", time_to_event_column="duration")

    met_df = Metrics.evaluate(X_gt=loader1, X_syn=loader2, task_type='survival_analysis', metrics={
        'stats': ['jensenshannon_dist', 'chi_squared_test', 'feature_corr', 'inv_kl_divergence', 'ks_test',
                 'max_mean_discrepancy', 'wasserstein_dist', 'prdc', 'alpha_precision', 'survival_km_distance'],
        'performance': ['linear_model', 'mlp', 'xgb', 'feat_rank_distance']
    }, use_cache=False, random_state=random_state)

    # Keep only the first column of the metrics frame for this round.
    metrics_list.append(met_df.iloc[:, 0])

    # Store p-values keyed by column name so every value keeps its own label
    # even if the set of tested columns differs between rounds.
    p_values_continuous, p_values_discrete = compare_distributions(df, X_gen)
    p_values_list.append({**p_values_continuous, **p_values_discrete})

    workspace_dir = os.path.join(os.getcwd(), 'workspace')
    if os.path.exists(workspace_dir):
        shutil.rmtree(workspace_dir)

per_run_metrics = pd.concat(metrics_list, axis=1)

# Row-wise mean and standard deviation, both computed from the per-round
# columns only, so the Mean column does not leak into Std.
result_df = per_run_metrics.copy()
result_df['Mean'] = per_run_metrics.mean(axis=1)
result_df['Std'] = per_run_metrics.std(axis=1).round(4)

# Create DataFrame for p-values (one row per round, one column per feature)
p_values_df = pd.DataFrame(p_values_list)

# Save result_df and p_values_df as CSV files
result_df.to_csv(f"/content/drive/MyDrive/ICML/{dataset}_{plugin_name}_result_df.csv")
p_values_df.to_csv(f"/content/drive/MyDrive/ICML/{dataset}_{plugin_name}_p_values_df.csv")

avg_fit_time = np.mean(fit_times)
avg_generate_time = np.mean(generate_times)
std_fit_time = np.std(fit_times)
std_generate_time = np.std(generate_times)

print(f"\nAverage Fit Time: {avg_fit_time:.4f} seconds, Standard Deviation: {std_fit_time:.4f} seconds")
print(f"Average Generate Time: {avg_generate_time:.4f} seconds, Standard Deviation: {std_generate_time:.4f} seconds")

In [None]:
result_df

In [None]:
p_values_df

# GBSG

In [None]:
# GBSG benchmark: five fit / generate / evaluate rounds, followed by
# aggregation of the metrics, p-values and timings.
dataset="gbsg"

metrics_list = []
fit_times = []
generate_times = []
p_values_list = []  # one {column name: p-value} record per round

for i in range(5):
    # Reload every round so each one starts from a fresh copy of the data.
    df = datasets.gbsg.read_df()
    df = df[df['duration'] != 0]

    time = df['duration'].to_list()
    event = df['event'].to_list()

    # Conditioning matrix: encoded duration plus the event flag.
    time_event_encoded = sinusoidal_embedding(time, event, 4)
    X = df.drop(['duration', 'event'], axis=1)

    syn_model = Plugins().get(plugin_name)

    # Measure the execution time of the fit function
    # NOTE(review): the NumPy seed is set only after fit, so this call does
    # not seed training - confirm whether that is intended.
    start = timer()
    syn_model.fit(X, cond=time_event_encoded)
    fit_times.append(timer() - start)

    event_0_data = df[df['event'] == 0]['duration'].values
    event_1_data = df[df['event'] == 1]['duration'].values

    random_state = i + 1
    np.random.seed(random_state)

    # Fit a DPMM per event type and draw as many durations as the real data has.
    sample_event_0 = fit_dpmm_and_sample(event_0_data, len(event_0_data), integer_sampling=False)
    sample_event_1 = fit_dpmm_and_sample(event_1_data, len(event_1_data), integer_sampling=False)

    durations_gen = np.concatenate([sample_event_0, sample_event_1])
    events_gen = np.concatenate([np.zeros(len(sample_event_0)), np.ones(len(sample_event_1))])

    time_event_gen_encoded = sinusoidal_embedding(durations_gen, events_gen, 4)

    # Measure the execution time of the generate function
    start = timer()
    X_gen = syn_model.generate(count=len(df), cond=np.array(time_event_gen_encoded)).dataframe()
    generate_times.append(timer() - start)

    X_gen['duration'] = durations_gen
    X_gen['event'] = events_gen
    X_gen = X_gen.sample(frac=1).reset_index(drop=True)

    # Save X_gen as a CSV file
    X_gen.to_csv(f"/content/drive/MyDrive/ICML/{dataset}_{plugin_name}_iteration_{i+1}.csv", index=False)

    loader1 = SurvivalAnalysisDataLoader(df, target_column="event", time_to_event_column="duration")
    loader2 = SurvivalAnalysisDataLoader(X_gen, target_column="event", time_to_event_column="duration")

    met_df = Metrics.evaluate(X_gt=loader1, X_syn=loader2, task_type='survival_analysis', metrics={
        'stats': ['jensenshannon_dist', 'chi_squared_test', 'feature_corr', 'inv_kl_divergence', 'ks_test',
                 'max_mean_discrepancy', 'wasserstein_dist', 'prdc', 'alpha_precision', 'survival_km_distance'],
        'performance': ['linear_model', 'mlp', 'xgb', 'feat_rank_distance']
    }, use_cache=False, random_state=random_state)

    # Keep only the first column of the metrics frame for this round.
    metrics_list.append(met_df.iloc[:, 0])

    # Store p-values keyed by column name so every value keeps its own label
    # even if the set of tested columns differs between rounds.
    p_values_continuous, p_values_discrete = compare_distributions(df, X_gen)
    p_values_list.append({**p_values_continuous, **p_values_discrete})

    workspace_dir = os.path.join(os.getcwd(), 'workspace')
    if os.path.exists(workspace_dir):
        shutil.rmtree(workspace_dir)

per_run_metrics = pd.concat(metrics_list, axis=1)

# Row-wise mean and standard deviation, both computed from the per-round
# columns only, so the Mean column does not leak into Std.
result_df = per_run_metrics.copy()
result_df['Mean'] = per_run_metrics.mean(axis=1)
result_df['Std'] = per_run_metrics.std(axis=1).round(4)

# Create DataFrame for p-values (one row per round, one column per feature)
p_values_df = pd.DataFrame(p_values_list)

# Save result_df and p_values_df as CSV files
result_df.to_csv(f"/content/drive/MyDrive/ICML/{dataset}_{plugin_name}_result_df.csv")
p_values_df.to_csv(f"/content/drive/MyDrive/ICML/{dataset}_{plugin_name}_p_values_df.csv")

avg_fit_time = np.mean(fit_times)
avg_generate_time = np.mean(generate_times)
std_fit_time = np.std(fit_times)
std_generate_time = np.std(generate_times)

print(f"\nAverage Fit Time: {avg_fit_time:.4f} seconds, Standard Deviation: {std_fit_time:.4f} seconds")
print(f"Average Generate Time: {avg_generate_time:.4f} seconds, Standard Deviation: {std_generate_time:.4f} seconds")

In [None]:
result_df

In [None]:
p_values_df

# SUPPORT

In [None]:
# SUPPORT benchmark: five fit / generate / evaluate rounds, followed by
# aggregation of the metrics, p-values and timings.
dataset="support"

metrics_list = []
fit_times = []
generate_times = []
p_values_list = []  # one {column name: p-value} record per round

for i in range(5):
    # Reload every round so each one starts from a fresh copy of the data.
    df = datasets.support.read_df()
    df = df[df['duration'] != 0]

    time = df['duration'].to_list()
    event = df['event'].to_list()

    # Conditioning matrix: encoded duration plus the event flag.
    time_event_encoded = sinusoidal_embedding(time, event, 4)
    X = df.drop(['duration', 'event'], axis=1)

    syn_model = Plugins().get(plugin_name)

    # Measure the execution time of the fit function
    # NOTE(review): the NumPy seed is set only after fit, so this call does
    # not seed training - confirm whether that is intended.
    start = timer()
    syn_model.fit(X, cond=time_event_encoded)
    fit_times.append(timer() - start)

    event_0_data = df[df['event'] == 0]['duration'].values
    event_1_data = df[df['event'] == 1]['duration'].values

    random_state = i + 1
    np.random.seed(random_state)

    # Fit a DPMM per event type and draw as many durations as the real data has.
    sample_event_0 = fit_dpmm_and_sample(event_0_data, len(event_0_data), integer_sampling=True)
    sample_event_1 = fit_dpmm_and_sample(event_1_data, len(event_1_data), integer_sampling=True)

    durations_gen = np.concatenate([sample_event_0, sample_event_1])
    events_gen = np.concatenate([np.zeros(len(sample_event_0)), np.ones(len(sample_event_1))])

    time_event_gen_encoded = sinusoidal_embedding(durations_gen, events_gen, 4)

    # Measure the execution time of the generate function
    start = timer()
    X_gen = syn_model.generate(count=len(df), cond=np.array(time_event_gen_encoded)).dataframe()
    generate_times.append(timer() - start)

    X_gen['duration'] = durations_gen
    X_gen['event'] = events_gen
    X_gen = X_gen.sample(frac=1).reset_index(drop=True)

    # Save X_gen as a CSV file
    X_gen.to_csv(f"/content/drive/MyDrive/ICML/{dataset}_{plugin_name}_iteration_{i+1}.csv", index=False)

    loader1 = SurvivalAnalysisDataLoader(df, target_column="event", time_to_event_column="duration")
    loader2 = SurvivalAnalysisDataLoader(X_gen, target_column="event", time_to_event_column="duration")

    met_df = Metrics.evaluate(X_gt=loader1, X_syn=loader2, task_type='survival_analysis', metrics={
        'stats': ['jensenshannon_dist', 'chi_squared_test', 'feature_corr', 'inv_kl_divergence', 'ks_test',
                 'max_mean_discrepancy', 'wasserstein_dist', 'prdc', 'alpha_precision', 'survival_km_distance'],
        'performance': ['linear_model', 'mlp', 'xgb', 'feat_rank_distance']
    }, use_cache=False, random_state=random_state)

    # Keep only the first column of the metrics frame for this round.
    metrics_list.append(met_df.iloc[:, 0])

    # Store p-values keyed by column name so every value keeps its own label
    # even if the set of tested columns differs between rounds.
    p_values_continuous, p_values_discrete = compare_distributions(df, X_gen)
    p_values_list.append({**p_values_continuous, **p_values_discrete})

    workspace_dir = os.path.join(os.getcwd(), 'workspace')
    if os.path.exists(workspace_dir):
        shutil.rmtree(workspace_dir)

per_run_metrics = pd.concat(metrics_list, axis=1)

# Row-wise mean and standard deviation, both computed from the per-round
# columns only, so the Mean column does not leak into Std.
result_df = per_run_metrics.copy()
result_df['Mean'] = per_run_metrics.mean(axis=1)
result_df['Std'] = per_run_metrics.std(axis=1).round(4)

# Create DataFrame for p-values (one row per round, one column per feature)
p_values_df = pd.DataFrame(p_values_list)

# Save result_df and p_values_df as CSV files
result_df.to_csv(f"/content/drive/MyDrive/ICML/{dataset}_{plugin_name}_result_df.csv")
p_values_df.to_csv(f"/content/drive/MyDrive/ICML/{dataset}_{plugin_name}_p_values_df.csv")

avg_fit_time = np.mean(fit_times)
avg_generate_time = np.mean(generate_times)
std_fit_time = np.std(fit_times)
std_generate_time = np.std(generate_times)

print(f"\nAverage Fit Time: {avg_fit_time:.4f} seconds, Standard Deviation: {std_fit_time:.4f} seconds")
print(f"Average Generate Time: {avg_generate_time:.4f} seconds, Standard Deviation: {std_generate_time:.4f} seconds")

In [None]:
result_df

In [None]:
p_values_df

# FLCHAIN