In [None]:
!pip install git+https://github.com/anonymous-785/synthcity.git

In [None]:
!pip uninstall -y torchaudio torchdata
!pip install pycox
from pycox import datasets
from synthcity.metrics import Metrics
from synthcity.plugins.core.dataloader import SurvivalAnalysisDataLoader
import numpy as np
import pandas as pd
import os
import shutil
from timeit import default_timer as timer
from synthcity.plugins import Plugins

In [None]:
# Name of the synthcity plugin that every experiment cell below
# instantiates through Plugins().get(plugin_name).
plugin_name = "ctgan"

# Functions

In [None]:
from scipy.stats import mannwhitneyu, chi2_contingency,wilcoxon
import matplotlib.pyplot as plt

def identify_variable_types(df, max_discrete_levels=20):
    """Split the columns of ``df`` into continuous and discrete by cardinality.

    A column is treated as continuous when it holds more than
    ``max_discrete_levels`` distinct values (as reported by ``Series.unique``),
    and as discrete otherwise. The decision is based purely on the number of
    distinct values, not on the column dtype.

    Args:
        df: DataFrame whose columns are to be classified.
        max_discrete_levels: Largest number of distinct values a column may
            have and still be classified as discrete. Defaults to 20.

    Returns:
        tuple[list, list]: ``(continuous_columns, discrete_columns)``, each in
        the order the columns appear in ``df``.
    """
    continuous_columns = []
    discrete_columns = []

    for col in df.columns:
        n_levels = len(df[col].unique())
        if n_levels > max_discrete_levels:
            continuous_columns.append(col)
        else:
            discrete_columns.append(col)

    return continuous_columns, discrete_columns

def compare_distributions(real_df, synthetic_df, alpha=0.05):
    """Test, column by column, whether real and synthetic covariates share a distribution.

    The ``duration`` and ``event`` columns are dropped from both frames. The
    remaining columns are classified with ``identify_variable_types``; a column
    is tested only when it receives the same classification (continuous or
    discrete) in both frames, and is skipped otherwise.

    * Continuous columns: Mann-Whitney U test (``scipy.stats.mannwhitneyu``)
      on the real and synthetic values as two independent samples.
    * Discrete columns: chi-square test (``scipy.stats.chi2_contingency``) on
      a (category x source) table of counts, i.e. how often each category
      occurs in the real data versus the synthetic data.

    Args:
        real_df: Real data; must contain ``duration`` and ``event`` columns.
        synthetic_df: Synthetic data; must contain ``duration`` and ``event``
            columns.
        alpha: Significance level. Not used by the computation; kept so that
            existing callers passing it keep working.

    Returns:
        tuple[dict, dict]: ``(p_values_continuous, p_values_discrete)``, each
        mapping a tested column name to its p-value.
    """
    outcome_columns = ['duration', 'event']
    real_features = real_df.drop(outcome_columns, axis=1)
    synthetic_features = synthetic_df.drop(outcome_columns, axis=1)

    real_continuous, real_discrete = identify_variable_types(real_features)
    synthetic_continuous, synthetic_discrete = identify_variable_types(synthetic_features)

    # Mann-Whitney U (Wilcoxon rank-sum) test for continuous variables
    p_values_continuous = {}
    for col in real_continuous:
        if col in synthetic_continuous:
            _, p_value = mannwhitneyu(real_features[col], synthetic_features[col])
            p_values_continuous[col] = p_value

    # Chi-square test for discrete variables
    p_values_discrete = {}
    for col in real_discrete:
        if col in synthetic_discrete:
            # Count each category separately per source. Cross-tabulating the
            # two columns against each other would pair rows by index label and
            # test independence of those pairs, not equality of distributions.
            contingency_table = pd.concat(
                [real_features[col].value_counts(), synthetic_features[col].value_counts()],
                axis=1,
                keys=['real', 'synthetic'],
            ).fillna(0)  # a category absent from one source has count 0 there
            _, p_value, _, _ = chi2_contingency(contingency_table)
            p_values_discrete[col] = p_value

    return p_values_continuous, p_values_discrete



# FLCHAIN

In [None]:
# Benchmark `plugin_name` on the FLCHAIN dataset: five fit/generate/evaluate
# rounds, then aggregate the metrics, p-values and timings.
dataset = "flchain"

metrics_list = []
fit_times = []
generate_times = []
p_values_list = []  # one {column name: p-value} dict per iteration

for i in range(5):
    # The data is re-read on every iteration, so each run starts from a fresh frame.
    df = pd.read_csv('/content/drive/MyDrive/Datasets/flchain_final.csv')
    df = df.drop('Unnamed: 0', axis=1)
    # Discard rows whose duration is exactly zero.
    df = df[df['duration'] != 0]

    syn_model = Plugins().get(plugin_name)
    X = df
    # Measure the execution time of the fit function
    start = timer()
    syn_model.fit(X)
    fit_time = timer() - start
    fit_times.append(fit_time)

    # NOTE(review): the seed is set only after fit(), so it does not cover
    # training — confirm that this is intended.
    random_state = i + 1
    np.random.seed(random_state)

    # Measure the execution time of the generate function
    start = timer()
    X_gen = syn_model.generate(count=len(df)).dataframe()
    generate_time = timer() - start
    generate_times.append(generate_time)

    # Save X_gen as a CSV file
    X_gen.to_csv(f"/content/drive/MyDrive/Nips/{dataset}_{plugin_name}_nocond_iteration_{i+1}.csv", index=False)

    loader1 = SurvivalAnalysisDataLoader(df, target_column="event", time_to_event_column="duration")
    loader2 = SurvivalAnalysisDataLoader(X_gen, target_column="event", time_to_event_column="duration")

    met_df = Metrics.evaluate(X_gt=loader1, X_syn=loader2, task_type='survival_analysis', metrics={
        'stats': ['jensenshannon_dist', 'chi_squared_test', 'feature_corr', 'inv_kl_divergence', 'ks_test',
                 'max_mean_discrepancy', 'wasserstein_dist', 'prdc', 'alpha_precision', 'survival_km_distance'],
        'performance': ['linear_model', 'mlp', 'xgb', 'feat_rank_distance']
    }, use_cache=False, random_state=random_state)

    # Only the first column of the evaluation table is kept for aggregation.
    met_df = met_df.iloc[:, 0]
    metrics_list.append(met_df)

    # Calculate p-values
    p_values_continuous, p_values_discrete = compare_distributions(df, X_gen)
    # Keep the p-values keyed by column name so iterations are aligned by name,
    # even if a column is tested in one iteration and skipped in another.
    p_values_list.append({**p_values_continuous, **p_values_discrete})

    # Remove the 'workspace' directory under the current working directory, if present.
    workspace_dir = os.path.join(os.getcwd(), 'workspace')
    if os.path.exists(workspace_dir):
        shutil.rmtree(workspace_dir)

result_df = pd.concat(metrics_list, axis=1)

# Calculate the row-wise mean and standard deviation of the metrics.
# Both are computed from the per-iteration columns before either is added to
# the frame; otherwise 'Mean' would be counted as an extra observation in 'Std'.
metric_mean = result_df.mean(axis=1)
metric_std = result_df.std(axis=1)
result_df['Mean'] = metric_mean
result_df['Std'] = metric_std.round(4)

# Create DataFrame for p-values (one row per iteration, columns aligned by name)
p_values_df = pd.DataFrame(p_values_list)

# Save result_df and p_values_df as CSV files
result_df.to_csv(f"/content/drive/MyDrive/Nips/{dataset}_{plugin_name}_nocond_result_df.csv")
p_values_df.to_csv(f"/content/drive/MyDrive/Nips/{dataset}_{plugin_name}_nocond_p_values_df.csv")

avg_fit_time = np.mean(fit_times)
avg_generate_time = np.mean(generate_times)
std_fit_time = np.std(fit_times)
std_generate_time = np.std(generate_times)

print(f"\nAverage Fit Time: {avg_fit_time:.4f} seconds, Standard Deviation: {std_fit_time:.4f} seconds")
print(f"Average Generate Time: {avg_generate_time:.4f} seconds, Standard Deviation: {std_generate_time:.4f} seconds")


In [None]:
# Display the metrics summary held in result_df (per-iteration columns plus Mean and Std).
result_df

In [None]:
# Display the p-values held in p_values_df (one row per iteration, one column per tested feature).
p_values_df

# AIDS

In [None]:
# Benchmark `plugin_name` on the AIDS dataset: five fit/generate/evaluate
# rounds, then aggregate the metrics, p-values and timings.
dataset = "aids"

metrics_list = []
fit_times = []
generate_times = []
p_values_list = []  # one {column name: p-value} dict per iteration

for i in range(5):
    # The data is re-read on every iteration, so each run starts from a fresh frame.
    df = pd.read_csv('/content/drive/MyDrive/Datasets/aids.csv')
    df = df.drop('Unnamed: 0', axis=1)
    # Discard rows whose duration is exactly zero.
    df = df[df['duration'] != 0]

    syn_model = Plugins().get(plugin_name)
    X = df
    # Measure the execution time of the fit function
    start = timer()
    syn_model.fit(X)
    fit_time = timer() - start
    fit_times.append(fit_time)

    # NOTE(review): the seed is set only after fit(), so it does not cover
    # training — confirm that this is intended.
    random_state = i + 1
    np.random.seed(random_state)

    # Measure the execution time of the generate function
    start = timer()
    X_gen = syn_model.generate(count=len(df)).dataframe()
    generate_time = timer() - start
    generate_times.append(generate_time)

    # Save X_gen as a CSV file
    X_gen.to_csv(f"/content/drive/MyDrive/Nips/{dataset}_{plugin_name}_nocond_iteration_{i+1}.csv", index=False)

    loader1 = SurvivalAnalysisDataLoader(df, target_column="event", time_to_event_column="duration")
    loader2 = SurvivalAnalysisDataLoader(X_gen, target_column="event", time_to_event_column="duration")

    met_df = Metrics.evaluate(X_gt=loader1, X_syn=loader2, task_type='survival_analysis', metrics={
        'stats': ['jensenshannon_dist', 'chi_squared_test', 'feature_corr', 'inv_kl_divergence', 'ks_test',
                 'max_mean_discrepancy', 'wasserstein_dist', 'prdc', 'alpha_precision', 'survival_km_distance'],
        'performance': ['linear_model', 'mlp', 'xgb', 'feat_rank_distance']
    }, use_cache=False, random_state=random_state)

    # Only the first column of the evaluation table is kept for aggregation.
    met_df = met_df.iloc[:, 0]
    metrics_list.append(met_df)

    # Calculate p-values
    p_values_continuous, p_values_discrete = compare_distributions(df, X_gen)
    # Keep the p-values keyed by column name so iterations are aligned by name,
    # even if a column is tested in one iteration and skipped in another.
    p_values_list.append({**p_values_continuous, **p_values_discrete})

    # Remove the 'workspace' directory under the current working directory, if present.
    workspace_dir = os.path.join(os.getcwd(), 'workspace')
    if os.path.exists(workspace_dir):
        shutil.rmtree(workspace_dir)

result_df = pd.concat(metrics_list, axis=1)

# Calculate the row-wise mean and standard deviation of the metrics.
# Both are computed from the per-iteration columns before either is added to
# the frame; otherwise 'Mean' would be counted as an extra observation in 'Std'.
metric_mean = result_df.mean(axis=1)
metric_std = result_df.std(axis=1)
result_df['Mean'] = metric_mean
result_df['Std'] = metric_std.round(4)

# Create DataFrame for p-values (one row per iteration, columns aligned by name)
p_values_df = pd.DataFrame(p_values_list)

# Save result_df and p_values_df as CSV files
result_df.to_csv(f"/content/drive/MyDrive/Nips/{dataset}_{plugin_name}_nocond_result_df.csv")
p_values_df.to_csv(f"/content/drive/MyDrive/Nips/{dataset}_{plugin_name}_nocond_p_values_df.csv")

avg_fit_time = np.mean(fit_times)
avg_generate_time = np.mean(generate_times)
std_fit_time = np.std(fit_times)
std_generate_time = np.std(generate_times)

print(f"\nAverage Fit Time: {avg_fit_time:.4f} seconds, Standard Deviation: {std_fit_time:.4f} seconds")
print(f"Average Generate Time: {avg_generate_time:.4f} seconds, Standard Deviation: {std_generate_time:.4f} seconds")


In [None]:
# Display the metrics summary held in result_df (per-iteration columns plus Mean and Std).
result_df

In [None]:
# Display the p-values held in p_values_df (one row per iteration, one column per tested feature).
p_values_df

# METABRIC

In [None]:
# Benchmark `plugin_name` on the METABRIC dataset: five fit/generate/evaluate
# rounds, then aggregate the metrics, p-values and timings.
dataset = "metabric"

metrics_list = []
fit_times = []
generate_times = []
p_values_list = []  # one {column name: p-value} dict per iteration

for i in range(5):
    # The data is re-read on every iteration, so each run starts from a fresh frame.
    df = datasets.metabric.read_df()
    # Discard rows whose duration is exactly zero.
    df = df[df['duration'] != 0]

    syn_model = Plugins().get(plugin_name)
    X = df
    # Measure the execution time of the fit function
    start = timer()
    syn_model.fit(X)
    fit_time = timer() - start
    fit_times.append(fit_time)

    # NOTE(review): the seed is set only after fit(), so it does not cover
    # training — confirm that this is intended.
    random_state = i + 1
    np.random.seed(random_state)

    # Measure the execution time of the generate function
    start = timer()
    X_gen = syn_model.generate(count=len(df)).dataframe()
    generate_time = timer() - start
    generate_times.append(generate_time)

    # Save X_gen as a CSV file
    X_gen.to_csv(f"/content/drive/MyDrive/Nips/{dataset}_{plugin_name}_nocond_iteration_{i+1}.csv", index=False)

    loader1 = SurvivalAnalysisDataLoader(df, target_column="event", time_to_event_column="duration")
    loader2 = SurvivalAnalysisDataLoader(X_gen, target_column="event", time_to_event_column="duration")

    met_df = Metrics.evaluate(X_gt=loader1, X_syn=loader2, task_type='survival_analysis', metrics={
        'stats': ['jensenshannon_dist', 'chi_squared_test', 'feature_corr', 'inv_kl_divergence', 'ks_test',
                 'max_mean_discrepancy', 'wasserstein_dist', 'prdc', 'alpha_precision', 'survival_km_distance'],
        'performance': ['linear_model', 'mlp', 'xgb', 'feat_rank_distance']
    }, use_cache=False, random_state=random_state)

    # Only the first column of the evaluation table is kept for aggregation.
    met_df = met_df.iloc[:, 0]
    metrics_list.append(met_df)

    # Calculate p-values
    p_values_continuous, p_values_discrete = compare_distributions(df, X_gen)
    # Keep the p-values keyed by column name so iterations are aligned by name,
    # even if a column is tested in one iteration and skipped in another.
    p_values_list.append({**p_values_continuous, **p_values_discrete})

    # Remove the 'workspace' directory under the current working directory, if present.
    workspace_dir = os.path.join(os.getcwd(), 'workspace')
    if os.path.exists(workspace_dir):
        shutil.rmtree(workspace_dir)

result_df = pd.concat(metrics_list, axis=1)

# Calculate the row-wise mean and standard deviation of the metrics.
# Both are computed from the per-iteration columns before either is added to
# the frame; otherwise 'Mean' would be counted as an extra observation in 'Std'.
metric_mean = result_df.mean(axis=1)
metric_std = result_df.std(axis=1)
result_df['Mean'] = metric_mean
result_df['Std'] = metric_std.round(4)

# Create DataFrame for p-values (one row per iteration, columns aligned by name)
p_values_df = pd.DataFrame(p_values_list)

# Save result_df and p_values_df as CSV files
result_df.to_csv(f"/content/drive/MyDrive/Nips/{dataset}_{plugin_name}_nocond_result_df.csv")
p_values_df.to_csv(f"/content/drive/MyDrive/Nips/{dataset}_{plugin_name}_nocond_p_values_df.csv")

avg_fit_time = np.mean(fit_times)
avg_generate_time = np.mean(generate_times)
std_fit_time = np.std(fit_times)
std_generate_time = np.std(generate_times)

print(f"\nAverage Fit Time: {avg_fit_time:.4f} seconds, Standard Deviation: {std_fit_time:.4f} seconds")
print(f"Average Generate Time: {avg_generate_time:.4f} seconds, Standard Deviation: {std_generate_time:.4f} seconds")


In [None]:
# Display the metrics summary held in result_df (per-iteration columns plus Mean and Std).
result_df

In [None]:
# Display the p-values held in p_values_df (one row per iteration, one column per tested feature).
p_values_df

# GBSG

In [None]:
# Benchmark `plugin_name` on the GBSG dataset: five fit/generate/evaluate
# rounds, then aggregate the metrics, p-values and timings.
dataset = "gbsg"

metrics_list = []
fit_times = []
generate_times = []
p_values_list = []  # one {column name: p-value} dict per iteration

for i in range(5):
    # The data is re-read on every iteration, so each run starts from a fresh frame.
    df = datasets.gbsg.read_df()
    # Discard rows whose duration is exactly zero.
    df = df[df['duration'] != 0]

    syn_model = Plugins().get(plugin_name)
    X = df
    # Measure the execution time of the fit function
    start = timer()
    syn_model.fit(X)
    fit_time = timer() - start
    fit_times.append(fit_time)

    # NOTE(review): the seed is set only after fit(), so it does not cover
    # training — confirm that this is intended.
    random_state = i + 1
    np.random.seed(random_state)

    # Measure the execution time of the generate function
    start = timer()
    X_gen = syn_model.generate(count=len(df)).dataframe()
    generate_time = timer() - start
    generate_times.append(generate_time)

    # Save X_gen as a CSV file
    X_gen.to_csv(f"/content/drive/MyDrive/Nips/{dataset}_{plugin_name}_nocond_iteration_{i+1}.csv", index=False)

    loader1 = SurvivalAnalysisDataLoader(df, target_column="event", time_to_event_column="duration")
    loader2 = SurvivalAnalysisDataLoader(X_gen, target_column="event", time_to_event_column="duration")

    met_df = Metrics.evaluate(X_gt=loader1, X_syn=loader2, task_type='survival_analysis', metrics={
        'stats': ['jensenshannon_dist', 'chi_squared_test', 'feature_corr', 'inv_kl_divergence', 'ks_test',
                 'max_mean_discrepancy', 'wasserstein_dist', 'prdc', 'alpha_precision', 'survival_km_distance'],
        'performance': ['linear_model', 'mlp', 'xgb', 'feat_rank_distance']
    }, use_cache=False, random_state=random_state)

    # Only the first column of the evaluation table is kept for aggregation.
    met_df = met_df.iloc[:, 0]
    metrics_list.append(met_df)

    # Calculate p-values
    p_values_continuous, p_values_discrete = compare_distributions(df, X_gen)
    # Keep the p-values keyed by column name so iterations are aligned by name,
    # even if a column is tested in one iteration and skipped in another.
    p_values_list.append({**p_values_continuous, **p_values_discrete})

    # Remove the 'workspace' directory under the current working directory, if present.
    workspace_dir = os.path.join(os.getcwd(), 'workspace')
    if os.path.exists(workspace_dir):
        shutil.rmtree(workspace_dir)

result_df = pd.concat(metrics_list, axis=1)

# Calculate the row-wise mean and standard deviation of the metrics.
# Both are computed from the per-iteration columns before either is added to
# the frame; otherwise 'Mean' would be counted as an extra observation in 'Std'.
metric_mean = result_df.mean(axis=1)
metric_std = result_df.std(axis=1)
result_df['Mean'] = metric_mean
result_df['Std'] = metric_std.round(4)

# Create DataFrame for p-values (one row per iteration, columns aligned by name)
p_values_df = pd.DataFrame(p_values_list)

# Save result_df and p_values_df as CSV files
result_df.to_csv(f"/content/drive/MyDrive/Nips/{dataset}_{plugin_name}_nocond_result_df.csv")
p_values_df.to_csv(f"/content/drive/MyDrive/Nips/{dataset}_{plugin_name}_nocond_p_values_df.csv")

avg_fit_time = np.mean(fit_times)
avg_generate_time = np.mean(generate_times)
std_fit_time = np.std(fit_times)
std_generate_time = np.std(generate_times)

print(f"\nAverage Fit Time: {avg_fit_time:.4f} seconds, Standard Deviation: {std_fit_time:.4f} seconds")
print(f"Average Generate Time: {avg_generate_time:.4f} seconds, Standard Deviation: {std_generate_time:.4f} seconds")


In [None]:
# Display the metrics summary held in result_df (per-iteration columns plus Mean and Std).
result_df

In [None]:
# Display the p-values held in p_values_df (one row per iteration, one column per tested feature).
p_values_df

# SUPPORT

In [None]:
# Benchmark `plugin_name` on the SUPPORT dataset: five fit/generate/evaluate
# rounds, then aggregate the metrics, p-values and timings.
dataset = "support"

metrics_list = []
fit_times = []
generate_times = []
p_values_list = []  # one {column name: p-value} dict per iteration

for i in range(5):
    # The data is re-read on every iteration, so each run starts from a fresh frame.
    df = datasets.support.read_df()
    # Discard rows whose duration is exactly zero.
    df = df[df['duration'] != 0]

    syn_model = Plugins().get(plugin_name)
    X = df
    # Measure the execution time of the fit function
    start = timer()
    syn_model.fit(X)
    fit_time = timer() - start
    fit_times.append(fit_time)

    # NOTE(review): the seed is set only after fit(), so it does not cover
    # training — confirm that this is intended.
    random_state = i + 1
    np.random.seed(random_state)

    # Measure the execution time of the generate function
    start = timer()
    X_gen = syn_model.generate(count=len(df)).dataframe()
    generate_time = timer() - start
    generate_times.append(generate_time)

    # Save X_gen as a CSV file
    X_gen.to_csv(f"/content/drive/MyDrive/Nips/{dataset}_{plugin_name}_nocond_iteration_{i+1}.csv", index=False)

    loader1 = SurvivalAnalysisDataLoader(df, target_column="event", time_to_event_column="duration")
    loader2 = SurvivalAnalysisDataLoader(X_gen, target_column="event", time_to_event_column="duration")

    met_df = Metrics.evaluate(X_gt=loader1, X_syn=loader2, task_type='survival_analysis', metrics={
        'stats': ['jensenshannon_dist', 'chi_squared_test', 'feature_corr', 'inv_kl_divergence', 'ks_test',
                 'max_mean_discrepancy', 'wasserstein_dist', 'prdc', 'alpha_precision', 'survival_km_distance'],
        'performance': ['linear_model', 'mlp', 'xgb', 'feat_rank_distance']
    }, use_cache=False, random_state=random_state)

    # Only the first column of the evaluation table is kept for aggregation.
    met_df = met_df.iloc[:, 0]
    metrics_list.append(met_df)

    # Calculate p-values
    p_values_continuous, p_values_discrete = compare_distributions(df, X_gen)
    # Keep the p-values keyed by column name so iterations are aligned by name,
    # even if a column is tested in one iteration and skipped in another.
    p_values_list.append({**p_values_continuous, **p_values_discrete})

    # Remove the 'workspace' directory under the current working directory, if present.
    workspace_dir = os.path.join(os.getcwd(), 'workspace')
    if os.path.exists(workspace_dir):
        shutil.rmtree(workspace_dir)

result_df = pd.concat(metrics_list, axis=1)

# Calculate the row-wise mean and standard deviation of the metrics.
# Both are computed from the per-iteration columns before either is added to
# the frame; otherwise 'Mean' would be counted as an extra observation in 'Std'.
metric_mean = result_df.mean(axis=1)
metric_std = result_df.std(axis=1)
result_df['Mean'] = metric_mean
result_df['Std'] = metric_std.round(4)

# Create DataFrame for p-values (one row per iteration, columns aligned by name)
p_values_df = pd.DataFrame(p_values_list)

# Save result_df and p_values_df as CSV files
result_df.to_csv(f"/content/drive/MyDrive/Nips/{dataset}_{plugin_name}_nocond_result_df.csv")
p_values_df.to_csv(f"/content/drive/MyDrive/Nips/{dataset}_{plugin_name}_nocond_p_values_df.csv")

avg_fit_time = np.mean(fit_times)
avg_generate_time = np.mean(generate_times)
std_fit_time = np.std(fit_times)
std_generate_time = np.std(generate_times)

print(f"\nAverage Fit Time: {avg_fit_time:.4f} seconds, Standard Deviation: {std_fit_time:.4f} seconds")
print(f"Average Generate Time: {avg_generate_time:.4f} seconds, Standard Deviation: {std_generate_time:.4f} seconds")


In [None]:
# Display the metrics summary held in result_df (per-iteration columns plus Mean and Std).
result_df

In [None]:
# Display the p-values held in p_values_df (one row per iteration, one column per tested feature).
p_values_df