In [None]:
!pip install git+https://github.com/Ashhad785/synthcity.git

In [1]:
!pip uninstall -y torchaudio torchdata
!pip install pycox
from pycox import datasets
from synthcity.metrics import Metrics
from synthcity.plugins.core.dataloader import SurvivalAnalysisDataLoader
import numpy as np
import pandas as pd
import os
import shutil
from timeit import default_timer as timer
from synthcity.plugins import Plugins

[KeOps] Compiling cuda jit compiler engine ... OK
[pyKeOps] Compiling nvrtc binder for python ... OK


In [2]:
plugin_name="adsgan"

# Functions

In [5]:
def sinusoidal_embedding(values, event_indicators, embedding_dim):
    # Find the unique values and sort them
    unique_values = sorted(set(values))

    # Create a dictionary to map values to their indices
    value_to_idx = {value: idx for idx, value in enumerate(unique_values)}

    # Create the embedding matrix
    embeddings = np.zeros((len(unique_values), embedding_dim + 1))

    # Assign sinusoidal embeddings based on the order of values
    for idx, value in enumerate(unique_values):
        for j in range(embedding_dim // 2):
            embeddings[idx, 2 * j] = np.sin(idx / (10000 ** (2 * j / embedding_dim)))
            embeddings[idx, 2 * j + 1] = np.cos(idx / (10000 ** (2 * j / embedding_dim)))

    # Map the input values to their embeddings
    value_embeddings = []
    for value, event_indicator in zip(values, event_indicators):
        embedding = embeddings[value_to_idx[value]].copy()
        embedding[-1] = event_indicator
        value_embeddings.append(embedding)

    return np.array(value_embeddings)


from scipy.stats import mannwhitneyu, chi2_contingency,wilcoxon
import matplotlib.pyplot as plt

def identify_variable_types(df):
    continuous_columns = []
    discrete_columns = []

    for col in df.columns:
        unique_vals = df[col].unique()
        num_unique = len(unique_vals)
        if num_unique > 20:  # Threshold for considering a column as continuous
            continuous_columns.append(col)
        else:
            discrete_columns.append(col)

    return continuous_columns, discrete_columns

def compare_distributions(real_df, synthetic_df, alpha=0.05):
    real_df = real_df.drop(['duration', 'event'], axis=1)
    real_continuous, real_discrete = identify_variable_types(real_df)
    p_values_continuous = {}
    p_values_discrete = {}

    synthetic_df = synthetic_df.drop(['duration', 'event'], axis=1)
    synthetic_continuous, synthetic_discrete = identify_variable_types(synthetic_df)

    synthetic_continuous = [col for col in synthetic_continuous if col not in ["event", "duration"]]
    synthetic_discrete = [col for col in synthetic_discrete if col not in ["event", "duration"]]

    # Wilcoxon rank-sum test for continuous variables
    for col in real_continuous:
        if col in synthetic_continuous:
            _, p_value = mannwhitneyu(real_df[col], synthetic_df[col])
            p_values_continuous[col] = p_value

    # Chi-square test for discrete variables
    for col in real_discrete:
        if col in synthetic_discrete:
            contingency_table = pd.crosstab(real_df[col], synthetic_df[col])
            _, p, _, _ = chi2_contingency(contingency_table)
            p_values_discrete[col] = p

    # Plot p-values
    # plt.figure(figsize=(10, 6))

    # continuous_p_values = {col: p_values_continuous[col] for col in real_continuous}
    # discrete_p_values = {col: p_values_discrete[col] for col in real_discrete}

    # plt.plot(list(continuous_p_values.keys()), list(continuous_p_values.values()), label='Continuous', marker='o')
    # plt.plot(list(discrete_p_values.keys()), list(discrete_p_values.values()), label='Discrete', marker='o', linestyle='dashed')

    # # Plot alpha line
    # plt.axhline(y=alpha, color='red', linestyle='--', label=f'alpha = {alpha}')

    # plt.xlabel('Column Name')
    # plt.ylabel('p-value')
    # plt.title('Comparison of p-values for Real and Synthetic Data')
    # plt.xticks(rotation=45)
    # plt.legend()
    # plt.grid(True)
    # plt.tight_layout()
    # plt.show()

    return p_values_continuous, p_values_discrete



# FLCHAIN

In [6]:
dataset="flchain"

metrics_list = []
fit_times = []
generate_times = []
p_values_list = []

for i in range(5):
    df = pd.read_csv('/content/drive/MyDrive/Datasets/flchain_final.csv')
    df = df.drop('Unnamed: 0', axis=1)
    df = df[df['duration'] != 0]

    time = df['duration'].to_list()
    event = df['event'].to_list()

    time_event_encoded = sinusoidal_embedding(time, event, 4)
    X = df.drop(['duration', 'event'], axis=1)
    y = time_event_encoded

    syn_model = Plugins().get(plugin_name)

    # Measure the execution time of the fit function
    start = timer()
    syn_model.fit(X, cond=y)
    fit_time = timer() - start
    fit_times.append(fit_time)

    survival_df=df
    event_0_data = survival_df[survival_df['event'] == 0]['duration']
    event_1_data = survival_df[survival_df['event'] == 1]['duration']

    random_state = i + 1
    np.random.seed(random_state)

    sample_size_0 = len(event_0_data)
    sample_size_1= len(event_1_data)
    sample_event_0 = np.random.choice(event_0_data, size=sample_size_0)
    sample_event_1 = np.random.choice(event_1_data, size=sample_size_1)

    z=np.concatenate([sample_event_0,sample_event_1])
    x=np.zeros(len(sample_event_0))
    y=np.ones(len(sample_event_1))
    p=np.concatenate([x,y])

    time_event_gen_encoded=sinusoidal_embedding(z,p,4)

    # Measure the execution time of the generate function
    start = timer()
    X_gen = syn_model.generate(count=len(df), cond=np.array(time_event_gen_encoded)).dataframe()
    generate_time = timer() - start
    generate_times.append(generate_time)

    X_gen['duration'] = z
    X_gen['event'] = p
    X_gen = X_gen.sample(frac=1).reset_index(drop=True)

    # Save X_gen as a CSV file
    X_gen.to_csv(f"/content/drive/MyDrive/Nips/{dataset}_{plugin_name}_iteration_{i+1}.csv", index=False)

    loader1 = SurvivalAnalysisDataLoader(df, target_column="event", time_to_event_column="duration")
    loader2 = SurvivalAnalysisDataLoader(X_gen, target_column="event", time_to_event_column="duration")

    met_df = Metrics.evaluate(X_gt=loader1, X_syn=loader2, task_type='survival_analysis', metrics={
        'stats': ['jensenshannon_dist', 'chi_squared_test', 'feature_corr', 'inv_kl_divergence', 'ks_test',
                 'max_mean_discrepancy', 'wasserstein_dist', 'prdc', 'alpha_precision', 'survival_km_distance'],
        'performance': ['linear_model', 'mlp', 'xgb', 'feat_rank_distance']
    }, use_cache=False, random_state=random_state)

    met_df = met_df.iloc[:, 0]
    metrics_list.append(met_df)

    # Calculate p-values
    p_values_continuous, p_values_discrete = compare_distributions(df, X_gen)
    continuous_column_names = list(p_values_continuous.keys())
    discrete_column_names = list(p_values_discrete.keys())

    p_val = np.concatenate([list(p_values_continuous.values()), list(p_values_discrete.values())])
    p_values_list.append(p_val)

    workspace_dir = os.path.join(os.getcwd(), 'workspace')
    if os.path.exists(workspace_dir):
        shutil.rmtree(workspace_dir)

result_df = pd.concat(metrics_list, axis=1)

# Calculate the row-wise mean and standard deviation of the metrics
result_df['Mean'] = result_df.mean(axis=1)
result_df['Std'] = result_df.std(axis=1)
result_df['Std'] = result_df['Std'].round(4)

# Create DataFrame for p-values
p_values_df = pd.DataFrame(p_values_list, columns=continuous_column_names + discrete_column_names)

# Save result_df and p_values_df as CSV files
result_df.to_csv(f"/content/drive/MyDrive/Nips/{dataset}_{plugin_name}_result_df.csv")
p_values_df.to_csv(f"/content/drive/MyDrive/Nips/{dataset}_{plugin_name}_p_values_df.csv")

avg_fit_time = np.mean(fit_times)
avg_generate_time = np.mean(generate_times)
std_fit_time = np.std(fit_times)
std_generate_time = np.std(generate_times)

print(f"\nAverage Fit Time: {avg_fit_time:.4f} seconds, Standard Deviation: {std_fit_time:.4f} seconds")
print(f"Average Generate Time: {avg_generate_time:.4f} seconds, Standard Deviation: {std_generate_time:.4f} seconds")


[2024-04-19T17:08:59.949542+0000][4342][CRITICAL] module disabled: /usr/local/lib/python3.10/dist-packages/synthcity/plugins/generic/plugin_goggle.py
  4%|▍         | 449/10000 [06:23<2:16:01,  1.17it/s]




[2024-04-19T17:21:29.441544+0000][4342][CRITICAL] module disabled: /usr/local/lib/python3.10/dist-packages/synthcity/plugins/generic/plugin_goggle.py
  6%|▋         | 649/10000 [09:25<2:15:43,  1.15it/s]




[2024-04-19T17:36:51.338652+0000][4342][CRITICAL] module disabled: /usr/local/lib/python3.10/dist-packages/synthcity/plugins/generic/plugin_goggle.py
  6%|▋         | 649/10000 [09:28<2:16:34,  1.14it/s]




[2024-04-19T17:52:24.763239+0000][4342][CRITICAL] module disabled: /usr/local/lib/python3.10/dist-packages/synthcity/plugins/generic/plugin_goggle.py
  6%|▋         | 649/10000 [09:35<2:18:19,  1.13it/s]




[2024-04-19T18:08:22.546896+0000][4342][CRITICAL] module disabled: /usr/local/lib/python3.10/dist-packages/synthcity/plugins/generic/plugin_goggle.py
  6%|▋         | 649/10000 [09:30<2:17:02,  1.14it/s]



Average Fit Time: 535.8305 seconds, Standard Deviation: 74.3194 seconds
Average Generate Time: 0.1022 seconds, Standard Deviation: 0.0177 seconds


In [7]:
result_df

Unnamed: 0,min,min.1,min.2,min.3,min.4,Mean,Std
stats.jensenshannon_dist.marginal,0.004688,0.003886,0.004009,0.004029,0.003817,0.004086,0.0003
stats.chi_squared_test.marginal,0.537957,0.538145,0.538435,0.53756,0.537794,0.537978,0.0003
stats.inv_kl_divergence.marginal,0.934386,0.940985,0.940426,0.93263,0.933125,0.936311,0.0036
stats.ks_test.marginal,0.845301,0.890218,0.889294,0.88919,0.890242,0.880849,0.0178
stats.max_mean_discrepancy.joint,0.000262,0.000261,0.00026,0.00026,0.00026,0.000261,0.0
stats.wasserstein_dist.joint,0.021154,0.017659,0.017918,0.018215,0.018071,0.018603,0.0013
stats.prdc.precision,0.93025,0.935332,0.935713,0.93584,0.937746,0.934976,0.0025
stats.prdc.recall,0.979672,0.984627,0.982594,0.986279,0.986787,0.983992,0.0026
stats.prdc.density,0.817939,0.838979,0.837251,0.833083,0.827239,0.830898,0.0076
stats.prdc.coverage,0.909414,0.918816,0.917037,0.91907,0.912591,0.915386,0.0038


In [8]:
p_values_df

Unnamed: 0,age,kappa,lambda,creatinine,sex,sample.yr,flc.grp,mgus,chapter
0,0.4891011,3.79238e-07,5.813786e-05,0.0,0.787067,0.766169,0.280883,1.0,0.995529
1,8.631560999999999e-19,1.463139e-19,1.524861e-10,3.229452e-22,0.149461,0.555853,0.680616,1.0,0.702513
2,1.430575e-18,7.223727e-20,3.550099e-10,5.367928e-21,0.542154,0.279605,0.696338,0.444722,0.952673
3,8.353545e-19,1.3106899999999999e-19,2.455677e-10,2.370231e-22,0.529829,0.290268,0.531453,0.269809,0.999321
4,3.9125989999999994e-19,5.0438699999999997e-20,1.960701e-10,1.125723e-22,0.264611,0.341035,0.199776,0.482488,0.785948


# AIDS

In [None]:
dataset="aids"

metrics_list = []
fit_times = []
generate_times = []
p_values_list = []

for i in range(5):
    df = pd.read_csv('/content/drive/MyDrive/Datasets/aids.csv')
    df = df.drop('Unnamed: 0', axis=1)
    df = df[df['duration'] != 0]

    time = df['duration'].to_list()
    event = df['event'].to_list()

    time_event_encoded = sinusoidal_embedding(time, event, 4)
    X = df.drop(['duration', 'event'], axis=1)
    y = time_event_encoded

    syn_model = Plugins().get(plugin_name)

    # Measure the execution time of the fit function
    start = timer()
    syn_model.fit(X, cond=y)
    fit_time = timer() - start
    fit_times.append(fit_time)

    survival_df=df
    event_0_data = survival_df[survival_df['event'] == 0]['duration']
    event_1_data = survival_df[survival_df['event'] == 1]['duration']

    random_state = i + 1
    np.random.seed(random_state)

    sample_size_0 = len(event_0_data)
    sample_size_1= len(event_1_data)
    sample_event_0 = np.random.choice(event_0_data, size=sample_size_0)
    sample_event_1 = np.random.choice(event_1_data, size=sample_size_1)

    z=np.concatenate([sample_event_0,sample_event_1])
    x=np.zeros(len(sample_event_0))
    y=np.ones(len(sample_event_1))
    p=np.concatenate([x,y])

    time_event_gen_encoded=sinusoidal_embedding(z,p,4)

    # Measure the execution time of the generate function
    start = timer()
    X_gen = syn_model.generate(count=len(df), cond=np.array(time_event_gen_encoded)).dataframe()
    generate_time = timer() - start
    generate_times.append(generate_time)

    X_gen['duration'] = z
    X_gen['event'] = p
    X_gen = X_gen.sample(frac=1).reset_index(drop=True)

    # Save X_gen as a CSV file
    X_gen.to_csv(f"/content/drive/MyDrive/Nips/{dataset}_{plugin_name}_iteration_{i+1}.csv", index=False)

    loader1 = SurvivalAnalysisDataLoader(df, target_column="event", time_to_event_column="duration")
    loader2 = SurvivalAnalysisDataLoader(X_gen, target_column="event", time_to_event_column="duration")

    met_df = Metrics.evaluate(X_gt=loader1, X_syn=loader2, task_type='survival_analysis', metrics={
        'stats': ['jensenshannon_dist', 'chi_squared_test', 'feature_corr', 'inv_kl_divergence', 'ks_test',
                 'max_mean_discrepancy', 'wasserstein_dist', 'prdc', 'alpha_precision', 'survival_km_distance'],
        'performance': ['linear_model', 'mlp', 'xgb', 'feat_rank_distance']
    }, use_cache=False, random_state=random_state)

    met_df = met_df.iloc[:, 0]
    metrics_list.append(met_df)

    # Calculate p-values
    p_values_continuous, p_values_discrete = compare_distributions(df, X_gen)
    continuous_column_names = list(p_values_continuous.keys())
    discrete_column_names = list(p_values_discrete.keys())

    p_val = np.concatenate([list(p_values_continuous.values()), list(p_values_discrete.values())])
    p_values_list.append(p_val)

    workspace_dir = os.path.join(os.getcwd(), 'workspace')
    if os.path.exists(workspace_dir):
        shutil.rmtree(workspace_dir)

result_df = pd.concat(metrics_list, axis=1)

# Calculate the row-wise mean and standard deviation of the metrics
result_df['Mean'] = result_df.mean(axis=1)
result_df['Std'] = result_df.std(axis=1)
result_df['Std'] = result_df['Std'].round(4)

# Create DataFrame for p-values
p_values_df = pd.DataFrame(p_values_list, columns=continuous_column_names + discrete_column_names)

# Save result_df and p_values_df as CSV files
result_df.to_csv(f"/content/drive/MyDrive/Nips/{dataset}_{plugin_name}_result_df.csv")
p_values_df.to_csv(f"/content/drive/MyDrive/Nips/{dataset}_{plugin_name}_p_values_df.csv")

avg_fit_time = np.mean(fit_times)
avg_generate_time = np.mean(generate_times)
std_fit_time = np.std(fit_times)
std_generate_time = np.std(generate_times)

print(f"\nAverage Fit Time: {avg_fit_time:.4f} seconds, Standard Deviation: {std_fit_time:.4f} seconds")
print(f"Average Generate Time: {avg_generate_time:.4f} seconds, Standard Deviation: {std_generate_time:.4f} seconds")


[2024-04-13T10:55:18.138631+0000][10941][CRITICAL] module disabled: /usr/local/lib/python3.10/dist-packages/synthcity/plugins/generic/plugin_goggle.py
  8%|▊         | 799/10000 [02:03<23:39,  6.48it/s]
[2024-04-13T10:58:27.487368+0000][10941][CRITICAL] module disabled: /usr/local/lib/python3.10/dist-packages/synthcity/plugins/generic/plugin_goggle.py
 13%|█▎        | 1349/10000 [03:28<22:20,  6.45it/s]
[2024-04-13T11:02:58.858449+0000][10941][CRITICAL] module disabled: /usr/local/lib/python3.10/dist-packages/synthcity/plugins/generic/plugin_goggle.py
 13%|█▎        | 1349/10000 [03:28<22:16,  6.47it/s]
[2024-04-13T11:07:28.835999+0000][10941][CRITICAL] module disabled: /usr/local/lib/python3.10/dist-packages/synthcity/plugins/generic/plugin_goggle.py
 13%|█▎        | 1349/10000 [03:30<22:26,  6.42it/s]
[2024-04-13T11:12:04.000979+0000][10941][CRITICAL] module disabled: /usr/local/lib/python3.10/dist-packages/synthcity/plugins/generic/plugin_goggle.py
 13%|█▎        | 1349/10000 [03:30


Average Fit Time: 192.6425 seconds, Standard Deviation: 34.4537 seconds
Average Generate Time: 0.0755 seconds, Standard Deviation: 0.0117 seconds


In [None]:
result_df

Unnamed: 0,min,min.1,min.2,min.3,min.4,Mean,Std
stats.jensenshannon_dist.marginal,0.013653,0.005102,0.004803,0.004898,0.004771,0.006646,0.0035
stats.chi_squared_test.marginal,0.896167,0.763337,0.764112,0.763607,0.765464,0.790537,0.0528
stats.inv_kl_divergence.marginal,0.920458,0.954513,0.964672,0.954898,0.954389,0.949786,0.0152
stats.ks_test.marginal,0.888325,0.950678,0.951748,0.951413,0.951614,0.938756,0.0252
stats.max_mean_discrepancy.joint,0.001738,0.001738,0.001738,0.001738,0.001738,0.001738,0.0
stats.wasserstein_dist.joint,30623.065304,30296.487125,29862.941744,29804.092424,30988.26477,30314.970273,450.3145
stats.prdc.precision,0.046916,0.033884,0.042572,0.035621,0.033884,0.038575,0.0053
stats.prdc.recall,0.026064,0.02172,0.023458,0.026064,0.023458,0.024153,0.0017
stats.prdc.density,0.014944,0.01755,0.018766,0.018766,0.014944,0.016994,0.0017
stats.prdc.coverage,0.020851,0.02172,0.019983,0.022589,0.02172,0.021373,0.0009


In [None]:
p_values_df

Unnamed: 0,age,cd4,priorzdv,hemophil,ivdrug,karnof,raceth,sex,strat2,tx,txgrp
0,2.3421759999999996e-50,1.0933450000000001e-33,1.061062e-28,1.0,0.091138,0.757849,0.090484,0.555921,0.810878,0.0016,0.138806
1,6.065534000000001e-17,0.02285822,3.991644e-12,0.07976,0.805572,0.824252,0.334753,0.802355,0.309324,0.617674,0.811564
2,4.157416e-16,0.02950714,2.119103e-11,1.0,0.279162,0.136999,0.618646,0.95571,0.297512,0.658882,0.568105
3,1.962862e-16,0.03689727,2.71032e-12,0.986282,0.81151,0.935954,0.00042,0.54822,0.540078,0.302412,0.530971
4,1.917176e-16,0.01712946,3.930109e-12,0.453138,0.731768,0.188995,0.652758,0.130194,1.0,0.461682,0.725945


# Metabric

In [None]:
dataset="metabric"

metrics_list = []
fit_times = []
generate_times = []
p_values_list = []

for i in range(5):
    df = datasets.metabric.read_df()
    df = df[df['duration'] != 0]

    time = df['duration'].to_list()
    event = df['event'].to_list()

    time_event_encoded = sinusoidal_embedding(time, event, 4)
    X = df.drop(['duration', 'event'], axis=1)
    y = time_event_encoded

    syn_model = Plugins().get(plugin_name)

    # Measure the execution time of the fit function
    start = timer()
    syn_model.fit(X, cond=y)
    fit_time = timer() - start
    fit_times.append(fit_time)

    survival_df=df
    event_0_data = survival_df[survival_df['event'] == 0]['duration']
    event_1_data = survival_df[survival_df['event'] == 1]['duration']

    random_state = i + 1
    np.random.seed(random_state)

    sample_size_0 = len(event_0_data)
    sample_size_1= len(event_1_data)
    sample_event_0 = np.random.choice(event_0_data, size=sample_size_0)
    sample_event_1 = np.random.choice(event_1_data, size=sample_size_1)

    z=np.concatenate([sample_event_0,sample_event_1])
    x=np.zeros(len(sample_event_0))
    y=np.ones(len(sample_event_1))
    p=np.concatenate([x,y])

    time_event_gen_encoded=sinusoidal_embedding(z,p,4)

    # Measure the execution time of the generate function
    start = timer()
    X_gen = syn_model.generate(count=len(df), cond=np.array(time_event_gen_encoded)).dataframe()
    generate_time = timer() - start
    generate_times.append(generate_time)

    X_gen['duration'] = z
    X_gen['event'] = p
    X_gen = X_gen.sample(frac=1).reset_index(drop=True)

    # Save X_gen as a CSV file
    X_gen.to_csv(f"/content/drive/MyDrive/Nips/{dataset}_{plugin_name}_iteration_{i+1}.csv", index=False)

    loader1 = SurvivalAnalysisDataLoader(df, target_column="event", time_to_event_column="duration")
    loader2 = SurvivalAnalysisDataLoader(X_gen, target_column="event", time_to_event_column="duration")

    met_df = Metrics.evaluate(X_gt=loader1, X_syn=loader2, task_type='survival_analysis', metrics={
        'stats': ['jensenshannon_dist', 'chi_squared_test', 'feature_corr', 'inv_kl_divergence', 'ks_test',
                 'max_mean_discrepancy', 'wasserstein_dist', 'prdc', 'alpha_precision', 'survival_km_distance'],
        'performance': ['linear_model', 'mlp', 'xgb', 'feat_rank_distance']
    }, use_cache=False, random_state=random_state)

    met_df = met_df.iloc[:, 0]
    metrics_list.append(met_df)

    # Calculate p-values
    p_values_continuous, p_values_discrete = compare_distributions(df, X_gen)
    continuous_column_names = list(p_values_continuous.keys())
    discrete_column_names = list(p_values_discrete.keys())

    p_val = np.concatenate([list(p_values_continuous.values()), list(p_values_discrete.values())])
    p_values_list.append(p_val)

    workspace_dir = os.path.join(os.getcwd(), 'workspace')
    if os.path.exists(workspace_dir):
        shutil.rmtree(workspace_dir)

result_df = pd.concat(metrics_list, axis=1)

# Calculate the row-wise mean and standard deviation of the metrics
result_df['Mean'] = result_df.mean(axis=1)
result_df['Std'] = result_df.std(axis=1)
result_df['Std'] = result_df['Std'].round(4)

# Create DataFrame for p-values
p_values_df = pd.DataFrame(p_values_list, columns=continuous_column_names + discrete_column_names)

# Save result_df and p_values_df as CSV files
result_df.to_csv(f"/content/drive/MyDrive/Nips/{dataset}_{plugin_name}_result_df.csv")
p_values_df.to_csv(f"/content/drive/MyDrive/Nips/{dataset}_{plugin_name}_p_values_df.csv")

avg_fit_time = np.mean(fit_times)
avg_generate_time = np.mean(generate_times)
std_fit_time = np.std(fit_times)
std_generate_time = np.std(generate_times)

print(f"\nAverage Fit Time: {avg_fit_time:.4f} seconds, Standard Deviation: {std_fit_time:.4f} seconds")
print(f"Average Generate Time: {avg_generate_time:.4f} seconds, Standard Deviation: {std_generate_time:.4f} seconds")


[2024-04-13T11:16:35.993650+0000][10941][CRITICAL] module disabled: /usr/local/lib/python3.10/dist-packages/synthcity/plugins/generic/plugin_goggle.py
  5%|▍         | 499/10000 [01:59<37:47,  4.19it/s]
[2024-04-13T11:20:20.957092+0000][10941][CRITICAL] module disabled: /usr/local/lib/python3.10/dist-packages/synthcity/plugins/generic/plugin_goggle.py
  5%|▍         | 499/10000 [01:57<37:17,  4.25it/s]
[2024-04-13T11:24:03.347230+0000][10941][CRITICAL] module disabled: /usr/local/lib/python3.10/dist-packages/synthcity/plugins/generic/plugin_goggle.py
  5%|▍         | 499/10000 [01:58<37:43,  4.20it/s]
[2024-04-13T11:27:47.033401+0000][10941][CRITICAL] module disabled: /usr/local/lib/python3.10/dist-packages/synthcity/plugins/generic/plugin_goggle.py
  5%|▍         | 499/10000 [01:57<37:15,  4.25it/s]
[2024-04-13T11:31:28.501165+0000][10941][CRITICAL] module disabled: /usr/local/lib/python3.10/dist-packages/synthcity/plugins/generic/plugin_goggle.py
  5%|▍         | 499/10000 [01:57<37:


Average Fit Time: 119.6635 seconds, Standard Deviation: 1.1724 seconds
Average Generate Time: 0.0848 seconds, Standard Deviation: 0.0169 seconds


In [None]:
result_df

Unnamed: 0,min,min.1,min.2,min.3,min.4,Mean,Std
stats.jensenshannon_dist.marginal,0.009024,0.0094,0.009486,0.009408,0.009216,0.009307,0.0002
stats.chi_squared_test.marginal,0.528164,0.526284,0.527028,0.526889,0.527668,0.527206,0.0007
stats.inv_kl_divergence.marginal,0.901807,0.91984,0.900446,0.90094,0.901215,0.90485,0.0075
stats.ks_test.marginal,0.911241,0.909473,0.908613,0.909903,0.90976,0.909798,0.0008
stats.max_mean_discrepancy.joint,0.001058,0.001058,0.001057,0.001057,0.001055,0.001057,0.0
stats.wasserstein_dist.joint,0.037183,0.03814,0.037313,0.039089,0.035725,0.03749,0.0011
stats.prdc.precision,0.982133,0.9732,0.980032,0.977404,0.974777,0.977509,0.0033
stats.prdc.recall,0.955334,0.950079,0.955859,0.953757,0.951655,0.953337,0.0022
stats.prdc.density,1.034157,1.020179,1.023752,1.023121,1.038045,1.027851,0.007
stats.prdc.coverage,0.937467,0.938518,0.944298,0.932738,0.944824,0.939569,0.0045


In [None]:
p_values_df

Unnamed: 0,x0,x1,x2,x3,x8,x4,x5,x6,x7
0,5.02635e-21,4.152874e-28,4.48072e-59,6.812081e-07,0.002581,0.014348,0.414211,0.069509,0.679073
1,5.284639e-22,1.563838e-28,1.329606e-61,9.280435e-07,0.0039,0.60514,0.520481,0.321161,0.21624
2,2.8588180000000004e-23,2.792157e-27,5.843211e-59,2.456029e-07,0.002919,0.312167,0.341493,0.550498,0.200267
3,7.748153000000001e-23,8.527337e-29,6.511547e-60,1.364957e-07,0.006737,0.457226,0.751086,0.477849,0.880831
4,5.427032e-22,7.772052e-29,1.984114e-61,1.155903e-07,0.001746,0.451331,0.516995,0.481445,0.075868


# GBSG

In [None]:
dataset="gbsg"

metrics_list = []
fit_times = []
generate_times = []
p_values_list = []

for i in range(5):
    df = datasets.gbsg.read_df()
    df = df[df['duration'] != 0]

    time = df['duration'].to_list()
    event = df['event'].to_list()

    time_event_encoded = sinusoidal_embedding(time, event, 4)
    X = df.drop(['duration', 'event'], axis=1)
    y = time_event_encoded

    syn_model = Plugins().get(plugin_name)

    # Measure the execution time of the fit function
    start = timer()
    syn_model.fit(X, cond=y)
    fit_time = timer() - start
    fit_times.append(fit_time)

    survival_df=df
    event_0_data = survival_df[survival_df['event'] == 0]['duration']
    event_1_data = survival_df[survival_df['event'] == 1]['duration']

    random_state = i + 1
    np.random.seed(random_state)

    sample_size_0 = len(event_0_data)
    sample_size_1= len(event_1_data)
    sample_event_0 = np.random.choice(event_0_data, size=sample_size_0)
    sample_event_1 = np.random.choice(event_1_data, size=sample_size_1)

    z=np.concatenate([sample_event_0,sample_event_1])
    x=np.zeros(len(sample_event_0))
    y=np.ones(len(sample_event_1))
    p=np.concatenate([x,y])

    time_event_gen_encoded=sinusoidal_embedding(z,p,4)

    # Measure the execution time of the generate function
    start = timer()
    X_gen = syn_model.generate(count=len(df), cond=np.array(time_event_gen_encoded)).dataframe()
    generate_time = timer() - start
    generate_times.append(generate_time)

    X_gen['duration'] = z
    X_gen['event'] = p
    X_gen = X_gen.sample(frac=1).reset_index(drop=True)

    # Save X_gen as a CSV file
    X_gen.to_csv(f"/content/drive/MyDrive/Nips/{dataset}_{plugin_name}_iteration_{i+1}.csv", index=False)

    loader1 = SurvivalAnalysisDataLoader(df, target_column="event", time_to_event_column="duration")
    loader2 = SurvivalAnalysisDataLoader(X_gen, target_column="event", time_to_event_column="duration")

    met_df = Metrics.evaluate(X_gt=loader1, X_syn=loader2, task_type='survival_analysis', metrics={
        'stats': ['jensenshannon_dist', 'chi_squared_test', 'feature_corr', 'inv_kl_divergence', 'ks_test',
                 'max_mean_discrepancy', 'wasserstein_dist', 'prdc', 'alpha_precision', 'survival_km_distance'],
        'performance': ['linear_model', 'mlp', 'xgb', 'feat_rank_distance']
    }, use_cache=False, random_state=random_state)

    met_df = met_df.iloc[:, 0]
    metrics_list.append(met_df)

    # Calculate p-values
    p_values_continuous, p_values_discrete = compare_distributions(df, X_gen)
    continuous_column_names = list(p_values_continuous.keys())
    discrete_column_names = list(p_values_discrete.keys())

    p_val = np.concatenate([list(p_values_continuous.values()), list(p_values_discrete.values())])
    p_values_list.append(p_val)

    workspace_dir = os.path.join(os.getcwd(), 'workspace')
    if os.path.exists(workspace_dir):
        shutil.rmtree(workspace_dir)

result_df = pd.concat(metrics_list, axis=1)

# Calculate the row-wise mean and standard deviation of the metrics
result_df['Mean'] = result_df.mean(axis=1)
result_df['Std'] = result_df.std(axis=1)
result_df['Std'] = result_df['Std'].round(4)

# Create DataFrame for p-values
p_values_df = pd.DataFrame(p_values_list, columns=continuous_column_names + discrete_column_names)

# Save result_df and p_values_df as CSV files
result_df.to_csv(f"/content/drive/MyDrive/Nips/{dataset}_{plugin_name}_result_df.csv")
p_values_df.to_csv(f"/content/drive/MyDrive/Nips/{dataset}_{plugin_name}_p_values_df.csv")

avg_fit_time = np.mean(fit_times)
avg_generate_time = np.mean(generate_times)
std_fit_time = np.std(fit_times)
std_generate_time = np.std(generate_times)

print(f"\nAverage Fit Time: {avg_fit_time:.4f} seconds, Standard Deviation: {std_fit_time:.4f} seconds")
print(f"Average Generate Time: {avg_generate_time:.4f} seconds, Standard Deviation: {std_generate_time:.4f} seconds")


Dataset 'gbsg' not locally available. Downloading...


[2024-04-13T11:35:10.541460+0000][10941][CRITICAL] module disabled: /usr/local/lib/python3.10/dist-packages/synthcity/plugins/generic/plugin_goggle.py


Done


  8%|▊         | 799/10000 [03:24<39:11,  3.91it/s]
[2024-04-13T11:40:35.986062+0000][10941][CRITICAL] module disabled: /usr/local/lib/python3.10/dist-packages/synthcity/plugins/generic/plugin_goggle.py
  8%|▊         | 799/10000 [03:24<39:18,  3.90it/s]
[2024-04-13T11:46:01.323412+0000][10941][CRITICAL] module disabled: /usr/local/lib/python3.10/dist-packages/synthcity/plugins/generic/plugin_goggle.py
  8%|▊         | 799/10000 [03:23<39:06,  3.92it/s]
[2024-04-13T11:51:27.839683+0000][10941][CRITICAL] module disabled: /usr/local/lib/python3.10/dist-packages/synthcity/plugins/generic/plugin_goggle.py
  8%|▊         | 799/10000 [03:24<39:12,  3.91it/s]
[2024-04-13T11:56:54.208536+0000][10941][CRITICAL] module disabled: /usr/local/lib/python3.10/dist-packages/synthcity/plugins/generic/plugin_goggle.py
  8%|▊         | 799/10000 [03:18<38:09,  4.02it/s]



Average Fit Time: 204.5699 seconds, Standard Deviation: 2.3434 seconds
Average Generate Time: 0.0656 seconds, Standard Deviation: 0.0021 seconds


In [None]:
result_df

Unnamed: 0,min,min.1,min.2,min.3,min.4,Mean,Std
stats.jensenshannon_dist.marginal,0.004535,0.004268,0.004853,0.004887,0.005041,0.004717,0.0003
stats.chi_squared_test.marginal,0.547532,0.548162,0.545348,0.54523,0.545562,0.546367,0.0012
stats.inv_kl_divergence.marginal,0.970671,0.968456,0.968307,0.968866,0.968547,0.968969,0.0009
stats.ks_test.marginal,0.918807,0.920002,0.915771,0.915074,0.916219,0.917174,0.0019
stats.max_mean_discrepancy.joint,0.000898,0.000898,0.000899,0.000899,0.000898,0.000898,0.0
stats.wasserstein_dist.joint,0.012268,0.011344,0.012269,0.013077,0.01281,0.012354,0.0006
stats.prdc.precision,0.987455,0.990143,0.985663,0.985215,0.988351,0.987366,0.0018
stats.prdc.recall,0.935484,0.935484,0.928763,0.948477,0.926075,0.934857,0.0078
stats.prdc.density,0.97276,0.981989,0.970609,0.960484,0.96595,0.970358,0.0072
stats.prdc.coverage,0.8069,0.806004,0.793907,0.8069,0.802867,0.803315,0.0049


In [None]:
p_values_df

Unnamed: 0,x3,x4,x5,x6,x0,x1,x2
0,2.6572280000000002e-18,3.220611e-11,0.371267,0.000201,0.948878,0.190918,0.628832
1,3.574633e-17,1.966961e-12,0.251958,2.7e-05,0.473448,0.226338,0.537244
2,9.016398e-20,7.138194e-10,0.064474,0.0007,0.892067,0.927489,0.921723
3,9.322572e-22,1.214655e-09,0.045201,0.001576,0.670486,0.033556,1.0
4,1.509337e-20,3.690529e-10,0.279565,0.0008,1.0,0.242953,0.233955


# SUPPORT

In [None]:
dataset="support"

metrics_list = []
fit_times = []
generate_times = []
p_values_list = []

for i in range(5):
    df = datasets.support.read_df()
    df = df[df['duration'] != 0]

    time = df['duration'].to_list()
    event = df['event'].to_list()

    time_event_encoded = sinusoidal_embedding(time, event, 4)
    X = df.drop(['duration', 'event'], axis=1)
    y = time_event_encoded

    syn_model = Plugins().get(plugin_name)

    # Measure the execution time of the fit function
    start = timer()
    syn_model.fit(X, cond=y)
    fit_time = timer() - start
    fit_times.append(fit_time)

    survival_df=df
    event_0_data = survival_df[survival_df['event'] == 0]['duration']
    event_1_data = survival_df[survival_df['event'] == 1]['duration']

    random_state = i + 1
    np.random.seed(random_state)

    sample_size_0 = len(event_0_data)
    sample_size_1= len(event_1_data)
    sample_event_0 = np.random.choice(event_0_data, size=sample_size_0)
    sample_event_1 = np.random.choice(event_1_data, size=sample_size_1)

    z=np.concatenate([sample_event_0,sample_event_1])
    x=np.zeros(len(sample_event_0))
    y=np.ones(len(sample_event_1))
    p=np.concatenate([x,y])

    time_event_gen_encoded=sinusoidal_embedding(z,p,4)

    # Measure the execution time of the generate function
    start = timer()
    X_gen = syn_model.generate(count=len(df), cond=np.array(time_event_gen_encoded)).dataframe()
    generate_time = timer() - start
    generate_times.append(generate_time)

    X_gen['duration'] = z
    X_gen['event'] = p
    X_gen = X_gen.sample(frac=1).reset_index(drop=True)

    # Save X_gen as a CSV file
    X_gen.to_csv(f"/content/drive/MyDrive/Nips/{dataset}_{plugin_name}_iteration_{i+1}.csv", index=False)

    loader1 = SurvivalAnalysisDataLoader(df, target_column="event", time_to_event_column="duration")
    loader2 = SurvivalAnalysisDataLoader(X_gen, target_column="event", time_to_event_column="duration")

    met_df = Metrics.evaluate(X_gt=loader1, X_syn=loader2, task_type='survival_analysis', metrics={
        'stats': ['jensenshannon_dist', 'chi_squared_test', 'feature_corr', 'inv_kl_divergence', 'ks_test',
                 'max_mean_discrepancy', 'wasserstein_dist', 'prdc', 'alpha_precision', 'survival_km_distance'],
        'performance': ['linear_model', 'mlp', 'xgb', 'feat_rank_distance']
    }, use_cache=False, random_state=random_state)

    met_df = met_df.iloc[:, 0]
    metrics_list.append(met_df)

    # Calculate p-values
    p_values_continuous, p_values_discrete = compare_distributions(df, X_gen)
    continuous_column_names = list(p_values_continuous.keys())
    discrete_column_names = list(p_values_discrete.keys())

    p_val = np.concatenate([list(p_values_continuous.values()), list(p_values_discrete.values())])
    p_values_list.append(p_val)

    workspace_dir = os.path.join(os.getcwd(), 'workspace')
    if os.path.exists(workspace_dir):
        shutil.rmtree(workspace_dir)

result_df = pd.concat(metrics_list, axis=1)

# Calculate the row-wise mean and standard deviation of the metrics
result_df['Mean'] = result_df.mean(axis=1)
result_df['Std'] = result_df.std(axis=1)
result_df['Std'] = result_df['Std'].round(4)

# Create DataFrame for p-values
p_values_df = pd.DataFrame(p_values_list, columns=continuous_column_names + discrete_column_names)

# Save result_df and p_values_df as CSV files
result_df.to_csv(f"/content/drive/MyDrive/Nips/{dataset}_{plugin_name}_result_df.csv")
p_values_df.to_csv(f"/content/drive/MyDrive/Nips/{dataset}_{plugin_name}_p_values_df.csv")

avg_fit_time = np.mean(fit_times)
avg_generate_time = np.mean(generate_times)
std_fit_time = np.std(fit_times)
std_generate_time = np.std(generate_times)

print(f"\nAverage Fit Time: {avg_fit_time:.4f} seconds, Standard Deviation: {std_fit_time:.4f} seconds")
print(f"Average Generate Time: {avg_generate_time:.4f} seconds, Standard Deviation: {std_generate_time:.4f} seconds")


[2024-04-13T19:59:01.838541+0000][2455][CRITICAL] module disabled: /usr/local/lib/python3.10/dist-packages/synthcity/plugins/generic/plugin_goggle.py
  4%|▍         | 449/10000 [08:27<3:00:04,  1.13s/it]


[KeOps] Generating code for Max_SumShiftExpWeight_Reduction reduction (with parameters 0) of formula [c-1/2*(d*Sum((a-b)**2)),1] with a=Var(0,16,0), b=Var(1,16,1), c=Var(2,1,1), d=Var(3,1,2) ... OK


[2024-04-13T20:12:02.827148+0000][2455][CRITICAL] module disabled: /usr/local/lib/python3.10/dist-packages/synthcity/plugins/generic/plugin_goggle.py
  4%|▍         | 449/10000 [08:44<3:06:04,  1.17s/it]




[2024-04-13T20:25:01.322092+0000][2455][CRITICAL] module disabled: /usr/local/lib/python3.10/dist-packages/synthcity/plugins/generic/plugin_goggle.py
  4%|▍         | 449/10000 [08:23<2:58:25,  1.12s/it]




[2024-04-13T20:37:38.717464+0000][2455][CRITICAL] module disabled: /usr/local/lib/python3.10/dist-packages/synthcity/plugins/generic/plugin_goggle.py
  4%|▍         | 449/10000 [08:27<2:59:47,  1.13s/it]




[2024-04-13T20:50:15.994034+0000][2455][CRITICAL] module disabled: /usr/local/lib/python3.10/dist-packages/synthcity/plugins/generic/plugin_goggle.py
  4%|▍         | 449/10000 [08:23<2:58:39,  1.12s/it]



Average Fit Time: 515.8385 seconds, Standard Deviation: 7.7971 seconds
Average Generate Time: 0.1482 seconds, Standard Deviation: 0.0085 seconds


In [None]:
result_df

Unnamed: 0,min,min.1,min.2,min.3,min.4,Mean,Std
stats.jensenshannon_dist.marginal,0.005621,0.005646,0.005763,0.005676,0.005699,0.005681,0.0
stats.chi_squared_test.marginal,0.551496,0.551944,0.551635,0.551796,0.551653,0.551705,0.0002
stats.inv_kl_divergence.marginal,0.973562,0.973136,0.973179,0.972602,0.97361,0.973218,0.0004
stats.ks_test.marginal,0.923257,0.922581,0.922954,0.922785,0.923039,0.922923,0.0002
stats.max_mean_discrepancy.joint,0.000225,0.000225,0.000225,0.000225,0.000225,0.000225,0.0
stats.wasserstein_dist.joint,0.036354,0.036219,0.036277,0.036333,0.036517,0.03634,0.0001
stats.prdc.precision,0.960329,0.963259,0.963372,0.961569,0.964612,0.962628,0.0015
stats.prdc.recall,0.922011,0.920207,0.925166,0.923588,0.920884,0.922371,0.0018
stats.prdc.density,0.944055,0.959089,0.954085,0.96238,0.953364,0.954595,0.0062
stats.prdc.coverage,0.912994,0.916263,0.91739,0.917615,0.917052,0.916263,0.0017


In [None]:
p_values_df

Unnamed: 0,x0,x7,x8,x9,x10,x11,x12,x13,x1,x2,x3,x4,x5,x6
0,1.439564e-15,1.2137330000000001e-27,1.071006e-40,5.348859e-89,6.5e-05,4.627836e-20,9.643612e-201,1.093995e-65,0.91693,0.005193,0.082834,0.989267,0.336832,0.790861
1,1.890747e-16,1.5290399999999999e-26,2.47502e-40,4.367822e-90,6.7e-05,5.925394e-19,2.025548e-202,7.610552e-65,0.791561,0.305458,0.25418,0.345976,0.001317,0.755085
2,5.051416e-15,1.167503e-27,9.58059e-40,4.5959689999999996e-89,0.000109,2.043668e-19,2.089515e-200,5.640070999999999e-65,0.04598,0.564496,0.805058,0.187148,0.575016,0.453582
3,2.135626e-16,1.08182e-27,7.002855e-39,1.185893e-89,5.9e-05,1.7180589999999998e-19,9.772603999999999e-201,1.576937e-66,0.496314,0.869126,0.778002,0.151704,0.381916,0.536724
4,5.37045e-16,1.017775e-25,2.488546e-38,2.0575529999999997e-87,5.5e-05,4.838877e-19,8.39765e-202,8.502807e-65,0.511868,0.372019,0.95803,0.124511,1.0,0.316912


# FLCHAIN

In [None]:
dataset="flchain"

metrics_list = []
fit_times = []
generate_times = []
p_values_list = []

for i in range(5):
    df=pd.read_csv('/content/drive/MyDrive/Datasets/flchain.csv')
    df = df.drop('Unnamed: 0', axis=1)
    df = df[df['duration'] != 0]
    df=df.dropna()

    time = df['duration'].to_list()
    event = df['event'].to_list()

    time_event_encoded = sinusoidal_embedding(time, event, 4)
    X = df.drop(['duration', 'event'], axis=1)
    y = time_event_encoded

    syn_model = Plugins().get(plugin_name)

    # Measure the execution time of the fit function
    start = timer()
    syn_model.fit(X, cond=y)
    fit_time = timer() - start
    fit_times.append(fit_time)

    survival_df=df
    event_0_data = survival_df[survival_df['event'] == 0]['duration']
    event_1_data = survival_df[survival_df['event'] == 1]['duration']

    random_state = i + 1
    np.random.seed(random_state)

    sample_size_0 = len(event_0_data)
    sample_size_1= len(event_1_data)
    sample_event_0 = np.random.choice(event_0_data, size=sample_size_0)
    sample_event_1 = np.random.choice(event_1_data, size=sample_size_1)

    z=np.concatenate([sample_event_0,sample_event_1])
    x=np.zeros(len(sample_event_0))
    y=np.ones(len(sample_event_1))
    p=np.concatenate([x,y])

    time_event_gen_encoded=sinusoidal_embedding(z,p,4)

    # Measure the execution time of the generate function
    start = timer()
    X_gen = syn_model.generate(count=len(df), cond=np.array(time_event_gen_encoded)).dataframe()
    generate_time = timer() - start
    generate_times.append(generate_time)

    X_gen['duration'] = z
    X_gen['event'] = p
    X_gen = X_gen.sample(frac=1).reset_index(drop=True)

    # Save X_gen as a CSV file
    X_gen.to_csv(f"/content/drive/MyDrive/Nips/{dataset}_{plugin_name}_iteration_{i+1}.csv", index=False)

    loader1 = SurvivalAnalysisDataLoader(df, target_column="event", time_to_event_column="duration")
    loader2 = SurvivalAnalysisDataLoader(X_gen, target_column="event", time_to_event_column="duration")

    met_df = Metrics.evaluate(X_gt=loader1, X_syn=loader2, task_type='survival_analysis', metrics={
        'stats': ['jensenshannon_dist', 'chi_squared_test', 'feature_corr', 'inv_kl_divergence', 'ks_test',
                 'max_mean_discrepancy', 'wasserstein_dist', 'prdc', 'alpha_precision', 'survival_km_distance'],
        'performance': ['linear_model', 'mlp', 'xgb', 'feat_rank_distance']
    }, use_cache=False, random_state=random_state)

    met_df = met_df.iloc[:, 0]
    metrics_list.append(met_df)

    # Calculate p-values
    p_values_continuous, p_values_discrete = compare_distributions(df, X_gen)
    continuous_column_names = list(p_values_continuous.keys())
    discrete_column_names = list(p_values_discrete.keys())

    p_val = np.concatenate([list(p_values_continuous.values()), list(p_values_discrete.values())])
    p_values_list.append(p_val)

    workspace_dir = os.path.join(os.getcwd(), 'workspace')
    if os.path.exists(workspace_dir):
        shutil.rmtree(workspace_dir)

result_df = pd.concat(metrics_list, axis=1)

# Calculate the row-wise mean and standard deviation of the metrics
result_df['Mean'] = result_df.mean(axis=1)
result_df['Std'] = result_df.std(axis=1)
result_df['Std'] = result_df['Std'].round(4)

# Create DataFrame for p-values
p_values_df = pd.DataFrame(p_values_list, columns=continuous_column_names + discrete_column_names)

# Save result_df and p_values_df as CSV files
result_df.to_csv(f"/content/drive/MyDrive/Nips/{dataset}_{plugin_name}_result_df.csv")
p_values_df.to_csv(f"/content/drive/MyDrive/Nips/{dataset}_{plugin_name}_p_values_df.csv")

avg_fit_time = np.mean(fit_times)
avg_generate_time = np.mean(generate_times)
std_fit_time = np.std(fit_times)
std_generate_time = np.std(generate_times)

print(f"\nAverage Fit Time: {avg_fit_time:.4f} seconds, Standard Deviation: {std_fit_time:.4f} seconds")
print(f"Average Generate Time: {avg_generate_time:.4f} seconds, Standard Deviation: {std_generate_time:.4f} seconds")
