In [1]:
import os
import pandas as pd
import pyreadr
import random
import gower
import sdv.metadata.single_table as m
from synthcity.plugins.core.dataloader import GenericDataLoader
import numpy as np
from synthcity.metrics.eval_detection import DetectionEvaluator, SyntheticDetectionXGB
from synthcity.metrics.eval_privacy import IdentifiabilityScore, kAnonymization,  lDiversityDistinct
from synthcity.metrics.eval_attacks import AttackEvaluator
from sdmetrics.single_table import DCRBaselineProtection
from synthcity.metrics.eval_statistical import  MaximumMeanDiscrepancy, WassersteinDistance, AlphaPrecision, PRDCScore, InverseKLDivergence, JensenShannonDistance

    The default C++ compiler could not be found on your system.
    You need to either define the CXX environment variable or a symlink to the g++ command.
    For example if g++-8 is the command you can do
      import os
      os.environ['CXX'] = 'g++-8'
    


In [2]:
# Single seed shared by every random number generator seeded in this notebook.
SEED = 31125
random.seed(SEED)
# NumPy keeps its own global generator that `random.seed` does not touch,
# so it is seeded explicitly as well.
np.random.seed(SEED)

In [3]:
# Project root that holds the `data` folder and the per-dataset result folders.
# The original machine-specific location remains the default; set the
# SYNTHETIC_DATA_PAPER_ROOT environment variable to run the notebook elsewhere.
project_root = os.environ.get(
    "SYNTHETIC_DATA_PAPER_ROOT",
    "C:/Users/antoi/Downloads/synthetic_data_paper-main/synthetic_data_paper-main",
)
os.chdir(project_root)
current_directory = os.getcwd()

In [4]:
# Dataset to evaluate: "menobalance", "wisconsin", "cleveland" or "simulated_gmcm".
dataset = "menobalance"

# Paths are derived from the working directory (set by the chdir cell) instead
# of from `current_directory`, which this cell reassigns. Re-running the cell
# therefore no longer nests the dataset folder inside itself.
project_dir = os.getcwd()
data_path = os.path.join(project_dir, "data")

# Each branch produces:
#   df               - the real data restricted to the columns under study
#   discrete_columns - column names coerced to numeric in the synthetic files
#   binary_idx       - positional indices of columns treated as binary by the
#                      anisotropic MMD length-scale heuristic
if dataset == "menobalance":
    df = pd.read_csv(os.path.join(data_path, "menobalance_data.csv"))
    df = df.iloc[:, [0, 1, 2, 3, 4, 5, 6]]
    discrete_columns = ["Smoker", "Diabetes"]
    binary_idx = [1, 5]
elif dataset == "wisconsin":
    url = "http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"
    df = pd.read_csv(url, header=None)
    df = df.iloc[:, [14, 26, 28, 29]]
    df.columns = ["14", "26", "28", "29"]
    discrete_columns = []
    binary_idx = []
elif dataset == "cleveland":
    url = "http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
    df = pd.read_csv(url, header=None)
    df = df.iloc[:, [3, 7, 4, 0]]
    df.columns = ["3", "7", "4", "0"]
    discrete_columns = []
    binary_idx = []
elif dataset == "simulated_gmcm":
    # Kept for compatibility; not used by any visible cell.
    simulated_path = "C:/Users/antoi/Documents/UNIBE/Synthetic data generation/results_paper/"
    # os.path.join supplies the separator that plain string concatenation omitted.
    df = pd.read_csv(os.path.join(data_path, "simulated_GMCM_data.csv"))
    df = df.iloc[:, [1, 2]]
    df.columns = ["X1", "X2"]
    discrete_columns = []
    binary_idx = []
else:
    # Fail here rather than with a NameError on `df` several cells later.
    raise ValueError(f"Unknown dataset: {dataset!r}")

# Wrap the real data for the synthcity evaluators.
loader = GenericDataLoader(df)
# Convert to meta table for Python use
metadata = m.SingleTableMetadata()
metadata.detect_from_dataframe(df)
# Folder with the synthetic datasets generated for the selected dataset
# (every dataset's folder carries the dataset's own name).
current_directory = os.path.join(project_dir, dataset)

In [5]:
def median_pairwise_distances(X, binary_idx):
    """
    Computes a vector of median distances per dimension between all pairs of points.

    For every column the absolute differences over all pairs of distinct rows
    (i < j) are collected and their median is returned. Columns listed in
    ``binary_idx`` are not measured and get the fixed value 1.0.

    A column whose median is zero or not finite (for example a constant column
    or one dominated by ties) also falls back to 1.0, the same value used for
    binary columns, so the result can always be used as a divisor. With fewer
    than two rows there are no pairs and every entry is 1.0.

    Parameters:
        X (np.ndarray): shape (n_samples, d), input data.
        binary_idx: collection of column indices to treat as binary.

    Returns:
        np.ndarray: shape (d,), median per-dimension pairwise distances.
    """
    n_samples, d = X.shape
    # 1.0 is the default for binary and degenerate columns.
    medians = np.ones(d)
    # Row indices of every pair (i < j); identical for all columns, so built once.
    first, second = np.triu_indices(n_samples, k=1)
    if first.size == 0:
        return medians
    for k in range(d):
        if k in binary_idx:
            continue
        col = X[:, k]
        # Same values as the upper triangle of the full |col_i - col_j| matrix,
        # without materialising the n x n matrix.
        median_k = np.median(np.abs(col[first] - col[second]))
        if np.isfinite(median_k) and median_k > 0:
            medians[k] = median_k
    return medians

In [6]:
def anisotropic_rbf(X, Y, lengthscales):
    """
    Compute pairwise anisotropic RBF kernel matrix between X and Y.

    Each column is divided by its own length-scale before the squared
    Euclidean distance is taken, and K[i, j] = exp(-0.5 * dist2[i, j]).

    X: (n_x, d), Y: (n_y, d), lengthscales: (d,)
    Returns kernel matrix (n_x, n_y)
    """
    X = X / lengthscales
    Y = Y / lengthscales
    # ||x-y||^2 = ||x||^2 + ||y||^2 - 2x.y
    X2 = np.sum(X**2, 1)[:, None]
    Y2 = np.sum(Y**2, 1)[None, :]
    dist2 = X2 + Y2 - 2 * np.dot(X, Y.T)
    # Round-off in the expansion above can make a (near-)zero distance slightly
    # negative, which would push kernel values above 1; clamp at zero.
    dist2 = np.maximum(dist2, 0.0)
    K = np.exp(-0.5 * dist2)
    return K
    
def anisotropic_kernel(X, Y, bandwidths):
    """
    Gaussian kernel matrix between the rows of X and Y with one bandwidth per column.

    K[i, j] = exp(-sum_k (X[i, k] - Y[j, k])**2 / (2 * bandwidths[k]**2)),
    evaluated by broadcasting over an (n_x, n_y, d) array of differences.

    X: (n_x, d), Y: (n_y, d), bandwidths: (d,)
    Returns kernel matrix (n_x, n_y)
    """
    # Pairwise differences, shape (n_x, n_y, d).
    delta = X[:, np.newaxis, :] - Y[np.newaxis, :, :]
    # Per-dimension scaled squared differences, summed over the last axis.
    exponent = (delta ** 2 / (2 * bandwidths**2)).sum(axis=2)
    return np.exp(-exponent)

def mmd2_unbiased(X, Y, binary_idx, lengthscales=None, unbiased=False):
    """
    Estimate MMD^2 between X (n,d) and Y (m,d) with the anisotropic RBF kernel.

    Despite the function name, the default (``unbiased=False``) is the biased
    V-statistic: the kernel diagonals are kept and the within-sample sums are
    divided by n*n and m*m. This is the behaviour the notebook has always had
    and it is preserved so existing results do not change. Pass
    ``unbiased=True`` to drop the diagonals and divide by n*(n-1) and m*(m-1),
    which gives the unbiased U-statistic.

    Parameters:
        X (np.ndarray): shape (n, d), first sample.
        Y (np.ndarray): shape (m, d), second sample.
        binary_idx: column indices forwarded to ``median_pairwise_distances``
            when ``lengthscales`` is not supplied.
        lengthscales (np.ndarray, optional): shape (d,). When None they are
            computed from X alone with ``median_pairwise_distances``.
        unbiased (bool): select the U-statistic instead of the V-statistic.

    Returns:
        float: the MMD^2 estimate.

    Raises:
        ValueError: if ``unbiased=True`` and either sample has fewer than two rows.
    """
    if lengthscales is None:
        lengthscales = median_pairwise_distances(X, binary_idx)
    n = X.shape[0]
    m = Y.shape[0]

    K_xx = anisotropic_rbf(X, X, lengthscales)
    K_yy = anisotropic_rbf(Y, Y, lengthscales)
    K_xy = anisotropic_rbf(X, Y, lengthscales)

    if unbiased:
        if n < 2 or m < 2:
            raise ValueError("The unbiased MMD^2 estimator needs at least two rows in X and in Y.")
        # Exclude the i == j terms from the within-sample averages.
        xx_term = (K_xx.sum() - np.trace(K_xx)) / (n * (n - 1))
        yy_term = (K_yy.sum() - np.trace(K_yy)) / (m * (m - 1))
    else:
        xx_term = K_xx.sum() / (n * n)
        yy_term = K_yy.sum() / (m * m)

    mmd2 = (xx_term
            + yy_term
            - 2 * K_xy.sum() / (n * m))
    return mmd2

In [7]:
def process_csv_files(folder_path, inputs, loader, discrete_columns, colnames, binary_idx, metadata_dict, real_data=None):
    """
    Score every synthetic dataset stored in ``folder_path`` against the real data.

    Each ``.csv`` or ``.rds`` file in the folder is read as one synthetic
    dataset, its columns are renamed positionally to ``colnames``, and every
    metric named in ``inputs`` is computed for it. Files with any other
    extension are skipped.

    The evaluator objects (``MMD``, ``Wasserstein``, ``IKL``, ``JSD``, ``PRDC``,
    ``kAnonym``, ``lDiversity``, ``xgb_detection``, ``identifiability``) are not
    parameters; they are looked up as module-level names when a metric that
    needs them is requested.

    Parameters:
        folder_path: directory containing the synthetic datasets.
        inputs: iterable of metric names. Supported names: "MMD_aniso", "MMD",
            "Wasserstein", "IKL", "JSD", "P-PR", "P-RE", "density", "coverage",
            "kAnonym", "lDiversity", "xgbDetection", "identifiability",
            "DCRsdv", "DCR".
        loader: data loader wrapping the real data; passed to the evaluators.
        discrete_columns: column names (after renaming) coerced with
            ``pd.to_numeric`` in every synthetic dataset.
        colnames: column names assigned to every synthetic dataset.
        binary_idx: column indices forwarded to ``mmd2_unbiased``.
        metadata_dict: metadata forwarded to
            ``DCRBaselineProtection.compute_breakdown``.
        real_data (pd.DataFrame, optional): the real data used by the metrics
            that work on raw frames ("MMD_aniso", "DCR", "DCRsdv"). When None
            it is taken from ``loader.dataframe()`` and relabelled with
            ``colnames``.

    Returns:
        dict: metric name -> list with one value per processed file, in sorted
        file-name order.

    Raises:
        ValueError: if ``inputs`` contains an unsupported metric name.
    """
    # Metric name -> key of the PRDC result it reads.
    prdc_keys = {"P-PR": "precision", "P-RE": "recall", "density": "density", "coverage": "coverage"}
    supported = set(prdc_keys) | {
        "MMD_aniso", "MMD", "Wasserstein", "IKL", "JSD", "kAnonym", "lDiversity",
        "xgbDetection", "identifiability", "DCRsdv", "DCR",
    }
    # Validate up front so a typo fails before any expensive evaluation and
    # can never silently repeat the previous metric's value.
    unknown = [input_name for input_name in inputs if input_name not in supported]
    if unknown:
        raise ValueError(f"Unsupported metric name(s): {unknown}")

    if real_data is None:
        # Use the data held by the loader rather than a notebook-level global.
        real_data = loader.dataframe().copy()
        real_data.columns = colnames

    dataframes = {input_name: [] for input_name in inputs}
    # Sorted so the order of the collected values does not depend on the file system.
    for file_name in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file_name)
        if file_name.endswith('.csv'):
            df_synthetic = pd.read_csv(file_path)
        elif file_name.endswith('.rds'):
            # pyreadr returns a mapping; an .rds file holds one object under the key None.
            df_synthetic = pd.DataFrame(pyreadr.read_r(file_path)[None])
        else:
            # Not a dataset: skip it instead of re-scoring the previous file.
            continue

        # Rename first so the discrete columns can be addressed by their
        # expected names even when the file was written with other headers.
        df_synthetic.columns = colnames
        for column in discrete_columns:
            df_synthetic[column] = pd.to_numeric(df_synthetic[column])
        synthetic_loader = GenericDataLoader(df_synthetic)

        # PRDC yields four of the metrics; evaluate it at most once per file.
        prdc_scores = None
        for input_name in inputs:
            if input_name == 'MMD_aniso':
                computed_value = mmd2_unbiased(real_data.to_numpy(), df_synthetic.to_numpy(), binary_idx)
            elif input_name == "MMD":
                computed_value = MMD.evaluate(loader, synthetic_loader)["joint"]
            elif input_name == 'Wasserstein':
                computed_value = Wasserstein.evaluate(loader, synthetic_loader)["joint"]
            elif input_name == 'IKL':
                computed_value = IKL.evaluate(loader, synthetic_loader)["marginal"]
            elif input_name == 'JSD':
                computed_value = JSD.evaluate(loader, synthetic_loader)["marginal"]
            elif input_name in prdc_keys:
                if prdc_scores is None:
                    prdc_scores = PRDC.evaluate(loader, synthetic_loader)
                computed_value = prdc_scores[prdc_keys[input_name]]
            elif input_name == 'kAnonym':
                computed_value = kAnonym.evaluate(loader, synthetic_loader)["syn"]
            elif input_name == 'lDiversity':
                computed_value = lDiversity.evaluate(loader, synthetic_loader)["syn"]
            elif input_name == "xgbDetection":
                computed_value = xgb_detection.evaluate(loader, synthetic_loader)["mean"]
            elif input_name == "identifiability":
                computed_value = identifiability.evaluate(loader, synthetic_loader)["score"]
            elif input_name == "DCRsdv":
                computed_value = DCRBaselineProtection.compute_breakdown(
                    real_data=real_data, synthetic_data=df_synthetic, metadata=metadata_dict
                )["score"]
            elif input_name == "DCR":
                # Gower distance from each synthetic row to its closest real row;
                # the 5% quantile of those minima is reported.
                distances = gower.gower_matrix(df_synthetic, real_data)
                min_distances = distances.min(axis=1)
                computed_value = np.quantile(a=min_distances, q=0.05)
            else:
                # Unreachable after the validation above; kept so the branch
                # list and `supported` cannot drift apart unnoticed.
                raise ValueError(f"Unsupported metric name: {input_name!r}")
            dataframes[input_name].append(computed_value)

    return dataframes

In [8]:

# Metric names handed to process_csv_files; each must match one of its branches.
inputs = [
    "P-PR",
    "P-RE",
    "MMD",
    "IKL",
    "JSD",
    "DCR",
    "MMD_aniso",
    "DCRsdv",
]

# Evaluators from synthcity.metrics.eval_statistical.
MMD = MaximumMeanDiscrepancy()
Wasserstein = WassersteinDistance()
IKL = InverseKLDivergence()
JSD = JensenShannonDistance()
PRDC = PRDCScore()
alpha = AlphaPrecision()  # not referenced by any visible cell

# Evaluators from synthcity.metrics.eval_detection.
xgb_detection = SyntheticDetectionXGB()
detection_evaluator = DetectionEvaluator()  # not referenced by any visible cell

# Evaluators from synthcity.metrics.eval_privacy.
identifiability = IdentifiabilityScore()
kAnonym = kAnonymization()
lDiversity = lDiversityDistinct()

# Plain-dict form of the detected metadata, passed to DCRBaselineProtection.
metadata_dict = metadata.to_dict()


In [9]:
# Root under which the result folders listed in `folder_names` are looked up.
base_path = current_directory

In [None]:
# Sub-folders of `base_path`, each holding the synthetic datasets of one method.
folder_names = [
    "GC_synthetizer",
    "CTGAN synthetizer",
    "GMCM",
    "marginals",
    "ARF",
    "Vines"
    # Add more folder names as needed
]

# Synthetic files are relabelled with the real data's column names.
colnames = df.columns

# Metric values per result name: {result_name: {metric: [value per file]}}.
results = {}
for folder_name in folder_names:
    # The result name is the folder name up to its first space
    # ("CTGAN synthetizer" -> "CTGAN"); it therefore contains no space.
    result_name = folder_name.split(' ')[0]
    folder_path = os.path.join(base_path, folder_name)
    results[result_name] = process_csv_files(
        folder_path=folder_path,
        inputs=inputs,
        loader=loader,
        discrete_columns=discrete_columns,
        colnames=colnames,
        binary_idx=binary_idx,
        metadata_dict=metadata_dict,
    )

# One frame per result name: a row per file, a column per metric.
metric_frames = {name: pd.DataFrame(values) for name, values in results.items()}

# Column-wise mean and sample standard deviation of every metric.
means = {name: frame.mean().to_dict() for name, frame in metric_frames.items()}
standard_deviations = {name: frame.std().to_dict() for name, frame in metric_frames.items()}

# Summary table: metrics as rows, result names as columns.
df_results = pd.DataFrame(means)

In [None]:
# Display the mean of each metric (rows) for each result name (columns).
df_results

In [None]:
# Standard deviation of each metric (rows) for each result name (columns).
df_std =  pd.DataFrame(standard_deviations)

In [None]:
# Display the table of standard deviations.
df_std