In [1]:
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive/Colab Notebooks/synprivutil-main

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab Notebooks/synprivutil-main


In [2]:
!pip install numpy==1.26.4 pandas==2.2.2 sdv==1.15.0 scikit-learn==1.5.1 seaborn==0.12.2 matplotlib==3.9.2 rdt==1.12.3 anonymeter==1.0.0 scipy==1.13.0 dython==0.7.8 POT==0.9.4




## ML utility

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import f1_score, accuracy_score
import numpy as np

def calculate_ml_utility_classification(original, synthetic, target_column):
    """
    Evaluate machine learning utility by linking synthetic data to original data
    using classification models and calculating average F1 score and accuracy.

    Parameters:
        original (pd.DataFrame): Original dataset.
        synthetic (pd.DataFrame): Synthetic dataset.
        target_column (str): The target column for classification.

    Returns:
        dict: Average F1 score and accuracy for STO scenario across multiple classifiers.
    """
    # Step 1: Prepare original data
    X_test_orig = original.drop(columns=[target_column])
    y_test_orig = original[target_column]

    # Step 2: Prepare synthetic data
    X_syn = synthetic.drop(columns=[target_column])
    y_syn = synthetic[target_column]

    # Classification models
    models = [RandomForestClassifier(), GradientBoostingClassifier()]

    # Collect scores for STO (Train on Synthetic, Test on Original)
    sto_f1_list = []
    sto_accuracy_list = []

    for model in models:
        # Train on synthetic data, test on original data
        model.fit(X_syn, y_syn)
        y_pred_syn_on_orig = model.predict(X_test_orig)

        # Calculate F1 score and accuracy
        sto_f1 = f1_score(y_test_orig, y_pred_syn_on_orig, average='weighted')
        sto_accuracy = accuracy_score(y_test_orig, y_pred_syn_on_orig)

        sto_f1_list.append(sto_f1)
        sto_accuracy_list.append(sto_accuracy)

    # Average scores
    avg_sto_f1 = np.mean(sto_f1_list)
    avg_sto_accuracy = np.mean(sto_accuracy_list)

    metrics = {
        "F1_syn": avg_sto_f1,  # Train on synthetic, test on original
        "Accuracy_syn": avg_sto_accuracy  # Train on synthetic, test on original
    }

    return metrics




In [3]:
def replace_outliers_with_mean(original_data, synthetic_data, synthetic_name, threshold=1):
    total_replacements = 0  # Initialize replacement count

    for column in synthetic_data.columns:
        if column in original_data.columns:
            # Use max and min with std range to determine abnormal value
            orig_mean = original_data[column].median()
            orig_max = original_data[column].max()
            orig_min = original_data[column].min()
            orig_std = original_data[column].std()

            lower_bound = orig_min - threshold * orig_std
            upper_bound = orig_max + threshold * orig_std

            # Replace and count
            replaced_column = synthetic_data[column].apply(
                lambda x: orig_mean if x < lower_bound or x > upper_bound else x
            )

            # Count the number of replaced values
            replacements = (synthetic_data[column] != replaced_column).sum()
            total_replacements += replacements

            # Update column data
            synthetic_data[column] = replaced_column

    print(f"Under {synthetic_name} total replacements made: {total_replacements}")
    return synthetic_data

def remove_rows_with_outliers(original_data, synthetic_data, synthetic_name, threshold=1):
  
    # Initialize set of rows to drop
    rows_to_drop = set()

    for column in synthetic_data.columns:
        if column in original_data.columns:
            # Calculate range based on original data
            orig_mean = original_data[column].median()
            orig_max = original_data[column].max()
            orig_min = original_data[column].min()
            orig_std = original_data[column].std()

            lower_bound = orig_min - threshold * orig_std
            upper_bound = orig_max + threshold * orig_std

            # Find rows with outliers
            outlier_indices = synthetic_data[(synthetic_data[column] < lower_bound) |
                                             (synthetic_data[column] > upper_bound)].index
            rows_to_drop.update(outlier_indices)

            
    rows_to_drop = list(rows_to_drop)  # Convert to list for deletion
    cleaned_data = synthetic_data.drop(index=rows_to_drop)

    print(f"Under {synthetic_name}, removed {len(rows_to_drop)} rows with outliers.")
    return cleaned_data


In [4]:
import os
import re
import gc
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
from sklearn.preprocessing import MinMaxScaler
# Load original dataset
original_data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Tabula-main-claude/Real_Datasets/adult_test.csv")
original_name = "adult"

# Directory containing synthetic datasets
synthetic_folder = "/content/drive/MyDrive/Colab Notebooks/Tabula-main-claude/adult_syn_data"

# Dictionary to store aggregated results
aggregated_results = defaultdict(lambda: defaultdict(list))

# Function to extract sigma value as float
def extract_sigma(file_name):
    match = re.search(r"sigma-(\d+)", file_name)
    if match:
        sigma_str = match.group(1)
        if len(sigma_str) == 1:  # Single digit like "1" -> "1"
            return int(sigma_str)
        elif len(sigma_str) == 2:  # Two digits like "05" -> "0.5"
            return int(sigma_str) / 10
        elif len(sigma_str) == 3:  # Three digits like "005" -> "0.05"
            return int(sigma_str) / 100
    return None

target_column = "income"  # Replace with your actual target column
task_type = "classification"  # "classification" or "regression"

# Loop through each synthetic file and perform calculations
for file_name in os.listdir(synthetic_folder):
    if file_name.endswith(".csv"):
        synthetic_path = os.path.join(synthetic_folder, file_name)
        synthetic_data = pd.read_csv(synthetic_path)

        # Ensure correct data types (e.g., convert 'age' column to int if needed)
        # if 'age' in synthetic_data.columns:
        #     synthetic_data['age'] = synthetic_data['age'].astype(int)

        for column in synthetic_data.columns:
            if column in original_data.columns:
                # Convert the column type in synthetic data to match the original data
                synthetic_data[column] = synthetic_data[column].astype(original_data[column].dtype)

        sigma = extract_sigma(file_name)
        if sigma is None:
            continue  # Skip files without a valid sigma value
        # print(f"\nProcessing synthetic dataset: {file_name} (sigma: {sigma})")
        # synthetic_data = replace_outliers_with_mean(original_data, synthetic_data, sigma)
        synthetic_data = remove_rows_with_outliers(original_data, synthetic_data, sigma)

        # Step 1: Calculate ML utility
        ml_results = calculate_ml_utility_classification(original_data, synthetic_data, target_column)

        # Step 2: Add metrics to aggregated_results
        for category, metric_value in ml_results.items():
            aggregated_results[sigma][category].append(metric_value)

        # Cleanup
        del synthetic_data
        gc.collect()



# Step 3: Calculate average metrics for each sigma
averaged_results = defaultdict(dict)
for sigma, metrics in aggregated_results.items():
    for metric, values in metrics.items():
        averaged_results[sigma][metric] = np.mean(values)

# Step 4: Convert averaged_results into a DataFrame
final_results = defaultdict(list)
for sigma, metrics in averaged_results.items():
    final_results["sigma"].append(sigma)
    for metric, value in metrics.items():
        final_results[metric].append(value)

df = pd.DataFrame(final_results)

# Sort the DataFrame by sigma
df = df.sort_values(by="sigma").reset_index(drop=True)

# Step 5: Print and verify the DataFrame
print(df)



Under 0.25, removed 3652 rows with outliers.


NameError: name 'calculate_ml_utility_classification' is not defined

## Privacy and utility evaluations

### original

In [None]:
import os
import re
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
from sklearn.preprocessing import MinMaxScaler
from privacy_utility_framework.privacy_utility_framework.metrics.utility_metrics.utility_metric_manager import UtilityMetricManager
from privacy_utility_framework.privacy_utility_framework.metrics.utility_metrics.statistical.basic_stats import BasicStatsCalculator
from privacy_utility_framework.privacy_utility_framework.metrics.utility_metrics.statistical.mutual_information import MICalculator
from privacy_utility_framework.privacy_utility_framework.metrics.utility_metrics.statistical.correlation import CorrelationCalculator
from privacy_utility_framework.privacy_utility_framework.metrics.utility_metrics.statistical.js_similarity import JSCalculator
from privacy_utility_framework.privacy_utility_framework.metrics.privacy_metrics.privacy_metric_manager import PrivacyMetricManager
from privacy_utility_framework.privacy_utility_framework.metrics.privacy_metrics.distance.adversarial_accuracy_class import AdversarialAccuracyCalculator, AdversarialAccuracyCalculator_NN
from privacy_utility_framework.privacy_utility_framework.metrics.privacy_metrics.distance.dcr_class import DCRCalculator
from privacy_utility_framework.privacy_utility_framework.metrics.privacy_metrics.distance.nndr_class import NNDRCalculator
from privacy_utility_framework.privacy_utility_framework.metrics.privacy_metrics.distance.disco import DisclosureCalculator

# Load original dataset
original_data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Tabula-main-claude/Real_Datasets/Adult.csv")
# original_data.drop(columns=["index"], inplace=True)
# original_data.drop(columns=["id"], inplace=True)
original_name = "adult"

# Directory containing synthetic datasets
synthetic_folder = "/content/drive/MyDrive/Colab Notebooks/Tabula-main-claude/adult_syn_data"

# Dictionary to store aggregated results
aggregated_results = defaultdict(lambda: defaultdict(list))

# Function to extract sigma value as float
def extract_sigma(file_name):
    match = re.search(r"sigma-(\d+)", file_name)
    if match:
        sigma_str = match.group(1)
        if len(sigma_str) == 1:  # Single digit like "1" -> "1"
            return int(sigma_str)
        elif len(sigma_str) == 2:  # Two digits like "05" -> "0.5"
            return int(sigma_str) / 10
        elif len(sigma_str) == 3:  # Three digits like "005" -> "0.05"
            return int(sigma_str) / 100
    return None


# Loop through each synthetic file and perform calculations
for file_name in os.listdir(synthetic_folder):
    if file_name.endswith(".csv"):
        synthetic_path = os.path.join(synthetic_folder, file_name)
        synthetic_data = pd.read_csv(synthetic_path)

        sigma = extract_sigma(file_name)
        if sigma is None:
            continue  # Skip files without a valid sigma value

        # synthetic_data = replace_outliers_with_mean(original_data, synthetic_data, sigma)
        # synthetic_data = remove_rows_with_outliers(original_data, synthetic_data, sigma)
        # Ensure correct data types (e.g., convert 'age' column to int if needed)
        for column in synthetic_data.columns:
            if column in original_data.columns:
                # Convert the column type in synthetic data to match the original data
                synthetic_data[column] = synthetic_data[column].astype(original_data[column].dtype)


        # print(f"\nProcessing synthetic dataset: {file_name} (sigma: {sigma})")

        # Utility Metric Calculation
        u = UtilityMetricManager()
        utility_metric_list = [
            # BasicStatsCalculator(original_data, synthetic_data, original_name=original_name, synthetic_name=file_name),
            # MICalculator(original_data, synthetic_data, original_name=original_name, synthetic_name=file_name),
            # CorrelationCalculator(original_data, synthetic_data, original_name=original_name, synthetic_name=file_name),
            JSCalculator(original_data, synthetic_data, original_name=original_name, synthetic_name=file_name),
        ]
        u.add_metric(utility_metric_list)
        results_utility = u.evaluate_all()

        # Privacy Metric Calculation
        p = PrivacyMetricManager()
        privacy_metric_list = [
            # DCRCalculator(original_data, synthetic_data, original_name=original_name, synthetic_name=file_name),
            # NNDRCalculator(original_data, synthetic_data, original_name=original_name, synthetic_name=file_name),
            # AdversarialAccuracyCalculator_NN(original_data, synthetic_data, original_name=original_name, synthetic_name=file_name),
            # AdversarialAccuracyCalculator(original_data, synthetic_data, original_name=original_name, synthetic_name=file_name),
        ]
        p.add_metric(privacy_metric_list)
        results_privacy = p.evaluate_all()

        # # DiSCO and repU
        # adult_keys = ['age', 'workclass', 'education', 'marital-status',
        #               'occupation', 'relationship', 'race', 'gender', 'capital-gain',
        #               'capital-loss', 'hours-per-week', 'native-country']
        # adult_target = 'income'  

        # calc = DisclosureCalculator(original_data, synthetic_data, keys=adult_keys, target=adult_target)
        # repU, DiSCO = calc.evaluate()

        # Aggregate metrics
        for key, value in {**results_utility, **results_privacy}.items():
            aggregated_results[sigma][key].append(value)


        # aggregated_results[sigma]["Disclosure_repU"].append(repU)
        # aggregated_results[sigma]["Disclosure_DiSCO"].append(DiSCO)



In [4]:
import numpy as np
# Calculate average metrics for each sigma
averaged_results = defaultdict(dict)
for sigma, metrics in aggregated_results.items():
    for metric, values in metrics.items():
        if isinstance(values[0], dict):
            averaged_results[sigma][metric] = {k: np.mean([v[k] for v in values]) for k in values[0]}
        else:
            averaged_results[sigma][metric] = np.mean(values)

In [5]:
# Clean up averaged_results to extract metric names without file details
cleaned_results = defaultdict(dict)

for sigma, metrics in averaged_results.items():
    for metric, value in metrics.items():
        # Extract only the metric name, ignoring dataset/file info
        metric_name = metric.split("(")[0]
        if metric_name not in cleaned_results[sigma]:
            cleaned_results[sigma][metric_name] = []
        cleaned_results[sigma][metric_name].append(value)

# Aggregate and calculate average values for cleaned results
final_results = defaultdict(dict)
for sigma, metrics in cleaned_results.items():
    for metric, values in metrics.items():
        if isinstance(values[0], dict):  # If metric value is a dictionary
            # Average each key in the dictionary
            final_results[sigma][metric] = {k: sum(v[k] for v in values) / len(values) for k in values[0]}
        else:  # If metric value is a list of scalars
            final_results[sigma][metric] = sum(values) / len(values)

# Prepare data for plotting
data = {
    "sigma": sorted(final_results.keys()),
    "DCR": [final_results[sigma].get("DCRCalculator", None) for sigma in sorted(final_results.keys())],
    "NNDR": [final_results[sigma].get("NNDRCalculator", None) for sigma in sorted(final_results.keys())],
    # "AdversarialAccuracy": [final_results[sigma].get("AdversarialAccuracyCalculator", None) for sigma in sorted(final_results.keys())],
    "NNAA": [final_results[sigma].get("AdversarialAccuracyCalculator_NN", None) for sigma in sorted(final_results.keys())],
    # "Correlation": [final_results[sigma].get("CorrelationCalculator", None) for sigma in sorted(final_results.keys())],
    # "JS": [final_results[sigma].get("JSCalculator", None) for sigma in sorted(final_results.keys())],
    # "repU": [final_results[sigma].get("Disclosure_repU", None) for sigma in sorted(final_results.keys())],
    # "DiSCO": [final_results[sigma].get("Disclosure_DiSCO", None) for sigma in sorted(final_results.keys())],
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Verify DataFrame content
print(df)


    sigma          DCR      NNDR      NNAA
0    0.00     0.193052  0.904975  0.973265
1    0.01     0.389098  0.749186  0.647102
2    0.02     1.236993  0.771475  0.681537
3    0.03     1.059227  0.913428  0.967602
4    0.04     1.761422  0.890156  0.977859
5    0.05    12.224539  0.916962  0.969869
6    0.10     4.492382  0.914446  0.980744
7    0.15     9.979821  0.912635  0.984878
8    0.20    27.527854  0.884207  0.857258
9    0.25    59.524123  0.887548  0.867388
10   0.30   239.945031  0.933274  0.972152
11   0.35  2903.472714  0.924814  0.980547


In [None]:
# 提取 BasicStatsCalculator 的 mean、median、var
basic_stats_data = {
    "sigma": [],
    "mean": [],
    "median": [],
    "var": []
}

for sigma, metrics in final_results.items():
    if "BasicStatsCalculator" in metrics:
        basic_stats = metrics["BasicStatsCalculator"]
        basic_stats_data["sigma"].append(sigma)
        basic_stats_data["mean"].append(basic_stats["mean"])
        basic_stats_data["median"].append(basic_stats["median"])
        basic_stats_data["var"].append(basic_stats["var"])

# 转换为 DataFrame
basic_stats_df = pd.DataFrame(basic_stats_data)

# 显示结果
print(basic_stats_df)


    sigma        mean    median           var
0    0.25    4.252804  0.008968  1.901838e+06
1    0.01    0.039493  0.006569  4.921741e+01
2    0.05    0.894432  0.037042  2.620480e+05
3    0.10    0.339570  0.034717  9.922552e+02
4    0.15    0.734234  0.053231  1.262529e+04
5    0.20    1.969232  0.006718  1.777147e+05
6    0.30   17.155943  0.044074  2.698867e+07
7    0.35  207.408274  0.040310  1.882297e+10
8    0.02    0.100999  0.001715  2.473936e+02
9    0.03    0.095960  0.036591  1.581677e+02
10   0.04    0.145646  0.032641  2.693895e+02
11   0.00    0.040607  0.051116  2.027713e-02


### replace

In [13]:
import os
import re
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
from sklearn.preprocessing import MinMaxScaler
from privacy_utility_framework.privacy_utility_framework.metrics.utility_metrics.utility_metric_manager import UtilityMetricManager
from privacy_utility_framework.privacy_utility_framework.metrics.utility_metrics.statistical.basic_stats import BasicStatsCalculator
from privacy_utility_framework.privacy_utility_framework.metrics.utility_metrics.statistical.mutual_information import MICalculator
from privacy_utility_framework.privacy_utility_framework.metrics.utility_metrics.statistical.correlation import CorrelationCalculator
from privacy_utility_framework.privacy_utility_framework.metrics.utility_metrics.statistical.js_similarity import JSCalculator
from privacy_utility_framework.privacy_utility_framework.metrics.privacy_metrics.privacy_metric_manager import PrivacyMetricManager
from privacy_utility_framework.privacy_utility_framework.metrics.privacy_metrics.distance.adversarial_accuracy_class import AdversarialAccuracyCalculator, AdversarialAccuracyCalculator_NN
from privacy_utility_framework.privacy_utility_framework.metrics.privacy_metrics.distance.dcr_class import DCRCalculator
from privacy_utility_framework.privacy_utility_framework.metrics.privacy_metrics.distance.nndr_class import NNDRCalculator
from privacy_utility_framework.privacy_utility_framework.metrics.privacy_metrics.distance.disco import DisclosureCalculator

# Load original dataset
original_data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Tabula-main-claude/Real_Datasets/Adult.csv")
# original_data.drop(columns=["index"], inplace=True)
# original_data.drop(columns=["id"], inplace=True)
original_name = "adult"

# Directory containing synthetic datasets
synthetic_folder = "/content/drive/MyDrive/Colab Notebooks/Tabula-main-claude/adult_syn_data"

# Dictionary to store aggregated results
aggregated_results = defaultdict(lambda: defaultdict(list))

# Function to extract sigma value as float
def extract_sigma(file_name):
    match = re.search(r"sigma-(\d+)", file_name)
    if match:
        sigma_str = match.group(1)
        if len(sigma_str) == 1:  # Single digit like "1" -> "1"
            return int(sigma_str)
        elif len(sigma_str) == 2:  # Two digits like "05" -> "0.5"
            return int(sigma_str) / 10
        elif len(sigma_str) == 3:  # Three digits like "005" -> "0.05"
            return int(sigma_str) / 100
    return None


# Loop through each synthetic file and perform calculations
for file_name in os.listdir(synthetic_folder):
    if file_name.endswith(".csv"):
        synthetic_path = os.path.join(synthetic_folder, file_name)
        synthetic_data = pd.read_csv(synthetic_path)

        sigma = extract_sigma(file_name)
        if sigma is None:
            continue  # Skip files without a valid sigma value

        synthetic_data = replace_outliers_with_mean(original_data, synthetic_data, sigma)
        # synthetic_data = remove_rows_with_outliers(original_data, synthetic_data, sigma)
        # Ensure correct data types (e.g., convert 'age' column to int if needed)
        for column in synthetic_data.columns:
            if column in original_data.columns:
                # Convert the column type in synthetic data to match the original data
                synthetic_data[column] = synthetic_data[column].astype(original_data[column].dtype)


        # print(f"\nProcessing synthetic dataset: {file_name} (sigma: {sigma})")

        # Utility Metric Calculation
        u = UtilityMetricManager()
        utility_metric_list = [
            BasicStatsCalculator(original_data, synthetic_data, original_name=original_name, synthetic_name=file_name),
            # MICalculator(original_data, synthetic_data, original_name=original_name, synthetic_name=file_name),
            # CorrelationCalculator(original_data, synthetic_data, original_name=original_name, synthetic_name=file_name),
            # JSCalculator(original_data, synthetic_data, original_name=original_name, synthetic_name=file_name),
        ]
        u.add_metric(utility_metric_list)
        results_utility = u.evaluate_all()

        # Privacy Metric Calculation
        p = PrivacyMetricManager()
        privacy_metric_list = [
            DCRCalculator(original_data, synthetic_data, original_name=original_name, synthetic_name=file_name),
            # NNDRCalculator(original_data, synthetic_data, original_name=original_name, synthetic_name=file_name),
            # AdversarialAccuracyCalculator_NN(original_data, synthetic_data, original_name=original_name, synthetic_name=file_name),
            # AdversarialAccuracyCalculator(original_data, synthetic_data, original_name=original_name, synthetic_name=file_name),
        ]
        p.add_metric(privacy_metric_list)
        results_privacy = p.evaluate_all()

        # # DiSCO and repU
        adult_keys = ['age', 'workclass', 'education', 'marital-status',
                      'occupation', 'relationship', 'race', 'gender', 'capital-gain',
                      'capital-loss', 'hours-per-week', 'native-country']
        adult_target = 'income' 

        calc = DisclosureCalculator(original_data, synthetic_data, keys=adult_keys, target=adult_target)
        repU, DiSCO = calc.evaluate()

        # Aggregate metrics
        for key, value in {**results_utility, **results_privacy}.items():
            aggregated_results[sigma][key].append(value)


        aggregated_results[sigma]["Disclosure_repU"].append(repU)
        aggregated_results[sigma]["Disclosure_DiSCO"].append(DiSCO)



Under 0.25 total replacements made: 3687
Synthetic and original data checked with synorig.compare,
 looks like no adjustment needed


-------------------Synthesis 1--------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  syndata[j][col] = syndata[j][col].astype('category')


Table for target income from GT alone with keys has 2 rows and 41439 columns.
Table for target income from GT & SD with all key combinations has 2 rows and 72549 columns.
IDENTITY: 
         UiO        UiS     UiOiS      repU
0  77.122149  78.097907  1.924573  1.394292
ATTRIBUTES: 
       Dorig       Dsyn        iS       DiS     DiSCO    DiSDiO  max_denom  \
0  92.070349  88.360093  8.574587  6.396134  4.309815  2.741493       20.0   

   mean_denom  
0     1.96729  
~~~~~~~~~~~~~~~ Done ~~~~~~~~~~~~~~~
Under 0.01 total replacements made: 378
Synthetic and original data checked with synorig.compare,
 looks like no adjustment needed


-------------------Synthesis 1--------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  syndata[j][col] = syndata[j][col].astype('category')


Table for target income from GT alone with keys has 2 rows and 41439 columns.
Table for target income from GT & SD with all key combinations has 2 rows and 62948 columns.
IDENTITY: 
         UiO        UiS     UiOiS      repU
0  77.122149  54.282636  5.691823  3.527702
ATTRIBUTES: 
       Dorig       Dsyn         iS        DiS      DiSCO    DiSDiO  max_denom  \
0  92.070349  75.834857  20.058556  13.181278  10.910692  9.854224       19.0   

   mean_denom  
0    1.711854  
~~~~~~~~~~~~~~~ Done ~~~~~~~~~~~~~~~
Under 0.05 total replacements made: 1771
Synthetic and original data checked with synorig.compare,
 looks like no adjustment needed


-------------------Synthesis 1--------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  syndata[j][col] = syndata[j][col].astype('category')


Table for target income from GT alone with keys has 2 rows and 41439 columns.
Table for target income from GT & SD with all key combinations has 2 rows and 71680 columns.
IDENTITY: 
         UiO        UiS     UiOiS     repU
0  77.122149  66.430302  0.014332  0.00819
ATTRIBUTES: 
       Dorig       Dsyn        iS      DiS     DiSCO    DiSDiO  max_denom  \
0  92.070349  79.602806  0.014332  0.00819  0.006142  0.006142        1.0   

   mean_denom  
0         1.0  
~~~~~~~~~~~~~~~ Done ~~~~~~~~~~~~~~~
Under 0.1 total replacements made: 2689
Synthetic and original data checked with synorig.compare,
 looks like no adjustment needed


-------------------Synthesis 1--------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  syndata[j][col] = syndata[j][col].astype('category')


Table for target income from GT alone with keys has 2 rows and 41439 columns.
Table for target income from GT & SD with all key combinations has 2 rows and 73987 columns.
IDENTITY: 
         UiO        UiS     UiOiS      repU
0  77.122149  74.342442  0.004095  0.002047
ATTRIBUTES: 
       Dorig      Dsyn        iS       DiS     DiSCO    DiSDiO  max_denom  \
0  92.070349  85.62839  0.004095  0.002047  0.002047  0.002047        1.0   

   mean_denom  
0         1.0  
~~~~~~~~~~~~~~~ Done ~~~~~~~~~~~~~~~
Under 0.15 total replacements made: 3102
Synthetic and original data checked with synorig.compare,
 looks like no adjustment needed


-------------------Synthesis 1--------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  syndata[j][col] = syndata[j][col].astype('category')


Table for target income from GT alone with keys has 2 rows and 41439 columns.
Table for target income from GT & SD with all key combinations has 2 rows and 74573 columns.
IDENTITY: 
         UiO        UiS     UiOiS      repU
0  77.122149  77.021112  0.006142  0.006142
ATTRIBUTES: 
       Dorig       Dsyn        iS       DiS     DiSCO    DiSDiO  max_denom  \
0  92.070349  87.206488  0.012285  0.012285  0.010237  0.006142        2.0   

   mean_denom  
0        1.25  
~~~~~~~~~~~~~~~ Done ~~~~~~~~~~~~~~~
Under 0.2 total replacements made: 3343
Synthetic and original data checked with synorig.compare,
 looks like no adjustment needed


-------------------Synthesis 1--------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  syndata[j][col] = syndata[j][col].astype('category')


Table for target income from GT alone with keys has 2 rows and 41439 columns.
Table for target income from GT & SD with all key combinations has 2 rows and 72535 columns.
IDENTITY: 
         UiO        UiS     UiOiS      repU
0  77.122149  77.458867  2.098604  1.568322
ATTRIBUTES: 
       Dorig       Dsyn        iS       DiS     DiSCO    DiSDiO  max_denom  \
0  92.070349  87.956091  9.139675  6.658204  4.547316  3.165309       18.0   

   mean_denom  
0    1.874262  
~~~~~~~~~~~~~~~ Done ~~~~~~~~~~~~~~~
Under 0.3 total replacements made: 4011
Synthetic and original data checked with synorig.compare,
 looks like no adjustment needed


-------------------Synthesis 1--------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  syndata[j][col] = syndata[j][col].astype('category')


Table for target income from GT alone with keys has 2 rows and 41439 columns.
Table for target income from GT & SD with all key combinations has 2 rows and 74627 columns.
IDENTITY: 
         UiO        UiS    UiOiS      repU
0  77.122149  80.042211  0.00819  0.006142
ATTRIBUTES: 
       Dorig       Dsyn       iS       DiS     DiSCO    DiSDiO  max_denom  \
0  92.070349  90.117399  0.00819  0.006142  0.004095  0.004095        1.0   

   mean_denom  
0         1.0  
~~~~~~~~~~~~~~~ Done ~~~~~~~~~~~~~~~
Under 0.35 total replacements made: 4446
Synthetic and original data checked with synorig.compare,
 looks like no adjustment needed


-------------------Synthesis 1--------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  syndata[j][col] = syndata[j][col].astype('category')


Table for target income from GT alone with keys has 2 rows and 41439 columns.
Table for target income from GT & SD with all key combinations has 2 rows and 74623 columns.
IDENTITY: 
         UiO        UiS     UiOiS      repU
0  77.122149  81.880525  0.004095  0.004095
ATTRIBUTES: 
       Dorig       Dsyn        iS       DiS     DiSCO    DiSDiO  max_denom  \
0  92.070349  91.023306  0.004095  0.004095  0.002047  0.002047        1.0   

   mean_denom  
0         1.0  
~~~~~~~~~~~~~~~ Done ~~~~~~~~~~~~~~~
Under 0.02 total replacements made: 773
Synthetic and original data checked with synorig.compare,
 looks like no adjustment needed


-------------------Synthesis 1--------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  syndata[j][col] = syndata[j][col].astype('category')


Table for target income from GT alone with keys has 2 rows and 41439 columns.
Table for target income from GT & SD with all key combinations has 2 rows and 65107 columns.
IDENTITY: 
         UiO        UiS     UiOiS      repU
0  77.122149  58.873374  5.007985  3.105933
ATTRIBUTES: 
       Dorig       Dsyn         iS        DiS     DiSCO    DiSDiO  max_denom  \
0  92.070349  75.429316  17.781827  10.992588  8.777282  7.880513       19.0   

   mean_denom  
0    1.718236  
~~~~~~~~~~~~~~~ Done ~~~~~~~~~~~~~~~
Under 0.03 total replacements made: 1144
Synthetic and original data checked with synorig.compare,
 looks like no adjustment needed


-------------------Synthesis 1--------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  syndata[j][col] = syndata[j][col].astype('category')


Table for target income from GT alone with keys has 2 rows and 41439 columns.
Table for target income from GT & SD with all key combinations has 2 rows and 70344 columns.
IDENTITY: 
         UiO        UiS     UiOiS      repU
0  77.122149  62.321388  0.022522  0.014332
ATTRIBUTES: 
       Dorig       Dsyn        iS       DiS     DiSCO    DiSDiO  max_denom  \
0  92.070349  77.268276  0.022522  0.020474  0.016379  0.016379        1.0   

   mean_denom  
0         1.0  
~~~~~~~~~~~~~~~ Done ~~~~~~~~~~~~~~~
Under 0.04 total replacements made: 1416
Synthetic and original data checked with synorig.compare,
 looks like no adjustment needed


-------------------Synthesis 1--------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  syndata[j][col] = syndata[j][col].astype('category')


Table for target income from GT alone with keys has 2 rows and 41439 columns.
Table for target income from GT & SD with all key combinations has 2 rows and 71173 columns.
IDENTITY: 
         UiO        UiS     UiOiS      repU
0  77.122149  64.949077  0.006142  0.006142
ATTRIBUTES: 
       Dorig       Dsyn        iS       DiS     DiSCO    DiSDiO  max_denom  \
0  92.070349  78.317536  0.020474  0.020474  0.010237  0.010237        2.0   

   mean_denom  
0        1.25  
~~~~~~~~~~~~~~~ Done ~~~~~~~~~~~~~~~
Under 0 total replacements made: 36
Synthetic and original data checked with synorig.compare,
 looks like no adjustment needed


-------------------Synthesis 1--------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  syndata[j][col] = syndata[j][col].astype('category')


Table for target income from GT alone with keys has 2 rows and 41439 columns.
Table for target income from GT & SD with all key combinations has 2 rows and 66512 columns.
IDENTITY: 
         UiO        UiS     UiOiS      repU
0  77.122149  38.186764  0.020474  0.014332
ATTRIBUTES: 
       Dorig       Dsyn        iS       DiS     DiSCO    DiSDiO  max_denom  \
0  92.070349  71.653433  0.020474  0.016379  0.016379  0.016379        1.0   

   mean_denom  
0         1.0  
~~~~~~~~~~~~~~~ Done ~~~~~~~~~~~~~~~


In [14]:
import numpy as np
# Calculate average metrics for each sigma
averaged_results = defaultdict(dict)
for sigma, metrics in aggregated_results.items():
    for metric, values in metrics.items():
        if isinstance(values[0], dict):
            averaged_results[sigma][metric] = {k: np.mean([v[k] for v in values]) for k in values[0]}
        else:
            averaged_results[sigma][metric] = np.mean(values)

In [15]:
# Clean up averaged_results to extract metric names without file details
cleaned_results = defaultdict(dict)

for sigma, metrics in averaged_results.items():
    for metric, value in metrics.items():
        # Extract only the metric name, ignoring dataset/file info
        metric_name = metric.split("(")[0]
        if metric_name not in cleaned_results[sigma]:
            cleaned_results[sigma][metric_name] = []
        cleaned_results[sigma][metric_name].append(value)

# Aggregate and calculate average values for cleaned results
final_results = defaultdict(dict)
for sigma, metrics in cleaned_results.items():
    for metric, values in metrics.items():
        if isinstance(values[0], dict):  # If metric value is a dictionary
            # Average each key in the dictionary
            final_results[sigma][metric] = {k: sum(v[k] for v in values) / len(values) for k in values[0]}
        else:  # If metric value is a list of scalars
            final_results[sigma][metric] = sum(values) / len(values)

# Prepare data for plotting
data = {
    "sigma": sorted(final_results.keys()),
    "DCR": [final_results[sigma].get("DCRCalculator", None) for sigma in sorted(final_results.keys())],
    # "NNDR": [final_results[sigma].get("NNDRCalculator", None) for sigma in sorted(final_results.keys())],
    # "AdversarialAccuracy": [final_results[sigma].get("AdversarialAccuracyCalculator", None) for sigma in sorted(final_results.keys())],
    # "NNAA": [final_results[sigma].get("AdversarialAccuracyCalculator_NN", None) for sigma in sorted(final_results.keys())],
    # "Correlation": [final_results[sigma].get("CorrelationCalculator", None) for sigma in sorted(final_results.keys())],
    # "JS": [final_results[sigma].get("JSCalculator", None) for sigma in sorted(final_results.keys())],
    "repU": [final_results[sigma].get("Disclosure_repU", None) for sigma in sorted(final_results.keys())],
    "DiSCO": [final_results[sigma].get("Disclosure_DiSCO", None) for sigma in sorted(final_results.keys())],
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Verify DataFrame content
print(df)


    sigma       DCR      repU      DiSCO
0    0.00  0.186277  0.014332   0.016379
1    0.01  0.075474  3.527702  10.910692
2    0.02  0.092398  3.105933   8.777282
3    0.03  0.218609  0.014332   0.016379
4    0.04  0.254084  0.006142   0.010237
5    0.05  0.235138  0.008190   0.006142
6    0.10  0.302533  0.002047   0.002047
7    0.15  0.350046  0.006142   0.010237
8    0.20  0.204912  1.568322   4.547316
9    0.25  0.209828  1.394292   4.309815
10   0.30  0.292292  0.006142   0.004095
11   0.35  0.326677  0.004095   0.002047


In [16]:
basic_stats_data = {
    "sigma": [],
    "mean": [],
    "median": [],
    "var": []
}

for sigma, metrics in final_results.items():
    if "BasicStatsCalculator" in metrics:
        basic_stats = metrics["BasicStatsCalculator"]
        basic_stats_data["sigma"].append(sigma)
        basic_stats_data["mean"].append(basic_stats["mean"])
        basic_stats_data["median"].append(basic_stats["median"])
        basic_stats_data["var"].append(basic_stats["var"])

# Convert to DataFrame
basic_stats_df = pd.DataFrame(basic_stats_data)

# Display results
print(basic_stats_df)


    sigma      mean    median       var
0    0.25  0.026221  0.008968  0.011879
1    0.01  0.024460  0.006569  0.012641
2    0.05  0.043114  0.037042  0.013290
3    0.10  0.046519  0.034717  0.014424
4    0.15  0.054308  0.053231  0.015965
5    0.20  0.026352  0.006718  0.011874
6    0.30  0.048381  0.044074  0.013237
7    0.35  0.049159  0.040310  0.013305
8    0.02  0.024690  0.001715  0.012317
9    0.03  0.041696  0.036591  0.012565
10   0.04  0.044787  0.032641  0.014509
11   0.00  0.040666  0.051116  0.013012


In [None]:
import matplotlib.pyplot as plt

datasets = df["sigma"]

# Privacy metrics
DCR = df["DCR"]
adversarial_accuracy = df["NNAA"]

# Utility metrics
correlation = df["Correlation"]
JS = df["JS"]

# Create subplots and first y-axis
fig, ax1 = plt.subplots(figsize=(14, 8))

# Plot privacy metrics
ax1.plot(datasets, DCR, marker='o', color='red', label="DCR (Privacy)", linestyle='-')
ax1.set_xlabel("Sigma Values")
ax1.set_ylabel("Privacy Metrics", color='red')
ax1.tick_params(axis='y', labelcolor='red')

# Create another y-axis, plot utility metrics
ax2 = ax1.twinx()
ax2.plot(datasets, correlation, marker='^', color='blue', label="Correlation (Utility)", linestyle='-')
ax2.plot(datasets, JS, marker='x', color='green', label="JS (Utility)", linestyle='--')
ax2.plot(datasets, adversarial_accuracy, marker='s', color='orange', label="NNAA (Privacy)", linestyle='--')
ax2.set_ylabel("Utility Metrics", color='blue')
ax2.tick_params(axis='y', labelcolor='blue')

# Add legend
lines, labels = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(lines + lines2, labels + labels2, loc='upper right')

# Add title
plt.title("Privacy vs Utility Metrics across Sigma Values")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


### remove

In [18]:
import os
import re
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
from sklearn.preprocessing import MinMaxScaler
from privacy_utility_framework.privacy_utility_framework.metrics.utility_metrics.utility_metric_manager import UtilityMetricManager
from privacy_utility_framework.privacy_utility_framework.metrics.utility_metrics.statistical.basic_stats import BasicStatsCalculator
from privacy_utility_framework.privacy_utility_framework.metrics.utility_metrics.statistical.mutual_information import MICalculator
from privacy_utility_framework.privacy_utility_framework.metrics.utility_metrics.statistical.correlation import CorrelationCalculator
from privacy_utility_framework.privacy_utility_framework.metrics.utility_metrics.statistical.js_similarity import JSCalculator
from privacy_utility_framework.privacy_utility_framework.metrics.privacy_metrics.privacy_metric_manager import PrivacyMetricManager
from privacy_utility_framework.privacy_utility_framework.metrics.privacy_metrics.distance.adversarial_accuracy_class import AdversarialAccuracyCalculator, AdversarialAccuracyCalculator_NN
from privacy_utility_framework.privacy_utility_framework.metrics.privacy_metrics.distance.dcr_class import DCRCalculator
from privacy_utility_framework.privacy_utility_framework.metrics.privacy_metrics.distance.nndr_class import NNDRCalculator
from privacy_utility_framework.privacy_utility_framework.metrics.privacy_metrics.distance.disco import DisclosureCalculator

# Load original dataset
original_data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Tabula-main-claude/Real_Datasets/Adult.csv")
# original_data.drop(columns=["index"], inplace=True)
# original_data.drop(columns=["id"], inplace=True)
original_name = "adult"

# Directory containing synthetic datasets
synthetic_folder = "/content/drive/MyDrive/Colab Notebooks/Tabula-main-claude/adult_syn_data"

# Dictionary to store aggregated results
aggregated_results = defaultdict(lambda: defaultdict(list))

# Function to extract sigma value as float
def extract_sigma(file_name):
    match = re.search(r"sigma-(\d+)", file_name)
    if match:
        sigma_str = match.group(1)
        if len(sigma_str) == 1:  # Single digit like "1" -> "1"
            return int(sigma_str)
        elif len(sigma_str) == 2:  # Two digits like "05" -> "0.5"
            return int(sigma_str) / 10
        elif len(sigma_str) == 3:  # Three digits like "005" -> "0.05"
            return int(sigma_str) / 100
    return None


# Loop through each synthetic file and perform calculations
for file_name in os.listdir(synthetic_folder):
    if file_name.endswith(".csv"):
        synthetic_path = os.path.join(synthetic_folder, file_name)
        synthetic_data = pd.read_csv(synthetic_path)

        sigma = extract_sigma(file_name)
        if sigma is None:
            continue  # Skip files without a valid sigma value

        # synthetic_data = replace_outliers_with_mean(original_data, synthetic_data, sigma)
        synthetic_data = remove_rows_with_outliers(original_data, synthetic_data, sigma)
        # Ensure correct data types (e.g., convert 'age' column to int if needed)
        for column in synthetic_data.columns:
            if column in original_data.columns:
                # Convert the column type in synthetic data to match the original data
                synthetic_data[column] = synthetic_data[column].astype(original_data[column].dtype)


        # print(f"\nProcessing synthetic dataset: {file_name} (sigma: {sigma})")

        # Utility Metric Calculation
        u = UtilityMetricManager()
        utility_metric_list = [
            BasicStatsCalculator(original_data, synthetic_data, original_name=original_name, synthetic_name=file_name),
            # MICalculator(original_data, synthetic_data, original_name=original_name, synthetic_name=file_name),
            # CorrelationCalculator(original_data, synthetic_data, original_name=original_name, synthetic_name=file_name),
            # JSCalculator(original_data, synthetic_data, original_name=original_name, synthetic_name=file_name),
        ]
        u.add_metric(utility_metric_list)
        results_utility = u.evaluate_all()

        # Privacy Metric Calculation
        p = PrivacyMetricManager()
        privacy_metric_list = [
            DCRCalculator(original_data, synthetic_data, original_name=original_name, synthetic_name=file_name),
            # NNDRCalculator(original_data, synthetic_data, original_name=original_name, synthetic_name=file_name),
            # AdversarialAccuracyCalculator_NN(original_data, synthetic_data, original_name=original_name, synthetic_name=file_name),
            # AdversarialAccuracyCalculator(original_data, synthetic_data, original_name=original_name, synthetic_name=file_name),
        ]
        p.add_metric(privacy_metric_list)
        results_privacy = p.evaluate_all()

        # # DiSCO and repU
        adult_keys = ['age', 'workclass', 'education', 'marital-status',
                      'occupation', 'relationship', 'race', 'gender', 'capital-gain',
                      'capital-loss', 'hours-per-week', 'native-country']
        adult_target = 'income'  

        calc = DisclosureCalculator(original_data, synthetic_data, keys=adult_keys, target=adult_target)
        repU, DiSCO = calc.evaluate()

        # Aggregate metrics
        for key, value in {**results_utility, **results_privacy}.items():
            aggregated_results[sigma][key].append(value)


        aggregated_results[sigma]["Disclosure_repU"].append(repU)
        aggregated_results[sigma]["Disclosure_DiSCO"].append(DiSCO)



Under 0.25, removed 3652 rows with outliers.
Synthetic and original data checked with synorig.compare,
 looks like no adjustment needed


-------------------Synthesis 1--------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  syndata[j][col] = syndata[j][col].astype('category')


Table for target income from GT alone with keys has 2 rows and 41439 columns.
Table for target income from GT & SD with all key combinations has 2 rows and 69852 columns.
IDENTITY: 
         UiO        UiS     UiOiS      repU
0  77.122149  79.491413  1.791491  1.326727
ATTRIBUTES: 
       Dorig      Dsyn        iS       DiS     DiSCO    DiSDiO  max_denom  \
0  92.070349  89.04915  8.177388  6.146349  4.127595  2.598174       20.0   

   mean_denom  
0     1.99604  
~~~~~~~~~~~~~~~ Done ~~~~~~~~~~~~~~~
Under 0.01, removed 378 rows with outliers.
Synthetic and original data checked with synorig.compare,
 looks like no adjustment needed


-------------------Synthesis 1--------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  syndata[j][col] = syndata[j][col].astype('category')


Table for target income from GT alone with keys has 2 rows and 41439 columns.
Table for target income from GT & SD with all key combinations has 2 rows and 62762 columns.
IDENTITY: 
         UiO        UiS     UiOiS      repU
0  77.122149  54.394395  5.661111  3.509275
ATTRIBUTES: 
       Dorig       Dsyn         iS        DiS     DiSCO    DiSDiO  max_denom  \
0  92.070349  75.876982  20.019655  13.166946  10.88817  9.827607       19.0   

   mean_denom  
0    1.713825  
~~~~~~~~~~~~~~~ Done ~~~~~~~~~~~~~~~
Under 0.05, removed 1767 rows with outliers.
Synthetic and original data checked with synorig.compare,
 looks like no adjustment needed


-------------------Synthesis 1--------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  syndata[j][col] = syndata[j][col].astype('category')


Table for target income from GT alone with keys has 2 rows and 41439 columns.
Table for target income from GT & SD with all key combinations has 2 rows and 70531 columns.
IDENTITY: 
         UiO        UiS     UiOiS      repU
0  77.122149  67.063733  0.014332  0.010237
ATTRIBUTES: 
       Dorig       Dsyn        iS       DiS    DiSCO   DiSDiO  max_denom  \
0  92.070349  80.029582  0.014332  0.010237  0.00819  0.00819        1.0   

   mean_denom  
0         1.0  
~~~~~~~~~~~~~~~ Done ~~~~~~~~~~~~~~~
Under 0.1, removed 2674 rows with outliers.
Synthetic and original data checked with synorig.compare,
 looks like no adjustment needed


-------------------Synthesis 1--------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  syndata[j][col] = syndata[j][col].astype('category')


Table for target income from GT alone with keys has 2 rows and 41439 columns.
Table for target income from GT & SD with all key combinations has 2 rows and 72021 columns.
IDENTITY: 
         UiO        UiS     UiOiS      repU
0  77.122149  75.190882  0.004095  0.002047
ATTRIBUTES: 
       Dorig       Dsyn        iS       DiS     DiSCO    DiSDiO  max_denom  \
0  92.070349  86.391101  0.004095  0.004095  0.004095  0.004095        1.0   

   mean_denom  
0         1.0  
~~~~~~~~~~~~~~~ Done ~~~~~~~~~~~~~~~
Under 0.15, removed 3086 rows with outliers.
Synthetic and original data checked with synorig.compare,
 looks like no adjustment needed


-------------------Synthesis 1--------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  syndata[j][col] = syndata[j][col].astype('category')


Table for target income from GT alone with keys has 2 rows and 41439 columns.
Table for target income from GT & SD with all key combinations has 2 rows and 72216 columns.
IDENTITY: 
         UiO        UiS     UiOiS      repU
0  77.122149  78.038821  0.004095  0.004095
ATTRIBUTES: 
       Dorig       Dsyn        iS       DiS    DiSCO    DiSDiO  max_denom  \
0  92.070349  87.934217  0.010237  0.010237  0.00819  0.004095        2.0   

   mean_denom  
0    1.333333  
~~~~~~~~~~~~~~~ Done ~~~~~~~~~~~~~~~
Under 0.2, removed 3332 rows with outliers.
Synthetic and original data checked with synorig.compare,
 looks like no adjustment needed


-------------------Synthesis 1--------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  syndata[j][col] = syndata[j][col].astype('category')


Table for target income from GT alone with keys has 2 rows and 41439 columns.
Table for target income from GT & SD with all key combinations has 2 rows and 70086 columns.
IDENTITY: 
         UiO        UiS     UiOiS      repU
0  77.122149  78.663144  1.981901  1.510995
ATTRIBUTES: 
       Dorig       Dsyn        iS      DiS    DiSCO    DiSDiO  max_denom  \
0  92.070349  88.827339  8.738381  6.51898  4.44085  3.073175       18.0   

   mean_denom  
0    1.889373  
~~~~~~~~~~~~~~~ Done ~~~~~~~~~~~~~~~
Under 0.3, removed 3980 rows with outliers.
Synthetic and original data checked with synorig.compare,
 looks like no adjustment needed


-------------------Synthesis 1--------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  syndata[j][col] = syndata[j][col].astype('category')


Table for target income from GT alone with keys has 2 rows and 41439 columns.
Table for target income from GT & SD with all key combinations has 2 rows and 71420 columns.
IDENTITY: 
         UiO        UiS    UiOiS      repU
0  77.122149  81.093589  0.00819  0.006142
ATTRIBUTES: 
       Dorig       Dsyn       iS       DiS     DiSCO    DiSDiO  max_denom  \
0  92.070349  90.871039  0.00819  0.006142  0.004095  0.004095        1.0   

   mean_denom  
0         1.0  
~~~~~~~~~~~~~~~ Done ~~~~~~~~~~~~~~~
Under 0.35, removed 4387 rows with outliers.
Synthetic and original data checked with synorig.compare,
 looks like no adjustment needed


-------------------Synthesis 1--------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  syndata[j][col] = syndata[j][col].astype('category')


Table for target income from GT alone with keys has 2 rows and 41439 columns.
Table for target income from GT & SD with all key combinations has 2 rows and 70999 columns.
IDENTITY: 
         UiO       UiS     UiOiS      repU
0  77.122149  83.02219  0.004095  0.004095
ATTRIBUTES: 
       Dorig      Dsyn        iS       DiS     DiSCO    DiSDiO  max_denom  \
0  92.070349  91.69171  0.004095  0.004095  0.002047  0.002047        1.0   

   mean_denom  
0         1.0  
~~~~~~~~~~~~~~~ Done ~~~~~~~~~~~~~~~
Under 0.02, removed 772 rows with outliers.
Synthetic and original data checked with synorig.compare,
 looks like no adjustment needed


-------------------Synthesis 1--------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  syndata[j][col] = syndata[j][col].astype('category')


Table for target income from GT alone with keys has 2 rows and 41439 columns.
Table for target income from GT & SD with all key combinations has 2 rows and 64700 columns.
IDENTITY: 
         UiO        UiS     UiOiS      repU
0  77.122149  59.125189  4.962942  3.101839
ATTRIBUTES: 
       Dorig      Dsyn         iS        DiS     DiSCO    DiSDiO  max_denom  \
0  92.070349  75.68832  17.661029  10.982351  8.775234  7.884608       19.0   

   mean_denom  
0    1.728226  
~~~~~~~~~~~~~~~ Done ~~~~~~~~~~~~~~~
Under 0.03, removed 1141 rows with outliers.
Synthetic and original data checked with synorig.compare,
 looks like no adjustment needed


-------------------Synthesis 1--------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  syndata[j][col] = syndata[j][col].astype('category')


Table for target income from GT alone with keys has 2 rows and 41439 columns.
Table for target income from GT & SD with all key combinations has 2 rows and 69624 columns.
IDENTITY: 
         UiO        UiS     UiOiS      repU
0  77.122149  62.595657  0.022522  0.014332
ATTRIBUTES: 
       Dorig       Dsyn        iS       DiS     DiSCO    DiSDiO  max_denom  \
0  92.070349  77.474147  0.022522  0.020474  0.016379  0.016379        1.0   

   mean_denom  
0         1.0  
~~~~~~~~~~~~~~~ Done ~~~~~~~~~~~~~~~
Under 0.04, removed 1412 rows with outliers.
Synthetic and original data checked with synorig.compare,
 looks like no adjustment needed


-------------------Synthesis 1--------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  syndata[j][col] = syndata[j][col].astype('category')


Table for target income from GT alone with keys has 2 rows and 41439 columns.
Table for target income from GT & SD with all key combinations has 2 rows and 70220 columns.
IDENTITY: 
         UiO        UiS     UiOiS      repU
0  77.122149  65.179318  0.006142  0.006142
ATTRIBUTES: 
       Dorig      Dsyn        iS       DiS     DiSCO    DiSDiO  max_denom  \
0  92.070349  78.70138  0.020474  0.020474  0.010237  0.010237        2.0   

   mean_denom  
0        1.25  
~~~~~~~~~~~~~~~ Done ~~~~~~~~~~~~~~~
Under 0, removed 36 rows with outliers.
Synthetic and original data checked with synorig.compare,
 looks like no adjustment needed


-------------------Synthesis 1--------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  syndata[j][col] = syndata[j][col].astype('category')


Table for target income from GT alone with keys has 2 rows and 41439 columns.
Table for target income from GT & SD with all key combinations has 2 rows and 66497 columns.
IDENTITY: 
         UiO        UiS     UiOiS      repU
0  77.122149  38.194264  0.020474  0.014332
ATTRIBUTES: 
       Dorig       Dsyn        iS       DiS     DiSCO    DiSDiO  max_denom  \
0  92.070349  71.653023  0.020474  0.016379  0.016379  0.016379        1.0   

   mean_denom  
0         1.0  
~~~~~~~~~~~~~~~ Done ~~~~~~~~~~~~~~~


In [19]:
import numpy as np
# Calculate average metrics for each sigma
averaged_results = defaultdict(dict)
for sigma, metrics in aggregated_results.items():
    for metric, values in metrics.items():
        if isinstance(values[0], dict):
            averaged_results[sigma][metric] = {k: np.mean([v[k] for v in values]) for k in values[0]}
        else:
            averaged_results[sigma][metric] = np.mean(values)

In [20]:
# Clean up averaged_results to extract metric names without file details
cleaned_results = defaultdict(dict)

for sigma, metrics in averaged_results.items():
    for metric, value in metrics.items():
        # Extract only the metric name, ignoring dataset/file info
        metric_name = metric.split("(")[0]
        if metric_name not in cleaned_results[sigma]:
            cleaned_results[sigma][metric_name] = []
        cleaned_results[sigma][metric_name].append(value)

# Aggregate and calculate average values for cleaned results
final_results = defaultdict(dict)
for sigma, metrics in cleaned_results.items():
    for metric, values in metrics.items():
        if isinstance(values[0], dict):  # If metric value is a dictionary
            # Average each key in the dictionary
            final_results[sigma][metric] = {k: sum(v[k] for v in values) / len(values) for k in values[0]}
        else:  # If metric value is a list of scalars
            final_results[sigma][metric] = sum(values) / len(values)

# Prepare data for plotting
data = {
    "sigma": sorted(final_results.keys()),
    "DCR": [final_results[sigma].get("DCRCalculator", None) for sigma in sorted(final_results.keys())],
    # "NNDR": [final_results[sigma].get("NNDRCalculator", None) for sigma in sorted(final_results.keys())],
    # "AdversarialAccuracy": [final_results[sigma].get("AdversarialAccuracyCalculator", None) for sigma in sorted(final_results.keys())],
    "NNAA": [final_results[sigma].get("AdversarialAccuracyCalculator_NN", None) for sigma in sorted(final_results.keys())],
    "Correlation": [final_results[sigma].get("CorrelationCalculator", None) for sigma in sorted(final_results.keys())],
    "JS": [final_results[sigma].get("JSCalculator", None) for sigma in sorted(final_results.keys())],
    "repU": [final_results[sigma].get("Disclosure_repU", None) for sigma in sorted(final_results.keys())],
    "DiSCO": [final_results[sigma].get("Disclosure_DiSCO", None) for sigma in sorted(final_results.keys())],
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Verify DataFrame content
print(df)


    sigma       DCR  NNAA Correlation    JS      repU      DiSCO
0    0.00  0.186265  None        None  None  0.014332   0.016379
1    0.01  0.075491  None        None  None  3.509275  10.888170
2    0.02  0.092569  None        None  None  3.101839   8.775234
3    0.03  0.218704  None        None  None  0.014332   0.016379
4    0.04  0.254419  None        None  None  0.006142   0.010237
5    0.05  0.235289  None        None  None  0.010237   0.008190
6    0.10  0.303368  None        None  None  0.002047   0.004095
7    0.15  0.351083  None        None  None  0.004095   0.008190
8    0.20  0.206036  None        None  None  1.510995   4.440850
9    0.25  0.211020  None        None  None  1.326727   4.127595
10   0.30  0.293553  None        None  None  0.006142   0.004095
11   0.35  0.328640  None        None  None  0.004095   0.002047


In [None]:
basic_stats_data = {
    "sigma": [],
    "mean": [],
    "median": [],
    "var": []
}

for sigma, metrics in final_results.items():
    if "BasicStatsCalculator" in metrics:
        basic_stats = metrics["BasicStatsCalculator"]
        basic_stats_data["sigma"].append(sigma)
        basic_stats_data["mean"].append(basic_stats["mean"])
        basic_stats_data["median"].append(basic_stats["median"])
        basic_stats_data["var"].append(basic_stats["var"])

basic_stats_df = pd.DataFrame(basic_stats_data)

print(basic_stats_df)


In [None]:
import matplotlib.pyplot as plt

datasets = df["sigma"]

DCR = df["DCR"]
adversarial_accuracy = df["NNAA"]

correlation = df["Correlation"]
JS = df["JS"]

fig, ax1 = plt.subplots(figsize=(14, 8))

ax1.plot(datasets, DCR, marker='o', color='red', label="DCR (Privacy)", linestyle='-')
ax1.set_xlabel("Sigma Values")
ax1.set_ylabel("Privacy Metrics", color='red')
ax1.tick_params(axis='y', labelcolor='red')

ax2 = ax1.twinx()
ax2.plot(datasets, correlation, marker='^', color='blue', label="Correlation (Utility)", linestyle='-')
ax2.plot(datasets, JS, marker='x', color='green', label="JS (Utility)", linestyle='--')
ax2.plot(datasets, adversarial_accuracy, marker='s', color='orange', label="NNAA (Privacy)", linestyle='--')
ax2.set_ylabel("Utility Metrics", color='blue')
ax2.tick_params(axis='y', labelcolor='blue')

lines, labels = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(lines + lines2, labels + labels2, loc='upper right')

plt.title("Privacy vs Utility Metrics across Sigma Values")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


## Comparison

In [None]:
import os
import re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def extract_sigma(file_name):
    match = re.search(r"sigma-(\d+)", file_name)
    if match:
        sigma_str = match.group(1)
        if len(sigma_str) == 1:  # Single digit like "1" -> "1"
            return int(sigma_str)
        elif len(sigma_str) == 2:  # Two digits like "05" -> "0.5"
            return int(sigma_str) / 10
        elif len(sigma_str) == 3:  # Three digits like "005" -> "0.05"
            return int(sigma_str) / 100
    return None



def plot_comparison_for_each_column(original_data, synthetic_folder):
    """
    For each column, plot synthetic data with different sigma values against original data.
    """
    synthetic_files = [
        (file_name, extract_sigma(file_name))
        for file_name in os.listdir(synthetic_folder)
        if file_name.endswith(".csv") and extract_sigma(file_name) is not None
    ]

    # Sort files by sigma
    synthetic_files = sorted(synthetic_files, key=lambda x: x[1])

    # Iterate over each column
    for column in original_data.columns:
        # Adjust the number of rows and columns in the subplot grid
        num_files = len(synthetic_files)
        ncols = 3
        nrows = (num_files + ncols - 1) // ncols  # Ceiling division for rows

        fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(10, 2 * nrows))

        # Flatten axes array for easy indexing
        axes = axes.flatten()

        for i, (file_name, sigma) in enumerate(synthetic_files):
            synthetic_path = os.path.join(synthetic_folder, file_name)
            synthetic_data = pd.read_csv(synthetic_path)
            # synthetic_data = replace_outliers_with_mean(original_data, synthetic_data, sigma)
            synthetic_data = remove_rows_with_outliers(original_data, synthetic_data, sigma)
            # Plot original and synthetic distributions
            sns.kdeplot(original_data[column], ax=axes[i], label='Original', color='blue')
            if column in synthetic_data.columns:
                sns.kdeplot(synthetic_data[column], ax=axes[i], label=f'Sigma {sigma}', color='red', alpha=0.3)

            axes[i].set_title(f'Sigma {sigma}', fontsize=6)
            axes[i].legend(fontsize=5)
            axes[i].tick_params(axis='both', which='major', labelsize=5)

        # Hide unused subplots
        for j in range(len(synthetic_files), len(axes)):
            axes[j].axis('off')

        fig.suptitle(f'Distribution of {column}', fontsize=14)
        plt.tight_layout()
        plt.show()



# Load original dataset
original_data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Tabula-main-claude/Real_Datasets/Adult.csv")

# Path to synthetic datasets folder
synthetic_folder = "/content/drive/MyDrive/Colab Notebooks/Tabula-main-claude/adult_syn_data"

# Generate plots
plot_comparison_for_each_column(original_data, synthetic_folder)
