In [1]:
import warnings
from pathlib import Path

import pandas as pd

from synthius.metric.utils import utils
from synthius.utilities import MetricsAggregator

# Install a blanket "ignore" filter: no message/category restriction is given,
# so warnings raised by the cells below are suppressed rather than shown.
warnings.filterwarnings("ignore")

In [2]:
# --- Locations ---------------------------------------------------------------
# TODO: Replace every placeholder string below with a real path before running.
train_data = Path("PATH_TO_TRAIN_DATASET_AS_CSV")  # training dataset (CSV file)
test_data = Path("PATH_TO_TEST_DATASET_AS_CSV")  # test dataset (CSV file)
synt_path = Path("PATH_TO_SYNTHETIC_DATA_DIRECTORY")  # directory holding the synthetic data
models_path = Path("PATH_TO_MODELS_DIRECTORY")  # models directory
RESULTS_PATH = Path("PATH_TO_RESULTS_DIRECTORY")  # results directory

# One "<name>.csv" path inside `synt_path` per listed name; edit the names to
# match the synthetic datasets you want to evaluate.
synthetic_data_paths = [
    synt_path / f"{model_name}.csv"
    for model_name in (
        "ARF",
        "CopulaGAN",
        "CTGAN",
        "GaussianCopula",
        "GaussianMultivariate",
        "TVAE",
        "WGAN",
    )
]

# --- Columns -----------------------------------------------------------------
# TODO: Set the target column name.
TARGET = "TARGET_COLUMN"

# TODO: Set the positive label.
# For a binary classification problem, use the Python boolean `True`
# (written without quotation marks).
POS_LABEL = "POSITIVE_LABEL"

# TODO: Set the ID column name if the data has one; leave as None otherwise.
ID = None

### Modify the key fields, sensitive fields, and auxiliary columns to match your data. The cell below is an example of how to define them.

In [3]:
# Example column configuration; replace the names with columns from your own data.

# Passed to MetricsAggregator as `key_fields`.
key_fields = ["Age", "Education", "Occupation", "Income", "Marital-status", "Native-country", "Relationship"]

# Passed to MetricsAggregator as `sensitive_fields`.
sensitive_fields = [
    "Race",
    "Sex",
]

# Two groups of column names, passed to MetricsAggregator as `linkability_aux_cols`.
aux_cols = [
    [
        "Occupation",
        "Education",
        "Education-num",
        "Hours-per-week",
        "Capital-loss",
        "Capital-gain",
    ],
    [
        "Race",
        "Sex",
        "Fnlwgt",
        "Age",
        "Native-country",
        "Workclass",
        "Marital-status",
        "Relationship",
    ],
]

In [None]:
# Take the column names from the test set *after* `utils.clean_columns`, so the
# aggregator receives the cleaned names rather than the raw CSV headers.
inference_all_columns = utils.clean_columns(pd.read_csv(test_data)).columns

# Build the aggregator. Keyword arguments are grouped by role / name prefix;
# all values come from the configuration cells above.
metrics_result = MetricsAggregator(
    # Datasets: training data, synthetic candidates, and the test set as control data.
    real_data_path=train_data,
    synthetic_data_paths=synthetic_data_paths,
    control_data=test_data,
    # Column roles.
    id_column=ID,
    label_column=TARGET,
    pos_label=POS_LABEL,
    key_fields=key_fields,
    sensitive_fields=sensitive_fields,
    # `distance_*` options.
    distance_scaler="MinMaxScaler",
    # `singlingout_*` options.
    singlingout_mode="multivariate",
    singlingout_n_attacks=6_000,
    singlingout_n_cols=7,
    # `linkability_*` options.
    linkability_aux_cols=aux_cols,
    linkability_n_neighbors=500,
    linkability_n_attacks=None,
    # `inference_*` options.
    inference_all_columns=inference_all_columns,
    inference_use_custom_model=True,
    inference_sample_attacks=False,
    inference_n_attacks=None,
    # `utility_*` options; note the test set is reused here.
    utility_test_path=test_data,
    utility_models_path=models_path,
    # Remaining flags.
    want_parallel=False,
    need_split=False,
)

## Choose the evaluation method

The `MetricsAggregator` class provides three distinct modes for evaluating metrics, depending on your use case. Each mode is explained below with an example:

### 1. Running Metrics for Synthetic Models Only

This mode calculates metrics exclusively for synthetic models, without involving the original dataset. Use this when you want to evaluate the performance or properties of your synthetic data independently.

```
metrics_result.run_metrics_for_models()
display(metrics_result.all_results)
```

### 2. Running Metrics for the Original Dataset Only

This mode calculates metrics for the original dataset by splitting the training dataset into two equal parts (a 50-50 split). It is useful for benchmarking or validating your metrics.

```
metrics_result.run_metrics_for_original()
display(metrics_result.all_results)
```


### 3. Running Metrics for Both Synthetic Models and the Original Dataset

This mode evaluates metrics for both synthetic models and the original dataset.
```
metrics_result.run_all_with_original()
display(metrics_result.all_results)
```

### Update Existing Results with Original Dataset Values

If you want to update the results for synthetic models with the original dataset results without re-running all the metrics, follow these steps:

```
# Load the current results
metrics_result = MetricsAggregator.load_results(Path("res.pkl"))

# Run the calculation for the original dataset
metrics_result.run_metrics_for_original()

# Update the utility metric to include the original dataset results
metrics_result.run_or_update_metric("Utility")

# Display the updated results
display(metrics_result.all_results)
```



In [None]:
# Mode 3 from the section above: evaluate metrics for both the synthetic models
# and the original dataset.
metrics_result.run_all_with_original()

# Show what the aggregator collected in `all_results`.
combined_results = metrics_result.all_results
display(combined_results)

In [None]:
# NOTE(review): this cell repeats the aggregator construction from earlier in
# the notebook with identical arguments. Running it rebinds `metrics_result`
# to a fresh instance, so the instance on which `run_all_with_original()` was
# called above is no longer reachable through this name.

# Column names of the test set after `utils.clean_columns`.
inference_all_columns = utils.clean_columns(pd.read_csv(test_data)).columns

metrics_result = MetricsAggregator(
    # Datasets: training data, synthetic candidates, and the test set as control data.
    real_data_path=train_data,
    synthetic_data_paths=synthetic_data_paths,
    control_data=test_data,
    # Column roles.
    id_column=ID,
    label_column=TARGET,
    pos_label=POS_LABEL,
    key_fields=key_fields,
    sensitive_fields=sensitive_fields,
    # `distance_*` options.
    distance_scaler="MinMaxScaler",
    # `singlingout_*` options.
    singlingout_mode="multivariate",
    singlingout_n_attacks=6_000,
    singlingout_n_cols=7,
    # `linkability_*` options.
    linkability_aux_cols=aux_cols,
    linkability_n_neighbors=500,
    linkability_n_attacks=None,
    # `inference_*` options.
    inference_all_columns=inference_all_columns,
    inference_use_custom_model=True,
    inference_sample_attacks=False,
    inference_n_attacks=None,
    # `utility_*` options; note the test set is reused here.
    utility_test_path=test_data,
    utility_models_path=models_path,
    # Remaining flags.
    want_parallel=False,
    need_split=False,
)