In [None]:
import warnings
from pathlib import Path

import pandas as pd

from synthius.optimization import NSGAIISamplerHPOptimizer
from synthius.metric.utils import utils

# Install a blanket "ignore" filter: every warning category from every module
# is suppressed from this point on in the kernel session.
# NOTE(review): presumably done to keep the notebook output uncluttered, but it
# also hides deprecation and numerical warnings; consider narrowing the filter
# with the `category=` / `module=` arguments if problems are hard to diagnose.
warnings.filterwarnings("ignore")

In [None]:
# ---------------------------------------------------------------------------
# User configuration: every value below is a placeholder and must be edited.
# ---------------------------------------------------------------------------

# TODO: Change this to the path of the training dataset (a CSV file).
train_data = Path("PATH_TO_TRAIN_DATASET_AS_CSV")

# TODO: Change this to the ID column if one exists; keep None otherwise.
ID = None

# TODO: Change this to the name of the target column.
TARGET = "TARGET_COLUMN"

# TODO: Change these to the positive / negative labels of the target column,
# for example ">50K" and "<=50K".
# If it's a binary classification problem, use TRUE without quotation marks
# (presumably this refers to targets stored as booleans rather than strings;
# confirm against your dataset).
POS_LABEL = "POSITIVE_LABEL"
NEG_LABEL = "NEGATIVE_LABEL"

### Modify the key fields, sensitive fields, and auxiliary columns to match your data. The cell below shows an example of how they should be specified.

In [3]:
# Example column selections; replace them with column names from your own
# dataset. (The names below look like the Adult census income dataset;
# treat them purely as an illustration.)

# Passed to the optimizer as `sensitive_fields`.
sensitive_fields = [
    "Race",
    "Sex",
]

# Passed to the optimizer as `key_fields`.
key_fields = ["Age", "Education", "Occupation", "Income"]
key_fields += ["Marital-status", "Native-country", "Relationship"]

# Passed to the optimizer as `linkability_aux_cols`: two groups of column names.
aux_cols = [
    [
        "Occupation",
        "Education",
        "Education-num",
        "Hours-per-week",
        "Capital-loss",
        "Capital-gain",
    ],
    [
        "Race",
        "Sex",
        "Fnlwgt",
        "Age",
        "Native-country",
        "Workclass",
        "Marital-status",
        "Relationship",
    ],
]

In [None]:
# Take the column labels from the training CSV *after* it has gone through
# `utils.clean_columns`, so the optimizer is given the cleaned column names
# rather than the raw header of the file.
inference_all_columns = (
    pd.read_csv(train_data)
    .pipe(utils.clean_columns)
    .columns
)

# Keyword arguments grouped by the prefix they share, to make the long
# constructor call easier to scan and tweak.
singling_out_settings = {
    "singlingout_mode": "multivariate",
    "singlingout_n_attacks": 4_000,
    "singlingout_n_cols": 7,
}
linkability_settings = {
    "linkability_aux_cols": aux_cols,
    "linkability_n_neighbors": 500,
    "linkability_n_attacks": None,
}
inference_settings = {
    "inference_all_columns": inference_all_columns,
    "inference_use_custom_model": True,
    "inference_sample_attacks": False,
    "inference_n_attacks": None,
}

# Build the optimizer with the three selected metrics and the column
# selections defined in the earlier cells.
optimizer = NSGAIISamplerHPOptimizer(
    selected_metrics=["CS Test", "CategoricalNB", "Overall Quality"],
    key_fields=key_fields,
    sensitive_fields=sensitive_fields,
    distance_scaler="MinMaxScaler",
    **singling_out_settings,
    **linkability_settings,
    **inference_settings,
)

# Run the optimization on the training data and keep the returned trial.
best_trial = optimizer.run_synthetic_pipeline(
    # Input data and the roles of its columns.
    real_data_path=train_data,
    label_column=TARGET,
    id_column=ID,
    positive_condition_value=POS_LABEL,
    negative_condition_value=NEG_LABEL,
    # Search budget.
    n_trials=20,
    # Change it if you want to save the results in a different directory.
    output_path="results",
)

In [None]:
# Ask the optimizer (configured and run in the previous cell) to evaluate the
# metrics of its best model; `result` is reused by the save cell below.
# NOTE(review): requires `run_synthetic_pipeline` to have been executed first,
# since `optimizer` is created in that cell.
result = optimizer.evaluate_best_model_metrics()
# Render the `all_results` attribute with the notebook's rich display.
# `display` is provided by IPython/Jupyter; it is not imported in this file.
display(result.all_results)

In [None]:
# Persist the evaluation results to "res.pkl", a relative path that resolves
# against the current working directory of the kernel.
# NOTE(review): the ".pkl" extension suggests a pickle file; confirm the format
# in the `save_results` implementation.
result.save_results(Path("res.pkl"))