In [None]:
import warnings
from pathlib import Path

import pandas as pd
from sdv.metadata import SingleTableMetadata
from sdv.sampling import Condition
from sdv.single_table import CopulaGANSynthesizer, CTGANSynthesizer, GaussianCopulaSynthesizer, TVAESynthesizer
from sklearn.model_selection import train_test_split

# NOTE: this silences *every* warning for the rest of the notebook, including
# deprecation and convergence warnings raised by the libraries used below.
warnings.filterwarnings("ignore")


# --------------------- Configuration ---------------------#
data_path = Path("PATH_TO_ORIGINAL_DATA")  # TODO: Change this to the path of the original data (a CSV file)
synt_path = Path("PATH_TO_SYNTHETIC_DATA_DIRECTORY")  # TODO: Change this to the path of the synthetic data directory

LABEL = "TARGET_COLUMN"  # TODO: Change this to the target column
ID = None  # TODO: Change this to the ID column if exists

# Every model cell below writes its CSV into `synt_path`; create it up front so
# those writes do not fail on a fresh checkout where the directory is missing.
synt_path.mkdir(parents=True, exist_ok=True)


# --------------------- Load & split ---------------------#
data = pd.read_csv(data_path, low_memory=False)

# 80/20 split, stratified on the label so both parts keep the label distribution.
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
    stratify=data[LABEL],
)


# `data_path` points at the CSV file itself (it is passed to `pd.read_csv`), so
# the splits are written next to that file rather than "inside" it.
split_dir = data_path.parent
train_data.to_csv(split_dir / "train.csv", index=False)
test_data.to_csv(split_dir / "test.csv", index=False)

In [None]:
# Build the SDV single-table metadata by auto-detecting it from the full
# dataset (`data`, i.e. before the train/test split).
# NOTE(review): the `ID` column configured above is not registered here —
# confirm whether it should be declared as the primary key.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data)
# Dictionary form of the detected metadata; handy for inspecting what was
# detected (it is not referenced by the other cells shown here).
metadata_dict = metadata.to_dict()

### Choose between binary or non-binary target

In [None]:
# Row count of the training split; the non-conditional models further down
# are asked to generate exactly this many synthetic rows.
total_samples = len(train_data)

# ----- Option A: non-binary target -----
# Look up how many training rows carry each of the two label values, and request
# the same number of synthetic rows per value (a value that is absent counts as 0).
category_counts = train_data[LABEL].value_counts()
target_a = category_counts.get("TRUE_CONDITION_VALUE", 0)
target_b = category_counts.get("FALSE_CONDITION_VALUE", 0)

true_condition = Condition(column_values={LABEL: "TRUE_CONDITION_VALUE"}, num_rows=target_a)
false_condition = Condition(column_values={LABEL: "FALSE_CONDITION_VALUE"}, num_rows=target_b)

# ----- Option B: binary (True/False) target -----
# If the target column is binary, comment out Option A above and use this instead:
# true_samples = train_data[LABEL].sum()
# false_samples = total_samples - true_samples
# true_condition = Condition(num_rows=true_samples, column_values={LABEL: True})
# false_condition = Condition(num_rows=false_samples, column_values={LABEL: False})

### CopulaGAN

In [None]:
# Fit CopulaGAN on the training split, then sample the requested number of rows
# for each label condition and store the result as a CSV.
copula_gan_synthesizer = CopulaGANSynthesizer(metadata=metadata)
copula_gan_synthesizer.fit(data=train_data)

copula_gan_synthetic_data = copula_gan_synthesizer.sample_from_conditions(
    conditions=[true_condition, false_condition],
)
copula_gan_synthetic_data.to_csv(synt_path / "CopulaGAN.csv", index=False)

### CTGAN

In [None]:
# Fit CTGAN on the training split, then sample the requested number of rows
# for each label condition and store the result as a CSV.
ctgan_synthesizer = CTGANSynthesizer(metadata=metadata)
ctgan_synthesizer.fit(data=train_data)

ctgan_synthetic_data = ctgan_synthesizer.sample_from_conditions(
    conditions=[true_condition, false_condition],
)
ctgan_synthetic_data.to_csv(synt_path / "CTGAN.csv", index=False)

### Gaussian Copula

In [None]:
# Fit the Gaussian Copula model on the training split, then sample the requested
# number of rows for each label condition and store the result as a CSV.
gaussian_copula_synthesizer = GaussianCopulaSynthesizer(metadata=metadata)
gaussian_copula_synthesizer.fit(data=train_data)

gaussian_copula_synthetic_data = gaussian_copula_synthesizer.sample_from_conditions(
    conditions=[true_condition, false_condition],
)
gaussian_copula_synthetic_data.to_csv(
    synt_path / "GaussianCopula.csv",
    index=False,
)

### TVAE

In [None]:
# Fit TVAE on the training split, then sample the requested number of rows
# for each label condition and store the result as a CSV.
tvae_synthesizer = TVAESynthesizer(metadata=metadata)
tvae_synthesizer.fit(data=train_data)

tvae_synthetic_data = tvae_synthesizer.sample_from_conditions(
    conditions=[true_condition, false_condition],
)
tvae_synthetic_data.to_csv(synt_path / "TVAE.csv", index=False)

### Gaussian Multivariate

In [None]:
from synthius.model import GaussianMultivariateSynthesizer

# The synthesizer is constructed from the training split and the synthetic-data
# directory, then asked for as many rows as the training split contains.
# NOTE(review): no CSV is written explicitly in this cell — presumably the
# synthesizer saves its output under `synt_path` itself; confirm in synthius.
gaussian_multivariate_synthesizer = GaussianMultivariateSynthesizer(train_data, synt_path)
gaussian_multivariate_synthesizer.synthesize(num_sample=total_samples)

### WGAN

Depending on the size and complexity of the data, the hyperparameters (HP) below may need to be adjusted.

In [None]:
from synthius.data import DataImputationPreprocessor
from synthius.model import WGAN

# --------------------- Encoding ---------------------#
# The WGAN is trained on the preprocessor's transformed view of the training
# split; the same preprocessor object is reused at the end to map samples back.
data_preprocessor = DataImputationPreprocessor(train_data)
processed_train_data = data_preprocessor.fit_transform()

# --------------------- Training ---------------------#
# One input feature per column of the transformed frame.
n_features = len(processed_train_data.columns)
wgan_imputer = WGAN(
    n_features=n_features,
    base_nodes=128,
    batch_size=512,
    critic_iters=5,
    lambda_gp=10.0,
    num_epochs=100_000,
)
wgan_imputer.train(processed_train_data, log_interval=5_000, log_training=True)

# --------------------- Sampling ---------------------#
# Generate as many rows as the training split and label them with the
# transformed frame's column names.
wgan_synthetic_samples = wgan_imputer.generate_samples(total_samples)
wgan_synthetic_data = pd.DataFrame(wgan_synthetic_samples, columns=processed_train_data.columns)

# --------------------- Decoding ---------------------#
decoded_wgan_synthetic_data = data_preprocessor.inverse_transform(wgan_synthetic_data)

# --------------------- Saving ---------------------#
decoded_wgan_synthetic_data.to_csv(synt_path / "WGAN.csv", index=False)

### ARF

In [None]:
from synthius.model import ARF

# Configure the ARF model on the training split; `ID` (None unless set in the
# first cell) is passed through as the id column.
model = ARF(
    x=train_data,
    id_column=ID,
    min_node_size=5,
    num_trees=50,
    max_features=0.3,
)

# `forde()` is called before `forge()`; its return value is kept for inspection
# but is not used again in this cell.
forde = model.forde()

# Generate as many synthetic rows as the training split and save them.
synthetic_data_arf = model.forge(n=total_samples)
synthetic_data_arf.to_csv(synt_path / "ARF.csv", index=False)