In [125]:
from sdv.evaluation.single_table import run_diagnostic, evaluate_quality, get_column_plot
from sdv.metadata import SingleTableMetadata
import pandas as pd
import os

In [126]:
# Notebook configuration: which column to visualise and where the inputs live.
COLUMN_TO_PLOT = "winner"  # edit this to plot a different column
METADATA_PATH = "../Synthetic Data/metadata.json"  # metadata JSON: loaded if present, otherwise created
REAL_DATA_PATH = "../datasets/ben10_master.csv"  # CSV holding the real (non-synthetic) table

In [127]:
# Load the real table that every synthetic dataset is evaluated against.
real_data = pd.read_csv(REAL_DATA_PATH)

# Reuse the saved metadata file when it exists; otherwise auto-detect the
# metadata from the real data and persist it for later runs.
if os.path.exists(METADATA_PATH):
    metadata = SingleTableMetadata.load_from_json(METADATA_PATH)
    print("Loaded metadata from metadata.json")
else:
    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(
        data=real_data,
    )
    # Make sure the destination folder exists before writing: saving the JSON
    # fails if the parent directory is missing. The dirname is empty when
    # METADATA_PATH is a bare filename, and os.makedirs('') would raise.
    metadata_dir = os.path.dirname(METADATA_PATH)
    if metadata_dir:
        os.makedirs(metadata_dir, exist_ok=True)
    metadata.save_to_json(METADATA_PATH)
    print("Created and saved new metadata to metadata.json")

Loaded metadata from metadata.json


In [128]:
# One entry per synthesizer to evaluate: a display name plus the CSV holding
# the synthetic rows generated by that model. All files share one naming
# pattern, so the entries are derived from the model names.
synthesizers = [
    {
        'name': model_name,
        'output': f'../datasets/ben10_{model_name}_synthetic.csv',
    }
    for model_name in ('CopulaGAN', 'CTGAN', 'GaussianCopula', 'TVAE')
]

In [129]:
# Evaluate each synthesizer's output against the real data.
# Reports are stored per synthesizer (keyed by name) so they remain available
# after the loop; otherwise each iteration would overwrite the previous
# model's results and only the last one could be inspected.
diagnostic_reports = {}
quality_reports = {}

for synth in synthesizers:
    print(f"--- {synth['name']} ---")
    synthetic_data = pd.read_csv(synth['output'])

    diagnostic = run_diagnostic(real_data, synthetic_data, metadata)
    diagnostic_reports[synth['name']] = diagnostic

    quality_report = evaluate_quality(real_data, synthetic_data, metadata)
    quality_reports[synth['name']] = quality_report

    fig = get_column_plot(
        real_data=real_data,
        synthetic_data=synthetic_data,
        metadata=metadata,
        column_name=COLUMN_TO_PLOT
    )
    # Put the synthesizer name in the title so the figures, which all plot
    # the same column, can be told apart.
    fig.update_layout(
        title_text=f"{synth['name']}: real vs. synthetic data for column '{COLUMN_TO_PLOT}'"
    )
    fig.show()

# Side-by-side overall scores, best quality score first.
score_summary = pd.DataFrame(
    {
        'diagnostic_score': {
            name: report.get_score() for name, report in diagnostic_reports.items()
        },
        'quality_score': {
            name: report.get_score() for name, report in quality_reports.items()
        },
    }
).sort_values('quality_score', ascending=False)
score_summary

--- CopulaGAN ---
Generating report ...

(1/2) Evaluating Data Validity: |██████████| 13/13 [00:00<00:00, 1264.55it/s]|
Data Validity Score: 100.0%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 100.16it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 100.0%

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 13/13 [00:00<00:00, 318.79it/s]|
Column Shapes Score: 91.97%

(2/2) Evaluating Column Pair Trends: |██████████| 78/78 [00:00<00:00, 132.35it/s]|
Column Pair Trends Score: 48.89%

Overall Score (Average): 70.43%



--- CTGAN ---
Generating report ...

(1/2) Evaluating Data Validity: |██████████| 13/13 [00:00<00:00, 490.36it/s]|
Data Validity Score: 100.0%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 331.67it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 100.0%

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 13/13 [00:00<00:00, 359.20it/s]|
Column Shapes Score: 94.09%

(2/2) Evaluating Column Pair Trends: |██████████| 78/78 [00:00<00:00, 114.44it/s]|
Column Pair Trends Score: 68.94%

Overall Score (Average): 81.52%



--- GaussianCopula ---
Generating report ...

(1/2) Evaluating Data Validity: |██████████| 13/13 [00:00<00:00, 810.34it/s]|
Data Validity Score: 100.0%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 458.14it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 100.0%

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 13/13 [00:00<00:00, 375.15it/s]|
Column Shapes Score: 97.45%

(2/2) Evaluating Column Pair Trends: |██████████| 78/78 [00:00<00:00, 106.25it/s]|
Column Pair Trends Score: 34.94%

Overall Score (Average): 66.2%



--- TVAE ---
Generating report ...

(1/2) Evaluating Data Validity: |██████████| 13/13 [00:00<00:00, 585.49it/s]|
Data Validity Score: 100.0%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 195.41it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 100.0%

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 13/13 [00:00<00:00, 251.53it/s]|
Column Shapes Score: 73.68%

(2/2) Evaluating Column Pair Trends: |██████████| 78/78 [00:00<00:00, 81.71it/s]|
Column Pair Trends Score: 49.1%

Overall Score (Average): 61.39%

