In [8]:
import pandas as pd
from sdv.single_table import GaussianCopulaSynthesizer
from sdv.metadata import Metadata
from sdv.metadata import SingleTableMetadata
from sdv.evaluation.single_table import get_column_plot
from sdv.evaluation.single_table import evaluate_quality
import os

In [9]:
# Read the real (source) dataset from CSV into a DataFrame
real_data = pd.read_csv(filepath_or_buffer='../datasets/ben10_master.csv')

In [10]:
# Reuse the metadata file when it already exists; otherwise detect metadata
# from the real data and save it for later runs.
metadata_path = 'metadata.json'

if not os.path.exists(metadata_path):
    # No saved metadata yet: detect it from the dataframe, then persist it.
    metadata = Metadata()
    metadata.detect_table_from_dataframe(data=real_data, table_name='ben10_table')
    metadata.save_to_json(metadata_path)
    print("Created and saved new metadata to metadata.json")
else:
    # A saved metadata file is present: load it instead of re-detecting.
    metadata = Metadata.load_from_json(metadata_path)
    print("Loaded metadata from metadata.json")

Loaded metadata from metadata.json


In [11]:
# Build a GaussianCopulaSynthesizer from the metadata and fit it on the real data
synthesizer = GaussianCopulaSynthesizer(metadata=metadata)
synthesizer.fit(real_data)

In [12]:
# Draw 10,000 synthetic rows from the fitted synthesizer
synthetic_data = synthesizer.sample(10_000)

In [13]:
# Save the synthetic data to a CSV file.
# The destination path lives in a single variable so the confirmation message
# always names the file that was actually written (the previous message
# reported a different filename than the one passed to to_csv).
synthetic_csv_path = '../datasets/ben10_GaussianCopula_synthetic.csv'
synthetic_data.to_csv(synthetic_csv_path, index=False)
print(f"Synthetic dataset saved to {synthetic_csv_path}")

Synthetic dataset saved to ../datasets/ben10_synthetic.csv


In [14]:
# Run the SDV quality evaluation of synthetic vs. real data, then print the
# detail table for each of the two reported properties.
quality_report = evaluate_quality(
    real_data=real_data,
    synthetic_data=synthetic_data,
    metadata=metadata,
)

column_shapes_details = quality_report.get_details(property_name='Column Shapes')
print("Column Shapes Details:", column_shapes_details)

column_pair_trends_details = quality_report.get_details(property_name='Column Pair Trends')
print("Column Pair Trends Details:", column_pair_trends_details)

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 13/13 [00:00<00:00, 362.71it/s]|
Column Shapes Score: 97.45%

(2/2) Evaluating Column Pair Trends: |██████████| 78/78 [00:00<00:00, 118.61it/s]|
Column Pair Trends Score: 34.94%

Overall Score (Average): 66.2%

Column Shapes Details:                    Column        Metric     Score
0              alien_name  TVComplement  0.963167
1              enemy_name  TVComplement  0.969800
2          alien1_species  TVComplement  0.958267
3      alien1_home_planet  TVComplement  0.974500
4   alien1_strength_level  TVComplement  0.978133
5      alien1_speed_level  TVComplement  0.969767
6     alien1_intelligence  TVComplement  0.988233
7          alien2_species  TVComplement  0.973600
8      alien2_home_planet  TVComplement  0.976767
9   alien2_strength_level  TVComplement  0.983800
10     alien2_speed_level  TVComplement  0.988433
11    alien2_intelligence  TVComplement  0.973967
12                 winner  TVComplement  0.97043

In [15]:
# Plot the real vs. synthetic distribution of the 'winner' column.
# Arguments are passed by keyword so the call does not depend on parameter order.
fig = get_column_plot(
    column_name='winner',
    metadata=metadata,
    real_data=real_data,
    synthetic_data=synthetic_data,
)
fig.show()