- Ref: https://docs.sdv.dev/sdv/

In [1]:
%load_ext lab_black
import pandas as pd
from sdv.datasets.demo import download_demo, get_available_demos
from sdv.metadata import SingleTableMetadata
from sdv.single_table import (
    CTGANSynthesizer,
    TVAESynthesizer,
    GaussianCopulaSynthesizer,
    CopulaGANSynthesizer,
)
from sdv.lite import SingleTablePreset
from sdv.evaluation.single_table import evaluate_quality
import warnings

# Suppress warnings for the rest of the session: the "ignore" action is
# installed with no category, message or module restriction.
# NOTE(review): this also hides deprecation warnings from the libraries
# imported above — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [2]:
# Load the dataset from a CSV file in the current working directory.
cardio = pd.read_csv("cardio_final.csv")
# Bare last expression: the notebook renders the frame as this cell's output.
cardio

Unnamed: 0,id,age,gender,height,weight,systolic,diastolic,cholesterol,glucose,smoke,alcohol_intake,physical_activity,cv_disease,bmi
0,0,51,Male,168,62.0,110,80,Normal,Normal,False,False,True,False,22.0
1,1,56,Female,156,85.0,140,90,Extremely High,Normal,False,False,True,True,34.9
2,2,52,Female,165,64.0,130,70,Extremely High,Normal,False,False,False,True,23.5
3,3,49,Male,169,82.0,150,100,Normal,Normal,False,False,True,True,28.7
4,4,48,Female,156,56.0,100,60,Normal,Normal,False,False,False,False,23.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,53,Male,168,76.0,120,80,Normal,Normal,True,False,True,False,26.9
69996,99995,62,Female,158,126.0,140,90,High,High,False,False,True,True,50.5
69997,99996,53,Male,183,105.0,180,90,Extremely High,Normal,False,True,False,True,31.4
69998,99998,62,Female,163,72.0,135,80,Normal,High,False,False,False,True,27.1


In [3]:
# Start from an empty single-table metadata object, then let SDV infer an
# sdtype for each column of `cardio`. The inferred types are refined by the
# explicit update_column calls in a later cell.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=cardio)

In [4]:
# Display the auto-detected metadata so the inferred sdtypes can be reviewed.
metadata

{
    "columns": {
        "id": {
            "sdtype": "numerical"
        },
        "age": {
            "sdtype": "numerical"
        },
        "gender": {
            "sdtype": "categorical"
        },
        "height": {
            "sdtype": "numerical"
        },
        "weight": {
            "sdtype": "numerical"
        },
        "systolic": {
            "sdtype": "numerical"
        },
        "diastolic": {
            "sdtype": "numerical"
        },
        "cholesterol": {
            "sdtype": "categorical"
        },
        "glucose": {
            "sdtype": "categorical"
        },
        "smoke": {
            "sdtype": "boolean"
        },
        "alcohol_intake": {
            "sdtype": "boolean"
        },
        "physical_activity": {
            "sdtype": "boolean"
        },
        "cv_disease": {
            "sdtype": "boolean"
        },
        "bmi": {
            "sdtype": "numerical"
        }
    },
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V

In [5]:
# Check the auto-detected metadata for errors before editing it.
# NOTE(review): per the SDV docs, validate() raises when the metadata is
# invalid and returns nothing otherwise — confirm for the installed version.

metadata.validate()

In [6]:
# Refine the auto-detected metadata column by column.

# "id" becomes an identifier column whose values match exactly five digits.
metadata.update_column(column_name="id", sdtype="id", regex_format="[0-9]{5}")

# Numerical columns: pin the storage representation for each one.
# Dict order is the order in which the columns are updated.
numeric_representations = {
    "age": "Int32",
    "height": "Int64",
    "weight": "Float",
    "systolic": "Int64",
    "diastolic": "Int64",
    "bmi": "Float",
}
for column, representation in numeric_representations.items():
    metadata.update_column(
        column_name=column,
        sdtype="numerical",
        computer_representation=representation,
    )

# Boolean flags: detection already reported these as "boolean"; the type is
# restated explicitly so it does not depend on auto-detection.
boolean_columns = ("smoke", "alcohol_intake", "physical_activity", "cv_disease")
for column in boolean_columns:
    metadata.update_column(column_name=column, sdtype="boolean")

In [7]:
# Declare "id" (typed as an sdtype "id" column in the previous cell) as the
# table's primary key.
metadata.set_primary_key(column_name="id")

In [8]:
# Display the metadata again to confirm the manual column updates and the
# primary key were applied.
metadata

{
    "columns": {
        "id": {
            "sdtype": "id",
            "regex_format": "[0-9]{5}"
        },
        "age": {
            "sdtype": "numerical",
            "computer_representation": "Int32"
        },
        "gender": {
            "sdtype": "categorical"
        },
        "height": {
            "sdtype": "numerical",
            "computer_representation": "Int64"
        },
        "weight": {
            "sdtype": "numerical",
            "computer_representation": "Float"
        },
        "systolic": {
            "sdtype": "numerical",
            "computer_representation": "Int64"
        },
        "diastolic": {
            "sdtype": "numerical",
            "computer_representation": "Int64"
        },
        "cholesterol": {
            "sdtype": "categorical"
        },
        "glucose": {
            "sdtype": "categorical"
        },
        "smoke": {
            "sdtype": "boolean"
        },
        "alcohol_intake": {
            "sdtype": "

In [9]:
# Keep only the first 100 rows (presumably to keep training fast — confirm).
# NOTE(review): this rebinds `cardio`, so every later cell (fitting and the
# quality evaluation) sees the 100-row subset, not the full dataset. A
# separate name such as `cardio_sample` would keep the full frame available.
cardio = cardio.head(100)

In [10]:
# Build a CTGAN synthesizer from the refined metadata and train it on
# `cardio` (the 100-row subset when the cells are run in order).
# NOTE(review): no random seed is set anywhere in this notebook, so fitted
# models and the scores reported below may differ between runs.
synthesizer = CTGANSynthesizer(metadata)
synthesizer.fit(cardio)

In [16]:
# Round-trip the fitted synthesizer through disk: write it out, then rebind
# `synthesizer` to the object loaded back from that same file.
# NOTE(review): the ".pkl" suffix suggests a pickle-based format — only load
# files that come from a trusted source.
synthesizer_path = "my_synthesizer.pkl"
synthesizer.save(synthesizer_path)
synthesizer = CTGANSynthesizer.load(synthesizer_path)

In [11]:
# Draw 100 synthetic rows from the synthesizer and preview the first few.
synthetic_data = synthesizer.sample(num_rows=100)
synthetic_data.head()

Unnamed: 0,id,age,gender,height,weight,systolic,diastolic,cholesterol,glucose,smoke,alcohol_intake,physical_activity,cv_disease,bmi
0,0,48,Female,148,79.0,113,76,Normal,High,False,False,True,False,28.7
1,1,48,Male,148,52.0,90,60,Normal,Normal,False,False,True,True,32.0
2,2,58,Female,149,59.0,137,92,Normal,Normal,False,False,True,True,24.7
3,3,62,Female,148,52.0,94,79,High,High,False,False,False,False,30.8
4,4,44,Female,148,51.0,140,68,Normal,Normal,False,False,True,False,21.0


In [13]:
# Score the synthetic rows against the real data; the report printed below
# gives an overall quality score plus per-property scores.
# `cardio` here is the 100-row subset assigned in an earlier cell, not the
# full CSV, so the score compares 100 real rows with 100 synthetic rows.
quality_report = evaluate_quality(cardio, synthetic_data, metadata)

Creating report: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 11.73it/s]



Overall Quality Score: 74.47%

Properties:
Column Shapes: 79.08%
Column Pair Trends: 69.86%
