In [19]:
import time
import pandas as pd

# models 
from sdv.single_table import GaussianCopulaSynthesizer
from sdv.single_table import CTGANSynthesizer
from sdv.single_table import TVAESynthesizer

# autodetection of metadata
from sdv.metadata import SingleTableMetadata

# evaluation of synthetic data
from sdv.evaluation.single_table import evaluate_quality
from sdv.evaluation.single_table import run_diagnostic

from sdv.datasets.demo import download_demo

# visualisation
# from sdv.evaluation.single_table import get_column_plot

In [20]:
import warnings
warnings.filterwarnings('ignore')

from pprint import pprint

# Notes

### Synthesizers -- models to generate synthetic data 

Models to generate single table data 
- Gaussian Copula -- Statistical modelling | most customizable and faster 
- CTGAN – GAN-based (Deep learning) | high fidelity dataset, training time required
- TVAE – variational autoencoder (Deep learning)| high fidelity dataset, training time required

Others
- PAR  (?) – for sequential
- HMA  (?) - for multi-table

**Notebook focuses on Gaussian Copula, CTGAN, TVAE** 

Inputs required for the synthesizers 
- Metadata (Required)
- Real Data (Required)
- Customisable parameters (Optional)

### Autodetecting metadata 

[From the Doc]
The detected metadata is not guaranteed to be accurate or complete. Be sure to carefully inspect the metadata and update information.
- Primary keys and other identifiers are not auto-detected. See set_primary_key and add_alternate_keys method to add them.
- Dates and sensitive information may be auto-detected incorrectly. See update_column method to update them.

### Evaluation Report

**Quality Report**
For checking data  Fidelity:  How well the synthetic data captures mathematical properties from your real data.

Evaluates: 
- Overall score including Column Shapes, Columns Pair Trends
- Metrics and score details for Column Shapes, Columns Pair Trends
- All scaore range from 0 to 1

**Diagnoistic Report** 
Give a general sense of the strengths and weakness of the synthetic data model. 
Evaluates: 
- Overall [diagnosic results](https://docs.sdv.dev/sdmetrics/reports/diagnostic-report/single-table-api#get_results) -- SUCCESS, WARNING, DANGER 
- Properties -- Syntheiss, Coverage, Boundaries 

# Experiments 

**Mock Data**
- SDV demo data 
- simulated_hospital_data with varying: (1) num of columns -- 5, 10, 20 (2) num of rows -- 1k, 10k, 50k, 100k

In [31]:
BASE_MOCK_HOSP_PATH = "synthetic_data"

str_num_rows = [1000, 10000, 50000, 100000]
str_num_columns = [5, 10, 20]

**Real Data** 

In [51]:
REAL_DATA_LIST = ["real_data/community_survey.csv", "real_data/us_census.csv"] #"real_data/time_use.csv", 

## Utility functions

In [23]:
def get_model_time_score(synthesizer, metadata, real_data, samples=500, gen_quality_report=False, gen_diagnostic_report=True): 
    
    # ---------------------
    # Step 1: Train
    # ---------------------
    begin_time = time.time()
    synthesizer = GaussianCopulaSynthesizer(metadata, enforce_min_max_values=True, enforce_rounding=False)
    synthesizer.fit(real_data)
    end_time = time.time()
    
    training_time = end_time - begin_time
    
    # ---------------------
    # Step 2: Sample
    # ---------------------
    begin_time = time.time()
    synthetic_data = synthesizer.sample(num_rows=samples)
    end_time = time.time()
    
    sampling_time = end_time - begin_time
    
    # ---------------------
    # Step 3: Evaluate
    # ---------------------

    # 1. QUALITY REPORT
    # time consuming so keeping conditional
    if gen_quality_report:
        begin_time = time.time()
        quality_report_obj = evaluate_quality(
                real_data,
                synthetic_data,
                metadata
            )
        end_time = time.time()
        
        quality_report_time = end_time - begin_time
        
        quality_report = { 
            "quality_report_obj": quality_report_obj,
            "quality_report_time": quality_report_time,
            "quality_report_score": quality_report_obj.get_score()
        }
        
    # 2. DIAGNOSTIC REPORT
    if gen_diagnostic_report:
        begin_time = time.time()
        diagnostic_report_obj = run_diagnostic(
                real_data,
                synthetic_data,
                metadata
            )
        end_time = time.time()
        
        diagnostic_report_time = end_time - begin_time
        
        diagnostic_report = { 
            "diagnostic_report_obj": diagnostic_report_obj,
            "diagnostic_report_time": diagnostic_report_time
            
        }
        
    return {
            "training_time": training_time,
            "sampling_time": sampling_time,
            "quality_report": quality_report if gen_quality_report else None,
            "diagnostic_report": diagnostic_report}

In [24]:
def detect_metadata(real_data_df): 
    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(data=real_data_df)
    pprint(metadata.to_dict())
#     python_dict = metadata.to_dict()
    return metadata

## Models

## 1. Gaussian Copula Model 

In [25]:
evaluate_synthesizer = GaussianCopulaSynthesizer

### 1.1. Mock SDV demo data

In [26]:
demo_data, demo_metadata = download_demo(
    modality='single_table',
    dataset_name='fake_hotel_guests'
)

pprint(demo_metadata)
synthesizer = evaluate_synthesizer(demo_metadata)

pprint(get_model_time_score(synthesizer, demo_metadata, demo_data, 500, True, True))

{
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1",
    "columns": {
        "guest_email": {
            "sdtype": "email",
            "pii": true
        },
        "has_rewards": {
            "sdtype": "boolean"
        },
        "room_type": {
            "sdtype": "categorical"
        },
        "amenities_fee": {
            "sdtype": "numerical",
            "computer_representation": "Float"
        },
        "checkin_date": {
            "sdtype": "datetime",
            "datetime_format": "%d %b %Y"
        },
        "checkout_date": {
            "sdtype": "datetime",
            "datetime_format": "%d %b %Y"
        },
        "room_rate": {
            "sdtype": "numerical",
            "computer_representation": "Float"
        },
        "billing_address": {
            "sdtype": "address",
            "pii": true
        },
        "credit_card_number": {
            "sdtype": "credit_card_number",
            "pii": true
        }
    },
    "primary_key": "guest_e

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 50.25it/s]



Overall Quality Score: 87.83%

Properties:
Column Shapes: 87.28%
Column Pair Trends: 88.38%


Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:01<00:00,  2.05it/s]


DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ The synthetic data covers over 90% of the categories present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x14492ef70>,
                       'diagnostic_report_time': 1.9558310508728027},
 'quality_report': {'quality_report_obj': <sdmetrics.reports.single_table.quality_report.QualityReport object at 0x1448b9f40>,
                    'quality_report_score': 0.8782824635868114,
                    'quality_report_time': 0.22413206100463867},
 'sampling_time': 0.2985079288482666,
 'training_time': 1.2819840908050537}





### 1.2. Mock hospital data

**Varying number of rows and columns**

In [36]:
for c in str_num_columns:
    for r in str_num_rows:        
        file_name = BASE_MOCK_HOSP_PATH + f"/{c}_data_fields/hosp_{r}x{c}.csv"
            
        hosp_df = pd.read_csv(file_name)
        hosp_metadata = detect_metadata(hosp_df)
        
        synthesizer = evaluate_synthesizer(hosp_metadata)

        print("#"*30)
        print(f"Generating for: {file_name}")
        print("-"*30)
        pprint(get_model_time_score(synthesizer, hosp_metadata, hosp_df, samples=hosp_df.shape[0]))

{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age': {'sdtype': 'numerical'},
             'DateOfDiagnosis': {'sdtype': 'categorical'},
             'Gender': {'sdtype': 'categorical'},
             'NRIC': {'sdtype': 'categorical'},
             'Name': {'sdtype': 'categorical'}}}
##############################
Generating for: synthetic_data/5_data_fields/hosp_1000x5.csv
------------------------------


Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:03<00:00,  1.26it/s]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x144a88550>,
                       'diagnostic_report_time': 3.1740341186523438},
 'quality_report': None,
 'sampling_time': 0.036592960357666016,
 'training_time': 0.5281028747558594}
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age': {'sdtype': 'numerical'},
             'DateOfDiagnosis': {'sdtype': 'categorical'},
             'Gender': {'sdtype': 'categorical'},
             'NRIC': {'sdtype': 'categorical'},
             'Name': {'sdtype': 'categorical'}}}
##############################
Gener

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:35<00:00,  8.99s/it]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x147cbd610>,
                       'diagnostic_report_time': 35.980793952941895},
 'quality_report': None,
 'sampling_time': 0.11009526252746582,
 'training_time': 0.7555878162384033}
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age': {'sdtype': 'numerical'},
             'DateOfDiagnosis': {'sdtype': 'categorical'},
             'Gender': {'sdtype': 'categorical'},
             'NRIC': {'sdtype': 'categorical'},
             'Name': {'sdtype': 'categorical'}}}
##############################
Genera

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [01:08<00:00, 17.11s/it]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x147936ca0>,
                       'diagnostic_report_time': 68.44937515258789},
 'quality_report': None,
 'sampling_time': 0.4289979934692383,
 'training_time': 2.961811065673828}
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age': {'sdtype': 'numerical'},
             'DateOfDiagnosis': {'sdtype': 'categorical'},
             'Gender': {'sdtype': 'categorical'},
             'NRIC': {'sdtype': 'categorical'},
             'Name': {'sdtype': 'categorical'}}}
##############################
Generatin

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [01:41<00:00, 25.27s/it]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x147c67bb0>,
                       'diagnostic_report_time': 101.10261392593384},
 'quality_report': None,
 'sampling_time': 1.0197410583496094,
 'training_time': 6.101722002029419}
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age0': {'sdtype': 'numerical'},
             'BMI0': {'sdtype': 'numerical'},
             'BloodType0': {'sdtype': 'categorical'},
             'DateOfDiagnosis0': {'sdtype': 'categorical'},
             'Disease0': {'sdtype': 'categorical'},
             'Gender0': {'sdtype

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:05<00:00,  1.27s/it]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x147c58e50>,
                       'diagnostic_report_time': 5.1008923053741455},
 'quality_report': None,
 'sampling_time': 0.056378841400146484,
 'training_time': 0.7904999256134033}
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age0': {'sdtype': 'numerical'},
             'BMI0': {'sdtype': 'numerical'},
             'BloodType0': {'sdtype': 'categorical'},
             'DateOfDiagnosis0': {'sdtype': 'categorical'},
             'Disease0': {'sdtype': 'categorical'},
             'Gender0': {'sdt

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [01:00<00:00, 15.01s/it]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x147c91d30>,
                       'diagnostic_report_time': 60.04349708557129},
 'quality_report': None,
 'sampling_time': 0.20702314376831055,
 'training_time': 2.0034260749816895}
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age0': {'sdtype': 'numerical'},
             'BMI0': {'sdtype': 'numerical'},
             'BloodType0': {'sdtype': 'categorical'},
             'DateOfDiagnosis0': {'sdtype': 'categorical'},
             'Disease0': {'sdtype': 'categorical'},
             'Gender0': {'sdtyp

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [01:54<00:00, 28.61s/it]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x147cbd910>,
                       'diagnostic_report_time': 114.44423580169678},
 'quality_report': None,
 'sampling_time': 0.7958579063415527,
 'training_time': 6.420621156692505}
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age0': {'sdtype': 'numerical'},
             'BMI0': {'sdtype': 'numerical'},
             'BloodType0': {'sdtype': 'categorical'},
             'DateOfDiagnosis0': {'sdtype': 'categorical'},
             'Disease0': {'sdtype': 'categorical'},
             'Gender0': {'sdtype

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [02:46<00:00, 41.66s/it]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x147c81fd0>,
                       'diagnostic_report_time': 166.6308240890503},
 'quality_report': None,
 'sampling_time': 1.924010992050171,
 'training_time': 12.069630861282349}
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age0': {'sdtype': 'numerical'},
             'Age1': {'sdtype': 'numerical'},
             'BMI0': {'sdtype': 'numerical'},
             'BMI1': {'sdtype': 'numerical'},
             'BloodType0': {'sdtype': 'categorical'},
             'BloodType1': {'sdtype': 'categorical'},

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:11<00:00,  2.90s/it]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x147c67bb0>,
                       'diagnostic_report_time': 11.613440036773682},
 'quality_report': None,
 'sampling_time': 0.12970590591430664,
 'training_time': 1.5793561935424805}
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age0': {'sdtype': 'numerical'},
             'Age1': {'sdtype': 'numerical'},
             'BMI0': {'sdtype': 'numerical'},
             'BMI1': {'sdtype': 'numerical'},
             'BloodType0': {'sdtype': 'categorical'},
             'BloodType1': {'sdtype': 'categorical

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [02:06<00:00, 31.68s/it]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x147c67bb0>,
                       'diagnostic_report_time': 126.73464894294739},
 'quality_report': None,
 'sampling_time': 0.3910808563232422,
 'training_time': 3.8084909915924072}
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age0': {'sdtype': 'numerical'},
             'Age1': {'sdtype': 'numerical'},
             'BMI0': {'sdtype': 'numerical'},
             'BMI1': {'sdtype': 'numerical'},
             'BloodType0': {'sdtype': 'categorical'},
             'BloodType1': {'sdtype': 'categorical'

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [03:37<00:00, 54.48s/it]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x144904eb0>,
                       'diagnostic_report_time': 217.91224122047424},
 'quality_report': None,
 'sampling_time': 1.8924031257629395,
 'training_time': 13.4011390209198}
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age0': {'sdtype': 'numerical'},
             'Age1': {'sdtype': 'numerical'},
             'BMI0': {'sdtype': 'numerical'},
             'BMI1': {'sdtype': 'numerical'},
             'BloodType0': {'sdtype': 'categorical'},
             'BloodType1': {'sdtype': 'categorical'},

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [05:22<00:00, 80.74s/it]


DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x149142d90>,
                       'diagnostic_report_time': 322.9465229511261},
 'quality_report': None,
 'sampling_time': 3.729057788848877,
 'training_time': 28.79592800140381}





### 1.3. Real-world data

In [None]:
for file_name in REAL_DATA_LIST:
    real_df = pd.read_csv(file_name)
    real_metadata = detect_metadata(real_df)
    
    synthesizer = evaluate_synthesizer(real_metadata)

    print("#"*30)
    print(f"Generating for: {file_name} {real_df.shape}")
    print(get_model_time_score(synthesizer, real_metadata, real_df, samples=real_df.shape[0]), True, True)
    print("#"*30)

{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'ACCESS': {'sdtype': 'numerical'},
             'ACR': {'sdtype': 'numerical'},
             'ADJHSG': {'sdtype': 'numerical'},
             'ADJINC': {'sdtype': 'numerical'},
             'AGS': {'sdtype': 'numerical'},
             'BATH': {'sdtype': 'numerical'},
             'BDSP': {'sdtype': 'numerical'},
             'BLD': {'sdtype': 'numerical'},
             'BROADBND': {'sdtype': 'numerical'},
             'BUS': {'sdtype': 'numerical'},
             'COMPOTHX': {'sdtype': 'numerical'},
             'CONP': {'sdtype': 'numerical'},
             'DIALUP': {'sdtype': 'numerical'},
             'DIVISION': {'sdtype': 'numerical'},
             'DSL': {'sdtype': 'numerical'},
             'ELEP': {'sdtype': 'numerical'},
             'FACCESSP': {'sdtype': 'numerical'},
             'FACRP': {'sdtype': 'numerical'},
             'FAGSP': {'sdtype': 'numerical'},
             'FBATHP': {'sdtype': 'numerical'},
            

# 2. CTGAN Model

In [38]:
evaluate_synthesizer = CTGANSynthesizer

### 2.1. Mock SDV-demo Data

In [39]:
demo_data, demo_metadata = download_demo(
    modality='single_table',
    dataset_name='fake_hotel_guests'
)

pprint(demo_metadata)
synthesizer = evaluate_synthesizer(demo_metadata)

pprint(get_model_time_score(synthesizer, demo_metadata, demo_data, 500, True, True))

{
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1",
    "columns": {
        "guest_email": {
            "sdtype": "email",
            "pii": true
        },
        "has_rewards": {
            "sdtype": "boolean"
        },
        "room_type": {
            "sdtype": "categorical"
        },
        "amenities_fee": {
            "sdtype": "numerical",
            "computer_representation": "Float"
        },
        "checkin_date": {
            "sdtype": "datetime",
            "datetime_format": "%d %b %Y"
        },
        "checkout_date": {
            "sdtype": "datetime",
            "datetime_format": "%d %b %Y"
        },
        "room_rate": {
            "sdtype": "numerical",
            "computer_representation": "Float"
        },
        "billing_address": {
            "sdtype": "address",
            "pii": true
        },
        "credit_card_number": {
            "sdtype": "credit_card_number",
            "pii": true
        }
    },
    "primary_key": "guest_e

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 57.00it/s]



Overall Quality Score: 87.83%

Properties:
Column Shapes: 87.28%
Column Pair Trends: 88.38%


Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:01<00:00,  2.16it/s]


DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ The synthetic data covers over 90% of the categories present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x144a6bbe0>,
                       'diagnostic_report_time': 1.8593480587005615},
 'quality_report': {'quality_report_obj': <sdmetrics.reports.single_table.quality_report.QualityReport object at 0x147cb64c0>,
                    'quality_report_score': 0.8782824635868114,
                    'quality_report_time': 0.20322179794311523},
 'sampling_time': 0.2915680408477783,
 'training_time': 1.232923984527588}





### 2.2. Mock hospital data 

In [41]:
for c in str_num_columns:
    for r in str_num_rows:
        file_name = BASE_MOCK_HOSP_PATH + f"/{c}_data_fields/hosp_{r}x{c}.csv"
        hosp_df = pd.read_csv(file_name)
        hosp_metadata = detect_metadata(hosp_df)
        
        synthesizer = evaluate_synthesizer(hosp_metadata)

        print("#"*30)
        print(f"Generating for: {file_name}")
        print("-"*30)
        pprint(get_model_time_score(synthesizer, hosp_metadata, hosp_df, samples=hosp_df.shape[0]))

{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age': {'sdtype': 'numerical'},
             'DateOfDiagnosis': {'sdtype': 'categorical'},
             'Gender': {'sdtype': 'categorical'},
             'NRIC': {'sdtype': 'categorical'},
             'Name': {'sdtype': 'categorical'}}}
##############################
Generating for: synthetic_data/5_data_fields/hosp_1000x5.csv
------------------------------


Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.55it/s]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x1475fe070>,
                       'diagnostic_report_time': 2.589355945587158},
 'quality_report': None,
 'sampling_time': 0.034508705139160156,
 'training_time': 0.43996620178222656}
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age': {'sdtype': 'numerical'},
             'DateOfDiagnosis': {'sdtype': 'categorical'},
             'Gender': {'sdtype': 'categorical'},
             'NRIC': {'sdtype': 'categorical'},
             'Name': {'sdtype': 'categorical'}}}
##############################
Gener

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:33<00:00,  8.40s/it]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x147cb9820>,
                       'diagnostic_report_time': 33.62070274353027},
 'quality_report': None,
 'sampling_time': 0.1117868423461914,
 'training_time': 0.7006809711456299}
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age': {'sdtype': 'numerical'},
             'DateOfDiagnosis': {'sdtype': 'categorical'},
             'Gender': {'sdtype': 'categorical'},
             'NRIC': {'sdtype': 'categorical'},
             'Name': {'sdtype': 'categorical'}}}
##############################
Generati

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [01:06<00:00, 16.62s/it]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x143fb0ac0>,
                       'diagnostic_report_time': 66.47591495513916},
 'quality_report': None,
 'sampling_time': 0.44310784339904785,
 'training_time': 3.012270927429199}
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age': {'sdtype': 'numerical'},
             'DateOfDiagnosis': {'sdtype': 'categorical'},
             'Gender': {'sdtype': 'categorical'},
             'NRIC': {'sdtype': 'categorical'},
             'Name': {'sdtype': 'categorical'}}}
##############################
Generati

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [01:43<00:00, 25.84s/it]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x143d78490>,
                       'diagnostic_report_time': 103.36800789833069},
 'quality_report': None,
 'sampling_time': 1.1162910461425781,
 'training_time': 6.289997100830078}
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age0': {'sdtype': 'numerical'},
             'BMI0': {'sdtype': 'numerical'},
             'BloodType0': {'sdtype': 'categorical'},
             'DateOfDiagnosis0': {'sdtype': 'categorical'},
             'Disease0': {'sdtype': 'categorical'},
             'Gender0': {'sdtype

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:05<00:00,  1.25s/it]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x147ddf940>,
                       'diagnostic_report_time': 5.011920690536499},
 'quality_report': None,
 'sampling_time': 0.054991960525512695,
 'training_time': 0.801738977432251}
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age0': {'sdtype': 'numerical'},
             'BMI0': {'sdtype': 'numerical'},
             'BloodType0': {'sdtype': 'categorical'},
             'DateOfDiagnosis0': {'sdtype': 'categorical'},
             'Disease0': {'sdtype': 'categorical'},
             'Gender0': {'sdtyp

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [01:00<00:00, 15.14s/it]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x147c91280>,
                       'diagnostic_report_time': 60.56951570510864},
 'quality_report': None,
 'sampling_time': 0.2168731689453125,
 'training_time': 1.9510090351104736}
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age0': {'sdtype': 'numerical'},
             'BMI0': {'sdtype': 'numerical'},
             'BloodType0': {'sdtype': 'categorical'},
             'DateOfDiagnosis0': {'sdtype': 'categorical'},
             'Disease0': {'sdtype': 'categorical'},
             'Gender0': {'sdtype

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [01:49<00:00, 27.50s/it]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x147c91280>,
                       'diagnostic_report_time': 109.99098229408264},
 'quality_report': None,
 'sampling_time': 0.8214170932769775,
 'training_time': 6.4502458572387695}
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age0': {'sdtype': 'numerical'},
             'BMI0': {'sdtype': 'numerical'},
             'BloodType0': {'sdtype': 'categorical'},
             'DateOfDiagnosis0': {'sdtype': 'categorical'},
             'Disease0': {'sdtype': 'categorical'},
             'Gender0': {'sdtyp

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [02:43<00:00, 40.93s/it]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x147cbd160>,
                       'diagnostic_report_time': 163.72747492790222},
 'quality_report': None,
 'sampling_time': 1.8007919788360596,
 'training_time': 11.952319145202637}
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age0': {'sdtype': 'numerical'},
             'Age1': {'sdtype': 'numerical'},
             'BMI0': {'sdtype': 'numerical'},
             'BMI1': {'sdtype': 'numerical'},
             'BloodType0': {'sdtype': 'categorical'},
             'BloodType1': {'sdtype': 'categorical'

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:10<00:00,  2.52s/it]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x147673ac0>,
                       'diagnostic_report_time': 10.089614868164062},
 'quality_report': None,
 'sampling_time': 0.12871599197387695,
 'training_time': 1.5737409591674805}
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age0': {'sdtype': 'numerical'},
             'Age1': {'sdtype': 'numerical'},
             'BMI0': {'sdtype': 'numerical'},
             'BMI1': {'sdtype': 'numerical'},
             'BloodType0': {'sdtype': 'categorical'},
             'BloodType1': {'sdtype': 'categorical

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [02:06<00:00, 31.63s/it]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x144a6b580>,
                       'diagnostic_report_time': 126.51141905784607},
 'quality_report': None,
 'sampling_time': 0.4152340888977051,
 'training_time': 4.077377080917358}
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age0': {'sdtype': 'numerical'},
             'Age1': {'sdtype': 'numerical'},
             'BMI0': {'sdtype': 'numerical'},
             'BMI1': {'sdtype': 'numerical'},
             'BloodType0': {'sdtype': 'categorical'},
             'BloodType1': {'sdtype': 'categorical'}

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [03:36<00:00, 54.15s/it]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x143f9bb20>,
                       'diagnostic_report_time': 216.5890769958496},
 'quality_report': None,
 'sampling_time': 1.8787572383880615,
 'training_time': 14.218127965927124}
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age0': {'sdtype': 'numerical'},
             'Age1': {'sdtype': 'numerical'},
             'BMI0': {'sdtype': 'numerical'},
             'BMI1': {'sdtype': 'numerical'},
             'BloodType0': {'sdtype': 'categorical'},
             'BloodType1': {'sdtype': 'categorical'}

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [05:22<00:00, 80.57s/it]


DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x143df3550>,
                       'diagnostic_report_time': 322.2915871143341},
 'quality_report': None,
 'sampling_time': 4.26455020904541,
 'training_time': 27.464805841445923}





### 2.3. Real data

In [None]:
for file_name in REAL_DATA_LIST:
    real_df = pd.read_csv(file_name)
    real_metadata = detect_metadata(real_df)
    
    synthesizer = evaluate_synthesizer(real_metadata)

    print("#"*30)
    print(f"Generating for: {file_name} {real_df.shape}")
    print(get_model_time_score(synthesizer, real_metadata, real_df, samples=real_df.shape[0]), True, True)
    print("#"*30)

# 3. TVAE Model

In [43]:
evaluate_synthesizer = TVAESynthesizer

### 3.1. Mock 

In [44]:
demo_data, demo_metadata = download_demo(
    modality='single_table',
    dataset_name='fake_hotel_guests'
)

pprint(demo_metadata)
synthesizer = evaluate_synthesizer(demo_metadata)

pprint(get_model_time_score(synthesizer, demo_metadata, demo_data, 500, True, True))

{
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1",
    "columns": {
        "guest_email": {
            "sdtype": "email",
            "pii": true
        },
        "has_rewards": {
            "sdtype": "boolean"
        },
        "room_type": {
            "sdtype": "categorical"
        },
        "amenities_fee": {
            "sdtype": "numerical",
            "computer_representation": "Float"
        },
        "checkin_date": {
            "sdtype": "datetime",
            "datetime_format": "%d %b %Y"
        },
        "checkout_date": {
            "sdtype": "datetime",
            "datetime_format": "%d %b %Y"
        },
        "room_rate": {
            "sdtype": "numerical",
            "computer_representation": "Float"
        },
        "billing_address": {
            "sdtype": "address",
            "pii": true
        },
        "credit_card_number": {
            "sdtype": "credit_card_number",
            "pii": true
        }
    },
    "primary_key": "guest_e

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 51.81it/s]



Overall Quality Score: 87.83%

Properties:
Column Shapes: 87.28%
Column Pair Trends: 88.38%


Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.99it/s]


DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ The synthetic data covers over 90% of the categories present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x14a8381c0>,
                       'diagnostic_report_time': 2.014273166656494},
 'quality_report': {'quality_report_obj': <sdmetrics.reports.single_table.quality_report.QualityReport object at 0x149142280>,
                    'quality_report_score': 0.8782824635868114,
                    'quality_report_time': 0.2190699577331543},
 'sampling_time': 0.3153808116912842,
 'training_time': 1.3100779056549072}





### 3.2. Mock hospital data

In [47]:
for c in str_num_columns:
    for r in str_num_rows:
        file_name = BASE_MOCK_HOSP_PATH + f"/{c}_data_fields/hosp_{r}x{c}.csv"
        hosp_df = pd.read_csv(file_name)
        hosp_metadata = detect_metadata(hosp_df)
        
        synthesizer = evaluate_synthesizer(hosp_metadata)

        print("#"*30)
        print(f"Generating for: {file_name}")
        print("-"*30)
        pprint(get_model_time_score(synthesizer, hosp_metadata, hosp_df, samples=hosp_df.shape[0]))

{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age': {'sdtype': 'numerical'},
             'DateOfDiagnosis': {'sdtype': 'categorical'},
             'Gender': {'sdtype': 'categorical'},
             'NRIC': {'sdtype': 'categorical'},
             'Name': {'sdtype': 'categorical'}}}
##############################
Generating for: synthetic_data/5_data_fields/hosp_1000x5.csv
------------------------------


Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.49it/s]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x149142250>,
                       'diagnostic_report_time': 2.6896071434020996},
 'quality_report': None,
 'sampling_time': 0.03888201713562012,
 'training_time': 0.4524209499359131}
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age': {'sdtype': 'numerical'},
             'DateOfDiagnosis': {'sdtype': 'categorical'},
             'Gender': {'sdtype': 'categorical'},
             'NRIC': {'sdtype': 'categorical'},
             'Name': {'sdtype': 'categorical'}}}
##############################
Genera

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:33<00:00,  8.45s/it]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x147c67a60>,
                       'diagnostic_report_time': 33.789533376693726},
 'quality_report': None,
 'sampling_time': 0.10407567024230957,
 'training_time': 0.760303258895874}
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age': {'sdtype': 'numerical'},
             'DateOfDiagnosis': {'sdtype': 'categorical'},
             'Gender': {'sdtype': 'categorical'},
             'NRIC': {'sdtype': 'categorical'},
             'Name': {'sdtype': 'categorical'}}}
##############################
Generat

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [01:05<00:00, 16.44s/it]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x149142ca0>,
                       'diagnostic_report_time': 65.77390217781067},
 'quality_report': None,
 'sampling_time': 0.4225599765777588,
 'training_time': 3.0995888710021973}
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age': {'sdtype': 'numerical'},
             'DateOfDiagnosis': {'sdtype': 'categorical'},
             'Gender': {'sdtype': 'categorical'},
             'NRIC': {'sdtype': 'categorical'},
             'Name': {'sdtype': 'categorical'}}}
##############################
Generati

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [01:54<00:00, 28.71s/it]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x147b54370>,
                       'diagnostic_report_time': 114.83510303497314},
 'quality_report': None,
 'sampling_time': 0.9915950298309326,
 'training_time': 6.494463920593262}
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age0': {'sdtype': 'numerical'},
             'BMI0': {'sdtype': 'numerical'},
             'BloodType0': {'sdtype': 'categorical'},
             'DateOfDiagnosis0': {'sdtype': 'categorical'},
             'Disease0': {'sdtype': 'categorical'},
             'Gender0': {'sdtype

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:05<00:00,  1.47s/it]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x147b431c0>,
                       'diagnostic_report_time': 5.884649276733398},
 'quality_report': None,
 'sampling_time': 0.05597209930419922,
 'training_time': 0.8592450618743896}
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age0': {'sdtype': 'numerical'},
             'BMI0': {'sdtype': 'numerical'},
             'BloodType0': {'sdtype': 'categorical'},
             'DateOfDiagnosis0': {'sdtype': 'categorical'},
             'Disease0': {'sdtype': 'categorical'},
             'Gender0': {'sdtyp

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [01:04<00:00, 16.02s/it]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x1479365b0>,
                       'diagnostic_report_time': 64.08899593353271},
 'quality_report': None,
 'sampling_time': 0.1959390640258789,
 'training_time': 2.3670918941497803}
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age0': {'sdtype': 'numerical'},
             'BMI0': {'sdtype': 'numerical'},
             'BloodType0': {'sdtype': 'categorical'},
             'DateOfDiagnosis0': {'sdtype': 'categorical'},
             'Disease0': {'sdtype': 'categorical'},
             'Gender0': {'sdtype

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [01:56<00:00, 29.05s/it]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x147b43dc0>,
                       'diagnostic_report_time': 116.22190308570862},
 'quality_report': None,
 'sampling_time': 0.893563985824585,
 'training_time': 6.701810121536255}
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age0': {'sdtype': 'numerical'},
             'BMI0': {'sdtype': 'numerical'},
             'BloodType0': {'sdtype': 'categorical'},
             'DateOfDiagnosis0': {'sdtype': 'categorical'},
             'Disease0': {'sdtype': 'categorical'},
             'Gender0': {'sdtype'

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [02:54<00:00, 43.54s/it]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x14a8386d0>,
                       'diagnostic_report_time': 174.16585206985474},
 'quality_report': None,
 'sampling_time': 1.639620065689087,
 'training_time': 12.584924936294556}
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age0': {'sdtype': 'numerical'},
             'Age1': {'sdtype': 'numerical'},
             'BMI0': {'sdtype': 'numerical'},
             'BMI1': {'sdtype': 'numerical'},
             'BloodType0': {'sdtype': 'categorical'},
             'BloodType1': {'sdtype': 'categorical'}

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:10<00:00,  2.67s/it]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x147cb9c40>,
                       'diagnostic_report_time': 10.694057941436768},
 'quality_report': None,
 'sampling_time': 0.11630964279174805,
 'training_time': 1.5646450519561768}
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age0': {'sdtype': 'numerical'},
             'Age1': {'sdtype': 'numerical'},
             'BMI0': {'sdtype': 'numerical'},
             'BMI1': {'sdtype': 'numerical'},
             'BloodType0': {'sdtype': 'categorical'},
             'BloodType1': {'sdtype': 'categorical

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [02:06<00:00, 31.66s/it]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x147b431c0>,
                       'diagnostic_report_time': 126.6598219871521},
 'quality_report': None,
 'sampling_time': 0.40526413917541504,
 'training_time': 4.1520397663116455}
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age0': {'sdtype': 'numerical'},
             'Age1': {'sdtype': 'numerical'},
             'BMI0': {'sdtype': 'numerical'},
             'BMI1': {'sdtype': 'numerical'},
             'BloodType0': {'sdtype': 'categorical'},
             'BloodType1': {'sdtype': 'categorical'

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [03:37<00:00, 54.47s/it]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x143e9d310>,
                       'diagnostic_report_time': 217.89478302001953},
 'quality_report': None,
 'sampling_time': 2.0721261501312256,
 'training_time': 13.893616199493408}
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age0': {'sdtype': 'numerical'},
             'Age1': {'sdtype': 'numerical'},
             'BMI0': {'sdtype': 'numerical'},
             'BMI1': {'sdtype': 'numerical'},
             'BloodType0': {'sdtype': 'categorical'},
             'BloodType1': {'sdtype': 'categorical'

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [05:29<00:00, 82.48s/it]


DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x1479365b0>,
                       'diagnostic_report_time': 329.90741181373596},
 'quality_report': None,
 'sampling_time': 3.8388729095458984,
 'training_time': 27.20659303665161}





### 3.3. Mock hospital data

In [46]:
for file_name in REAL_DATA_LIST:
    real_df = pd.read_csv(file_name)
    real_metadata = detect_metadata(real_df)
    
    synthesizer = evaluate_synthesizer(real_metadata)

    print("#"*30)
    print(f"Generating for: {file_name} {real_df.shape}")
    print(get_model_time_score(synthesizer, real_metadata, real_df, samples=real_df.shape[0]), True, True)
    print("#"*30)

FileNotFoundError: [Errno 2] No such file or directory: 'real_data/time_use.csv'

# Dumps

In [None]:
quality_report.get_visualization('Column Shapes')

In [None]:
custom_synthesizer = GaussianCopulaSynthesizer(
    metadata,
    default_distribution='truncnorm',
    numerical_distributions={
        'checkin_date': 'uniform',
        'checkout_date': 'uniform',
        'room_rate': 'gaussian_kde'
    }
)

custom_synthesizer.fit(real_data)

In [None]:
learned_distributions = custom_synthesizer.get_learned_distributions()
learned_distributions['has_rewards']

In [None]:
synthetic_data_customized = custom_synthesizer.sample(num_rows=500)

quality_report = evaluate_quality(
    real_data,
    synthetic_data_customized,
    metadata
)

In [None]:
fig = get_column_plot(
    real_data=real_data,
    synthetic_data=synthetic_data_customized,
    column_name='room_rate',
    metadata=metadata
)
    
fig.show()

In [None]:
from sdv.sampling import Condition

suite_guests_with_rewards = Condition(
    num_rows=250,
    column_values={'room_type': 'SUITE', 'has_rewards': True}
)

suite_guests_without_rewards = Condition(
    num_rows=250,
    column_values={'room_type': 'SUITE', 'has_rewards': False}
)