In [1]:
import time
import pandas as pd

# models 
from sdv.single_table import GaussianCopulaSynthesizer
from sdv.single_table import CTGANSynthesizer
from sdv.single_table import TVAESynthesizer

# autodetection of metadata
from sdv.metadata import SingleTableMetadata

# evaluation of synthetic data
from sdv.evaluation.single_table import evaluate_quality
from sdv.evaluation.single_table import run_diagnostic

from sdv.datasets.demo import download_demo

# visualisation
# from sdv.evaluation.single_table import get_column_plot

In [2]:
import warnings
warnings.filterwarnings('ignore')

from pprint import pprint

# Notes

### Synthesizers -- models to generate synthetic data 

Models to generate single table data 
- Gaussian Copula -- Statistical modelling | most customizable and faster 
- CTGAN – GAN-based (Deep learning) | high fidelity dataset, training time required
- TVAE – variational autoencoder (Deep learning)| high fidelity dataset, training time required

Others
- PAR  (?) – for sequential
- HMA  (?) - for multi-table

**Notebook focuses on Gaussian Copula, CTGAN, TVAE** 

Inputs required for the synthesizers 
- Metadata (Required)
- Real Data (Required)
- Customisable parameters (Optional)

### Autodetecting metadata 

[From the Doc]
The detected metadata is not guaranteed to be accurate or complete. Be sure to carefully inspect the metadata and update information.
- Primary keys and other identifiers are not auto-detected. See set_primary_key and add_alternate_keys method to add them.
- Dates and sensitive information may be auto-detected incorrectly. See update_column method to update them.

### Evaluation Report

**Quality Report**
For checking data fidelity: how well the synthetic data captures the mathematical properties of your real data.

Evaluates: 
- Overall score including Column Shapes, Column Pair Trends
- Metrics and score details for Column Shapes, Column Pair Trends
- All scores range from 0 to 1

**Diagnostic Report** 
Gives a general sense of the strengths and weaknesses of the synthetic data model. 
Evaluates: 
- Overall [diagnostic results](https://docs.sdv.dev/sdmetrics/reports/diagnostic-report/single-table-api#get_results) -- SUCCESS, WARNING, DANGER 
- Properties -- Synthesis, Coverage, Boundaries 

# Experiments 

**Mock Data**
- SDV demo data 
- simulated_hospital_data with varying: (1) num of columns -- 5, 10, 20 (2) num of rows -- 1k, 10k, 50k, 100k

In [3]:
# Root folder of the simulated hospital CSVs; the experiment loops build paths
# of the form <root>/<cols>_data_fields/hosp_<rows>x<cols>.csv from it.
BASE_MOCK_HOSP_PATH = "synthetic_data"

# Dataset sizes to benchmark (integer counts, despite the `str_` prefix).
str_num_columns = [5, 10, 20]
str_num_rows = [1_000, 10_000, 50_000, 100_000]

**Real Data** 

In [51]:
# CSV files used for the real-data experiments.
# "real_data/time_use.csv" is currently left out of the list.
REAL_DATA_LIST = [
    "real_data/community_survey.csv",
    "real_data/us_census.csv",
]

## Utility functions

In [15]:
def _timed_call(func, *args, **kwargs):
    """Call ``func(*args, **kwargs)`` and return ``(result, elapsed_seconds)``."""
    # perf_counter is monotonic, so the measured duration cannot go negative or
    # be skewed by wall-clock adjustments the way a time.time() difference can.
    started = time.perf_counter()
    result = func(*args, **kwargs)
    return result, time.perf_counter() - started


def get_model_time_score(synthesizer, metadata, real_data, samples=500, gen_quality_report=False, gen_diagnostic_report=True):
    """Fit a synthesizer, sample from it, and time every stage.

    Parameters
    ----------
    synthesizer
        Object exposing ``fit(data)`` and ``sample(num_rows=...)``.
    metadata
        Passed unchanged as the third argument to ``evaluate_quality`` and
        ``run_diagnostic``.
    real_data
        Training data; also the reference data for both reports.
    samples : int, default 500
        Number of synthetic rows requested from ``synthesizer.sample``.
    gen_quality_report : bool, default False
        Build the quality report. Off by default because it is time consuming.
    gen_diagnostic_report : bool, default True
        Build the diagnostic report.

    Returns
    -------
    dict
        ``training_time`` and ``sampling_time`` in seconds, plus
        ``quality_report`` and ``diagnostic_report``. Each report entry is
        ``None`` when its flag is off, otherwise a dict holding the report
        object and the seconds spent building it (and, for the quality report,
        the value of ``get_score()``).
    """
    # ---------------------
    # Step 1: Train
    # ---------------------
    _, training_time = _timed_call(synthesizer.fit, real_data)

    # ---------------------
    # Step 2: Sample
    # ---------------------
    synthetic_data, sampling_time = _timed_call(synthesizer.sample, num_rows=samples)

    # ---------------------
    # Step 3: Evaluate
    # ---------------------

    # 1. QUALITY REPORT (time consuming, so kept conditional)
    quality_report = None
    if gen_quality_report:
        quality_report_obj, quality_report_time = _timed_call(
            evaluate_quality, real_data, synthetic_data, metadata
        )
        quality_report = {
            "quality_report_obj": quality_report_obj,
            "quality_report_time": quality_report_time,
            "quality_report_score": quality_report_obj.get_score(),
        }

    # 2. DIAGNOSTIC REPORT
    diagnostic_report = None
    if gen_diagnostic_report:
        diagnostic_report_obj, diagnostic_report_time = _timed_call(
            run_diagnostic, real_data, synthetic_data, metadata
        )
        diagnostic_report = {
            "diagnostic_report_obj": diagnostic_report_obj,
            "diagnostic_report_time": diagnostic_report_time,
        }

    return {
        "training_time": training_time,
        "sampling_time": sampling_time,
        "quality_report": quality_report,
        "diagnostic_report": diagnostic_report,
    }

In [10]:
def detect_metadata(real_data_df):
    """Auto-detect single-table metadata for a dataframe, print it and return it.

    A fresh ``SingleTableMetadata`` is populated through
    ``detect_from_dataframe`` and its dict form is pretty-printed so the
    detected column types can be inspected. The metadata object itself (not the
    dict) is returned.
    """
    detected = SingleTableMetadata()
    detected.detect_from_dataframe(data=real_data_df)

    # Show what was inferred so it can be reviewed before training.
    pprint(detected.to_dict())
    return detected

## Models

## 1. Gaussian Copula Model 

In [6]:
# Synthesizer class under test in this section; the cells below instantiate it
# through this alias.
evaluate_synthesizer = GaussianCopulaSynthesizer

### 1.1. Mock SDV demo data

In [11]:
# Fetch the SDV demo table together with the metadata returned alongside it.
demo_data, demo_metadata = download_demo(modality="single_table", dataset_name="fake_hotel_guests")

pprint(demo_metadata)

# Instantiate whichever synthesizer class `evaluate_synthesizer` points to.
synthesizer = evaluate_synthesizer(
    demo_metadata,
    enforce_min_max_values=True,
    enforce_rounding=False,
)

# Train, sample 500 rows, and print the timing/report summary.
pprint(get_model_time_score(synthesizer, demo_metadata, demo_data, samples=500))

{
    "columns": {
        "guest_email": {
            "sdtype": "email",
            "pii": true
        },
        "has_rewards": {
            "sdtype": "boolean"
        },
        "room_type": {
            "sdtype": "categorical"
        },
        "amenities_fee": {
            "sdtype": "numerical",
            "computer_representation": "Float"
        },
        "checkin_date": {
            "sdtype": "datetime",
            "datetime_format": "%d %b %Y"
        },
        "checkout_date": {
            "sdtype": "datetime",
            "datetime_format": "%d %b %Y"
        },
        "room_rate": {
            "sdtype": "numerical",
            "computer_representation": "Float"
        },
        "billing_address": {
            "sdtype": "address",
            "pii": true
        },
        "credit_card_number": {
            "sdtype": "credit_card_number",
            "pii": true
        }
    },
    "primary_key": "guest_email",
    "METADATA_SPEC_VERSION": "SINGLE_TABL

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 47.59it/s]



Overall Quality Score: 87.83%

Properties:
Column Shapes: 87.28%
Column Pair Trends: 88.38%


Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:01<00:00,  2.08it/s]


DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ The synthetic data covers over 90% of the categories present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x144346a60>,
                       'diagnostic_report_time': 1.9242639541625977},
 'quality_report': {'quality_report_obj': <sdmetrics.reports.single_table.quality_report.QualityReport object at 0x1449c0e20>,
                    'quality_report_score': 0.8782824635868114,
                    'quality_report_time': 0.22680377960205078},
 'sampling_time': 0.36507701873779297,
 'training_time': 1.60667085647583}





### 1.2. Mock hospital data

**Varying number of rows and columns**

In [16]:
# Benchmark every (columns, rows) combination of the simulated hospital data.
for c in str_num_columns:
    for r in str_num_rows:
        file_name = f"{BASE_MOCK_HOSP_PATH}/{c}_data_fields/hosp_{r}x{c}.csv"

        hosp_df = pd.read_csv(file_name)
        # detect_metadata also pretty-prints what it inferred for this file.
        hosp_metadata = detect_metadata(hosp_df)

        synthesizer = evaluate_synthesizer(
            hosp_metadata,
            enforce_min_max_values=True,
            enforce_rounding=False,
        )

        print("#" * 30)
        print(f"Generating for: {file_name}")
        print("-" * 30)
        # Request as many synthetic rows as the real file contains.
        pprint(get_model_time_score(synthesizer, hosp_metadata, hosp_df, samples=len(hosp_df)))

{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age': {'sdtype': 'numerical'},
             'DateOfDiagnosis': {'sdtype': 'categorical'},
             'Gender': {'sdtype': 'categorical'},
             'NRIC': {'sdtype': 'categorical'},
             'Name': {'sdtype': 'categorical'}}}
##############################
Generating for: synthetic_data/5_data_fields/hosp_1000x5.csv
------------------------------


Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.42it/s]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x146f57bb0>,
                       'diagnostic_report_time': 2.83032488822937},
 'quality_report': None,
 'sampling_time': 0.07252311706542969,
 'training_time': 0.4693310260772705}
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age': {'sdtype': 'numerical'},
             'DateOfDiagnosis': {'sdtype': 'categorical'},
             'Gender': {'sdtype': 'categorical'},
             'NRIC': {'sdtype': 'categorical'},
             'Name': {'sdtype': 'categorical'}}}
##############################
Generati

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:34<00:00,  8.61s/it]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x1476503d0>,
                       'diagnostic_report_time': 34.457128047943115},
 'quality_report': None,
 'sampling_time': 0.14510798454284668,
 'training_time': 0.8101482391357422}
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age': {'sdtype': 'numerical'},
             'DateOfDiagnosis': {'sdtype': 'categorical'},
             'Gender': {'sdtype': 'categorical'},
             'NRIC': {'sdtype': 'categorical'},
             'Name': {'sdtype': 'categorical'}}}
##############################
Genera

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [01:02<00:00, 15.54s/it]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x144346a60>,
                       'diagnostic_report_time': 62.185527086257935},
 'quality_report': None,
 'sampling_time': 0.4968688488006592,
 'training_time': 3.0849459171295166}
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age': {'sdtype': 'numerical'},
             'DateOfDiagnosis': {'sdtype': 'categorical'},
             'Gender': {'sdtype': 'categorical'},
             'NRIC': {'sdtype': 'categorical'},
             'Name': {'sdtype': 'categorical'}}}
##############################
Generat

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [01:37<00:00, 24.43s/it]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x147696f40>,
                       'diagnostic_report_time': 97.72226524353027},
 'quality_report': None,
 'sampling_time': 0.8451488018035889,
 'training_time': 5.720771074295044}
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age0': {'sdtype': 'numerical'},
             'BMI0': {'sdtype': 'numerical'},
             'BloodType0': {'sdtype': 'categorical'},
             'DateOfDiagnosis0': {'sdtype': 'categorical'},
             'Disease0': {'sdtype': 'categorical'},
             'Gender0': {'sdtype'

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:05<00:00,  1.34s/it]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x146f42040>,
                       'diagnostic_report_time': 5.35106897354126},
 'quality_report': None,
 'sampling_time': 0.07695269584655762,
 'training_time': 0.8008823394775391}
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age0': {'sdtype': 'numerical'},
             'BMI0': {'sdtype': 'numerical'},
             'BloodType0': {'sdtype': 'categorical'},
             'DateOfDiagnosis0': {'sdtype': 'categorical'},
             'Disease0': {'sdtype': 'categorical'},
             'Gender0': {'sdtype

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:59<00:00, 14.92s/it]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x146f57c40>,
                       'diagnostic_report_time': 59.67865204811096},
 'quality_report': None,
 'sampling_time': 0.2223680019378662,
 'training_time': 1.97564697265625}
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age0': {'sdtype': 'numerical'},
             'BMI0': {'sdtype': 'numerical'},
             'BloodType0': {'sdtype': 'categorical'},
             'DateOfDiagnosis0': {'sdtype': 'categorical'},
             'Disease0': {'sdtype': 'categorical'},
             'Gender0': {'sdtype':

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [01:42<00:00, 25.56s/it]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x147cae850>,
                       'diagnostic_report_time': 102.24076700210571},
 'quality_report': None,
 'sampling_time': 0.8990569114685059,
 'training_time': 6.716246128082275}
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age0': {'sdtype': 'numerical'},
             'BMI0': {'sdtype': 'numerical'},
             'BloodType0': {'sdtype': 'categorical'},
             'DateOfDiagnosis0': {'sdtype': 'categorical'},
             'Disease0': {'sdtype': 'categorical'},
             'Gender0': {'sdtype

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [02:44<00:00, 41.06s/it]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x1476b3310>,
                       'diagnostic_report_time': 164.26127314567566},
 'quality_report': None,
 'sampling_time': 1.610666036605835,
 'training_time': 10.999433040618896}
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age0': {'sdtype': 'numerical'},
             'Age1': {'sdtype': 'numerical'},
             'BMI0': {'sdtype': 'numerical'},
             'BMI1': {'sdtype': 'numerical'},
             'BloodType0': {'sdtype': 'categorical'},
             'BloodType1': {'sdtype': 'categorical'}

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:10<00:00,  2.57s/it]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x1476b39d0>,
                       'diagnostic_report_time': 10.286925792694092},
 'quality_report': None,
 'sampling_time': 0.13703012466430664,
 'training_time': 1.6411948204040527}
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age0': {'sdtype': 'numerical'},
             'Age1': {'sdtype': 'numerical'},
             'BMI0': {'sdtype': 'numerical'},
             'BMI1': {'sdtype': 'numerical'},
             'BloodType0': {'sdtype': 'categorical'},
             'BloodType1': {'sdtype': 'categorical

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [02:03<00:00, 30.84s/it]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x147c92eb0>,
                       'diagnostic_report_time': 123.34596610069275},
 'quality_report': None,
 'sampling_time': 0.3784189224243164,
 'training_time': 3.7650418281555176}
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age0': {'sdtype': 'numerical'},
             'Age1': {'sdtype': 'numerical'},
             'BMI0': {'sdtype': 'numerical'},
             'BMI1': {'sdtype': 'numerical'},
             'BloodType0': {'sdtype': 'categorical'},
             'BloodType1': {'sdtype': 'categorical'

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [03:41<00:00, 55.47s/it]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x1476b1070>,
                       'diagnostic_report_time': 221.89803791046143},
 'quality_report': None,
 'sampling_time': 1.8327300548553467,
 'training_time': 13.411086797714233}
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age0': {'sdtype': 'numerical'},
             'Age1': {'sdtype': 'numerical'},
             'BMI0': {'sdtype': 'numerical'},
             'BMI1': {'sdtype': 'numerical'},
             'BloodType0': {'sdtype': 'categorical'},
             'BloodType1': {'sdtype': 'categorical'

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [05:37<00:00, 84.35s/it]


DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x145f0e9d0>,
                       'diagnostic_report_time': 337.39805579185486},
 'quality_report': None,
 'sampling_time': 3.803658962249756,
 'training_time': 26.336549758911133}





### 1.3. Real-world data

In [56]:
# for file_name in REAL_DATA_LIST:
#     real_df = pd.read_csv(file_name)
#     real_metadata = detect_metadata(real_df)
    
#     synthesizer = evaluate_synthesizer(real_metadata)

#     print("#"*30)
#     print(f"Generating for: {file_name} {real_df.shape}")
#     print(get_model_time_score(synthesizer, real_metadata, real_df, samples=real_df.shape[0]))
#     print("#"*30)

# 2. CTGAN Model

In [17]:
# Switch the synthesizer class under test to CTGAN; the cells below instantiate
# it through this alias.
evaluate_synthesizer = CTGANSynthesizer

### 2.1. Mock SDV-demo Data

In [18]:
# Fetch the SDV demo table together with the metadata returned alongside it.
demo_data, demo_metadata = download_demo(modality="single_table", dataset_name="fake_hotel_guests")

pprint(demo_metadata)

# Instantiate whichever synthesizer class `evaluate_synthesizer` points to.
synthesizer = evaluate_synthesizer(
    demo_metadata,
    enforce_min_max_values=True,
    enforce_rounding=False,
)

# Train, sample 500 rows, and print the timing/report summary.
pprint(get_model_time_score(synthesizer, demo_metadata, demo_data, samples=500))

{
    "columns": {
        "guest_email": {
            "sdtype": "email",
            "pii": true
        },
        "has_rewards": {
            "sdtype": "boolean"
        },
        "room_type": {
            "sdtype": "categorical"
        },
        "amenities_fee": {
            "sdtype": "numerical",
            "computer_representation": "Float"
        },
        "checkin_date": {
            "sdtype": "datetime",
            "datetime_format": "%d %b %Y"
        },
        "checkout_date": {
            "sdtype": "datetime",
            "datetime_format": "%d %b %Y"
        },
        "room_rate": {
            "sdtype": "numerical",
            "computer_representation": "Float"
        },
        "billing_address": {
            "sdtype": "address",
            "pii": true
        },
        "credit_card_number": {
            "sdtype": "credit_card_number",
            "pii": true
        }
    },
    "primary_key": "guest_email",
    "METADATA_SPEC_VERSION": "SINGLE_TABL

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:01<00:00,  2.06it/s]


DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ The synthetic data covers over 90% of the categories present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data

! More than 10% the synthetic data does not follow the min/max boundaries set by the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x14769bee0>,
                       'diagnostic_report_time': 1.9457759857177734},
 'quality_report': None,
 'sampling_time': 0.3443422317504883,
 'training_time': 66.81222605705261}





### 2.2. Mock hospital data 

In [19]:
# Benchmark the simulated hospital data for every column count, restricted to
# the two smallest row counts instead of the full str_num_rows list.
for c in str_num_columns:
    for r in [1_000, 10_000]:
        file_name = f"{BASE_MOCK_HOSP_PATH}/{c}_data_fields/hosp_{r}x{c}.csv"

        hosp_df = pd.read_csv(file_name)
        # detect_metadata also pretty-prints what it inferred for this file.
        hosp_metadata = detect_metadata(hosp_df)

        synthesizer = evaluate_synthesizer(
            hosp_metadata,
            enforce_min_max_values=True,
            enforce_rounding=False,
        )

        print("#" * 30)
        print(f"Generating for: {file_name}")
        print("-" * 30)
        # Request as many synthetic rows as the real file contains.
        pprint(get_model_time_score(synthesizer, hosp_metadata, hosp_df, samples=len(hosp_df)))

{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age': {'sdtype': 'numerical'},
             'DateOfDiagnosis': {'sdtype': 'categorical'},
             'Gender': {'sdtype': 'categorical'},
             'NRIC': {'sdtype': 'categorical'},
             'Name': {'sdtype': 'categorical'}}}
##############################
Generating for: synthetic_data/5_data_fields/hosp_1000x5.csv
------------------------------


Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.47it/s]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x1458b7fd0>,
                       'diagnostic_report_time': 2.733616828918457},
 'quality_report': None,
 'sampling_time': 0.26647305488586426,
 'training_time': 556.220388174057}
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age': {'sdtype': 'numerical'},
             'DateOfDiagnosis': {'sdtype': 'categorical'},
             'Gender': {'sdtype': 'categorical'},
             'NRIC': {'sdtype': 'categorical'},
             'Name': {'sdtype': 'categorical'}}}
##############################
Generatin

KeyboardInterrupt: 

### 2.3. Real data

In [None]:
# for file_name in REAL_DATA_LIST:
#     real_df = pd.read_csv(file_name)
#     real_metadata = detect_metadata(real_df)
    
#     synthesizer = evaluate_synthesizer(real_metadata)

#     print("#"*30)
#     print(f"Generating for: {file_name} {real_df.shape}")
#     print(get_model_time_score(synthesizer, real_metadata, real_df, samples=real_df.shape[0]))
#     print("#"*30)

# 3. TVAE Model

In [20]:
# Switch the synthesizer class under test to TVAE; the cells below instantiate
# it through this alias.
evaluate_synthesizer = TVAESynthesizer

### 3.1. Mock SDV-demo data 

In [21]:
# Fetch the SDV demo table together with the metadata returned alongside it.
demo_data, demo_metadata = download_demo(modality="single_table", dataset_name="fake_hotel_guests")

pprint(demo_metadata)

# Instantiate whichever synthesizer class `evaluate_synthesizer` points to.
synthesizer = evaluate_synthesizer(
    demo_metadata,
    enforce_min_max_values=True,
    enforce_rounding=False,
)

# Train, sample 500 rows, and print the timing/report summary.
pprint(get_model_time_score(synthesizer, demo_metadata, demo_data, samples=500))

{
    "columns": {
        "guest_email": {
            "sdtype": "email",
            "pii": true
        },
        "has_rewards": {
            "sdtype": "boolean"
        },
        "room_type": {
            "sdtype": "categorical"
        },
        "amenities_fee": {
            "sdtype": "numerical",
            "computer_representation": "Float"
        },
        "checkin_date": {
            "sdtype": "datetime",
            "datetime_format": "%d %b %Y"
        },
        "checkout_date": {
            "sdtype": "datetime",
            "datetime_format": "%d %b %Y"
        },
        "room_rate": {
            "sdtype": "numerical",
            "computer_representation": "Float"
        },
        "billing_address": {
            "sdtype": "address",
            "pii": true
        },
        "credit_card_number": {
            "sdtype": "credit_card_number",
            "pii": true
        }
    },
    "primary_key": "guest_email",
    "METADATA_SPEC_VERSION": "SINGLE_TABL

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.87it/s]


DiagnosticResults:

SUCCESS:
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the numerical ranges present in the real data
! The synthetic data is missing more than 10% of the categories present in the real data
! More than 10% of the synthetic rows are copies of the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x146f3ac10>,
                       'diagnostic_report_time': 2.1494181156158447},
 'quality_report': None,
 'sampling_time': 0.3445770740509033,
 'training_time': 30.49004077911377}





### 3.2. Mock hospital data

In [22]:
# Benchmark only the 5-column simulated hospital files, at 1k and 10k rows.
for c in [5]:
    for r in [1_000, 10_000]:
        file_name = f"{BASE_MOCK_HOSP_PATH}/{c}_data_fields/hosp_{r}x{c}.csv"

        hosp_df = pd.read_csv(file_name)
        # detect_metadata also pretty-prints what it inferred for this file.
        hosp_metadata = detect_metadata(hosp_df)

        synthesizer = evaluate_synthesizer(
            hosp_metadata,
            enforce_min_max_values=True,
            enforce_rounding=False,
        )

        print("#" * 30)
        print(f"Generating for: {file_name}")
        print("-" * 30)
        # Request as many synthetic rows as the real file contains.
        pprint(get_model_time_score(synthesizer, hosp_metadata, hosp_df, samples=len(hosp_df)))

{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age': {'sdtype': 'numerical'},
             'DateOfDiagnosis': {'sdtype': 'categorical'},
             'Gender': {'sdtype': 'categorical'},
             'NRIC': {'sdtype': 'categorical'},
             'Name': {'sdtype': 'categorical'}}}
##############################
Generating for: synthetic_data/5_data_fields/hosp_1000x5.csv
------------------------------


Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:03<00:00,  1.12it/s]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

DANGER:
x The synthetic data is missing more than 50% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x143944700>,
                       'diagnostic_report_time': 3.571470260620117},
 'quality_report': None,
 'sampling_time': 0.08768272399902344,
 'training_time': 26.401947021484375}
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age': {'sdtype': 'numerical'},
             'DateOfDiagnosis': {'sdtype': 'categorical'},
             'Gender': {'sdtype': 'categorical'},
             'NRIC': {'sdtype': 'categorical'},
             'Name': {'sdtype': 'categorical'}}}
##############################

Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:31<00:00,  7.76s/it]


DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

DANGER:
x The synthetic data is missing more than 50% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x1490ba6d0>,
                       'diagnostic_report_time': 31.03897500038147},
 'quality_report': None,
 'sampling_time': 12.799145936965942,
 'training_time': 22312.21906399727}





# TVAE (quality)

In [27]:
# TVAE benchmark on the 5-column mock hospital CSVs (1,000 and 10,000 rows).
# Same loop as the mock-hospital cell in section 3.2, except that two extra
# positional `True` flags are passed to `get_model_time_score`.
#
# Bind the synthesizer class explicitly, as the CTGAN and Gaussian Copula cells do,
# so this cell does not depend on whatever `evaluate_synthesizer` was left pointing
# at by a previously executed cell (hidden kernel state).
evaluate_synthesizer = TVAESynthesizer

for c in [5]:
    for r in [1000, 10000]:
        file_name = BASE_MOCK_HOSP_PATH + f"/{c}_data_fields/hosp_{r}x{c}.csv"
        hosp_df = pd.read_csv(file_name)
        hosp_metadata = detect_metadata(hosp_df)

        synthesizer = evaluate_synthesizer(hosp_metadata, enforce_min_max_values=True, enforce_rounding=False)

        # Banner identifying which file the following report output belongs to.
        print("#"*30)
        print(f"Generating for: {file_name}")
        print("-"*30)
        # Sample as many rows as the real table has. The two trailing `True` flags
        # presumably switch on the quality and diagnostic reports (both appear in the
        # recorded output) -- TODO confirm against get_model_time_score's signature.
        pprint(get_model_time_score(synthesizer, hosp_metadata, hosp_df, hosp_df.shape[0], True, True))

{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age': {'sdtype': 'numerical'},
             'DateOfDiagnosis': {'sdtype': 'categorical'},
             'Gender': {'sdtype': 'categorical'},
             'NRIC': {'sdtype': 'categorical'},
             'Name': {'sdtype': 'categorical'}}}
##############################
Generating for: synthetic_data/5_data_fields/hosp_1000x5.csv
------------------------------


Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:10<00:00,  2.66s/it]



Overall Quality Score: 26.27%

Properties:
Column Shapes: 40.94%
Column Pair Trends: 11.6%


Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.57it/s]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

DANGER:
x The synthetic data is missing more than 50% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x147650be0>,
                       'diagnostic_report_time': 2.549103021621704},
 'quality_report': {'quality_report_obj': <sdmetrics.reports.single_table.quality_report.QualityReport object at 0x147626ee0>,
                    'quality_report_score': 0.2627000000000052,
                    'quality_report_time': 11.189966917037964},
 'sampling_time': 0.061669111251831055,
 'training_time': 23.604692935943604}
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age': {'sdtype': 'numerical'},
     

KeyboardInterrupt: 

# CTGAN (quality)

In [28]:
# CTGAN benchmark on the 5-column mock hospital CSVs (1,000 and 10,000 rows).
# Rebinds the notebook-level `evaluate_synthesizer` to the CTGAN class, then for each
# file loads it, detects metadata, builds the synthesizer and pretty-prints the
# dict returned by `get_model_time_score`.
evaluate_synthesizer = CTGANSynthesizer

for c, r in [(5, 1000), (5, 10000)]:
    file_name = BASE_MOCK_HOSP_PATH + f"/{c}_data_fields/hosp_{r}x{c}.csv"
    hosp_df = pd.read_csv(file_name)
    hosp_metadata = detect_metadata(hosp_df)

    synthesizer = evaluate_synthesizer(
        hosp_metadata,
        enforce_min_max_values=True,
        enforce_rounding=False,
    )

    # Banner identifying which file the following report output belongs to.
    print("#"*30, f"Generating for: {file_name}", "-"*30, sep="\n")

    # Sample as many rows as the real table has. The two trailing `True` flags
    # presumably switch on the quality and diagnostic reports -- TODO confirm
    # against get_model_time_score's signature.
    model_scores = get_model_time_score(
        synthesizer, hosp_metadata, hosp_df, len(hosp_df), True, True
    )
    pprint(model_scores)

{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age': {'sdtype': 'numerical'},
             'DateOfDiagnosis': {'sdtype': 'categorical'},
             'Gender': {'sdtype': 'categorical'},
             'NRIC': {'sdtype': 'categorical'},
             'Name': {'sdtype': 'categorical'}}}
##############################
Generating for: synthetic_data/5_data_fields/hosp_1000x5.csv
------------------------------


Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:06<00:00,  1.54s/it]



Overall Quality Score: 46.57%

Properties:
Column Shapes: 71.18%
Column Pair Trends: 21.97%


Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.41it/s]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x14434c7f0>,
                       'diagnostic_report_time': 2.8496294021606445},
 'quality_report': {'quality_report_obj': <sdmetrics.reports.single_table.quality_report.QualityReport object at 0x1499bad00>,
                    'quality_report_score': 0.4657499999999999,
                    'quality_report_time': 7.034662246704102},
 'sampling_time': 0.2557530403137207,
 'training_time': 549.3154628276825}
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age': {'sdtype': 'numerical'},
             'Da

KeyboardInterrupt: 

# Gaussian Copula (quality)

In [29]:
# Gaussian Copula benchmark on the 5-column mock hospital CSVs (1,000 and 10,000 rows).
# Rebinds the notebook-level `evaluate_synthesizer` to the Gaussian Copula class, then
# for each file loads it, detects metadata, builds the synthesizer and pretty-prints
# the dict returned by `get_model_time_score`.
evaluate_synthesizer = GaussianCopulaSynthesizer

for c, r in [(5, 1000), (5, 10000)]:
    file_name = BASE_MOCK_HOSP_PATH + f"/{c}_data_fields/hosp_{r}x{c}.csv"
    hosp_df = pd.read_csv(file_name)
    hosp_metadata = detect_metadata(hosp_df)

    synthesizer = evaluate_synthesizer(
        hosp_metadata,
        enforce_min_max_values=True,
        enforce_rounding=False,
    )

    # Banner identifying which file the following report output belongs to.
    print("#"*30, f"Generating for: {file_name}", "-"*30, sep="\n")

    # Sample as many rows as the real table has. The two trailing `True` flags
    # presumably switch on the quality and diagnostic reports -- TODO confirm
    # against get_model_time_score's signature.
    model_scores = get_model_time_score(
        synthesizer, hosp_metadata, hosp_df, len(hosp_df), True, True
    )
    pprint(model_scores)

{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age': {'sdtype': 'numerical'},
             'DateOfDiagnosis': {'sdtype': 'categorical'},
             'Gender': {'sdtype': 'categorical'},
             'NRIC': {'sdtype': 'categorical'},
             'Name': {'sdtype': 'categorical'}}}
##############################
Generating for: synthetic_data/5_data_fields/hosp_1000x5.csv
------------------------------


Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:06<00:00,  1.72s/it]



Overall Quality Score: 50.5%

Properties:
Column Shapes: 77.28%
Column Pair Trends: 23.72%


Creating report: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:03<00:00,  1.05it/s]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x14ab27f70>,
                       'diagnostic_report_time': 3.8298580646514893},
 'quality_report': {'quality_report_obj': <sdmetrics.reports.single_table.quality_report.QualityReport object at 0x14ab30c10>,
                    'quality_report_score': 0.5049999999999999,
                    'quality_report_time': 7.656247138977051},
 'sampling_time': 0.08017420768737793,
 'training_time': 0.5256392955780029}
{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'Age': {'sdtype': 'numerical'},
             '

Creating report: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [38:39<00:00, 579.84s/it]



Overall Quality Score: 51.62%

Properties:
Column Shapes: 77.91%
Column Pair Trends: 25.33%


Creating report: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:32<00:00,  8.10s/it]


DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the categories present in the real data
{'diagnostic_report': {'diagnostic_report_obj': <sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x143f9e880>,
                       'diagnostic_report_time': 32.387190103530884},
 'quality_report': {'quality_report_obj': <sdmetrics.reports.single_table.quality_report.QualityReport object at 0x14ab78310>,
                    'quality_report_score': 0.5162100000000078,
                    'quality_report_time': 2325.6293292045593},
 'sampling_time': 0.138502836227417,
 'training_time': 0.842177152633667}





### 3.3. Real-data

In [None]:
# Disabled (entire cell is commented out): for every CSV path in REAL_DATA_LIST, load it,
# detect its metadata, build the currently selected synthesizer with default options and
# print the result of get_model_time_score, sampling as many rows as the real table has.
# for file_name in REAL_DATA_LIST:
#     real_df = pd.read_csv(file_name)
#     real_metadata = detect_metadata(real_df)
    
#     synthesizer = evaluate_synthesizer(real_metadata)

#     print("#"*30)
#     print(f"Generating for: {file_name} {real_df.shape}")
#     print(get_model_time_score(synthesizer, real_metadata, real_df, samples=real_df.shape[0]))
#     print("#"*30)

# Dumps

In [None]:
# Show the visualization for the 'Column Shapes' property of the quality report.
# NOTE(review): within this section `quality_report` is only assigned by the later
# `evaluate_quality(...)` cell, so on a fresh kernel that cell has to run first --
# confirm whether an earlier part of the notebook also defines it.
quality_report.get_visualization('Column Shapes')

In [None]:
# Gaussian Copula synthesizer with hand-picked distributions: 'truncnorm' is the
# default, overridden for the three columns listed below.
distribution_overrides = {
    'checkin_date': 'uniform',
    'checkout_date': 'uniform',
    'room_rate': 'gaussian_kde',
}

custom_synthesizer = GaussianCopulaSynthesizer(
    metadata,
    default_distribution='truncnorm',
    numerical_distributions=distribution_overrides,
)

# Fit on the real table so the synthesizer can be sampled from afterwards.
custom_synthesizer.fit(real_data)

In [None]:
# Fetch the distributions learned by the fitted custom synthesizer and display the
# entry keyed by 'has_rewards' as the cell output.
learned_distributions = custom_synthesizer.get_learned_distributions()
learned_distributions['has_rewards']

In [None]:
# Draw 500 synthetic rows from the customized synthesizer, then build a quality
# report comparing them with the real data.
synthetic_data_customized = custom_synthesizer.sample(num_rows=500)

quality_report = evaluate_quality(
    real_data=real_data,
    synthetic_data=synthetic_data_customized,
    metadata=metadata,
)

In [None]:
# `get_column_plot` is commented out of the notebook's import cell, so import it
# here; without this the cell raises NameError on a fresh kernel.
from sdv.evaluation.single_table import get_column_plot

# Plot the 'room_rate' column of the real data against the customized synthetic data.
fig = get_column_plot(
    real_data=real_data,
    synthetic_data=synthetic_data_customized,
    column_name='room_rate',
    metadata=metadata
)

fig.show()

In [None]:
from sdv.sampling import Condition

# Two sampling conditions of 250 rows each, both fixing room_type to 'SUITE':
# the first with has_rewards=True ...
suite_guests_with_rewards = Condition(
    num_rows=250,
    column_values={'room_type': 'SUITE', 'has_rewards': True}
)

# ... and the second with has_rewards=False.
suite_guests_without_rewards = Condition(
    num_rows=250,
    column_values={'room_type': 'SUITE', 'has_rewards': False}
)