In [28]:
import time

import pandas as pd

# models 
from sdv.single_table import GaussianCopulaSynthesizer
from sdv.single_table import CTGANSynthesizer
from sdv.single_table import TVAESynthesizer

# autodetection of metadata
from sdv.metadata import SingleTableMetadata

# evaluation of synthetic data
from sdv.evaluation.single_table import evaluate_quality
from sdv.evaluation.single_table import run_diagnostic

from sdv.datasets.demo import download_demo

# visualisation
from sdv.evaluation.single_table import get_column_plot

# conditional sampling
from sdv.sampling import Condition

In [11]:
import warnings
# Silences every warning category for the rest of the session so the
# experiment output stays readable.
# NOTE(review): this also hides deprecation/convergence warnings from the
# libraries used below -- consider narrowing the filter.
warnings.filterwarnings('ignore')

# Pretty-printer used below for metadata dicts and result dicts.
from pprint import pprint

# Notes

### Synthesizers -- models to generate synthetic data 

Models to generate single table data 
- Gaussian Copula -- Statistical modelling | most customizable and faster 
- CTGAN – GAN-based (Deep learning) | high fidelity dataset, training time required
- TVAE – variational autoencoder (Deep learning)| high fidelity dataset, training time required

Others
- PAR  (?) – for sequential
- HMA  (?) - for multi-table

**Notebook focuses on Gaussian Copula, CTGAN, TVAE** 

Inputs required for the synthesizers 
- Metadata (Required)
- Real Data (Required)
- Customisable parameters (Optional)

### Autodetecting metadata 

[From the Doc]
The detected metadata is not guaranteed to be accurate or complete. Be sure to carefully inspect the metadata and update information.
- Primary keys and other identifiers are not auto-detected. See set_primary_key and add_alternate_keys method to add them.
- Dates and sensitive information may be auto-detected incorrectly. See update_column method to update them.

### Evaluation Report

**Quality Report**
For checking data fidelity: how well the synthetic data captures the mathematical properties of your real data.

Evaluates: 
- Overall score including Column Shapes, Columns Pair Trends
- Metrics and score details for Column Shapes, Columns Pair Trends
- All scores range from 0 to 1

**Diagnostic Report** 
Gives a general sense of the strengths and weaknesses of the synthetic data model. 
Evaluates: 
- Overall [diagnostic results](https://docs.sdv.dev/sdmetrics/reports/diagnostic-report/single-table-api#get_results) -- SUCCESS, WARNING, DANGER 
- Properties -- Synthesis, Coverage, Boundaries 

# Experiments 

**Mock Data**
- SDV demo data 
- simulated_hospital_data with varying: (1) num of columns -- 5, 10, 20 (2) num of rows -- 1k, 10k, 50k, 100k

In [33]:
# Root folder of the simulated hospital CSV files (trailing slash included,
# because the loops below build paths by plain string concatenation).
BASE_MOCK_HOSP_PATh = "synthetic_data/clean/"

# Experiment grid for the simulated hospital data. Despite the `str_` prefix
# these are lists of ints; they are interpolated into the CSV file names.
str_num_rows = [
    1_000,
    10_000,
    50_000,
    100_000,
]
str_num_columns = [
    5,
    10,
    20,
]

**Real Data** (TODO)

Output sample size is set the same as the input dataset size. 

In [32]:
# Real-world CSV files benchmarked in the "Real-world data" sections.
# "real_data/time_use.csv" used to be listed twice, which made every model
# train and evaluate on the same file twice; each dataset is now listed once.
# NOTE(review): if the duplicate was a placeholder for a different dataset,
# add that file's path here.
REAL_DATA_LIST = [
    "real_data/time_use.csv",
    "real_data/community_survey.csv",
    "real_data/census_tract_data.csv",
]

## Utility functions

In [14]:
def get_model_time_score(synthesizer, metadata, real_data, samples=500, gen_quality_report=False, gen_diagnostic_report=True): 
    """Fit a synthesizer, sample from it, and time every stage.

    Parameters
    ----------
    synthesizer : object with ``fit(data)`` and ``sample(num_rows=...)``
        The unfitted synthesizer to benchmark. This argument used to be
        overwritten by a hard-coded ``GaussianCopulaSynthesizer``, so every
        model section measured Gaussian Copula. It is now used as given.
        Passing ``None`` falls back to that former hard-coded configuration
        (``enforce_min_max_values=True, enforce_rounding=False``).
    metadata
        Table metadata handed to the evaluation reports (and to the fallback
        synthesizer when ``synthesizer`` is ``None``).
    real_data
        The real table used for fitting and as the evaluation reference.
    samples : int, default 500
        Number of synthetic rows to sample.
    gen_quality_report : bool, default False
        Run ``evaluate_quality`` (slow, hence opt-in).
    gen_diagnostic_report : bool, default True
        Run ``run_diagnostic``.

    Returns
    -------
    dict
        ``training_time`` and ``sampling_time`` in seconds, plus
        ``quality_report`` and ``diagnostic_report``. Each report entry is a
        dict holding the report object and its run time (the quality entry
        also holds the overall score), or ``None`` when that report was not
        requested.
    """
    if synthesizer is None:
        synthesizer = GaussianCopulaSynthesizer(metadata, enforce_min_max_values=True, enforce_rounding=False)

    # ---------------------
    # Step 1: Train
    # ---------------------
    _, training_time = _timed_call(synthesizer.fit, real_data)

    # ---------------------
    # Step 2: Sample
    # ---------------------
    synthetic_data, sampling_time = _timed_call(synthesizer.sample, num_rows=samples)

    # ---------------------
    # Step 3: Evaluate
    # ---------------------
    # Both reports start as None so the return below never hits an unbound
    # name when a report is switched off.
    quality_report = None
    diagnostic_report = None

    # 1. QUALITY REPORT
    # time consuming so keeping conditional
    if gen_quality_report:
        quality_report_obj, quality_report_time = _timed_call(
            evaluate_quality, real_data, synthetic_data, metadata
        )
        quality_report = {
            "quality_report_obj": quality_report_obj,
            "quality_report_time": quality_report_time,
            "quality_report_score": quality_report_obj.get_score(),
        }

    # 2. DIAGNOSTIC REPORT
    if gen_diagnostic_report:
        diagnostic_report_obj, diagnostic_report_time = _timed_call(
            run_diagnostic, real_data, synthetic_data, metadata
        )
        diagnostic_report = {
            "diagnostic_report_obj": diagnostic_report_obj,
            "diagnostic_report_time": diagnostic_report_time,
        }

    return {
        "training_time": training_time,
        "sampling_time": sampling_time,
        "quality_report": quality_report,
        "diagnostic_report": diagnostic_report,
    }


def _timed_call(func, *args, **kwargs):
    """Call ``func(*args, **kwargs)`` and return ``(result, elapsed_seconds)``."""
    # perf_counter is monotonic, so the interval cannot be skewed by
    # wall-clock adjustments the way time.time() differences can.
    start = time.perf_counter()
    result = func(*args, **kwargs)
    return result, time.perf_counter() - start

In [22]:
def detect_metadata(real_data_df): 
    """Auto-detect single-table metadata for ``real_data_df``.

    Creates an empty ``SingleTableMetadata``, lets it detect from the
    dataframe, pretty-prints the detected metadata as a dict so it can be
    inspected by eye, and returns the metadata object.
    """
    detected = SingleTableMetadata()
    detected.detect_from_dataframe(data=real_data_df)
    detected_as_dict = detected.to_dict()
    pprint(detected_as_dict)
    return detected

# 1. Gaussian Copula Model 

In [35]:
# Synthesizer class under test for this section; the cells below instantiate
# it with `evaluate_synthesizer(<metadata>)`.
evaluate_synthesizer = GaussianCopulaSynthesizer

### 1.1. Mock SDV demo data

In [None]:
# Load the SDV demo table together with its metadata.
demo_data, demo_metadata = download_demo(
    modality='single_table',
    dataset_name='fake_hotel_guests'
)

pprint(demo_metadata)
synthesizer = evaluate_synthesizer(demo_metadata)

# The demo table is bound to `demo_data` above; the previously used name
# `real_demo_data` is not defined in the visible notebook and raised NameError.
pprint(get_model_time_score(synthesizer, demo_metadata, demo_data, 500, True, True))

### 1.2. Mock hospital data

**Varying number of rows and columns**

In [None]:
# Benchmark the selected synthesizer on every (columns x rows) variant of the
# simulated hospital data.
for n_cols in str_num_columns:
    for n_rows in str_num_rows:
        file_name = f"{BASE_MOCK_HOSP_PATh}{n_cols}_data_fields/hosp_{n_rows}x{n_cols}.csv"
        hosp_df = pd.read_csv(file_name)
        hosp_metadata = detect_metadata(hosp_df)

        synthesizer = evaluate_synthesizer(hosp_metadata)

        print("#" * 30)
        print(f"Generating for: {file_name}")
        print("-" * 30)
        # Sample as many synthetic rows as the input table has.
        scores = get_model_time_score(synthesizer, hosp_metadata, hosp_df, samples=len(hosp_df))
        pprint(scores)

### 1.3. Real-world data

In [None]:
# Benchmark the selected synthesizer on each real-world dataset.
for file_name in REAL_DATA_LIST:
    real_df = pd.read_csv(file_name)
    real_metadata = detect_metadata(real_df)
    
    synthesizer = evaluate_synthesizer(real_metadata)

    print("#"*30)
    print(f"Generating for: {file_name} {real_df.shape}")
    # The two report flags used to sit outside the call's closing parenthesis,
    # so they were printed as literal "True True" and the quality report was
    # never requested. They are now passed to get_model_time_score.
    print(get_model_time_score(
        synthesizer,
        real_metadata,
        real_df,
        samples=real_df.shape[0],
        gen_quality_report=True,
        gen_diagnostic_report=True,
    ))
    print("#"*30)

# 2. CTGAN Model

In [36]:
# Switch the synthesizer class under test to CTGAN; the cells below
# instantiate it with `evaluate_synthesizer(<metadata>)`.
evaluate_synthesizer = CTGANSynthesizer

In [None]:
# Load the SDV demo table together with its metadata.
demo_data, demo_metadata = download_demo(
    modality='single_table',
    dataset_name='fake_hotel_guests'
)

pprint(demo_metadata)
synthesizer = evaluate_synthesizer(demo_metadata)

# The demo table is bound to `demo_data` above; the previously used name
# `real_demo_data` is not defined in the visible notebook and raised NameError.
pprint(get_model_time_score(synthesizer, demo_metadata, demo_data, 500, True, True))

In [None]:
# Benchmark the selected synthesizer on every (columns x rows) variant of the
# simulated hospital data.
for n_cols in str_num_columns:
    for n_rows in str_num_rows:
        file_name = f"{BASE_MOCK_HOSP_PATh}{n_cols}_data_fields/hosp_{n_rows}x{n_cols}.csv"
        hosp_df = pd.read_csv(file_name)
        hosp_metadata = detect_metadata(hosp_df)

        synthesizer = evaluate_synthesizer(hosp_metadata)

        print("#" * 30)
        print(f"Generating for: {file_name}")
        print("-" * 30)
        # Sample as many synthetic rows as the input table has.
        scores = get_model_time_score(synthesizer, hosp_metadata, hosp_df, samples=len(hosp_df))
        pprint(scores)

In [None]:
# Benchmark the selected synthesizer on each real-world dataset.
for file_name in REAL_DATA_LIST:
    real_df = pd.read_csv(file_name)
    real_metadata = detect_metadata(real_df)
    
    synthesizer = evaluate_synthesizer(real_metadata)

    print("#"*30)
    print(f"Generating for: {file_name} {real_df.shape}")
    # The two report flags used to sit outside the call's closing parenthesis,
    # so they were printed as literal "True True" and the quality report was
    # never requested. They are now passed to get_model_time_score.
    print(get_model_time_score(
        synthesizer,
        real_metadata,
        real_df,
        samples=real_df.shape[0],
        gen_quality_report=True,
        gen_diagnostic_report=True,
    ))
    print("#"*30)

# 3. TVAE Model

In [None]:
# Switch the synthesizer class under test to TVAE; the cells below
# instantiate it with `evaluate_synthesizer(<metadata>)`.
evaluate_synthesizer = TVAESynthesizer

In [None]:
# Load the SDV demo table together with its metadata.
demo_data, demo_metadata = download_demo(
    modality='single_table',
    dataset_name='fake_hotel_guests'
)

pprint(demo_metadata)
synthesizer = evaluate_synthesizer(demo_metadata)

# The demo table is bound to `demo_data` above; the previously used name
# `real_demo_data` is not defined in the visible notebook and raised NameError.
pprint(get_model_time_score(synthesizer, demo_metadata, demo_data, 500, True, True))

In [None]:
# Benchmark the selected synthesizer on every (columns x rows) variant of the
# simulated hospital data.
for n_cols in str_num_columns:
    for n_rows in str_num_rows:
        file_name = f"{BASE_MOCK_HOSP_PATh}{n_cols}_data_fields/hosp_{n_rows}x{n_cols}.csv"
        hosp_df = pd.read_csv(file_name)
        hosp_metadata = detect_metadata(hosp_df)

        synthesizer = evaluate_synthesizer(hosp_metadata)

        print("#" * 30)
        print(f"Generating for: {file_name}")
        print("-" * 30)
        # Sample as many synthetic rows as the input table has.
        scores = get_model_time_score(synthesizer, hosp_metadata, hosp_df, samples=len(hosp_df))
        pprint(scores)

In [None]:
# Benchmark the selected synthesizer on each real-world dataset.
for file_name in REAL_DATA_LIST:
    real_df = pd.read_csv(file_name)
    real_metadata = detect_metadata(real_df)
    
    synthesizer = evaluate_synthesizer(real_metadata)

    print("#"*30)
    print(f"Generating for: {file_name} {real_df.shape}")
    # The two report flags used to sit outside the call's closing parenthesis,
    # so they were printed as literal "True True" and the quality report was
    # never requested. They are now passed to get_model_time_score.
    print(get_model_time_score(
        synthesizer,
        real_metadata,
        real_df,
        samples=real_df.shape[0],
        gen_quality_report=True,
        gen_diagnostic_report=True,
    ))
    print("#"*30)

# Dumps

In [None]:
# Show the 'Column Shapes' visualization of the quality report.
# NOTE(review): `quality_report` is only assigned at top level by a later cell
# in this section (the `evaluate_quality(...)` call), so on a fresh kernel
# this cell must be run after that one.
quality_report.get_visualization('Column Shapes')

In [None]:
# Gaussian Copula with hand-picked marginal distributions for the
# `fake_hotel_guests` demo table.
# `metadata` / `real_data` were never defined at notebook level; the column
# names below belong to the demo table loaded by `download_demo`, so its
# `demo_metadata` / `demo_data` are used instead.
custom_synthesizer = GaussianCopulaSynthesizer(
    demo_metadata,
    default_distribution='truncnorm',
    numerical_distributions={
        'checkin_date': 'uniform',
        'checkout_date': 'uniform',
        'room_rate': 'gaussian_kde'
    }
)

custom_synthesizer.fit(demo_data)

In [None]:
# Inspect what the fitted custom synthesizer learned for one column.
# NOTE(review): requires `custom_synthesizer` to have been fitted in the
# previous cell; confirm that 'has_rewards' is a key of the returned mapping.
learned_distributions = custom_synthesizer.get_learned_distributions()
learned_distributions['has_rewards']

In [None]:
# Sample from the customized synthesizer and score it against the demo table.
synthetic_data_customized = custom_synthesizer.sample(num_rows=500)

# `real_data` / `metadata` were never defined at notebook level; the demo
# table and metadata returned by `download_demo` are the intended reference.
quality_report = evaluate_quality(
    demo_data,
    synthetic_data_customized,
    demo_metadata
)

In [None]:
# Compare the real and synthetic distributions of `room_rate`.
# `get_column_plot` comes from the notebook's top import cell;
# `real_data` / `metadata` were never defined at notebook level, so the demo
# table and metadata returned by `download_demo` are used instead.
fig = get_column_plot(
    real_data=demo_data,
    synthetic_data=synthetic_data_customized,
    column_name='room_rate',
    metadata=demo_metadata
)

fig.show()

In [None]:
# Conditional-sampling specs: 250 suite guests with rewards and 250 without.
# `Condition` is imported in the notebook's top import cell rather than here,
# so all dependencies are declared in one place.
# NOTE(review): neither condition is passed to a sampler in the visible
# notebook -- presumably meant for conditional sampling on the synthesizer.
suite_guests_with_rewards = Condition(
    num_rows=250,
    column_values={'room_type': 'SUITE', 'has_rewards': True}
)

suite_guests_without_rewards = Condition(
    num_rows=250,
    column_values={'room_type': 'SUITE', 'has_rewards': False}
)