This notebook performs synthesis using the updated helper functions.

In [1]:
from sklearn.mixture import GaussianMixture
import matplotlib.pyplot as plt
import itertools
import os
import sys
import pickle

from bayes_opt import BayesianOptimization
from bayes_opt import UtilityFunction

# Add the parent directory to path
sys.path.append('..')

# Then import
from helper_functions import *

# Seeded legacy NumPy random generator for reproducibility.
# NOTE(review): `np` is not imported explicitly in this cell — presumably it is
# provided by the star import of helper_functions above; confirm.
rng = np.random.RandomState(42)

Import SK data.

In [2]:
# Load the cleaned SK data (location columns are standardized latitude/longitude).
# The path is relative to this notebook's directory.
# NOTE(review): `pd` is not imported explicitly — presumably it comes from the
# star import of helper_functions; confirm.
train_data = pd.read_csv("../../Data/SK/cleaned_sk_data.csv")

In [3]:
# Preview the loaded frame (the display is truncated for long frames).
train_data

Unnamed: 0,latitude,longitude,sex,age,state
0,0.944157,-1.010655,1,5,0
1,0.889746,-0.694899,1,5,0
2,0.917482,-0.682299,1,3,0
3,0.916660,-0.682551,1,3,0
4,0.886074,-0.668883,1,3,0
...,...,...,...,...,...
6707,0.881500,-0.925307,1,4,0
6708,-1.728784,1.470871,1,4,0
6709,-1.548071,1.557316,1,4,0
6710,-1.549392,1.555785,1,4,0


Define order of synthesis.

In [4]:
# Synthesis order: one (variables, method) tuple per step, processed top to bottom.
# `variables` is either a single column name or a list of column names;
# the methods used in this notebook are 'gmm' and 'multinomial'.
synthesis_steps = [
    (['latitude', 'longitude'], 'gmm'),
    ('sex', 'multinomial'),
    ('age', 'multinomial'),
    ('state', 'multinomial')
]

Set up parameter bounds.

In [5]:
# Search bounds for the optimizer, keyed by synthesis method.
#   'gmm'         -> {parameter name: (low, high)}
#   'multinomial' -> {variable name: {parameter name: (low, high)}}
# Only two-element lists/tuples are picked up as bounds by the cells below.
param_bounds = {
    'gmm': {
        'num_components': (10, 200),
    },
    'multinomial': {
        'sex': {'C': (0.1, 5)},
        'age': {'C': (0.1, 5)},
        'state': {'C': (0.1, 5)}
    }
}

Testing code: the cells below step through the body of `optimize_models` one piece at a time.

In [6]:
# Stand-ins for the arguments of optimize_models so that its body can be run
# cell by cell. train_data, synthesis_steps and param_bounds are already defined
# in the cells above, so only the two scalar arguments need values here.
number_synthetic_datasets = 3
random_state = 42

In [7]:
# Process parameter bounds for Bayesian optimization.
# dimensions:    one (low, high, 'uniform', name) tuple per parameter to tune.
# param_mapping: parallel list recording where each tuned value belongs,
#                ('gmm', param) or ('multinomial', variable, param).
dimensions = []
param_mapping = []

In [8]:
# Unpack the first synthesis step to inspect its (variables, method) structure.
step_vars, method = synthesis_steps[0]

In [9]:
# Method of the first synthesis step.
method

'gmm'

In [10]:
# Show the bounds dictionary that the next cell flattens.
param_bounds

{'gmm': {'num_components': (10, 200)},
 'multinomial': {'sex': {'C': (0.1, 5)},
  'age': {'C': (0.1, 5)},
  'state': {'C': (0.1, 5)}}}

In [11]:
# Track which parameters we need to optimize.
# The lists are (re)created here so that re-running this cell is idempotent;
# appending to lists created in another cell would duplicate every dimension
# each time this cell is executed again.
dimensions = []
param_mapping = []

for step_vars, method in synthesis_steps:
    # A step may name a single variable as a plain string; normalise to a list.
    if isinstance(step_vars, str):
        step_vars = [step_vars]

    if method == 'gmm' and 'gmm' in param_bounds:
        for param, bounds in param_bounds['gmm'].items():
            # Only two-element sequences are treated as (low, high) bounds.
            if isinstance(bounds, (list, tuple)) and len(bounds) == 2:
                # Add dimension for this parameter
                dim_name = f"gmm_{param}"
                dimensions.append((bounds[0], bounds[1], 'uniform', dim_name))
                param_mapping.append(('gmm', param))

    elif method == 'multinomial':
        for var in step_vars:
            # Check for variable-specific parameters
            var_params = param_bounds.get('multinomial', {}).get(var, {})
            for param, bounds in var_params.items():
                if isinstance(bounds, (list, tuple)) and len(bounds) == 2:
                    dim_name = f"multinomial_{var}_{param}"
                    dimensions.append((bounds[0], bounds[1], 'uniform', dim_name))
                    param_mapping.append(('multinomial', var, param))

In [12]:
# dim_name is a leftover loop variable: it only holds the name created in the
# last iteration of the loop above.
dim_name

'multinomial_state_C'

In [13]:
# One (low, high, 'uniform', name) tuple per tunable parameter.
dimensions

[(10, 200, 'uniform', 'gmm_num_components'),
 (0.1, 5, 'uniform', 'multinomial_sex_C'),
 (0.1, 5, 'uniform', 'multinomial_age_C'),
 (0.1, 5, 'uniform', 'multinomial_state_C')]

In [14]:
# Entry i says which nested parameter the flat optimizer variable "x<i>" feeds.
param_mapping

[('gmm', 'num_components'),
 ('multinomial', 'sex', 'C'),
 ('multinomial', 'age', 'C'),
 ('multinomial', 'state', 'C')]

In [15]:
# Flat bounds for the optimizer: the i-th dimension becomes "x<i>" -> (low, high);
# the distribution tag and the descriptive name of each dimension are dropped.
pbounds = dict(
    (f"x{i}", (low, high))
    for i, (low, high, _, _) in enumerate(dimensions)
)

In [16]:
# Flat name -> (low, high) mapping handed to the optimizer.
pbounds

{'x0': (10, 200), 'x1': (0.1, 5), 'x2': (0.1, 5), 'x3': (0.1, 5)}

In [17]:
# Hand-written sample of the flat values an optimizer step would pass in.
# NOTE(review): x1 = 10 lies outside the (0.1, 5) bound declared for the 'sex'
# C parameter, so this point could not be proposed within pbounds.
kwargs = {'x0': 100, 'x1': 10, 'x2': 1.0, 'x3': 3}

In [18]:
# Rebuild the nested param_values dictionary from the flat "x<i>" entries.
param_values = {'gmm': {}, 'multinomial': {}}

for i, (param_type, *rest) in enumerate(param_mapping):
    param_name = f"x{i}"
    # Flat entries that were not supplied are simply left out.
    if param_name not in kwargs:
        continue

    if param_type == 'gmm':
        # GMM entries carry only the parameter name; the value is cast to int.
        (param,) = rest
        param_values['gmm'][param] = int(kwargs[param_name])
        continue

    # Multinomial entries carry (variable, parameter); values are kept as given.
    var, param = rest
    param_values['multinomial'].setdefault(var, {})[param] = kwargs[param_name]

In [19]:
# Nested structure passed to perform_synthesis as `param_values`.
param_values

{'gmm': {'num_components': 100},
 'multinomial': {'sex': {'C': 10}, 'age': {'C': 1.0}, 'state': {'C': 3}}}

In [20]:
# Run synthesis with current parameters.
# perform_synthesis is not defined in this notebook — presumably it comes from
# the star import of helper_functions. Only the first element of its return
# value (the pMSE ratios) is kept.
# NOTE(review): the recorded run of this cell raised
# "ValueError: The feature names should match those that were passed during fit"
# (unseen feature: sex); the cause lies inside the helper and needs fixing there.
pmse_ratios, _ = perform_synthesis(
    train_data=train_data,
    number_synthetic_datasets=number_synthetic_datasets,
    synthesis_steps=synthesis_steps,
    param_values=param_values,
    random_state=random_state
)



   latitude  longitude
0  0.944157  -1.010655
1  0.889746  -0.694899
2  0.917482  -0.682299
3  0.916660  -0.682551
4  0.886074  -0.668883
5  0.915043  -0.684308
6  0.914255  -0.690260
7  0.916922  -0.680286
8  0.888412  -0.629299
9  0.916498  -0.680810


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- sex


In [None]:
def evaluate_models(**kwargs):
    """
    Objective function for the optimizer, evaluated at one point of the flat space.

    Reads the notebook-level variables param_mapping, train_data,
    number_synthetic_datasets, synthesis_steps and random_state.

    Parameters:
    -----------
    **kwargs : float
        Flat values keyed "x0", "x1", ...; entry i is routed according to
        param_mapping[i]. Keys that are absent are skipped.

    Returns:
    --------
    float
        -(1 - mean(pmse_ratios)) ** 2, i.e. the negated squared distance of the
        mean pMSE ratio from 1, so the maximum (0) is reached at a mean ratio of 1.
    """
    # Nested structure expected by perform_synthesis
    param_values = {'gmm': {}, 'multinomial': {}}

    for i, (param_type, *rest) in enumerate(param_mapping):
        flat_key = f"x{i}"
        if flat_key not in kwargs:
            continue

        if param_type == 'gmm':
            # GMM entries carry only the parameter name; the value is cast to int
            (gmm_param,) = rest
            param_values['gmm'][gmm_param] = int(kwargs[flat_key])
            continue

        # Multinomial entries carry (variable, parameter); values stay as given
        variable, setting = rest
        param_values['multinomial'].setdefault(variable, {})[setting] = kwargs[flat_key]

    # Run synthesis with the reconstructed parameters; only the ratios are used
    pmse_ratios, _ = perform_synthesis(
        train_data=train_data,
        number_synthetic_datasets=number_synthetic_datasets,
        synthesis_steps=synthesis_steps,
        param_values=param_values,
        random_state=random_state
    )

    deviation = 1 - np.mean(pmse_ratios)
    return -1 * (deviation ** 2)

In [7]:
# Run optimization.
# NOTE: optimize_models is defined in a later cell of this notebook; the
# recorded run of this cell raised NameError because that definition had not
# been executed yet. Run the definition cell before this one.
result = optimize_models(
    train_data=train_data,
    number_synthetic_datasets=3,
    synthesis_steps=synthesis_steps,
    param_bounds=param_bounds,
    random_state=42
)

NameError: name 'optimize_models' is not defined

In [None]:
def _is_bounds_pair(value):
    """Return True when `value` is a two-element list/tuple, i.e. (low, high)."""
    return isinstance(value, (list, tuple)) and len(value) == 2


def _collect_search_space(synthesis_steps, param_bounds):
    """Flatten `param_bounds` into parallel lists of mappings and (low, high) bounds.

    Each tunable parameter appears exactly once, in order of first use, even
    when several synthesis steps share it.
    """
    param_mapping = []
    search_bounds = []
    seen = set()

    def add(mapping, bounds):
        # Register a parameter only once; a duplicate would create a dimension
        # whose value is overwritten by a later one and never used.
        if mapping not in seen:
            seen.add(mapping)
            param_mapping.append(mapping)
            search_bounds.append((bounds[0], bounds[1]))

    gmm_bounds = param_bounds.get('gmm', {})
    multinomial_bounds = param_bounds.get('multinomial', {})

    # NOTE(review): global multinomial bounds are only used when no
    # variable-specific dict is present at all; as soon as one variable has its
    # own dict the global entries are ignored. Confirm this is intended.
    use_global_multinomial = not any(
        isinstance(value, dict) for value in multinomial_bounds.values()
    )

    for step_vars, method in synthesis_steps:
        # A step may name a single variable as a plain string
        if isinstance(step_vars, str):
            step_vars = [step_vars]

        if method == 'gmm':
            for param, bounds in gmm_bounds.items():
                if _is_bounds_pair(bounds):
                    add(('gmm', param), bounds)

        elif method == 'multinomial':
            for var in step_vars:
                # Variable-specific parameters
                var_params = multinomial_bounds.get(var, {})
                if isinstance(var_params, dict):
                    for param, bounds in var_params.items():
                        if _is_bounds_pair(bounds):
                            add(('multinomial', var, param), bounds)

            # Global multinomial parameters (stored under the 'global' key)
            if use_global_multinomial:
                for param, bounds in multinomial_bounds.items():
                    if _is_bounds_pair(bounds):
                        add(('multinomial', 'global', param), bounds)

    return param_mapping, search_bounds


def _unflatten_params(param_mapping, flat_values):
    """Rebuild the nested {'gmm': ..., 'multinomial': ...} dict from "x<i>" values."""
    nested = {'gmm': {}, 'multinomial': {}}
    for i, (param_type, *rest) in enumerate(param_mapping):
        flat_key = f"x{i}"
        if flat_key not in flat_values:
            continue
        if param_type == 'gmm':
            (param,) = rest
            # GMM parameters are passed on as integers (int() truncates)
            nested['gmm'][param] = int(flat_values[flat_key])
        else:  # multinomial
            var, param = rest
            nested['multinomial'].setdefault(var, {})[param] = flat_values[flat_key]
    return nested


def optimize_models(train_data,
                    number_synthetic_datasets,
                    synthesis_steps,
                    param_bounds,
                    random_state=None,
                    init_points=5,
                    n_iter=25):
    """
    Optimize synthesis model parameters using Bayesian optimization.

    The objective maximized is -(1 - mean(pmse_ratios)) ** 2, so the best
    parameters are those whose mean pMSE ratio is closest to 1.

    Parameters:
    -----------
    train_data : pd.DataFrame
        The training data to synthesize
    number_synthetic_datasets : int
        Number of synthetic datasets to generate
    synthesis_steps : list of tuples
        List of (variables, method) tuples specifying synthesis order.
        `variables` is a column name or a list of column names.
    param_bounds : dict
        Dictionary specifying parameter bounds for optimization. Only
        two-element lists/tuples are treated as (low, high) bounds.
        Example with variable-specific multinomial bounds:
            {
                'gmm': {
                    'num_components': (10, 200),
                    'n_init': (1, 10)
                },
                'multinomial': {
                    'var1': {'C': (0.1, 5)}
                }
            }
        Example with a global multinomial bound (used only when no
        variable-specific dict is present; reported under the 'global' key):
            {
                'multinomial': {
                    'C': (0.001, 3)
                }
            }
    random_state : int, optional
        Random state for reproducibility; passed both to the optimizer and to
        every synthesis run.
    init_points : int, optional
        Number of initial random evaluations (default 5).
    n_iter : int, optional
        Number of Bayesian optimization iterations (default 25).

    Returns:
    --------
    dict
        'best_params' : nested dict in the same layout as `param_values`
        'best_score'  : squared deviation of the mean pMSE ratio from 1 at the
                        best point (lower is better)
        'optimizer'   : the fitted BayesianOptimization object

    Raises:
    -------
    ValueError
        If `param_bounds` yields no parameter to tune for the given steps.
    """
    # Process parameter bounds for Bayesian optimization
    param_mapping, search_bounds = _collect_search_space(synthesis_steps, param_bounds)
    if not param_mapping:
        raise ValueError(
            "No parameters to optimize: param_bounds defines no (low, high) "
            "bounds for the methods used in synthesis_steps."
        )

    def evaluate_models(**kwargs):
        # Reconstruct the param_values structure from the flat parameter space
        param_values = _unflatten_params(param_mapping, kwargs)

        # Run synthesis with current parameters; only the ratios are used
        pmse_ratios, _ = perform_synthesis(
            train_data=train_data,
            number_synthetic_datasets=number_synthetic_datasets,
            synthesis_steps=synthesis_steps,
            param_values=param_values,
            random_state=random_state
        )

        # Negated squared distance of the mean pMSE ratio from 1 (maximized)
        return -1 * ((1 - np.mean(pmse_ratios)) ** 2)

    # Create parameter bounds for Bayesian optimization
    pbounds = {f"x{i}": bounds for i, bounds in enumerate(search_bounds)}

    # Run optimization
    optimizer = BayesianOptimization(
        f=evaluate_models,
        pbounds=pbounds,
        random_state=random_state
    )

    utility = UtilityFunction(kind="ei", xi=1e-02)
    optimizer.maximize(init_points=init_points, n_iter=n_iter, acquisition_function=utility)

    # Process results
    best_params = _unflatten_params(param_mapping, optimizer.max['params'])

    return {
        'best_params': best_params,
        # Flip the sign back: squared deviation of the mean pMSE ratio from 1
        'best_score': -optimizer.max['target'],
        'optimizer': optimizer
    }