This notebook applies our proposed synthesis approach to the training data only (the holdout data are excluded). The synthesis model is a logistic/multinomial logistic regression.

In [None]:
import pandas as pd
import numpy as np
from numpy.random import default_rng
from sklearn.mixture import GaussianMixture
from scipy.spatial import cKDTree
import matplotlib.pyplot as plt
import seaborn as sns

import time

from scipy.stats import ks_2samp

from copulas.multivariate import GaussianMultivariate

from sklearn import preprocessing

from sklearn.tree import DecisionTreeClassifier

import itertools

from bayes_opt import BayesianOptimization
from bayes_opt import UtilityFunction

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures

from bayesian_bootstrap import bayesian_bootstrap

# Shared, seeded legacy RandomState; it is passed as `random_state` to the
# LogisticRegression and GaussianMixture models defined below.
rng = np.random.RandomState(42)

Import the data.

In [None]:
# Load the cleaned IPUMS training data (path is relative to the notebook's working directory).
# NOTE(review): the previous comment described this as "standardized lat/long location data";
# the columns used below are INCWAGE, years_of_educ, potential_experience, non_white and SEX.
train_data = pd.read_csv("Data/cleaned_ipums_data.csv")

***

In [None]:
# Display the loaded training data.
train_data

***

Calculate the average Hellinger distance between the original data and bootstrapped resamples of it. This serves as a proxy for the Hellinger distance between data sets sampled from the same data-generating distribution.

In [None]:
def hellinger(sigma_real, sigma_synth):
    """Hellinger distance between two zero-mean Gaussians with the given covariances.

    Computes the Bhattacharyya distance
    ``d = 1/2 * ln(det((S1 + S2) / 2) / sqrt(det(S1) * det(S2)))`` and returns
    ``sqrt(1 - exp(-d))``.

    Parameters
    ----------
    sigma_real, sigma_synth : square array-like (ndarray or DataFrame)
        Covariance/correlation matrices of the two distributions.

    Returns
    -------
    numpy.float64
        Distance in [0, 1]; nan if the determinants make the ratio undefined
        (e.g. both matrices singular).
    """
    # Work with log-determinants: the raw determinant under/overflows for large
    # or badly scaled matrices, which turned the original ratio into 0/0 = nan.
    # The sum is left untouched so that DataFrame inputs still align on labels.
    _, logdet_avg = np.linalg.slogdet((sigma_real + sigma_synth) / 2)
    _, logdet_real = np.linalg.slogdet(sigma_real)
    _, logdet_synth = np.linalg.slogdet(sigma_synth)

    d = 0.5 * (logdet_avg - 0.5 * (logdet_real + logdet_synth))

    # Rounding can push 1 - exp(-d) marginally below 0 for (near-)identical
    # matrices; clamp so the square root never produces a spurious nan.
    return np.sqrt(np.clip(1 - np.exp(-d), 0.0, 1.0))

In [None]:
def null_hellinger(train_data, num_bootstraps):
    """Hellinger distances between the training data and bootstrap resamples of it.

    A Gaussian copula is fitted once to ``train_data``; then, ``num_bootstraps``
    times, the data are resampled with replacement (same number of rows), a new
    copula is fitted, and the Hellinger distance between the two copula
    correlation matrices is recorded.

    Returns
    -------
    list
        One distance per bootstrap resample.
    """
    reference_copula = GaussianMultivariate()
    reference_copula.fit(train_data)

    n_rows = train_data.shape[0]
    distances = []
    for _ in range(num_bootstraps):
        # Resampling is unseeded, so the returned distances vary between runs.
        resampled = train_data.sample(n=n_rows, replace=True).reset_index(drop=True)
        bootstrap_copula = GaussianMultivariate()
        bootstrap_copula.fit(resampled)
        distances.append(hellinger(reference_copula.correlation, bootstrap_copula.correlation))
    return distances

In [None]:
def hellinger_distance_ratio(synthetic_dataset, original_data_corr_mat, null_h):
    """Hellinger distance of a synthetic data set relative to a null distance.

    Fits a Gaussian copula to ``synthetic_dataset``, measures the Hellinger
    distance between ``original_data_corr_mat`` and the fitted copula's
    correlation matrix, and divides that distance by ``null_h``.
    """
    synthetic_copula = GaussianMultivariate()
    synthetic_copula.fit(synthetic_dataset)
    distance = hellinger(original_data_corr_mat, synthetic_copula.correlation)
    return distance / null_h

***

Test the copula model. According to Wikipedia, the Gaussian copula is the joint CDF of a multivariate Gaussian distribution with mean vector 0 and covariance matrix equal to the correlation matrix.

***

Testing logistic and multinomial logistic regression synthesizers.

In [None]:
def polynomial_and_standardize(dataset, poly_degree=3, interaction_only=False):
    """Expand ``dataset`` into polynomial features and standardize them.

    The polynomial expansion (no bias column) and the StandardScaler are both
    fitted on ``dataset`` itself, so every call scales with the statistics of
    the data it is given.

    Returns
    -------
    numpy.ndarray
        Standardized polynomial feature matrix.
    """
    expander = PolynomialFeatures(degree=poly_degree, interaction_only=interaction_only, include_bias=False)
    scaler = preprocessing.StandardScaler()
    return scaler.fit_transform(expander.fit_transform(dataset))

In [None]:
def multinomial_synthesizer(orig_data, synth_data_sets, target, penalty_param, poly_degree=3, interaction_only=False):
    """Synthesize a categorical variable with an L1-penalized (multinomial) logistic regression.

    The model is fitted on polynomial, standardized features of ``orig_data``
    against ``target``. For every data set in ``synth_data_sets`` the fitted
    class probabilities are computed and one class label per row is drawn from
    them.

    Parameters
    ----------
    orig_data : DataFrame
        Predictor columns of the original data.
    synth_data_sets : list of DataFrame
        Synthetic predictor data sets with the same columns as ``orig_data``.
    target : Series
        Categorical variable to synthesize; its ``name`` is reused for the output.
    penalty_param : float
        Inverse regularization strength ``C`` of the logistic regression.
    poly_degree, interaction_only
        Forwarded to ``polynomial_and_standardize``.

    Returns
    -------
    list of Series
        One synthesized column per entry of ``synth_data_sets``.
    """
    mn_model = LogisticRegression(penalty='l1', C=penalty_param, solver='saga', max_iter=1000, multi_class='multinomial', random_state=rng)

    X = polynomial_and_standardize(dataset=orig_data, poly_degree=poly_degree, interaction_only=interaction_only)

    # NOTE(review): each synthetic data set is standardized with its own mean/std
    # rather than with the scaler fitted on the original data — confirm this is intended.
    sXs = [polynomial_and_standardize(dataset=Y, poly_degree=poly_degree, interaction_only=interaction_only) for Y in synth_data_sets]

    mn_model.fit(X, target)

    # Columns of predict_proba follow mn_model.classes_, so a sampled column index
    # must be mapped back to the label; the index only equals the label when the
    # classes happen to be coded 0..K-1.
    class_labels = mn_model.classes_

    # NOTE(review): unseeded generator — the sampled labels differ between runs.
    rng_mn = default_rng()

    vals = []

    for Y in sXs:

        probs = mn_model.predict_proba(Y)

        # One categorical draw per row in a single vectorized call: `one_hot`
        # has shape (n_rows, n_classes) with exactly one 1 in each row.
        one_hot = rng_mn.multinomial(n=1, pvals=probs)

        sampled_labels = class_labels[one_hot.argmax(axis=1)]

        vals.append(pd.Series(sampled_labels, name=target.name))

    return vals

In [None]:
def train_models_mn(# overall parameters
                    train_data,
                    original_cor_mat,
                    null_h,
                    number_synthetic_datasets,
                    # GMM hyperparameters; a trailing underscore marks values chosen by Bayesian optimization
                    number_gmm_initializations,
                    num_components_,
                    # penalty parameters (C) of the two logistic regressions, chosen by Bayesian optimization
                    C_non_white_,
                    C_sex_):
    """Generate synthetic data sets and score them by their Hellinger distance ratio.

    Steps:
      1. Fit a Gaussian mixture to INCWAGE, years_of_educ and
         potential_experience and sample ``number_synthetic_datasets`` data sets
         with as many rows as ``train_data``.
      2. Synthesize ``non_white`` from those three columns.
      3. Synthesize ``SEX`` from the three columns plus the synthetic ``non_white``.
      4. Compute the Hellinger distance ratio of each synthetic data set against
         ``original_cor_mat`` and ``null_h``.

    Returns
    -------
    (list, list of DataFrame)
        The Hellinger distance ratios and the synthetic data sets.
    """
    continuous_cols = ["INCWAGE", "years_of_educ", "potential_experience"]
    n_rows = train_data.shape[0]

    # ---- Gaussian mixture model for the continuous variables ----
    gmm = GaussianMixture(num_components_, n_init=number_gmm_initializations, init_params="k-means++", random_state=rng)
    gmm.fit(train_data.loc[:, continuous_cols])

    # GaussianMixture.sample returns (samples, component labels); only the samples are kept.
    synthetic_sets = [
        pd.DataFrame(gmm.sample(n_rows)[0], columns=continuous_cols)
        for _ in range(number_synthetic_datasets)
    ]

    # ---- logistic regression synthesizer for non_white ----
    non_white_columns = multinomial_synthesizer(orig_data=train_data.loc[:, continuous_cols],
                                                synth_data_sets=synthetic_sets,
                                                target=train_data["non_white"],
                                                penalty_param=C_non_white_)

    synthetic_sets = [
        pd.concat([frame, new_col], axis=1)
        for frame, new_col in zip(synthetic_sets, non_white_columns)
    ]

    # ---- logistic regression synthesizer for SEX (conditions on synthetic non_white) ----
    sex_columns = multinomial_synthesizer(orig_data=train_data.loc[:, continuous_cols + ["non_white"]],
                                          synth_data_sets=synthetic_sets,
                                          target=train_data["SEX"],
                                          penalty_param=C_sex_)

    synthetic_sets = [
        pd.concat([frame, new_col], axis=1)
        for frame, new_col in zip(synthetic_sets, sex_columns)
    ]

    # ---- Hellinger distance ratio of every synthetic data set ----
    ratios = []
    for frame in synthetic_sets:
        ratios.append(hellinger_distance_ratio(frame, original_cor_mat, null_h))

    return ratios, synthetic_sets

In [None]:
def optimize_models_mn(train_data,
                       original_cor_mat,
                       null_h,
                       number_synthetic_datasets,
                       number_gmm_initializations,
                       random_state):
    """Tune the synthesizer hyperparameters with Bayesian optimization.

    The search covers the number of GMM components and the two logistic
    regression penalty parameters. The objective (to be maximized) is
    ``-(1 - mean Hellinger ratio)**2``, so the optimum is a mean ratio of 1.

    Returns
    -------
    (dict, BayesianOptimization)
        ``optimizer.max`` (best target and parameters) and the optimizer itself.
    """

    def evaluate_models(num_components_, C_non_white_, C_sex_, original_cor_mat=original_cor_mat, null_h=null_h):
        # The optimizer proposes a float for the component count; truncate to an int.
        ratios, _ = train_models_mn(train_data=train_data,
                                    original_cor_mat=original_cor_mat,
                                    null_h=null_h,
                                    number_synthetic_datasets=number_synthetic_datasets,
                                    number_gmm_initializations=number_gmm_initializations,
                                    num_components_=int(num_components_),
                                    C_non_white_=C_non_white_,
                                    C_sex_=C_sex_)

        mean_ratio = np.mean(ratios)
        return -1 * ((1 - mean_ratio)**2)

    # Upper bound 200.99 lets int() truncation reach 200 components.
    search_space = {
        "num_components_": (10, 200.99),
        "C_non_white_": (0.001, 2),
        "C_sex_": (0.001, 2)
    }

    optimizer = BayesianOptimization(f=evaluate_models,
                                     pbounds=search_space,
                                     random_state=random_state)

    # Expected-improvement acquisition function.
    acquisition = UtilityFunction(kind="ei", xi=1e-02)
    optimizer.maximize(init_points=5, n_iter=25, acquisition_function=acquisition)

    best = optimizer.max
    print("Final Result: ", best)
    return best, optimizer

The default value of $\alpha$ is $10^{-6}$.

In [None]:
# Number of synthetic data sets generated per objective evaluation;
# also reused below as the number of bootstrap resamples for the null distance.
nsd = 20
# Number of GMM initializations (forwarded to GaussianMixture as `n_init`).
ngi = 5
# Fit a Gaussian copula to the training data and keep its correlation matrix as the reference.
copula = GaussianMultivariate()
copula.fit(train_data)
original_cor_mat = copula.correlation
# List with one bootstrap Hellinger distance per resample; it is averaged with np.mean before use.
null_h = null_hellinger(train_data, nsd)
# Alternative configuration with five seeds (one optimization run per seed):
# random_states = [np.random.RandomState(1234), np.random.RandomState(4321), np.random.RandomState(10620), np.random.RandomState(91695), np.random.RandomState(31296)]
# A single seeded optimization run is used here.
random_states = [np.random.RandomState(1234)]

In [None]:
# Average bootstrap Hellinger distance; this is the value passed as `null_h` to the optimizer below.
np.mean(null_h)

In [None]:
# One Bayesian optimization run per seed; each entry is (best result, optimizer).
optimization_results = [
    optimize_models_mn(
        train_data=train_data,
        original_cor_mat=original_cor_mat,
        null_h=np.mean(null_h),
        number_synthetic_datasets=nsd,
        number_gmm_initializations=ngi,
        random_state=seed,
    )
    for seed in random_states
]

In [None]:
# Negate the maximized targets and take the running minimum within each run.
run_targets = [
    np.minimum.accumulate(-optimizer.space.target)
    for _best, optimizer in optimization_results
]

In [None]:
# Plot the running minimum objective value of every optimization run
# (one line per entry of `run_targets`, however many seeds were used).
fig, ax = plt.subplots()
for run_idx, targets in enumerate(run_targets):
    steps = np.arange(len(targets))
    ax.plot(steps, targets, label=f"Run {run_idx}")
    ax.scatter(steps, targets, s=6)
ax.set_xlabel("Optimization iteration")
ax.set_ylabel("Running minimum objective value")
ax.set_title("Running Minimum Objective Value for MNL Synthesis")
if len(run_targets) > 1:
    ax.legend()
plt.show()

Choose the params that gave the best objective value across all random seeds.

In [None]:
# Pick the run whose best target is largest and keep its `optimizer.max` dictionary.
best_params = optimization_results[
    int(np.argmax([best["target"] for best, _optimizer in optimization_results]))
][0]

In [None]:
# Show the best target value and the hyperparameters that produced it.
best_params

Generate 20 synthetic data sets.

In [None]:
# train_models_mn returns two values (the Hellinger distance ratios and the
# synthetic data sets) and requires the reference correlation matrix and the
# null Hellinger distance; the same values used during optimization are passed.
pmse_ratios, full_sXs = train_models_mn(train_data=train_data,
                                        original_cor_mat=original_cor_mat,
                                        null_h=np.mean(null_h),
                                        number_synthetic_datasets=20,
                                        # hyperparameters for GMM
                                        number_gmm_initializations=ngi,
                                        num_components_=int(best_params['params']['num_components_']),
                                        # penalty parameters of the logistic regressions, chosen by Bayesian optimization
                                        C_non_white_=best_params['params']['C_non_white_'],
                                        C_sex_=best_params['params']['C_sex_'])

In [None]:
# Selected penalty parameter (C) for the SEX logistic regression.
best_params['params']['C_sex_']

In [None]:
# Mean of the ratios returned by train_models_mn; despite the variable name,
# these are Hellinger distance ratios.
np.mean(pmse_ratios)

In [None]:
# `pmse_ratios` holds the Hellinger distance ratios returned by train_models_mn,
# so the figure is labelled accordingly.
plt.violinplot(pmse_ratios)
plt.xlabel("Density")
plt.ylabel("Hellinger Distance Ratio")
plt.title("Distribution of Hellinger Distance Ratios")
plt.show()

# Save the synthetic datasets.

In [None]:
# Write each synthetic data set to its own CSV file, numbered from 0, without the index column.
for i, sX in enumerate(full_sXs):
    sX.to_csv(f"Data/synthetic_datasets/logistic_logistic_pmse_{i}.csv", index=False)

***