# Analysis of Utility of IPUMS data.

In [1]:
import itertools

import matplotlib.pyplot as plt

from scipy.spatial import KDTree

from statsmodels.discrete.discrete_model import Logit
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant

from helper_functions import *

In [2]:
# Load the confidential IPUMS training data from the non-normalized cleaned file.
# NOTE(review): the CSV carries its saved row index as an "Unnamed: 0" column, which
# stays in `train_data` as an ordinary column; confirm the downstream helpers ignore it.
train_data = pd.read_csv("../Data/IPUMS/non_normalized_cleaned_ipums_data.csv")

In [3]:
# Preview the training data right after loading.
train_data

Unnamed: 0,incwage,years_of_educ,potential_experience,non_white,sex
0,11000,16.000000,18.000000,0,1
1,38000,15.000000,23.000000,0,0
2,7000,13.000000,2.000000,0,0
3,2600,11.000000,0.000000,0,0
4,20800,12.000000,33.000000,1,0
...,...,...,...,...,...
197751,22500,16.000000,13.000000,0,0
197752,4500,16.000000,11.000000,0,1
197753,37000,17.737673,19.262327,0,0
197754,1500,15.000000,13.000000,0,1


In [4]:
# Read the 20 synthetic data sets written by each synthesizer. Files are named
# "<prefix>_<i>.csv" for i = 0..19 and are read in the order MNL, CART, synthpop.
# Variants that are currently disabled and therefore not loaded:
# ad_gmm_and_mnl_*, ad_gmm_and_cart_* (stored under ../Data/SK/) and mostly_*.
sXs_mnl, sXs_cart, sXs_synthpop = (
    [
        pd.read_csv(f"../Data/IPUMS/Synthetic Datasets/{prefix}_{i}.csv")
        for i in range(20)
    ]
    for prefix in ("gmm_and_mnl", "gmm_and_cart", "synthpop_baseline")
)

In [5]:
def non_normalize(train_data, synthetic_datasets):
    """Map the continuous columns of synthetic data sets back to the training scale.

    A ``StandardScaler`` is fit on the ``incwage``, ``years_of_educ`` and
    ``potential_experience`` columns of ``train_data``; its inverse transform is
    then applied to the same columns of every synthetic data set. Afterwards
    ``incwage`` values below 1 are raised to 1.

    The data sets are modified IN PLACE and nothing is returned. The function is
    not idempotent: calling it a second time on the same data sets applies the
    inverse transform again.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Data on the original scale; used only to fit the scaler.
    synthetic_datasets : iterable of pandas.DataFrame
        Data sets whose three continuous columns are on the standardized scale.
    """
    continuous_columns = ["incwage", "years_of_educ", "potential_experience"]
    normalizer = preprocessing.StandardScaler().fit(train_data.loc[:, continuous_columns])
    for Z in synthetic_datasets:
        Z.loc[:, continuous_columns] = normalizer.inverse_transform(Z.loc[:, continuous_columns])
        # Floor wages at 1 (any value below 1, including negatives, becomes 1).
        Z.loc[:, 'incwage'] = np.where(Z.incwage < 1, 1, Z.incwage)

In [6]:
# Put the MNL synthetic data sets back on the training scale (modifies them in place).
# Run this cell once only: a second run applies the inverse transform again.
non_normalize(train_data=train_data, synthetic_datasets=sXs_mnl)

In [7]:
# Put the CART synthetic data sets back on the training scale (modifies them in place).
# Run this cell once only: a second run applies the inverse transform again.
non_normalize(train_data=train_data, synthetic_datasets=sXs_cart)

In [8]:
# Put the synthpop synthetic data sets back on the training scale (modifies them in place).
# Run this cell once only: a second run applies the inverse transform again.
non_normalize(train_data=train_data, synthetic_datasets=sXs_synthpop)

***

## Utility Measure 1 - pMSE Ratios

In [9]:
# pMSE ratio of every synthetic data set against the training data, one list per
# synthesizer. The AD-MNL, AD-CART and MOSTLY.AI variants are currently disabled.
ratios_mnl = [pmse_ratio(train_data, synthetic) for synthetic in sXs_mnl]
ratios_cart = [pmse_ratio(train_data, synthetic) for synthetic in sXs_cart]
ratios_synthpop = [pmse_ratio(train_data, synthetic) for synthetic in sXs_synthpop]

# Mean ratio per synthesizer, one per line, in the order MNL, CART, synthpop.
print(*map(np.mean, (ratios_mnl, ratios_cart, ratios_synthpop)), sep="\n")

0.8860045775175831
1.1909169060485403
0.408693890140759


***

Save data for plotting in R.

In [10]:
# pmse_results = pd.DataFrame({'Dataset': np.arange(1, len(ratios_mnl)+1),
#                              'MNL': ratios_mnl,
#                              # 'AD-MNL': ratios_ad_mnl,
#                              'CART': ratios_cart,
#                              # 'AD-CART': ratios_ad_cart,
#                              'MOSTLY.AI': ratios_mostly})

# pmse_results = pmse_results.melt(id_vars=["Dataset"], value_vars=["MNL", "CART", "MOSTLY.AI"], var_name="Type")
# pmse_results.to_csv('../Results/IPUMS/pmse_metrics.csv', index=False)

In [11]:
# # Combine data
# # plot_data = list([ratios_mnl, ratios_ad_mnl, ratios_cart, ratios_ad_cart, ratios_mostly])

# plot_data = list([ratios_mnl, ratios_cart, ratios_mostly])

# fig, ax = plt.subplots(figsize=(10,10))

# # xticklabels = ["MNL", "AD-MNL", "CART", "AD-CART", "MOSTLY.AI"]
# xticklabels = ["MNL", "CART", "MOSTLY.AI"]
# # ax.set_xticks([1, 2, 3, 4, 5])
# ax.set_xticks([1, 2, 3])
# ax.set_xticklabels(xticklabels)

# ax.violinplot(plot_data, showmeans=True)

# plt.title("Distributions of Logistic Regression Based pMSE Ratios")

# plt.show()

***

# Utility Metric: Analysis Specific Utility

###  * L1 distance between confidential and synthetic coefficient estimates
###  * Confidence interval ratio
###  * Sign, Significance, and Overlap (SSO)

Analysis specific utility.

Use OLS regression to model log annual earnings (`log_incwage`) as a function of years of education, the non-white indicator, and potential experience.

Compute the model for the original data.

***

Log annual earnings (`log_incwage`) as a function of `years_of_educ`, `non_white`, and `potential_experience` (up to a third-degree polynomial).

Do the above regression for males and females separately.

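This notebook uses the non-normalized data: the training data is loaded on its original scale, and the synthetic data sets were transformed back to that scale above.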

In [12]:
# Show the training data again before the regression variables are added.
train_data

Unnamed: 0,incwage,years_of_educ,potential_experience,non_white,sex
0,11000,16.000000,18.000000,0,1
1,38000,15.000000,23.000000,0,0
2,7000,13.000000,2.000000,0,0
3,2600,11.000000,0.000000,0,0
4,20800,12.000000,33.000000,1,0
...,...,...,...,...,...
197751,22500,16.000000,13.000000,0,0
197752,4500,16.000000,11.000000,0,1
197753,37000,17.737673,19.262327,0,0
197754,1500,15.000000,13.000000,0,1


Compute the variables for the polynomials of `potential_experience` and the target log of `incwage`.

In [13]:
def new_vars(data_sets):
    """Add the regression variables to every data set, in place.

    For each frame in ``data_sets`` three columns are created (or overwritten):
    ``potential_experience_2`` and ``potential_experience_3`` (the square and cube
    of ``potential_experience``) and ``log_incwage`` (natural log of ``incwage``).
    Nothing is returned.
    """
    for frame in data_sets:
        experience = frame.potential_experience
        for power in (2, 3):
            frame.loc[:, f'potential_experience_{power}'] = experience**power
        frame.loc[:, 'log_incwage'] = np.log(frame.incwage)

In [14]:
# Add the polynomial and log-wage columns to the training data (modified in place).
new_vars([train_data])

In [15]:
# Add the same columns to every synthetic data set (each list is modified in place).
new_vars(sXs_mnl)
new_vars(sXs_cart)
# new_vars(sXs_mostly)
new_vars(sXs_synthpop)

In [16]:
# Inspect the first MNL synthetic data set, now including the added columns.
sXs_mnl[0]

Unnamed: 0,incwage,years_of_educ,potential_experience,non_white,sex,potential_experience_2,potential_experience_3,log_incwage
0,17839.574843,12.003886,7.674275,0,0,58.894490,451.972491,9.789175
1,29564.638115,12.002297,7.287517,0,1,53.107908,387.024796,10.294334
2,23409.009584,11.999091,9.928967,0,0,98.584380,978.841032,10.060876
3,16247.971664,11.999564,8.659090,0,0,74.979848,649.257291,9.695723
4,27666.823426,12.001844,1.392891,0,1,1.940145,2.702410,10.227989
...,...,...,...,...,...,...,...,...
197751,24418.984955,14.999153,1.623454,0,0,2.635604,4.278783,10.103116
197752,30012.171509,15.000374,5.352043,0,0,28.644370,153.305912,10.309358
197753,11471.350021,15.001054,1.430702,0,1,2.046909,2.928517,9.347608
197754,18369.830671,14.998274,3.651067,0,0,13.330294,48.669801,9.818465


Split into female/male synthetic data sets.

In [17]:
# Split every MNL synthetic data set on the `sex` indicator:
# rows with sex == 1 form the female list, rows with sex == 0 the male list.
sXs_mnl_female = [synthetic.loc[synthetic["sex"].eq(1)] for synthetic in sXs_mnl]
sXs_mnl_male = [synthetic.loc[synthetic["sex"].eq(0)] for synthetic in sXs_mnl]

In [18]:
# Split every CART synthetic data set on the `sex` indicator:
# rows with sex == 1 form the female list, rows with sex == 0 the male list.
sXs_cart_female = [synthetic.loc[synthetic["sex"].eq(1)] for synthetic in sXs_cart]
sXs_cart_male = [synthetic.loc[synthetic["sex"].eq(0)] for synthetic in sXs_cart]

In [19]:
# sXs_mostly_female = [Z.loc[Z.sex == 1,:] for Z in sXs_mostly]
# sXs_mostly_male = [Z.loc[Z.sex == 0,:] for Z in sXs_mostly]

In [20]:
# Split every synthpop synthetic data set on the `sex` indicator:
# rows with sex == 1 form the female list, rows with sex == 0 the male list.
sXs_synthpop_female = [synthetic.loc[synthetic["sex"].eq(1)] for synthetic in sXs_synthpop]
sXs_synthpop_male = [synthetic.loc[synthetic["sex"].eq(0)] for synthetic in sXs_synthpop]

In [21]:
# Split the training data the same way: sex == 1 -> female, sex == 0 -> male.
train_female = train_data.loc[train_data["sex"].eq(1)]
train_male = train_data.loc[train_data["sex"].eq(0)]

In [22]:
# Fit the log-wage regression on the female training rows.
ols_train_female = ols_param_fetcher(
    data=train_female,
    y='log_incwage',
    X=[
        'years_of_educ',
        'non_white',
        'potential_experience',
        'potential_experience_2',
        'potential_experience_3',
    ],
)

In [23]:
# Fit the log-wage regression on the male training rows.
ols_train_male = ols_param_fetcher(
    data=train_male,
    y='log_incwage',
    X=[
        'years_of_educ',
        'non_white',
        'potential_experience',
        'potential_experience_2',
        'potential_experience_3',
    ],
)

In [24]:
# Display the female training fit: a dict with keys 'params', 'l_var' and 'CI'.
ols_train_female

{'params': const                     6.185850
 years_of_educ             0.161283
 non_white                -0.015301
 potential_experience      0.153996
 potential_experience_2   -0.005422
 potential_experience_3    0.000058
 dtype: float64,
 'l_var': array([4.21162561e-04, 1.94141687e-06, 8.11209507e-05, 3.71984637e-06,
        1.05403807e-08, 2.28290018e-12]),
 'CI':           0         1
 0  6.145626  6.226073
 1  0.158552  0.164014
 2 -0.032954  0.002352
 3  0.150215  0.157776
 4 -0.005623 -0.005221
 5  0.000055  0.000061}

In [25]:
# Display the male training fit: a dict with keys 'params', 'l_var' and 'CI'.
ols_train_male

{'params': const                     6.608703
 years_of_educ             0.136343
 non_white                -0.195098
 potential_experience      0.218667
 potential_experience_2   -0.007680
 potential_experience_3    0.000082
 dtype: float64,
 'l_var': array([2.28497237e-04, 9.74753129e-07, 6.11123627e-05, 2.49616254e-06,
        6.74568404e-09, 1.42161020e-12]),
 'CI':           0         1
 0  6.579076  6.638330
 1  0.134408  0.138278
 2 -0.210420 -0.179776
 3  0.215571  0.221764
 4 -0.007841 -0.007519
 5  0.000080  0.000085}

In [26]:
def coef_L1_calc(original_data, synthetic_datasets, synthetic_data_type, target_variable, exog_variables, param_names):
    """Compare regression estimates from synthetic data sets with those from the original data.

    The same regression (``target_variable`` on ``exog_variables``) is fit with
    ``ols_param_fetcher`` on ``original_data`` and on every entry of
    ``synthetic_datasets``. Each fit is expected to be a dict holding ``'params'``
    (a Series of point estimates) and ``'CI'`` (a two-column frame: lower bound,
    upper bound). Five measures are computed per coefficient and per synthetic
    data set:

    * ``'L1 Distance'``: absolute difference between the point estimates.
    * ``'CI Ratio'``: synthetic interval width divided by original interval width.
    * ``'Sign Match'``: True unless the two point estimates have opposite signs
      (an exact zero is treated as matching either sign).
    * ``'Significance Match'``: True when both intervals agree on whether they
      contain zero.
    * ``'CI Overlap'``: True when the two closed intervals overlap.

    Parameters
    ----------
    original_data : pandas.DataFrame
        Data the reference regression is fit on.
    synthetic_datasets : sequence of pandas.DataFrame
        Data sets to compare against the reference. Not modified.
    synthetic_data_type : str
        Label written to the ``'Data Type'`` / ``'Type'`` columns.
    target_variable : str
        Name of the dependent variable.
    exog_variables : list of str
        Names of the regressors.
    param_names : list of str
        Display names for the coefficients, in the order the fit returns them.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        First frame: one row per (measure, synthetic data set) with one column per
        entry of ``param_names`` plus ``'Data Type'`` and ``'Measure'``; the index
        is the position of the data set within ``synthetic_datasets``.
        Second frame: one row per (synthetic data set, parameter) with columns
        ``'Parameter'``, ``'Point Estimate'``, ``'Lower Bound'``, ``'Upper Bound'``,
        ``'Type'`` and ``'index'``.
    """
    # Fit the regression once on the original data and once per synthetic data set.
    ols_train = ols_param_fetcher(data=original_data, y=target_variable, X=exog_variables)
    ols_synth = [ols_param_fetcher(data=Y, y=target_variable, X=exog_variables) for Y in synthetic_datasets]

    train_params = ols_train['params']
    train_ci = ols_train['CI']
    train_width = train_ci.iloc[:, 1] - train_ci.iloc[:, 0]
    # True where the original interval contains zero (coefficient not significant).
    train_has_zero = (train_ci.iloc[:, 0] <= 0) & (0 <= train_ci.iloc[:, 1])

    def measure_frame(rows, measure):
        """Stack one row of per-coefficient values per synthetic data set and label it."""
        frame = pd.DataFrame([np.asarray(row) for row in rows], columns=param_names)
        frame['Data Type'] = synthetic_data_type
        frame['Measure'] = measure
        return frame

    def overlaps_with_train(fit_ci):
        """Per coefficient, whether the closed synthetic and original intervals overlap."""
        # Rows are addressed by position so the result does not depend on index labels.
        return [
            pd.Interval(train_ci.iloc[k, 0], train_ci.iloc[k, 1], closed='both').overlaps(
                pd.Interval(fit_ci.iloc[k, 0], fit_ci.iloc[k, 1], closed='both'))
            for k in range(len(fit_ci))
        ]

    l1_frame = measure_frame(
        [np.abs(fit['params'] - train_params) for fit in ols_synth],
        'L1 Distance')

    # width of synthetic interval / width of original interval
    CI_ratio_frame = measure_frame(
        [(fit['CI'].iloc[:, 1] - fit['CI'].iloc[:, 0]) / train_width for fit in ols_synth],
        'CI Ratio')

    # Compare signs directly: the identity |a| + |b| == |a + b| misreports a match
    # for opposite signs when the smaller value is absorbed by floating-point rounding.
    sign_frame = measure_frame(
        [np.sign(fit['params']) * np.sign(train_params) >= 0 for fit in ols_synth],
        'Sign Match')

    sig_frame = measure_frame(
        [((fit['CI'].iloc[:, 0] <= 0) & (0 <= fit['CI'].iloc[:, 1])).eq(train_has_zero) for fit in ols_synth],
        'Significance Match')

    overlap_frame = measure_frame(
        [overlaps_with_train(fit['CI']) for fit in ols_synth],
        'CI Overlap')

    # Point estimates and confidence intervals in long format, one block per data set.
    estimate_columns = ['Parameter', 'Point Estimate', 'Lower Bound', 'Upper Bound']
    per_dataset = []
    for i, fit in enumerate(ols_synth):
        p_and_i = pd.concat([fit['params'].reset_index(), fit['CI']], axis=1)
        p_and_i.columns = estimate_columns
        p_and_i['Type'] = synthetic_data_type
        p_and_i['index'] = i
        per_dataset.append(p_and_i)

    if per_dataset:
        p_and_i_full = pd.concat(per_dataset, axis=0).reset_index(drop=True)
    else:
        # pd.concat rejects an empty list; return an empty frame with the usual columns.
        p_and_i_full = pd.DataFrame(columns=estimate_columns + ['Type', 'index'])

    return pd.concat([l1_frame, CI_ratio_frame, sign_frame, sig_frame, overlap_frame], axis=0), p_and_i_full

Results for female regression.

In [27]:
# Labels and data-set groups are paired by position below, so both lists must keep
# the same order. The training data is wrapped in a one-element list so it is
# processed like a group of synthetic data sets.
# all_data_types = ['Original', 'MNL', 'CART', 'MOSTLY.AI']
all_data_types = ['Original', 'MNL', 'CART', 'Synthpop']
# female_all_data = [[train_female], sXs_mnl_female, sXs_cart_female, sXs_mostly_female]
female_all_data = [[train_female], sXs_mnl_female, sXs_cart_female, sXs_synthpop_female]

In [28]:
# Run the comparison for each group of data sets (paired with its label by
# position) and collect the per-group frames; they are concatenated once at the
# end instead of growing a frame inside the loop from an empty seed.
female_l1_parts = []
female_p_and_i_parts = []

for data_type, datasets in zip(all_data_types, female_all_data):
    current_l1_results, current_p_and_i_results = coef_L1_calc(
        original_data=train_female,
        synthetic_datasets=datasets,
        synthetic_data_type=data_type,
        target_variable='log_incwage',
        exog_variables=['years_of_educ', 'non_white', 'potential_experience', 'potential_experience_2', 'potential_experience_3'],
        param_names=['Intercept', 'Years of Education', 'Non-white', 'Potential Experience', 'Potential Experience^2', 'Potential Experience^3'])

    # reset_index() keeps the data-set position as an 'index' column for the melt below.
    female_l1_parts.append(current_l1_results.reset_index())
    female_p_and_i_parts.append(current_p_and_i_results)

female_all_l1_results = pd.concat(female_l1_parts, axis=0)
female_all_p_and_i_results = pd.concat(female_p_and_i_parts)

In [29]:
# Reshape to long format: one row per (data type, measure, data set, coefficient).
# The result replaces the wide frame under the same name, so this cell cannot be
# re-run without first re-running the cell that builds the wide frame.
female_all_l1_results = female_all_l1_results.melt(
    var_name="Variable",
    id_vars=["Data Type", "Measure", "index"],
    value_vars=[
        'Intercept',
        'Years of Education',
        'Non-white',
        'Potential Experience',
        'Potential Experience^2',
        'Potential Experience^3',
    ],
)

Save data for analysis and plotting in R.

In [30]:
# Write the long-format female comparison measures to CSV (row index omitted).
female_all_l1_results.to_csv('../Results/IPUMS/log_nn_female_analysis_specific.csv', index=False)

In [31]:
# Write the female point estimates and confidence intervals to CSV (row index omitted).
female_all_p_and_i_results.to_csv('../Results/IPUMS/log_nn_female_point_estimates_and_intervals.csv', index=False)

***

Results for male regression.

In [32]:
# Male counterpart of the data-set groups; entries are paired by position with
# `all_data_types`, so the order must match that list.
# male_all_data = [[train_male], sXs_mnl_male, sXs_cart_male, sXs_mostly_male]
male_all_data = [[train_male], sXs_mnl_male, sXs_cart_male, sXs_synthpop_male]

In [33]:
# Run the comparison for each group of data sets (paired with its label by
# position) and collect the per-group frames; they are concatenated once at the
# end instead of growing a frame inside the loop from an empty seed.
male_l1_parts = []
male_p_and_i_parts = []

for data_type, datasets in zip(all_data_types, male_all_data):
    current_l1_results, current_p_and_i_results = coef_L1_calc(
        original_data=train_male,
        synthetic_datasets=datasets,
        synthetic_data_type=data_type,
        target_variable='log_incwage',
        exog_variables=['years_of_educ', 'non_white', 'potential_experience', 'potential_experience_2', 'potential_experience_3'],
        param_names=['Intercept', 'Years of Education', 'Non-white', 'Potential Experience', 'Potential Experience^2', 'Potential Experience^3'])

    # reset_index() keeps the data-set position as an 'index' column for the melt below.
    male_l1_parts.append(current_l1_results.reset_index())
    male_p_and_i_parts.append(current_p_and_i_results)

male_all_l1_results = pd.concat(male_l1_parts, axis=0)
male_all_p_and_i_results = pd.concat(male_p_and_i_parts)

In [34]:
# Reshape to long format: one row per (data type, measure, data set, coefficient).
# The result replaces the wide frame under the same name, so this cell cannot be
# re-run without first re-running the cell that builds the wide frame.
male_all_l1_results = male_all_l1_results.melt(
    var_name="Variable",
    id_vars=["Data Type", "Measure", "index"],
    value_vars=[
        'Intercept',
        'Years of Education',
        'Non-white',
        'Potential Experience',
        'Potential Experience^2',
        'Potential Experience^3',
    ],
)

In [35]:
# Write the long-format male comparison measures to CSV (row index omitted).
male_all_l1_results.to_csv('../Results/IPUMS/log_nn_male_analysis_specific.csv', index=False)

In [36]:
# Write the male point estimates and confidence intervals to CSV (row index omitted).
male_all_p_and_i_results.to_csv('../Results/IPUMS/log_nn_male_point_estimates_and_intervals.csv', index=False)