In [21]:
import numpy as np, pandas as pd
pd.set_option('display.max_rows', 8)
!date

%load_ext autoreload
%autoreload 2

Thu Jan 30 14:59:28 PST 2020
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Attempt at a minimal model that reproduces the sort of bias observed by Akee in AIAN counts

I think it is necessary and sufficient to have a two-level geographic hierarchy with two race groups.

In [22]:
np.random.seed(12345) # set random seed for reproducibility

In [23]:
K = 10_000  # number of areas at most-detailed level
n_k = 1_000 # number of individuals in area k

overall_minority_share = 0.02 # 2% of total population is in minority race group
segregation_factor = 1 # tunable parameter controlling how many areas are "majority minority"  --- TODO: should this be parameterized differently?

minority_share = np.random.beta(overall_minority_share/segregation_factor,
                                (1-overall_minority_share)/segregation_factor, size=K)

print('Minority majority areas if size was large:', (minority_share > .5).sum(), 'out of', K)

Minority majority areas if size was large: 154 out of 10000


In [24]:
# simulate race-/location-stratified counts
precise_minority_count = np.random.binomial(n_k, minority_share)
precise_majority_count = n_k - precise_minority_count

print('Minority majority areas with size from sim:',
      (precise_minority_count > precise_majority_count).sum(), 'out of', K)

Minority majority areas with size from sim: 153 out of 10000


In [25]:
print('Overall minority share if size was large :', 100*overall_minority_share, '%')
print('Overall minority share with size from sim:', 100*precise_minority_count.sum()/(n_k*K), '%')

Overall minority share if size was large : 2.0 %
Overall minority share with size from sim: 2.04406 %


In [26]:
# add Geometric noise, to make counts differentially private
def GDPC(epsilon, exact_counts):
    """ Geometric DP Counts
    Parameters
    ----------
    epsilon : float-able
    exact_counts : pd.Series
    
    Results
    -------
    returns dp_counts, a pd.Series with index matching exact_counts"""
    
    z = float(epsilon)

    all_errors = (np.random.geometric(z, size=len(exact_counts))
                    - np.random.geometric(z, size=len(exact_counts)))
    dp_counts = exact_counts + all_errors
    return dp_counts

epsilon = '0.1'
dp_minority_count = GDPC(epsilon, precise_minority_count)
dp_majority_count = GDPC(epsilon, precise_majority_count)

In [27]:
print('DP estimate of minority share:', 100*dp_minority_count.sum()/(dp_minority_count + dp_majority_count).sum(), '%')

DP estimate of minority share: 2.0455684233798275 %


In [28]:
# but this probably includes illogical counts, such as negative numbers of the minority population
print('Number of negative minority counts in DP estimate:', (dp_minority_count < 0).sum())
print('Number of negative majority counts in DP estimate:', (dp_majority_count < 0).sum())

Number of negative minority counts in DP estimate: 4273
Number of negative majority counts in DP estimate: 0


In [29]:
# the complex part of the TopDown Algorithm is an optimization step that
# removes illogical counts such as negatives, using constrained convex optimization

# in this minimal example, I will optimize to find counts that are "close" to the dp counts,
# are non-negative, and have race-stratified sums that match the precise total sum for both minority and majority
# groups

import pyomo.environ
from pyomo.core import *
from pyomo.opt import SolverFactory


In [30]:
def nonnegative_optimize(imprecise_counts, control_total):
    """optimize the imprecise counts so that they sum to
    the control total and are non-negative
    
    Parameters
    ----------
    imprecise_counts : list-like of floats
    control_total : float
    
    Results
    -------
    returns optimized_counts, which are close to imprecise counts,
    but not negative, and match control total in aggregate
    """
    imprecise_counts = list(imprecise_counts)
    
    model = ConcreteModel()
    model.I = range(len(imprecise_counts))
    model.x = Var(model.I, within=NonNegativeReals)
    model.objective = Objective(
        expr=sum((model.x[i] - imprecise_counts[i])**2 for i in model.I))
    model.constraint = Constraint(
        expr=summation(model.x) == control_total)
    
    solver = SolverFactory('ipopt')
    results = solver.solve(model, options={'acceptable_tol':1e-4}, tee=True)
    optimized_counts = [value(model.x[i]) for i in model.I]
        
    return np.array(optimized_counts)

In [31]:
nonnegative_optimize([10, 10, 10, -10], 40)

Ipopt 3.12.13: acceptable_tol=0.0001


******************************************************************************
This program contains Ipopt, a library for large-scale nonlinear optimization.
 Ipopt is released as open source code under the Eclipse Public License (EPL).
         For more information visit http://projects.coin-or.org/Ipopt
******************************************************************************

This is Ipopt version 3.12.13, running with linear solver mumps.
NOTE: Other linear solvers might be more efficient (see Ipopt documentation).

Number of nonzeros in equality constraint Jacobian...:        4
Number of nonzeros in inequality constraint Jacobian.:        0
Number of nonzeros in Lagrangian Hessian.............:        4

Total number of variables............................:        4
                     variables with only lower bounds:        4
                variables with lower and upper bounds:        0
                     variables with only uppe

array([13.33333334, 13.33333334, 13.33333334,  0.        ])

In [32]:
nn_minority_count = nonnegative_optimize(dp_minority_count, precise_minority_count.sum())
nn_majority_count = nonnegative_optimize(dp_majority_count, precise_majority_count.sum())

Ipopt 3.12.13: acceptable_tol=0.0001


******************************************************************************
This program contains Ipopt, a library for large-scale nonlinear optimization.
 Ipopt is released as open source code under the Eclipse Public License (EPL).
         For more information visit http://projects.coin-or.org/Ipopt
******************************************************************************

This is Ipopt version 3.12.13, running with linear solver mumps.
NOTE: Other linear solvers might be more efficient (see Ipopt documentation).

Number of nonzeros in equality constraint Jacobian...:    10000
Number of nonzeros in inequality constraint Jacobian.:        0
Number of nonzeros in Lagrangian Hessian.............:    10000

Total number of variables............................:    10000
                     variables with only lower bounds:    10000
                variables with lower and upper bounds:        0
                     variables with only uppe

In [17]:
# confirm that this got rid of negative counts
print('Number of negative minority counts in DP estimate:', (nn_minority_count < 0).sum())
print('Number of negative majority counts in DP estimate:', (nn_majority_count < 0).sum())

Number of negative minority counts in DP estimate: 0
Number of negative majority counts in DP estimate: 0


In [18]:
# but it does have small positive counts (and non-integral counts...)
print('Number of near-zero minority counts in DP estimate:', (nn_minority_count < .5).sum())
print('Number of near-zero majority counts in DP estimate:', (nn_majority_count < .5).sum())

Number of near-zero minority counts in DP estimate: 8057
Number of near-zero majority counts in DP estimate: 1


In [19]:
# the total number of minorities should now match the precise total
print(f'Total minority counts --- precise {precise_minority_count.sum()}, dp {dp_minority_count.sum()}, non-neg {nn_minority_count.sum()}')

Total minority counts --- precise 204406, dp 204572, non-neg 204406.00005826657


In [20]:
# but I expect that the number of minorities in minority-majority areas is lower in the non-neg version
minority_majority_area = (minority_share > .5)
print(f'Minority counts in minority-majority areas')
print(f'''precise {precise_minority_count[minority_majority_area].sum()},
     dp {dp_minority_count[minority_majority_area].sum()},
non-neg {nn_minority_count[minority_majority_area].sum()}''')

Minority counts in minority-majority areas
precise 110945,
     dp 110815,
non-neg 108884.38229855901


In [None]:
# 2% under-count, and I expect that gets worse as the counts get smaller