In [10]:
import numpy as np, pandas as pd
pd.set_option('display.max_rows', 8)
!date

%load_ext autoreload
%autoreload 2

Mon Feb  3 13:04:44 PST 2020
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Attempt at a minimal model that reproduces the sort of bias observed by Akee in AIAN counts

I think it is necessary and sufficient to have a two-level geographic hierarchy with two race groups.

In [139]:
seed = 12345
np.random.seed(seed) # set random seed for reproducibility

In [103]:
K = 10_000  # number of areas at most-detailed level
n_k = 1_000 # number of individuals in area k

overall_minority_share = 0.2 # 2% of total population is in minority race group
segregation_factor = 1 # tunable parameter controlling how many areas are "majority minority"  --- TODO: should this be parameterized differently?

minority_share = np.random.beta(overall_minority_share/segregation_factor,
                                (1-overall_minority_share)/segregation_factor, size=K)

print('Minority majority areas if size was large:', (minority_share > .5).sum(), 'out of', K)

Minority majority areas if size was large: 1678 out of 10000


In [104]:
# simulate race-/location-stratified counts
precise_minority_count = np.random.binomial(n_k, minority_share)
precise_majority_count = n_k - precise_minority_count

print('Minority majority areas with size from sim:',
      (precise_minority_count > precise_majority_count).sum(), 'out of', K)

Minority majority areas with size from sim: 1687 out of 10000


In [105]:
print('Overall minority share if size was large :', 100*overall_minority_share, '%')
print('Overall minority share with size from sim:', 100*precise_minority_count.sum()/(n_k*K), '%')

Overall minority share if size was large : 20.0 %
Overall minority share with size from sim: 20.11717 %


In [106]:
# add Geometric noise, to make counts differentially private
def GDPC(epsilon, exact_counts):
    """ Geometric DP Counts
    Parameters
    ----------
    epsilon : float-able
    exact_counts : pd.Series
    
    Results
    -------
    returns dp_counts, a pd.Series with index matching exact_counts"""
    
    z = float(epsilon)

    all_errors = (np.random.geometric(z, size=len(exact_counts))
                    - np.random.geometric(z, size=len(exact_counts)))
    dp_counts = exact_counts + all_errors
    return dp_counts

epsilon = '0.1'
dp_minority_count = GDPC(epsilon, precise_minority_count)
dp_majority_count = GDPC(epsilon, precise_majority_count)

In [107]:
print('DP estimate of minority share:', 100*dp_minority_count.sum()/(dp_minority_count + dp_majority_count).sum(), '%')

DP estimate of minority share: 20.125314290696995 %


In [108]:
# but this probably includes illogical counts, such as negative numbers of the minority population
print('Number of negative minority counts in DP estimate:', (dp_minority_count < 0).sum())
print('Number of negative majority counts in DP estimate:', (dp_majority_count < 0).sum())

Number of negative minority counts in DP estimate: 1562
Number of negative majority counts in DP estimate: 25


In [109]:
# the complex part of the TopDown Algorithm is an optimization step that
# removes illogical counts such as negatives, using constrained convex optimization

# in this minimal example, I will optimize to find counts that are "close" to the dp counts,
# are non-negative, and have race-stratified sums that match the precise total sum for both minority and majority
# groups

import pyomo.environ
from pyomo.core import *
from pyomo.opt import SolverFactory


In [110]:
def nonnegative_optimize(imprecise_counts, control_total):
    """optimize the imprecise counts so that they sum to
    the control total and are non-negative
    
    Parameters
    ----------
    imprecise_counts : list-like of floats
    control_total : float
    
    Results
    -------
    returns optimized_counts, which are close to imprecise counts,
    but not negative, and match control total in aggregate
    """
    imprecise_counts = list(imprecise_counts)
    
    model = ConcreteModel()
    model.I = range(len(imprecise_counts))
    model.x = Var(model.I, within=NonNegativeReals)
    model.objective = Objective(
        expr=sum((model.x[i] - imprecise_counts[i])**2 for i in model.I))
    model.constraint = Constraint(
        expr=summation(model.x) == control_total)
    
    solver = SolverFactory('ipopt')
    results = solver.solve(model, options={'acceptable_tol':1e-4}, tee=False)
    optimized_counts = [value(model.x[i]) for i in model.I]
        
    return np.array(optimized_counts)

In [151]:
check = nonnegative_optimize([10, 10, 10, -10], 40)
if not np.allclose(check,
                   np.array([13.33333334, 13.33333334, 13.33333334,  0.]),
                   rtol=1e-7,
                   equal_nan=False):
    print("something went wrong")
    print(check)
    break

In [112]:
nn_minority_count = nonnegative_optimize(dp_minority_count, precise_minority_count.sum())
nn_majority_count = nonnegative_optimize(dp_majority_count, precise_majority_count.sum())

In [152]:
print((nn_minority_count < 0).sum())

0


In [113]:
# confirm that this got rid of negative counts
print('Number of negative minority counts in DP estimate:', (nn_minority_count < 0).sum())
print('Number of negative majority counts in DP estimate:', (nn_majority_count < 0).sum())

Number of negative minority counts in DP estimate: 0
Number of negative majority counts in DP estimate: 0


In [114]:
# but it does have small positive counts (and non-integral counts...)
print('Number of near-zero minority counts in DP estimate:', (nn_minority_count < .5).sum())
print('Number of near-zero majority counts in DP estimate:', (nn_majority_count < .5).sum())

Number of near-zero minority counts in DP estimate: 2106
Number of near-zero majority counts in DP estimate: 31


In [115]:
# the total number of minorities should now match the precise total
print(f'Total minority counts --- precise {precise_minority_count.sum()}, dp {dp_minority_count.sum()}, non-neg {nn_minority_count.sum()}')

Total minority counts --- precise 2011717, dp 2012591, non-neg 2011717.0000102385


In [117]:
# but I expect that the number of minorities in minority-majority areas is lower in the non-neg version
minority_majority_area = (minority_share > .5)
print(f'Minority counts in minority-majority areas')
print(f'''precise {precise_minority_count[minority_majority_area].sum()},
     dp {dp_minority_count[minority_majority_area].sum()},
non-neg {nn_minority_count[minority_majority_area].sum()}''')
print(f'pct diff {1-(nn_minority_count[minority_majority_area].sum()/precise_minority_count[minority_majority_area].sum())}')

Minority counts in minority-majority areas
precise 1275989,
     dp 1275763,
non-negg 1272344.9305498141
pct diff 0.0028558784207276533


In [162]:
final_counts = pd.DataFrame(data=[precise_minority_count[minority_majority_area],
                                  dp_minority_count[minority_majority_area],
                                  nn_minority_count[minority_majority_area]])
final_counts.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1668,1669,1670,1671,1672,1673,1674,1675,1676,1677
0,977.0,594.0,839.0,840.0,499.0,854.0,541.0,680.0,724.0,950.0,...,519.0,779.0,519.0,879.0,738.0,929.0,718.0,989.0,579.0,674.0
1,993.0,587.0,841.0,873.0,481.0,859.0,549.0,693.0,723.0,942.0,...,519.0,778.0,533.0,888.0,754.0,940.0,716.0,1004.0,580.0,677.0
2,990.96301,584.96301,838.96301,870.96301,478.96301,856.96301,546.96301,690.96301,720.96301,939.96301,...,516.96301,775.96301,530.96301,885.96301,751.96301,937.96301,713.96301,1001.96301,577.96301,674.96301


In [153]:
output = pd.DataFrame(np.sum(test, axis = 1))
output['count_type'] = ['precise','dp','non-neg']
print(a)

              0 count_type
0  1.275989e+06    precise
1  1.275763e+06         dp
2  1.272345e+06    non-neg


In [164]:
print(seed)
test = pd.DataFrame(['precise','dp','non-neg'])
test[seed] = pd.DataFrame(np.sum(final_counts, axis = 1))
print(test)

12345
     0             12345
0  precise  1.275989e+06
1       dp  1.275763e+06
2  non-neg  1.272345e+06


0
2
4
6
8


In [140]:
output.to_csv("/share/scratch/users/beatrixh/seed_{}_minority_{}_nk_{}.csv".format(seed,
                                                                               overall_minority_share,
                                                                               n_k),
             index=False)

In [134]:
# 2% under-count, (1.75% at 2% minority) and I expect that gets worse as the counts get smaller

In [86]:
# 1% undercount at 5% minority

In [87]:
# 2.79% undercount at .5% minority 

In [88]:
# 5% undercount at .05% minority

In [None]:
# .29% undercount at 20% minority