In [1]:
import numpy as np, matplotlib.pyplot as plt, pandas as pd
pd.set_option('display.max_rows', 8)
!date

%load_ext autoreload
%autoreload 2

Mon May 16 20:04:53 PDT 2022


# Scenario 1, 2, 3: non-TDA approaches to DAS

In [2]:
np.random.seed(12345)

# Load synthetic data for TX and use it to simulate 2010 and 2020 populations

In [3]:
import linked_census_disclosure.data as lcd_data

In [4]:
%%time

df = lcd_data.read_synth_data('tx')

CPU times: user 51.4 s, sys: 8.65 s, total: 1min
Wall time: 1min 2s


In [5]:
# does this have the expected number of rows?
f'{len(df):,.0f}' # expect population of texas in 2010 to be 25,145,561

'25,145,561'

# Focus in on the 0-7 year olds

In [6]:
# keep a reference to the full synthetic population; its household ids are
# used later (all_locations) as the pool of places a household can move to
df_all = df

# restrict to children aged 0-7 in 2010 (they will be 10-17 in 2020)
# NOTE(review): this cell rebinds `df`, so re-running it without re-running
# the load cell would make df_all point at the already-filtered frame
df = df[df.age < 8].copy()

n_kids = len(df)  # number of children
f'{n_kids:,.0f}'

'3,095,857'

# Make gender column, based on BRFSS 2019 SOGI results

Future work could incorporate the observation that these values vary substantially with age. For now, keep it simple and work with the crude prevalence rates.

In [7]:
# crude prevalence of each gender category, entered as percentages and
# converted to proportions
p_trans_boy, p_trans_girl, p_trans_other, p_cis = (
    pct / 100 for pct in (0.18, 0.22, 0.12, 98.08)
)

# the four proportions do not add up to exactly 1, so normalize them into a
# probability vector (order: trans_boy, trans_girl, trans_other, cis)
p_gender = np.array([p_trans_boy, p_trans_girl, p_trans_other, p_cis])
p_gender = p_gender / p_gender.sum()
p_gender

array([0.00182556, 0.00223124, 0.00121704, 0.99472617])

In [8]:
# first initialize gender without distinguishing cis boy and cis girl
# since that matches BRFSS SOGI question
df['gender'] = np.random.choice(['trans_boy', 'trans_girl', 'trans_other', 'cis'], p=p_gender, size=len(df))

In [9]:
# now distinguish cis based on reconstructed sex_id
df.gender = np.where(df.gender == 'cis',
                     df.sex_id.map({1:'cis_boy', 2:'cis_girl'}),
                     df.gender)

In [10]:
df['trans'] = df.gender.isin(['trans_boy', 'trans_girl', 'trans_other'])

In [11]:
np.round(100 * df.gender.value_counts(normalize=True), 2)

cis_boy        50.80
cis_girl       48.67
trans_girl      0.23
trans_boy       0.18
trans_other     0.12
Name: gender, dtype: float64

In [12]:
np.round(100 * df.gender.value_counts(normalize=True).filter(like='trans').sum(), 2)

0.53

In [13]:
np.round(100 * df.trans.mean(), 2)

0.53

In [14]:
#### simulate 10 years of demographic change

df['age_2010'] = df.age
df['age_2020'] = df.age + 10
del df['age']

In [15]:


# ignore births and in-migration, because we are focused
# only on kids who can be linked between 2010 and 2020 census


# p_stay from ACS, see [2022_04_19a_das_dhc_attack_mig_data.ipynb](2022_04_19a_das_dhc_attack_mig_data.ipynb)

In [16]:

# simple model of internal and out-migration, based on probability
# of being in same house for 10+ years among household with 8-17 year olds
# in ACS


p_stay = 0.23


In [17]:
all_locations = df_all.hh_id.unique()

In [18]:
locations_2010 = df.hh_id.unique()

In [19]:
# candidate destination for every 2010 household that has a young child,
# drawn with replacement from the household ids of the full population
random_location = np.random.choice(all_locations, size=len(locations_2010),
                     replace=True)

# each household keeps its 2010 location with probability p_stay and
# otherwise moves to its randomly drawn destination
# (the order of the two random draws determines the result for a given seed)
locations_2020 = np.where(np.random.uniform(size=len(locations_2010)) < p_stay,
                         locations_2010,
                         random_location)

# lookup table: 2010 household id -> 2020 household id
s_location_2020 = pd.Series(locations_2020,
                            index=locations_2010)

In [20]:
np.mean(locations_2010 == locations_2020)  # should be around 0.23

0.2300973519264052

In [21]:
df['hh_id_2010'] = df.hh_id
df['hh_id_2020'] = df.hh_id.map(s_location_2020)
del df['hh_id']

df

Unnamed: 0,state,county,tract,block,geoid,sex_id,relationship,hispanic,racaian,racasn,...,racnhpi,racsor,racwht,pweight,gender,trans,age_2010,age_2020,hh_id_2010,hh_id_2020
10,48,441,11600,1000,484410116001000,1,20,0,0,0,...,0,0,1,1,cis_boy,False,4,14,484410116001000-46,484790017113021-44
11,48,441,11600,1000,484410116001000,1,20,0,0,0,...,0,0,1,1,cis_boy,False,0,10,484410116001000-47,482014325011000-289
12,48,441,11600,1000,484410116001000,1,20,0,0,0,...,0,0,1,1,cis_boy,False,0,10,484410116001000-48,484410116001000-48
13,48,441,11600,1000,484410116001000,1,20,0,0,0,...,0,0,1,1,cis_boy,False,4,14,484410116001000-49,483671404152042-232
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25145452,48,113,7916,2001,481130079162001,2,25,1,0,0,...,0,0,1,1,cis_girl,False,6,16,481130079162001-756,481130079162001-756
25145492,48,113,7916,2001,481130079162001,1,27,1,0,0,...,0,1,0,1,cis_boy,False,4,14,481130079162001-762,481130136221000-10
25145493,48,113,7916,2001,481130079162001,1,25,1,0,0,...,0,1,0,1,cis_boy,False,3,13,481130079162001-788,481130079162001-788
25145517,48,113,7916,1002,481130079161002,2,30,1,0,0,...,0,1,0,1,cis_girl,False,0,10,481130079161002-784,481450005001010-5


In [22]:
# record the census block (geoid) each child lives in at both censuses;
# household ids look like '<block geoid>-<suffix>', so the 2020 geoid is the
# integer before the first '-' of hh_id_2020

df['geoid_2010'] = df['geoid']
df['geoid_2020'] = df['hh_id_2020'].str.split('-').str[0].astype('int64')

In [23]:
np.mean(df.hh_id_2010 == df.hh_id_2020)  # should be around 23%

0.2299767075804858

In [24]:
np.mean(df.geoid_2010 == df.geoid_2020)  # should be around 23%

0.22998155276551857

# final step: model of gender reported on 2020 census

# BRFSS data in [2022_04_18b_das_dhc_attack_sogi_data.ipynb](2022_04_18b_das_dhc_attack_sogi_data.ipynb)

In [25]:
def gender_to_sex_2010(gender):
    """Simulate the sex reported for each child on the 2010 census.

    Every child is reported with their sex assigned at birth, coded 1 for
    boy/male and 2 for girl/female (the same coding the notebook uses for
    sex_id). Children with gender 'trans_other' get a code drawn uniformly
    at random from {1, 2} using the global numpy random state.

    Parameters
    ----------
    gender : pandas.Series
        Gender labels: 'trans_boy', 'trans_girl', 'trans_other',
        'cis_boy' or 'cis_girl'.

    Returns
    -------
    pandas.Series
        Sex codes with the same index as ``gender``. Labels outside the
        five listed above are left as NaN.
    """
    # map gender to sex assigned at birth
    sex = gender.map({'trans_boy': 2,
                      'trans_girl': 1,
                      'trans_other': np.nan,  # filled in below, randomly
                      'cis_boy': 1,
                      'cis_girl': 2,
                      })

    # compute the mask once and count it with the vectorized Series.sum()
    # rather than the builtin sum(), which loops over every row in Python
    is_other = (gender == 'trans_other')
    sex[is_other] = np.random.choice([1, 2], size=is_other.sum())
    return sex

def gender_to_sex_2020(gender, reported_sex_2010, p_update=0.5):
    """Simulate the sex reported for each child on the 2020 census.

    Starts from the 2010 values and, independently for each child, switches
    a trans boy's reported sex to 1 and a trans girl's reported sex to 2
    with probability ``p_update``. All other children keep their 2010 value.
    Randomness comes from the global numpy random state; one uniform draw
    per row is made for trans boys first, then one per row for trans girls.

    Parameters
    ----------
    gender : pandas.Series
        Gender labels, aligned row-for-row with ``reported_sex_2010``.
    reported_sex_2010 : pandas.Series
        Sex codes reported in 2010 (1 or 2); not modified.
    p_update : float, default 0.5
        Probability that a trans boy / trans girl is reported with the sex
        matching their gender in 2020.

    Returns
    -------
    numpy.ndarray
        Sex codes reported in 2020, in the same row order as the inputs.
    """
    n_rows = len(reported_sex_2010)

    # keep the draw order fixed (trans_boy, then trans_girl) so that results
    # for a given seed do not change
    boy_updates = (gender == 'trans_boy') & (np.random.uniform(size=n_rows) < p_update)
    girl_updates = (gender == 'trans_girl') & (np.random.uniform(size=n_rows) < p_update)

    # np.where builds a new array, so the 2010 values need no defensive copy
    sex = np.where(boy_updates, 1, reported_sex_2010)
    sex = np.where(girl_updates, 2, sex)
    return sex


In [26]:
df['reported_sex_2010'] = gender_to_sex_2010(df.gender)

df['reported_sex_2020'] = gender_to_sex_2020(df.gender, df.reported_sex_2010)

In [27]:
df['gender'].value_counts(normalize=True)

cis_boy        0.508043
cis_girl       0.486707
trans_girl     0.002254
trans_boy      0.001809
trans_other    0.001187
Name: gender, dtype: float64

In [28]:
np.mean(df.reported_sex_2010 != df.reported_sex_2020)

0.002002676480212103

# Values for results section

In [29]:
# Our synthetic population matched the age, sex, race/ethnicity, and geography of Texas
# on census day April 1, 2010, with
# X children ages 0-7 in Y household on census day 2010

n_kids = len(df)  # number of children
f'{n_kids:,.0f}'

'3,095,857'

In [30]:
# back of envelope scenario 1: expected number of kids whose reported sex
# differs between the two censuses
#   = n_kids * (% trans_boy + % trans_girl) / 100 * 0.5
# where 0.5 is the update probability used in gender_to_sex_2020
# NOTE(review): p_trans_girl is set to 0.22% earlier in this notebook, while
# 0.23 matches the rounded simulated share shown by value_counts -- confirm
# which value was intended here
3_095_857 * (0.18 + 0.23)/100 * 0.5 

6346.506850000001

In [31]:
# back of envelope scenario 2: n_kids * 0.002 * 0.23
# NOTE(review): presumably 0.002 is the simulated share of kids whose
# reported sex changed and 0.23 is p_stay, i.e. this is the expected number
# of such kids who did not move, before requiring uniqueness -- confirm
3_095_857 * 0.002 * 0.23 

1424.09422

In [32]:
n_households = df.hh_id_2010.nunique()  # number of households
f'{n_households:,.0f}'

'2,411,149'

In [33]:
# number of kids whose household was in the same place in the 2010 and 2020 census
# (this counts rows of df, i.e. children, not distinct households)
n_stayed = (df.hh_id_2010 == df.hh_id_2020).sum()
f'{n_stayed:,.0f}'

'711,975'

In [34]:
# number of trans kids that were in same place in 2010 and 2020 census

n_trans_stayed = (df.trans & (df.hh_id_2010 == df.hh_id_2020)).sum()
f'{n_trans_stayed:,.0f}'

'3,774'

In [35]:
# number of trans families identified if full census data with names and dob was released

n_hh_w_sex_different = df[df.reported_sex_2010 != df.reported_sex_2020].hh_id_2010.nunique()
f'{n_hh_w_sex_different:,.0f}'

'6,199'

In [36]:
# number of trans kids identified if full census data with names and dob was released

n_kids_w_sex_different = len(df[df.reported_sex_2010 != df.reported_sex_2020])
f'{n_kids_w_sex_different:,.0f}'

'6,200'

In [37]:
# total number of trans kids in the simulated population

n_trans_kids = int(df['trans'].sum())
f'{n_trans_kids:,.0f}'

'16,254'

In [38]:
np.round(100 * n_kids_w_sex_different/n_trans_kids)

38.0

In [39]:
del df['geoid']
df

Unnamed: 0,state,county,tract,block,sex_id,relationship,hispanic,racaian,racasn,racblk,...,gender,trans,age_2010,age_2020,hh_id_2010,hh_id_2020,geoid_2010,geoid_2020,reported_sex_2010,reported_sex_2020
10,48,441,11600,1000,1,20,0,0,0,0,...,cis_boy,False,4,14,484410116001000-46,484790017113021-44,484410116001000,484790017113021,1.0,1.0
11,48,441,11600,1000,1,20,0,0,0,0,...,cis_boy,False,0,10,484410116001000-47,482014325011000-289,484410116001000,482014325011000,1.0,1.0
12,48,441,11600,1000,1,20,0,0,0,0,...,cis_boy,False,0,10,484410116001000-48,484410116001000-48,484410116001000,484410116001000,1.0,1.0
13,48,441,11600,1000,1,20,0,0,0,0,...,cis_boy,False,4,14,484410116001000-49,483671404152042-232,484410116001000,483671404152042,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25145452,48,113,7916,2001,2,25,1,0,0,0,...,cis_girl,False,6,16,481130079162001-756,481130079162001-756,481130079162001,481130079162001,2.0,2.0
25145492,48,113,7916,2001,1,27,1,0,0,0,...,cis_boy,False,4,14,481130079162001-762,481130136221000-10,481130079162001,481130136221000,1.0,1.0
25145493,48,113,7916,2001,1,25,1,0,0,0,...,cis_boy,False,3,13,481130079162001-788,481130079162001-788,481130079162001,481130079162001,1.0,1.0
25145517,48,113,7916,1002,2,30,1,0,0,0,...,cis_girl,False,0,10,481130079161002-784,481450005001010-5,481130079161002,481450005001010,2.0,2.0


In [40]:
# Even without re-identified records to link on, a block containing a trans
# kid can still be singled out: look for blocks with exactly one kid of a
# given age and race/ethnicity in 2010, exactly one kid ten years older in
# 2020, and a different reported sex.
# k_anon_2010 is the size of each (block, age, race/ethnicity) cell in 2010.

k_anon_2010 = (
    df.groupby(['geoid_2010', 'age_2020', 'hispanic', 'racwht', 'racblk',
                'racasn', 'racaian', 'racnhpi', 'racsor'])['pweight']
      .sum()
      .rename('k_anon_2010')
)
k_anon_2010.sort_values()

geoid_2010       age_2020  hispanic  racwht  racblk  racasn  racaian  racnhpi  racsor
482013501032006  10        0         1       0       0       0        0        0          1
482199503003001  14        0         1       0       0       0        0        0          1
482199503003000  16        0         1       0       0       0        0        0          1
                 14        0         1       0       0       0        0        0          1
                                                                                         ..
481130166352000  15        0         0       1       0       0        0        0         71
482012227011007  12        0         0       1       0       0        0        0         75
                 11        0         0       1       0       0        0        0         82
                 14        0         0       1       0       0        0        0         88
Name: k_anon_2010, Length: 1972575, dtype: int64

In [41]:
# attach the size of each kid's 2010 cell; kids whose cell is missing from
# k_anon_2010 get a size of 0
df = df.merge(
    k_anon_2010,
    how='left',
    left_on=['geoid_2010', 'age_2020', 'hispanic', 'racwht',
             'racblk', 'racasn', 'racaian', 'racnhpi', 'racsor'],
    right_index=True,
)
df['k_anon_2010'] = df['k_anon_2010'].fillna(0)

In [42]:
# the rows with k_anon = 1 are putative matches
# meaning the individual was unique age/race in 2010
n_unique_2010 = sum(df.k_anon_2010 == 1)
n_unique_2010

1414929

In [43]:
# some of these putative matches will be unclear, because of in-migration:
# k_anon_2020 is the size of each (block, age, race/ethnicity) cell in 2020

k_anon_2020 = (
    df.groupby(['geoid_2020', 'age_2020', 'hispanic', 'racwht', 'racblk',
                'racasn', 'racaian', 'racnhpi', 'racsor'])['pweight']
      .sum()
      .rename('k_anon_2020')
)

In [44]:
# attach the size of each kid's 2020 cell; kids whose cell is missing from
# k_anon_2020 get a size of 0
df = df.merge(
    k_anon_2020,
    how='left',
    left_on=['geoid_2020', 'age_2020', 'hispanic', 'racwht',
             'racblk', 'racasn', 'racaian', 'racnhpi', 'racsor'],
    right_index=True,
)
df['k_anon_2020'] = df['k_anon_2020'].fillna(0)

In [45]:
# df[df.k_anon_2010 == 1].sort_values('k_anon_2020')

In [46]:
# the rows with k_anon_2010 and k_anon_2020 both equal to 1 are putative
# matches that are unique on both sides
len(df[(df.k_anon_2010 == 1) & (df.k_anon_2020 == 1)])

968019

In [47]:
# these putative matches that have the same geoid in 2010 and 2020 are confirmed matches
len(df[(df.k_anon_2010 == 1) & (df.k_anon_2020 == 1)
       & (df.geoid_2010 == df.geoid_2020)])

268492

In [48]:
100 * (len(df[(df.k_anon_2010 == 1) & (df.k_anon_2020 == 1)
       & (df.geoid_2010 == df.geoid_2020)]) / 
n_unique_2010)

18.975651781820854

In [49]:
# if some bad actor used this to identify trans kids
# they would find this many putative links
len(df[(df.k_anon_2010 == 1) & (df.k_anon_2020 == 1)
       & (df.reported_sex_2010 != df.reported_sex_2020)
      ])

1885

In [50]:
# but only this many would be the same person
len(df[(df.k_anon_2010 == 1) & (df.k_anon_2020 == 1)
       & (df.geoid_2010 == df.geoid_2020)
       & (df.reported_sex_2010 != df.reported_sex_2020)
      ])

526

In [51]:
# scenario 2: kids who are unique in their (block, age, race/ethnicity) cell
# in both censuses, live in the same block both times, and whose reported
# sex changed
n_scenario_2 = int((
    df['k_anon_2010'].eq(1)
    & df['k_anon_2020'].eq(1)
    & df['geoid_2010'].eq(df['geoid_2020'])
    & df['reported_sex_2010'].ne(df['reported_sex_2020'])
).sum())


In [52]:
n_scenario_2

526

# next, a version with household swapping to protect against disclosure
# I hypothesize that the number of trans kids identified is going to be just 10% lower

In [53]:
p_swap = 0.05

In [54]:
# simulate swapping of the 2010 data: with probability p_swap a household is
# reported at a randomly drawn household id instead of its true one
# (one-directional reassignment; no other household is moved in exchange)
locations_2010 = df.hh_id_2010.unique()

random_location = np.random.choice(all_locations, size=len(locations_2010),  # induces distribution on geoid that is proportional to number of households
                                   replace=True)

# the order of the two random draws determines the result for a given seed
reported_locations_2010 = np.where(np.random.uniform(size=len(locations_2010)) < p_swap,
                                   random_location,
                                   locations_2010,
                                  )

# lookup table: true 2010 household id -> reported 2010 household id
s_reported_location_2010 = pd.Series(reported_locations_2010,
                                     index=locations_2010)

df['reported_hh_id_2010'] = df.hh_id_2010.map(s_reported_location_2010)

In [55]:
# simulate swapping of the 2020 data, independently of the 2010 swap: with
# probability p_swap a household is reported at a randomly drawn household
# id instead of its true one
locations_2020 = df.hh_id_2020.unique()

random_location = np.random.choice(all_locations, size=len(locations_2020),
                                   replace=True)

# the order of the two random draws determines the result for a given seed
reported_locations_2020 = np.where(np.random.uniform(size=len(locations_2020)) < p_swap,
                                   random_location,
                                   locations_2020,
                                  )

# lookup table: true 2020 household id -> reported 2020 household id
s_reported_location_2020 = pd.Series(reported_locations_2020,
                                     index=locations_2020)

df['reported_hh_id_2020'] = df.hh_id_2020.map(s_reported_location_2020)

In [56]:
# derive the reported (post-swap) census block for each census: household
# ids look like '<block geoid>-<suffix>', so take the integer before the
# first '-'

df['reported_geoid_2010'] = df['reported_hh_id_2010'].str.split('-').str[0].astype('int64')
df['reported_geoid_2020'] = df['reported_hh_id_2020'].str.split('-').str[0].astype('int64')

In [57]:
# Same uniqueness attack as before, but on the swapped data: count the kids
# in each (reported block, age, race/ethnicity) cell of the 2010 data. Cells
# of size one are the ones that can be singled out.

k_anon_2010 = (
    df.groupby(['reported_geoid_2010', 'age_2020', 'hispanic', 'racwht', 'racblk',
                'racasn', 'racaian', 'racnhpi', 'racsor'])['pweight']
      .sum()
      .rename('k_anon_2010')
)
k_anon_2010.sort_values()

reported_geoid_2010  age_2020  hispanic  racwht  racblk  racasn  racaian  racnhpi  racsor
480019501001000      11        1         1       0       0       0        0        0          1
482419503001005      13        0         0       1       0       0        0        0          1
                     12        0         1       1       0       0        0        0          1
                                                 0       0       0        0        0          1
                                                                                             ..
481130166352000      15        0         0       1       0       0        0        0         67
482012227011007      12        0         0       1       0       0        0        0         70
                     11        0         0       1       0       0        0        0         73
                     14        0         0       1       0       0        0        0         81
Name: k_anon_2010, Length: 2006647, dtype: int

In [58]:
# some of these putative matches will be unclear, because of in-migration:
# count the kids in each (reported block, age, race/ethnicity) cell of the
# swapped 2020 data

k_anon_2020 = (
    df.groupby(['reported_geoid_2020', 'age_2020', 'hispanic', 'racwht', 'racblk',
                'racasn', 'racaian', 'racnhpi', 'racsor'])['pweight']
      .sum()
      .rename('k_anon_2020')
)
k_anon_2020.sort_values()

reported_geoid_2020  age_2020  hispanic  racwht  racblk  racasn  racaian  racnhpi  racsor
480019501001000      15        1         1       0       0       0        0        0          1
483550025001002      13        1         0       0       0       0        0        1          1
                     10        0         1       0       0       0        0        0          1
483550025001001      17        0         0       1       0       0        0        0          1
                                                                                             ..
481130100021010      11        1         1       0       0       0        0        0         49
480410021003029      15        0         1       0       0       0        0        0         50
481130100021010      13        1         1       0       0       0        0        0         57
                     16        1         1       0       0       0        0        0         60
Name: k_anon_2020, Length: 2266500, dtype: int

In [59]:
# drop the scenario-2 cell sizes so the merge below does not produce
# suffixed duplicate columns
del df['k_anon_2010']

# attach the post-swap 2010 cell sizes to every kid
# NOTE(review): k_anon_2010 is indexed by reported_geoid_2010 at this point,
# but the lookup key here is the true geoid_2010, so a swapped kid gets the
# published count of the block they actually live in rather than of the
# block they are reported in. The later filters on
# geoid_2010 == reported_geoid_2010 restrict to rows where the two agree --
# confirm that the unfiltered counts are meant to use the true geoid
df = pd.merge(how='left',
              left=df, left_on=['geoid_2010', 'age_2020', 'hispanic', 'racwht',
                                'racblk', 'racasn', 'racaian', 'racnhpi', 'racsor'],
              right=k_anon_2010, right_index=True,)
# kids whose lookup key has no cell in k_anon_2010 get a size of 0
df.k_anon_2010 = df.k_anon_2010.fillna(0)

In [60]:
# drop the scenario-2 cell sizes so the merge below does not produce
# suffixed duplicate columns
del df['k_anon_2020']

# attach the post-swap 2020 cell sizes to every kid
# NOTE(review): k_anon_2020 is indexed by reported_geoid_2020 at this point,
# but the lookup key here is the true geoid_2020; the later filters on
# geoid_2020 == reported_geoid_2020 restrict to rows where the two agree --
# confirm that the unfiltered counts are meant to use the true geoid
df = pd.merge(how='left',
              left=df, left_on=['geoid_2020', 'age_2020', 'hispanic', 'racwht',
                                'racblk', 'racasn', 'racaian', 'racnhpi', 'racsor'],
              right=k_anon_2020, right_index=True,)
# kids whose lookup key has no cell in k_anon_2020 get a size of 0
df.k_anon_2020 = df.k_anon_2020.fillna(0)

In [61]:
# the rows with k_anon_2010 and k_anon_2020 both equal to 1 are putative
# matches that are unique on both sides
len(df[(df.k_anon_2010 == 1) & (df.k_anon_2020 == 1)])

906895

In [62]:
# these putative matches that have the same geoid in 2010 and 2020 are confirmed matches
len(df[(df.k_anon_2010 == 1) & (df.k_anon_2020 == 1)
       & (df.geoid_2010 == df.geoid_2020) 
       & (df.geoid_2010 == df.reported_geoid_2010)
       & (df.geoid_2020 == df.reported_geoid_2020)
      ])

243338

In [63]:
len(df[(df.k_anon_2010 == 1) & (df.k_anon_2020 == 1)
       & (df.geoid_2010 == df.geoid_2020)
       & (df.reported_sex_2010 != df.reported_sex_2020)
      ])

479

In [64]:
len(df[(df.k_anon_2010 == 1) & (df.k_anon_2020 == 1)
       & (df.geoid_2010 == df.geoid_2020)
       & (df.geoid_2010 == df.reported_geoid_2010)
       & (df.geoid_2020 == df.reported_geoid_2020)
       & (df.reported_sex_2010 != df.reported_sex_2020)
      ])

461

In [65]:
# scenario 3: as scenario 2, but additionally requiring that the kid's
# household was not swapped in either census (true block == reported block)
n_scenario_3 = int((
    df['k_anon_2010'].eq(1)
    & df['k_anon_2020'].eq(1)
    & df['geoid_2010'].eq(df['geoid_2020'])
    & df['geoid_2010'].eq(df['reported_geoid_2010'])
    & df['geoid_2020'].eq(df['reported_geoid_2020'])
    & df['reported_sex_2010'].ne(df['reported_sex_2020'])
).sum())
n_scenario_2, n_scenario_3

(526, 461)

In [66]:
100 * (1 - n_scenario_3/n_scenario_2)

12.357414448669202

In [67]:
1 - 61/n_scenario_2  # n_scenario_4 = 61, see next notebook

0.8840304182509505