In [1]:
import numpy as np, matplotlib.pyplot as plt, pandas as pd
pd.set_option('display.max_rows', 8)
!date

%load_ext autoreload
%autoreload 2

Mon May 16 14:34:05 PDT 2022


# Scenario 4: Swapping DAS for 2010 data, TDA DAS for 2020 data

In [2]:
np.random.seed(12345)

# Load synthetic data for TX and use it to simulate 2010 and 2020 populations, and also get k_anon_2020 value from existing demo product



In [3]:
import linked_census_disclosure.data as lcd_data

In [4]:
%%time

sf1 = lcd_data.read_sf1_remf('tx')
dhc = lcd_data.read_dhc_remf('tx')

CPU times: user 12.3 s, sys: 2.13 s, total: 14.5 s
Wall time: 19.4 s


In [5]:
sf1

Unnamed: 0,state,county,tract,block,row_num,age,sex,race,eth,n
0,48.0,201.0,431802.0,2009.0,0,0.0,0.0,0.0,0.0,8.0
1,48.0,201.0,431802.0,2009.0,1,0.0,0.0,0.0,1.0,3.0
2,48.0,201.0,431802.0,2009.0,2,0.0,0.0,6.0,0.0,2.0
3,48.0,201.0,431802.0,2009.0,3,0.0,1.0,0.0,0.0,12.0
...,...,...,...,...,...,...,...,...,...,...
5941,48.0,451.0,1600.0,4313.0,0,18.0,1.0,0.0,0.0,2.0
5942,48.0,337.0,950200.0,1051.0,0,18.0,0.0,0.0,0.0,2.0
5943,48.0,333.0,950100.0,2208.0,0,18.0,0.0,0.0,0.0,1.0
5944,48.0,279.0,950600.0,1182.0,0,18.0,0.0,0.0,0.0,1.0


In [6]:
# does this have the expected number of people? yes, now it does
f'{sf1.n.sum():,.0f}' # expect population of texas in 2010 to be 25,145,561

'25,145,561'

# Focus in on the 0-7 year olds

But in a complicated way, because we want to use TDA as our simulated 2020 link

In [7]:
sf1[(sf1.age >= 10) & (sf1.age < 18)]

Unnamed: 0,state,county,tract,block,row_num,age,sex,race,eth,n
42,48.0,201.0,431802.0,2009.0,42,10.0,0.0,0.0,1.0,1.0
43,48.0,201.0,431802.0,2009.0,43,10.0,0.0,1.0,0.0,2.0
44,48.0,201.0,431802.0,2009.0,44,10.0,1.0,0.0,0.0,3.0
45,48.0,201.0,431802.0,2009.0,45,10.0,1.0,0.0,1.0,4.0
...,...,...,...,...,...,...,...,...,...,...
5925,48.0,111.0,950100.0,1136.0,3,14.0,0.0,17.0,1.0,1.0
5934,48.0,387.0,950100.0,2057.0,0,17.0,0.0,1.0,1.0,1.0
5936,48.0,171.0,950500.0,1125.0,0,10.0,0.0,0.0,1.0,1.0
5937,48.0,171.0,950500.0,1125.0,1,17.0,1.0,0.0,1.0,1.0


## expand this portion of the SF1 into microdata

For example, turn the first row of the table above into 9 rows for non-Hispanic White males who are age 8 in 2020.

In [8]:
# it takes a while to do this, around 20 minutes

In [9]:
%%time

def my_expand(df):
    assert len(df) == 1
    return pd.Series(1, index=range(df.iloc[0].n.astype(int)), name='pweight')

df = sf1[(sf1.age >= 10) & (sf1.age < 18)].groupby(
    ['state', 'county', 'tract', 'block', 'age', 'sex', 'race', 'eth']).apply(my_expand).reset_index()

CPU times: user 14min 37s, sys: 10.7 s, total: 14min 48s
Wall time: 14min 47s


In [10]:
df_all = df.copy()

In [11]:
n_kids = len(df)  # number of children
f'{n_kids:,.0f}'

'3,009,117'

In [12]:
#### simulate 10 years of demographic change

df['age_2020'] = df.age
df['age_2010'] = df.age - 10
del df['age']

In [13]:
assert np.all(df.age_2010 >= 0), 'ensure that all ages are still non-negative'

In [14]:
# ignore births, because we are focused
# only on kids who can be linked between 2010 and 2020 census

# Add in migration, but this time, make the change to the 2010 geography

p_stay from ACS, see [2022_04_19a_das_dhc_attack_mig_data.ipynb](2022_04_19a_das_dhc_attack_mig_data.ipynb)

In [15]:
# simple model migration, based on probability
# of being in same house for 10+ years among household with 8-17 year olds
# in ACS

p_stay = 0.23


In [16]:
def add_geoid(df):
    """form geoid from 'state', 'county', 'tract', 'block'
    """

    df['geoid'] = (df.state.astype(str) + df.county.astype(str).str.zfill(3)
                   + df.tract.astype(str).str.zfill(6) + df.block.astype(str).str.zfill(4))
    
add_geoid(sf1)
add_geoid(dhc)
add_geoid(df)

In [17]:
all_locations = sf1.geoid
p_all_locations = sf1.n / sf1.n.sum()

In [18]:
locations_2020 = df.geoid.unique()

In [19]:
random_location = np.random.choice(all_locations, size=len(locations_2020),
                                   p=p_all_locations, replace=True)

locations_2010 = np.where(np.random.uniform(size=len(locations_2020)) < p_stay,
                         locations_2020,
                         random_location)

s_location_2010 = pd.Series(locations_2010,
                            index=locations_2020)

In [20]:
np.mean(locations_2010 == locations_2020)  # should be around 0.23

0.2301284846457563

In [21]:
df['geoid_2020'] = df.geoid
df['geoid_2010'] = df.geoid.map(s_location_2010)
del df['geoid']

df

Unnamed: 0,state,county,tract,block,sex,race,eth,level_8,pweight,age_2020,age_2010,geoid_2020,geoid_2010
0,48.0,1.0,950100.0,1000.0,0.0,5.0,1.0,0,1,10.0,0.0,48.01.0950100.01000.0,48.01.0950100.01000.0
1,48.0,1.0,950100.0,1000.0,1.0,5.0,1.0,0,1,11.0,1.0,48.01.0950100.01000.0,48.01.0950100.01000.0
2,48.0,1.0,950100.0,1000.0,0.0,5.0,1.0,0,1,13.0,3.0,48.01.0950100.01000.0,48.01.0950100.01000.0
3,48.0,1.0,950100.0,1000.0,0.0,1.0,0.0,0,1,15.0,5.0,48.01.0950100.01000.0,48.01.0950100.01000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3009113,48.0,507.0,950302.0,5025.0,0.0,0.0,1.0,0,1,13.0,3.0,48.0507.0950302.05025.0,48.0507.0950302.05025.0
3009114,48.0,507.0,950302.0,5025.0,1.0,0.0,1.0,0,1,17.0,7.0,48.0507.0950302.05025.0,48.0507.0950302.05025.0
3009115,48.0,507.0,950302.0,5026.0,0.0,0.0,1.0,0,1,15.0,5.0,48.0507.0950302.05026.0,48.0507.0950302.05026.0
3009116,48.0,507.0,950302.0,5026.0,0.0,0.0,1.0,0,1,17.0,7.0,48.0507.0950302.05026.0,48.0507.0950302.05026.0


In [22]:
np.mean(df.geoid_2010 == df.geoid_2020)  # should be around 23%

0.22958362868575732

# Make gender column, based on BRFSS 2019 SOGI results

In future work, could try to incorporate observation that there is substantial age dependence in these values.  But for now, keep it simple, and work with the crude prevalence rates.  It is already not-that-simple.

Since we want to use the demonstration DHC without re-running it, we first take the reported_sex_2020 column from the `sex` column in the SF1 ReMF (note: this conflates swapping and TDA)

In [23]:
df['reported_sex_2020'] = df.sex

In [24]:
# then initalize a gender for each simulant, calibrated to have unconditional probability from BRFSS
# and also to match the data generation procedure for the reported_sex_2020 column

p_trans_boy = 0.18 / 100
p_trans_girl = 0.22 / 100
p_trans_other = 0.12 / 100
p_cis = 98.08 / 100

# rescale to sum to 100%
p_gender = np.array([p_trans_boy, p_trans_girl, p_trans_other, p_cis])
p_gender /= p_gender.sum()
p_gender

array([0.00182556, 0.00223124, 0.00121704, 0.99472617])

In [25]:
# first initialize gender without distinguishing cis boy and cis girl
# since that matches BRFSS SOGI question
df['gender'] = np.random.choice(['trans_boy', 'trans_girl', 'trans_other', 'cis'], p=p_gender, size=len(df))

In [26]:
# now distinguish cis based on reconstructed sex_id
df.gender = np.where(df.gender == 'cis',
                     df.sex.map({0:'cis_boy', 1:'cis_girl'}),
                     df.gender)

In [27]:
df['trans'] = df.gender.isin(['trans_boy', 'trans_girl', 'trans_other'])

In [28]:
np.round(100 * df.gender.value_counts(normalize=True), 2)

cis_boy        50.99
cis_girl       48.49
trans_girl      0.22
trans_boy       0.18
trans_other     0.12
Name: gender, dtype: float64

In [29]:
np.round(100 * df.gender.value_counts(normalize=True).filter(like='trans').sum(), 2)

0.52

In [30]:
np.round(100 * df.trans.mean(), 2)

0.52

In [31]:
df[df.trans].groupby('gender').reported_sex_2020.value_counts(normalize=True).unstack()

reported_sex_2020,0.0,1.0
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
trans_boy,0.508961,0.491039
trans_girl,0.505057,0.494943
trans_other,0.510454,0.489546


In [32]:
def gender_to_sex_2010(gender, reported_sex_2020):
    # start with values reported in 2020
    sex = reported_sex_2020.copy()
    
    # update the trans_boy and trans_girl entries to be gender assigned at birth
    sex = np.where((gender == 'trans_boy'),
                   0,
                   sex
                  )
    sex = np.where((gender == 'trans_girl'),
                   1,
                   sex
                  )
    return sex


df['reported_sex_2010'] = gender_to_sex_2010(df.gender, df.reported_sex_2020)

In [33]:
df['trans'].mean()

0.005188232960034456

In [34]:
np.mean(df.reported_sex_2010 != df.reported_sex_2020)

0.001993608091676063

# Values for results section

In [35]:
# Our synthetic population matched the age, sex, race/ethnicity, and geography of Texas
# on census day April 1, 2010, with
# X children ages 0-7 in Y household on census day 2010

n_kids = len(df)  # number of children
f'{n_kids:,.0f}'

'3,009,117'

In [36]:
# number of kids that were in same census block in 2010 and 2020 census
n_stayed = (df.geoid_2010 == df.geoid_2020).sum()
f'{n_stayed:,.0f}'

'690,844'

In [37]:
# number of trans kids that were in same place in 2010 and 2020 census

n_trans_stayed = (df.trans & (df.geoid_2010 == df.geoid_2020)).sum()
f'{n_trans_stayed:,.0f}'

'3,601'

In [38]:
# number of trans kids identified if full census data with names and dob was released

n_trans = df[(df.reported_sex_2010 != df.reported_sex_2020)].pweight.sum()
f'{n_trans:,.0f}'

'5,999'

# number of trans families identified by age, geoid from reconstruction based on TDA-protected DHC


In [39]:
# without reidentification to link on, there is still a risk of identifying a block with a trans kid
# by finding blocks where there was a single kids of a given age in 2010 and that age+10 in 2020 and
# different reported sex

k_anon_2010 = df.groupby(['geoid_2010', 'age_2020', 'race', 'eth']).pweight.sum()
k_anon_2010.name = 'k_anon_2010'
k_anon_2010.sort_values()

geoid_2010               age_2020  race  eth
48.01.0950100.01000.0    10.0      5.0   1.0      1
48.0309.01900.01027.0    17.0      5.0   1.0      1
48.0309.01900.02006.0    10.0      0.0   0.0      1
                                   1.0   0.0      1
                                               ... 
48.0201.0210100.01042.0  17.0      1.0   0.0    110
48.0201.0320100.01006.0  12.0      0.0   0.0    111
48.0201.0210100.01042.0  17.0      0.0   0.0    119
48.0415.0950600.05011.0  17.0      3.0   0.0    144
Name: k_anon_2010, Length: 1525368, dtype: int64

In [40]:
df = pd.merge(how='left',
              left=df, left_on=['geoid_2010', 'age_2020', 'race', 'eth'],
              right=k_anon_2010, right_index=True,)
df.k_anon_2010 = df.k_anon_2010.fillna(0)

In [41]:
# the rows with k_anon = 1 are putative matches
# meaning the individual was unique age/race in 2010
df[df.k_anon_2010 == 1]

Unnamed: 0,state,county,tract,block,sex,race,eth,level_8,pweight,age_2020,age_2010,geoid_2020,geoid_2010,reported_sex_2020,gender,trans,reported_sex_2010,k_anon_2010
0,48.0,1.0,950100.0,1000.0,0.0,5.0,1.0,0,1,10.0,0.0,48.01.0950100.01000.0,48.01.0950100.01000.0,0.0,cis_boy,False,0.0,1
1,48.0,1.0,950100.0,1000.0,1.0,5.0,1.0,0,1,11.0,1.0,48.01.0950100.01000.0,48.01.0950100.01000.0,1.0,cis_girl,False,1.0,1
2,48.0,1.0,950100.0,1000.0,0.0,5.0,1.0,0,1,13.0,3.0,48.01.0950100.01000.0,48.01.0950100.01000.0,0.0,cis_boy,False,0.0,1
7,48.0,1.0,950100.0,1000.0,0.0,1.0,0.0,0,1,16.0,6.0,48.01.0950100.01000.0,48.01.0950100.01000.0,0.0,cis_boy,False,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3009113,48.0,507.0,950302.0,5025.0,0.0,0.0,1.0,0,1,13.0,3.0,48.0507.0950302.05025.0,48.0507.0950302.05025.0,0.0,cis_boy,False,0.0,1
3009114,48.0,507.0,950302.0,5025.0,1.0,0.0,1.0,0,1,17.0,7.0,48.0507.0950302.05025.0,48.0507.0950302.05025.0,1.0,cis_girl,False,1.0,1
3009115,48.0,507.0,950302.0,5026.0,0.0,0.0,1.0,0,1,15.0,5.0,48.0507.0950302.05026.0,48.0507.0950302.05026.0,0.0,cis_boy,False,0.0,1
3009116,48.0,507.0,950302.0,5026.0,0.0,0.0,1.0,0,1,17.0,7.0,48.0507.0950302.05026.0,48.0507.0950302.05026.0,0.0,cis_boy,False,0.0,1


In [42]:
# some of these putative matches will be unclear, because of TDA or migration

k_anon_2020 = dhc.set_index(['geoid', 'age', 'race', 'eth']).n
k_anon_2020.name = 'k_anon_2020'
k_anon_2020.sort_values()

geoid                    age   race  eth
48.0337.0950400.02086.0  18.0  7.0   0.0       1.0
48.0141.03100.04001.0    9.0   2.0   1.0       1.0
                         10.0  0.0   1.0       1.0
48.0439.0113929.04008.0  8.0   10.0  1.0       1.0
                                             ...  
48.0201.0210100.01042.0  18.0  0.0   0.0    2694.0
48.0113.010000.01208.0   18.0  1.0   0.0    3195.0
48.041.02015.01021.0     18.0  0.0   0.0    3887.0
48.0201.0210100.01042.0  18.0  1.0   0.0    4471.0
Name: k_anon_2020, Length: 4111430, dtype: float64

In [43]:
df = pd.merge(how='left',
              left=df, left_on=['geoid_2020', 'age_2020', 'race', 'eth'],
              right=k_anon_2020, right_index=True,)
df.k_anon_2020 = df.k_anon_2020.fillna(0)

In [44]:
df[df.k_anon_2010 == 1].sort_values('k_anon_2020')

Unnamed: 0,state,county,tract,block,sex,race,eth,level_8,pweight,age_2020,age_2010,geoid_2020,geoid_2010,reported_sex_2020,gender,trans,reported_sex_2010,k_anon_2010,k_anon_2020
0,48.0,1.0,950100.0,1000.0,0.0,5.0,1.0,0,1,10.0,0.0,48.01.0950100.01000.0,48.01.0950100.01000.0,0.0,cis_boy,False,0.0,1,0.0
2028501,48.0,251.0,130208.0,3017.0,1.0,5.0,1.0,0,1,17.0,7.0,48.0251.0130208.03017.0,48.0251.0130208.03017.0,1.0,cis_girl,False,1.0,1,0.0
2028504,48.0,251.0,130208.0,3018.0,0.0,8.0,1.0,0,1,14.0,4.0,48.0251.0130208.03018.0,48.0121.021727.02029.0,0.0,cis_boy,False,0.0,1,0.0
2028505,48.0,251.0,130208.0,3018.0,1.0,0.0,0.0,0,1,16.0,6.0,48.0251.0130208.03018.0,48.0121.021727.02029.0,1.0,cis_girl,False,1.0,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1713336,48.0,201.0,542302.0,1020.0,0.0,0.0,1.0,0,1,12.0,2.0,48.0201.0542302.01020.0,48.061.014100.03072.0,0.0,cis_boy,False,0.0,1,27.0
2230941,48.0,339.0,692001.0,1022.0,0.0,0.0,0.0,0,1,11.0,1.0,48.0339.0692001.01022.0,48.0201.0432400.02006.0,0.0,cis_boy,False,0.0,1,28.0
1936057,48.0,215.0,24112.0,2187.0,1.0,0.0,1.0,0,1,14.0,4.0,48.0215.024112.02187.0,48.085.031632.01042.0,1.0,cis_girl,False,1.0,1,28.0
1602473,48.0,201.0,452600.0,3027.0,0.0,1.0,0.0,0,1,13.0,3.0,48.0201.0452600.03027.0,48.0201.0452600.03027.0,0.0,cis_boy,False,0.0,1,31.0


In [45]:
# the rows with k_anon_2010 an d_2020 = 1 are putative that are unique on both sides
len(df[(df.k_anon_2010 == 1) & (df.k_anon_2020 == 1)])

60586

In [46]:
# these putative matches that have the same geoid in 2010 and 2020 are confirmed matches
len(df[(df.k_anon_2010 == 1) & (df.k_anon_2020 == 1)
       & (df.geoid_2010 == df.geoid_2020)])

16183

In [47]:
len(df[(df.k_anon_2010 == 1) & (df.k_anon_2020 == 1)
       & (df.geoid_2010 == df.geoid_2020)
       & (df.reported_sex_2010 != df.reported_sex_2020)
      ])

26

In [48]:
len(df[(df.k_anon_2010 == 1) & (df.k_anon_2020 == 1)
       & (df.geoid_2010 == df.geoid_2020)
       & (df.reported_sex_2010 != df.reported_sex_2020)
       & df.trans
      ])

26