In [1]:
import numpy as np, matplotlib.pyplot as plt, pandas as pd
pd.set_option('display.max_rows', 8)
!date

%load_ext autoreload
%autoreload 2

Wed May 10 13:45:21 PDT 2023


# Scenario 4: Swapping DAS for 2010 data, TDA DAS for 2020 data

In [2]:
np.random.seed(12345)

# Load synthetic data for TX and use it to simulate 2010 and 2020 populations, and also get k_anon_2020 value from existing demo product



In [3]:
import linked_census_disclosure.data as lcd_data

In [4]:
%%time

sf1 = lcd_data.read_sf1_remf('tx')
dhc = lcd_data.read_dhc_remf('tx')

del sf1['row_num']  # check if these columns are needed by deleting them and seeing if any numbers change
del dhc['row_num']

CPU times: user 16.7 s, sys: 2.82 s, total: 19.5 s
Wall time: 21.6 s


In [5]:
sf1

Unnamed: 0,state,county,tract,block,age,sex,race,eth,n
0,48,201.0,431802.0,2009.0,0.0,0.0,0.0,0.0,8.0
1,48,201.0,431802.0,2009.0,0.0,0.0,0.0,1.0,3.0
2,48,201.0,431802.0,2009.0,0.0,0.0,6.0,0.0,2.0
3,48,201.0,431802.0,2009.0,0.0,1.0,0.0,0.0,12.0
...,...,...,...,...,...,...,...,...,...
5941,48,451.0,1600.0,4313.0,18.0,1.0,0.0,0.0,2.0
5942,48,337.0,950200.0,1051.0,18.0,0.0,0.0,0.0,2.0
5943,48,333.0,950100.0,2208.0,18.0,0.0,0.0,0.0,1.0
5944,48,279.0,950600.0,1182.0,18.0,0.0,0.0,0.0,1.0


In [6]:
# does this have the expected number of people? yes, now it does
f'{sf1.n.sum():,.0f}' # expect population of texas in 2010 to be 25,145,561

'25,145,561'

# Focus in on the 0-7 year olds in 2010

But in a complicated way, because we want to use TDA as our simulated 2020 link

In [7]:
sf1[(sf1.age >= 10) & (sf1.age < 18)]

Unnamed: 0,state,county,tract,block,age,sex,race,eth,n
42,48,201.0,431802.0,2009.0,10.0,0.0,0.0,1.0,1.0
43,48,201.0,431802.0,2009.0,10.0,0.0,1.0,0.0,2.0
44,48,201.0,431802.0,2009.0,10.0,1.0,0.0,0.0,3.0
45,48,201.0,431802.0,2009.0,10.0,1.0,0.0,1.0,4.0
...,...,...,...,...,...,...,...,...,...
5925,48,111.0,950100.0,1136.0,14.0,0.0,17.0,1.0,1.0
5934,48,387.0,950100.0,2057.0,17.0,0.0,1.0,1.0,1.0
5936,48,171.0,950500.0,1125.0,10.0,0.0,0.0,1.0,1.0
5937,48,171.0,950500.0,1125.0,17.0,1.0,0.0,1.0,1.0


## expand this portion of the SF1 into microdata

For example, turn the first row of the table above into 9 rows for non-Hispanic White males who are age 8 in 2020.

In [8]:
# it takes a while to do this, around 15 minutes

In [9]:
%%time

def my_expand(df):
    assert len(df) == 1
    return pd.Series(1, index=range(df.iloc[0].n.astype(int)), name='pweight')

df = sf1[(sf1.age >= 10) & (sf1.age < 18)].groupby(
    ['state', 'county', 'tract', 'block', 'age', 'sex', 'race', 'eth']).apply(my_expand).reset_index()

CPU times: user 26min 42s, sys: 19.4 s, total: 27min 2s
Wall time: 26min 59s


In [10]:
df_all = df.copy()

In [11]:
n_kids = len(df)  # number of children
f'{n_kids:,.0f}'

'3,009,117'

In [12]:
#### simulate 10 years of demographic change

df['age_2020'] = df.age
df['age_2010'] = df.age - 10
del df['age']

In [13]:
assert np.all(df.age_2010 >= 0), 'ensure that all ages are still non-negative'

In [14]:
# ignore births, because we are focused
# only on kids who can be linked between 2010 and 2020 census

# Add in migration, but this time, make the change to the 2010 geography

p_stay from ACS, see [2022_04_19a_das_dhc_attack_mig_data.ipynb](2022_04_19a_das_dhc_attack_mig_data.ipynb)

In [15]:
# simple model migration, based on probability
# of being in same house for 10+ years among household with 8-17 year olds
# in ACS

p_stay = 0.23


In [16]:
def add_geoid(df):
    """form geoid from 'state', 'county', 'tract', 'block'
    """

    df['geoid'] = (df.state.astype(str) + df.county.astype(str).str.zfill(3)
                   + df.tract.astype(str).str.zfill(6) + df.block.astype(str).str.zfill(4))
    
add_geoid(sf1)
add_geoid(dhc)
add_geoid(df)

In [17]:
all_locations = sf1.geoid
p_all_locations = sf1.n / sf1.n.sum()

In [18]:
locations_2020 = df.geoid

In [19]:
random_location = np.random.choice(all_locations, size=len(df),
                                   p=p_all_locations, replace=True)

locations_2010 = np.where(np.random.uniform(size=len(df)) < p_stay,
                         locations_2020,
                         random_location)

In [20]:
np.mean(locations_2010 == locations_2020)  # should be around 0.23

0.23014824614662707

In [21]:
df['geoid_2020'] = locations_2020
df['geoid_2010'] = locations_2010
del df['geoid']

df

Unnamed: 0,state,county,tract,block,sex,race,eth,level_8,pweight,age_2020,age_2010,geoid_2020,geoid_2010
0,48,1.0,950100.0,1000.0,0.0,5.0,1.0,0,1,10.0,0.0,481.0950100.01000.0,48185.0180101.03003.0
1,48,1.0,950100.0,1000.0,1.0,5.0,1.0,0,1,11.0,1.0,481.0950100.01000.0,48201.0411800.03023.0
2,48,1.0,950100.0,1000.0,0.0,5.0,1.0,0,1,13.0,3.0,481.0950100.01000.0,481.0950100.01000.0
3,48,1.0,950100.0,1000.0,0.0,1.0,0.0,0,1,15.0,5.0,481.0950100.01000.0,481.0950100.01000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3009113,48,507.0,950302.0,5025.0,0.0,0.0,1.0,0,1,13.0,3.0,48507.0950302.05025.0,48135.02900.03007.0
3009114,48,507.0,950302.0,5025.0,1.0,0.0,1.0,0,1,17.0,7.0,48507.0950302.05025.0,48409.010302.02003.0
3009115,48,507.0,950302.0,5026.0,0.0,0.0,1.0,0,1,15.0,5.0,48507.0950302.05026.0,48507.0950302.05026.0
3009116,48,507.0,950302.0,5026.0,0.0,0.0,1.0,0,1,17.0,7.0,48507.0950302.05026.0,4829.0182101.01003.0


In [22]:
np.mean(df.geoid_2010 == df.geoid_2020)  # should be around 23%

0.23014824614662707

# Make gender column, based on BRFSS 2019 SOGI results

In future work, could try to incorporate observation that there is substantial age dependence in these values.  But for now, keep it simple, and work with the crude prevalence rates.  It is already not-that-simple.

Since we want to use the demonstration DHC without re-running it, we first take the reported_sex_2020 column from the `sex` column in the SF1 ReMF (note: this conflates swapping and TDA)

In [23]:
df['reported_sex_2020'] = df.sex

In [24]:
# then initalize a gender for each simulant, calibrated to have unconditional probability from BRFSS
# and also to match the data generation procedure for the reported_sex_2020 column

p_trans_boy = 0.18 / 100
p_trans_girl = 0.22 / 100
p_trans_other = 0.12 / 100
p_cis = 98.08 / 100

# rescale to sum to 100%
p_gender = np.array([p_trans_boy, p_trans_girl, p_trans_other, p_cis])
p_gender /= p_gender.sum()
p_gender

array([0.00182556, 0.00223124, 0.00121704, 0.99472617])

In [25]:
# first initialize gender without distinguishing cis boy and cis girl
# since that matches BRFSS SOGI question
df['gender'] = np.random.choice(['trans_boy', 'trans_girl', 'trans_other', 'cis'], p=p_gender, size=len(df))

In [26]:
# now distinguish cis based on reconstructed sex_id
df.gender = np.where(df.gender == 'cis',
                     df.sex.map({0:'cis_boy', 1:'cis_girl'}),
                     df.gender)

In [27]:
df['trans'] = df.gender.isin(['trans_boy', 'trans_girl', 'trans_other'])

In [28]:
np.round(100 * df.gender.value_counts(normalize=True), 2)

cis_boy        50.99
cis_girl       48.48
trans_girl      0.22
trans_boy       0.18
trans_other     0.12
Name: gender, dtype: float64

In [29]:
np.round(100 * df.gender.value_counts(normalize=True).filter(like='trans').sum(), 2)

0.52

In [30]:
np.round(100 * df.trans.mean(), 2)

0.52

In [31]:
df[df.trans].groupby('gender').reported_sex_2020.value_counts(normalize=True).unstack()

reported_sex_2020,0.0,1.0
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
trans_boy,0.497812,0.502188
trans_girl,0.500673,0.499327
trans_other,0.507739,0.492261


In [32]:
def gender_to_sex_2010(gender, reported_sex_2020):
    # start with values reported in 2020
    sex = reported_sex_2020.copy()
    
    # update the trans_boy and trans_girl entries to be gender assigned at birth
    sex = np.where((gender == 'trans_boy'),
                   1,
                   sex
                  )
    sex = np.where((gender == 'trans_girl'),
                   0,
                   sex
                  )
    return sex


df['reported_sex_2010'] = gender_to_sex_2010(df.gender, df.reported_sex_2020)

In [33]:
df['trans'].mean()

0.005247054202279273

In [34]:
np.mean(df.reported_sex_2010 != df.reported_sex_2020)

0.0020168707298519797

# Values for results section

In [35]:
# Our synthetic population matched the age, sex, race/ethnicity, and geography of Texas
# on census day April 1, 2010, with
# X children ages 0-7 in Y household on census day 2010

n_kids = len(df)  # number of children
f'{n_kids:,.0f}'

'3,009,117'

In [36]:
# number of kids that were in same census block in 2010 and 2020 census
n_stayed = (df.geoid_2010 == df.geoid_2020).sum()
f'{n_stayed:,.0f}'

'692,543'

In [37]:
# number of trans kids that were in same place in 2010 and 2020 census

n_trans_stayed = (df.trans & (df.geoid_2010 == df.geoid_2020)).sum()
f'{n_trans_stayed:,.0f}'

'3,629'

In [38]:
# number of trans kids identified if full census data with names and dob was released

n_trans = df[(df.reported_sex_2010 != df.reported_sex_2020)].pweight.sum()
f'{n_trans:,.0f}'

'6,069'

# number of trans families identified by age, geoid from reconstruction based on TDA-protected DHC


In [39]:
# without reidentification to link on, there is still a risk of identifying a block with a trans kid
# by finding blocks where there was a single kids of a given age in 2010 and that age+10 in 2020 and
# different reported sex
df['reported_male_2010'] = (df.reported_sex_2010 == 0)
df['geoid'] = df.geoid_2010
df['age'] = df.age_2010 + 10 # add ten to make merge easier
g = df.groupby(['geoid', 'age',
                'race', 'eth'])

df_a = pd.DataFrame({'n_simulants': g.pweight.sum()})
df_a['pct_male'] = 100*g.reported_male_2010.mean()
df_a['pct_trans'] = 100*g.trans.mean()
df_a

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,n_simulants,pct_male,pct_trans
geoid,age,race,eth,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
481.0950100.01000.0,11.0,0.0,1.0,1,0.0,0.0
481.0950100.01000.0,13.0,0.0,0.0,1,0.0,0.0
481.0950100.01000.0,13.0,5.0,1.0,2,50.0,0.0
481.0950100.01000.0,15.0,1.0,0.0,2,50.0,0.0
...,...,...,...,...,...,...
4899.010804.03032.0,13.0,1.0,0.0,1,100.0,0.0
4899.010804.03034.0,12.0,7.0,0.0,1,100.0,0.0
4899.010804.03034.0,13.0,0.0,0.0,2,0.0,0.0
4899.0980000.01211.0,13.0,0.0,1.0,1,100.0,0.0


In [40]:
df['reported_male_2020'] = (df.reported_sex_2020 == 0)
df['geoid'] = df.geoid_2020
df['age'] = df.age_2020
df['discordant_sex'] = (df.reported_sex_2010 != df.reported_sex_2020)

g = df.groupby(['geoid', 'age',
                'race', 'eth'])

df_b = pd.DataFrame({'n_simulants': g.pweight.sum()})
df_b['pct_male'] = 100*g.reported_male_2020.mean()
df_b['pct_trans'] = 100*g.trans.mean()
df_b['pct_discordant_sex'] = 100*g.discordant_sex.mean()
n_unique_2020 = sum(df_b.n_simulants == 1)
n_unique_2020

1239637

In [41]:
# dhc['male'] = (dhc.sex == 0)
dhc['male'] = (np.random.uniform(size=len(dhc)) < 0.5) # a) Take the actual DHC, ignore the sex attributes, generate sex randomly, and use that as the "DHC" in the simulation.
# dhc['male'] = True # b) Same as (a) but make everyone  male in DHC
# dhc['male'] = False # c) Same as (a) but make everyone female in DHC

g = dhc.groupby(['geoid', 'age', 'race', 'eth'])
df_c = pd.DataFrame({'n_simulants':g.n.sum()})
df_c['pct_male'] = 100 * g.male.mean()

In [42]:
df_abc = df_a[df_a.n_simulants==1].copy()
df_abc.columns = ['n_simulants_2010', 'pct_male_2010', 'pct_trans_2010']

df_abc['n_simulants_2020'] = df_c[df_c.n_simulants==1].n_simulants
df_abc['pct_male_2020'] = df_c[df_c.n_simulants==1].pct_male
df_abc = df_abc.dropna()

df_abc['pct_trans_2020'] = df_b.pct_trans
df_abc['pct_discordant_sex'] = df_b.pct_discordant_sex
df_abc

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,n_simulants_2010,pct_male_2010,pct_trans_2010,n_simulants_2020,pct_male_2020,pct_trans_2020,pct_discordant_sex
geoid,age,race,eth,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
481.0950100.01001.0,14.0,0.0,0.0,1,0.0,0.0,1.0,100.0,0.0,0.0
481.0950100.01019.0,16.0,0.0,0.0,1,0.0,0.0,1.0,100.0,,
481.0950100.01047.0,15.0,0.0,0.0,1,100.0,0.0,1.0,0.0,,
481.0950100.01050.0,11.0,0.0,0.0,1,0.0,0.0,1.0,100.0,,
...,...,...,...,...,...,...,...,...,...,...
4899.010804.02021.0,11.0,0.0,0.0,1,0.0,0.0,1.0,0.0,0.0,0.0
4899.010804.03013.0,12.0,1.0,0.0,1,0.0,0.0,1.0,100.0,,
4899.010804.03018.0,14.0,1.0,0.0,1,0.0,0.0,1.0,0.0,,
4899.010804.03020.0,13.0,3.0,0.0,1,100.0,0.0,1.0,0.0,0.0,0.0


In [43]:
df_abc[df_abc.pct_male_2010 != df_abc.pct_male_2020].pct_trans_2020.value_counts(dropna=False)

0.000000      23635
NaN           21509
50.000000        73
100.000000       68
              ...  
20.000000        10
16.666667         2
12.500000         1
11.111111         1
Name: pct_trans_2020, Length: 10, dtype: int64

In [44]:
df_abc[df_abc.pct_male_2010 != df_abc.pct_male_2020].pct_discordant_sex.value_counts(dropna=False)

0.000000      23760
NaN           21509
50.000000        37
100.000000       25
              ...  
25.000000         8
20.000000         4
16.666667         1
11.111111         1
Name: pct_discordant_sex, Length: 9, dtype: int64

In [45]:
n_scenario_4 = sum(
    df_abc[df_abc.pct_male_2010 != df_abc.pct_male_2020].pct_trans_2020 > 0
    )
N_scenario_4 = sum(df_abc.pct_male_2010 != df_abc.pct_male_2020)

In [46]:
n_scenario_4, N_scenario_4-n_scenario_4

(215, 45144)

In [47]:
n_scenario_4 / N_scenario_4

0.00473996340307326

In [48]:
# what is the rate of identifying trans kids if you just pick strata at random
t = df.groupby(['geoid', 'age',
                'race', 'eth']).discordant_sex.mean()
(t>0).mean()

0.0033665635091421433

In [49]:
n_scenario_3 = 605
100 * (n_scenario_3 - n_scenario_4) / n_scenario_3  # pct decrease from scenario 3 to 4

64.46280991735537

In [50]:
dhc

Unnamed: 0,state,county,tract,block,age,sex,race,eth,n,geoid,male
0,48,245.0,11302.0,1000.0,17.0,0.0,1.0,0.0,2.0,48245.011302.01000.0,False
1,48,245.0,11302.0,1000.0,18.0,0.0,0.0,0.0,649.0,48245.011302.01000.0,True
2,48,245.0,11302.0,1000.0,18.0,0.0,0.0,1.0,721.0,48245.011302.01000.0,True
3,48,245.0,11302.0,1000.0,18.0,0.0,1.0,0.0,1125.0,48245.011302.01000.0,False
...,...,...,...,...,...,...,...,...,...,...,...
4311,48,161.0,100.0,1246.0,12.0,1.0,0.0,0.0,1.0,48161.00100.01246.0,True
4312,48,303.0,10508.0,1006.0,18.0,1.0,0.0,0.0,1.0,48303.010508.01006.0,False
4313,48,201.0,454100.0,1019.0,3.0,1.0,1.0,0.0,1.0,48201.0454100.01019.0,True
4314,48,337.0,950400.0,2086.0,18.0,0.0,7.0,0.0,1.0,48337.0950400.02086.0,False
