# Preprocess census data

Load census data and create hooks for population balancing

In [12]:
import pandas as pd

In [13]:
# load census data

# Read the three raw CSV extracts (age by nativity, household income,
# race and ethnicity) in one pass and bind each to its own frame.
age_data, hhi_data, race_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/raw/Age by Nativity.csv',
        '../data/raw/Household Income.csv',
        '../data/raw/Race and Ethnicity.csv',
    )
)

In [14]:
# Preview the first rows of the race/ethnicity table to check its columns and values.
race_data.head()

Unnamed: 0,ID Race,Race,ID Ethnicity,Ethnicity,ID Year,Year,Hispanic Population Moe,Geography,ID Geography,Slug Geography,Population,share
0,0,White Alone,0,Not Hispanic or Latino,2019,2019,1314.0,"Somerville, MA",16000US2562535,somerville-ma,55183,0.682063
1,0,White Alone,1,Hispanic or Latino,2019,2019,1152.0,"Somerville, MA",16000US2562535,somerville-ma,6113,0.075557
2,1,Black or African American Alone,0,Not Hispanic or Latino,2019,2019,720.0,"Somerville, MA",16000US2562535,somerville-ma,4534,0.05604
3,1,Black or African American Alone,1,Hispanic or Latino,2019,2019,325.0,"Somerville, MA",16000US2562535,somerville-ma,451,0.005574
4,2,American Indian & Alaska Native Alone,0,Not Hispanic or Latino,2019,2019,81.0,"Somerville, MA",16000US2562535,somerville-ma,112,0.001384


Map the census race names onto the survey's race categories. The labels don't have to be identical to the survey's, but each one must identify the right category:

white, black, asian, other (inc Native Hawaiian), two_or_more_races

Map ethnicity to hispanic / not hispanic

In [15]:
# Race

# Keep only the 2019 rows. The explicit .copy() makes race_data an independent
# frame, so the column assignments below do not trigger SettingWithCopyWarning
# or depend on whether the filter returned a view or a copy.
# NOTE(review): unlike the household-income cell, no Geography filter is applied
# here -- confirm the raw file only contains Somerville rows.
race_data = race_data[race_data["Year"] == 2019].copy()

# change value names
# Three census categories are folded into the single 'other' bucket.
race_map = {'White Alone': 'white',
            'Black or African American Alone': 'aa',
            'Asian Alone': 'asian',
            'American Indian & Alaska Native Alone': 'other',
            'Native Hawaiian & Other Pacific Islander Alone': 'other',
            'Some Other Race Alone': 'other',
            'Two or More Races': 'two_or_more'}

# Any Race value missing from race_map becomes NaN here and is then dropped
# by the groupby below (groupby excludes NaN keys by default).
race_data['race_mapped'] = race_data['Race'].map(race_map)

# Ethnicity

# change value names
eth_map = {'Hispanic or Latino': 'hispanic',
           'Not Hispanic or Latino': 'non-hispanic'}

race_data['eth_mapped'] = race_data['Ethnicity'].map(eth_map)


# create weights df
# One row per (race, ethnicity) pair; shares of census categories that were
# mapped to the same pair are summed.
race_eth_weights = race_data[['race_mapped', 'eth_mapped', 'share']]
race_eth_weights = race_eth_weights.groupby(['race_mapped', 'eth_mapped']).sum().reset_index()

For age data we need to group native born and non native born. We can just sum the populations and shares.

Then we need to split the 5-17 group so we can isolate age 17 (assuming ages are evenly distributed within this group), combine the 55-64 subgroups into one group, eliminate Under 5, and rebalance the shares across the remaining groups.

In [16]:
# Age

# NOTE: this cell rebinds age_data to the aggregated frame, so re-run the load
# cell before re-running this one (the 'Year' column is gone afterwards).
# NOTE(review): unlike the household-income cell, no Geography filter is applied
# here -- confirm the raw file only contains Somerville rows.

# filter down to 2019 for age data
age_data = age_data[age_data['Year'] == 2019]

# group by age (aggregate out place of birth)
# Select 'share' before summing so the non-numeric columns are never summed.
age_data = age_data.groupby('Age')[['share']].sum()

# divide the 5 - 17 share by the number of years
# "5 to 17 Years" is inclusive on both ends, i.e. 13 single-year ages
# (5, 6, ..., 17), so the divisor is 17 - 5 + 1, not 17 - 5.
share_5_17 = age_data.loc['5 to 17 Years', 'share']
share_17 = share_5_17 / (17 - 5 + 1)

# group 55 - 64
rows_55_64 = ['55 to 59 Years', '60 & 61 Years', '62 to 64 Years']
share_55_64 = age_data.loc[rows_55_64, 'share'].sum()

# add rows for 17 and 55 - 64 (appended after the existing age groups)
age_data.loc['17 Years'] = share_17
age_data.loc['55 to 64 Years'] = share_55_64

# drop under 5, 5-17, all 55-64 subgroups
age_data = age_data.drop(rows_55_64 + ['5 to 17 Years', 'Under 5 Years'])

# rebalance to remaining groups so the kept shares sum to 1 again
age_data['share'] = age_data['share'] / age_data['share'].sum()

age_weights = age_data.reset_index()

In [17]:
# Household income

# Restrict to 2019 and to Somerville in a single filter. The explicit .copy()
# makes hhi_data an independent frame, so adding 'hhi_mapped' below does not
# trigger SettingWithCopyWarning.
hhi_data = hhi_data[
    (hhi_data['Year'] == 2019) & (hhi_data['Geography'] == 'Somerville, MA')
].copy()

# Collapse the 16 census income buckets into 8 coarser groups (hhi_1 .. hhi_8).
hhi_map = {'< $10,000': 'hhi_1',
           '$10,000-$14,999': 'hhi_2',
           '$15,000-$19,999': 'hhi_2',
           '$20,000-$24,999': 'hhi_2',
           '$25,000-$29,999': 'hhi_3',
           '$30,000-$34,999': 'hhi_3',
           '$35,000-$39,999': 'hhi_3',
           '$40,000-$44,999': 'hhi_3',
           '$45,000-$49,999': 'hhi_3',
           '$50,000-$59,999': 'hhi_4',
           '$60,000-$74,999': 'hhi_4',
           '$75,000-$99,999': 'hhi_5',
           '$100,000-$124,999': 'hhi_6',
           '$125,000-$149,999': 'hhi_6',
           '$150,000-$199,999': 'hhi_7',
           '$200,000+': 'hhi_8'}

# Buckets missing from hhi_map become NaN and are dropped by the groupby below.
hhi_data['hhi_mapped'] = hhi_data['Household Income Bucket'].map(hhi_map)

# Sum the share per mapped group. 'share' is selected before summing so the
# non-numeric columns are never summed.
hhi_weights = hhi_data.groupby('hhi_mapped')[['share']].sum().reset_index()

In [18]:
# No census import is needed for gender: Somerville is listed as a 50/50 split
# between male and female. Nonbinary is not accounted for in the census, so a
# national average of 0.5% is used and the remainder is split evenly.
gender_weights = pd.DataFrame(
    [
        ('gender_Male', .4975),
        ('gender_Female', .4975),
        ('gender_Nonbinary', .005),
    ],
    columns=['gender', 'share'],
)

In [19]:
# transpose and concatenate

# Turn each weights table into a single-row frame (row label 'share') whose
# columns are the demographic categories.
hhi_t = hhi_weights.set_index('hhi_mapped').T
gender_t = gender_weights.set_index('gender').T
age_t = age_weights.set_index('Age').T

# Prefix the age labels so they are recognisable once everything is combined.
age_t.columns = ['age_' + age_label for age_label in age_t.columns]

# we can do better than marginal values for race and ethnicity since we have the breakdown for Somerville.
race_eth_t = race_eth_weights.set_index(['race_mapped', 'eth_mapped']).T
race_eth_t.columns = [
    'race_ethnicity_({0}, {1})'.format(race_label, eth_label)
    for race_label, eth_label in race_eth_t.columns
]

# Put all shares side by side, scale them by 75000 and truncate to whole
# counts (astype(int) drops the fractional part rather than rounding).
all_shares = pd.concat([hhi_t, gender_t, race_eth_t, age_t], axis=1)
target_population = (all_shares * 75000).T.astype(int)

# Long format: one row per demographic category with its target count.
target_population = target_population.reset_index()
target_population = target_population.rename(columns={'index': 'demo', 'share': 'count'})

In [20]:
# save
# Write the target counts to the processed-data folder; index=False keeps the
# positional row index out of the CSV.
target_population.to_csv('../data/processed/02_target_populations.csv', index=False)

In [21]:
# Display the final table of target counts per demographic category.
target_population

Unnamed: 0,demo,count
0,hhi_1,3059
1,hhi_2,7307
2,hhi_3,8132
3,hhi_4,9511
4,hhi_5,10430
5,hhi_6,15904
6,hhi_7,9820
7,hhi_8,10833
8,gender_Male,37312
9,gender_Female,37312
