# Balance survey data

Weight participants by sex, HHI, race, and ethnicity

In [40]:
import pandas as pd

In [41]:
# load census data

age_data = pd.read_csv('../data/raw/Age by Nativity.csv')
hhi_data = pd.read_csv('../data/raw/Household Income.csv')
race_data = pd.read_csv('../data/raw/Race and Ethnicity.csv')

In [42]:
race_data.head()

Unnamed: 0,ID Race,Race,ID Ethnicity,Ethnicity,ID Year,Year,Hispanic Population Moe,Geography,ID Geography,Slug Geography,Population,share
0,0,White Alone,0,Not Hispanic or Latino,2019,2019,1314.0,"Somerville, MA",16000US2562535,somerville-ma,55183,0.682063
1,0,White Alone,1,Hispanic or Latino,2019,2019,1152.0,"Somerville, MA",16000US2562535,somerville-ma,6113,0.075557
2,1,Black or African American Alone,0,Not Hispanic or Latino,2019,2019,720.0,"Somerville, MA",16000US2562535,somerville-ma,4534,0.05604
3,1,Black or African American Alone,1,Hispanic or Latino,2019,2019,325.0,"Somerville, MA",16000US2562535,somerville-ma,451,0.005574
4,2,American Indian & Alaska Native Alone,0,Not Hispanic or Latino,2019,2019,81.0,"Somerville, MA",16000US2562535,somerville-ma,112,0.001384


Map race names to match the survey. Names don't have to be identical, but identify the right categories:

white, black, asian, other (inc Native Hawaiian), two_or_more_races

Map ethnicity to hispanic / not hispanic

In [43]:
race_data = race_data[race_data["Year"]==2019]

In [44]:
race_map = {'White Alone': 'race_white',
            'Black or African American Alone': 'race_aa',
            'Asian Alone': 'race_asian',
            'American Indian & Alaska Native Alone': 'race_other',
            'Native Hawaiian & Other Pacific Islander Alone': 'race_other',
            'Some Other Race Alone': 'race_other',
            'Two or More Races': 'race_two_or_more'}

race_data['race_mapped'] = race_data['Race'].map(race_map)

eth_map = {'Hispanic or Latino': 'eth_hispanic',
           'Not Hispanic or Latino': 'eth_not_hispanic'}

race_data['eth_mapped'] = race_data['Ethnicity'].map(eth_map)

# create weights df
race_eth_weights = race_data[['race_mapped', 'eth_mapped', 'share']]

For age data we need to group native born and non native born. We can just sum the populations and shares.

Then we need to decimate 5-17 so we can get just 17 (assume equal age dist in this group), group 55-64, eliminated Under 5, and rebalance to remaining groups.

In [45]:
# filter down to 2019 for age data
age_data = age_data[age_data['Year'] == 2019]

# group by age (aggregate out place of birth)
age_data = age_data.groupby('Age').sum()[['share']]

# divide the 5 - 17 share by the number of years
share_5_17 = age_data.loc['5 to 17 Years', 'share']
share_17  = share_5_17 / (17 - 5)

# group 55 - 64
rows_55_64 = ['55 to 59 Years', '60 & 61 Years', '62 to 64 Years']
share_55_64 = age_data.loc[rows_55_64, 'share'].sum()

# add rows for 17 and 55 - 64
age_data.loc['17 Years'] = share_17
age_data.loc['55 to 64 Years'] = share_55_64

# drop under 5, 5-17, all 55-64 subgroups
age_data.drop(rows_55_64, inplace=True)
age_data.drop(['5 to 17 Years', 'Under 5 Years'], inplace=True)

# rebalance to remaining groups
age_data['share'] = age_data['share'] / sum(age_data['share'])

age_weights = age_data.reset_index()

In [46]:
hhi_data = hhi_data[hhi_data['Year']==2019]
hhi_data = hhi_data[hhi_data['Geography']=='Somerville, MA']

In [47]:
hhi_map = {'< $10,000': 1,
           '$10,000-$14,999': 2,
           '$15,000-$19,999': 2,
           '$20,000-$24,999': 2,
           '$25,000-$29,999': 3, 
           '$30,000-$34,999': 3,
           '$35,000-$39,999': 3, 
           '$40,000-$44,999': 3, 
           '$45,000-$49,999': 3,
           '$50,000-$59,999': 4, 
           '$60,000-$74,999': 4, 
           '$75,000-$99,999': 5,
           '$100,000-$124,999': 6, 
           '$125,000-$149,999': 6, 
           '$150,000-$199,999': 7,
           '$200,000+': 8}

hhi_data['hhi_mapped'] = hhi_data['Household Income Bucket'].map(hhi_map)

hhi_weights = hhi_data.groupby('hhi_mapped').sum()[['share']].reset_index()

In [48]:
# we don't need to import data for gender. Somerville is listed as 50/50 split between male and female. 
# Nonbinary is not accounted for in the census so I'm using a national avg of 10%.
gender_weights = pd.DataFrame({'gender': ['Male', 'Female', 'Nonbinary'], 'share': [.45, .45, .1]})

Create weights table with all combinations of age, race, ethnicity, and hhi

In [50]:
merged_weights.head()

Unnamed: 0,race_mapped,eth_mapped,share_a,Age,share_b,hhi_mapped,share,gender,share_d
0,race_white,eth_not_hispanic,0.682063,18 to 24 Years,0.160068,1,0.04079,Male,0.45
1,race_white,eth_not_hispanic,0.682063,18 to 24 Years,0.160068,1,0.04079,Female,0.45
2,race_white,eth_not_hispanic,0.682063,18 to 24 Years,0.160068,1,0.04079,Nonbinary,0.1
3,race_white,eth_not_hispanic,0.682063,18 to 24 Years,0.160068,2,0.097433,Male,0.45
4,race_white,eth_not_hispanic,0.682063,18 to 24 Years,0.160068,2,0.097433,Female,0.45


In [49]:
# cross join to get all combinations
merged_weights = race_eth_weights.merge(
    age_weights, how='cross', suffixes=['_a', '_b']).merge(
    hhi_weights, how='cross', suffixes=['', '_c']).merge(
    gender_weights, how='cross', suffixes=['', '_d'])

# calculate composite weight (simple product)
merged_weights['population_share'] = (merged_weights['share_a'] 
                                      * merged_weights['share_b'] 
                                      * merged_weights['share_c']
                                      * merged_weights['share_d'])

# cleanup
col_rename_map = {'race_mapped': 'race',
                  'eth_mapped': 'eth',
                  'hhi_mapped': 'hhi',
                  'Age': 'age',
                   }

merged_weights.rename(columns=col_rename_map, inplace=True)

merged_weights.set_index(['gender', 'race', 'eth', 'age', 'hhi'], inplace=True)


KeyError: 'share_c'

In [None]:
merged_weights.head()

In [10]:
# save
cols = ['population_share']
merged_weights[cols].to_csv('../data/processed/population_weights.csv', index=True)

In [11]:
merged_weights.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,share_x,share_y,share,population_share
race,eth,age,hhi,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
race_white,eth_not_hispanic,18 to 24 Years,1,0.682063,0.160068,0.04079,0.004453
race_white,eth_not_hispanic,18 to 24 Years,2,0.682063,0.160068,0.097433,0.010637
race_white,eth_not_hispanic,18 to 24 Years,3,0.682063,0.160068,0.108439,0.011839
race_white,eth_not_hispanic,18 to 24 Years,4,0.682063,0.160068,0.126822,0.013846
race_white,eth_not_hispanic,18 to 24 Years,5,0.682063,0.160068,0.139077,0.015184
