# Balance survey data

Weight participants by sex, HHI, race, and ethnicity

In [1]:
import pandas as pd

In [2]:
# load census data

age_data = pd.read_csv('../data/raw/Age by Nativity.csv')
hhi_data = pd.read_csv('../data/raw/Household Income.csv')
race_data = pd.read_csv('../data/raw/Race and Ethnicity.csv')

In [3]:
race_data.head()

Unnamed: 0,ID Race,Race,ID Ethnicity,Ethnicity,ID Year,Year,Hispanic Population Moe,Geography,ID Geography,Slug Geography,Population,share
0,0,White Alone,0,Not Hispanic or Latino,2019,2019,1314.0,"Somerville, MA",16000US2562535,somerville-ma,55183,0.682063
1,0,White Alone,1,Hispanic or Latino,2019,2019,1152.0,"Somerville, MA",16000US2562535,somerville-ma,6113,0.075557
2,1,Black or African American Alone,0,Not Hispanic or Latino,2019,2019,720.0,"Somerville, MA",16000US2562535,somerville-ma,4534,0.05604
3,1,Black or African American Alone,1,Hispanic or Latino,2019,2019,325.0,"Somerville, MA",16000US2562535,somerville-ma,451,0.005574
4,2,American Indian & Alaska Native Alone,0,Not Hispanic or Latino,2019,2019,81.0,"Somerville, MA",16000US2562535,somerville-ma,112,0.001384


Map race names to match the survey. Names don't have to be identical, but identify the right categories:

white, black, asian, other (inc Native Hawaiian), two_or_more_races

Map ethnicity to hispanic / not hispanic

In [4]:
race_data = race_data[race_data["Year"]==2019]

In [5]:
race_map = {'White Alone': 'race_white',
            'Black or African American Alone': 'race_aa',
            'Asian Alone': 'race_asian',
            'American Indian & Alaska Native Alone': 'race_other',
            'Native Hawaiian & Other Pacific Islander Alone': 'race_other',
            'Some Other Race Alone': 'race_other',
            'Two or More Races': 'race_two_or_more'}

race_data['race_mapped'] = race_data['Race'].map(race_map)

eth_map = {'Hispanic or Latino': 'eth_hispanic',
           'Not Hispanic or Latino': 'eth_not_hispanic'}

race_data['eth_mapped'] = race_data['Ethnicity'].map(eth_map)

# create weights df
race_eth_weights = race_data[['race_mapped', 'eth_mapped', 'share']]

For age data we need to group native born and non native born. We can just sum the populations and shares.

Then we need to decimate 5-17 so we can get just 17 (assume equal age dist in this group), group 55-64, eliminated Under 5, and rebalance to remaining groups.

In [6]:
# filter down to 2019 for age data
age_data = age_data[age_data['Year'] == 2019]

# group by age (aggregate out place of birth)
age_data = age_data.groupby('Age').sum()[['share']]

# divide the 5 - 17 share by the number of years
share_5_17 = age_data.loc['5 to 17 Years', 'share']
share_17  = share_5_17 / (17 - 5)

# group 55 - 64
rows_55_64 = ['55 to 59 Years', '60 & 61 Years', '62 to 64 Years']
share_55_64 = age_data.loc[rows_55_64, 'share'].sum()

# add rows for 17 and 55 - 64
age_data.loc['17 Years'] = share_17
age_data.loc['55 to 64 Years'] = share_55_64

# drop under 5, 5-17, all 55-64 subgroups
age_data.drop(rows_55_64, inplace=True)
age_data.drop(['5 to 17 Years', 'Under 5 Years'], inplace=True)

# rebalance to remaining groups
age_data['share'] = age_data['share'] / sum(age_data['share'])

age_weights = age_data.reset_index()

In [7]:
hhi_data = hhi_data[hhi_data['Year']==2019]
hhi_data = hhi_data[hhi_data['Geography']=='Somerville, MA']

In [8]:
hhi_map = {'< $10,000': 1,
           '$10,000-$14,999': 2,
           '$15,000-$19,999': 2,
           '$20,000-$24,999': 2,
           '$25,000-$29,999': 3, 
           '$30,000-$34,999': 3,
           '$35,000-$39,999': 3, 
           '$40,000-$44,999': 3, 
           '$45,000-$49,999': 3,
           '$50,000-$59,999': 4, 
           '$60,000-$74,999': 4, 
           '$75,000-$99,999': 5,
           '$100,000-$124,999': 6, 
           '$125,000-$149,999': 6, 
           '$150,000-$199,999': 7,
           '$200,000+': 8}

hhi_data['hhi_mapped'] = hhi_data['Household Income Bucket'].map(hhi_map)

hhi_weights = hhi_data.groupby('hhi_mapped').sum()[['share']].reset_index()

In [9]:
# we don't need to import data for gender. Somerville is listed as 50/50 split between male and female. 
# Nonbinary is not accounted for in the census so I'm using a national avg of 10%.
gender_weights = pd.DataFrame({'gender': ['Male', 'Female', 'Nonbinary'], 'share': [.45, .45, .1]})

Create weights table with all combinations of age, race, ethnicity, and hhi

In [18]:
merged_weights.columns

Index(['race_mapped', 'eth_mapped', 'share_a', 'Age', 'share_b', 'gender',
       'share'],
      dtype='object')

In [19]:
# cross join to get all combinations
merged_weights = race_eth_weights.merge(
    age_weights, how='cross', suffixes=['_a', '_b']).merge(
    # hhi_weights, how='cross', suffixes=['', '']).merge(
    gender_weights, how='cross', suffixes=['_c'])

# calculate composite weight (simple product)
merged_weights['population_share'] = (merged_weights['share_a'] 
                                      * merged_weights['share_b'] 
                                      # * merged_weights['share']
                                      * merged_weights['share'])

# cleanup
col_rename_map = {'race_mapped': 'race',
                  'eth_mapped': 'eth',
                  # 'hhi_mapped': 'hhi',
                  'Age': 'age',
                   }

merged_weights.rename(columns=col_rename_map, inplace=True)

merged_weights.set_index(['gender', 
                          'race', 
                          'eth', 
                          'age', 
                          #'hhi',
                         ], inplace=True)


In [20]:
# save
cols = ['population_share']
merged_weights[cols].to_csv('../data/processed/population_weights.csv', index=True)

In [21]:
len(merged_weights)

336

In [22]:
merged_weights[cols].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,population_share
gender,race,eth,age,Unnamed: 4_level_1
Male,race_white,eth_not_hispanic,18 to 24 Years,0.049129
Female,race_white,eth_not_hispanic,18 to 24 Years,0.049129
Nonbinary,race_white,eth_not_hispanic,18 to 24 Years,0.010918
Male,race_white,eth_not_hispanic,25 to 34 Years,0.114785
Female,race_white,eth_not_hispanic,25 to 34 Years,0.114785


### How do we deal with people who didn't answer some or all of the demographics questions?

We can just apply weights based on the questions they did answer. Practically, we need to make aggregated versions of the above where we group by all but N of the indices.

In [46]:
df = merged_weights.reset_index()

# all unanswered
share_no_info = df
share_no_info['gbcol'] = 1
share_no_info = share_no_info.groupby('gbcol').sum().reset_index()
share_no_info[['gender', 'age', 'eth', 'race']] = "No Answer"
share_no_info.drop('gbcol', axis=1, inplace=True)

# gender
share_no_gender = df.groupby(['race', 'eth', 'age']).sum().reset_index()
share_no_gender['gender'] = 'No Answer'

# race
share_no_race = df.groupby(['eth', 'age', 'gender']).sum().reset_index()
share_no_race['race'] = 'No Answer'

# ethnnicity
share_no_eth = df.groupby(['race', 'age', 'gender']).sum().reset_index()
share_no_eth['eth'] = 'No Answer'

# age
share_no_age = df.groupby(['race', 'eth', 'gender']).sum().reset_index()
share_no_age['age'] = 'No Answer'

# gender and race
share_no_gender_race = df.groupby(['eth', 'age']).sum().reset_index()
share_no_gender_race[['gender', 'race']] = 'No Answer'

# gender and ethnicity
share_no_gender_race = df.groupby(['gender', 'eth']).sum().reset_index()
share_no_gender_race[['gender', 'eth']] = 'No Answer'

In [47]:
share_no_gender_race

Unnamed: 0,eth,age,share_a,share_b,share,population_share,gbcol,gender,race
0,eth_hispanic,17 Years,0.370615,0.149562,7.0,0.00088,21,No Answer,No Answer
1,eth_hispanic,18 to 24 Years,0.370615,3.361429,7.0,0.019775,21,No Answer,No Answer
2,eth_hispanic,25 to 34 Years,0.370615,7.853555,7.0,0.046201,21,No Answer,No Answer
3,eth_hispanic,35 to 44 Years,0.370615,3.399099,7.0,0.019996,21,No Answer,No Answer
4,eth_hispanic,45 to 54 Years,0.370615,2.075962,7.0,0.012212,21,No Answer,No Answer
5,eth_hispanic,55 to 64 Years,0.370615,2.029823,7.0,0.011941,21,No Answer,No Answer
6,eth_hispanic,65 to 74 Years,0.370615,1.218303,7.0,0.007167,21,No Answer,No Answer
7,eth_hispanic,75 Years & Over,0.370615,0.912267,7.0,0.005367,21,No Answer,No Answer
8,eth_not_hispanic,17 Years,2.629385,0.149562,7.0,0.006242,21,No Answer,No Answer
9,eth_not_hispanic,18 to 24 Years,2.629385,3.361429,7.0,0.140293,21,No Answer,No Answer


In [27]:
a = ['race', 'eth', 'age',]
a.remove()
a

ValueError: list.remove(x): x not in list

In [61]:
share_no_gender

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,share_a,share_b,share,share_c,population_share,gender
race,eth,age,hhi,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
race_aa,eth_hispanic,17 Years,1,0.016723,0.021366,0.122371,1.0,0.000002,No Answer
race_aa,eth_hispanic,17 Years,2,0.016723,0.021366,0.292299,1.0,0.000004,No Answer
race_aa,eth_hispanic,17 Years,3,0.016723,0.021366,0.325316,1.0,0.000004,No Answer
race_aa,eth_hispanic,17 Years,4,0.016723,0.021366,0.380465,1.0,0.000005,No Answer
race_aa,eth_hispanic,17 Years,5,0.016723,0.021366,0.417231,1.0,0.000006,No Answer
...,...,...,...,...,...,...,...,...,...
race_white,eth_not_hispanic,75 Years & Over,4,2.046189,0.130324,0.380465,1.0,0.003758,No Answer
race_white,eth_not_hispanic,75 Years & Over,5,2.046189,0.130324,0.417231,1.0,0.004121,No Answer
race_white,eth_not_hispanic,75 Years & Over,6,2.046189,0.130324,0.636181,1.0,0.006283,No Answer
race_white,eth_not_hispanic,75 Years & Over,7,2.046189,0.130324,0.392811,1.0,0.003880,No Answer
