# Balance survey data

Load census data and get into a format where we can do population weighting by gender, HHI, race, and ethnicity.

We're using the generalized survey raking protocol, so we need an array of categories with population counts.

In [1]:
import pandas as pd

In [2]:
# load census data

age_data = pd.read_csv('../data/raw/Age by Nativity.csv')
hhi_data = pd.read_csv('../data/raw/Household Income.csv')
race_data = pd.read_csv('../data/raw/Race and Ethnicity.csv')

In [3]:
race_data.head()

Unnamed: 0,ID Race,Race,ID Ethnicity,Ethnicity,ID Year,Year,Hispanic Population Moe,Geography,ID Geography,Slug Geography,Population,share
0,0,White Alone,0,Not Hispanic or Latino,2019,2019,1314.0,"Somerville, MA",16000US2562535,somerville-ma,55183,0.682063
1,0,White Alone,1,Hispanic or Latino,2019,2019,1152.0,"Somerville, MA",16000US2562535,somerville-ma,6113,0.075557
2,1,Black or African American Alone,0,Not Hispanic or Latino,2019,2019,720.0,"Somerville, MA",16000US2562535,somerville-ma,4534,0.05604
3,1,Black or African American Alone,1,Hispanic or Latino,2019,2019,325.0,"Somerville, MA",16000US2562535,somerville-ma,451,0.005574
4,2,American Indian & Alaska Native Alone,0,Not Hispanic or Latino,2019,2019,81.0,"Somerville, MA",16000US2562535,somerville-ma,112,0.001384


Map race names to match the survey. Names don't have to be identical, but identify the right categories:

white, black, asian, other (inc Native Hawaiian), two_or_more_races

Map ethnicity to hispanic / not hispanic

In [5]:
# Race data

race_data = race_data[race_data["Year"]==2019]

In [48]:
race_map = {'White Alone': 'race_white',
            'Black or African American Alone': 'race_aa',
            'Asian Alone': 'race_asian',
            'American Indian & Alaska Native Alone': 'race_other',
            'Native Hawaiian & Other Pacific Islander Alone': 'race_other',
            'Some Other Race Alone': 'race_other',
            'Two or More Races': 'race_two_or_more'}

race_data['race_mapped'] = race_data['Race'].map(race_map)

eth_map = {'Hispanic or Latino': 'eth_hispanic',
           'Not Hispanic or Latino': 'eth_not_hispanic'}

race_data['eth_mapped'] = race_data['Ethnicity'].map(eth_map)

# create weights df
race_eth_weights = race_data[['race_mapped', 'eth_mapped', 'share']]

For age data we need to group native born and non native born. We can just sum the populations and shares.

Then we need to decimate 5-17 so we can get just 17 (assume equal age dist in this group), group 55-64, eliminated Under 5, and rebalance to remaining groups.

In [49]:
# filter down to 2019 for age data
age_data = age_data[age_data['Year'] == 2019]

# group by age (aggregate out place of birth)
age_data = age_data.groupby('Age').sum()[['share']]

# divide the 5 - 17 share by the number of years
share_5_17 = age_data.loc['5 to 17 Years', 'share']
share_17  = share_5_17 / (17 - 5)

# group 55 - 64
rows_55_64 = ['55 to 59 Years', '60 & 61 Years', '62 to 64 Years']
share_55_64 = age_data.loc[rows_55_64, 'share'].sum()

# add rows for 17 and 55 - 64
age_data.loc['17 Years'] = share_17
age_data.loc['55 to 64 Years'] = share_55_64

# drop under 5, 5-17, all 55-64 subgroups
age_data.drop(rows_55_64, inplace=True)
age_data.drop(['5 to 17 Years', 'Under 5 Years'], inplace=True)

# rebalance to remaining groups
age_data['share'] = age_data['share'] / sum(age_data['share'])

age_weights = age_data.reset_index()

In [50]:
age_data

Unnamed: 0_level_0,share
Age,Unnamed: 1_level_1
18 to 24 Years,0.160068
25 to 34 Years,0.373979
35 to 44 Years,0.161862
45 to 54 Years,0.098855
65 to 74 Years,0.058014
75 Years & Over,0.043441
17 Years,0.007122
55 to 64 Years,0.096658


In [51]:
hhi_data = hhi_data[hhi_data['Year']==2019]
hhi_data = hhi_data[hhi_data['Geography']=='Somerville, MA']

In [52]:
hhi_map = {'< $10,000': 'hhi_1',
           '$10,000-$14,999': 'hhi_2',
           '$15,000-$19,999': 'hhi_2',
           '$20,000-$24,999': 'hhi_2',
           '$25,000-$29,999': 'hhi_3', 
           '$30,000-$34,999': 'hhi_3',
           '$35,000-$39,999': 'hhi_3', 
           '$40,000-$44,999': 'hhi_3', 
           '$45,000-$49,999': 'hhi_3',
           '$50,000-$59,999': 'hhi_4', 
           '$60,000-$74,999': 'hhi_4', 
           '$75,000-$99,999': 'hhi_5',
           '$100,000-$124,999': 'hhi_6', 
           '$125,000-$149,999': 'hhi_6', 
           '$150,000-$199,999': 'hhi_7',
           '$200,000+': 'hhi_8'}

hhi_data['hhi_mapped'] = hhi_data['Household Income Bucket'].map(hhi_map)

hhi_weights = hhi_data.groupby('hhi_mapped').sum()[['share']].reset_index()

In [53]:
# we don't need to import data for gender. Somerville is listed as 50/50 split between male and female. 
# Nonbinary is not accounted for in the census so I'm using a national avg of 10%.
gender_weights = pd.DataFrame({'gender': ['gender_Male', 'gender_Female', 'gender_Nonbinary'], 'share': [.45, .45, .1]})

For the generalized raking methodology we need to export this as a vector of counts for each group of interest.


In [57]:
hhi_t = hhi_weights.set_index('hhi_mapped').T
gender_t = gender_weights.set_index('gender').T
age_t = age_weights.set_index('Age').T
# we can do better than marginal values for race and ethnicity since we have the breakdown for Somerville.
race_eth_t = race_eth_weights.set_index(['race_mapped', 'eth_mapped']).T

In [58]:
target_population = pd.concat([hhi_t, gender_t, race_eth_t, age_t], axis=1) * 75000

In [59]:
target_population.T.to_csv('../data/processed/target_populations.csv', index=True)

In [61]:
target_population.T

Unnamed: 0,share
hhi_1,3059.264679
hhi_2,7307.481251
hhi_3,8132.888239
hhi_4,9511.615145
hhi_5,10430.766417
hhi_6,15904.518017
hhi_7,9820.285348
hhi_8,10833.180904
gender_Male,33750.0
gender_Female,33750.0


Create weights table with all combinations of age, race, ethnicity, and hhi

In [13]:
# cross join to get all combinations
merged_weights = race_eth_weights.merge(
    age_weights, how='cross', suffixes=['_a', '_b']).merge(
    # hhi_weights, how='cross', suffixes=['', '']).merge(
    gender_weights, how='cross', suffixes=['_c'])

# calculate composite weight (simple product)
merged_weights['population_share'] = (merged_weights['share_a'] 
                                      * merged_weights['share_b'] 
                                      # * merged_weights['share']
                                      * merged_weights['share'])

# cleanup
col_rename_map = {'race_mapped': 'race',
                  'eth_mapped': 'eth',
                  # 'hhi_mapped': 'hhi',
                  'Age': 'age',
                   }

merged_weights.rename(columns=col_rename_map, inplace=True)

merged_weights.set_index(['gender', 
                          'race', 
                          'eth', 
                          'age', 
                          #'hhi',
                         ], inplace=True)

merged_weights = merged_weights[['population_share']]

In [14]:
merged_weights.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,population_share
gender,race,eth,age,Unnamed: 4_level_1
Male,race_white,eth_not_hispanic,18 to 24 Years,0.049129
Female,race_white,eth_not_hispanic,18 to 24 Years,0.049129
Nonbinary,race_white,eth_not_hispanic,18 to 24 Years,0.010918
Male,race_white,eth_not_hispanic,25 to 34 Years,0.114785
Female,race_white,eth_not_hispanic,25 to 34 Years,0.114785


### How do we deal with people who didn't answer some or all of the demographics questions?

We can just apply weights based on the questions they did answer. Practically, we need to make aggregated versions of the above where we group by all but N of the indices.

In [12]:
df = merged_weights.reset_index()

col_combos = [['age'], ['gender'], ['eth'], ['race'], 
              ['age', 'gender'], ['age', 'eth'], ['age', 'race'],
              ['gender', 'eth'], ['gender', 'race'],
              ['eth', 'race'], 
              ['age', 'gender', 'eth'],
              ['age', 'gender', 'race'],
              ['gender', 'eth', 'race'],
              ['gender', 'age', 'eth', 'race']]


def reweight_unanswered_cols(df, cols):
    
    all_cols = ['age', 'gender', 'eth', 'race']
    gb_cols = list(set(all_cols) - set(cols))
    
    if len(gb_cols) < 1:
        df['dummy_col'] = 1
        gb_cols = ['dummy_col']
        
    # reweight
    df_out = df.groupby(gb_cols).sum().reset_index()
    
    # label as anonymous
    df_out[cols] = "No Answer"
    
    if 'dummy_col' in df_out.columns:
        df_out = df_out.drop('dummy_col', axis=1)
 
    return df_out

# do all grouping and concat into one dataframe
anon_dfs = []
for cc in col_combos:
    anon_dfs.append(reweight_unanswered_cols(df, cc))

anon_df = pd.concat(anon_dfs)
anon_df.set_index(['gender', 'race', 'eth', 'age'], inplace=True)

NameError: name 'merged_weights' is not defined

In [109]:
# join df and anon_df
all_weights = pd.concat([merged_weights, anon_df], axis=0)

In [110]:
# save
all_weights.to_csv('../data/processed/population_weights.csv', index=True)

In [112]:
all_weights

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,population_share
gender,race,eth,age,Unnamed: 4_level_1
Male,race_white,eth_not_hispanic,18 to 24 Years,0.049129
Female,race_white,eth_not_hispanic,18 to 24 Years,0.049129
Nonbinary,race_white,eth_not_hispanic,18 to 24 Years,0.010918
Male,race_white,eth_not_hispanic,25 to 34 Years,0.114785
Female,race_white,eth_not_hispanic,25 to 34 Years,0.114785
...,...,...,...,...
No Answer,No Answer,No Answer,45 to 54 Years,0.098855
No Answer,No Answer,No Answer,55 to 64 Years,0.096658
No Answer,No Answer,No Answer,65 to 74 Years,0.058014
No Answer,No Answer,No Answer,75 Years & Over,0.043441


In [113]:
all_weights.sum()

population_share    15.0
dtype: float64