# Balance survey data
Assign a weight to each survey respondent such that the weighted sample is balanced on gender, age, household income (HHI), race, and ethnicity with respect to the census.

1. Load data
2. Impute missing demo values based on baseline frequencies
3. Create population hooks in the survey data
4. Create an axis with all possible demographic categories
5. Join survey data and population targets to that axis, fill with 0's
6. Find weights via matrix inversion
7. Export weights, demo hooks

In [121]:
import numpy as np
import pandas as pd


In [122]:
# Load the filtered survey responses and the population target counts.
survey_path = '../data/intermediate/01_filtered_renamed_survey_data.csv'
targets_path = '../data/intermediate/02_target_populations.csv'

survey_data = pd.read_csv(survey_path)
population_targets = pd.read_csv(targets_path)


## Create population hooks in survey data

In [123]:
# One row per respondent, keyed by the survey id; the demographic hook
# columns are added to this frame in the cells below.
hooks = pd.DataFrame({'id': survey_data['id']})

In [124]:
# Gender

def map_gender(v):
    """Collapse a raw gender response into 'Female', 'Male' or 'Nonbinary'.

    The survey has an open-entry option; any answer other than 'Female' or
    'Male' is mapped to 'Nonbinary'.

    Parameters
    ----------
    v : str, float or None
        Raw response for one respondent.

    Returns
    -------
    str, float or None
        'Female', 'Male' or 'Nonbinary'. A missing response (None or NaN) is
        returned unchanged so it can still be detected as null afterwards.
    """
    # pd.isnull recognises None and every float NaN. A tuple-membership test
    # against np.nan only matches the np.nan object itself (NaN != NaN), which
    # would label other missing values as 'Nonbinary'.
    if pd.isnull(v):
        return v
    if v in ('Female', 'Male'):
        return v
    return 'Nonbinary'


# Gender hook: raw response collapsed to Female / Male / Nonbinary.
hooks['gender'] = survey_data['d01_gender'].apply(map_gender)


In [125]:
# Age

def map_age(v):
    """Map a numeric age to its age-bucket label.

    Parameters
    ----------
    v : int or float
        Age in years; may be NaN/None when the respondent gave no age.

    Returns
    -------
    str or float
        One of "17 Years", "18 to 24 Years", ..., "75 Years & Over", or NaN
        when the age is missing.
    """
    # NaN fails every comparison below, so without this guard a missing age
    # would fall through to "75 Years & Over" and could never be imputed.
    if pd.isnull(v):
        return np.nan

    if v == 17:
        return "17 Years"

    # Inclusive upper bound of each bucket, checked in ascending order.
    # NOTE(review): ages below 17 land in "18 to 24 Years" (unchanged
    # behaviour) -- presumably filtered out upstream; confirm.
    upper_bounds = (
        (24, "18 to 24 Years"),
        (34, "25 to 34 Years"),
        (44, "35 to 44 Years"),
        (54, "45 to 54 Years"),
        (64, "55 to 64 Years"),
        (74, "65 to 74 Years"),
    )
    for upper, label in upper_bounds:
        if v <= upper:
            return label
    return "75 Years & Over"
    


# Age hook: coerce the raw age to a number, then bucket it for balancing.
respondent_age = pd.to_numeric(survey_data['d02_age'])
hooks['age'] = respondent_age.apply(map_age)


In [126]:
# Household income. Map to buckets for population balancing.

# Survey answer labels in ascending income order; a label's 1-based position
# in this list is its bucket code (stored as a string).
# NOTE(review): two labels have no '$' before the upper bound -- presumably
# they mirror the survey export verbatim; confirm before "fixing" them.
hhi_labels = [
    'Less than $10,000',
    '$10,000 to $24,999',
    '$25,000 to $49,999',
    '$50,000 to 74,999',
    '$75,000 to $99,999',
    '$100,000 to $149,999',
    '$150,000 to 200,000',
    '$200,000 or more',
]
hhi_map = {label: str(code) for code, label in enumerate(hhi_labels, start=1)}

# Translate each income answer to its bucket code. Answers that are missing
# or are not a key of hhi_map come out as NaN.
hooks['hhi'] = survey_data['d08_hhi'].map(hhi_map)


In [127]:
# Race and Ethnicity

# Hispanic / Non-Hispanic ethnicity has to be extracted from the Race response.

# Lower-case the response once so the substring searches are case-insensitive.
race_response = survey_data['d04_race'].str.lower()

# A response containing any of these terms is labelled 'hispanic'.
# NOTE(review): this is a plain substring search, so an answer such as
# "non-hispanic" would also match -- confirm no such answers occur.
searchfor = ['hispanic', 'puerto rican']
hispanic_pattern = '|'.join(searchfor)
hispanic_flag = pd.to_numeric(race_response.str.contains(hispanic_pattern) * 1)

ethnicity_labels = {0: 'non-hispanic', 1: 'hispanic'}
hooks['ethnicity'] = hispanic_flag.map(ethnicity_labels)


# People can have multiple responses for race, and we want to keep track of "more than one race".
# So as an intermediate step we break out a 0/1 indicator for each race in (white, aa, asian, other).


# Break out white, black, asian.
# 'caucasian' contains the substring 'asian', so a bare search for 'asian'
# flags a Caucasian respondent as Asian. The negative look-behind excludes
# exactly that case, and 'caucasian' is counted as white instead.
race_data = pd.DataFrame()
race_data['race_white'] = pd.to_numeric(race_response.str.contains('white|caucasian') * 1)
race_data['race_aa'] = pd.to_numeric(race_response.str.contains('black') * 1)
race_data['race_asian'] = pd.to_numeric(race_response.str.contains(r'(?<!cauc)asian') * 1)

# Map all other responses to "other". Note that these were taken manually from a list of unique responses.
searchfor = ['jewish', 'american indian', 'portuguese', 'cape verdean', 
             'middle eastern', 'east indian', 'biracial', 'arab', 'brazilian']
race_data['race_other'] = pd.to_numeric(race_response.str.contains('|'.join(searchfor)) * 1)

# Define hooks
def get_race_hooks(row):
    """Reduce one row of race indicators to a single race label.

    Parameters
    ----------
    row : pandas.Series
        Must contain 'race_aa', 'race_asian', 'race_white' and 'race_other'.

    Returns
    -------
    str or float
        'two_or_more' when the indicators sum to more than one, otherwise the
        label of the indicator that is set ('aa', 'asian', 'other', 'white'),
        or NaN when none is set.
    """
    indicator_cols = ['race_aa', 'race_asian', 'race_white', 'race_other']
    if sum(row[indicator_cols]) > 1:
        return 'two_or_more'

    # Checked in this order; the first indicator that is set decides the label.
    label_by_col = (
        ('race_aa', 'aa'),
        ('race_asian', 'asian'),
        ('race_other', 'other'),
        ('race_white', 'white'),
    )
    for col, label in label_by_col:
        if row[col] > 0:
            return label
    return np.nan
    
# One race label per respondent, derived row by row from the indicators.
hooks['race'] = race_data.apply(get_race_hooks, axis=1)

In [128]:
# Snapshot the hooks as derived from the raw responses. This runs before the
# imputation below, so the file can still contain missing values; the
# DataFrame index is written as the first column.
hooks_path = '../data/intermediate/03_survey_population_hooks.csv'
hooks.to_csv(hooks_path)

## Impute missing values by sampled frequency

In [129]:
def impute_by_sampled_frequency(df, col, random_state=None):
    """Fill the nulls in ``df[col]`` by sampling from the column's observed values.

    Each missing entry receives a value drawn at random, with probabilities
    equal to the relative frequencies of the non-null values in the column.
    Modifies ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify.
    col : label
        Column whose nulls are filled.
    random_state : int or None, optional
        Seed for a dedicated generator, making the draw reproducible. With
        the default (None) the global ``np.random`` state is used.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If there are nulls to fill but the column has no observed values to
        sample from.
    """
    missing = df[col].isnull()
    n_missing = int(missing.sum())
    if n_missing == 0:
        # Nothing to fill; also covers an empty frame.
        return

    freqs = df[col].value_counts(normalize=True)
    if freqs.empty:
        raise ValueError(
            "cannot impute column {!r}: it has no observed values".format(col)
        )

    # Fall back to the global state when no seed is given, so a notebook-level
    # np.random.seed(...) is still honoured.
    rng = np.random if random_state is None else np.random.default_rng(random_state)
    df.loc[missing, col] = rng.choice(
        freqs.index.to_numpy(), size=n_missing, p=freqs.to_numpy()
    )

    return

In [130]:
# For any missing value, substitute according to the sample frequencies.
# The draws are random, so fix the global NumPy seed first: without it every
# run of the notebook imputes different values and produces different weights.
np.random.seed(0)
for demo_col in ['gender', 'hhi', 'race', 'ethnicity', 'age']:
    impute_by_sampled_frequency(hooks, demo_col)

In [131]:
# We balance on the race x ethnicity combination, so fold the two columns
# into a single "(race, ethnicity)" label and drop the originals.
hooks['race_ethnicity'] = [
    "({0}, {1})".format(race, ethnicity)
    for race, ethnicity in zip(hooks['race'], hooks['ethnicity'])
]
hooks.drop(columns=['race', 'ethnicity'], inplace=True)

In [132]:
# The HHI bucket codes must be plain integer strings ('1' ... '8') so that the
# indicator columns built from them are named 'hhi_1' ... 'hhi_8'.
# The int round-trip normalises the codes; astype(int) raises if a value is
# still missing or is not numeric.
hooks['hhi'] = hooks['hhi'].astype(int).astype(str)

In [133]:
# Break out into indicators. The automated column naming should match the population targets.
# get_dummies names each indicator '<column>_<value>', e.g. 'gender_Female' or 'hhi_3'.
# 'id' is moved into the index first so it stays out of the indicator matrix.
# Because set_index is in place, re-running this cell fails: 'id' is no longer a column.
hooks.set_index('id', inplace=True)
survey_demo_counts = pd.get_dummies(hooks)

## Population balancing

In [134]:
# Preview the hooks after imputation: one row per respondent id.
hooks

Unnamed: 0_level_0,gender,age,hhi,race_ethnicity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
7391,Female,65 to 74 Years,3,"(white, non-hispanic)"
7392,Female,55 to 64 Years,2,"(white, non-hispanic)"
7393,Male,65 to 74 Years,4,"(white, non-hispanic)"
7394,Male,55 to 64 Years,5,"(white, non-hispanic)"
7395,Female,55 to 64 Years,5,"(asian, non-hispanic)"
...,...,...,...,...
8882,Female,45 to 54 Years,6,"(white, non-hispanic)"
8883,Female,25 to 34 Years,8,"(white, non-hispanic)"
8884,Female,25 to 34 Years,8,"(white, non-hispanic)"
8885,Female,25 to 34 Years,8,"(two_or_more, non-hispanic)"


In [135]:
# Index the targets by demo name and keep the rows matching the survey
# indicator columns, in the same order. .loc raises a KeyError if the
# targets lack any of those demo names.
population_targets = (
    population_targets
    .set_index('demo')
    .loc[survey_demo_counts.columns]
)

In [136]:
# Practically, we may need to balance on a subset of demo counts to get a good fit.
# The race x ethnicity indicators are currently left out (commented below), so
# the weights are fitted on gender, age and household income only.
cols = [
    'gender_Female',
    'gender_Male',
    'gender_Nonbinary',
    'age_17 Years',
    'age_18 to 24 Years',
    'age_25 to 34 Years',
    'age_35 to 44 Years',
    'age_45 to 54 Years',
    'age_55 to 64 Years',
    'age_65 to 74 Years',
    'age_75 Years & Over',
    'hhi_1',
    'hhi_2',
    'hhi_3',
    'hhi_4',
    'hhi_5',
    'hhi_6',
    'hhi_7',
    'hhi_8',
    # Labels follow the hooks, which use 'non-hispanic' (not 'not_hispanic'):
    # 'race_ethnicity_(aa, hispanic)',
    # 'race_ethnicity_(aa, non-hispanic)',
    # 'race_ethnicity_(asian, hispanic)',
    # 'race_ethnicity_(asian, non-hispanic)',
    # 'race_ethnicity_(other, non-hispanic)',
    # 'race_ethnicity_(two_or_more, hispanic)',
    # 'race_ethnicity_(two_or_more, non-hispanic)',
    # 'race_ethnicity_(white, hispanic)',
    # 'race_ethnicity_(white, non-hispanic)',
]

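We get the weights by solving a linear inverse problem. Solving it with the pseudo-inverse is equivalent to a least-squares (OLS) fit that, among all best-fitting solutions, picks the weight vector with the smallest norm.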

The problem looks like:
$$wX = T$$

where $w$ is the $1 \times N$ vector of weights, $X$ is the $N \times M$ matrix of survey participant demo indicators, and $T$ is the $1 \times M$ vector of population target counts.

$w$ is then given by
$$w = T X^{+}$$
Here, $X^{+}$ is the Moore–Penrose pseudo-inverse of $X$, computed with `np.linalg.pinv`.

In [137]:

# Pseudo-inverse of X, the respondent-by-demo indicator matrix restricted to
# the demos we balance on.
design_matrix = survey_demo_counts[cols]
survey_counts_inverse = np.linalg.pinv(design_matrix)

# Weight vector w = T X^+, with T the target count of each selected demo.
target_counts = population_targets.loc[cols, 'count']
weights = np.dot(target_counts, survey_counts_inverse)

In [138]:
# Smallest weight; a negative value means the solution contains negative weights.
weights.min()

-32.775155403090324

In [139]:
# A small number of weights come out negative; set those to zero (in place).
negative = weights < 0
weights[negative] = 0

In [140]:
# Check how close the weighted survey totals are to the targets.
# Work on an explicit copy so that adding columns cannot touch (or raise a
# SettingWithCopyWarning about) population_targets.
res = population_targets.loc[cols].copy()
res['weighted_survey_pop'] = np.dot(weights, survey_demo_counts[cols])

# Positive pct_error: the weighted survey falls short of the target;
# negative: it overshoots.
res['pct_error'] = 100 * (1 - res['weighted_survey_pop'] / res['count'])

In [141]:
# Target count vs. weighted survey total per demo, with the percentage error.
res

Unnamed: 0,count,weighted_survey_pop,pct_error
gender_Female,37312,37575.312242,-0.705704
gender_Male,37312,37311.714286,0.000766
gender_Nonbinary,375,450.18942,-20.050512
age_17 Years,534,534.017857,-0.003344
age_18 to 24 Years,12005,12005.017857,-0.000149
age_25 to 34 Years,28048,28048.017857,-6.4e-05
age_35 to 44 Years,12139,12139.961513,-0.007921
age_45 to 54 Years,7414,7414.017857,-0.000241
age_55 to 64 Years,7249,7261.373403,-0.170691
age_65 to 74 Years,4351,4384.106814,-0.760901


We see that we're a bit off on very underrepresented groups: nonbinary participants and those over 75 years of age.

In [142]:
# assign weights
# `weights` is a plain array, so this assignment is positional: it relies on
# survey_demo_counts (built from hooks) still being in survey_data's row order.
survey_data['weight'] = weights

In [143]:
# Export one (id, weight) row per respondent, without the DataFrame index.
weights_path = '../data/intermediate/03_survey_weights.csv'
survey_data.to_csv(weights_path, columns=['id', 'weight'], index=False)

In [144]:
# survey_data.to_csv('../data/intermediate/weighted_survey_data.csv', index=False)

In [145]:
# QA check: True here would mean at least one respondent has no weight.
survey_data['weight'].isna().any()

False