# Balance survey data
Assign weights to each survey respondent such that the population is balanced on gender, age, hhi, race, and ethnicity with respect to the census.

1. Load data
2. Impute missing demo values based on baseline frequencies
3. Create population hooks in the survey data
4. Create an axis with all possible demographic categories
5. Join survey data and population targets to that axis, fill with 0's
6. Find weights
7. Join weights with survey data and save

In [493]:
import numpy as np
import pandas as pd


In [496]:
# load data
survey_data = pd.read_csv('../data/processed/data_2019_preprocessed.csv')
population_targets = pd.read_csv('../data/processed/target_populations.csv')

# trim columns and rename
demo_cols = ['d01_gender',
             'd02_age',
             'race_hooks',
             'd04_ethnicity',
             'd08_hhi_buckets']


demo_data = survey_data[demo_cols]
demo_data.rename(columns={'d01_gender': 'gender',
                          'd02_age': 'age',
                          'race_hooks': 'race',
                          'd04_ethnicity': 'ethnicity',
                          'd08_hhi_buckets': 'hhi'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  demo_data.rename(columns={'d01_gender': 'gender',


In [497]:
# set 'no answer" to null so we can impute easily
demo_data.replace({"No Answer": np.nan}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  demo_data.replace({"No Answer": np.nan}, inplace=True)


In [498]:
def impute_by_sampled_frequency(df, col):
    """Impute nulls by sampling according to the frequencies present in the data.
    Modifies the df in place."""
    
    s = df[col].value_counts(normalize=True)
    missing = df[col].isnull()
    df.loc[missing, col] = np.random.choice(s.index, size=len(df[missing]),p=s.values)

    return 

In [362]:
# for any missing value, substitute according to the sample frequencies
impute_by_sampled_frequency(demo_data, 'gender')
impute_by_sampled_frequency(demo_data, 'hhi')
impute_by_sampled_frequency(demo_data, 'race')
impute_by_sampled_frequency(demo_data, 'ethnicity')
impute_by_sampled_frequency(demo_data, 'age')

In [363]:
# change ethnicity column from binary to string
demo_data['ethnicity'] = demo_data['ethnicity'].apply(lambda x: 'hispanic' if x==1.0 else 'not_hispanic')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  demo_data['ethnicity'] = demo_data['ethnicity'].apply(lambda x: 'hispanic' if x==1.0 else 'not_hispanic')


In [364]:
# get race x ethnicity combinations.

demo_data['race_ethnicity'] = demo_data.apply(lambda row: "({0}, {1})".format(row['race'], row['ethnicity']), axis=1)
demo_data.drop(['race', 'ethnicity'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  demo_data['race_ethnicity'] = demo_data.apply(lambda row: "({0}, {1})".format(row['race'], row['ethnicity']), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  demo_data.drop(['race', 'ethnicity'], axis=1, inplace=True)


In [365]:
demo_data['hhi'] = demo_data['hhi'].astype(int).astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  demo_data['hhi'] = demo_data['hhi'].astype(int).astype(str)


In [366]:
survey_demo_counts = pd.get_dummies(demo_data)

## Population balancing

In [367]:
# from targets, select the right columns in order to do the calculation
population_targets = population_targets.set_index('demo')


In [368]:
population_targets

Unnamed: 0_level_0,count
demo,Unnamed: 1_level_1
hhi_1,3059
hhi_2,7307
hhi_3,8132
hhi_4,9511
hhi_5,10430
hhi_6,15904
hhi_7,9820
hhi_8,10833
gender_Male,37312
gender_Female,37312


In [369]:
population_targets = population_targets.loc[survey_demo_counts.columns]

In [370]:


def raking_inverse(x):
  return np.exp(x)

def d_raking_inverse(x):
  return np.exp(x)

def graking(X, T, max_steps=500, tolerance=1e-6):
  # Based on algo in (Deville et al., 1992) explained in detail on page 37 in
  # https://orca.cf.ac.uk/109727/1/2018daviesgpphd.pdf

  # Initialize variables - Step 1
  n, m = X.shape
  L = np.zeros(m) # Lagrange multipliers (lambda)
  w = np.ones(n) # Our weights (will get progressively updated)
  H = np.eye(n)
  success = False

  for step in range(max_steps):
    L += np.dot(np.linalg.pinv(np.dot(np.dot(X.T, H), X)), (T - np.dot(X.T, w))) # Step 2.1
    w = raking_inverse(np.dot(X, L)) # Step 2.2
    H = np.diag(d_raking_inverse(np.dot(X, L))) # Step 2.3

    # Termination condition:
    loss = np.max(np.abs(np.dot(X.T, w) - T) / T)
    if loss < tolerance:
        success = True
        break

  if not success: raise Exception("Did not converge")
  return w

As noted in the function, we are using a Generalized Survey Raking methodology due to Deville (1992). The inputs are

T: a vector of targets, providing counts of each group of interest, e.g. [100 Male, 100 Femal, 10 Nonbinary, 12 Black, 60 White, ...]

X: A binary matrix of [respondents x target] providing group membership. 

Output is
w: A vector of weights for each respondent such that 
wX = T

In [372]:
len(population_targets)

28

In [390]:
survey_demo_counts.columns

Index(['gender_Female', 'gender_Male', 'gender_Nonbinary', 'age_17 Years',
       'age_18 to 24 Years', 'age_25 to 34 Years', 'age_35 to 44 Years',
       'age_45 to 54 Years', 'age_55 to 64 Years', 'age_65 to 74 Years',
       'age_75 Years & Over', 'hhi_1', 'hhi_2', 'hhi_3', 'hhi_4', 'hhi_5',
       'hhi_6', 'hhi_7', 'hhi_8', 'race_ethnicity_(aa, hispanic)',
       'race_ethnicity_(aa, not_hispanic)', 'race_ethnicity_(asian, hispanic)',
       'race_ethnicity_(asian, not_hispanic)',
       'race_ethnicity_(other, not_hispanic)',
       'race_ethnicity_(two_or_more, hispanic)',
       'race_ethnicity_(two_or_more, not_hispanic)',
       'race_ethnicity_(white, hispanic)',
       'race_ethnicity_(white, not_hispanic)'],
      dtype='object')

In [456]:
cols = ['gender_Female', 
        'gender_Male', 
        'gender_Nonbinary', 
        'age_17 Years',
       'age_18 to 24 Years', 
        'age_25 to 34 Years', 
        'age_35 to 44 Years',
       'age_45 to 54 Years', 
        'age_55 to 64 Years', 
        'age_65 to 74 Years',
       'age_75 Years & Over', 
        'hhi_1', 
        'hhi_2', 
        'hhi_3',
        'hhi_4', 
        'hhi_5',
        'hhi_6', 
        'hhi_7',
        'hhi_8',
        #'race_ethnicity_(aa, hispanic)',
       #'race_ethnicity_(aa, not_hispanic)', 'race_ethnicity_(asian, hispanic)',
       #'race_ethnicity_(asian, not_hispanic)',
       #'race_ethnicity_(other, not_hispanic)',
       #'race_ethnicity_(two_or_more, hispanic)',
       #'race_ethnicity_(two_or_more, not_hispanic)',
       #'race_ethnicity_(white, hispanic)',
       #'race_ethnicity_(white, not_hispanic)',
       ]

In [457]:
survey_counts_inverse = np.linalg.pinv(survey_demo_counts[cols])

In [458]:
weights = np.dot(population_targets.loc[cols]['count'], survey_counts_inverse)

In [483]:
# We have a small number of negative weights which we will set to zero
weights[weights <  0] = 0

In [485]:
res = population_targets.loc[cols]
res['weighted_survey_pop'] = np.dot(weights, survey_demo_counts[cols])

res

Unnamed: 0,count,weighted_survey_pop
gender_Female,37312,37613.237704
gender_Male,37312,37311.714286
gender_Nonbinary,375,436.949374
age_17 Years,534,534.017857
age_18 to 24 Years,12005,12005.017857
age_25 to 34 Years,28048,28048.017857
age_35 to 44 Years,12139,12141.03603
age_45 to 54 Years,7414,7414.017857
age_55 to 64 Years,7249,7254.128834
age_65 to 74 Years,4351,4382.916654


In [486]:
# assign weights
survey_data['weight'] = weights

In [491]:
survey_data.to_csv('../data/processed/weighted_survey_data.csv', index=False)