# Race by District (with Margin of Error)

This workbook demonstrates how to aggregate ACS data where some estimates may be less reliable, typically because they are for small subgroups.


In [1]:
import pandas as pd
import cenpy                  # https://pypi.org/project/cenpy/ 
import census_data_aggregator # https://pypi.org/project/census-data-aggregator/

In [2]:
# Show complete variable labels rather than truncating them in the table below.
pd.options.display.max_colwidth = None

# Open a connection to the 'ACSDT5Y2018' dataset on the Census API.
acs = cenpy.products.APIConnection('ACSDT5Y2018')

# Refresh our memory on the variable codes for various columns in the race tables
b03002_vars = acs.varslike('B03002_*')
b03002_vars[['label']].sort_index()

Unnamed: 0,label
B03002_001E,Estimate!!Total
B03002_002E,Estimate!!Total!!Not Hispanic or Latino
B03002_003E,Estimate!!Total!!Not Hispanic or Latino!!White alone
B03002_004E,Estimate!!Total!!Not Hispanic or Latino!!Black or African American alone
B03002_005E,Estimate!!Total!!Not Hispanic or Latino!!American Indian and Alaska Native alone
B03002_006E,Estimate!!Total!!Not Hispanic or Latino!!Asian alone
B03002_007E,Estimate!!Total!!Not Hispanic or Latino!!Native Hawaiian and Other Pacific Islander alone
B03002_008E,Estimate!!Total!!Not Hispanic or Latino!!Some other race alone
B03002_009E,Estimate!!Total!!Not Hispanic or Latino!!Two or more races
B03002_010E,Estimate!!Total!!Not Hispanic or Latino!!Two or more races!!Two races including Some other race


In [3]:
# make something to help us use friendlier names for the columns
# Use an ordered dict to ensure that things between estimate and MOE cols stay in sync
from collections import OrderedDict
race_cols = OrderedDict([
    ('B03002_001E', 'total'),
    ('B03002_003E', 'nh_white'),
    ('B03002_004E', 'nh_black'),
    ('B03002_005E', 'nh_amerind'),
    ('B03002_006E', 'nh_asian'),
    ('B03002_007E', 'nh_nhpi'),
    ('B03002_008E', 'nh_some_other'),
    ('B03002_009E', 'nh_twoplus'),
    ('B03002_012E', 'hispanic')
])

# Each estimate code ends in "E"; the matching margin-of-error code ends in "M".
# Swap only that final character: a blanket str.replace('E', 'M') would also rewrite
# an "E" inside the table ID (race-iterated tables such as B01001E_001E have one).
assert all(code.endswith('E') for code in race_cols), "estimate codes must end in 'E'"
moe_cols = OrderedDict((code[:-1] + 'M', name + "_moe") for code, name in race_cols.items())
query_cols = ['GEO_ID'] + list(race_cols.keys()) + list(moe_cols.keys())

county_race = (
    acs.query(query_cols, 'county')
    # cenpy doesn't cast estimates to integer so we have to handle that.
    .astype({code: int for code in query_cols[1:]})
    .rename(columns=race_cols)
    .rename(columns=moe_cols)
    # a Margin of Error value of -555555555 "indicates that the estimate is controlled.
    # A statistical test for sampling variability is not appropriate."
    # The math doesn't work with that value, so replace those with 0
    .replace(-555555555, 0)
    # API gives us those but we don't need them
    .drop(['state', 'county'], axis='columns')
)

In [4]:
# Attach the county-level ACS figures to the county/district crosswalk.
# Both sides are keyed on the county geoid, so the join lines rows up by index.
county_race_by_geoid = county_race.set_index('GEO_ID')
xref = pd.read_csv(
    'county_district_xref.csv',
    usecols=['geoid', 'state', 'district'],
    index_col='geoid',
)
joined = xref.join(county_race_by_geoid)


In [5]:
# Helper that totals any number of estimate/MOE column pairs from one data frame.
def sum_with_moe(df, *column_pairs):
    """Aggregate estimate/margin-of-error column pairs over all rows of ``df``.

    Each item in ``column_pairs`` is an ``(estimate_column, error_column)`` tuple.
    For every pair, the rows of those two columns are handed to
    ``census_data_aggregator.approximate_sum`` as (estimate, error) tuples.

    Returns a dictionary keyed by the column names from ``column_pairs``: the
    estimate column maps to the approximate sum and the error column maps to the
    approximate error for that sum.
    """
    totals = {}
    for est, err in column_pairs:
        row_pairs = map(tuple, df[[est, err]].to_numpy())
        totals[est], totals[err] = census_data_aggregator.approximate_sum(*row_pairs)
    return totals

def compute_single_cv(est, moe, z_score=1.645):
    """Return the coefficient of variation (CV) of an estimate, as a percentage.

    The margin of error is turned into a standard error by dividing by
    ``z_score``; the CV is that standard error relative to the estimate, times 100.

    Args:
        est: the estimate. May be a scalar or anything that supports element-wise
            arithmetic (e.g. a pandas Series).
        moe: the margin of error that goes with ``est``.
        z_score: critical value the margin of error was built with. The default,
            1.645, corresponds to a 90% confidence level and assumes a normal
            distribution.

    Returns:
        ``(moe / z_score) / est * 100``. When ``est`` is a plain Python zero the
        result is ``inf`` (``-inf`` for a negative error, ``nan`` when the error is
        also zero) instead of a ``ZeroDivisionError``, which is the same answer
        numpy and pandas inputs already produce.
    """
    se = moe / z_score
    try:
        return se / est * 100
    except ZeroDivisionError:
        # Only plain Python numbers raise here; mirror numpy's divide-by-zero results.
        if se > 0:
            return float('inf')
        if se < 0:
            return float('-inf')
        return float('nan')

def compute_cvs(df, *column_pairs):
    """Build a DataFrame of coefficients of variation for estimate/error column pairs.

    Args:
        df: data frame holding the estimate and margin-of-error columns.
        *column_pairs: one or more ``(estimate_column, moe_column)`` tuples.

    Returns:
        A new DataFrame sharing ``df``'s index, with one column per pair. Each
        column is named by appending "_cv" to the estimate column name and holds
        ``compute_single_cv`` applied to that pair. With no pairs, the result has
        ``df``'s index and no columns.
    """
    # compute_single_cv is plain arithmetic, so it can take whole columns at once;
    # this avoids a Python-level call per row and also behaves on an empty frame.
    cvs = [
        compute_single_cv(df[est], df[moe]).rename(f"{est}_cv")
        for est, moe in column_pairs
    ]
    if not cvs:
        # pd.concat refuses an empty list, so hand back an empty frame directly.
        return pd.DataFrame(index=df.index)
    return pd.concat(cvs, axis=1)


In [6]:
# Roll the county rows up to one row per (state, district).
# We've renamed the columns to the values of race_cols / moe_cols, so pair those up once.
est_moe_pairs = list(zip(race_cols.values(), moe_cols.values()))

sums = []
for (state, district), district_counties in joined.groupby(['state', 'district']):
    record = sum_with_moe(district_counties, *est_moe_pairs)
    record['state'] = state
    record['district'] = district
    sums.append(record)

race_by_district_base = pd.DataFrame(sums)

# For review purposes it is nicer to have the grouping values at the front,
# followed by every other column in its existing order.
label_cols = ['state', 'district']
cols = label_cols + [c for c in race_by_district_base.columns if c not in label_cols]
race_by_district_base = race_by_district_base[cols]

pd.options.display.float_format = '{:.1f}'.format
race_by_district_base.head()  # how does that look?

Unnamed: 0,state,district,total,total_moe,nh_white,nh_white_moe,nh_black,nh_black_moe,nh_amerind,nh_amerind_moe,nh_asian,nh_asian_moe,nh_nhpi,nh_nhpi_moe,nh_some_other,nh_some_other_moe,nh_twoplus,nh_twoplus_moe,hispanic,hispanic_moe
0,Alabama,Middle,1151252,0.0,684038,454.9,382206,1223.0,3308,396.8,18388,533.3,205,143.7,1551,479.4,20799,1337.3,40757,148.2
1,Alabama,Northern,2870454,0.0,1999982,585.5,628466,1964.2,12469,846.8,35008,1034.5,1169,223.0,4155,918.7,50506,2237.2,138699,193.9
2,Alabama,Southern,842974,0.0,512710,370.5,275065,1002.8,7466,537.9,10540,583.5,147,103.0,1797,566.0,11559,1097.3,23690,294.9
3,Alaska,Alaska,738516,564.0,450754,677.2,22817,730.1,103506,1448.5,45617,998.2,8544,395.9,1459,515.6,54633,1880.2,51186,257.9
4,Arizona,Arizona,6946685,0.0,3825886,1206.2,286614,2527.9,271946,1834.2,222477,2081.6,12523,561.2,9177,1290.8,154750,3919.1,2163312,0.0


## Now what?

Having aggregated margins of error enables two things: to test whether any given estimate is "reliable", and to test whether any two values are *significantly* different.  

For now, we'll defer checking for "significant difference," since I didn't feel like fishing around for pairs to compare. I'll just say that the LA Times DataDesk team has a Python library which encapsulates the [statistical difference test](https://github.com/datadesk/census-error-analyzer#test-statistical-difference), so you might want to use that instead of re-implementing it. 


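Testing reliability involves computing the Coefficient of Variation (CV): the standard error of an estimate expressed as a percentage of the estimate itself, $CV = \frac{MOE / 1.645}{\text{estimate}} \times 100$ (ACS margins of error are published at the 90% confidence level, hence the 1.645). There are no hard and fast rules, but, as documented in this [Tufts GIS tutorial](http://sites.tufts.edu/gis/files/2013/11/Amercian-Community-Survey_Margin-of-error-tutorial.pdf), here are two rules of thumb about how to proceed with a given CV.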

<table>
    <tr>
        <th>Source</th>
        <th>High reliability</th>
        <th>Medium "be careful"</th>
        <th>Low "use extreme caution"</th>
    </tr>
    <tr>
        <td>Census Bureau</td>
        <td>CV &lt;15%</td>        
        <td>CV 15-30%</td>
        <td>CV &gt;30%</td>        
    </tr>
    <tr>
        <td>ESRI</td>
        <td>CV &lt;12%</td>        
        <td>CV 12-40%</td>
        <td>CV &gt;40%</td>        
    </tr>
</table>
    
    

In [7]:
# again, get pairs of column names for estimate/moe
est_moe_pairs = zip(race_cols.values(), moe_cols.values())

# The CV frame shares its index with race_by_district_base, so the group labels
# can simply be placed side by side in front of the CV columns.
district_labels = race_by_district_base[['state', 'district']]
race_district_cvs = pd.concat(
    [district_labels, compute_cvs(race_by_district_base, *est_moe_pairs)],
    axis=1,
)
race_district_cvs.head()

Unnamed: 0,state,district,total_cv,nh_white_cv,nh_black_cv,nh_amerind_cv,nh_asian_cv,nh_nhpi_cv,nh_some_other_cv,nh_twoplus_cv,hispanic_cv
0,Alabama,Middle,0.0,0.0,0.2,7.3,1.8,42.6,18.8,3.9,0.2
1,Alabama,Northern,0.0,0.0,0.2,4.1,1.8,11.6,13.4,2.7,0.1
2,Alabama,Southern,0.0,0.0,0.2,4.4,3.4,42.6,19.1,5.8,0.8
3,Alaska,Alaska,0.0,0.1,1.9,0.9,1.3,2.8,21.5,2.1,0.3
4,Arizona,Arizona,0.0,0.0,0.5,0.4,0.6,2.7,8.6,1.5,0.0


## What have we got?
 
Typically, you'd probably just consult the CV matrix for specific values before you went too far using them, but for our purposes, let's iterate through and see where we should take care. You'll see that the most common cases of caution are for populations which tend to be small -- "Native Hawaiian/Pacific Islander" (except in Hawaii) and "Some other race" (which is most often used by Latinos, and so is often quite small among non-Hispanic populations).

In [8]:
print("Reviewing reliability of aggregated race by district\n")

cv_columns = race_district_cvs.columns[2:]  # everything after the state/district labels
for idx, row in race_district_cvs.iterrows():
    # Collect one message per CV that crosses the 15% / 30% thresholds.
    flagged = []
    for col in cv_columns:
        cv = row[col]
        if cv > 30:
            flagged.append(f"{col:>17} - {cv:.1f} - low reliability - use extreme caution")
        elif cv > 15:
            flagged.append(f"{col:>17} - {cv:.1f} - med reliability - use caution")

    # Single-district states carry the state name as the district; print it once.
    if row['state'] != row['district']:
        print(f"{row['state']} {row['district']}")
    else:
        print(f"{row['state']}")

    if flagged:
        for message in flagged:
            print(f"  {message}")
    else:
        print("  No warnings")
    print("")
    

Reviewing reliability of aggregated race by district

Alabama Middle
         nh_nhpi_cv - 42.6 - low reliability - use extreme caution
   nh_some_other_cv - 18.8 - med reliability - use caution

Alabama Northern

Alabama Southern
         nh_nhpi_cv - 42.6 - low reliability - use extreme caution
   nh_some_other_cv - 19.1 - med reliability - use caution

Alaska
   nh_some_other_cv - 21.5 - med reliability - use caution

Arizona

Arkansas Eastern
         nh_nhpi_cv - 28.5 - med reliability - use caution
   nh_some_other_cv - 22.4 - med reliability - use caution

Arkansas Western
   nh_some_other_cv - 18.3 - med reliability - use caution

California Central

California Eastern

California Northern

California Southern

Colorado

Connecticut
         nh_nhpi_cv - 18.2 - med reliability - use caution

Delaware
         nh_nhpi_cv - 18.7 - med reliability - use caution

District of Columbia
         nh_nhpi_cv - 16.3 - med reliability - use caution

Florida Middle

Florida Northern

Flori