In [1]:
# import censusdata as cd
# might not need this after all.

In [2]:
#% pip install CensusData
# import censusdata as cd # likely don't need this after all.
import pandas as pd
import numpy as np
import seaborn as sns
pd.set_option('display.max_columns', None)
#pd.set_option('display.max_columns', None)

- First, read in the ACS 2019 income data columns of interest. We used the metadata descriptions from this file: `ACSST5Y2019.S1901_metadata_2021-10-05T144610.csv` to choose the desired columns.

- To keep things less busy, we created a columns_of_interest list for each .csv that we read in and used that during the `pd.read_csv()` process.

In [3]:
# Specify columns of interest.
acs2019_income_columns_of_interest = ['GEO_ID', 
                                      'NAME', 
                                      'S1901_C01_001E', 
                                      'S1901_C01_012E', 
                                      'S1901_C01_013E', 
                                      'S1901_C02_001E', 
                                      'S1901_C02_012E', 
                                      'S1901_C02_013E'
                                     ]

# Read in the data. skiprows = 1 removes the verbose first row that was acting like a second header; it was forcing numeric columns to be objects.

acs2019_income = pd.read_csv(
    '../data/02_demo_data/ACS/ACS_2019_income_by_county/ACSST5Y2019.S1901_data_with_overlays_2021-10-05T144610.csv',
    usecols = acs2019_income_columns_of_interest,
    skiprows=[1])

acs2019_income.head(3)

Unnamed: 0,GEO_ID,NAME,S1901_C01_001E,S1901_C01_012E,S1901_C01_013E,S1901_C02_001E,S1901_C02_012E,S1901_C02_013E
0,0500000US01001,"Autauga County, Alabama",21397,58731,75326,15076,71103,87094
1,0500000US01003,"Baldwin County, Alabama",80930,58320,80986,53467,75850,97991
2,0500000US01005,"Barbour County, Alabama",9345,32525,47068,6187,41704,56374


In [4]:
acs2019_income.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3220 entries, 0 to 3219
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   GEO_ID          3220 non-null   object
 1   NAME            3220 non-null   object
 2   S1901_C01_001E  3220 non-null   int64 
 3   S1901_C01_012E  3220 non-null   int64 
 4   S1901_C01_013E  3220 non-null   int64 
 5   S1901_C02_001E  3220 non-null   int64 
 6   S1901_C02_012E  3220 non-null   int64 
 7   S1901_C02_013E  3220 non-null   int64 
dtypes: int64(6), object(2)
memory usage: 201.4+ KB


In [5]:
# Clean up the naming conventions.
acs2019_income.columns = ['geo_id', 
                          'name',
                          'total_hh',
                          'median_hh_income',
                          'mean_hh_income',
                          'total_families',
                          'median_family_income',
                          'mean_family_income',
                          ]

In [6]:
acs2019_income.head(2)

Unnamed: 0,geo_id,name,total_hh,median_hh_income,mean_hh_income,total_families,median_family_income,mean_family_income
0,0500000US01001,"Autauga County, Alabama",21397,58731,75326,15076,71103,87094
1,0500000US01003,"Baldwin County, Alabama",80930,58320,80986,53467,75850,97991


#### Next we need to parse off the FIPS ID, and reset the index to the FIPS ID.

In [7]:
# Parse out the state and county from `name`.
acs2019_income[['county', 'state']] = acs2019_income['name'].str.split(',', expand = True)

# Pick off the FIPS code from the end of the GEO_ID.
acs2019_income['fips'] = acs2019_income['geo_id'].str[-5:]

# Examine the dataset.
acs2019_income

Unnamed: 0,geo_id,name,total_hh,median_hh_income,mean_hh_income,total_families,median_family_income,mean_family_income,county,state,fips
0,0500000US01001,"Autauga County, Alabama",21397,58731,75326,15076,71103,87094,Autauga County,Alabama,01001
1,0500000US01003,"Baldwin County, Alabama",80930,58320,80986,53467,75850,97991,Baldwin County,Alabama,01003
2,0500000US01005,"Barbour County, Alabama",9345,32525,47068,6187,41704,56374,Barbour County,Alabama,01005
3,0500000US01007,"Bibb County, Alabama",6891,47542,60182,4789,57891,69316,Bibb County,Alabama,01007
4,0500000US01009,"Blount County, Alabama",20847,49358,65639,14874,62295,76547,Blount County,Alabama,01009
...,...,...,...,...,...,...,...,...,...,...,...
3215,0500000US72145,"Vega Baja Municipio, Puerto Rico",18721,19617,28805,13216,24963,32859,Vega Baja Municipio,Puerto Rico,72145
3216,0500000US72147,"Vieques Municipio, Puerto Rico",2258,14936,22742,1203,22429,28085,Vieques Municipio,Puerto Rico,72147
3217,0500000US72149,"Villalba Municipio, Puerto Rico",7908,19877,29612,5873,23231,32959,Villalba Municipio,Puerto Rico,72149
3218,0500000US72151,"Yabucoa Municipio, Puerto Rico",11541,16295,24078,7576,20785,28567,Yabucoa Municipio,Puerto Rico,72151


In [8]:
# Drop `geo_id`, `name`.
acs2019_income.drop(columns = ['geo_id', 'name'], inplace = True)
acs2019_income

Unnamed: 0,total_hh,median_hh_income,mean_hh_income,total_families,median_family_income,mean_family_income,county,state,fips
0,21397,58731,75326,15076,71103,87094,Autauga County,Alabama,01001
1,80930,58320,80986,53467,75850,97991,Baldwin County,Alabama,01003
2,9345,32525,47068,6187,41704,56374,Barbour County,Alabama,01005
3,6891,47542,60182,4789,57891,69316,Bibb County,Alabama,01007
4,20847,49358,65639,14874,62295,76547,Blount County,Alabama,01009
...,...,...,...,...,...,...,...,...,...
3215,18721,19617,28805,13216,24963,32859,Vega Baja Municipio,Puerto Rico,72145
3216,2258,14936,22742,1203,22429,28085,Vieques Municipio,Puerto Rico,72147
3217,7908,19877,29612,5873,23231,32959,Villalba Municipio,Puerto Rico,72149
3218,11541,16295,24078,7576,20785,28567,Yabucoa Municipio,Puerto Rico,72151


In [9]:
# Set the index to the FIPS code. 
acs2019_income.set_index('fips', inplace = True)

In [10]:
acs2019_income

Unnamed: 0_level_0,total_hh,median_hh_income,mean_hh_income,total_families,median_family_income,mean_family_income,county,state
fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
01001,21397,58731,75326,15076,71103,87094,Autauga County,Alabama
01003,80930,58320,80986,53467,75850,97991,Baldwin County,Alabama
01005,9345,32525,47068,6187,41704,56374,Barbour County,Alabama
01007,6891,47542,60182,4789,57891,69316,Bibb County,Alabama
01009,20847,49358,65639,14874,62295,76547,Blount County,Alabama
...,...,...,...,...,...,...,...,...
72145,18721,19617,28805,13216,24963,32859,Vega Baja Municipio,Puerto Rico
72147,2258,14936,22742,1203,22429,28085,Vieques Municipio,Puerto Rico
72149,7908,19877,29612,5873,23231,32959,Villalba Municipio,Puerto Rico
72151,11541,16295,24078,7576,20785,28567,Yabucoa Municipio,Puerto Rico


In [11]:
# Drop Puerto Rico.
acs2019_income = acs2019_income[acs2019_income['state'].str.contains('Puerto') == False]
acs2019_income

Unnamed: 0_level_0,total_hh,median_hh_income,mean_hh_income,total_families,median_family_income,mean_family_income,county,state
fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
01001,21397,58731,75326,15076,71103,87094,Autauga County,Alabama
01003,80930,58320,80986,53467,75850,97991,Baldwin County,Alabama
01005,9345,32525,47068,6187,41704,56374,Barbour County,Alabama
01007,6891,47542,60182,4789,57891,69316,Bibb County,Alabama
01009,20847,49358,65639,14874,62295,76547,Blount County,Alabama
...,...,...,...,...,...,...,...,...
56037,15523,74843,85346,10794,87906,97459,Sweetwater County,Wyoming
56039,9019,84678,132531,5190,110667,172024,Teton County,Wyoming
56041,7597,63403,74938,5313,77725,86265,Uinta County,Wyoming
56043,3365,54158,65496,2191,68265,77355,Washakie County,Wyoming


In [12]:
acs2019_income.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3142 entries, 01001 to 56045
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   total_hh              3142 non-null   int64 
 1   median_hh_income      3142 non-null   int64 
 2   mean_hh_income        3142 non-null   int64 
 3   total_families        3142 non-null   int64 
 4   median_family_income  3142 non-null   int64 
 5   mean_family_income    3142 non-null   int64 
 6   county                3142 non-null   object
 7   state                 3142 non-null   object
dtypes: int64(6), object(2)
memory usage: 220.9+ KB


### Do the same type of import and data cleanup on the ACS 2019 poverty data.

In [13]:
# Specify columns of interest.
acs2019_poverty_columns_of_interest = ['GEO_ID', 
                                      'NAME', 
                                      'S1701_C01_001E', 
                                       'S1701_C02_001E', 
                                       'S1701_C03_001E']

# Read in the data.

acs2019_poverty = pd.read_csv(
    '../data/02_demo_data/ACS/ACS_2019_poverty_by_county/ACSST5Y2019.S1701_data_with_overlays_2021-11-02T160432.csv',
    usecols = acs2019_poverty_columns_of_interest,
    skiprows = [1])

acs2019_poverty.head(3)

Unnamed: 0,GEO_ID,NAME,S1701_C01_001E,S1701_C02_001E,S1701_C03_001E
0,0500000US01001,"Autauga County, Alabama",54922,8340,15.2
1,0500000US01003,"Baldwin County, Alabama",209618,21704,10.4
2,0500000US01005,"Barbour County, Alabama",22417,6875,30.7


In [14]:
acs2019_poverty.columns = ['geo_id', 
                          'name',
                          'total_pop',
                          'pop_below_poverty',
                          'percent_below_poverty',
                          ]

In [15]:
acs2019_poverty.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3220 entries, 0 to 3219
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   geo_id                 3220 non-null   object 
 1   name                   3220 non-null   object 
 2   total_pop              3220 non-null   int64  
 3   pop_below_poverty      3220 non-null   int64  
 4   percent_below_poverty  3220 non-null   float64
dtypes: float64(1), int64(2), object(2)
memory usage: 125.9+ KB


In [16]:
# Parse out the state and county from `name`.
acs2019_poverty[['county', 'state']] = acs2019_poverty['name'].str.split(',', expand = True)

# Pick off the FIPS code from the end of the GEO_ID.
acs2019_poverty['fips'] = acs2019_poverty['geo_id'].str[-5:]

# Examine the dataset.
acs2019_poverty

Unnamed: 0,geo_id,name,total_pop,pop_below_poverty,percent_below_poverty,county,state,fips
0,0500000US01001,"Autauga County, Alabama",54922,8340,15.2,Autauga County,Alabama,01001
1,0500000US01003,"Baldwin County, Alabama",209618,21704,10.4,Baldwin County,Alabama,01003
2,0500000US01005,"Barbour County, Alabama",22417,6875,30.7,Barbour County,Alabama,01005
3,0500000US01007,"Bibb County, Alabama",20632,3740,18.1,Bibb County,Alabama,01007
4,0500000US01009,"Blount County, Alabama",57108,7739,13.6,Blount County,Alabama,01009
...,...,...,...,...,...,...,...,...
3215,0500000US72145,"Vega Baja Municipio, Puerto Rico",51983,23486,45.2,Vega Baja Municipio,Puerto Rico,72145
3216,0500000US72147,"Vieques Municipio, Puerto Rico",8603,3947,45.9,Vieques Municipio,Puerto Rico,72147
3217,0500000US72149,"Villalba Municipio, Puerto Rico",22263,10641,47.8,Villalba Municipio,Puerto Rico,72149
3218,0500000US72151,"Yabucoa Municipio, Puerto Rico",33446,17766,53.1,Yabucoa Municipio,Puerto Rico,72151


In [17]:
acs2019_poverty.drop(columns = ['geo_id', 'name'], inplace = True)
acs2019_poverty

Unnamed: 0,total_pop,pop_below_poverty,percent_below_poverty,county,state,fips
0,54922,8340,15.2,Autauga County,Alabama,01001
1,209618,21704,10.4,Baldwin County,Alabama,01003
2,22417,6875,30.7,Barbour County,Alabama,01005
3,20632,3740,18.1,Bibb County,Alabama,01007
4,57108,7739,13.6,Blount County,Alabama,01009
...,...,...,...,...,...,...
3215,51983,23486,45.2,Vega Baja Municipio,Puerto Rico,72145
3216,8603,3947,45.9,Vieques Municipio,Puerto Rico,72147
3217,22263,10641,47.8,Villalba Municipio,Puerto Rico,72149
3218,33446,17766,53.1,Yabucoa Municipio,Puerto Rico,72151


In [18]:
acs2019_poverty.set_index('fips', inplace = True)
acs2019_poverty

Unnamed: 0_level_0,total_pop,pop_below_poverty,percent_below_poverty,county,state
fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
01001,54922,8340,15.2,Autauga County,Alabama
01003,209618,21704,10.4,Baldwin County,Alabama
01005,22417,6875,30.7,Barbour County,Alabama
01007,20632,3740,18.1,Bibb County,Alabama
01009,57108,7739,13.6,Blount County,Alabama
...,...,...,...,...,...
72145,51983,23486,45.2,Vega Baja Municipio,Puerto Rico
72147,8603,3947,45.9,Vieques Municipio,Puerto Rico
72149,22263,10641,47.8,Villalba Municipio,Puerto Rico
72151,33446,17766,53.1,Yabucoa Municipio,Puerto Rico


In [19]:
# Drop Puerto Rico.
acs2019_poverty = acs2019_poverty[acs2019_poverty['state'].str.contains('Puerto') == False]
acs2019_poverty

Unnamed: 0_level_0,total_pop,pop_below_poverty,percent_below_poverty,county,state
fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
01001,54922,8340,15.2,Autauga County,Alabama
01003,209618,21704,10.4,Baldwin County,Alabama
01005,22417,6875,30.7,Barbour County,Alabama
01007,20632,3740,18.1,Bibb County,Alabama
01009,57108,7739,13.6,Blount County,Alabama
...,...,...,...,...,...
56037,42958,4921,11.5,Sweetwater County,Wyoming
56039,23188,1426,6.1,Teton County,Wyoming
56041,20259,2298,11.3,Uinta County,Wyoming
56043,7853,825,10.5,Washakie County,Wyoming


### Read in the data from OpenIntro.org. This 

In [20]:
county_complete_columns_of_interest = ['fips', 
                           'state', 
                           'name', 
                           'smoking_ban_2010',                                                        
                           'asian_2019', 
                           'avg_family_size_2019', 
                           'black_2019',
                           'hispanic_2019',
                           'household_has_broadband_2019', 
                           'household_has_computer_2019',
                           'household_has_smartphone_2019',
                           'households_2019',
                           'households_speak_limited_english_2019',
                           'housing_mobile_homes_2019',
                           'hs_grad_2019',                           
                           'median_household_income_2019',
                           'median_individual_income_2019',
                           'native_2019',
                           'other_single_race_2019',
                           'pac_isl_2019',
                           'persons_per_household_2019',
                           'pop_2019',
                           'unemployment_rate_2019',
                           'uninsured_2019',
                           'veterans_2019',
                           'white_2019',
                           'white_not_hispanic_2019']

county_complete = pd.read_csv(
    '../data/02_demo_data/openintro_dot_org/county_complete.csv',
    usecols = county_complete_columns_of_interest,
    converters = {'fips': lambda x: str(x)})


county_complete.info()

# Got the idea for the converters from:
# https://stackoverflow.com/questions/13250046/how-to-keep-leading-zeros-in-a-column-when-reading-csv-with-pandas

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3142 entries, 0 to 3141
Data columns (total 27 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   fips                                   3142 non-null   object 
 1   state                                  3142 non-null   object 
 2   name                                   3142 non-null   object 
 3   smoking_ban_2010                       3116 non-null   object 
 4   asian_2019                             3142 non-null   float64
 5   avg_family_size_2019                   3142 non-null   float64
 6   black_2019                             3142 non-null   float64
 7   hispanic_2019                          3142 non-null   float64
 8   household_has_broadband_2019           3142 non-null   float64
 9   household_has_computer_2019            3142 non-null   float64
 10  household_has_smartphone_2019          3142 non-null   float64
 11  hous

In [21]:
county_complete.rename(columns = {'name': 'county'}, inplace = True)

In [22]:
county_complete['fips'] = county_complete['fips'].str.zfill(5)

# Learned about zfill() here: https://stackoverflow.com/questions/42375068/python-add-a-leading-zero-to-column-with-str-and-int
# Also here: https://www.datasciencemadesimple.com/add-leading-preceding-zeros-python/

In [23]:
county_complete.set_index('fips', inplace = True)
county_complete

Unnamed: 0_level_0,state,county,smoking_ban_2010,asian_2019,avg_family_size_2019,black_2019,hispanic_2019,household_has_broadband_2019,household_has_computer_2019,household_has_smartphone_2019,households_2019,households_speak_limited_english_2019,housing_mobile_homes_2019,hs_grad_2019,median_household_income_2019,median_individual_income_2019,native_2019,other_single_race_2019,pac_isl_2019,persons_per_household_2019,pop_2019,unemployment_rate_2019,uninsured_2019,veterans_2019,white_2019,white_not_hispanic_2019
fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
01001,Alabama,Autauga County,none,1.0,3.09,19.0,2.8,80.6,73.0,78.4,21397,0.7,26.7,88.5,58731,29725,0.3,0.7,0.0,2.56,55380,3.5,7.1,12.6,76.8,74.6
01003,Alabama,Baldwin County,none,0.9,3.24,9.3,4.6,81.8,76.3,81.7,80930,1.2,24.8,90.8,58320,29802,0.8,1.1,0.0,2.59,212830,4.0,8.9,11.8,86.2,83.1
01005,Alabama,Barbour County,partial,0.5,3.01,47.6,4.4,60.5,51.9,64.2,9345,1.6,39.1,73.2,32525,17963,0.3,3.6,0.0,2.41,25361,9.4,11.3,6.6,46.8,45.8
01007,Alabama,Bibb County,none,0.1,3.74,22.3,2.6,69.2,54.7,66.6,6891,0.6,25.6,79.1,47542,21958,0.1,0.0,0.0,2.99,22493,7.0,10.7,8.0,76.8,74.5
01009,Alabama,Blount County,none,0.4,3.33,1.6,9.3,73.0,63.5,70.1,20847,1.8,21.2,80.5,49358,26976,0.1,0.9,0.0,2.74,57681,3.1,10.8,7.7,95.5,86.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56037,Wyoming,Sweetwater County,none,0.8,3.32,1.2,15.9,84.0,81.9,83.4,15523,1.9,23.8,92.8,74843,36512,1.2,1.2,0.0,2.77,43521,5.7,11.3,8.6,93.4,79.6
56039,Wyoming,Teton County,partial,1.3,3.01,1.2,15.0,87.1,90.1,85.3,9019,5.7,39.6,95.2,84678,40914,0.3,7.1,0.1,2.47,23280,0.7,12.7,5.3,89.3,81.3
56041,Wyoming,Uinta County,none,0.2,3.23,0.1,9.1,89.5,80.3,84.8,7597,0.6,22.9,92.7,63403,29557,0.7,2.1,0.0,2.66,20479,5.5,11.2,7.4,93.4,87.5
56043,Wyoming,Washakie County,none,0.0,2.89,0.0,14.2,78.2,77.2,72.3,3365,1.2,23.5,89.8,54158,29327,0.9,5.5,0.0,2.34,8027,4.1,15.0,11.9,89.7,81.9


In [24]:
county_class_columns_of_interest = ['FIPStxt', 
                                    'State', 
                                    'County',
                                    'Metro2013',
                                    'RuralUrbanContinuumCode2013', 
                                    'Retirement_Destination_2015_Update', 
                                    'Metro_Adjacent2013']

county_class = pd.read_csv(
    '../data/02_demo_data/rural_atlas_all_counties/County Classifications.csv', 
    delimiter='\t', 
    encoding_errors = 'Ignore',
    converters = {'FIPStxt': lambda x: str(x)},
    usecols = county_class_columns_of_interest)


# Note: I added the encoding_errors = 'Ignore' as a test after getting this error:
# 'utf-8' codec can't decode byte 0xf1 in position 185518: invalid continuation byte
# This was on the second pass of importing the file; the first pass yielded no errors.


In [25]:
# Fix the county column names
county_class.columns = ['fips', 'state', 'county', 'Metro2013',
                                    'rural_urban_continuum_code_2013', 
                                    'retirement_destination_2015_update', 
                                    'metro_adjacent_2013'
                       ]

In [26]:
# Set the index to match the other columns
county_class.set_index('fips', inplace = True)

In [27]:
# Drop Puerto Rico.
county_class = county_class[county_class['state'].str.contains('PR') == False]
county_class

Unnamed: 0_level_0,state,county,Metro2013,rural_urban_continuum_code_2013,retirement_destination_2015_update,metro_adjacent_2013
fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
01001,AL,Autauga,2.0,1.0,1.0,0.0
01003,AL,Baldwin,3.0,1.0,1.0,0.0
01005,AL,Barbour,6.0,0.0,0.0,1.0
01007,AL,Bibb,1.0,1.0,0.0,0.0
01009,AL,Blount,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...
56037,WY,Sweetwater,5.0,0.0,0.0,0.0
56039,WY,Teton,7.0,0.0,0.0,0.0
56041,WY,Uinta,7.0,0.0,0.0,0.0
56043,WY,Washakie,7.0,0.0,0.0,0.0


In [28]:
county_class.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3147 entries, 01001 to 56045
Data columns (total 6 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   state                               3147 non-null   object 
 1   county                              3147 non-null   object 
 2   Metro2013                           3143 non-null   float64
 3   rural_urban_continuum_code_2013     3143 non-null   float64
 4   retirement_destination_2015_update  3143 non-null   float64
 5   metro_adjacent_2013                 3143 non-null   float64
dtypes: float64(4), object(2)
memory usage: 172.1+ KB


In [29]:
%who DataFrame

acs2019_income	 acs2019_poverty	 county_class	 county_complete	 


In [30]:
dfs = [acs2019_income, acs2019_poverty, county_class, county_complete]
final_demo_df = pd.concat(dfs, axis = 1)
final_demo_df

Unnamed: 0_level_0,total_hh,median_hh_income,mean_hh_income,total_families,median_family_income,mean_family_income,county,state,total_pop,pop_below_poverty,percent_below_poverty,county,state,state,county,Metro2013,rural_urban_continuum_code_2013,retirement_destination_2015_update,metro_adjacent_2013,state,county,smoking_ban_2010,asian_2019,avg_family_size_2019,black_2019,hispanic_2019,household_has_broadband_2019,household_has_computer_2019,household_has_smartphone_2019,households_2019,households_speak_limited_english_2019,housing_mobile_homes_2019,hs_grad_2019,median_household_income_2019,median_individual_income_2019,native_2019,other_single_race_2019,pac_isl_2019,persons_per_household_2019,pop_2019,unemployment_rate_2019,uninsured_2019,veterans_2019,white_2019,white_not_hispanic_2019
fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1
01001,21397.0,58731.0,75326.0,15076.0,71103.0,87094.0,Autauga County,Alabama,54922.0,8340.0,15.2,Autauga County,Alabama,AL,Autauga,2.0,1.0,1.0,0.0,Alabama,Autauga County,none,1.0,3.09,19.0,2.8,80.6,73.0,78.4,21397.0,0.7,26.7,88.5,58731.0,29725.0,0.3,0.7,0.0,2.56,55380.0,3.5,7.1,12.6,76.8,74.6
01003,80930.0,58320.0,80986.0,53467.0,75850.0,97991.0,Baldwin County,Alabama,209618.0,21704.0,10.4,Baldwin County,Alabama,AL,Baldwin,3.0,1.0,1.0,0.0,Alabama,Baldwin County,none,0.9,3.24,9.3,4.6,81.8,76.3,81.7,80930.0,1.2,24.8,90.8,58320.0,29802.0,0.8,1.1,0.0,2.59,212830.0,4.0,8.9,11.8,86.2,83.1
01005,9345.0,32525.0,47068.0,6187.0,41704.0,56374.0,Barbour County,Alabama,22417.0,6875.0,30.7,Barbour County,Alabama,AL,Barbour,6.0,0.0,0.0,1.0,Alabama,Barbour County,partial,0.5,3.01,47.6,4.4,60.5,51.9,64.2,9345.0,1.6,39.1,73.2,32525.0,17963.0,0.3,3.6,0.0,2.41,25361.0,9.4,11.3,6.6,46.8,45.8
01007,6891.0,47542.0,60182.0,4789.0,57891.0,69316.0,Bibb County,Alabama,20632.0,3740.0,18.1,Bibb County,Alabama,AL,Bibb,1.0,1.0,0.0,0.0,Alabama,Bibb County,none,0.1,3.74,22.3,2.6,69.2,54.7,66.6,6891.0,0.6,25.6,79.1,47542.0,21958.0,0.1,0.0,0.0,2.99,22493.0,7.0,10.7,8.0,76.8,74.5
01009,20847.0,49358.0,65639.0,14874.0,62295.0,76547.0,Blount County,Alabama,57108.0,7739.0,13.6,Blount County,Alabama,AL,Blount,1.0,1.0,0.0,0.0,Alabama,Blount County,none,0.4,3.33,1.6,9.3,73.0,63.5,70.1,20847.0,1.8,21.2,80.5,49358.0,26976.0,0.1,0.9,0.0,2.74,57681.0,3.1,10.8,7.7,95.5,86.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
02010,,,,,,,,,,,,,,AK,Aleutian Islands,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
02201,,,,,,,,,,,,,,AK,Prince of Wales-Outer Ketchikan,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
02232,,,,,,,,,,,,,,AK,Skagway-Hoonah-Angoon,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
02280,,,,,,,,,,,,,,AK,Wrangell-Petersburg,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


#### We discovered that five FIPS codes in the `county_class` dataframe were not in the other three. See below:

In [31]:
county_class[~county_class.index.isin(acs2019_poverty.index)]

Unnamed: 0_level_0,state,county,Metro2013,rural_urban_continuum_code_2013,retirement_destination_2015_update,metro_adjacent_2013
fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010,AK,Aleutian Islands,,,,
2201,AK,Prince of Wales-Outer Ketchikan,,,,
2232,AK,Skagway-Hoonah-Angoon,,,,
2280,AK,Wrangell-Petersburg,,,,
51515,VA,Bedford,2.0,1.0,0.0,0.0


In [32]:
county_class[~county_class.index.isin(acs2019_income.index)]

Unnamed: 0_level_0,state,county,Metro2013,rural_urban_continuum_code_2013,retirement_destination_2015_update,metro_adjacent_2013
fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010,AK,Aleutian Islands,,,,
2201,AK,Prince of Wales-Outer Ketchikan,,,,
2232,AK,Skagway-Hoonah-Angoon,,,,
2280,AK,Wrangell-Petersburg,,,,
51515,VA,Bedford,2.0,1.0,0.0,0.0


In [33]:
county_class[~county_class.index.isin(county_complete.index)]

Unnamed: 0_level_0,state,county,Metro2013,rural_urban_continuum_code_2013,retirement_destination_2015_update,metro_adjacent_2013
fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010,AK,Aleutian Islands,,,,
2201,AK,Prince of Wales-Outer Ketchikan,,,,
2232,AK,Skagway-Hoonah-Angoon,,,,
2280,AK,Wrangell-Petersburg,,,,
51515,VA,Bedford,2.0,1.0,0.0,0.0


#### Here's what happened to them:
 - 02010 in Aleutian Islands, AK, became 02016. https://www.thearda.com/archive/files/descriptions/AppendixC_RCMS.pdf
 - 02201 in Prince of Wales-Outer Ketchikan: Part of this area (Outer Ketchikan) was annexed by Ketchikan Gateway Borough (FIPS code = 02130), part was included in the new Wrangell City and borough (FIPS code = 02275), and the remainder was renamed Prince of Wales-Hyder Census Area (FIPS code = 02198). https://www.cdc.gov/nchs/data/nvss/bridged_race/county_geography_changes.pdf
 - 02232 in Skagway-Hoonah-Angoon, AK was split to create Skagway Municipality (FIPS = 02230) and Hoonah-Angoon Census Area (FIPS code = 02105). https://www.cdc.gov/nchs/data/data_acces_files/County-Geography.pdf
 - 02280 in Wrangell-Petersburg, AK, was split to create part of Wrangell City and Borough (FIPS code = 02275) and all of Petersburg Census Area (FIPS code = 02195)  https://www.cdc.gov/nchs/data/data_acces_files/County-Geography.pdf
 - 51515 in Bedford, VA, was merged (Bedford city, VA (FIPS code=51515). In 2013, Bedford City, an independent city, merged
with Bedford county (FIPS code=51019) https://www.cdc.gov/nchs/data/data_acces_files/County-Geography.pdf

#### We'll just drop those five rows from the `final_demo_df`.

In [34]:
final_demo_df.drop(index=['02010', '02201', '02232', '02280', '51515'], inplace = True)
final_demo_df

Unnamed: 0_level_0,total_hh,median_hh_income,mean_hh_income,total_families,median_family_income,mean_family_income,county,state,total_pop,pop_below_poverty,percent_below_poverty,county,state,state,county,Metro2013,rural_urban_continuum_code_2013,retirement_destination_2015_update,metro_adjacent_2013,state,county,smoking_ban_2010,asian_2019,avg_family_size_2019,black_2019,hispanic_2019,household_has_broadband_2019,household_has_computer_2019,household_has_smartphone_2019,households_2019,households_speak_limited_english_2019,housing_mobile_homes_2019,hs_grad_2019,median_household_income_2019,median_individual_income_2019,native_2019,other_single_race_2019,pac_isl_2019,persons_per_household_2019,pop_2019,unemployment_rate_2019,uninsured_2019,veterans_2019,white_2019,white_not_hispanic_2019
fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1
01001,21397.0,58731.0,75326.0,15076.0,71103.0,87094.0,Autauga County,Alabama,54922.0,8340.0,15.2,Autauga County,Alabama,AL,Autauga,2.0,1.0,1.0,0.0,Alabama,Autauga County,none,1.0,3.09,19.0,2.8,80.6,73.0,78.4,21397.0,0.7,26.7,88.5,58731.0,29725.0,0.3,0.7,0.0,2.56,55380.0,3.5,7.1,12.6,76.8,74.6
01003,80930.0,58320.0,80986.0,53467.0,75850.0,97991.0,Baldwin County,Alabama,209618.0,21704.0,10.4,Baldwin County,Alabama,AL,Baldwin,3.0,1.0,1.0,0.0,Alabama,Baldwin County,none,0.9,3.24,9.3,4.6,81.8,76.3,81.7,80930.0,1.2,24.8,90.8,58320.0,29802.0,0.8,1.1,0.0,2.59,212830.0,4.0,8.9,11.8,86.2,83.1
01005,9345.0,32525.0,47068.0,6187.0,41704.0,56374.0,Barbour County,Alabama,22417.0,6875.0,30.7,Barbour County,Alabama,AL,Barbour,6.0,0.0,0.0,1.0,Alabama,Barbour County,partial,0.5,3.01,47.6,4.4,60.5,51.9,64.2,9345.0,1.6,39.1,73.2,32525.0,17963.0,0.3,3.6,0.0,2.41,25361.0,9.4,11.3,6.6,46.8,45.8
01007,6891.0,47542.0,60182.0,4789.0,57891.0,69316.0,Bibb County,Alabama,20632.0,3740.0,18.1,Bibb County,Alabama,AL,Bibb,1.0,1.0,0.0,0.0,Alabama,Bibb County,none,0.1,3.74,22.3,2.6,69.2,54.7,66.6,6891.0,0.6,25.6,79.1,47542.0,21958.0,0.1,0.0,0.0,2.99,22493.0,7.0,10.7,8.0,76.8,74.5
01009,20847.0,49358.0,65639.0,14874.0,62295.0,76547.0,Blount County,Alabama,57108.0,7739.0,13.6,Blount County,Alabama,AL,Blount,1.0,1.0,0.0,0.0,Alabama,Blount County,none,0.4,3.33,1.6,9.3,73.0,63.5,70.1,20847.0,1.8,21.2,80.5,49358.0,26976.0,0.1,0.9,0.0,2.74,57681.0,3.1,10.8,7.7,95.5,86.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56037,15523.0,74843.0,85346.0,10794.0,87906.0,97459.0,Sweetwater County,Wyoming,42958.0,4921.0,11.5,Sweetwater County,Wyoming,WY,Sweetwater,5.0,0.0,0.0,0.0,Wyoming,Sweetwater County,none,0.8,3.32,1.2,15.9,84.0,81.9,83.4,15523.0,1.9,23.8,92.8,74843.0,36512.0,1.2,1.2,0.0,2.77,43521.0,5.7,11.3,8.6,93.4,79.6
56039,9019.0,84678.0,132531.0,5190.0,110667.0,172024.0,Teton County,Wyoming,23188.0,1426.0,6.1,Teton County,Wyoming,WY,Teton,7.0,0.0,0.0,0.0,Wyoming,Teton County,partial,1.3,3.01,1.2,15.0,87.1,90.1,85.3,9019.0,5.7,39.6,95.2,84678.0,40914.0,0.3,7.1,0.1,2.47,23280.0,0.7,12.7,5.3,89.3,81.3
56041,7597.0,63403.0,74938.0,5313.0,77725.0,86265.0,Uinta County,Wyoming,20259.0,2298.0,11.3,Uinta County,Wyoming,WY,Uinta,7.0,0.0,0.0,0.0,Wyoming,Uinta County,none,0.2,3.23,0.1,9.1,89.5,80.3,84.8,7597.0,0.6,22.9,92.7,63403.0,29557.0,0.7,2.1,0.0,2.66,20479.0,5.5,11.2,7.4,93.4,87.5
56043,3365.0,54158.0,65496.0,2191.0,68265.0,77355.0,Washakie County,Wyoming,7853.0,825.0,10.5,Washakie County,Wyoming,WY,Washakie,7.0,0.0,0.0,0.0,Wyoming,Washakie County,none,0.0,2.89,0.0,14.2,78.2,77.2,72.3,3365.0,1.2,23.5,89.8,54158.0,29327.0,0.9,5.5,0.0,2.34,8027.0,4.1,15.0,11.9,89.7,81.9


In [None]:
final_demo_df

In [35]:
final_demo_df.to_csv('../data/02_demo_data/cleaned/final_demo.csv')

In [38]:
test_df = pd.read_csv('../data/02_demo_data/cleaned/final_demo.csv', index_col='fips')
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3142 entries, 1001 to 56045
Data columns (total 45 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   total_hh                               3142 non-null   float64
 1   median_hh_income                       3142 non-null   float64
 2   mean_hh_income                         3142 non-null   float64
 3   total_families                         3142 non-null   float64
 4   median_family_income                   3142 non-null   float64
 5   mean_family_income                     3142 non-null   float64
 6   county                                 3142 non-null   object 
 7   state                                  3142 non-null   object 
 8   total_pop                              3142 non-null   float64
 9   pop_below_poverty                      3142 non-null   float64
 10  percent_below_poverty                  3142 non-null   float64
 11  