## PROJECT 4 TEAM 2
### Demographic Data Acquisition, Cleaning, Merging
#### Kade, Marcus, Erin

In [1]:
import pandas as pd

# Notebook display settings: never truncate the list of columns or the text
# inside a cell when a DataFrame is rendered.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None

#### American Community Survey 2019 data: 5-year estimates.
##### Start with age and sex.
- We used the metadata descriptions from this file: `ACSST5Y2019.S0101_metadata_2021-10-06T131343.csv` to choose the desired columns.

- To keep things less busy, we created a columns_of_interest list for each .csv that we read in and used that during the `pd.read_csv()` process.

In [2]:
# Map each ACS S0101 variable code to the readable column name used downstream.
# Renaming by code (instead of assigning a positional list to `.columns`) keeps
# every label attached to its own variable: `usecols` only selects columns, the
# order of the resulting frame follows the file, not the order written here.
# The labels were originally checked against the verbose description row
# (the row removed by skiprows below).
acs2019_age_sex_column_names = {
    'GEO_ID': 'geo_id',
    'NAME': 'name',
    'S0101_C01_001E': 'total_pop_age_sex',
    'S0101_C01_032E': 'median_age',
    'S0101_C01_033E': 'sex_ratio_males',
    'S0101_C02_022E': 'under_18_percent',
    'S0101_C02_026E': 'over_18_percent',
    'S0101_C02_030E': 'over_65_percent',
}

# Specify columns of interest.
acs2019_age_sex_columns_of_interest = list(acs2019_age_sex_column_names)

# Read in the data. skiprows=[1] removes the verbose first row that was acting
# like a second header; it was forcing numeric columns to be objects.
acs2019_age_sex = pd.read_csv(
    '../data/02_demo_data/ACS/ACS_2019_age_and_sex_by_county/ACSST5Y2019.S0101_data_with_overlays_2021-10-06T131343.csv',
    usecols=acs2019_age_sex_columns_of_interest,
    skiprows=[1],
).rename(columns=acs2019_age_sex_column_names)

acs2019_age_sex.head(3)

Unnamed: 0,geo_id,name,total_pop_age_sex,median_age,sex_ratio_males,under_18_percent,over_18_percent,over_65_percent
0,0500000US01001,"Autauga County, Alabama",55380,38.2,94.7,23.8,76.2,15.0
1,0500000US01003,"Baldwin County, Alabama",212830,43.0,94.7,21.7,78.3,20.0
2,0500000US01005,"Barbour County, Alabama",25361,40.4,112.4,20.9,79.1,18.6


##### Create a little function to clean up the imported dataframes.

In [3]:
def clean_acs(df):
    """Index an ACS county table by FIPS code and return it without Puerto Rico.

    Parameters
    ----------
    df : pandas.DataFrame
        ACS table with a 'geo_id' column whose last five characters are the
        county FIPS code, and a 'name' column formatted '<county>, <state>'.

    Returns
    -------
    pandas.DataFrame
        An independent copy indexed by 'fips', with 'county' and 'state'
        columns appended, 'geo_id' and 'name' removed, and every row whose
        state contains 'Puerto' excluded. Rows with a missing state are kept.

    Raises
    ------
    ValueError
        If no value in 'name' contains a comma (the split then yields a single
        column that cannot be assigned to both 'county' and 'state').

    Notes
    -----
    `df` itself is also modified in place: it gains 'county'/'state', loses
    'geo_id'/'name' and is re-indexed by 'fips' (its Puerto Rico rows stay).
    Later cells of this notebook read `.index` of the raw frames, so that side
    effect is kept. It also means the function cannot be run twice on the same
    frame.
    """
    # Split on the first comma only and trim both pieces; a bare split(',')
    # leaves a leading space on every state name (' Alabama').
    df[['county', 'state']] = (
        df['name']
        .str.split(',', n=1, expand=True)
        .apply(lambda part: part.str.strip())
    )
    df['fips'] = df['geo_id'].str[-5:]
    df.drop(columns=['geo_id', 'name'], inplace=True)
    df.set_index('fips', inplace=True)

    is_puerto_rico = df['state'].str.contains('Puerto', regex=False, na=False)
    # Return a real copy rather than a filtered slice so callers can keep
    # modifying the result (e.g. drop columns in place) without pandas raising
    # SettingWithCopyWarning.
    return df.loc[~is_puerto_rico].copy()

#### Note that we'll end up with duplicate `county` and `state` columns in each of these dataframes if we don't take care of it. We'll keep them from the first dataframe and drop the other ones after they serve their purpose for dropping Puerto Rico.

In [4]:
# Clean the age/sex table. This cleaned frame keeps its `county` and `state`
# columns; the other cleaned ACS frames drop theirs.
acs2019_age_sex_clean = clean_acs(acs2019_age_sex)
acs2019_age_sex_clean

Unnamed: 0_level_0,total_pop_age_sex,median_age,sex_ratio_males,under_18_percent,over_18_percent,over_65_percent,county,state
fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
01001,55380,38.2,94.7,23.8,76.2,15.0,Autauga County,Alabama
01003,212830,43.0,94.7,21.7,78.3,20.0,Baldwin County,Alabama
01005,25361,40.4,112.4,20.9,79.1,18.6,Barbour County,Alabama
01007,22493,40.9,117.5,20.6,79.4,15.9,Bibb County,Alabama
01009,57681,40.7,97.6,23.2,76.8,17.9,Blount County,Alabama
...,...,...,...,...,...,...,...,...
56037,43521,35.3,106.8,26.5,73.5,11.4,Sweetwater County,Wyoming
56039,23280,39.3,112.5,19.1,80.9,14.0,Teton County,Wyoming
56041,20479,35.8,103.3,29.2,70.8,13.0,Uinta County,Wyoming
56043,8027,42.9,102.6,24.6,75.4,21.1,Washakie County,Wyoming


In [5]:
# NOTE: this inspects the raw frame, not acs2019_age_sex_clean. clean_acs()
# modified it in place (FIPS index, geo_id/name dropped), but its Puerto Rico
# rows are still present.
acs2019_age_sex.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3220 entries, 01001 to 72153
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   total_pop_age_sex  3220 non-null   int64  
 1   median_age         3220 non-null   float64
 2   sex_ratio_males    3220 non-null   float64
 3   under_18_percent   3220 non-null   float64
 4   over_18_percent    3220 non-null   float64
 5   over_65_percent    3220 non-null   float64
 6   county             3220 non-null   object 
 7   state              3220 non-null   object 
dtypes: float64(5), int64(1), object(2)
memory usage: 226.4+ KB


##### Next, read in the ACS 2019 income data columns of interest. We used `ACSST5Y2019.S1901_metadata_2021-10-05T144610.csv` to choose the desired columns.

In [6]:
# Map each ACS S1901 variable code to the readable column name used downstream.
# Renaming by code (instead of assigning a positional list to `.columns`) keeps
# every label attached to its own variable: `usecols` only selects columns, the
# order of the resulting frame follows the file, not the order written here.
acs2019_income_column_names = {
    'GEO_ID': 'geo_id',
    'NAME': 'name',
    'S1901_C01_001E': 'total_hh',
    'S1901_C01_012E': 'median_hh_income',
    'S1901_C01_013E': 'mean_hh_income',
    'S1901_C02_001E': 'total_families',
    'S1901_C02_012E': 'median_family_income',
    'S1901_C02_013E': 'mean_family_income',
}

# Specify columns of interest.
acs2019_income_columns_of_interest = list(acs2019_income_column_names)

# Read in the data (skiprows=[1] skips the verbose description row) and clean
# up the naming conventions.
acs2019_income = pd.read_csv(
    '../data/02_demo_data/ACS/ACS_2019_income_by_county/ACSST5Y2019.S1901_data_with_overlays_2021-10-05T144610.csv',
    usecols=acs2019_income_columns_of_interest,
    skiprows=[1],
).rename(columns=acs2019_income_column_names)

acs2019_income.head(3)

Unnamed: 0,geo_id,name,total_hh,median_hh_income,mean_hh_income,total_families,median_family_income,mean_family_income
0,0500000US01001,"Autauga County, Alabama",21397,58731,75326,15076,71103,87094
1,0500000US01003,"Baldwin County, Alabama",80930,58320,80986,53467,75850,97991
2,0500000US01005,"Barbour County, Alabama",9345,32525,47068,6187,41704,56374


In [7]:
# Check dtypes and non-null counts of the income table before cleaning.
acs2019_income.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3220 entries, 0 to 3219
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   geo_id                3220 non-null   object
 1   name                  3220 non-null   object
 2   total_hh              3220 non-null   int64 
 3   median_hh_income      3220 non-null   int64 
 4   mean_hh_income        3220 non-null   int64 
 5   total_families        3220 non-null   int64 
 6   median_family_income  3220 non-null   int64 
 7   mean_family_income    3220 non-null   int64 
dtypes: int64(6), object(2)
memory usage: 201.4+ KB


In [8]:
# Call the dataframe cleaning function, then remove `state`/`county`: the
# age/sex frame already carries them, and they would be duplicated in the
# combined table.
# `.drop()` without inplace returns a new frame; dropping in place on the
# filtered result of clean_acs() is what raised SettingWithCopyWarning.
acs2019_income_clean = clean_acs(acs2019_income).drop(columns=['state', 'county'])
acs2019_income_clean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0_level_0,total_hh,median_hh_income,mean_hh_income,total_families,median_family_income,mean_family_income
fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
01001,21397,58731,75326,15076,71103,87094
01003,80930,58320,80986,53467,75850,97991
01005,9345,32525,47068,6187,41704,56374
01007,6891,47542,60182,4789,57891,69316
01009,20847,49358,65639,14874,62295,76547
...,...,...,...,...,...,...
56037,15523,74843,85346,10794,87906,97459
56039,9019,84678,132531,5190,110667,172024
56041,7597,63403,74938,5313,77725,86265
56043,3365,54158,65496,2191,68265,77355


#### Next, confirm that `clean_acs()` parsed off the FIPS ID and set it as the index.

In [9]:
# NOTE: this inspects the raw frame, not acs2019_income_clean. clean_acs()
# modified it in place (FIPS index, geo_id/name dropped), but its Puerto Rico
# rows and its county/state columns are still present.
acs2019_income.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3220 entries, 01001 to 72153
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   total_hh              3220 non-null   int64 
 1   median_hh_income      3220 non-null   int64 
 2   mean_hh_income        3220 non-null   int64 
 3   total_families        3220 non-null   int64 
 4   median_family_income  3220 non-null   int64 
 5   mean_family_income    3220 non-null   int64 
 6   county                3220 non-null   object
 7   state                 3220 non-null   object
dtypes: int64(6), object(2)
memory usage: 226.4+ KB


#### Then, read in the race and ethnicity by county data. We used `ACSDT5Y2019.B02001_metadata_2021-10-08T032031.csv` to identify columns of interest.

In [10]:
# Map each ACS B02001 variable code to the readable column name used downstream.
# Renaming by code (instead of assigning a positional list to `.columns`) keeps
# every label attached to its own variable: `usecols` only selects columns, the
# order of the resulting frame follows the file, not the order written here.
acs2019_race_ethnicity_column_names = {
    'GEO_ID': 'geo_id',
    'NAME': 'name',
    'B02001_001E': 'tot_pop_race_ethnicity',
    'B02001_002E': 'tot_white',
    'B02001_003E': 'tot_black_af_am',
    'B02001_004E': 'tot_aminalnat',  # American Indian and Alaska Native
    'B02001_005E': 'tot_asian',
    'B02001_006E': 'tot_hawopi',  # Hawaiian and other Pacific Islander
}

# Specify columns of interest. This list has its own name so that it no longer
# overwrites `acs2019_income_columns_of_interest` from the income section.
acs2019_race_ethnicity_columns_of_interest = list(acs2019_race_ethnicity_column_names)

# Read in the data (skiprows=[1] skips the verbose description row) and clean
# up naming conventions.
acs2019_race_ethnicity = pd.read_csv(
    '../data/02_demo_data/ACS/ACS_2019_race_and_ethnicity_by_county/ACSDT5Y2019.B02001_data_with_overlays_2021-10-08T032031.csv',
    usecols=acs2019_race_ethnicity_columns_of_interest,
    skiprows=[1],
).rename(columns=acs2019_race_ethnicity_column_names)

acs2019_race_ethnicity

Unnamed: 0,geo_id,name,tot_pop_race_ethnicity,tot_white,tot_black_af_am,tot_aminalnat,tot_asian,tot_hawopi
0,0500000US01001,"Autauga County, Alabama",55380,42527,10538,140,573,26
1,0500000US01003,"Baldwin County, Alabama",212830,183471,19718,1645,1969,9
2,0500000US01005,"Barbour County, Alabama",25361,11869,12066,82,134,1
3,0500000US01007,"Bibb County, Alabama",22493,17272,5014,30,27,0
4,0500000US01009,"Blount County, Alabama",57681,55062,928,46,212,25
...,...,...,...,...,...,...,...,...
3215,0500000US72145,"Vega Baja Municipio, Puerto Rico",52192,43427,2589,64,9,1
3216,0500000US72147,"Vieques Municipio, Puerto Rico",8642,4730,651,8,14,0
3217,0500000US72149,"Villalba Municipio, Puerto Rico",22403,12713,773,0,0,0
3218,0500000US72151,"Yabucoa Municipio, Puerto Rico",33499,6244,26418,0,11,0


In [11]:
# Clean the race table, then remove `state`/`county`: the age/sex frame already
# carries them, and they would be duplicated in the combined table.
# `.drop()` without inplace returns a new frame; dropping in place on the
# filtered result of clean_acs() is what raised SettingWithCopyWarning.
acs2019_race_ethnicity_clean = clean_acs(acs2019_race_ethnicity).drop(columns=['state', 'county'])
acs2019_race_ethnicity_clean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0_level_0,tot_pop_race_ethnicity,tot_white,tot_black_af_am,tot_aminalnat,tot_asian,tot_hawopi
fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
01001,55380,42527,10538,140,573,26
01003,212830,183471,19718,1645,1969,9
01005,25361,11869,12066,82,134,1
01007,22493,17272,5014,30,27,0
01009,57681,55062,928,46,212,25
...,...,...,...,...,...,...
56037,43521,40642,502,508,340,19
56039,23280,20785,290,77,291,29
56041,20479,19132,23,145,32,0
56043,8027,7204,3,76,0,0


### Do the same type of import and data cleanup on the ACS 2019 poverty data.

In [12]:
# Map each ACS S1701 variable code to the readable column name used downstream.
# Renaming by code (instead of assigning a positional list to `.columns`) keeps
# every label attached to its own variable: `usecols` only selects columns, the
# order of the resulting frame follows the file, not the order written here.
acs2019_poverty_column_names = {
    'GEO_ID': 'geo_id',
    'NAME': 'name',
    'S1701_C01_001E': 'total_pop_poverty',
    'S1701_C02_001E': 'pop_below_poverty',
    'S1701_C03_001E': 'percent_below_poverty',
}

# Specify columns of interest.
acs2019_poverty_columns_of_interest = list(acs2019_poverty_column_names)

# Read in the data (skiprows=[1] skips the verbose description row) and change
# the column naming conventions.
acs2019_poverty = pd.read_csv(
    '../data/02_demo_data/ACS/ACS_2019_poverty_by_county/ACSST5Y2019.S1701_data_with_overlays_2021-11-02T160432.csv',
    usecols=acs2019_poverty_columns_of_interest,
    skiprows=[1],
).rename(columns=acs2019_poverty_column_names)

acs2019_poverty.head(3)

Unnamed: 0,geo_id,name,total_pop_poverty,pop_below_poverty,percent_below_poverty
0,0500000US01001,"Autauga County, Alabama",54922,8340,15.2
1,0500000US01003,"Baldwin County, Alabama",209618,21704,10.4
2,0500000US01005,"Barbour County, Alabama",22417,6875,30.7


In [13]:
# Check dtypes and non-null counts of the poverty table before cleaning.
acs2019_poverty.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3220 entries, 0 to 3219
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   geo_id                 3220 non-null   object 
 1   name                   3220 non-null   object 
 2   total_pop_poverty      3220 non-null   int64  
 3   pop_below_poverty      3220 non-null   int64  
 4   percent_below_poverty  3220 non-null   float64
dtypes: float64(1), int64(2), object(2)
memory usage: 125.9+ KB


In [14]:
# Clean the poverty table, then remove `state`/`county`: the age/sex frame
# already carries them, and they would be duplicated in the combined table.
# `.drop()` without inplace returns a new frame; dropping in place on the
# filtered result of clean_acs() is what raised SettingWithCopyWarning.
acs2019_poverty_clean = clean_acs(acs2019_poverty).drop(columns=['state', 'county'])
acs2019_poverty_clean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0_level_0,total_pop_poverty,pop_below_poverty,percent_below_poverty
fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
01001,54922,8340,15.2
01003,209618,21704,10.4
01005,22417,6875,30.7
01007,20632,3740,18.1
01009,57108,7739,13.6
...,...,...,...
56037,42958,4921,11.5
56039,23188,1426,6.1
56041,20259,2298,11.3
56043,7853,825,10.5


#### Read in the data from OpenIntro.org. This is an organization that created a nearly full file of data variables that many people are interested in. It is likely that many of the columns will overlap with others that we've read in from ACS data but we're grabbing them anyway because occasionally they're in a better format than the ACS data. https://www.openintro.org/data/?data=county_complete

In [15]:
# Columns to keep from the OpenIntro county_complete file: identifiers first,
# then the 2010 smoking-ban flag and the 2019 measures.
county_complete_columns_of_interest = [
    'fips', 'state', 'name',
    'smoking_ban_2010',
    'asian_2019',
    'avg_family_size_2019',
    'black_2019',
    'hispanic_2019',
    'household_has_broadband_2019',
    'household_has_computer_2019',
    'household_has_smartphone_2019',
    'households_2019',
    'households_speak_limited_english_2019',
    'housing_mobile_homes_2019',
    'hs_grad_2019',
    'median_household_income_2019',
    'median_individual_income_2019',
    'native_2019',
    'other_single_race_2019',
    'pac_isl_2019',
    'persons_per_household_2019',
    'pop_2019',
    'unemployment_rate_2019',
    'uninsured_2019',
    'veterans_2019',
    'white_2019',
    'white_not_hispanic_2019',
]

# Read `fips` through a string converter so it is kept as text instead of being
# parsed as a number. Got the idea for the converters from:
# https://stackoverflow.com/questions/13250046/how-to-keep-leading-zeros-in-a-column-when-reading-csv-with-pandas
county_complete = pd.read_csv(
    '../data/02_demo_data/openintro_dot_org/county_complete.csv',
    usecols=county_complete_columns_of_interest,
    converters={'fips': str},
)

county_complete.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3142 entries, 0 to 3141
Data columns (total 27 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   fips                                   3142 non-null   object 
 1   state                                  3142 non-null   object 
 2   name                                   3142 non-null   object 
 3   smoking_ban_2010                       3116 non-null   object 
 4   asian_2019                             3142 non-null   float64
 5   avg_family_size_2019                   3142 non-null   float64
 6   black_2019                             3142 non-null   float64
 7   hispanic_2019                          3142 non-null   float64
 8   household_has_broadband_2019           3142 non-null   float64
 9   household_has_computer_2019            3142 non-null   float64
 10  household_has_smartphone_2019          3142 non-null   float64
 11  hous

In [16]:
# Reshape county_complete to match the cleaned ACS frames: left-pad FIPS with
# zeros to five characters, index by it, and drop the state/county labels
# (the age/sex frame already provides them).
# Learned about zfill() here: https://stackoverflow.com/questions/42375068/python-add-a-leading-zero-to-column-with-str-and-int
# Also here: https://www.datasciencemadesimple.com/add-leading-preceding-zeros-python/
county_complete = (
    county_complete
    .rename(columns={'name': 'county'})
    .assign(fips=lambda frame: frame['fips'].str.zfill(5))
    .set_index('fips')
    .drop(columns=['state', 'county'])
)
county_complete

Unnamed: 0_level_0,smoking_ban_2010,asian_2019,avg_family_size_2019,black_2019,hispanic_2019,household_has_broadband_2019,household_has_computer_2019,household_has_smartphone_2019,households_2019,households_speak_limited_english_2019,housing_mobile_homes_2019,hs_grad_2019,median_household_income_2019,median_individual_income_2019,native_2019,other_single_race_2019,pac_isl_2019,persons_per_household_2019,pop_2019,unemployment_rate_2019,uninsured_2019,veterans_2019,white_2019,white_not_hispanic_2019
fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
01001,none,1.0,3.09,19.0,2.8,80.6,73.0,78.4,21397,0.7,26.7,88.5,58731,29725,0.3,0.7,0.0,2.56,55380,3.5,7.1,12.6,76.8,74.6
01003,none,0.9,3.24,9.3,4.6,81.8,76.3,81.7,80930,1.2,24.8,90.8,58320,29802,0.8,1.1,0.0,2.59,212830,4.0,8.9,11.8,86.2,83.1
01005,partial,0.5,3.01,47.6,4.4,60.5,51.9,64.2,9345,1.6,39.1,73.2,32525,17963,0.3,3.6,0.0,2.41,25361,9.4,11.3,6.6,46.8,45.8
01007,none,0.1,3.74,22.3,2.6,69.2,54.7,66.6,6891,0.6,25.6,79.1,47542,21958,0.1,0.0,0.0,2.99,22493,7.0,10.7,8.0,76.8,74.5
01009,none,0.4,3.33,1.6,9.3,73.0,63.5,70.1,20847,1.8,21.2,80.5,49358,26976,0.1,0.9,0.0,2.74,57681,3.1,10.8,7.7,95.5,86.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56037,none,0.8,3.32,1.2,15.9,84.0,81.9,83.4,15523,1.9,23.8,92.8,74843,36512,1.2,1.2,0.0,2.77,43521,5.7,11.3,8.6,93.4,79.6
56039,partial,1.3,3.01,1.2,15.0,87.1,90.1,85.3,9019,5.7,39.6,95.2,84678,40914,0.3,7.1,0.1,2.47,23280,0.7,12.7,5.3,89.3,81.3
56041,none,0.2,3.23,0.1,9.1,89.5,80.3,84.8,7597,0.6,22.9,92.7,63403,29557,0.7,2.1,0.0,2.66,20479,5.5,11.2,7.4,93.4,87.5
56043,none,0.0,2.89,0.0,14.2,78.2,77.2,72.3,3365,1.2,23.5,89.8,54158,29327,0.9,5.5,0.0,2.34,8027,4.1,15.0,11.9,89.7,81.9


#### Read in data about county classifications from: https://www.ers.usda.gov/data-products/atlas-of-rural-and-small-town-america/download-the-data/


In [17]:
# Columns to keep from the USDA ERS rural atlas "County Classifications" file.
county_class_columns_of_interest = ['FIPStxt',
                                    'State',
                                    'County',
                                    'Metro2013',
                                    'RuralUrbanContinuumCode2013',
                                    'Retirement_Destination_2015_Update',
                                    'Metro_Adjacent2013']

# The file is tab-delimited. FIPStxt goes through a string converter so it is
# kept as text instead of being parsed as a number.
#
# encoding_errors='ignore' was added after getting this error:
# 'utf-8' codec can't decode byte 0xf1 in position 185518: invalid continuation byte
# This was on the second pass of importing the file; the first pass yielded no errors.
# The handler name must be the lowercase 'ignore': codec error-handler names
# are case-sensitive, and 'Ignore' is not a registered handler.
# NOTE(review): 'ignore' silently discards the undecodable bytes. If the file
# is actually Latin-1/cp1252 encoded, passing `encoding=` instead would keep
# those characters - TODO confirm the file's encoding.
county_class = pd.read_csv(
    '../data/02_demo_data/rural_atlas_all_counties/County Classifications.csv',
    delimiter='\t',
    encoding_errors='ignore',
    converters={'FIPStxt': str},
    usecols=county_class_columns_of_interest)


In [18]:
# Preview the county classification table as read from the file.
county_class

Unnamed: 0,FIPStxt,State,County,RuralUrbanContinuumCode2013,Metro2013,Retirement_Destination_2015_Update,Metro_Adjacent2013
0,01001,AL,Autauga,2.0,1.0,1.0,0.0
1,01003,AL,Baldwin,3.0,1.0,1.0,0.0
2,01005,AL,Barbour,6.0,0.0,0.0,1.0
3,01007,AL,Bibb,1.0,1.0,0.0,0.0
4,01009,AL,Blount,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...
3220,72145,PR,Vega Baja,1.0,1.0,,0.0
3221,72147,PR,Vieques,7.0,0.0,,0.0
3222,72149,PR,Villalba,2.0,1.0,,0.0
3223,72151,PR,Yabucoa,1.0,1.0,,0.0


In [19]:
# Check dtypes and non-null counts before renaming and filtering.
county_class.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3225 entries, 0 to 3224
Data columns (total 7 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   FIPStxt                             3225 non-null   object 
 1   State                               3225 non-null   object 
 2   County                              3225 non-null   object 
 3   RuralUrbanContinuumCode2013         3221 non-null   float64
 4   Metro2013                           3221 non-null   float64
 5   Retirement_Destination_2015_Update  3143 non-null   float64
 6   Metro_Adjacent2013                  3221 non-null   float64
dtypes: float64(4), object(3)
memory usage: 176.5+ KB


In [20]:
# Bring county_class in line with the other frames in one chain, so that no
# step modifies a filtered slice in place (that is what raised
# SettingWithCopyWarning):
#   1. Fix the column names. Renaming by source name rather than by position
#      keeps each label on its own column whatever the column order is.
#      'Metro2013' keeps its original name.
#   2. Set the index to FIPS to match the other frames.
#   3. Drop Puerto Rico.
#   4. Drop the state and county columns as they are no longer needed and are
#      duplicates.
# Note that the state and county naming conventions don't match the other data
# set; this doesn't matter as we will merge on the FIPS index.
county_class = (
    county_class
    .rename(columns={
        'FIPStxt': 'fips',
        'State': 'state',
        'County': 'county',
        'RuralUrbanContinuumCode2013': 'rural_urban_continuum_code_2013',
        'Retirement_Destination_2015_Update': 'retirement_destination_2015_update',
        'Metro_Adjacent2013': 'metro_adjacent_2013',
    })
    .set_index('fips')
    .loc[lambda frame: ~frame['state'].str.contains('PR', na=False)]
    .drop(columns=['state', 'county'])
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [21]:
# Confirm the result: FIPS index, Puerto Rico removed, four classification columns.
county_class.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3147 entries, 01001 to 56045
Data columns (total 4 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   rural_urban_continuum_code_2013     3143 non-null   float64
 1   Metro2013                           3143 non-null   float64
 2   retirement_destination_2015_update  3143 non-null   float64
 3   metro_adjacent_2013                 3143 non-null   float64
dtypes: float64(4)
memory usage: 122.9+ KB


In [22]:
# List every DataFrame currently defined in the kernel, to pick the frames to combine.
%who DataFrame

acs2019_age_sex	 acs2019_age_sex_clean	 acs2019_income	 acs2019_income_clean	 acs2019_poverty	 acs2019_poverty_clean	 acs2019_race_ethnicity	 acs2019_race_ethnicity_clean	 county_class	 
county_complete	 


In [23]:
# Frames to combine side by side; each one is indexed by FIPS code.
# Only the age/sex frame still carries the `county` and `state` columns.
dfs = [
    acs2019_age_sex_clean,
    acs2019_income_clean,
    acs2019_race_ethnicity_clean,
    acs2019_poverty_clean,
    county_complete,
    county_class, 
]


In [24]:
# Place the frames side by side, aligned on their shared FIPS index. concat
# keeps the union of index values, so a FIPS code missing from some frames
# still gets a row, with NaN in those frames' columns.
final_demo_df = pd.concat(dfs, axis='columns')
final_demo_df

Unnamed: 0_level_0,total_pop_age_sex,median_age,sex_ratio_males,under_18_percent,over_18_percent,over_65_percent,county,state,total_hh,median_hh_income,mean_hh_income,total_families,median_family_income,mean_family_income,tot_pop_race_ethnicity,tot_white,tot_black_af_am,tot_aminalnat,tot_asian,tot_hawopi,total_pop_poverty,pop_below_poverty,percent_below_poverty,smoking_ban_2010,asian_2019,avg_family_size_2019,black_2019,hispanic_2019,household_has_broadband_2019,household_has_computer_2019,household_has_smartphone_2019,households_2019,households_speak_limited_english_2019,housing_mobile_homes_2019,hs_grad_2019,median_household_income_2019,median_individual_income_2019,native_2019,other_single_race_2019,pac_isl_2019,persons_per_household_2019,pop_2019,unemployment_rate_2019,uninsured_2019,veterans_2019,white_2019,white_not_hispanic_2019,rural_urban_continuum_code_2013,Metro2013,retirement_destination_2015_update,metro_adjacent_2013
fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1
01001,55380.0,38.2,94.7,23.8,76.2,15.0,Autauga County,Alabama,21397.0,58731.0,75326.0,15076.0,71103.0,87094.0,55380.0,42527.0,10538.0,140.0,573.0,26.0,54922.0,8340.0,15.2,none,1.0,3.09,19.0,2.8,80.6,73.0,78.4,21397.0,0.7,26.7,88.5,58731.0,29725.0,0.3,0.7,0.0,2.56,55380.0,3.5,7.1,12.6,76.8,74.6,2.0,1.0,1.0,0.0
01003,212830.0,43.0,94.7,21.7,78.3,20.0,Baldwin County,Alabama,80930.0,58320.0,80986.0,53467.0,75850.0,97991.0,212830.0,183471.0,19718.0,1645.0,1969.0,9.0,209618.0,21704.0,10.4,none,0.9,3.24,9.3,4.6,81.8,76.3,81.7,80930.0,1.2,24.8,90.8,58320.0,29802.0,0.8,1.1,0.0,2.59,212830.0,4.0,8.9,11.8,86.2,83.1,3.0,1.0,1.0,0.0
01005,25361.0,40.4,112.4,20.9,79.1,18.6,Barbour County,Alabama,9345.0,32525.0,47068.0,6187.0,41704.0,56374.0,25361.0,11869.0,12066.0,82.0,134.0,1.0,22417.0,6875.0,30.7,partial,0.5,3.01,47.6,4.4,60.5,51.9,64.2,9345.0,1.6,39.1,73.2,32525.0,17963.0,0.3,3.6,0.0,2.41,25361.0,9.4,11.3,6.6,46.8,45.8,6.0,0.0,0.0,1.0
01007,22493.0,40.9,117.5,20.6,79.4,15.9,Bibb County,Alabama,6891.0,47542.0,60182.0,4789.0,57891.0,69316.0,22493.0,17272.0,5014.0,30.0,27.0,0.0,20632.0,3740.0,18.1,none,0.1,3.74,22.3,2.6,69.2,54.7,66.6,6891.0,0.6,25.6,79.1,47542.0,21958.0,0.1,0.0,0.0,2.99,22493.0,7.0,10.7,8.0,76.8,74.5,1.0,1.0,0.0,0.0
01009,57681.0,40.7,97.6,23.2,76.8,17.9,Blount County,Alabama,20847.0,49358.0,65639.0,14874.0,62295.0,76547.0,57681.0,55062.0,928.0,46.0,212.0,25.0,57108.0,7739.0,13.6,none,0.4,3.33,1.6,9.3,73.0,63.5,70.1,20847.0,1.8,21.2,80.5,49358.0,26976.0,0.1,0.9,0.0,2.74,57681.0,3.1,10.8,7.7,95.5,86.9,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
02010,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
02201,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
02232,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
02280,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


#### We discovered that five FIPS codes in the `county_class` dataframe were not in the other three. See below:

In [25]:
# Rows of county_class whose FIPS code does not appear in the poverty table.
# Compare against the cleaned frame explicitly: the raw acs2019_poverty frame
# only has a FIPS index as a side effect of clean_acs() modifying it in place.
county_class[~county_class.index.isin(acs2019_poverty_clean.index)]

Unnamed: 0_level_0,rural_urban_continuum_code_2013,Metro2013,retirement_destination_2015_update,metro_adjacent_2013
fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010,,,,
2201,,,,
2232,,,,
2280,,,,
51515,2.0,1.0,0.0,0.0


In [26]:
# Rows of county_class whose FIPS code does not appear in the income table.
# Compare against the cleaned frame explicitly: the raw acs2019_income frame
# only has a FIPS index as a side effect of clean_acs() modifying it in place.
county_class[~county_class.index.isin(acs2019_income_clean.index)]

Unnamed: 0_level_0,rural_urban_continuum_code_2013,Metro2013,retirement_destination_2015_update,metro_adjacent_2013
fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010,,,,
2201,,,,
2232,,,,
2280,,,,
51515,2.0,1.0,0.0,0.0


In [27]:
# Rows of county_class whose FIPS code does not appear in county_complete.
county_class[~county_class.index.isin(county_complete.index)]

Unnamed: 0_level_0,rural_urban_continuum_code_2013,Metro2013,retirement_destination_2015_update,metro_adjacent_2013
fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010,,,,
2201,,,,
2232,,,,
2280,,,,
51515,2.0,1.0,0.0,0.0


#### Here's what happened to them:
 - 02010 in Aleutian Islands, AK, became 02016. https://www.thearda.com/archive/files/descriptions/AppendixC_RCMS.pdf
 - 02201 in Prince of Wales-Outer Ketchikan: Part of this area (Outer Ketchikan) was annexed by Ketchikan Gateway Borough (FIPS code = 02130), part was included in the new Wrangell City and borough (FIPS code = 02275), and the remainder was renamed Prince of Wales-Hyder Census Area (FIPS code = 02198). https://www.cdc.gov/nchs/data/nvss/bridged_race/county_geography_changes.pdf
 - 02232 in Skagway-Hoonah-Angoon, AK was split to create Skagway Municipality (FIPS = 02230) and Hoonah-Angoon Census Area (FIPS code = 02105). https://www.cdc.gov/nchs/data/data_acces_files/County-Geography.pdf
 - 02280 in Wrangell-Petersburg, AK, was split to create part of Wrangell City and Borough (FIPS code = 02275) and all of Petersburg Census Area (FIPS code = 02195)  https://www.cdc.gov/nchs/data/data_acces_files/County-Geography.pdf
 - 51515 was Bedford city, VA, an independent city. In 2013 it merged with Bedford County (FIPS code = 51019). https://www.cdc.gov/nchs/data/data_acces_files/County-Geography.pdf

#### We'll just drop those five rows from the `final_demo_df`.

In [28]:
# Remove the five FIPS codes that only county_class knows about, then reset the
# index to pop FIPS back into the dataframe as a regular column so that we can
# write it out to CSV and read it back in properly.
final_demo_df = (
    final_demo_df
    .drop(index=['02010', '02201', '02232', '02280', '51515'])
    .reset_index()
)

In [29]:
# Check row count, dtypes and non-null counts of the combined table.
final_demo_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3142 entries, 0 to 3141
Data columns (total 52 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   fips                                   3142 non-null   object 
 1   total_pop_age_sex                      3142 non-null   float64
 2   median_age                             3142 non-null   float64
 3   sex_ratio_males                        3142 non-null   float64
 4   under_18_percent                       3142 non-null   float64
 5   over_18_percent                        3142 non-null   float64
 6   over_65_percent                        3142 non-null   float64
 7   county                                 3142 non-null   object 
 8   state                                  3142 non-null   object 
 9   total_hh                               3142 non-null   float64
 10  median_hh_income                       3142 non-null   float64
 11  mean

In [30]:
# Preview the final frame before writing it out to CSV.
final_demo_df

Unnamed: 0,fips,total_pop_age_sex,median_age,sex_ratio_males,under_18_percent,over_18_percent,over_65_percent,county,state,total_hh,median_hh_income,mean_hh_income,total_families,median_family_income,mean_family_income,tot_pop_race_ethnicity,tot_white,tot_black_af_am,tot_aminalnat,tot_asian,tot_hawopi,total_pop_poverty,pop_below_poverty,percent_below_poverty,smoking_ban_2010,asian_2019,avg_family_size_2019,black_2019,hispanic_2019,household_has_broadband_2019,household_has_computer_2019,household_has_smartphone_2019,households_2019,households_speak_limited_english_2019,housing_mobile_homes_2019,hs_grad_2019,median_household_income_2019,median_individual_income_2019,native_2019,other_single_race_2019,pac_isl_2019,persons_per_household_2019,pop_2019,unemployment_rate_2019,uninsured_2019,veterans_2019,white_2019,white_not_hispanic_2019,rural_urban_continuum_code_2013,Metro2013,retirement_destination_2015_update,metro_adjacent_2013
0,01001,55380.0,38.2,94.7,23.8,76.2,15.0,Autauga County,Alabama,21397.0,58731.0,75326.0,15076.0,71103.0,87094.0,55380.0,42527.0,10538.0,140.0,573.0,26.0,54922.0,8340.0,15.2,none,1.0,3.09,19.0,2.8,80.6,73.0,78.4,21397.0,0.7,26.7,88.5,58731.0,29725.0,0.3,0.7,0.0,2.56,55380.0,3.5,7.1,12.6,76.8,74.6,2.0,1.0,1.0,0.0
1,01003,212830.0,43.0,94.7,21.7,78.3,20.0,Baldwin County,Alabama,80930.0,58320.0,80986.0,53467.0,75850.0,97991.0,212830.0,183471.0,19718.0,1645.0,1969.0,9.0,209618.0,21704.0,10.4,none,0.9,3.24,9.3,4.6,81.8,76.3,81.7,80930.0,1.2,24.8,90.8,58320.0,29802.0,0.8,1.1,0.0,2.59,212830.0,4.0,8.9,11.8,86.2,83.1,3.0,1.0,1.0,0.0
2,01005,25361.0,40.4,112.4,20.9,79.1,18.6,Barbour County,Alabama,9345.0,32525.0,47068.0,6187.0,41704.0,56374.0,25361.0,11869.0,12066.0,82.0,134.0,1.0,22417.0,6875.0,30.7,partial,0.5,3.01,47.6,4.4,60.5,51.9,64.2,9345.0,1.6,39.1,73.2,32525.0,17963.0,0.3,3.6,0.0,2.41,25361.0,9.4,11.3,6.6,46.8,45.8,6.0,0.0,0.0,1.0
3,01007,22493.0,40.9,117.5,20.6,79.4,15.9,Bibb County,Alabama,6891.0,47542.0,60182.0,4789.0,57891.0,69316.0,22493.0,17272.0,5014.0,30.0,27.0,0.0,20632.0,3740.0,18.1,none,0.1,3.74,22.3,2.6,69.2,54.7,66.6,6891.0,0.6,25.6,79.1,47542.0,21958.0,0.1,0.0,0.0,2.99,22493.0,7.0,10.7,8.0,76.8,74.5,1.0,1.0,0.0,0.0
4,01009,57681.0,40.7,97.6,23.2,76.8,17.9,Blount County,Alabama,20847.0,49358.0,65639.0,14874.0,62295.0,76547.0,57681.0,55062.0,928.0,46.0,212.0,25.0,57108.0,7739.0,13.6,none,0.4,3.33,1.6,9.3,73.0,63.5,70.1,20847.0,1.8,21.2,80.5,49358.0,26976.0,0.1,0.9,0.0,2.74,57681.0,3.1,10.8,7.7,95.5,86.9,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3137,56037,43521.0,35.3,106.8,26.5,73.5,11.4,Sweetwater County,Wyoming,15523.0,74843.0,85346.0,10794.0,87906.0,97459.0,43521.0,40642.0,502.0,508.0,340.0,19.0,42958.0,4921.0,11.5,none,0.8,3.32,1.2,15.9,84.0,81.9,83.4,15523.0,1.9,23.8,92.8,74843.0,36512.0,1.2,1.2,0.0,2.77,43521.0,5.7,11.3,8.6,93.4,79.6,5.0,0.0,0.0,0.0
3138,56039,23280.0,39.3,112.5,19.1,80.9,14.0,Teton County,Wyoming,9019.0,84678.0,132531.0,5190.0,110667.0,172024.0,23280.0,20785.0,290.0,77.0,291.0,29.0,23188.0,1426.0,6.1,partial,1.3,3.01,1.2,15.0,87.1,90.1,85.3,9019.0,5.7,39.6,95.2,84678.0,40914.0,0.3,7.1,0.1,2.47,23280.0,0.7,12.7,5.3,89.3,81.3,7.0,0.0,0.0,0.0
3139,56041,20479.0,35.8,103.3,29.2,70.8,13.0,Uinta County,Wyoming,7597.0,63403.0,74938.0,5313.0,77725.0,86265.0,20479.0,19132.0,23.0,145.0,32.0,0.0,20259.0,2298.0,11.3,none,0.2,3.23,0.1,9.1,89.5,80.3,84.8,7597.0,0.6,22.9,92.7,63403.0,29557.0,0.7,2.1,0.0,2.66,20479.0,5.5,11.2,7.4,93.4,87.5,7.0,0.0,0.0,0.0
3140,56043,8027.0,42.9,102.6,24.6,75.4,21.1,Washakie County,Wyoming,3365.0,54158.0,65496.0,2191.0,68265.0,77355.0,8027.0,7204.0,3.0,76.0,0.0,0.0,7853.0,825.0,10.5,none,0.0,2.89,0.0,14.2,78.2,77.2,72.3,3365.0,1.2,23.5,89.8,54158.0,29327.0,0.9,5.5,0.0,2.34,8027.0,4.1,15.0,11.9,89.7,81.9,7.0,0.0,0.0,0.0


In [31]:
# Persist the cleaned frame. The positional index is left out of the file
# because fips is already a regular column.
final_demo_df.to_csv(path_or_buf='../data/cleaned/final_demo.csv', index=False)

In [32]:
# Read the CSV back in to confirm that it round-trips. fips has to be read as
# an object (string) column; otherwise pandas would parse it as an integer
# and drop leading zeros such as '01001'.
test_df = pd.read_csv(
    '../data/cleaned/final_demo.csv',
    dtype={'fips': object}
)

# Check the round trip explicitly rather than relying on eyeballing the display.
assert test_df.shape == final_demo_df.shape, \
    'row/column count changed in the CSV round trip'
assert test_df['fips'].tolist() == final_demo_df['fips'].tolist(), \
    'FIPS codes changed in the CSV round trip'

test_df

Unnamed: 0,fips,total_pop_age_sex,median_age,sex_ratio_males,under_18_percent,over_18_percent,over_65_percent,county,state,total_hh,median_hh_income,mean_hh_income,total_families,median_family_income,mean_family_income,tot_pop_race_ethnicity,tot_white,tot_black_af_am,tot_aminalnat,tot_asian,tot_hawopi,total_pop_poverty,pop_below_poverty,percent_below_poverty,smoking_ban_2010,asian_2019,avg_family_size_2019,black_2019,hispanic_2019,household_has_broadband_2019,household_has_computer_2019,household_has_smartphone_2019,households_2019,households_speak_limited_english_2019,housing_mobile_homes_2019,hs_grad_2019,median_household_income_2019,median_individual_income_2019,native_2019,other_single_race_2019,pac_isl_2019,persons_per_household_2019,pop_2019,unemployment_rate_2019,uninsured_2019,veterans_2019,white_2019,white_not_hispanic_2019,rural_urban_continuum_code_2013,Metro2013,retirement_destination_2015_update,metro_adjacent_2013
0,01001,55380.0,38.2,94.7,23.8,76.2,15.0,Autauga County,Alabama,21397.0,58731.0,75326.0,15076.0,71103.0,87094.0,55380.0,42527.0,10538.0,140.0,573.0,26.0,54922.0,8340.0,15.2,none,1.0,3.09,19.0,2.8,80.6,73.0,78.4,21397.0,0.7,26.7,88.5,58731.0,29725.0,0.3,0.7,0.0,2.56,55380.0,3.5,7.1,12.6,76.8,74.6,2.0,1.0,1.0,0.0
1,01003,212830.0,43.0,94.7,21.7,78.3,20.0,Baldwin County,Alabama,80930.0,58320.0,80986.0,53467.0,75850.0,97991.0,212830.0,183471.0,19718.0,1645.0,1969.0,9.0,209618.0,21704.0,10.4,none,0.9,3.24,9.3,4.6,81.8,76.3,81.7,80930.0,1.2,24.8,90.8,58320.0,29802.0,0.8,1.1,0.0,2.59,212830.0,4.0,8.9,11.8,86.2,83.1,3.0,1.0,1.0,0.0
2,01005,25361.0,40.4,112.4,20.9,79.1,18.6,Barbour County,Alabama,9345.0,32525.0,47068.0,6187.0,41704.0,56374.0,25361.0,11869.0,12066.0,82.0,134.0,1.0,22417.0,6875.0,30.7,partial,0.5,3.01,47.6,4.4,60.5,51.9,64.2,9345.0,1.6,39.1,73.2,32525.0,17963.0,0.3,3.6,0.0,2.41,25361.0,9.4,11.3,6.6,46.8,45.8,6.0,0.0,0.0,1.0
3,01007,22493.0,40.9,117.5,20.6,79.4,15.9,Bibb County,Alabama,6891.0,47542.0,60182.0,4789.0,57891.0,69316.0,22493.0,17272.0,5014.0,30.0,27.0,0.0,20632.0,3740.0,18.1,none,0.1,3.74,22.3,2.6,69.2,54.7,66.6,6891.0,0.6,25.6,79.1,47542.0,21958.0,0.1,0.0,0.0,2.99,22493.0,7.0,10.7,8.0,76.8,74.5,1.0,1.0,0.0,0.0
4,01009,57681.0,40.7,97.6,23.2,76.8,17.9,Blount County,Alabama,20847.0,49358.0,65639.0,14874.0,62295.0,76547.0,57681.0,55062.0,928.0,46.0,212.0,25.0,57108.0,7739.0,13.6,none,0.4,3.33,1.6,9.3,73.0,63.5,70.1,20847.0,1.8,21.2,80.5,49358.0,26976.0,0.1,0.9,0.0,2.74,57681.0,3.1,10.8,7.7,95.5,86.9,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3137,56037,43521.0,35.3,106.8,26.5,73.5,11.4,Sweetwater County,Wyoming,15523.0,74843.0,85346.0,10794.0,87906.0,97459.0,43521.0,40642.0,502.0,508.0,340.0,19.0,42958.0,4921.0,11.5,none,0.8,3.32,1.2,15.9,84.0,81.9,83.4,15523.0,1.9,23.8,92.8,74843.0,36512.0,1.2,1.2,0.0,2.77,43521.0,5.7,11.3,8.6,93.4,79.6,5.0,0.0,0.0,0.0
3138,56039,23280.0,39.3,112.5,19.1,80.9,14.0,Teton County,Wyoming,9019.0,84678.0,132531.0,5190.0,110667.0,172024.0,23280.0,20785.0,290.0,77.0,291.0,29.0,23188.0,1426.0,6.1,partial,1.3,3.01,1.2,15.0,87.1,90.1,85.3,9019.0,5.7,39.6,95.2,84678.0,40914.0,0.3,7.1,0.1,2.47,23280.0,0.7,12.7,5.3,89.3,81.3,7.0,0.0,0.0,0.0
3139,56041,20479.0,35.8,103.3,29.2,70.8,13.0,Uinta County,Wyoming,7597.0,63403.0,74938.0,5313.0,77725.0,86265.0,20479.0,19132.0,23.0,145.0,32.0,0.0,20259.0,2298.0,11.3,none,0.2,3.23,0.1,9.1,89.5,80.3,84.8,7597.0,0.6,22.9,92.7,63403.0,29557.0,0.7,2.1,0.0,2.66,20479.0,5.5,11.2,7.4,93.4,87.5,7.0,0.0,0.0,0.0
3140,56043,8027.0,42.9,102.6,24.6,75.4,21.1,Washakie County,Wyoming,3365.0,54158.0,65496.0,2191.0,68265.0,77355.0,8027.0,7204.0,3.0,76.0,0.0,0.0,7853.0,825.0,10.5,none,0.0,2.89,0.0,14.2,78.2,77.2,72.3,3365.0,1.2,23.5,89.8,54158.0,29327.0,0.9,5.5,0.0,2.34,8027.0,4.1,15.0,11.9,89.7,81.9,7.0,0.0,0.0,0.0
