### Pull County-Level ACS 1-year and 5-year data using Census library and API

### 


Liz

<b>Done: </b>

Number of retired people - used the number of people age 65+ instead


*Commute* Mean commuting time is calculated by dividing the aggregate travel time to work for all workers (in minutes) by the total number of workers, 16-years old and older, who commute (ACS 5-year variables B08013_001E from table B08013 and B08012_001E from table B08012, respectively). (https://fred.stlouisfed.org/series/B080ACS027053)


*College students* The number of college students in a county is taken from the ACS total of enrolled undergraduate students


*Rental vs. Owner Occupied*
total vacant housing units - many are n/a


https://www.socialexplorer.com/data/ACS2015_5yr/metadata/?ds=ACS15_5yr&var=B25056001

crime rate - table 10 contains data for 2018 and seems to miss a few counties https://ucr.fbi.gov/crime-in-the-u.s/2018/crime-in-the-u.s.-2018/topic-pages/property-crime 

historical crime data https://www.fbi.gov/services/cjis/ucr/publications



In [None]:
#Note 1-year data is only available for counties with population of 65K or greater. 

#Kim
# All set - FEMA data - Environmental factors (flooding) - disaster locations?/type of disaster - fires - geolocation of wildfires

#Kim 
# Political affiliations
# weather data - creative commons - have min, max, avg temp
# rainfall/snow temperature - have precip - not specifically snow


#Used this notebook as inspiration: 
#https://github.com/BuzzFeedNews/2020-02-gentrification/blob/master/notebooks/01-download-census-data.ipynb. 

# County shape file https://data.ca.gov/dataset/ca-geographic-boundaries/resource/b0007416-a325-4777-9295-368ea6b710e6
# County code to name mapping website
# https://www.census.gov/geographies/reference-files/2017/demo/popest/2017-fips.html

In [None]:
#installations
!pip install census
!pip install us

Collecting census
  Downloading census-0.8.18-py3-none-any.whl (11 kB)
Installing collected packages: census
Successfully installed census-0.8.18
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.[0m
Collecting us
  Downloading us-2.0.2.tar.gz (14 kB)
Collecting jellyfish==0.6.1
  Downloading jellyfish-0.6.1.tar.gz (132 kB)
[K     |████████████████████████████████| 132 kB 19.1 MB/s 
[?25hBuilding wheels for collected packages: us, jellyfish
  Building wheel for us (setup.py) ... [?25ldone
[?25h  Created wheel for us: filename=us-2.0.2-py3-none-any.whl size=11942 sha256=82a25950c4356e8f69a7705a40b34fd5690be3f625397aa03afb10f6dece0ece
  Stored in directory: /root/.cache/pip/wheels/1a/93/5b/98d3861ec2c4a9d90b16324c6f8d7e4db03e6a830bc993adbb
  Building wheel for jellyfish (setup.py) ... [?25ldone
[?25h  Created wheel for jellyfish: filename=jellyfish-0.6.1-cp39-cp39-linux_x86_64.whl size=25451 sha256=3f9f79e6b20bff5b8c7f03afb670d47ea5

In [None]:
# Dependencies

import os

import numpy as np
import pandas as pd
import requests
from census import Census
from us import states

### Pull Census 1-year data and select variables

In [None]:
# Census API client used by every data pull in this notebook.
# The API key is read from the CENSUS_API_KEY environment variable so that the secret
# is not stored in the notebook. A key can be obtained at
# https://api.census.gov/data/key_signup.html
census_api_key = os.environ.get('CENSUS_API_KEY')
if not census_api_key:
    raise RuntimeError(
        "Set the CENSUS_API_KEY environment variable to a Census API key "
        "(sign up at https://api.census.gov/data/key_signup.html)."
    )
c = Census(census_api_key)


In [None]:
# ACS variables requested for every county.
# List of variables available at https://api.census.gov/data/2019/acs/acs1/variables.html
categories = [
     'NAME', # county name
     'B01001_001E', # Total population
     'B15002_001E', # Total population 25 and over
     'B09020_001E', # Total population 65 +
     'B17001_002E', # Total poverty status - income in past 12 months below poverty level
     'B19013_001E', # Median income
     'B25111_001E', # Median gross rent
     'B25077_001E', # Median home value
     'B15011_001E', # Total population age 25+ years with a bachelor's degree or higher
     'B03002_003E', # Not Hispanic or Latino!!White alone
     'B03002_004E', # Not Hispanic or Latino!!Black or African American alone
     # NOTE(review): the next variable comes from table B02001, not B03002 like the other
     # race rows, so it is presumably not restricted to "Not Hispanic or Latino" - confirm
     # that this is intended.
     'B02001_004E', # American Indian and Alaska Native alone
     'B03002_006E', # Not Hispanic or Latino!!Asian alone
     'B03002_007E', # Not Hispanic or Latino!!Native Hawaiian and Other Pacific Islander alone
     'B03002_008E', # Not Hispanic or Latino!!Some other race alone
     'B03002_009E', # Not Hispanic or Latino!!Two or more races
     'B03002_012E', # Hispanic or Latino
     'B08013_001E', # Total (aggregate) commute time
     'B08012_001E', # Number of 16+ workers commuting
     'B25034_001E', # Total housing units
     'B25014_002E', # Total owner occupied
     'B25014_008E', # Total renter occupied
     'B25004_001E', # Total vacant units
     'B14001_008E', # College enrollment (undergraduate)
     'B27011_008E', # In labor force, unemployed
]


In [None]:
# https://pypi.org/project/census/ lists the geography levels available.
#
# Spot check: pull a single variable for every county from the ACS 5-year data.
# This is useful for seeing how much data we have for a particular variable.
acs_5yr_check = c.acs5.state_county(
    ['B27011_008E'],  # in labor force, unemployed
    Census.ALL,       # all states
    Census.ALL,       # all counties
    year=2019,
)
pd.DataFrame(acs_5yr_check)

Unnamed: 0,B27011_008E,state,county
0,461.0,17,051
1,604.0,17,107
2,668.0,17,165
3,16759.0,17,097
4,214.0,17,127
...,...,...,...
3215,261.0,47,033
3216,164.0,47,095
3217,9059.0,47,093
3218,3943.0,53,005


In [None]:
# Readable column names for the raw ACS variable codes requested in `categories`.
rename_dict = {
    'NAME': 'name',
    'B01001_001E': "total_population",
    'B15002_001E': "total_population_25_over",
    'B09020_001E': "total_population_65_over",
    'B17001_002E': "total_poverty",
    'B19013_001E': "median_income",
    'B25111_001E': "median_rent",
    'B25077_001E': "median_home_value",
    'B15011_001E': "bachelors_degree",
    'B03002_003E': "white",
    'B03002_004E': "black",
    'B02001_004E': "american_indian",
    'B03002_006E': "asian_alone",
    'B03002_007E': "hawaiian",
    'B03002_008E': "some_other_race_alone",
    'B03002_009E': "two_more_races",
    'B03002_012E': "hispanic_or_latino",
    'B08013_001E': "total_commute_time",
    'B08012_001E': "number_workers_commuting",
    'B25014_002E': 'total_owner_occupied',
    'B25014_008E': 'total_renter_occupied',
    'B25034_001E': 'total_housing_units',
    'B25004_001E': 'total_vacant_units',
    'B14001_008E': 'total_enrolled_undergraduate',
    'B27011_008E': 'total_unemployed',
}

# Pull the ACS 5-year county-level data for each year and tag every row with its year.
# The yearly frames are collected in a list and concatenated once: growing a DataFrame
# with pd.concat inside the loop re-copies all previously collected rows on every pass.
# A plain for-loop is kept on purpose so that `timeperiod` still holds the last year
# pulled once the loop has finished.
frames_5yr = []
for timeperiod in range(2014, 2020):  # 2014 through 2019
    acs_5yr = c.acs5.state_county(
        categories,
        Census.ALL,
        Census.ALL,
        year=timeperiod,
    )
    df = pd.DataFrame(acs_5yr).rename(columns=rename_dict)
    df['year'] = timeperiod
    frames_5yr.append(df)

all_df = pd.concat(frames_5yr, ignore_index=True)

# Untouched copy of the 5-year data, kept as the reference for later cells.
all_df_2 = all_df.copy()

In [None]:
# Peek at the raw API response (a list with one dict per county) for the most recent year.
# The year is spelled out instead of reusing a loop variable left over from another cell,
# and only the first two records are displayed rather than the whole response.
c.acs5.state_county(
    categories,
    Census.ALL,
    Census.ALL,
    year=2019,
)[:2]

[{'NAME': 'Fayette County, Illinois',
  'B01001_001E': 21565.0,
  'B15002_001E': 15303.0,
  'B09020_001E': 4058.0,
  'B17001_002E': 3421.0,
  'B19013_001E': 46650.0,
  'B25111_001E': 612.0,
  'B25077_001E': 88300.0,
  'B15011_001E': 1727.0,
  'B03002_003E': 19868.0,
  'B03002_004E': 1007.0,
  'B02001_004E': 68.0,
  'B03002_006E': 116.0,
  'B03002_007E': 28.0,
  'B03002_008E': 0.0,
  'B03002_009E': 75.0,
  'B03002_012E': 403.0,
  'B08013_001E': 187345.0,
  'B08012_001E': 8450.0,
  'B25034_001E': 9315.0,
  'B25014_002E': 6228.0,
  'B25014_008E': 1509.0,
  'B25004_001E': 1578.0,
  'B14001_008E': 808.0,
  'B27011_008E': 461.0,
  'state': '17',
  'county': '051'},
 {'NAME': 'Logan County, Illinois',
  'B01001_001E': 29003.0,
  'B15002_001E': 20373.0,
  'B09020_001E': 5224.0,
  'B17001_002E': 2323.0,
  'B19013_001E': 57308.0,
  'B25111_001E': 689.0,
  'B25077_001E': 103200.0,
  'B15011_001E': 4059.0,
  'B03002_003E': 25049.0,
  'B03002_004E': 1984.0,
  'B02001_004E': 73.0,
  'B03002_006E': 2

In [None]:
# About a quarter of the counties are represented in the 1-year data.

# Readable column names for the raw ACS variable codes requested in `categories`.
rename_dict = {
    'NAME': 'name',
    'B01001_001E': "total_population",
    'B15002_001E': "total_population_25_over",
    'B09020_001E': "total_population_65_over",
    'B17001_002E': "total_poverty",
    'B19013_001E': "median_income",
    'B25111_001E': "median_rent",
    'B25077_001E': "median_home_value",
    'B15011_001E': "bachelors_degree",
    'B03002_003E': "white",
    'B03002_004E': "black",
    'B02001_004E': "american_indian",
    'B03002_006E': "asian_alone",
    'B03002_007E': "hawaiian",
    'B03002_008E': "some_other_race_alone",
    'B03002_009E': "two_more_races",
    'B03002_012E': "hispanic_or_latino",
    'B08013_001E': "total_commute_time",
    'B08012_001E': "number_workers_commuting",
    'B25014_002E': 'total_owner_occupied',
    'B25014_008E': 'total_renter_occupied',
    'B25034_001E': 'total_housing_units',
    'B25004_001E': 'total_vacant_units',
    'B14001_008E': 'total_enrolled_undergraduate',
    'B27011_008E': 'total_unemployed',
}

# Pull the ACS 1-year county-level data for each year and tag every row with its year.
# The yearly frames are collected in a list and concatenated once: growing a DataFrame
# with pd.concat inside the loop re-copies all previously collected rows on every pass.
frames_1yr = []
for timeperiod in range(2014, 2020):  # 2014 through 2019
    acs_1yr = c.acs1.state_county(
        categories,
        Census.ALL,
        Census.ALL,
        year=timeperiod,
    )
    df = pd.DataFrame(acs_1yr).rename(columns=rename_dict)
    df['year'] = timeperiod
    frames_1yr.append(df)

df_1yr = pd.concat(frames_1yr, ignore_index=True)

In [None]:
# Combine the two pulls: use the 1-year value wherever one exists, otherwise keep the
# 5-year value for the same state / county / year.
index_cols = ['state', 'county', 'year']

# df_1yr stays indexed by state/county/year after this cell.
df_1yr = df_1yr.set_index(index_cols)
acs5_indexed = all_df.set_index(index_cols)

# -666666666 is the code ACS uses for missing data. Turn it into NaN in the 1-year data
# first; left as a number it would count as a real value, overwrite a valid 5-year
# estimate and never be back-filled.
acs1_clean = df_1yr.replace(-666666666, np.nan)

# combine_first keeps every non-missing 1-year cell and falls back to the 5-year cell
# otherwise. The reindex restores the row and column order of the 5-year table.
all_df = (
    acs1_clean.combine_first(acs5_indexed)
    .reindex(index=acs5_indexed.index, columns=acs5_indexed.columns)
    .reset_index()
)

In [None]:
# Preview the first rows of the combined 1-year / 5-year table.
all_df.head() 

Unnamed: 0,state,county,year,name,total_population,total_population_25_over,total_population_65_over,total_poverty,median_income,median_rent,...,two_more_races,hispanic_or_latino,total_commute_time,number_workers_commuting,total_housing_units,total_owner_occupied,total_renter_occupied,total_vacant_units,total_enrolled_undergraduate,total_unemployed
0,31,117,2014,"McPherson County, Nebraska",426.0,324.0,89.0,49.0,57763.0,657.0,...,4.0,4.0,4705.0,164.0,244.0,129.0,59.0,56.0,3.0,6.0
1,31,33,2014,"Cheyenne County, Nebraska",10044.0,6948.0,1617.0,1340.0,54094.0,590.0,...,132.0,636.0,65545.0,5216.0,4897.0,2932.0,1450.0,515.0,422.0,578.0
2,31,47,2014,"Dawson County, Nebraska",24205.0,15372.0,3384.0,3304.0,48104.0,648.0,...,147.0,7892.0,184225.0,11490.0,10135.0,5970.0,2761.0,1404.0,680.0,1901.0
3,31,71,2014,"Garfield County, Nebraska",1954.0,1425.0,523.0,229.0,41776.0,334.0,...,11.0,17.0,8425.0,780.0,1171.0,660.0,210.0,301.0,11.0,167.0
4,31,125,2014,"Nance County, Nebraska",3667.0,2586.0,717.0,409.0,46220.0,532.0,...,5.0,85.0,35540.0,1641.0,1820.0,1187.0,361.0,272.0,82.0,227.0


### Normalize data


In [None]:
# ACS codes some missing data as -666666666. Convert it to NaN before deriving any ratio:
# a ratio computed from the sentinel is a meaningless number that no longer equals the
# sentinel, so it could not be recognised as missing afterwards.
all_df = all_df.replace(-666666666, np.nan)

# Share (0-1) of people age 25+ with a bachelor's degree or higher
all_df['educational_attainment'] = all_df['bachelors_degree'] / all_df['total_population_25_over']

# Average commute time in minutes
all_df['av_commute_time'] = all_df['total_commute_time'] / all_df['number_workers_commuting']

# Raw count column -> derived column expressed as a share (0-1) of total population
share_of_population = {
    'total_poverty': 'perc_poverty',
    'white': 'perc_white',
    'black': 'perc_black',
    'american_indian': 'perc_american_indian',
    'asian_alone': 'perc_asian',
    'hawaiian': 'perc_hawaiian',
    'some_other_race_alone': 'perc_other_race',
    'hispanic_or_latino': 'perc_hispanic',
    'total_population_65_over': 'perc_65_over',
    'total_enrolled_undergraduate': 'perc_enrolled_undergrad',
    'total_unemployed': 'perc_unemployed',
}
for count_col, share_col in share_of_population.items():
    all_df[share_col] = all_df[count_col] / all_df['total_population']

# Housing units per resident
all_df['housing_per_capita'] = all_df['total_housing_units'] / all_df['total_population']

# Raw count column -> derived column expressed as a share (0-1) of all housing units
share_of_housing = {
    'total_owner_occupied': 'perc_owner',
    'total_renter_occupied': 'perc_renter',
    'total_vacant_units': 'perc_vacant',
}
for count_col, share_col in share_of_housing.items():
    all_df[share_col] = all_df[count_col] / all_df['total_housing_units']

# Drop the raw count columns now that the normalized columns exist.
# Note that 'two_more_races' is dropped without a derived share column.
all_df = all_df.drop(columns=[
    'total_population_25_over', 'total_poverty', 'bachelors_degree', 'white',
    'black', 'american_indian', 'asian_alone', 'hawaiian',
    'some_other_race_alone', 'two_more_races', 'hispanic_or_latino',
    'total_population_65_over', 'total_enrolled_undergraduate',
    'total_owner_occupied', 'total_renter_occupied', 'total_vacant_units',
    'total_commute_time', 'number_workers_commuting', 'total_housing_units',
    'total_unemployed',
])


### Dealing with missing values

I considered replacing missing values in a column with the mean value for that county. However, this is a form of data leakage, so we should find a better way to deal with missing values, or at least apply this imputation only after we split the data into training and test sets.

In [None]:
# Some ACS missing data is coded as -666666666, so we convert it to np.nan.
all_df = all_df.replace(-666666666, np.nan)

# Alternative considered: group by county and fill missing values with the county mean, e.g.
#   all_df.groupby("name").transform(lambda x: x.fillna(x.mean()))
# Note: this is a form of data leakage - we may want to employ a similar strategy
# AFTER we split the data.


In [None]:
# Columns whose gaps are filled from the same county's other years
cols = ['median_income', 'median_rent', 'median_home_value', 'av_commute_time',
        'perc_poverty', 'perc_unemployed']
county_keys = ['state', 'county']

# Forward-fill within each county: a missing year takes the most recent earlier value.
# Rows are sorted by year for the fill so that it always runs chronologically, whatever
# the current row order is. The result is assigned back by index label, so the row
# order of all_df itself does not change.
all_df[cols] = all_df.sort_values('year').groupby(county_keys)[cols].ffill()

# Back-fill what is still missing, i.e. gaps at the start of a county's series that
# have no earlier year to copy from.
all_df[cols] = all_df.sort_values('year').groupby(county_keys)[cols].bfill()



In [None]:
# Only one county is missing this information for all 6 years.
# Interesting read about the county: https://en.wikipedia.org/wiki/Kalawao_County,_Hawaii
no_home_value = all_df['median_home_value'].isna()
print(all_df.loc[no_home_value, 'name'].iloc[0])

# Number of missing values left in each column
all_df.isna().sum()

Kalawao County, Hawaii


state                      0
county                     0
year                       0
name                       0
total_population           0
median_income              0
median_rent                0
median_home_value          6
educational_attainment     0
av_commute_time            0
perc_poverty               0
perc_white                 0
perc_black                 0
perc_american_indian       0
perc_asian                 0
perc_hawaiian              0
perc_other_race            0
perc_hispanic              0
perc_65_over               0
perc_enrolled_undergrad    0
perc_unemployed            0
housing_per_capita         0
perc_owner                 0
perc_renter                0
perc_vacant                0
dtype: int64

In [None]:
# Export the cleaned county-level ACS table.
# NOTE(review): no index=False is passed here, so the DataFrame index is written as an
# unnamed first column, unlike the us_counties.csv export - confirm readers expect that.
all_df.to_csv('/work/cleaned-csvs/acs.csv')

### Check assumptions

A large percent (TODO: find out exactly how much) of the white and black data was missing for the 1-year data pull. To deal with this, we replaced the missing 1-year data with the 5-year data for that same county and year. To see if this was a reasonable assumption, we compared the 1-year and 5-year white counts for the counties where both were available. As the table below shows, the largest difference displayed is about 3 percent of the county population (the table is sorted by signed difference, so only the largest positive differences appear at the top). In other words, by replacing the missing count of white for a county with the available 5-year data, we accept an error of up to about 3% of the population.

In [None]:
# Index the 5-year reference copy by state / county / year so it can be joined on its index.
all_df_2 = all_df_2.set_index(['state', 'county', 'year'])

In [None]:
# Compare the 5-year (suffix _x, from all_df_2) and 1-year (suffix _y, from df_1yr)
# values for every state / county / year present in both pulls.
test = pd.merge(all_df_2, df_1yr, how='inner', left_index=True, right_index=True)

# Build the comparison table with .assign: adding a column to a column-subset slice
# in place triggers pandas' SettingWithCopyWarning.
test = test[['total_population_x', 'total_population_y', 'white_x', 'white_y']].assign(
    # difference in the white count, as a share of the 5-year total population
    dif_white=lambda t: (t['white_x'] - t['white_y']) / t['total_population_x']
)

test.sort_values(by=['dif_white'], ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_population_x,total_population_y,white_x,white_y,dif_white
state,county,year,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
12,005,2019,182161.0,174705.0,138654.0,132599.0,0.033240
06,007,2019,225817.0,219186.0,162537.0,155136.0,0.032774
36,045,2016,117966.0,114006.0,97364.0,93531.0,0.032492
13,179,2015,64427.0,62467.0,27348.0,25460.0,0.029304
34,035,2019,329838.0,328934.0,185677.0,176225.0,0.028656
...,...,...,...,...,...,...,...
47,187,2019,225389.0,238412.0,190642.0,,
47,059,2019,68834.0,69069.0,64048.0,,
47,141,2019,77447.0,80245.0,68481.0,,
47,155,2019,97068.0,98250.0,87347.0,,


In [None]:
# Turn state / county / year back into regular columns of the 5-year reference copy.
all_df_2 = all_df_2.reset_index()

### Create a mapper from county/state name to county/state fips code

In [None]:

# One row per distinct (state FIPS, county FIPS, name) combination seen in the 5-year data,
# ordered by those keys.
county_names = (
    all_df_2[['state', 'county', 'name']]
    .drop_duplicates()
    .sort_values(['state', 'county', 'name'])
    .reset_index(drop=True)
)

# 'name' looks like "<county>, <state>"; split it into its two parts.
name_parts = county_names['name'].str.split(', ')

mapper = pd.DataFrame({
    'state': county_names['state'],
    'county': county_names['county'],
    # Strip only a trailing " county" / " parish". Cutting a fixed number of characters
    # mangles names with any other ending (e.g. "alexandria city" -> "alexandr").
    'county_name': name_parts.str[0].str.lower().str.replace(r' (?:county|parish)$', '', regex=True),
    'state_name': name_parts.str[1].str.lower(),
})

mapper.to_csv('/work/cleaned-csvs/us_counties.csv', index=False)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=f6c76417-5fde-42f3-8920-755838dec3fa' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>