Creation of county-level demographic data using 
1. American Community Survey data via python package CensusData (install: pip install CensusData; documentation: https://jtleider.github.io/censusdata/)
2. domestic and international flight passenger data from the US Bureau of Transportation Statistics (https://www.bts.gov/airport-rankings-2018)
3. shelter-in-place/stay-at-home order data from the New York Times (https://www.nytimes.com/interactive/2020/us/coronavirus-stay-at-home-order.html), the San Francisco Chronicle (https://www.sfchronicle.com/bayarea/article/Bay-Area-to-shelter-in-place-What-you-need-15135087.php), and the Alameda County government (https://www.acgov.org/documents/Final-Order-to-Shelter-In-Place.pdf)

Rather than using census data (the last census was in 2010), used 1 year American Community Survey estimates.  Though these are less accurate, they are the most recent estimates available from the US Census.  Also, although the 1-year ACS data is only available for populations >=65,000, this is a good limit to impose to keep the number of areas examined in check.

In [288]:
import pandas as pd
import numpy as np
import censusdata
import re
from uszipcode import SearchEngine

In [289]:
# https://jtleider.github.io/censusdata/example2.html
pd.set_option('display.expand_frame_repr', False) # the frame will be huge, don't expand
pd.set_option('display.precision', 4)

### Census/Population Data

In [290]:
# searched 2018 1-year ACS data profiles for specific fields to download using search terms like 'income', 'poverty', 'insurance', 'transportation', etc.
# DP = data profile table; explanation of table types at https://www.census.gov/programs-surveys/acs/guidance/which-data-tool/table-ids-explained.html

censusdata.search('acs1', 2018, 'label', 'transportation', 'profile')
# use fields('vars') 'DP02_0015E', 'DP03_0119PE', 'DP03_0051E', 'DP03_0021PE', 'DP05_0001E', 'DP05_0029E', 'DP03_0095PE', 'DP04_0143PE', ...

[('DP03_0021E',
  'SELECTED ECONOMIC CHARACTERISTICS',
  'Estimate!!COMMUTING TO WORK!!Workers 16 years and over!!Public transportation (excluding taxicab)'),
 ('DP03_0021PE',
  'SELECTED ECONOMIC CHARACTERISTICS',
  'Percent Estimate!!COMMUTING TO WORK!!Workers 16 years and over!!Public transportation (excluding taxicab)'),
 ('DP03_0031E',
  'SELECTED ECONOMIC CHARACTERISTICS',
  'Estimate!!OCCUPATION!!Civilian employed population 16 years and over!!Production, transportation, and material moving occupations'),
 ('DP03_0031PE',
  'SELECTED ECONOMIC CHARACTERISTICS',
  'Percent Estimate!!OCCUPATION!!Civilian employed population 16 years and over!!Production, transportation, and material moving occupations'),
 ('DP03_0038E',
  'SELECTED ECONOMIC CHARACTERISTICS',
  'Estimate!!INDUSTRY!!Civilian employed population 16 years and over!!Transportation and warehousing, and utilities'),
 ('DP03_0038PE',
  'SELECTED ECONOMIC CHARACTERISTICS',
  'Percent Estimate!!INDUSTRY!!Civilian employed po

In [291]:
fields = ['DP02_0015E', 'DP03_0033E', 'DP03_0041E', 'DP03_0042E', 'DP03_0043E', 
          'DP03_0035E', 'DP03_0037E', 'DP03_0046E', 'DP03_0119PE', 'DP03_0088E', #'DP03_0051E', 
          'DP03_0021PE', 'DP05_0001E', 'DP05_0029E', 'DP03_0095PE', 'DP04_0143PE']
counties = censusdata.download('acs1', 2018, censusdata.censusgeo([('county', '*')]), 
                               fields, tabletype = 'profile')

In [292]:
counties.describe()

Unnamed: 0,DP02_0015E,DP03_0033E,DP03_0041E,DP03_0042E,DP03_0043E,DP03_0035E,DP03_0037E,DP03_0046E,DP03_0119PE,DP03_0088E,DP03_0021PE,DP05_0001E,DP05_0029E,DP03_0095PE,DP04_0143PE
count,827.0,838.0,838.0,838.0,838.0,838.0,838.0,838.0,838.0,838.0,838.0,838.0,838.0,838.0,838.0
mean,2.5905,-2384700.0,-2366300.0,-2349200.0,-2370500.0,-2371400.0,-2368800.0,162390.0,9.3678,31799.432,-15513000.0,333710.0,51702.0,329340.0,-889150000.0
std,0.2503,48824000.0,48825000.0,48826000.0,48825000.0,48825000.0,48825000.0,291520.0,5.2102,8072.5366,123660000.0,587270.0,82760.0,582460.0,5424900.0
min,1.9,-1000000000.0,-1000000000.0,-1000000000.0,-1000000000.0,-1000000000.0,-1000000000.0,15513.0,1.3,10080.0,-1000000000.0,62607.0,6359.0,57066.0,-1000000000.0
25%,2.41,509.25,3575.2,10258.0,4073.8,4537.8,5050.8,43710.0,5.9,26713.75,0.3,95300.0,16251.0,93264.0,-888890000.0
50%,2.55,940.0,7000.0,17369.0,7155.5,8109.0,8977.0,74613.0,8.5,30535.5,0.7,159040.0,26132.0,156720.0,-888890000.0
75%,2.71,1691.5,18371.0,39902.0,16001.0,15998.0,18470.0,164980.0,11.5,35472.75,1.8,330890.0,52966.0,327890.0,-888890000.0
max,4.11,66139.0,642350.0,1031600.0,583850.0,455550.0,504940.0,5001400.0,49.2,74911.0,60.7,10106000.0,1376000.0,10035000.0,-888890000.0


In [293]:
# apparently censusdata doesn't retrieve labels with the download, so creating a dictionary with that info now
field_info = dict()
for field in fields:
    field_info[field] = censusdata.variable_info.censustable('acs1', 2018, table=re.sub("_.+$", "", field)).get(field)

for key in field_info.keys():
    print(key, "\t", field_info.get(key).get('label'))

DP02_0015E 	 Estimate!!HOUSEHOLDS BY TYPE!!Total households!!Average household size
DP03_0033E 	 Estimate!!INDUSTRY!!Civilian employed population 16 years and over!!Agriculture, forestry, fishing and hunting, and mining
DP03_0041E 	 Estimate!!INDUSTRY!!Civilian employed population 16 years and over!!Professional, scientific, and management, and administrative and waste management services
DP03_0042E 	 Estimate!!INDUSTRY!!Civilian employed population 16 years and over!!Educational services, and health care and social assistance
DP03_0043E 	 Estimate!!INDUSTRY!!Civilian employed population 16 years and over!!Arts, entertainment, and recreation, and accommodation and food services
DP03_0035E 	 Estimate!!INDUSTRY!!Civilian employed population 16 years and over!!Manufacturing
DP03_0037E 	 Estimate!!INDUSTRY!!Civilian employed population 16 years and over!!Retail trade
DP03_0046E 	 Estimate!!CLASS OF WORKER!!Civilian employed population 16 years and over
DP03_0119PE 	 Percent Estimate!!PERCE

In [294]:
# creating shorter labels...
labels = ['household_size', 'empl_agriculture', 'empl_professional', 'empl_social', 'empl_services', 'empl_manufacturing', 'empl_retail', 'employed',
          'prc_fam_poverty', 'avg_income', 'prc_public_transp', 'population', 'pop_65_plus', 'health_ins', 'avg_rent_prc_income']

In [295]:
counties.columns = labels

In [296]:
counties.describe()

Unnamed: 0,household_size,empl_agriculture,empl_professional,empl_social,empl_services,empl_manufacturing,empl_retail,employed,prc_fam_poverty,avg_income,prc_public_transp,population,pop_65_plus,health_ins,avg_rent_prc_income
count,827.0,838.0,838.0,838.0,838.0,838.0,838.0,838.0,838.0,838.0,838.0,838.0,838.0,838.0,838.0
mean,2.5905,-2384700.0,-2366300.0,-2349200.0,-2370500.0,-2371400.0,-2368800.0,162390.0,9.3678,31799.432,-15513000.0,333710.0,51702.0,329340.0,-889150000.0
std,0.2503,48824000.0,48825000.0,48826000.0,48825000.0,48825000.0,48825000.0,291520.0,5.2102,8072.5366,123660000.0,587270.0,82760.0,582460.0,5424900.0
min,1.9,-1000000000.0,-1000000000.0,-1000000000.0,-1000000000.0,-1000000000.0,-1000000000.0,15513.0,1.3,10080.0,-1000000000.0,62607.0,6359.0,57066.0,-1000000000.0
25%,2.41,509.25,3575.2,10258.0,4073.8,4537.8,5050.8,43710.0,5.9,26713.75,0.3,95300.0,16251.0,93264.0,-888890000.0
50%,2.55,940.0,7000.0,17369.0,7155.5,8109.0,8977.0,74613.0,8.5,30535.5,0.7,159040.0,26132.0,156720.0,-888890000.0
75%,2.71,1691.5,18371.0,39902.0,16001.0,15998.0,18470.0,164980.0,11.5,35472.75,1.8,330890.0,52966.0,327890.0,-888890000.0
max,4.11,66139.0,642350.0,1031600.0,583850.0,455550.0,504940.0,5001400.0,49.2,74911.0,60.7,10106000.0,1376000.0,10035000.0,-888890000.0


In [297]:
# drop avg_rent_prc_income since it doesn't seem very informative 
counties = counties.drop(columns=['avg_rent_prc_income'])

In [298]:
counties.describe()

Unnamed: 0,household_size,empl_agriculture,empl_professional,empl_social,empl_services,empl_manufacturing,empl_retail,employed,prc_fam_poverty,avg_income,prc_public_transp,population,pop_65_plus,health_ins
count,827.0,838.0,838.0,838.0,838.0,838.0,838.0,838.0,838.0,838.0,838.0,838.0,838.0,838.0
mean,2.5905,-2384700.0,-2366300.0,-2349200.0,-2370500.0,-2371400.0,-2368800.0,162390.0,9.3678,31799.432,-15513000.0,333710.0,51702.0,329340.0
std,0.2503,48824000.0,48825000.0,48826000.0,48825000.0,48825000.0,48825000.0,291520.0,5.2102,8072.5366,123660000.0,587270.0,82760.0,582460.0
min,1.9,-1000000000.0,-1000000000.0,-1000000000.0,-1000000000.0,-1000000000.0,-1000000000.0,15513.0,1.3,10080.0,-1000000000.0,62607.0,6359.0,57066.0
25%,2.41,509.25,3575.2,10258.0,4073.8,4537.8,5050.8,43710.0,5.9,26713.75,0.3,95300.0,16251.0,93264.0
50%,2.55,940.0,7000.0,17369.0,7155.5,8109.0,8977.0,74613.0,8.5,30535.5,0.7,159040.0,26132.0,156720.0
75%,2.71,1691.5,18371.0,39902.0,16001.0,15998.0,18470.0,164980.0,11.5,35472.75,1.8,330890.0,52966.0,327890.0
max,4.11,66139.0,642350.0,1031600.0,583850.0,455550.0,504940.0,5001400.0,49.2,74911.0,60.7,10106000.0,1376000.0,10035000.0


In [299]:
counties.head()

Unnamed: 0,household_size,empl_agriculture,empl_professional,empl_social,empl_services,empl_manufacturing,empl_retail,employed,prc_fam_poverty,avg_income,prc_public_transp,population,pop_65_plus,health_ins
"Morgan County, Alabama: Summary level: 050, state:01> county:103",2.56,580,6009,10431,4473,11938,5507,53742,9.9,27742,0.4,119089,20464,117677
"Kings County, California: Summary level: 050, state:06> county:031",3.15,7797,3901,11372,4707,3752,4899,52644,15.6,22628,0.5,151366,15413,136372
"Monterey County, California: Summary level: 050, state:06> county:053",3.31,30494,19232,37518,20736,12420,17109,190707,10.5,30674,1.3,435594,59491,419413
"Nevada County, California: Summary level: 050, state:06> county:057",2.37,596,7285,9199,5133,1562,4717,44505,5.1,37645,0.1,99696,27746,98472
"Shasta County, California: Summary level: 050, state:06> county:089",2.59,743,6543,17734,7999,3077,8953,69649,9.5,28144,0.8,180040,37027,178552


In [300]:
# make county, state columns from index to use for joining the airport data
county = counties.index
state = [re.sub(": Summary.+$", "", x.name) for x in county] # move "County, State" to state list
counties['county'] = [re.sub(", .+$", "", x) for x in state] # move just "County" to county list
counties['state'] = [re.sub("^.*, ", "", x) for x in state] # remove "County, " to have just "State" in state list

In [301]:
counties.head()

Unnamed: 0,household_size,empl_agriculture,empl_professional,empl_social,empl_services,empl_manufacturing,empl_retail,employed,prc_fam_poverty,avg_income,prc_public_transp,population,pop_65_plus,health_ins,county,state
"Morgan County, Alabama: Summary level: 050, state:01> county:103",2.56,580,6009,10431,4473,11938,5507,53742,9.9,27742,0.4,119089,20464,117677,Morgan County,Alabama
"Kings County, California: Summary level: 050, state:06> county:031",3.15,7797,3901,11372,4707,3752,4899,52644,15.6,22628,0.5,151366,15413,136372,Kings County,California
"Monterey County, California: Summary level: 050, state:06> county:053",3.31,30494,19232,37518,20736,12420,17109,190707,10.5,30674,1.3,435594,59491,419413,Monterey County,California
"Nevada County, California: Summary level: 050, state:06> county:057",2.37,596,7285,9199,5133,1562,4717,44505,5.1,37645,0.1,99696,27746,98472,Nevada County,California
"Shasta County, California: Summary level: 050, state:06> county:089",2.59,743,6543,17734,7999,3077,8953,69649,9.5,28144,0.8,180040,37027,178552,Shasta County,California


In [302]:
counties[counties['county'] == "New York County"]

Unnamed: 0,household_size,empl_agriculture,empl_professional,empl_social,empl_services,empl_manufacturing,empl_retail,employed,prc_fam_poverty,avg_income,prc_public_transp,population,pop_65_plus,health_ins,county,state
"New York County, New York: Summary level: 050, state:36> county:061",2.08,1113,195150,209196,93400,22918,58324,901880,12.1,74911,59.9,1628701,268834,1617657,New York County,New York


In [303]:
counties[counties['county'] == "Alameda County"]

Unnamed: 0,household_size,empl_agriculture,empl_professional,empl_social,empl_services,empl_manufacturing,empl_retail,employed,prc_fam_poverty,avg_income,prc_public_transp,population,pop_65_plus,health_ins,county,state
"Alameda County, California: Summary level: 050, state:06> county:001",2.84,4405,180270,192425,72367,86012,76344,882648,5.1,48595,15.7,1666753,230510,1657847,Alameda County,California


In [304]:
counties.shape

(838, 16)

In [305]:
fips = re.compile("^.+, state:(\d+).*county:(\d+).*$")

In [306]:
found = fips.match(str(counties.index[0]))

In [307]:
found.group(1)+found.group(2)

'01103'

In [308]:
# add column of FIPS codes for joining with area (sq. mi.) data
codes = list()
for x in counties.index:
    found = fips.match(str(x))
    codes.append(found.group(1)+found.group(2))

In [309]:
counties['FIPS'] = codes

In [310]:
counties.head()

Unnamed: 0,household_size,empl_agriculture,empl_professional,empl_social,empl_services,empl_manufacturing,empl_retail,employed,prc_fam_poverty,avg_income,prc_public_transp,population,pop_65_plus,health_ins,county,state,FIPS
"Morgan County, Alabama: Summary level: 050, state:01> county:103",2.56,580,6009,10431,4473,11938,5507,53742,9.9,27742,0.4,119089,20464,117677,Morgan County,Alabama,1103
"Kings County, California: Summary level: 050, state:06> county:031",3.15,7797,3901,11372,4707,3752,4899,52644,15.6,22628,0.5,151366,15413,136372,Kings County,California,6031
"Monterey County, California: Summary level: 050, state:06> county:053",3.31,30494,19232,37518,20736,12420,17109,190707,10.5,30674,1.3,435594,59491,419413,Monterey County,California,6053
"Nevada County, California: Summary level: 050, state:06> county:057",2.37,596,7285,9199,5133,1562,4717,44505,5.1,37645,0.1,99696,27746,98472,Nevada County,California,6057
"Shasta County, California: Summary level: 050, state:06> county:089",2.59,743,6543,17734,7999,3077,8953,69649,9.5,28144,0.8,180040,37027,178552,Shasta County,California,6089


To get land area information, ended up having to download data from archived US Census site since I couldn't figure out how to easily get this info through censusdata module.  The land area is from the 2010 census, but much less likely to have changed in the past 10 years than the population/demographics data.

https://web.archive.org/web/20150807220054/http://quickfacts.census.gov/qfd/download_data.html

data itself is available at: https://web.archive.org/web/20150821182814/http://quickfacts.census.gov/qfd/download/DataSet.txt

In [311]:
# get land area for the given counties

quickFacts = pd.read_csv("stateCounty_quickFacts.txt", dtype={'fips':str, 'LND110210':float})

In [312]:
quickFacts # only needs fips and LND110210

Unnamed: 0,fips,PST045214,PST045213,PST040210,PST120214,PST120213,POP010210,AGE135213,AGE295213,AGE775213,...,SBO415207,SBO015207,MAN450207,WTN220207,RTN130207,RTN131207,AFN120207,BPS030214,LND110210,POP060210
0,00000,318857056,316497531,308758105,3.3,2.5,308745538,6.3,23.3,14.1,...,8.3,28.8,5319456312,4174286516,3917663456,12990,613795732,1046363,3.5319e+06,87.4
1,01000,4849377,4833996,4780127,1.4,1.1,4779736,6.1,23.0,14.9,...,1.2,28.1,112858843,52252752,57344851,12364,6426342,13369,5.0645e+04,94.4
2,01001,55395,55136,54571,1.5,1.0,54571,6.1,25.4,13.5,...,0.7,31.7,0,0,598175,12003,88157,131,5.9444e+02,91.8
3,01003,200111,195443,182265,9.8,7.2,182265,5.7,22.4,18.1,...,1.3,27.3,1410273,0,2966489,17166,436955,1384,1.5898e+03,114.6
4,01005,26887,26978,27457,-2.1,-1.7,27457,5.8,21.1,15.9,...,0.0,27.0,0,0,188337,6334,0,8,8.8488e+02,31.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3190,56037,45010,45205,43806,2.7,3.2,43806,7.6,27.2,9.0,...,3.8,27.2,0,437493,898189,22843,150439,227,1.0427e+04,4.2
3191,56039,22930,22375,21294,7.7,5.1,21294,5.9,19.2,11.8,...,3.3,25.3,0,0,515644,25688,327363,145,3.9954e+03,5.3
3192,56041,20904,21031,21118,-1.0,-0.4,21118,7.6,29.5,10.5,...,2.2,15.9,0,159375,413983,20626,35497,40,2.0813e+03,10.1
3193,56043,8322,8450,8533,-2.5,-1.0,8533,5.8,24.4,19.2,...,0.0,26.9,0,12128,98308,12596,10175,4,2.2386e+03,3.8


In [313]:
quickFacts = quickFacts[["fips", "LND110210"]]

In [314]:
quickFacts.head()

Unnamed: 0,fips,LND110210
0,0,3531900.0
1,1000,50645.0
2,1001,594.44
3,1003,1589.8
4,1005,884.88


In [315]:
counties.iloc[0]['FIPS']

'01103'

In [316]:
area = list()
for x in range(counties.shape[0]):
    results = quickFacts.loc[quickFacts["fips"] == counties.iloc[x]['FIPS'], "LND110210"]
    try:
        value = float(results)
        area.append(value)
    except:
        area.append(None)

In [317]:
counties["area"] = area

In [318]:
counties.describe()

Unnamed: 0,household_size,empl_agriculture,empl_professional,empl_social,empl_services,empl_manufacturing,empl_retail,employed,prc_fam_poverty,avg_income,prc_public_transp,population,pop_65_plus,health_ins,area
count,827.0,838.0,838.0,838.0,838.0,838.0,838.0,838.0,838.0,838.0,838.0,838.0,838.0,838.0,827.0
mean,2.5905,-2384700.0,-2366300.0,-2349200.0,-2370500.0,-2371400.0,-2368800.0,162390.0,9.3678,31799.432,-15513000.0,333710.0,51702.0,329340.0,1066.8997
std,0.2503,48824000.0,48825000.0,48826000.0,48825000.0,48825000.0,48825000.0,291520.0,5.2102,8072.5366,123660000.0,587270.0,82760.0,582460.0,1816.4262
min,1.9,-1000000000.0,-1000000000.0,-1000000000.0,-1000000000.0,-1000000000.0,-1000000000.0,15513.0,1.3,10080.0,-1000000000.0,62607.0,6359.0,57066.0,15.03
25%,2.41,509.25,3575.2,10258.0,4073.8,4537.8,5050.8,43710.0,5.9,26713.75,0.3,95300.0,16251.0,93264.0,434.6
50%,2.55,940.0,7000.0,17369.0,7155.5,8109.0,8977.0,74613.0,8.5,30535.5,0.7,159040.0,26132.0,156720.0,629.0
75%,2.71,1691.5,18371.0,39902.0,16001.0,15998.0,18470.0,164980.0,11.5,35472.75,1.8,330890.0,52966.0,327890.0,909.955
max,4.11,66139.0,642350.0,1031600.0,583850.0,455550.0,504940.0,5001400.0,49.2,74911.0,60.7,10106000.0,1376000.0,10035000.0,24607.9


In [319]:
counties.loc[counties.area == None,]

Unnamed: 0,household_size,empl_agriculture,empl_professional,empl_social,empl_services,empl_manufacturing,empl_retail,employed,prc_fam_poverty,avg_income,prc_public_transp,population,pop_65_plus,health_ins,county,state,FIPS,area


In [320]:
counties[(counties.county == "New York County") | (counties.county == "Alameda County")]
# note that while Alameda County, CA and New York County, NY have very similar population sizes, Alameda County has
# vastly more land area

Unnamed: 0,household_size,empl_agriculture,empl_professional,empl_social,empl_services,empl_manufacturing,empl_retail,employed,prc_fam_poverty,avg_income,prc_public_transp,population,pop_65_plus,health_ins,county,state,FIPS,area
"Alameda County, California: Summary level: 050, state:06> county:001",2.84,4405,180270,192425,72367,86012,76344,882648,5.1,48595,15.7,1666753,230510,1657847,Alameda County,California,6001,739.02
"New York County, New York: Summary level: 050, state:36> county:061",2.08,1113,195150,209196,93400,22918,58324,901880,12.1,74911,59.9,1628701,268834,1617657,New York County,New York,36061,22.83


In [321]:
# convert full name of state in "state" column to its appropriate 2-letter abbreviation
# state abbreviation dictionary from https://gist.github.com/rogerallen/1583593
us_state_abbrev = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'American Samoa': 'AS',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'District of Columbia': 'DC',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Guam': 'GU',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Northern Mariana Islands':'MP',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Puerto Rico': 'PR',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virgin Islands': 'VI',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY'
}

In [322]:
state = list()

for x in range(counties.shape[0]):
    state.append(us_state_abbrev[counties.iloc[x]["state"]])
counties["state"] = state

In [323]:
counties[(counties.county == "New York County") | (counties.county == "Alameda County")]

Unnamed: 0,household_size,empl_agriculture,empl_professional,empl_social,empl_services,empl_manufacturing,empl_retail,employed,prc_fam_poverty,avg_income,prc_public_transp,population,pop_65_plus,health_ins,county,state,FIPS,area
"Alameda County, California: Summary level: 050, state:06> county:001",2.84,4405,180270,192425,72367,86012,76344,882648,5.1,48595,15.7,1666753,230510,1657847,Alameda County,CA,6001,739.02
"New York County, New York: Summary level: 050, state:36> county:061",2.08,1113,195150,209196,93400,22918,58324,901880,12.1,74911,59.9,1628701,268834,1617657,New York County,NY,36061,22.83


### Airport Passenger Volume Data
Add airport data for the 200-something airports from data ranking at https://www.bts.gov/airport-rankings-2018

In [324]:
airports = pd.read_csv("passengers.csv")

In [325]:
airports.describe()

Unnamed: 0,"2018 Scheduled Enplanements Domestic Passengers (000,000)",2018 Scheduled Emplanements International Passengers (000)
count,200.0,48.0
mean,3.8382,2373.6792
std,6.8996,3483.3706
min,0.145,101.9
25%,0.3492,197.25
50%,0.798,563.65
75%,3.4653,3257.125
max,45.711,16459.9


In [326]:
airports.head()

Unnamed: 0,Airport,IATA,"2018 Scheduled Enplanements Domestic Passengers (000,000)",2018 Scheduled Emplanements International Passengers (000)
0,Atlanta,ATL,45.711,6133.5
1,Chicago O'Hare,ORD,33.144,6707.4
2,Los Angeles,LAX,30.008,12570.6
3,Denver,DEN,29.883,1458.9
4,Dallas/Fort Worth,DFW,28.641,4138.7


Getting the lat/long for each airport from data file from OpenFlights: https://github.com/jpatokal/openflights and described at https://openflights.org/data.html

In [327]:
air_locs = pd.read_csv("airports.dat", header=None)

In [328]:
air_locs.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1,Goroka Airport,Goroka,Papua New Guinea,GKA,AYGA,-6.0817,145.392,5282,10,U,Pacific/Port_Moresby,airport,OurAirports
1,2,Madang Airport,Madang,Papua New Guinea,MAG,AYMD,-5.2071,145.789,20,10,U,Pacific/Port_Moresby,airport,OurAirports
2,3,Mount Hagen Kagamuga Airport,Mount Hagen,Papua New Guinea,HGU,AYMH,-5.8268,144.296,5388,10,U,Pacific/Port_Moresby,airport,OurAirports
3,4,Nadzab Airport,Nadzab,Papua New Guinea,LAE,AYNZ,-6.5698,146.726,239,10,U,Pacific/Port_Moresby,airport,OurAirports
4,5,Port Moresby Jacksons International Airport,Port Moresby,Papua New Guinea,POM,AYPY,-9.4434,147.22,146,10,U,Pacific/Port_Moresby,airport,OurAirports


In [329]:
air_locs.columns = ['openflights_id', 'name', 'city', 'country', 'IATA', 'ICAO', 'lat', 
                    'lng', 'alt', 'timezone', 'DST', 'Tz', 'type', 'source']

In [330]:
air_locs = air_locs[air_locs['country'] == 'United States']
air_locs.head()

Unnamed: 0,openflights_id,name,city,country,IATA,ICAO,lat,lng,alt,timezone,DST,Tz,type,source
3212,3411,Barter Island LRRS Airport,Barter Island,United States,BTI,PABA,70.134,-143.582,2,-9,A,America/Anchorage,airport,OurAirports
3213,3412,Wainwright Air Station,Fort Wainwright,United States,\N,PAWT,70.6134,-159.86,35,-9,A,America/Anchorage,airport,OurAirports
3214,3413,Cape Lisburne LRRS Airport,Cape Lisburne,United States,LUR,PALU,68.8751,-166.11,16,-9,A,America/Anchorage,airport,OurAirports
3215,3414,Point Lay LRRS Airport,Point Lay,United States,PIZ,PPIZ,69.7329,-163.005,22,-9,A,America/Anchorage,airport,OurAirports
3216,3415,Hilo International Airport,Hilo,United States,ITO,PHTO,19.7214,-155.048,38,-10,N,Pacific/Honolulu,airport,OurAirports


In [331]:
lat = list()
long = list()

for x in range(airports.shape[0]):
    result_lat = air_locs.loc[air_locs["IATA"] == airports.iloc[x]['IATA'], "lat"]
    result_lng = air_locs.loc[air_locs["IATA"] == airports.iloc[x]['IATA'], "lng"]
    try:
        value_lat = float(result_lat)
        lat.append(value_lat)
    except:
        lat.append(None)
        
    try:
        value_lng = float(result_lng)
        long.append(value_lng)
    except:
        long.append(None)

In [332]:
airports["lat"] = lat
airports["long"] = long

In [333]:
airports.head()

Unnamed: 0,Airport,IATA,"2018 Scheduled Enplanements Domestic Passengers (000,000)",2018 Scheduled Emplanements International Passengers (000),lat,long
0,Atlanta,ATL,45.711,6133.5,33.6367,-84.4281
1,Chicago O'Hare,ORD,33.144,6707.4,41.9786,-87.9048
2,Los Angeles,LAX,30.008,12570.6,33.9425,-118.408
3,Denver,DEN,29.883,1458.9,39.8617,-104.673
4,Dallas/Fort Worth,DFW,28.641,4138.7,32.8968,-97.038


In [334]:
airports.tail()

Unnamed: 0,Airport,IATA,"2018 Scheduled Enplanements Domestic Passengers (000,000)",2018 Scheduled Emplanements International Passengers (000),lat,long
195,Jacksonville/Camp Lejeune,OAJ,0.151,,34.8292,-77.6121
196,Latrobe,LBE,0.15,,40.2759,-79.4048
197,Minot,MOT,0.148,,48.2594,-101.28
198,Lincoln,LNK,0.146,,40.851,-96.7592
199,Concord,USA,0.145,,35.3878,-80.7091


In [335]:
# convert NaN to 0 in passenger columns

airports[["2018 Scheduled Enplanements Domestic Passengers (000,000)",
          "2018 Scheduled Emplanements International Passengers (000)"]] = airports[["2018 Scheduled Enplanements Domestic Passengers (000,000)",
          "2018 Scheduled Emplanements International Passengers (000)"]].fillna(0)


Convert the lat/long for each airport into the appropriate county, state using package uszipcode

I want calculate the number of domestic and international airline passengers that arrive within 20 miles of each county.

In [336]:
search = SearchEngine()
passengers = dict()

for x in range((airports.shape[0])):
    # first use lat, long for each airport, and search for its zipcode, and use to get all 
    # zipcodes within 20 mile radius of the airport
    result = search.by_coordinates(airports.loc[x, "lat"], airports.loc[x, "long"], 
                                   radius=20, returns=700)
    
    # get set of county, state tuples within a 20 mile radius of this airport from zipcode info
    unique = set()
    for y in range(len(result)):
        co = result[y].to_dict()["county"]
        st = result[y].to_dict()["state"]
        unique.add((co, st))
    
    # for each unique county found, either add to passengers dict with domestic and
    # international passenger count, or if already in passengers dict add the
    # dom and intl passenger counts to those already in the dictionary
    for y in unique:
        if y in passengers.keys():
            temp = passengers[y]
            temp["dom"] = temp["dom"] + (airports.loc[x, "2018 Scheduled Enplanements Domestic Passengers (000,000)"] * 1000000)
            temp["intl"] = temp["intl"] + (airports.loc[x, "2018 Scheduled Emplanements International Passengers (000)"] * 1000)
            passengers[y] = temp
        else:
            temp = dict()
            temp["dom"] = airports.loc[x, "2018 Scheduled Enplanements Domestic Passengers (000,000)"] * 1000000
            temp["intl"] = airports.loc[x, "2018 Scheduled Emplanements International Passengers (000)"] * 1000
            passengers[y] = temp

In [337]:
# convert passengers dictionary into a data frame
co = list()
st = list()
dom = list()
intl = list()

for x in passengers.keys():
    co.append(x[0])
    st.append(x[1])
    dom.append(int(passengers[x]["dom"])) # convert to integer
    intl.append(int(passengers[x]["intl"]))

ps = {"county":co, "state":st, "dom passengers":dom, "intl passengers":intl}
passengers = pd.DataFrame(ps)

In [338]:
passengers.head()

Unnamed: 0,county,state,dom passengers,intl passengers
0,Rockdale County,GA,45711000,6133500
1,Henry County,GA,45711000,6133500
2,DeKalb County,GA,45711000,6133500
3,Fulton County,GA,45711000,6133500
4,Clayton County,GA,45711000,6133500


Add the airport data to the census/county data

In [358]:
domestic = list()
international = list()

for x in range(counties.shape[0]):
    result_dom = passengers.loc[(passengers["county"] == counties.iloc[x]["county"]) & (passengers["state"] == counties.iloc[x]["state"]), "dom passengers"]
    result_int = passengers.loc[(passengers["county"] == counties.iloc[x]["county"]) & (passengers["state"] == counties.iloc[x]["state"]), "intl passengers"]
    
    if (len(result_dom) > 1 or len(result_int) > 1):
        print("unusual at x = ", x) # should only be one value for counties with any passengers
    domestic.append(result_dom.sum()) # use sum to convert empty series to 0
    international.append(result_int.sum()) 

In [361]:
counties["domestic_passengers"] = domestic
counties["intl_passengers"] = international

In [362]:
counties.head()

Unnamed: 0,household_size,empl_agriculture,empl_professional,empl_social,empl_services,empl_manufacturing,empl_retail,employed,prc_fam_poverty,avg_income,prc_public_transp,population,pop_65_plus,health_ins,county,state,FIPS,area,domestic_passengers,intl_passengers
"Morgan County, Alabama: Summary level: 050, state:01> county:103",2.56,580,6009,10431,4473,11938,5507,53742,9.9,27742,0.4,119089,20464,117677,Morgan County,AL,1103,579.34,580000,0
"Kings County, California: Summary level: 050, state:06> county:031",3.15,7797,3901,11372,4707,3752,4899,52644,15.6,22628,0.5,151366,15413,136372,Kings County,CA,6031,1389.42,0,0
"Monterey County, California: Summary level: 050, state:06> county:053",3.31,30494,19232,37518,20736,12420,17109,190707,10.5,30674,1.3,435594,59491,419413,Monterey County,CA,6053,3280.6,186000,0
"Nevada County, California: Summary level: 050, state:06> county:057",2.37,596,7285,9199,5133,1562,4717,44505,5.1,37645,0.1,99696,27746,98472,Nevada County,CA,6057,957.77,0,0
"Shasta County, California: Summary level: 050, state:06> county:089",2.59,743,6543,17734,7999,3077,8953,69649,9.5,28144,0.8,180040,37027,178552,Shasta County,CA,6089,3775.4,0,0


### Shelter-in-Place Order Data

add data about when states/counties began shelter-in-place/stay-at-hom orders.

Data was manually entered into CSV files from:
+ https://www.nytimes.com/interactive/2020/us/coronavirus-stay-at-home-order.html
+ https://www.sfchronicle.com/bayarea/article/Bay-Area-to-shelter-in-place-What-you-need-15135087.php
+ https://www.acgov.org/documents/Final-Order-to-Shelter-In-Place.pdf




In [363]:
# start with county/cities first, since more specific and earlier 
# (if there were subsequent, superceding state-level orders, like in California)

county = pd.read_csv("county_orders.csv")
state = pd.read_csv("state_orders.csv")

In [364]:
county.tail()

Unnamed: 0,state,county,city,date order started (MM/DD/YY)
15,CA,Marin County,,03/17/20
16,CA,Sonoma County,,03/17/20
17,CA,Solano County,,03/17/20
18,CA,Napa County,,03/17/20
19,CA,Contra Costa County,,03/17/20


In [122]:
state.head()

Unnamed: 0,state,date order started (MM/DD/YY)
0,AL,04/04/20
1,AK,03/28/30
2,AZ,03/31/20
3,CA,03/19/20
4,CO,03/26/20


In [123]:
st = counties.iloc[0]["state"]
co = counties.iloc[0]["county"]
co

'Morgan County'

In [124]:
counties.loc[counties["county"] == "Alameda County"]

Unnamed: 0,household_size,empl_agriculture,empl_professional,empl_social,empl_services,empl_manufacturing,empl_retail,employed,prc_fam_poverty,avg_income,prc_public_transp,population,pop_65_plus,health_ins,county,state,FIPS,area,domestic_passengers,intl_passengers
"Alameda County, California: Summary level: 050, state:06> county:001",2.84,4405,180270,192425,72367,86012,76344,882648,5.1,48595,15.7,1666753,230510,1657847,Alameda County,California,6001,739.02,6194000.0,467100.0


In [126]:
# figure out how to access everything...
st = counties.loc[counties["county"] == "Alameda County", "state"].values[0]
co = counties.loc[counties["county"] == "Alameda County", "county"].values[0]
print(str(co), "\t", str(st))

Alameda County 	 California


In [127]:
result = county.loc[(county["state"] == us_state_abbrev[st]) & (county["county"] == co), "date order started (MM/DD/YY)"].values
result

array(['03/17/20'], dtype=object)

In [128]:
orders = list()

for x in range(counties.shape[0]):
    st = counties.iloc[x]["state"]
    co = counties.iloc[x]["county"]
    
    # check if in the counties set first
    date = county.loc[(county["state"] == us_state_abbrev[st]) & (county["county"] == co), "date order started (MM/DD/YY)"].values
    if len(date) > 0:
        orders.append(date[0])
    else:
        date = state.loc[state["state"] == us_state_abbrev[st], "date order started (MM/DD/YY)"].values
        if len(date) > 0:
            orders.append(date[0])
        else:
            orders.append(None)

In [129]:
counties["order started"] = orders

In [130]:
counties.head()

Unnamed: 0,household_size,empl_agriculture,empl_professional,empl_social,empl_services,empl_manufacturing,empl_retail,employed,prc_fam_poverty,avg_income,...,population,pop_65_plus,health_ins,county,state,FIPS,area,domestic_passengers,intl_passengers,order started
"Morgan County, Alabama: Summary level: 050, state:01> county:103",2.56,580,6009,10431,4473,11938,5507,53742,9.9,27742,...,119089,20464,117677,Morgan County,Alabama,1103,579.34,0.0,0.0,04/04/20
"Kings County, California: Summary level: 050, state:06> county:031",3.15,7797,3901,11372,4707,3752,4899,52644,15.6,22628,...,151366,15413,136372,Kings County,California,6031,1389.42,0.0,0.0,03/19/20
"Monterey County, California: Summary level: 050, state:06> county:053",3.31,30494,19232,37518,20736,12420,17109,190707,10.5,30674,...,435594,59491,419413,Monterey County,California,6053,3280.6,186000.0,0.0,03/19/20
"Nevada County, California: Summary level: 050, state:06> county:057",2.37,596,7285,9199,5133,1562,4717,44505,5.1,37645,...,99696,27746,98472,Nevada County,California,6057,957.77,0.0,0.0,03/19/20
"Shasta County, California: Summary level: 050, state:06> county:089",2.59,743,6543,17734,7999,3077,8953,69649,9.5,28144,...,180040,37027,178552,Shasta County,California,6089,3775.4,0.0,0.0,03/19/20


In [131]:
counties.to_csv("counties.csv")