# Download census tract data from API

  - Available data: https://api.census.gov/data.html
  - Variables (aka, fields) you can query for: https://api.census.gov/data/2017/acs/acs5/profile/variables.html
  - Browse variables for a place here: https://www.census.gov/acs/www/data/data-tables-and-tools/data-profiles/2014/
  - Sample query: https://api.census.gov/data/2017/acs/acs5/profile?get=DP05_0001E&for=tract:400100&in=state:06+county:001
  - County FIPS codes: https://www.nrcs.usda.gov/wps/portal/nrcs/detail/?cid=nrcs143_013697
  
#### Variables name format

More info: https://www.census.gov/data/developers/data-sets/acs-5year/data-notes.html

variable name format: [TableID]_[RowNumber][VariableType]

Example: Variable DP02_0002PE, "Family households (families)", represents the percent estimate for table DP02 row number 2.

DP (Data Profile): Table type containing broad social, economic, housing, and demographic information in a total of four profiles.

  - DP02: Social Characteristics — includes Education, Marital Status, Relationships, Fertility, Grandparents... 
  - DP03: Economic Characteristics — includes Income, Employment, Occupation, Commuting to Work... 
  - DP04: Housing Characteristics — includes Occupancy and Structure, Housing Value and Costs, Utilities... 
  - DP05: Demographic Characteristics — includes Sex and Age, Race, Hispanic Origin, Housing Units... 

Variable suffixes:

  - E = estimate
  - M = margin of error
  - PE = percent estimate (of total)
  - PM = margin of error for corresponding PE
  - A = annotation

In [1]:
import geopandas as gpd
import getcensus as gc
import os
import pandas as pd
from shapely import geometry
from keys import census_api_key

In [2]:
# which census dataset
dataset = 'acs/acs5'

# which vintage year
year = 2017

# which census variables to retrieve for each tract
variables = {'DP05_0001E':'total_pop',    #total pop
             'DP05_0018E':'median_age',    #median age
             'DP05_0071PE':'pct_hispanic',   #pct pop hispanic or latino
             'DP05_0077PE':'pct_white',   #pct pop non-hispanic white alone
             'DP05_0078PE':'pct_black',   #pct pop non-hispanic black
             'DP05_0080E':'pct_asian',   #pct pop non-hispanic asian
             'DP05_0002PE':'pct_male',   #pct pop male
             'DP04_0007PE':'pct_single_family_home',   #pct single family detached homes
             'DP04_0089E':'med_home_value',    #median value of owner occupied units (dollars)
             'DP04_0037E':'med_rooms_per_home',    #median number of rooms in house
             'DP04_0026PE':'pct_built_before_1940',   #pct structure built 1939 or earlier
             'DP04_0047PE':'pct_renting',   #pct renter-occupied housing units
             'DP04_0005E':'rental_vacancy_rate',    #rental vacancy rate
             'DP04_0049E':'avg_renter_household_size',    #average household size of renter-occupied housing units
             'DP04_0134E':'med_gross_rent',    #median gross rent (dollars)
             'DP03_0062E':'med_household_income',    #median household income
             'DP03_0025E':'mean_commute_time',    #mean travel time to work
             'DP03_0019PE':'pct_commute_drive_alone',   #pct commute drove alone
             'DP03_0128PE':'pct_below_poverty',   #pct people with income below povery level
             'DP02_0057PE':'pct_college_grad_student',   #pct who are students currently enrolled in college or grad school
             'DP02_0079PE':'pct_same_residence_year_ago',   #pct residence 1 year ago was same house
             'DP02_0067PE':'pct_bachelors_degree',   #pct bachelor's degree or higher
             'DP02_0111PE':'pct_english_only',   #pct with english only language spoken at home
             'DP02_0092PE':'pct_foreign_born'}   #pct of population foreign born

# data directories
tracts_path = 'tl_2018_25_tract'
output_path = 'census_tracts_data.geojson'

In [3]:
# download and display census descriptions of each variable
variable_descriptions = gc.get_census_variable_descriptions(dataset=dataset, 
                                                            year=year, 
                                                            variables=variables)
for v, d in variable_descriptions.items():
    print('{}\t{}'.format(variables[v], d['label']))

total_pop	Estimate!!SEX AND AGE!!Total population
median_age	Estimate!!SEX AND AGE!!Total population!!Median age (years)
pct_hispanic	Percent Estimate!!HISPANIC OR LATINO AND RACE!!Total population!!Hispanic or Latino (of any race)
pct_white	Percent Estimate!!HISPANIC OR LATINO AND RACE!!Total population!!Not Hispanic or Latino!!White alone
pct_black	Percent Estimate!!HISPANIC OR LATINO AND RACE!!Total population!!Not Hispanic or Latino!!Black or African American alone
pct_asian	Estimate!!HISPANIC OR LATINO AND RACE!!Total population!!Not Hispanic or Latino!!Asian alone
pct_male	Percent Estimate!!SEX AND AGE!!Total population!!Male
pct_single_family_home	Percent Estimate!!UNITS IN STRUCTURE!!Total housing units!!1-unit detached
med_home_value	Estimate!!VALUE!!Owner-occupied units!!Median (dollars)
med_rooms_per_home	Estimate!!ROOMS!!Total housing units!!Median rooms
pct_built_before_1940	Percent Estimate!!YEAR STRUCTURE BUILT!!Total housing units!!Built 1939 or earlier
pct_renting	Perc

## Get vars from ACS DP

In [4]:
# load the tracts in our study area
gdf = gpd.read_file(tracts_path).sort_values(by='GEOID')
len(gdf)

1478

In [5]:
%%time
df = gc.get_census_tracts_data(tract_fips=gdf['GEOID'], api_key=census_api_key, dataset=dataset,
                               year=year, variables=variables, clean=True)

Downloading 24 census vars in 25001 for 57 tracts.
Downloading 24 census vars in 25003 for 39 tracts.
Downloading 24 census vars in 25005 for 126 tracts.
Downloading 24 census vars in 25007 for 5 tracts.
Downloading 24 census vars in 25009 for 163 tracts.
Downloading 24 census vars in 25011 for 18 tracts.
Downloading 24 census vars in 25013 for 103 tracts.
Downloading 24 census vars in 25015 for 36 tracts.
Downloading 24 census vars in 25017 for 318 tracts.
Downloading 24 census vars in 25019 for 6 tracts.
Downloading 24 census vars in 25021 for 130 tracts.
Downloading 24 census vars in 25023 for 101 tracts.
Downloading 24 census vars in 25025 for 204 tracts.
Downloading 24 census vars in 25027 for 172 tracts.
Wall time: 25.4 s


In [6]:
# merge the tracts with the acs variables, rename columns, then make sure everything we merged is the same length
merged = pd.merge(left=gdf.set_index('GEOID'), right=df, how='inner', left_index=True, right_index=True)
merged = merged.rename(columns=variables)
assert len(gdf) == len(df) == len(merged)

In [7]:
merged.head()

Unnamed: 0,STATEFP,COUNTYFP,TRACTCE,NAME,NAMELSAD,MTFCC,FUNCSTAT,ALAND,AWATER,INTPTLAT,...,mean_commute_time,pct_commute_drive_alone,pct_below_poverty,pct_college_grad_student,pct_same_residence_year_ago,pct_bachelors_degree,pct_english_only,pct_foreign_born,state,county
25001010100,25,1,10100,101.0,Census Tract 101,G5020,S,25046218,12765873,42.0598291,...,13.9,39.7,10.7,47.4,91.8,48.8,88.5,9.2,25,1
25001010206,25,1,10206,102.06,Census Tract 102.06,G5020,S,51240917,18830100,41.9226356,...,22.6,68.0,11.3,27.5,85.4,52.6,95.3,7.8,25,1
25001010208,25,1,10208,102.08,Census Tract 102.08,G5020,S,54268861,11461462,42.0135566,...,16.8,69.5,11.2,9.5,99.6,45.9,93.6,9.6,25,1
25001010304,25,1,10304,103.04,Census Tract 103.04,G5020,S,18347659,7830612,41.825108,...,23.5,79.5,4.8,30.2,93.4,51.2,93.7,7.0,25,1
25001010306,25,1,10306,103.06,Census Tract 103.06,G5020,S,17828556,1730602,41.8593758,...,17.8,72.8,8.2,10.2,88.1,45.1,96.9,5.0,25,1


## Save to disk

In [8]:
upcast_dispatch = {geometry.Point: geometry.MultiPoint, 
                   geometry.LineString: geometry.MultiLineString, 
                   geometry.Polygon: geometry.MultiPolygon}

def maybe_cast_to_multigeometry(geom):
    caster = upcast_dispatch.get(type(geom), lambda x: x[0])
    return caster([geom])

merged['geometry'] = merged['geometry'].apply(maybe_cast_to_multigeometry)

In [9]:
%%time
merged.reset_index().to_file(output_path, driver='GeoJSON')
print(output_path)

census_tracts_data.geojson
Wall time: 4.88 s
