# Geographic gazetteer

> Catalogue of geographic units of the USA.

Census Bureau:
- [FIPS codes](https://www.census.gov/geographies/reference-files/2019/demo/popest/2019-fips.html)
- [Gazetteer reference files](https://www.census.gov/geographies/reference-files/time-series/geo/gazetteer-files.html)
- [Cartographic boundary files](https://www.census.gov/geographies/mapping-files/time-series/geo/cartographic-boundary.html)

In [None]:
#default_exp geography
#export
import pandas as pd
import geopandas as gpd

from rurec.reseng.config import Paths
from rurec import util

# Locations of the downloaded source files and of the per-geography tables cached by this module.
# NOTE(review): presumably `Paths` exposes each keyword as a path-like attribute (the code
# below uses `PATH.source/'name'` and `PATH.state.exists()`) — confirm in rurec.reseng.config.
PATH = Paths(
    source='data/geo/source',
    state='data/geo/state.json',
    county='data/geo/county.json',
    tract='data/geo/tract.json'
)

# Source files

Source data files are downloaded from the web and cached locally.

[Cartographic Boundary Files](https://www.census.gov/geographies/mapping-files/time-series/geo/carto-boundary-file.html)

In [None]:
#export
def get_source(src):
    """Return path to file specified by `src` key, downloading if missing.

    Args:
        src: Source key, one of 'geocodes', 'state-boundary', 'county-boundary'
            or 'tract-boundary'.

    Returns:
        Path of the local copy inside `PATH.source`.

    Raises:
        ValueError: If `src` is not one of the known keys.
    """
    urls = {
        'geocodes': 'https://www2.census.gov/programs-surveys/popest/geographies/2019/all-geocodes-v2019.xlsx',
        'state-boundary': 'https://www2.census.gov/geo/tiger/GENZ2018/shp/cb_2018_us_state_20m.zip',
        'county-boundary': 'https://www2.census.gov/geo/tiger/GENZ2018/shp/cb_2018_us_county_20m.zip',
        'tract-boundary': 'https://www2.census.gov/geo/tiger/GENZ2019/shp/cb_2019_us_tract_500k.zip',
    }
    try:
        url = urls[src]
    except (KeyError, TypeError):
        # TypeError covers unhashable keys, which are just as unknown as a missing one.
        # ValueError is a subclass of Exception, so existing `except Exception` handlers still work.
        raise ValueError(f'Unknown source: {src}') from None

    # The local copy keeps the file name used in the URL.
    local = PATH.source/url.rsplit('/', 1)[-1]
    if not local.exists():
        print(f'File "{local}" not found, attempting download.')
        util.download_file(url, local.parent, local.name)
    return local

# Utility functions

In [None]:
#export
def get_mapping(key, val):
    """Return dict of `key` -> `val` pairs from dataframe detected from `key` prefix.

    Only the 'STATE_' prefix is supported: the table returned by `get_state_df()` is
    used with 'STATE_' prepended to every column name, so both `key` and `val` must be
    given in prefixed form, e.g. `get_mapping('STATE_CODE', 'STATE_NAME')`.

    Args:
        key: Prefixed column name whose values become the dict keys.
        val: Prefixed column name whose values become the dict values.

    Returns:
        Dict built row by row; if a key value repeats, the last row wins.

    Raises:
        ValueError: If `key` does not start with a supported prefix.
        KeyError: If `key` or `val` is not a column of the prefixed table.
    """
    if not key.startswith('STATE_'):
        # ValueError is a subclass of Exception, so existing `except Exception` handlers still work.
        raise ValueError(f'Unknown field: {key}')

    df = pd.DataFrame(get_state_df()).add_prefix('STATE_')
    # Plain (key, val) tuples are all that dict() needs; skip the namedtuple wrapper.
    return dict(df[[key, val]].itertuples(index=False, name=None))

# State

In [None]:
#export
def get_state_df():
    """Return the state table, building and caching it on first use.

    When the file at `PATH.state` exists it is read and returned as is. Otherwise the
    'state-boundary' source archive is read, `STATEFP` and `STUSPS` are renamed to
    `CODE` and `ABBR`, the table is cut down to CODE, ABBR, NAME, ALAND, AWATER and
    geometry, saved to `PATH.state` as GeoJSON and returned.
    """
    cache = PATH.state
    if not cache.exists():
        archive = get_source('state-boundary')
        states = gpd.read_file(f'zip://{archive}')
        states = states.rename(columns={'STATEFP': 'CODE', 'STUSPS': 'ABBR'})
        states = states[['CODE', 'ABBR', 'NAME', 'ALAND', 'AWATER', 'geometry']]
        states.to_file(cache, driver='GeoJSON')
        return states
    return gpd.read_file(cache)

In [None]:
# Preview the first three rows of the state table.
get_state_df().head(3)

In [None]:
import matplotlib.pyplot as plt

# Draw state outlines, leaving out AK, HI and PR, and label every state with its abbreviation.
df = get_state_df()
df = df[~df['ABBR'].isin(['AK', 'HI', 'PR'])]
fig, ax = plt.subplots(figsize=(24, 8))
ax.set_aspect('equal')
df.boundary.plot(ax=ax)
for abbr, shape in df[['ABBR', 'geometry']].itertuples(False):
    # Anchor the label at the centroid of the state's geometry.
    ax.annotate(abbr, shape.centroid.coords[0], horizontalalignment='center')

# County

In [None]:
#export
def get_county_df():
    """Return the county table, building and caching it on first use.

    A file already present at `PATH.county` is read and returned unchanged. Otherwise
    the 'county-boundary' source archive is read, `GEOID` becomes `CODE`, only CODE,
    NAME, ALAND, AWATER and geometry are kept, and the result is written to
    `PATH.county` as GeoJSON before being returned.
    """
    target = PATH.county
    if target.exists():
        return gpd.read_file(target)

    wanted = ['CODE', 'NAME', 'ALAND', 'AWATER', 'geometry']
    raw = gpd.read_file(f"zip://{get_source('county-boundary')}")
    counties = raw.rename(columns={'GEOID': 'CODE'})[wanted]
    counties.to_file(target, driver='GeoJSON')
    return counties

In [None]:
# Show five randomly sampled rows of the county table.
get_county_df().sample(5)

In [None]:
import ipywidgets as widgets
import matplotlib.pyplot as plt

# Interactive county map: pick a state from the dropdown, its counties are drawn and labelled.
df = get_county_df()
# The first two characters of the county code are looked up as a state code to get the state name.
df['STATE'] = df['CODE'].str[:2].map(get_mapping('STATE_CODE', 'STATE_NAME'))
st = widgets.Dropdown(description='State',
                      options=sorted(df['STATE'].unique()),
                      value='Wisconsin')
out = widgets.Output()
def upd(*args):
    """Redraw the county map for the state currently selected in `st`."""
    with out:
        out.clear_output(True)
        fig, ax = plt.subplots(figsize=(12, 12))
        ax.set_aspect('equal')
        d = df[df['STATE'] == st.value]
        d.boundary.plot(ax=ax)
        for name, shape in d[['NAME', 'geometry']].itertuples(False):
            ax.annotate(name, shape.centroid.coords[0], horizontalalignment='center')
        # Close the pyplot figure and show it explicitly inside `out` instead.
        plt.close()
        display(fig)
# Draw once for the initial selection, then again on every change of the dropdown value.
upd()
st.observe(upd, 'value')
widgets.VBox([st, out])

# Census Tract

Code is 11 digits: 2 state, 3 county, 6 tract. The first 5 digits together form the full county code.

In [None]:
#export
def get_tract_df():
    """Return the census tract table, building and caching it on first use.

    If `PATH.tract` exists, that file is read and returned. Otherwise the
    'tract-boundary' source archive is loaded, the `GEOID` column is renamed to `CODE`,
    the columns CODE, NAME, ALAND, AWATER and geometry are selected, and the table is
    stored at `PATH.tract` in GeoJSON format and returned.
    """
    cached = PATH.tract
    if not cached.exists():
        source_zip = get_source('tract-boundary')
        tracts = (
            gpd.read_file(f'zip://{source_zip}')
            .rename(columns={'GEOID': 'CODE'})
            [['CODE', 'NAME', 'ALAND', 'AWATER', 'geometry']]
        )
        tracts.to_file(cached, driver='GeoJSON')
        return tracts
    return gpd.read_file(cached)

In [None]:
import ipywidgets as widgets
import matplotlib.pyplot as plt

# Interactive tract map: pick a state, then a county, and the county's tracts are drawn.
df = get_tract_df()
# The first five characters of the tract code are compared against county codes below.
df['COUNTY_CODE'] = df['CODE'].str[:5]
dfc = get_county_df()
# Options are (state name, state code) pairs, so `st.value` is a state code.
st = widgets.Dropdown(description='State',
                      options=sorted(get_mapping('STATE_NAME', 'STATE_CODE').items()),
                      value='55')
ct = widgets.Dropdown(description='County')
def upd_ct_list(*args):
    """Refill the county dropdown with the counties of the selected state.

    The list starts with an empty entry whose value '00000' means "no county selected".
    """
    ct.options = [('', '00000')] + sorted(dfc.loc[dfc['CODE'].str[:2] == st.value, ['NAME', 'CODE']].values.tolist())
upd_ct_list()
st.observe(upd_ct_list, 'value')
out = widgets.Output()
def upd(*args):
    """Redraw the tract map for the selected county, or clear it when none is selected."""
    if ct.value == '00000':
        out.clear_output()
        return
    with out:
        out.clear_output(True)
        fig, ax = plt.subplots(figsize=(12, 12))
        ax.set_aspect('equal')
        df[df['COUNTY_CODE'] == ct.value].boundary.plot(ax=ax)
        # Close the pyplot figure and show it explicitly inside `out` instead.
        plt.close()
        display(fig)
ct.observe(upd, 'value')
# Assigned after the observer is registered, so this selection goes through `upd`.
ct.value = '55025'

widgets.VBox([widgets.HBox([st, ct]), out])

# CBSA

# Postal Zip Code

# Zip Code Tabulation Area (ZCTA)

# Rurality classifications

1. Non-urban counties (OMB)
2. Inverse of Census urbanity (Census Bureau)
3. Outside urban activity (ERS)
    - Urban Influence Codes
    - Rural-Urban Continuum
    - Rural-Urban Commuting Areas
4. Non-urban census tracts
    - HRSA/FORHP
    - Inverse of spatial overlap with urban areas
5. Zip codes
    - data challenges
    - FAR and Remote

## HRSA/FORHP

HRSA's Federal Office of Rural Health Policy (FORHP) accepts all non-metro counties as rural 
and uses an additional method of determining rural status called the Rural-Urban Commuting 
Area (RUCA) codes. Like the MSAs, these are based on Census data which is used to assign a 
code to each Census Tract. Tracts inside Metropolitan counties with the codes 4-10 are 
considered rural. While use of the RUCA codes has allowed identification of rural census 
tracts in Metropolitan counties, among the more than 60,000 tracts in the U.S. there are 
some that are extremely large and where use of RUCA codes alone fails to account for distance 
to services and sparse population. In response to these concerns, FORHP has designated 
132 large area census tracts with RUCA codes 2 or 3 as rural. These tracts are at least 
400 square miles in area with a population density of no more than 35 people per square mile. The FORHP 
definition includes about 18% of the population and 85% of the area of the USA. RUCA codes 
represent the current version of the Goldsmith Modification.


There are two major definitions which the Federal government uses to identify the rural 
status of an area: the Census Bureau's 'Urban Area' and the OMB's 'Core-Based Statistical 
Area'.

## Urban Area
The first is from the U.S. Census Bureau which identifies two types of 
urban areas, Urbanized Areas (UAs) of 50,000 or more people and Urban Clusters (UCs) of at 
least 2,500 and less than 50,000 people. Since the U.S. Census Bureau does not explicitly 
classify areas as rural, rural is defined as "encompassing all population, housing, and 
territory not included within an urban area (those areas not identified as UC or UA)". 
In the 2010 Census, 19.3% of the population was rural while over 95% of the land area 
is still classified as rural. 

## CBSA
The second is from the Office of Management and Budget (OMB) 
which designates counties as Metropolitan, Micropolitan, or Neither. An OMB Metropolitan 
area contains a core urban area of 50,000 or more population, and a Micropolitan area 
contains an urban core of at least 10,000 (but less than 50,000) population. All counties 
that are not part of a Metropolitan Statistical Area (MSA) are considered rural. 

There are measurement challenges with both the U.S. Census Bureau and OMB definitions. 
Some policy experts note that the U.S. Census Bureau definition classifies quite a bit of 
suburban area as rural. The OMB definition includes rural areas in Metropolitan counties.
Consequently, one could argue that the Census Bureau standard includes an overcount of rural 
population whereas the OMB standard represents an undercount of the rural population.

## Frontier and Remote