# Geographic gazetteer

> Catalogue of geographic units of the USA.

In [None]:
#default_exp geography
#export
import pandas as pd
import geopandas as gpd

from rurec.reseng.config import Paths
from rurec import util

# Locations used by this module: one directory for downloaded source files
# and one GeoJSON cache file per geographic unit.
PATH = Paths(
    source='data/geo/source',  # downloaded source files are stored here (see get_source)
    state='data/geo/state.json',  # cache written and read by get_state_df
    county='data/geo/county.json',  # cache written and read by get_county_df
    tract='data/geo/tract.json',  # cache written and read by get_tract_df
    cbsa='data/geo/cbsa.json'  # cache written and read by get_cbsa_df
)

# Source files

Source data files are downloaded from the web and cached locally.

Census Bureau:
- [Cartographic Boundary Files](https://www.census.gov/geographies/mapping-files/time-series/geo/carto-boundary-file.html)
- [Cartographic boundary files 2019+](https://www.census.gov/geographies/mapping-files/time-series/geo/cartographic-boundary.html) - national tracts file
- [LSAD codes](https://www.census.gov/library/reference/code-lists/legal-status-codes.html) - Legal/Statistical Area Description Codes and Definitions
- [FIPS codes](https://www.census.gov/geographies/reference-files/2019/demo/popest/2019-fips.html)
- [Gazetteer reference files](https://www.census.gov/geographies/reference-files/time-series/geo/gazetteer-files.html)

In [None]:
#export
def get_source(src):
    """Return the local path of the source file identified by `src`.

    Recognised keys are 'geocodes', 'state-boundary', 'county-boundary',
    'tract-boundary' and 'cbsa-boundary'. The file lives under `PATH.source`
    and keeps the name it has on the server. When no local copy exists, it is
    fetched with `util.download_file` first. Any other key raises `Exception`.
    """
    urls = {
        'geocodes': 'https://www2.census.gov/programs-surveys/popest/geographies/2019/all-geocodes-v2019.xlsx',
        'state-boundary': 'https://www2.census.gov/geo/tiger/GENZ2018/shp/cb_2018_us_state_20m.zip',
        'county-boundary': 'https://www2.census.gov/geo/tiger/GENZ2018/shp/cb_2018_us_county_20m.zip',
        'tract-boundary': 'https://www2.census.gov/geo/tiger/GENZ2019/shp/cb_2019_us_tract_500k.zip',
        'cbsa-boundary': 'https://www2.census.gov/geo/tiger/GENZ2018/shp/cb_2018_us_cbsa_20m.zip',
    }
    if src not in urls:
        raise Exception(f'Unknown source: {src}')

    url = urls[src]
    # The cached copy is named after the last path segment of the URL.
    local = PATH.source / url.rsplit('/', 1)[-1]

    if not local.exists():
        print(f'File "{local}" not found, attempting download.')
        util.download_file(url, local.parent, local.name)
    return local

# Utility functions

In [None]:
#export
def get_mapping(key, val):
    """Return a dict mapping values of column `key` to values of column `val`.

    The table to read is chosen from the prefix of `key`. Only the 'STATE_'
    prefix is handled: the state table is used with 'STATE_' prepended to each
    of its column names. Any other prefix raises `Exception`.
    """
    if not key.startswith('STATE_'):
        raise Exception(f'Unknown field: {key}')
    table = pd.DataFrame(get_state_df()).add_prefix('STATE_')
    pairs = table[[key, val]].itertuples(index=False, name=None)
    return {k: v for k, v in pairs}

# State

In [None]:
#export
def get_state_df():
    """Return the state table, building and caching it on first use.

    If `PATH.state` exists it is read and returned. Otherwise the
    'state-boundary' source archive is read, `STATEFP` and `STUSPS` are renamed
    to `CODE` and `ABBR`, the columns CODE, ABBR, NAME, ALAND, AWATER and
    geometry are kept, and the result is written to `PATH.state` as GeoJSON.
    """
    cache = PATH.state
    if not cache.exists():
        archive = get_source('state-boundary')
        states = gpd.read_file(f'zip://{archive}')
        states = states.rename(columns={'STATEFP': 'CODE', 'STUSPS': 'ABBR'})
        keep = ['CODE', 'ABBR', 'NAME', 'ALAND', 'AWATER', 'geometry']
        states = states[keep]
        states.to_file(cache, driver='GeoJSON')
        return states
    return gpd.read_file(cache)

In [None]:
# Preview the first three rows of the state table.
get_state_df().head(3)

In [None]:
def _show_state_map():
    """Plot state outlines, excluding AK, HI and PR, labelled by abbreviation."""
    import matplotlib.pyplot as plt

    excluded = ['AK', 'HI', 'PR']
    states = get_state_df()
    states = states[~states['ABBR'].isin(excluded)]

    _, axes = plt.subplots(figsize=(24, 8))
    axes.set_aspect('equal')
    states.boundary.plot(ax=axes)
    # Place each abbreviation at the centroid of its state's geometry.
    for abbr, geom in zip(states['ABBR'], states['geometry']):
        axes.annotate(abbr, geom.centroid.coords[0], horizontalalignment='center')

# Draw the state map.
_show_state_map()

# County

In [None]:
#export
def get_county_df():
    """Return the county table, building and caching it on first use.

    If `PATH.county` exists it is read and returned. Otherwise the
    'county-boundary' source archive is read, `GEOID` is renamed to `CODE`,
    the columns CODE, NAME, ALAND, AWATER and geometry are kept, and the
    result is written to `PATH.county` as GeoJSON.
    """
    cache = PATH.county
    if not cache.exists():
        archive = get_source('county-boundary')
        counties = gpd.read_file(f'zip://{archive}')
        counties = counties.rename(columns={'GEOID': 'CODE'})
        keep = ['CODE', 'NAME', 'ALAND', 'AWATER', 'geometry']
        counties = counties[keep]
        counties.to_file(cache, driver='GeoJSON')
        return counties
    return gpd.read_file(cache)

In [None]:
# Preview five randomly sampled rows of the county table.
get_county_df().sample(5)

In [None]:
def _show_county_map():
    """Show a state dropdown and plot the named county outlines of the chosen state.

    NOTE: `display` is not imported here; it is expected to be available in
    the notebook environment.
    """
    import ipywidgets as widgets
    import matplotlib.pyplot as plt

    counties = get_county_df()
    # The first two characters of the county code are looked up as a state code.
    state_names = get_mapping('STATE_CODE', 'STATE_NAME')
    counties['STATE'] = counties['CODE'].str[:2].map(state_names)

    state_dd = widgets.Dropdown(
        description='State',
        options=sorted(counties['STATE'].unique()),
        value='Wisconsin',
    )
    canvas = widgets.Output()

    def redraw(*args):
        with canvas:
            canvas.clear_output(True)
            fig, axes = plt.subplots(figsize=(12, 12))
            axes.set_aspect('equal')
            selected = counties[counties['STATE'] == state_dd.value]
            selected.boundary.plot(ax=axes)
            # Label every county at the centroid of its geometry.
            for name, geom in zip(selected['NAME'], selected['geometry']):
                axes.annotate(name, geom.centroid.coords[0], horizontalalignment='center')
            plt.close()
            display(fig)

    # Draw once for the initial selection, then redraw on every change.
    redraw()
    state_dd.observe(redraw, 'value')
    display(widgets.VBox([state_dd, canvas]))
    
# Show the interactive county map.
_show_county_map()

# Census Tract

Code is 11 digits: 2 state, 3 county, 6 tract. The first 5 digits together form the county code.

In [None]:
#export
def get_tract_df():
    """Return the census tract table, building and caching it on first use.

    If `PATH.tract` exists it is read and returned. Otherwise the
    'tract-boundary' source archive is read, `GEOID` is renamed to `CODE`,
    the columns CODE, NAME, ALAND, AWATER and geometry are kept, and the
    result is written to `PATH.tract` as GeoJSON.
    """
    cache = PATH.tract
    if not cache.exists():
        archive = get_source('tract-boundary')
        tracts = gpd.read_file(f'zip://{archive}')
        tracts = tracts.rename(columns={'GEOID': 'CODE'})
        keep = ['CODE', 'NAME', 'ALAND', 'AWATER', 'geometry']
        tracts = tracts[keep]
        tracts.to_file(cache, driver='GeoJSON')
        return tracts
    return gpd.read_file(cache)

In [None]:
def _show_tract_map():
    """Show state and county dropdowns and plot tract outlines of the chosen county.

    The county list is refilled whenever the state selection changes. Its
    first entry is an empty label with value '00000', which clears the plot.
    NOTE: `display` is not imported here; it is expected to be available in
    the notebook environment.
    """
    import ipywidgets as widgets
    import matplotlib.pyplot as plt

    tracts = get_tract_df()
    # The first five characters of the tract code are the county code.
    tracts['COUNTY_CODE'] = tracts['CODE'].str[:5]
    counties = get_county_df()

    state_options = sorted(get_mapping('STATE_NAME', 'STATE_CODE').items())
    state_dd = widgets.Dropdown(description='State', options=state_options, value='55')
    county_dd = widgets.Dropdown(description='County')

    def refill_counties(*args):
        in_state = counties['CODE'].str[:2] == state_dd.value
        name_code_pairs = counties.loc[in_state, ['NAME', 'CODE']].values.tolist()
        county_dd.options = [('', '00000')] + sorted(name_code_pairs)

    refill_counties()
    state_dd.observe(refill_counties, 'value')
    canvas = widgets.Output()

    def redraw(*args):
        if county_dd.value == '00000':
            # Placeholder entry selected: nothing to draw.
            canvas.clear_output()
        else:
            with canvas:
                canvas.clear_output(True)
                fig, axes = plt.subplots(figsize=(12, 12))
                axes.set_aspect('equal')
                in_county = tracts['COUNTY_CODE'] == county_dd.value
                tracts[in_county].boundary.plot(ax=axes)
                plt.close()
                display(fig)

    county_dd.observe(redraw, 'value')
    # Setting the value after the observer is attached triggers the first draw.
    county_dd.value = '55025'

    controls = widgets.HBox([state_dd, county_dd])
    display(widgets.VBox([controls, canvas]))

# Show the interactive tract map.
_show_tract_map()

# Core Based Statistical Area (CBSA)

From CB [glossary](https://www.census.gov/programs-surveys/geography/about/glossary.html#par_textimage_7):
> Core Based Statistical Areas (CBSAs) consist of the county or counties or equivalent entities associated with at least one core (urbanized area or urban cluster) of at least 10,000 population, plus adjacent counties having a high degree of social and economic integration with the core as measured through commuting ties with the counties associated with the core. The general concept of a CBSA is that of a core area containing a substantial population nucleus, together with adjacent communities having a high degree of economic and social integration with that core. The term "core based statistical area" became effective in 2003 and refers collectively to metropolitan statistical areas and micropolitan statistical areas. The U.S. Office of Management and Budget (OMB) defines CBSAs to provide a nationally consistent set of geographic entities for the United States and Puerto Rico for use in tabulating and presenting statistical data.

An OMB Metropolitan area contains a core urban area of 50,000 or more population, and a Micropolitan area contains an urban core of at least 10,000 (but less than 50,000) population.

[CB page with data and definitions](https://www.census.gov/programs-surveys/metro-micro.html)

In [None]:
#export
def get_cbsa_df():
    """Return the CBSA table, building and caching it on first use.

    If `PATH.cbsa` exists it is read and returned. Otherwise the
    'cbsa-boundary' source archive is read, `GEOID` and `LSAD` are renamed to
    `CODE` and `TYPE`, the TYPE codes 'M1' and 'M2' are mapped to 'Metro' and
    'Micro', the columns CODE, NAME, TYPE, ALAND, AWATER and geometry are
    kept, and the result is written to `PATH.cbsa` as GeoJSON.
    """
    cache = PATH.cbsa
    if not cache.exists():
        type_labels = {'M1': 'Metro', 'M2': 'Micro'}
        archive = get_source('cbsa-boundary')
        areas = gpd.read_file(f'zip://{archive}')
        areas = areas.rename(columns={'GEOID': 'CODE', 'LSAD': 'TYPE'})
        areas['TYPE'] = areas['TYPE'].map(type_labels)
        keep = ['CODE', 'NAME', 'TYPE', 'ALAND', 'AWATER', 'geometry']
        areas = areas[keep]
        areas.to_file(cache, driver='GeoJSON')
        return areas
    return gpd.read_file(cache)

In [None]:
# Preview three randomly sampled rows of the CBSA table.
get_cbsa_df().sample(3)

In [None]:
def _show_cbsa_map():
    """Plot CBSAs coloured by TYPE over gray state outlines, leaving out AK, HI and PR."""
    import matplotlib.pyplot as plt

    excluded = ['AK', 'HI', 'PR']

    def names_excluded_state(name):
        # True when any excluded abbreviation occurs as a substring of the name.
        return any(abbr in name for abbr in excluded)

    _, axes = plt.subplots(figsize=(12, 12))
    axes.set_aspect('equal')

    states = get_state_df()
    keep_states = ~states['ABBR'].isin(excluded)
    states[keep_states].boundary.plot(ax=axes, color='gray', zorder=1)

    areas = get_cbsa_df()
    keep_areas = ~areas['NAME'].apply(names_excluded_state)
    areas[keep_areas].plot(ax=axes, column='TYPE', legend=True, zorder=2, alpha=0.9)
    
# Draw the CBSA map.
_show_cbsa_map()

# Postal Zip Code

# Zip Code Tabulation Area (ZCTA)

# Area phone code