# Geographic gazetteer

> Catalogue of geographic units of the USA.

In [None]:
#default_exp geography
#export
import pandas as pd
import geopandas as gpd

from rurec.reseng.config import Paths
from rurec import util

# Locations of downloaded source files and of the processed tables cached by the get_*_df() functions.
# NOTE(review): the paths look project-relative - confirm how `Paths` resolves them.
PATH = Paths(
    source='data/geo/source',  # downloads made by get_source()
    state='data/geo/state.pq',  # parquet cache of get_state_df()
    county='data/geo/county.json',  # GeoJSON cache of get_county_df()
    tract='data/geo/tract.pq',  # root of the per-year, per-state tract cache of get_tract_df()
    cbsa='data/geo/cbsa.json'  # GeoJSON cache of get_cbsa_df()
)

In [None]:
#export
# In geopandas 0.8, parquet support is still experimental:
# https://geopandas.org/docs/user_guide/io.html#apache-parquet-and-feather-file-formats
import warnings
# Silence only the warning about the experimental parquet implementation, matched by its message text.
warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')

# Source files

Source data files are downloaded from the web and cached locally.

Census Bureau:
- [Cartographic Boundary Files](https://www.census.gov/geographies/mapping-files/time-series/geo/carto-boundary-file.html)
- [Cartographic boundary files 2019+](https://www.census.gov/geographies/mapping-files/time-series/geo/cartographic-boundary.html) - national tracts file
- [LSAD codes](https://www.census.gov/library/reference/code-lists/legal-status-codes.html) - Legal/Statistical Area Description Codes and Definitions
- [FIPS codes](https://www.census.gov/geographies/reference-files/2019/demo/popest/2019-fips.html)
- [Gazetteer reference files](https://www.census.gov/geographies/reference-files/time-series/geo/gazetteer-files.html)
- [Relationship files](https://www.census.gov/geographies/reference-files/time-series/geo/relationship-files.2020.html) - These text files describe geographic relationships. There are two types of relationship files; those that show the relationship between the same type of geography over time (comparability) and those that show the relationship between two types of geography for the same time period.

In [None]:
#export
def get_source(src):
    """Resolve the source key `src` to a local file path, fetching the file on first use.

    Recognized keys are 'geocodes', 'state-boundary', 'county-boundary', 'cbsa-boundary'
    and 'tract-boundary-YYYY-SS', where YYYY is a decennial census year (1990, 2000, 2010
    or 2020) and SS is a state FIPS code.

    Raises `Exception` for an unknown key and for a tract year without a revision.
    """
    # single-file sources: key -> (download URL, file name under PATH.source)
    single_files = {
        'geocodes': (
            'https://www2.census.gov/programs-surveys/popest/geographies/2019/all-geocodes-v2019.xlsx',
            'all-geocodes-v2019.xlsx'),
        'state-boundary': (
            'https://www2.census.gov/geo/tiger/GENZ2018/shp/cb_2018_us_state_20m.zip',
            'cb_2018_us_state_20m.zip'),
        'county-boundary': (
            'https://www2.census.gov/geo/tiger/GENZ2018/shp/cb_2018_us_county_20m.zip',
            'cb_2018_us_county_20m.zip'),
        'cbsa-boundary': (
            'https://www2.census.gov/geo/tiger/GENZ2018/shp/cb_2018_us_cbsa_20m.zip',
            'cb_2018_us_cbsa_20m.zip'),
    }

    if src in single_files:
        url, file_name = single_files[src]
        local = PATH.source/file_name
    elif src.startswith('tract-boundary-'):
        # key layout: tract-boundary-YYYY-SS
        census_year = int(src[15:19])
        fips = src[-2:]
        tract_urls = {
            1990: f'https://www2.census.gov/geo/tiger/PREVGENZ/tr/tr90shp/tr{fips}_d90_shp.zip',
            2000: f'https://www2.census.gov/geo/tiger/PREVGENZ/tr/tr00shp/tr{fips}_d00_shp.zip',
            2010: f'https://www2.census.gov/geo/tiger/GENZ2010/gz_2010_{fips}_140_00_500k.zip',
            2020: f'https://www2.census.gov/geo/tiger/GENZ2020/shp/cb_2020_{fips}_tract_500k.zip',
        }
        if census_year not in tract_urls:
            raise Exception(f'No tract revisions in {census_year}.')
        url = tract_urls[census_year]
        local = PATH.source/f'tract/{census_year}/{fips}.zip'
    else:
        raise Exception(f'Unknown source: {src}')

    if local.exists():
        return local
    print(f'File "{local}" not found, attempting download.')
    util.download_file(url, local.parent, local.name)
    return local

# State

In [None]:
#export
def get_state_df(geometry=True):
    """Return the table of states with columns CODE, ABBR, NAME, ALAND, AWATER and geometry.

    The table is read from the parquet cache at `PATH.state` when that file exists;
    otherwise it is built from the 'state-boundary' source file and written to the cache.
    With `geometry=False` a plain pandas dataframe without the geometry column is returned.
    """
    attributes = ['CODE', 'ABBR', 'NAME', 'ALAND', 'AWATER']
    cache = PATH.state
    if cache.exists():
        return gpd.read_parquet(cache) if geometry else pd.read_parquet(cache, 'pyarrow', attributes)

    shapefile = get_source('state-boundary')
    states = gpd.read_file(f'zip://{shapefile}').rename(columns={'STATEFP': 'CODE', 'STUSPS': 'ABBR'})
    states = states[attributes + ['geometry']]
    # the cache always keeps the geometry, it is dropped only from the returned copy
    states.to_parquet(cache)
    if geometry:
        return states
    return pd.DataFrame(states).drop(columns='geometry')

In [None]:
def _show_state_map():
    """Draw state boundaries, leaving out AK, HI and PR, with each state labelled by its abbreviation."""
    import matplotlib.pyplot as plt

    states = get_state_df()
    states = states[~states['ABBR'].isin(['AK', 'HI', 'PR'])]
    fig, ax = plt.subplots(figsize=(24, 8))
    ax.set_aspect('equal')
    states.boundary.plot(ax=ax)
    for abbr, shape in zip(states['ABBR'], states['geometry']):
        # the label goes to the centroid of the state's shape
        ax.annotate(abbr, shape.centroid.coords[0], horizontalalignment='center')

_show_state_map()

# County

In [None]:
#export
def get_county_df():
    """Return the geodataframe of counties with columns CODE, NAME, ALAND, AWATER and geometry.

    The table is read from the GeoJSON cache at `PATH.county` when that file exists;
    otherwise it is built from the 'county-boundary' source file and written to the cache.
    """
    if not PATH.county.exists():
        shapefile = get_source('county-boundary')
        counties = gpd.read_file(f'zip://{shapefile}').rename(columns={'GEOID': 'CODE'})
        counties = counties[['CODE', 'NAME', 'ALAND', 'AWATER', 'geometry']]
        counties.to_file(PATH.county, driver='GeoJSON')
        return counties
    return gpd.read_file(PATH.county)

In [None]:
# Spot-check: show five random rows of the county table.
get_county_df().sample(5)

In [None]:
def _show_county_map():
    """Interactive map of the county boundaries of a state picked from a dropdown.

    Meant to be run in a notebook: relies on ipywidgets and on the IPython `display` function.
    """
    import ipywidgets as widgets
    import matplotlib.pyplot as plt

    df = get_county_df()
    # The first two characters of the county code are matched against state codes.
    # The code -> name lookup is built from the state table, because no `get_mapping`
    # helper is defined or imported in this notebook.
    states = get_state_df(geometry=False)
    df['STATE'] = df['CODE'].str[:2].map(dict(zip(states['CODE'], states['NAME'])))
    st = widgets.Dropdown(description='State',
                          # counties without a matching state (NaN) cannot be sorted with names
                          options=sorted(df['STATE'].dropna().unique()),
                          value='Wisconsin')
    out = widgets.Output()
    def upd(*args):
        # redraw the map of the currently selected state inside the output widget
        with out:
            out.clear_output(True)
            fig, ax = plt.subplots(figsize=(12, 12))
            ax.set_aspect('equal')
            d = df[df['STATE'] == st.value]
            d.boundary.plot(ax=ax)
            for name, shape in d[['NAME', 'geometry']].itertuples(False):
                ax.annotate(name, shape.centroid.coords[0], horizontalalignment='center')
            # close the figure so that it is shown only once, by the explicit display() call
            plt.close()
            display(fig)
    upd()
    st.observe(upd, 'value')
    display(widgets.VBox([st, out]))
    
_show_county_map()

# Census Tract

Code is 11 digits: 2 state, 3 county, 4+2 tract.

The first four digits of the tract code are "permanent." 
When tracts get large (8,000+ residents), they are split and a 2-digit suffix is used (the same applies to splits of splits):

|1990|2000|2010| 
|----|----|----|
|1000|1000.01|1000.03|
|1000|1000.01|1000.04|
|1000|1000.02|1000.05| 
|1000|1000.02|1000.06|
        
The naming conventions for merges (population falls below 1,200) and boundary revisions are less clear-cut.

When changes (splits, merges, redefinitions) occur, the relationship of new tracts to old tracts is [crosswalked](https://www.census.gov/geographies/reference-files/time-series/geo/relationship-files.2010.html).

There is a master file, as well as two files that provide the identifiers of tracts that were "substantially changed" between decennial censuses. These two files list only the census tracts that exhibited a change of 2.5 percent or greater. Tract relationships may be one-to-one, many-to-one, one-to-many, or many-to-many.

[Reference](https://www2.census.gov/geo/pdfs/education/CensusTracts.pdf)

In [None]:
# Scratch: load the tract boundary file of one state and year and build uniform code and name columns.
y = 1990
state = '08'
p = get_source(f'tract-boundary-{y}-{state}')
df = gpd.read_file(f'zip://{p}')
if y == 1990:
    # the 1990 file is handled through its ST, CO, TRACTBASE and TRACTSUF columns
    df = df.rename(columns={'ST': 'STATE_CODE', 'CO': 'COUNTY_CODE'})
    # tract code = base + suffix, with '00' standing in for a missing suffix
    df['TRACT_CODE'] = df['TRACTBASE'] + df['TRACTSUF'].fillna('00')
    df['CODE'] = df['STATE_CODE'] + df['COUNTY_CODE'] + df['TRACT_CODE']
    # name is "base", or "base.suffix" when the suffix is present
    df['NAME'] = df['TRACTBASE']
    df.loc[df['TRACTSUF'].notna(), 'NAME'] = df['NAME'] + '.' + df['TRACTSUF']

In [None]:
#export
def get_tract_df(year, state, geometry=True):
    """Return census tracts of one state, using the decennial revision in effect in `year`.

    `year` is an integer from 1990 to 2029 and is rounded down to the decade;
    `state` is the state code used in the source key and in the cache path.
    The result has columns CODE, NAME, ALAND, AWATER and, unless `geometry=False`, geometry.

    The table is read from the parquet cache under `PATH.tract` when present; otherwise it is
    built from the 'tract-boundary-YYYY-SS' source file and written to the cache.

    Raises `AssertionError` for a year outside of the supported range and `KeyError` when the
    source file does not have the expected columns.
    NOTE(review): only files with GEOID, NAME, ALAND and AWATER columns are handled; the 1990
    file is explored elsewhere in this notebook with a different layout (ST, CO, TRACTBASE,
    TRACTSUF), so older revisions need their own column mapping.
    """
    assert 1990 <= year < 2030, f'No tract data for {year}.'
    # tract boundaries are revised once per decade: round the year down to the revision year
    y = 1990 + (year - 1990) // 10 * 10
    attributes = ['CODE', 'NAME', 'ALAND', 'AWATER']
    path = PATH.tract/f'YEAR={y}/STATE={state}'
    if path.exists():
        if geometry:
            return gpd.read_parquet(path)
        else:
            return pd.read_parquet(path, 'pyarrow', attributes)

    # the source is keyed by the revision year, not by the requested year
    p = get_source(f'tract-boundary-{y}-{state}')
    df = gpd.read_file(f'zip://{p}')
    df = df.rename(columns={'GEOID': 'CODE'})
    missing = [c for c in attributes + ['geometry'] if c not in df.columns]
    if missing:
        raise KeyError(f'Tract file for {y} has no columns {missing}; this file layout is not supported.')
    df = df[attributes + ['geometry']]
    # write the cache in the same place and format that the branch above reads
    path.parent.mkdir(parents=True, exist_ok=True)
    df.to_parquet(path)
    if not geometry:
        df = pd.DataFrame(df).drop(columns='geometry')
    return df

In [None]:
def _show_tract_map(year=2020):
    """Interactive map of the census tracts of a county picked from State and County dropdowns.

    `year` selects the tract revision and is passed to `get_tract_df`; the default is the latest
    revision that `get_source` knows about. Tracts of a state are loaded when one of its counties
    is first shown. Meant to be run in a notebook: relies on ipywidgets and on the IPython
    `display` function.
    """
    import ipywidgets as widgets
    import matplotlib.pyplot as plt

    # State names and codes come from the state table, because no `get_mapping` helper
    # is defined or imported in this notebook.
    states = get_state_df(geometry=False)
    dfc = get_county_df()
    st = widgets.Dropdown(description='State',
                          options=sorted(zip(states['NAME'], states['CODE'])),
                          value='55')
    ct = widgets.Dropdown(description='County')

    tracts = {}
    def get_tracts(state_code):
        # get_tract_df() requires a year and a state, so tracts are loaded per state and kept for reuse
        if state_code not in tracts:
            df = get_tract_df(year, state_code)
            # first five characters of the tract code, compared with county codes below
            df['COUNTY_CODE'] = df['CODE'].str[:5]
            tracts[state_code] = df
        return tracts[state_code]

    def upd_ct_list(*args):
        # counties of the selected state, preceded by an empty choice that clears the map
        ct.options = [('', '00000')] + sorted(dfc.loc[dfc['CODE'].str[:2] == st.value, ['NAME', 'CODE']].values.tolist())
    upd_ct_list()
    st.observe(upd_ct_list, 'value')
    out = widgets.Output()
    def upd(*args):
        if ct.value == '00000':
            out.clear_output()
            return
        with out:
            out.clear_output(True)
            fig, ax = plt.subplots(figsize=(12, 12))
            ax.set_aspect('equal')
            # the state code is the first two characters of the county code
            df = get_tracts(ct.value[:2])
            df[df['COUNTY_CODE'] == ct.value].boundary.plot(ax=ax)
            # close the figure so that it is shown only once, by the explicit display() call
            plt.close()
            display(fig)
    ct.observe(upd, 'value')
    # selecting the initial county triggers the first drawing
    ct.value = '55025'

    display(widgets.VBox([widgets.HBox([st, ct]), out]))

_show_tract_map()

## Changes between 1990 and 2000

In [None]:
# Tables scraped from the record layout page of the 2000 tract relationship files.
# `meta` is a list of dataframes; the cells below take field lengths and descriptions from meta[0] and meta[3].
meta = pd.read_html('https://www.census.gov/programs-surveys/geography/technical-documentation/records-layout/2000-tract-relationship-record-layout.html')

In [None]:
# Substantially changed in 2000
df = pd.read_fwf('https://www2.census.gov/geo/relfiles/tract/al/al01sg2k.txt',
                 widths=meta[3]['Field Length'].tolist(),
                 dtype='str', header=None)
df.columns = meta[3]['Field Description']
subch = df
subch.head(1)

In [None]:
# cross-product of 1990 and 2000 tracts by population
df = pd.read_fwf('https://www2.census.gov/geo/relfiles/tract/al/al01pop.txt',
                 widths=meta[0]['Field Length'].tolist(),
                 dtype='str', header=None)
df.columns = meta[0]['Field Description']
df['tract_2000'] = df['2000 state FIPS code'] + df['2000 county FIPS code'] + df['2000 census tract base'] + df['2000 census tract suffix']
df['tract_1990'] = df['1990 state FIPS code'] + df['1990 county FIPS code'] + df['1990 census tract base'] + df['1990 census tract suffix']
df = df.merge(subch, 'left', ('2000 state FIPS code', '2000 county FIPS code', '2000 census tract base', '2000 census tract suffix'), indicator='sub_ch_2000')
df['sub_ch_2000'].cat.rename_categories({'left_only': False, 'both': True}, inplace=True)
df.head(1)

### Map the difference

1:500,000 tract files for Alabama from "Cartographic boundary files".

In [None]:
# URLs of the Alabama (FIPS 01) tract boundary files, keyed by file year.
url = {}
url[1990] = 'https://www2.census.gov/geo/tiger/PREVGENZ/tr/tr90shp/tr01_d90_shp.zip'
url[2000] = 'https://www2.census.gov/geo/tiger/PREVGENZ/tr/tr00shp/tr01_d00_shp.zip'
url[2010] = 'https://www2.census.gov/geo/tiger/GENZ2010/gz_2010_01_140_00_500k.zip'
url[2013] = 'https://www2.census.gov/geo/tiger/GENZ2013/cb_2013_01_tract_500k.zip'
# years 2014-2020 share one URL pattern
for y in range(2014, 2021):
    url[y] = f'https://www2.census.gov/geo/tiger/GENZ{y}/shp/cb_{y}_01_tract_500k.zip'

# Read every file straight from its URL; nothing is cached locally, so each run downloads all of them.
tr = {}
for y, u in url.items():
    tr[y] = gpd.read_file(u)

In [None]:
# `plt` is imported here because no earlier cell brings it into the notebook namespace
# (the map helpers above import matplotlib only inside their own bodies).
import matplotlib.pyplot as plt

# Overlay the 1990 (red) and 2000 (blue) tract boundaries of a single county.
county = '001'
t0 = tr[1990].query('CO == @county')
t1 = tr[2000].query('COUNTY == @county')

fig, ax = plt.subplots(figsize=(12, 12))
ax.set_aspect('equal')
# dashes and gaps of equal length, with the second layer offset by one dash,
# so that the two colors alternate where the boundaries coincide
d = 3
t0.boundary.plot(ax=ax, color='red', linestyle=(0, (d, d)))
t1.boundary.plot(ax=ax, color='blue', linestyle=(d, (d, d)))

### Census geographic data precision
Different years of the same revision have slightly different polygons. Maybe these files are approximately aggregated from more detailed sources (TIGER?)

In [None]:
# TIGER/Line tract files for Alabama (FIPS 01), read straight from their URLs and keyed by year.
tig = {
    2000: gpd.read_file('https://www2.census.gov/geo/tiger/TIGER2010/TRACT/2000/tl_2010_01_tract00.zip'),
    2010: gpd.read_file('https://www2.census.gov/geo/tiger/TIGER2010/TRACT/2010/tl_2010_01_tract10.zip'),
    2019: gpd.read_file('https://www2.census.gov/geo/tiger/TIGER2019/TRACT/tl_2019_01_tract.zip'),
    2020: gpd.read_file('https://www2.census.gov/geo/tiger/TIGER2020/TRACT/tl_2020_01_tract.zip')
}

In [None]:
# Geometry of the tract with code 01005950300 in the 2010 and in the 2020 TIGER file.
# The identifier column is GEOID10 in the 2010 file and GEOID in the 2020 file.
x0 = tig[2010].set_index('GEOID10').loc[['01005950300'], 'geometry']
x1 = tig[2020].set_index('GEOID').loc[['01005950300'], 'geometry']

In [None]:
# `plt` is imported here so that the cell also runs on a fresh kernel
# (the map helpers above import matplotlib only inside their own bodies).
import matplotlib.pyplot as plt

# Overlay the 2010 (red) and 2020 (blue) outlines of the same tract; the dash offset makes
# the two colors alternate where the lines coincide.
fig, ax = plt.subplots(figsize=(12, 12))
ax.set_aspect('equal')
x0.boundary.plot(ax=ax, color='red', linestyle=(0, (10, 10)))
x1.boundary.plot(ax=ax, color='blue', linestyle=(10, (10, 10)))

In [None]:
# First rows of the 2019 TIGER tract table.
tig[2019].head()

In [None]:
# First rows of the 2019 cartographic boundary tract table, ordered by tract code.
tr[2019].sort_values('GEOID').head()

In [None]:
# `tig` is a dict of tables keyed by year, so a year has to be picked before sorting;
# 2019 is used for comparison with the 2019 cartographic boundary table.
tig[2019].sort_values('GEOID').head()

In [None]:
# Number of vertices in the outline of the first tract (by code) in each of three files:
# cartographic boundary 2013, cartographic boundary 2019 and TIGER 2019.
# The three frames are built here, so that the cell does not depend on a later cell having been run.
t1 = tr[2013].sort_values('GEOID').head(4)
t2 = tr[2019].sort_values('GEOID').head(4)
ti = tig[2019].sort_values('GEOID').head(4)
(len(t1.geometry.iloc[0].exterior.coords[:]),
len(t2.geometry.iloc[0].exterior.coords[:]),
len(ti.geometry.iloc[0].exterior.coords[:]))

In [None]:
import matplotlib.pyplot as plt
# First four tracts (by code) from three files: cartographic boundary 2013 and 2019, TIGER 2019.
t1 = tr[2013].sort_values('GEOID').head(4)
t2 = tr[2019].sort_values('GEOID').head(4)
ti = tig[2019].sort_values('GEOID').head(4)
fig, ax = plt.subplots(figsize=(12, 12))
ax.set_aspect('equal')
# Dash patterns are offset by one dash per layer, so all three colors stay visible where the lines coincide.
t1.boundary.plot(ax=ax, color='red', linestyle=(0, (10, 20)))
t2.boundary.plot(ax=ax, color='blue', linestyle=(10, (10, 20)))
ti.boundary.plot(ax=ax, color='green', linestyle=(20, (10, 20)))

# Core Based Statistical Area (CBSA)

From CB [glossary](https://www.census.gov/programs-surveys/geography/about/glossary.html#par_textimage_7):
> Core Based Statistical Areas (CBSAs) consist of the county or counties or equivalent entities associated with at least one core (urbanized area or urban cluster) of at least 10,000 population, plus adjacent counties having a high degree of social and economic integration with the core as measured through commuting ties with the counties associated with the core. The general concept of a CBSA is that of a core area containing a substantial population nucleus, together with adjacent communities having a high degree of economic and social integration with that core. The term "core based statistical area" became effective in 2003 and refers collectively to metropolitan statistical areas and micropolitan statistical areas. The U.S. Office of Management and Budget (OMB) defines CBSAs to provide a nationally consistent set of geographic entities for the United States and Puerto Rico for use in tabulating and presenting statistical data.

An OMB Metropolitan area contains a core urban area of 50,000 or more population, and a Micropolitan area contains an urban core of at least 10,000 (but less than 50,000) population.

[CB page with data and definitions](https://www.census.gov/programs-surveys/metro-micro.html)

In [None]:
#export
def get_cbsa_df():
    """Return the geodataframe of CBSAs with columns CODE, NAME, TYPE, ALAND, AWATER and geometry.

    TYPE is 'Metro' or 'Micro', translated from the LSAD codes M1 and M2.
    The table is read from the GeoJSON cache at `PATH.cbsa` when that file exists;
    otherwise it is built from the 'cbsa-boundary' source file and written to the cache.
    """
    cache = PATH.cbsa
    if cache.exists():
        return gpd.read_file(cache)

    shapefile = get_source('cbsa-boundary')
    lsad_labels = {'M1': 'Metro', 'M2': 'Micro'}
    cbsa = gpd.read_file(f'zip://{shapefile}').rename(columns={'GEOID': 'CODE', 'LSAD': 'TYPE'})
    cbsa['TYPE'] = cbsa['TYPE'].map(lsad_labels)
    cbsa = cbsa[['CODE', 'NAME', 'TYPE', 'ALAND', 'AWATER', 'geometry']]
    cbsa.to_file(PATH.cbsa, driver='GeoJSON')
    return cbsa

In [None]:
# Spot-check: show three random rows of the CBSA table.
get_cbsa_df().sample(3)

In [None]:
def _show_cbsa_map():
    """Plot CBSAs colored by TYPE on top of gray state boundaries, leaving out AK, HI and PR."""
    import matplotlib.pyplot as plt

    excluded = ['AK', 'HI', 'PR']

    def mentions_excluded(name):
        # True when the CBSA name contains any of the excluded state abbreviations
        for abbr in excluded:
            if abbr in name:
                return True
        return False

    fig, ax = plt.subplots(figsize=(12, 12))
    ax.set_aspect('equal')

    # bottom layer: state outlines
    states = get_state_df()
    states[~states['ABBR'].isin(excluded)].boundary.plot(ax=ax, color='gray', zorder=1)

    # top layer: CBSA shapes
    cbsa = get_cbsa_df()
    cbsa[~cbsa['NAME'].apply(mentions_excluded)].plot(ax=ax, column='TYPE', legend=True, zorder=2, alpha=0.9)

_show_cbsa_map()

# Postal Zip Code

# Zip Code Tabulation Area (ZCTA)

# Area phone code