# ERS Rurality codes

This notebook prepares [ERS rurality codes](https://www.ers.usda.gov/topics/rural-economy-population/rural-classifications/) for use in FSRDC.

- Download Excel spreadsheets with code data and documentation.
- Normalize column names.
- Stack all years into a single dataframe and save it as CSV file.
- Combine pieces of documentation and column renaming schemes into a text file. Text is tab-separated and can be loaded in tabular format.

Final output files:
```
ruc.csv
ruc_doc.txt
ui.csv
ui_doc.txt
ruca.csv
ruca_doc.txt
```

In [None]:
from urllib.parse import urlparse, unquote
from pathlib import Path

import requests
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

In [None]:
# Set project root directory and make sure it is correct.
# The root is taken to be the parent of the current working directory.
root_dir = Path('..').resolve()

# Sanity check: the root must contain these sub-directories.
# Non-strict subset (<=) is used so that a root holding exactly these three folders also passes.
expected_dirs = {'main', 'tmp', 'data'}
found_dirs = {x.name for x in root_dir.iterdir() if x.is_dir()}
assert expected_dirs <= found_dirs, \
    f'{root_dir} does not look like the project root, missing: {sorted(expected_dirs - found_dirs)}'

# other paths: one output directory per code family
ruc_dir = root_dir / 'data/ers_codes/ruc'
ui_dir = root_dir / 'data/ers_codes/ui'
ruca_dir = root_dir / 'data/ers_codes/ruca'

# Download and combine data

In [None]:
def download_file(url, dir=None, fname=None, timeout=60):
    """Download file from given `url` and put it into `dir`.

    Current working directory is used as default. Missing directories are created.
    File name from `url` is used as default: last component of the URL path,
    percent-decoded, query string ignored.
    `timeout` (seconds) is passed to `requests.get` so that a stalled server
    can not hang the notebook forever; pass None to wait indefinitely.

    Raise ValueError if `fname` is not given and can not be derived from `url`.
    HTTP error statuses are raised by `raise_for_status()`.
    Return absolute pathlib.Path of the downloaded file."""

    if fname is None:
        # decode before taking the name so that "%20" etc. do not end up in the file name
        fname = Path(unquote(urlparse(url).path)).name
        if not fname:
            # without this check the target path would be the directory itself
            raise ValueError(f'Can not derive file name from URL: {url}')

    if dir is None:
        dir = '.'
    dpath = Path(dir).resolve()
    dpath.mkdir(parents=True, exist_ok=True)
    fpath = dpath / fname

    with requests.get(url, timeout=timeout) as r:
        # fail before touching the disk if the server returned an error status
        r.raise_for_status()
        with open(fpath, 'wb') as f:
            f.write(r.content)

    return fpath

## Rural-Urban Continuum Codes

[Homepage](https://www.ers.usda.gov/data-products/rural-urban-continuum-codes/)

The 1993 codes are contained within the 1983-1993 file. The 2003 file repeats the 1993 column.

In [None]:
# Download every RUC vintage, normalize column names, and collect
# data frames (ruc_dfs) and documentation frames (ruc_doc_dfs) for stacking.
ruc_dir.mkdir(parents=True, exist_ok=True)
ruc_dfs = []
ruc_doc_dfs = []


# 1974
url = 'https://www.ers.usda.gov/webdocs/DataFiles/53251/ruralurbancodes1974.xls?v=9631.3'
fname = download_file(url, ruc_dir / 'orig')
# only the first 3141 rows are read as data; rows after that are read below as documentation
# NOTE(review): presumably the sheet has notes below the county table - confirm against the file
df = pd.read_excel(fname, dtype='str', nrows=3141)
cols_map = {'FIPS Code': 'FIPS', 'State': 'STATE', 'County Name': 'COUNTY', '1974 Rural-urban Continuum Code': 'RUC_CODE'}
df = df.rename(columns=cols_map)
df['RUC_YEAR'] = '1974'
ruc_dfs.append(df)

df = pd.concat([
    pd.DataFrame(['RUC 1974 documentation', '-' * 80, f'Data source: {url}']),
    pd.DataFrame([[''], ['Column names'], ['Renamed', 'Original']]),
    pd.DataFrame([[v, k] for k, v in cols_map.items()]),
    pd.DataFrame(['']),
    # keyword arguments: positional axis/how are not accepted by pandas >= 2.0
    pd.read_excel(fname, dtype='str', skiprows=3143, header=None).dropna(axis=1, how='all')])
ruc_doc_dfs.append(df)


# 1983
url = 'https://www.ers.usda.gov/webdocs/DataFiles/53251/cd8393.xls?v=9631.3'
fname = download_file(url, ruc_dir / 'orig')

df = pd.read_excel(fname, dtype='str')
cols_map = {'FIPS': 'FIPS', 'State': 'STATE', 'County Name': 'COUNTY', '1983 Rural-urban Continuum Code': 'RUC_CODE'}
df = df.rename(columns=cols_map)
df = df[list(cols_map.values())]
df['RUC_YEAR'] = '1983'
ruc_dfs.append(df)

df = pd.concat([
    pd.DataFrame(['RUC 1983 documentation', '-' * 80, f'Data source: {url}']),
    pd.DataFrame([[''], ['Column names'], ['Renamed', 'Original']]),
    pd.DataFrame([[v, k] for k, v in cols_map.items()])])
# two empty rows separate documentation sections
ruc_doc_dfs.extend([pd.DataFrame(['', '']), df])


# 1993 (same file and url as 1983, different code column)
df = pd.read_excel(fname, dtype='str')
cols_map = {'FIPS': 'FIPS', 'State': 'STATE', 'County Name': 'COUNTY', '1993 Rural-urban Continuum Code': 'RUC_CODE'}
df = df.rename(columns=cols_map)
df = df[list(cols_map.values())]
df['RUC_YEAR'] = '1993'
ruc_dfs.append(df)

df = pd.concat([
    pd.DataFrame(['RUC 1993 documentation', '-' * 80, f'Data source: {url}']),
    pd.DataFrame([[''], ['Column names'], ['Renamed', 'Original']]),
    pd.DataFrame([[v, k] for k, v in cols_map.items()])])
ruc_doc_dfs.extend([pd.DataFrame(['', '']), df])


# 2003
url = 'https://www.ers.usda.gov/webdocs/DataFiles/53251/ruralurbancodes2003.xls?v=9631.3'
fname = download_file(url, ruc_dir / 'orig')
df = pd.read_excel(fname, dtype='str')
cols_map = {'FIPS Code': 'FIPS', 'State': 'STATE', 'County Name': 'COUNTY',
            '2003 Rural-urban Continuum Code': 'RUC_CODE', '2000 Population ': 'POPULATION',
            'Percent of workers in nonmetro counties commuting to central counties of adjacent metro areas': 'PERCENT_NONMETRO_COMMUTERS',
            'Description for 2003 codes': 'RUC_CODE_DESCRIPTION'}
df = df.rename(columns=cols_map)
df = df[list(cols_map.values())]
df['RUC_YEAR'] = '2003'
df['POPULATION_YEAR'] = '2000'
ruc_dfs.append(df)

df = pd.concat([
    pd.DataFrame(['RUC 2003 documentation', '-' * 80, f'Data source: {url}']),
    pd.DataFrame([[''], ['Column names'], ['Renamed', 'Original']]),
    pd.DataFrame([[v, k] for k, v in cols_map.items()])])
ruc_doc_dfs.extend([pd.DataFrame(['', '']), df])


# Puerto Rico 2003
url = 'https://www.ers.usda.gov/webdocs/DataFiles/53251/pr2003.xls?v=9631.3'
fname = download_file(url, ruc_dir / 'orig')
df = pd.read_excel(fname, dtype='str')

cols_map = {'FIPS Code': 'FIPS', 'State': 'STATE', 'Municipio Name': 'COUNTY', 'Population 2003 ': 'POPULATION',
            'Rural-urban Continuum Code, 2003': 'RUC_CODE', 'Description of the 2003 Code': 'RUC_CODE_DESCRIPTION'}
df = df.rename(columns=cols_map)
df = df[list(cols_map.values())]
df['RUC_YEAR'] = '2003'
df['POPULATION_YEAR'] = '2003'
ruc_dfs.append(df)

df = pd.concat([
    pd.DataFrame(['RUC Puerto Rico 2003 documentation', '-' * 80, f'Data source: {url}']),
    # empty row before "Column names" added for consistency with all other sections
    pd.DataFrame([[''], ['Column names'], ['Renamed', 'Original']]),
    pd.DataFrame([[v, k] for k, v in cols_map.items()])])
ruc_doc_dfs.extend([pd.DataFrame(['', '']), df])


# 2013
url = 'https://www.ers.usda.gov/webdocs/DataFiles/53251/ruralurbancodes2013.xls?v=9631.3'
fname = download_file(url, ruc_dir / 'orig')
df = pd.read_excel(fname, 'Rural-urban Continuum Code 2013', dtype='str')
cols_map = {'FIPS': 'FIPS', 'State': 'STATE', 'County_Name': 'COUNTY', 'Population_2010': 'POPULATION',
            'RUCC_2013': 'RUC_CODE', 'Description': 'RUC_CODE_DESCRIPTION'}
df = df.rename(columns=cols_map)
df = df[list(cols_map.values())]
df['RUC_YEAR'] = '2013'
df['POPULATION_YEAR'] = '2010'
ruc_dfs.append(df)

df = pd.concat([pd.DataFrame(['RUC 2013 documentation', '-' * 80, f'Data source: {url}']),
                pd.DataFrame([[''], ['Column names'], ['Renamed', 'Original']]),
                pd.DataFrame([[v, k] for k, v in cols_map.items()]),
                pd.DataFrame(['']),
                pd.read_excel(fname, 'Documentation', header=None, dtype='str')])
ruc_doc_dfs.extend([pd.DataFrame(['', '']), df])

# Combine and save to disk
df = pd.concat(ruc_dfs)
# fixed column order; columns absent in a given vintage stay empty for its rows
df = df[['FIPS', 'STATE', 'COUNTY', 'RUC_YEAR', 'RUC_CODE', 'RUC_CODE_DESCRIPTION', 
         'POPULATION_YEAR', 'POPULATION', 'PERCENT_NONMETRO_COMMUTERS']]
# everything was read as text, strip stray whitespace from every column
for col in df:
    df[col] = df[col].str.strip()
df = df.sort_values(['FIPS', 'RUC_YEAR'])
df.to_csv(ruc_dir / 'ruc.csv', index=False)

df = pd.concat(ruc_doc_dfs)
for col in df:
    df[col] = df[col].str.strip()
# `sep` passed by keyword: positional use is deprecated in recent pandas
df.to_csv(ruc_dir / 'ruc_doc.txt', sep='\t', header=False, index=False)

## Urban Influence Codes

[Homepage](https://www.ers.usda.gov/data-products/urban-influence-codes/)

In [None]:
# Download every UI vintage, normalize column names, and collect
# data frames (ui_dfs) and documentation frames (ui_doc_dfs) for stacking.
ui_dir.mkdir(parents=True, exist_ok=True)
ui_dfs = []
ui_doc_dfs = []

# 1993 (the same workbook holds both 1993 and 2003 codes)
url = 'https://www.ers.usda.gov/webdocs/DataFiles/53797/UrbanInfluenceCodes.xls?v=1904.3'
fpath = download_file(url, ui_dir / 'orig')

df = pd.read_excel(fpath, 'Urban Influence Codes', dtype='str')
cols_map = {'FIPS Code': 'FIPS', 'State': 'STATE', 'County name': 'COUNTY',
            '2000 Population': 'POPULATION', '2000 Persons per square mile': 'POPULATION_DENSITY',
            '1993 Urban Influence Code': 'UI_CODE', '1993 Urban Influence Code description': 'UI_CODE_DESCRIPTION'}
df = df.rename(columns=cols_map)
df = df[list(cols_map.values())]
df['UI_YEAR'] = '1993'
df['POPULATION_YEAR'] = '2000'
ui_dfs.append(df)

df = pd.concat([pd.DataFrame(['UI 1993 documentation', '-' * 80, f'Data source: {url}']),
                pd.DataFrame([[''], ['Column names'], ['Renamed', 'Original']]),
                pd.DataFrame([[v, k] for k, v in cols_map.items()]),
                pd.DataFrame(['']),
                # NOTE(review): rows after the first 18 of the "Information" sheet are taken as the 1993 notes - confirm
                pd.read_excel(fpath, 'Information', header=None, dtype='str', skiprows=18)])

ui_doc_dfs.append(df)

# 2003
df = pd.read_excel(fpath, 'Urban Influence Codes', dtype='str')
cols_map = {'FIPS Code': 'FIPS', 'State': 'STATE', 'County name': 'COUNTY',
            '2003 Urban Influence Code': 'UI_CODE', '2003 Urban Influence Code description': 'UI_CODE_DESCRIPTION',
            '2000 Population': 'POPULATION', '2000 Persons per square mile': 'POPULATION_DENSITY'}
df = df.rename(columns=cols_map)
df = df[list(cols_map.values())]
df['UI_YEAR'] = '2003'
df['POPULATION_YEAR'] = '2000'
ui_dfs.append(df)

df = pd.concat([pd.DataFrame(['UI 2003 documentation', '-' * 80, f'Data source: {url}']),
                pd.DataFrame([[''], ['Column names'], ['Renamed', 'Original']]),
                pd.DataFrame([[v, k] for k, v in cols_map.items()]),
                pd.DataFrame(['']),
                # NOTE(review): 14 rows after the first 3 of the "Information" sheet are taken as the 2003 notes - confirm
                pd.read_excel(fpath, 'Information', header=None, dtype='str', skiprows=3, nrows=14)])
# two empty rows separate documentation sections
ui_doc_dfs.extend([pd.DataFrame(['', '']), df])

# Puerto Rico 2003
url = 'https://www.ers.usda.gov/webdocs/DataFiles/53797/pr2003UrbInf.xls?v=1904.3'
fpath = download_file(url, ui_dir / 'orig')

df = pd.read_excel(fpath, dtype='str')
cols_map = {'FIPS Code': 'FIPS', 'State': 'STATE', 'Municipio Name': 'COUNTY', 'Population 2003 ': 'POPULATION',
            'Urban Influence  Code, 2003': 'UI_CODE', 'Description of the 2003 Code': 'UI_CODE_DESCRIPTION'}
df = df.rename(columns=cols_map)
df = df[list(cols_map.values())]
df['UI_YEAR'] = '2003'
df['POPULATION_YEAR'] = '2003'
ui_dfs.append(df)

df = pd.concat([pd.DataFrame(['UI Puerto Rico 2003 documentation', '-' * 80, f'Data source: {url}']),
                pd.DataFrame([[''], ['Column names'], ['Renamed', 'Original']]),
                pd.DataFrame([[v, k] for k, v in cols_map.items()])])
ui_doc_dfs.extend([pd.DataFrame(['', '']), df])

# 2013
url = 'https://www.ers.usda.gov/webdocs/DataFiles/53797/UrbanInfluenceCodes2013.xls?v=1904.3'
fpath = download_file(url, ui_dir / 'orig')

df = pd.read_excel(fpath, 'Urban Influence Codes 2013', dtype='str')
cols_map = {'FIPS': 'FIPS', 'State': 'STATE', 'County_Name': 'COUNTY', 'Population_2010': 'POPULATION',
            'UIC_2013': 'UI_CODE', 'Description': 'UI_CODE_DESCRIPTION'}
df = df.rename(columns=cols_map)
df = df[list(cols_map.values())]
df['UI_YEAR'] = '2013'
df['POPULATION_YEAR'] = '2010'
ui_dfs.append(df)

df = pd.concat([pd.DataFrame(['UI 2013 documentation', '-' * 80, f'Data source: {url}']),
                pd.DataFrame([[''], ['Column names'], ['Renamed', 'Original']]),
                pd.DataFrame([[v, k] for k, v in cols_map.items()]),
                pd.DataFrame(['']),
                pd.read_excel(fpath, 'Documentation', header=None, dtype='str')])
ui_doc_dfs.extend([pd.DataFrame(['', '']), df])

# Combine and save to disk
df = pd.concat(ui_dfs)
# fixed column order; columns absent in a given vintage stay empty for its rows
df = df[['FIPS', 'STATE', 'COUNTY', 'UI_YEAR', 'UI_CODE', 'UI_CODE_DESCRIPTION', 
         'POPULATION_YEAR', 'POPULATION', 'POPULATION_DENSITY']]
# everything was read as text, strip stray whitespace from every column
for col in df:
    df[col] = df[col].str.strip()
df = df.sort_values(['FIPS', 'UI_YEAR'])
df.to_csv(ui_dir / 'ui.csv', index=False)

df = pd.concat(ui_doc_dfs)
for col in df:
    df[col] = df[col].str.strip()
# `sep` passed by keyword: positional use is deprecated in recent pandas
df.to_csv(ui_dir / 'ui_doc.txt', sep='\t', header=False, index=False)

## Rural-Urban Commuting Area Codes

[Homepage](https://www.ers.usda.gov/data-products/rural-urban-commuting-area-codes/)

In [None]:
# Download every RUCA vintage, normalize column names, and collect
# data frames (ruca_dfs) and documentation frames (ruca_doc_dfs) for stacking.
ruca_dir.mkdir(parents=True, exist_ok=True)
ruca_dfs = []
ruca_doc_dfs = []

# 1990
url = 'https://www.ers.usda.gov/webdocs/DataFiles/53241/ruca1990.xls?v=9882.5'
fname = download_file(url, ruca_dir / 'orig')

df = pd.read_excel(fname, 'Data', dtype='str')
cols_map = {'FIPS state-county-tract code': 'FIPS',
            'Rural-urban commuting area code': 'RUCA_CODE',
            'Census tract population, 1990': 'POPULATION',
            'Census tract land area, square miles, 1990': 'AREA',
            'County metropolitan status, 1993 (1=metro,0=nonmetro)': 'METRO'}
df = df.rename(columns=cols_map)
df = df[list(cols_map.values())]
# remove literal dots from the tract code; regex=False is explicit because
# the default changed between pandas versions and '.' is a regex wildcard
df['FIPS'] = df['FIPS'].str.replace('.', '', regex=False)
df['YEAR'] = '1990'
ruca_dfs.append(df)

df = pd.concat([
    pd.DataFrame(['RUCA 1990 documentation', '-' * 80, f'Data source: {url}']),
    pd.DataFrame([[''], ['Column names'], ['Renamed', 'Original']]),
    pd.DataFrame([[v, k] for k, v in cols_map.items()]),
    pd.DataFrame(['']),
    pd.read_excel(fname, 'RUCA code description', header=None, dtype='str'),
    pd.DataFrame(['', 'Data sources']),
    pd.read_excel(fname, 'Data sources', header=None, dtype='str')])
ruca_doc_dfs.append(df)


# 2000
url = 'https://www.ers.usda.gov/webdocs/DataFiles/53241/ruca00.xls?v=9882.5'
fname = download_file(url, ruca_dir / 'orig')

df = pd.read_excel(fname, 'Data', dtype='str')

cols_map = {
    'Select State': 'STATE',
    'Select County ': 'COUNTY',
    'State County Tract Code': 'FIPS',
    'RUCA Secondary Code 2000': 'RUCA_CODE',
    'Tract Population 2000': 'POPULATION'
}
df = df.rename(columns=cols_map)
df = df[list(cols_map.values())]
df['YEAR'] = '2000'
ruca_dfs.append(df)

df = pd.concat([
    pd.DataFrame(['RUCA 2000 documentation', '-' * 80, f'Data source: {url}']),
    pd.DataFrame([[''], ['Column names'], ['Renamed', 'Original']]),
    pd.DataFrame([[v, k] for k, v in cols_map.items()]),
    pd.DataFrame(['']),
    pd.read_excel(fname, 'RUCA code description', header=None, dtype='str'),
    pd.DataFrame(['', 'Data sources']),
    pd.read_excel(fname, 'Data sources', header=None, dtype='str')])
# two empty rows separate documentation sections
ruca_doc_dfs.extend([pd.DataFrame(['', '']), df])


# 2010
url = 'https://www.ers.usda.gov/webdocs/DataFiles/53241/ruca2010revised.xlsx?v=9882.5'
fname = download_file(url, ruca_dir / 'orig')

# the first row of the sheet is skipped, the second one is used as header
df = pd.read_excel(fname, 'Data', dtype='str', skiprows=1)
cols_map = {
    'Select State': 'STATE',
    'Select County': 'COUNTY',
    'State-County-Tract FIPS Code (lookup by address at http://www.ffiec.gov/Geocode/)': 'FIPS',
    'Secondary RUCA Code, 2010 (see errata)': 'RUCA_CODE',
    'Tract Population, 2010': 'POPULATION',
    'Land Area (square miles), 2010': 'AREA'
}
df = df.rename(columns=cols_map)
df = df[list(cols_map.values())]
df['YEAR'] = '2010'
ruca_dfs.append(df)

df = pd.concat([
    pd.DataFrame(['RUCA 2010 documentation', '-' * 80, f'Data source: {url}']),
    pd.DataFrame([[''], ['Column names'], ['Renamed', 'Original']]),
    pd.DataFrame([[v, k] for k, v in cols_map.items()]),
    pd.DataFrame(['']),
    pd.read_excel(fname, 'RUCA code description', header=None, dtype='str'),
    pd.DataFrame(['', 'Data sources']),
    pd.read_excel(fname, 'Data sources', header=None, dtype='str')])
ruca_doc_dfs.extend([pd.DataFrame(['', '']), df])


# Combine and save to disk
df = pd.concat(ruca_dfs)
# fixed column order; columns absent in a given vintage stay empty for its rows
df = df[['FIPS', 'STATE', 'COUNTY', 'YEAR', 'RUCA_CODE', 'POPULATION', 'AREA', 'METRO']]
# everything was read as text, strip stray whitespace from every column
for col in df:
    df[col] = df[col].str.strip()
df = df.sort_values(['FIPS', 'YEAR'])
df.to_csv(ruca_dir / 'ruca.csv', index=False)

df = pd.concat(ruca_doc_dfs)
for col in df:
    df[col] = df[col].str.strip()
# `sep` passed by keyword: positional use is deprecated in recent pandas
df.to_csv(ruca_dir / 'ruca_doc.txt', sep='\t', header=False, index=False)

# Descriptive summary

Simple visualization of the data.

## Rural-Urban Continuum Codes

Documentation notes that not all years are directly comparable.

The 1974, 1983 and 1993 classifications include a code "0"; it is merged into code "1" here for simplicity.

In [None]:
# Load the stacked RUC file and build a year-by-code summary table.
df = pd.read_csv(ruc_dir / 'ruc.csv')
# merge code 0 into code 1
df['RUC_CODE'] = df['RUC_CODE'].replace(0, 1)
# ruc.csv is sorted by FIPS and RUC_YEAR, so a back-fill within each FIPS copies
# the next available population into rows of earlier years that have none.
# GroupBy.bfill() replaces the deprecated fillna(method='bfill').
df['POPULATION'] = df.groupby('FIPS')['POPULATION'].bfill()

# number of counties and total population by year and code
stats = df.groupby(['RUC_YEAR', 'RUC_CODE'])['POPULATION'].agg(['size', 'sum'])
stats['sum'] /= 1000  # population in thousands
stats.columns = ['Counties', 'Population']
# years in rows, (measure, code) in columns
stats = stats.astype(int).unstack()

# 2003 and 2013 codes are identical
codes = (df.loc[df['RUC_YEAR'] == 2013, ['RUC_CODE', 'RUC_CODE_DESCRIPTION']]
         .dropna().drop_duplicates()
         .set_index('RUC_CODE').sort_index())

In [None]:
# widen text columns so that long code descriptions are not truncated
pd.set_option('display.max_colwidth', 200)
codes

### Number of counties

In [None]:
# county counts: years in rows, codes in columns
data = stats.loc[:, 'Counties']
data

In [None]:
# Stacked bar chart: one bar per year, one segment per code.
fig, ax = plt.subplots(figsize=(6, 4))
idx = [str(x) for x in data.index]
# running top of the stack for every year
bottom = pd.Series(0, index=data.index)
for code in data:
    # a code absent in some year may be NaN in the unstacked table; count it as zero,
    # otherwise NaN would propagate through `bottom` and hide all later segments
    heights = data[code].fillna(0)
    ax.bar(idx, heights, bottom=bottom, label=code)
    bottom = bottom + heights

# Legend in three columns; invisible dummy patches serve as column headings.
# NOTE(review): slicing assumes nine codes, first three metro - confirm
dummy = mpl.patches.Patch(fill=False, edgecolor='none')
handles, labels = ax.get_legend_handles_labels()
handles = [dummy] + handles[:3] + [dummy] + handles[3:6] + [dummy] + handles[6:]
labels = ['metro'] + labels[:3] + ['nonmetro'] + labels[3:6] + [''] + labels[6:]
# legend is pushed lower than before to leave room for the x-axis label
ax.legend(handles, labels, ncol=3, loc='upper center', bbox_to_anchor=(0.5, -0.18))

ax.set_title('Number of counties by Rural-Urban Continuum code')
ax.set_xlabel('RUC year')
ax.set_ylabel('Counties')

plt.show()

### Population

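The dataset has no population figures for code years before 2003, so each county's next available figure (the 2000 population that accompanies the 2003 codes) is back-filled into the earlier years.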

In [None]:
# population in thousands: years in rows, codes in columns
data = stats.loc[:, 'Population']
data

In [None]:
# Stacked bar chart: one bar per year, one segment per code.
fig, ax = plt.subplots(figsize=(6, 4))
idx = [str(x) for x in data.index]
# running top of the stack for every year
bottom = pd.Series(0, index=data.index)
for code in data:
    # a code absent in some year may be NaN in the unstacked table; count it as zero,
    # otherwise NaN would propagate through `bottom` and hide all later segments
    heights = data[code].fillna(0)
    ax.bar(idx, heights, bottom=bottom, label=code)
    bottom = bottom + heights

# Legend in three columns; invisible dummy patches serve as column headings.
# NOTE(review): slicing assumes nine codes, first three metro - confirm
dummy = mpl.patches.Patch(fill=False, edgecolor='none')
handles, labels = ax.get_legend_handles_labels()
handles = [dummy] + handles[:3] + [dummy] + handles[3:6] + [dummy] + handles[6:]
labels = ['metro'] + labels[:3] + ['nonmetro'] + labels[3:6] + [''] + labels[6:]
# legend is pushed lower than before to leave room for the x-axis label
ax.legend(handles, labels, ncol=3, loc='upper center', bbox_to_anchor=(0.5, -0.18))

ax.set_title('Population in thousands by Rural-Urban Continuum code')
ax.set_xlabel('RUC year')
ax.set_ylabel('Population, thousands')

plt.show()