In [None]:
import pandas as pd
import geopandas as gpd

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the census-tract reference table. dtype=object disables numeric type
# inference, so every column is kept as read from the file.
tract_df = pd.read_csv(
    '/InfoGroup/data/rurality/reference/geographical/points-in-polygons/data/all_tracts.csv',
    dtype=object,
)
# Wrap the table in a GeoDataFrame (no geometry column is specified here).
tract_gdf = gpd.GeoDataFrame(tract_df)

In [None]:
# Keep only the tract identifier and the urban-area attributes, then give the
# attributes the column names used in the merged output.
tract_df = pd.DataFrame(
    tract_gdf[['GEOID', 'UA_GEOID10', 'UATYP10', 'rural_tract']],
    dtype=object,
).rename(
    columns={
        'UA_GEOID10': 'UA Code',
        'UATYP10': 'UA Type',
        'rural_tract': 'rural_outside_UA',
    }
)

In [None]:
# Data year; it selects which step-1 file is read here and is reused when the
# step-2 file is written.
yr = 2017
df = pd.read_csv(
    f'/InfoGroup/data/rurality/step1_{yr}.csv',
    dtype=object,
)

In [None]:
# Attach the tract attributes to each record by census tract.
# NOTE(review): how='inner' drops records whose 'Full Census Tract' has no
# matching GEOID, so the '_merge' indicator can only ever be 'both'.
merged = df.merge(
    tract_df,
    how='inner',
    left_on='Full Census Tract',
    right_on='GEOID',
    indicator=True,
)

-----------

validation

-----------

In [None]:
# Tally of the merge indicator column.
# NOTE(review): `merged` comes from a how='inner' merge, so only the 'both'
# category can have a non-zero count here.
merged['_merge'].value_counts()

In [None]:
# Number of merged records with no 'UA Code' value.
int(merged['UA Code'].isnull().sum())

In [None]:
# Record count for each distinct 'rural_outside_UA' value (nulls are excluded
# by value_counts' default).
merged['rural_outside_UA'].value_counts()

In [None]:
# Same breakdown expressed as a percentage of the non-null records.
merged['rural_outside_UA'].value_counts(normalize=True).mul(100)

--------------

end of validation

--------------

In [None]:
# Remove the merge bookkeeping column and the duplicate tract key before saving.
merged = merged.drop(columns=['_merge', 'GEOID'])

In [None]:
# Write the step-2 file for the selected year without the row index.
# `index` is a boolean flag; pass False explicitly rather than relying on None
# being falsy.
merged.to_csv(f'/InfoGroup/data/rurality/step2_{yr}.csv', index=False)

#### Add Urban Influence, Rural-Urban Continuum, and Rural-Urban Commuting Area codes to the InfoGroup record

In [None]:
First choose the appropriate code year for each file (the example below uses 2013 for UI and RUC and 2010 for RUCA), then match as follows:
    
1. Match 'FIPS' in /ers/ui/ui.csv to 'FIPS Code' (county level) in InfoGroup. 
'UI_YEAR' in ui.csv has the values [1974,1983,1993,2001,2013].

2. Match 'FIPS' in /ers/ruc/ruc.csv to 'FIPS Code' in InfoGroup.
'RUC_YEAR' in ruc.csv has the values [1993,2003,2013].

3. Match 'FIPS' in /ers/ruca/ruca.csv to 'Full Census Tract' in InfoGroup.
'YEAR' in ruca.csv has the values [1990,2000,2010].

For example:

In [None]:
def get_ruc_df(path='/ers/ruc/ruc.csv'):
    """Return pandas.DataFrame of Rural-Urban Continuum codes for all years.

    Parameters
    ----------
    path : str, path object or file-like, default '/ers/ruc/ruc.csv'
        CSV source, passed unchanged to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Every column is read as a string. RUC_YEAR, POPULATION_YEAR,
        POPULATION and PERCENT_NONMETRO_COMMUTERS are then converted to
        numeric, and RUC_CODE becomes an ordered categorical over '0'..'9'
        (values outside that list become NaN).

    Raises
    ------
    ValueError
        If a numeric column contains a value that cannot be parsed.
    """
    df = pd.read_csv(path, dtype='str')
    numeric_cols = ['RUC_YEAR', 'POPULATION_YEAR', 'POPULATION', 'PERCENT_NONMETRO_COMMUTERS']
    for c in numeric_cols:
        df[c] = pd.to_numeric(df[c])
    cats = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
    # Keyword arguments make it explicit that the categorical is ordered.
    df['RUC_CODE'] = pd.Categorical(df['RUC_CODE'], categories=cats, ordered=True)
    return df

def get_ui_df(path='/ers/ui/ui.csv'):
    """Return pandas.DataFrame of Urban Influence codes for all years.

    Parameters
    ----------
    path : str, path object or file-like, default '/ers/ui/ui.csv'
        CSV source, passed unchanged to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Every column is read as a string. UI_YEAR, POPULATION_YEAR,
        POPULATION and POPULATION_DENSITY are then converted to numeric, and
        UI_CODE becomes an ordered categorical over '1'..'12' (values outside
        that list become NaN).

    Raises
    ------
    ValueError
        If a numeric column contains a value that cannot be parsed.
    """
    df = pd.read_csv(path, dtype='str')
    numeric_cols = ['UI_YEAR', 'POPULATION_YEAR', 'POPULATION', 'POPULATION_DENSITY']
    for c in numeric_cols:
        df[c] = pd.to_numeric(df[c])
    cats = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']
    # Ordered by position in `cats`, so '2' sorts before '12' despite being strings.
    df['UI_CODE'] = pd.Categorical(df['UI_CODE'], categories=cats, ordered=True)
    return df

def get_ruca_df(path='/ers/ruca/ruca.csv'):
    """Return pandas.DataFrame of Rural-Urban Commuting Area codes for all years.

    Parameters
    ----------
    path : str, path object or file-like, default '/ers/ruca/ruca.csv'
        CSV source, passed unchanged to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Every column is read as a string. YEAR, POPULATION and AREA are then
        converted to numeric, with unparseable values becoming NaN. RUCA_CODE
        has an all-zero decimal suffix removed ('1.0' -> '1') and becomes an
        ordered categorical over the code list below (values outside that
        list become NaN).
    """
    df = pd.read_csv(path, dtype='str')
    for c in ['YEAR', 'POPULATION', 'AREA']:
        # ValueError: Unable to parse string "6 23.063" at position 269
        # todo: input files probably had this error, add manual fix to `download_and_convert_ruca()`
        df[c] = pd.to_numeric(df[c], errors='coerce')
    cats = ['1', '1.1',
            '2', '2.1', '2.2',
            '3',
            '4', '4.1', '4.2',
            '5', '5.1', '5.2',
            '6', '6.1',
            '7', '7.1', '7.2', '7.3', '7.4',
            '8', '8.1', '8.2', '8.3', '8.4',
            '9', '9.1', '9.2',
            '10', '10.1', '10.2', '10.3', '10.4', '10.5', '10.6',
            '99']
    # Strip only a trailing all-zero decimal part. An unanchored literal
    # replace of '.0' would turn '1.00' into '10', a valid but wrong code.
    df['RUCA_CODE'] = df['RUCA_CODE'].str.replace(r'\.0+$', '', regex=True)
    df['RUCA_CODE'] = pd.Categorical(df['RUCA_CODE'], categories=cats, ordered=True)
    return df

In [None]:
import pandas as pd

# Reload the step-2 output, keeping every column as read (dtype=object).
# NOTE(review): the year is hard-coded here, while the step-2 file is written
# earlier with an f-string on `yr`; keep the two in sync if `yr` changes.
df = pd.read_csv(
    '/InfoGroup/data/rurality/step2_2017.csv',
    dtype=object,
)

In [None]:
# Urban Influence codes: keep the key, code and year columns, drop incomplete
# rows, and restrict to the 2013 codes.
ui_df = get_ui_df()[['UI_YEAR', 'UI_CODE', 'FIPS']].dropna()
ui_df = ui_df.loc[ui_df['UI_YEAR'] == 2013]
# Join on the county-level 'FIPS Code'.
# NOTE(review): how='inner' drops records with no matching FIPS.
merged = df.merge(
    ui_df,
    how='inner',
    left_on='FIPS Code',
    right_on='FIPS',
    indicator=True,
)
# Only UI_CODE is kept from the lookup table.
df = merged.drop(columns=['UI_YEAR', 'FIPS', '_merge'])

In [None]:
# Rural-Urban Continuum codes: keep the key, code and year columns, drop
# incomplete rows, and restrict to the 2013 codes.
ruc_df = get_ruc_df()[['RUC_YEAR', 'RUC_CODE', 'FIPS']].dropna()
ruc_df = ruc_df.loc[ruc_df['RUC_YEAR'] == 2013]
# Join on the county-level 'FIPS Code'.
# NOTE(review): how='inner' drops records with no matching FIPS.
merged = df.merge(
    ruc_df,
    how='inner',
    left_on='FIPS Code',
    right_on='FIPS',
    indicator=True,
)
# Only RUC_CODE is kept from the lookup table.
df = merged.drop(columns=['RUC_YEAR', 'FIPS', '_merge'])

In [None]:
# Rural-Urban Commuting Area codes: keep the key, code and year columns, drop
# incomplete rows, and restrict to the 2010 codes.
ruca_df = get_ruca_df()[['YEAR', 'RUCA_CODE', 'FIPS']].dropna()
ruca_df = ruca_df.loc[ruca_df['YEAR'] == 2010]
# Join on 'Full Census Tract' (the RUCA file's FIPS column is matched against it).
# NOTE(review): how='inner' drops records with no matching tract.
merged = df.merge(
    ruca_df,
    how='inner',
    left_on='Full Census Tract',
    right_on='FIPS',
    indicator=True,
)
# Only RUCA_CODE is kept from the lookup table.
df = merged.drop(columns=['YEAR', 'FIPS', '_merge'])

In [None]:
# Show the column labels of the frame after the three code merges.
df.columns

In [None]:
# Write the enriched records back without the row index.
# `index` is a boolean flag; pass False explicitly rather than relying on None
# being falsy.
# NOTE(review): this overwrites the same step-2 file that was read as input
# above, so re-running the merge cells afterwards starts from the enriched file.
df.to_csv('/InfoGroup/data/rurality/step2_2017.csv', index=False)

In [None]:
# Explore UI_CODE, RUC_CODE, and RUCA_CODE

In [None]:
import pandas as pd
# Reload the saved step-2 file for exploration; dtype=object keeps every
# column as read.
dfx = pd.read_csv(
    '/InfoGroup/data/rurality/step2_2017.csv',
    dtype=object,
)

In [None]:
# Report how many records have a null value in each of the three code columns.
for _message, _column in (
    ('Missing in UI:', 'UI_CODE'),
    ('Missing in RUC:', 'RUC_CODE'),
    ('Missing in RUCA:', 'RUCA_CODE'),
):
    print(_message, str(len(dfx[dfx[_column].isnull()])))

In [None]:
# Percentage of non-null records in each Urban Influence code.
dfx['UI_CODE'].value_counts(normalize=True).mul(100)

In [None]:
# Percentage of non-null records in each Rural-Urban Continuum code.
dfx['RUC_CODE'].value_counts(normalize=True).mul(100)

In [None]:
# Percentage of non-null records in each Rural-Urban Commuting Area code.
dfx['RUCA_CODE'].value_counts(normalize=True).mul(100)