In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

import warnings
warnings.filterwarnings("ignore")

## Main functions

In [None]:
def create_df(yr):
    """Load the input table for one year.

    Reads ``/InfoGroup/data/rurality/step2_{yr}.csv`` and returns it as a
    DataFrame. Every column is read with ``dtype=object``, so no value is
    converted to a numeric type while loading.
    """
    csv_path = f'/InfoGroup/data/rurality/step2_{yr}.csv'
    year_frame = pd.read_csv(csv_path, dtype=object)
    return year_frame

def rurality(df):
    """Attach the rurality measures to one year's table.

    Two columns are added to ``df`` in place:

    * ``rural_by_UA``  -- ``rur_UA`` applied to the ``UA Code`` column.
    * ``rural_HRSA``   -- ``rur_hrsa`` applied to every row.

    The frame is then left-merged with the module-level ``df_zip`` table
    (``ZipCode`` against ``ZIP``) and with the module-level ``tracts`` table
    (``Full Census Tract`` against ``GEOID``). Summary counts are printed to
    the module-level ``logfile`` and progress stamps go through ``showtime``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``UA Code``, ``ZipCode`` and ``Full Census Tract``,
        plus whatever ``rur_hrsa`` reads from a row. Modified in place (the
        two new columns).

    Returns
    -------
    pandas.DataFrame
        A new frame holding ``df`` plus the columns of ``df_zip`` (without
        ``ZIP``) and of ``tracts`` (without ``GEOID``). Rows with no match
        keep NaN in the merged-in columns because both merges are left joins.
    """
    df['rural_by_UA'] = df['UA Code'].apply(rur_UA)
    showtime('\trurality (rural_by_UA)')
    print(df['rural_by_UA'].value_counts(),file=logfile)
    print(df['rural_by_UA'].value_counts(normalize=True) * 100,file=logfile)

    df['rural_HRSA'] = df.apply(rur_hrsa,axis=1)
    showtime('\trurality (rural_HRSA)')
    print(df['rural_HRSA'].value_counts(),file=logfile)
    print(df['rural_HRSA'].value_counts(normalize=True) * 100,file=logfile)

    # Merge with FAR data.
    # Original author's note: FAR codes apply only to the continental states.
    # The merge indicator is not requested: it was only ever dropped again, and
    # the local name no longer shadows the module-level ``df_far``.
    with_far = (
        df.merge(df_zip,how='left',left_on='ZipCode',right_on='ZIP')
          .drop(columns='ZIP')
    )
    showtime('\trurality (FAR)')

    # Rurality by census tract centroid
    merged = (
        with_far.merge(tracts,how='left',left_on='Full Census Tract',right_on='GEOID')
                .drop(columns='GEOID')
    )
    showtime('\trurality (census tracts)')
    print('Rural=1,Urban=0:',file=logfile)
    print(merged['rural_tract'].value_counts(),file=logfile)
    print('Missing:',file=logfile)
    print(merged['rural_tract'].isnull().sum(),file=logfile)

    return merged


## Helper functions

In [None]:
def rur_hrsa(row):
    """Return the HRSA rurality flag (1 or 0) for one row.

    The flag is 1 when the row's ``CBSA Level`` converts to the integer 0, or
    when its ``Full Census Tract`` is contained in the module-level
    ``rural_tracts`` collection; otherwise it is 0.
    """
    # `or` short-circuits: the tract lookup only runs when CBSA Level is non-zero.
    flagged = int(row['CBSA Level']) == 0 or row['Full Census Tract'] in rural_tracts
    return 1 if flagged else 0

def rur_UA(code):
    """Look up the UA rurality label for a UA code.

    Returns the value stored for ``code`` in the module-level ``ua_keys``
    dict, or ``np.nan`` when the code is not a key of that dict.
    """
    return ua_keys.get(code, np.nan)

def farlevel(row):
    """Collapse the four FAR indicators of a row into a single level.

    ``row`` must support item access for ``far1`` .. ``far4``. The checks run
    in this order and the first match wins:

    * 0 -- the four indicators sum to 0
    * 1 -- ``far1`` is 1 and ``far2``..``far4`` sum to 0
    * 2 -- ``far2`` is 1 and ``far3``, ``far4`` sum to 0
    * 3 -- ``far3`` is 1 and ``far4`` is 0
    * 4 -- ``far4`` is 1

    Returns ``None`` when no rule matches.
    """
    f1, f2, f3, f4 = (row[name] for name in ('far1', 'far2', 'far3', 'far4'))

    if sum((f1, f2, f3, f4)) == 0:
        return 0
    if f1 == 1 and sum((f2, f3, f4)) == 0:
        return 1
    if f2 == 1 and sum((f3, f4)) == 0:
        return 2
    if f3 == 1 and f4 == 0:
        return 3
    if f4 == 1:
        return 4
    return None
    
def _far_share(count, total):
    """Format ``count / total`` as a percentage string; 'n/a' when ``total`` is 0."""
    if total == 0:
        return 'n/a'
    return str((count / total) * 100) + '%'


def log_far_levels(dfx):
    """Write the distribution of the ``FAR Level`` column to ``logfile``.

    The first line reports the share of all rows whose ``FAR Level`` equals 0.
    The following lines report, among rows whose ``FAR Level`` is greater
    than 0, the share found at each of the levels 1 to 4.

    Parameters
    ----------
    dfx : pandas.DataFrame
        Must contain a ``FAR Level`` column. Rows whose level is NaN count
        toward the first denominator (all rows) but match neither ``== 0``
        nor ``> 0``.

    Notes
    -----
    A share whose denominator is 0 (empty frame, or no row with a level
    above 0) is logged as ``n/a`` instead of raising ``ZeroDivisionError``.
    """
    levels = dfx['FAR Level']

    not_far = int((levels == 0).sum())
    print('Not Far and remote in InfoGroup file:',_far_share(not_far, len(dfx)),file=logfile)

    print('Distribution of FAR types for enterprises with non-zero FAR Level:',file=logfile)
    far_levels = levels[levels > 0]
    far_total = len(far_levels)
    for level in (1, 2, 3, 4):
        in_level = int((far_levels == level).sum())
        print(f'\tFAR Level {level}:',_far_share(in_level, far_total),file=logfile)

def all_nines(li):
    """Tell whether every CBSA code in ``li`` is the missing-value code '99999'.

    Returns True when each item equals the string '99999' (an empty list also
    yields True), and False as soon as any other value is present.
    """
    return all(cbsa == '99999' for cbsa in li)

def showtime(num):
    """Print ``num`` followed by the current local time.

    ``num`` is converted with ``str``; the timestamp is formatted as
    day/month/year hour:minute:second.
    """
    stamp = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
    print(str(num), '  ', stamp)

## Reference datasets and derived data structures

In [None]:
# Census relationship file: Urban Areas (UA) to CBSAs
relationship = '/home/tflory/notebooks/InfoGroup/rurality/Relationship_Files/Urban_Area_to_Metro_Micro_Area_utf-8.csv'
rel = pd.read_csv(relationship, dtype=object)

# Every row of the relationship file as a two-item list: [UA, CBSA].
pairs = rel[['UA','CBSA']].values.tolist()

# Group the CBSA codes by UA. The result maps each UA code to the list of
# CBSA codes it is paired with in the relationship file, in file order.
uas = {}
for ua_code, cbsa_code in pairs:
    uas.setdefault(ua_code, []).append(cbsa_code)

# Label each UA from its CBSA list. all_nines() treats '99999' as the
# missing-value CBSA code. The branch order below is significant: the
# 'unknown' label is only reached by a multi-CBSA UA whose list contains
# no '99999' entry.
ua_keys = {}
for ua_code, cbsa_codes in uas.items():
    if len(cbsa_codes) > 1:
        if all_nines(cbsa_codes):
            label = 'rural-multi'
        elif '99999' in cbsa_codes:
            label = 'partly rural'
        elif ua_code == '99999':
            label = 'unknown'
        else:
            label = 'urban-multi'
    elif cbsa_codes[0] == '99999':
        label = 'rural-single'
    else:
        label = 'urban-single'
    ua_keys[ua_code] = label

# FORHP list of 2300+ rural census tracts
rural_tracts = []
with open('/InfoGroup/data/rurality/tract_data.txt','r') as fin:
    for raw_line in fin:
        # Only lines that begin with a space character are candidates.
        if not raw_line.startswith(' '):
            continue
        entry = raw_line.strip()
        # Keep the stripped line when its first character is numeric;
        # a line that is blank after stripping is skipped.
        if entry[:1].isnumeric():
            rural_tracts.append(entry)

# Census Bureau: all census tracts, 2010 slightly updated in 2017
# Only the GEOID and rural_tract columns are loaded, both kept as strings.
tracts_file = (
    '/home/tflory/notebooks/InfoGroup/rurality/points-in-polygons/data/all_tracts.csv'
)
tracts = pd.read_csv(tracts_file, usecols=['GEOID','rural_tract'], dtype=object)

# ERS: Frontier and Remote (FAR) codes by ZIP code
far_file = '/InfoGroup/data/rurality/FARcodesZIPdata2010WithAKandHI.csv'
df_far = pd.read_csv(far_file,dtype=object)

# Restore leading zeros so every ZIP is five characters wide.
# The previous guard `len(x) < 5 == 0` was a chained comparison
# (`len(x) < 5 and 5 == 0`), which is always False, so no ZIP was ever padded.
# str.zfill leaves values that are already five or more characters unchanged
# and passes missing values through.
df_far['ZIP'] = df_far['ZIP'].str.zfill(5)

# ZIP-level lookup table: the four FAR indicators as integers plus the
# single 'FAR Level' derived from them by farlevel().
far_cols = ['far1','far2','far3','far4']
df_zip = df_far[['ZIP'] + far_cols].copy()
df_zip[far_cols] = df_zip[far_cols].astype(int)
df_zip['FAR Level'] = df_zip.apply(farlevel,axis=1)
# Only fully identical rows are removed here; a ZIP that appears with
# different FAR values would still occur more than once.
df_zip = df_zip.drop_duplicates()


## Main

In [None]:
# Open the step-3 log file for writing ('w' truncates any existing log).
# The module-level `logfile` handle is the target of the print(..., file=logfile)
# calls in rurality() and log_far_levels(); it is closed in the last cell.
logfile = open('/InfoGroup/data/rurality/logs/step3.log','w')

In [None]:
# Process every year from 1997 through 2017: load the year's input table,
# add the rurality measures, and write the result as the step-3 CSV.
for yr in range(1997,2018):
    showtime(f'start {yr}')
    # Mark the start of this year's section in the log; without it the
    # per-year counts written by rurality() cannot be told apart.
    print(f'===== {yr} =====',file=logfile)
    df = create_df(yr)
    showtime('create_df')
    new_df = rurality(df)
    showtime('rurality')
    new_df.to_csv(f'/InfoGroup/data/rurality/InfoGroup_{yr}_step3.csv',index=False)
    # Push this year's statistics to disk so they survive a failure in a later year.
    logfile.flush()
    showtime('finished')

In [None]:
# Close the log file handle opened earlier; this flushes any buffered output.
logfile.close()