In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

import warnings
warnings.filterwarnings("ignore")

## Functions

In [None]:
def rurality(df):
    """Flag rural establishments and attach the ZIP-level FAR level.

    Steps:
      1. Build the set of rural census tracts with ``compile_rural_tracts``.
      2. Add a ``rural_HRSA`` column: 1 when the record's 'Full Census Tract'
         is in that set, otherwise 0. Counts and percentages of the flag are
         written to ``logfile``.
      3. Left-join the module-level ``df_zip`` table on ZipCode == ZIP and
         drop the join key and the raw far1..far4 indicator columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'CBSA Level', 'Full Census Tract' and 'ZipCode'.
        The ``rural_HRSA`` column is added to this frame in place.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns of ``df`` (including ``rural_HRSA``) plus
        the remaining ``df_zip`` columns ('FAR Level' as ``df_zip`` is built in
        this notebook). Records whose ZIP has no match get missing values.
    """
    # Hand the frame down explicitly: the helpers used to read the
    # module-level `df`, which only worked when the caller's variable
    # happened to have that exact name.
    all_rural_tracts = compile_rural_tracts(df)
    showtime('\trurality (compile_rural_tracts)')
    # Vectorised membership test instead of a Python-level lambda per row.
    df['rural_HRSA'] = df['Full Census Tract'].isin(all_rural_tracts).astype(int)
    showtime('\trurality (rural_HRSA)')
    print(df['rural_HRSA'].value_counts(),file=logfile)
    print(df['rural_HRSA'].value_counts(normalize=True) * 100,file=logfile)

    # Merge with FAR data.
    # NOTE(review): an earlier comment said FAR codes apply only to the
    # continental states, while the FAR file name mentions AK and HI - confirm.
    # The `_merge` indicator column was never inspected, so it is not requested.
    merged = df.merge(df_zip,how='left',left_on='ZipCode',right_on='ZIP')
    showtime('\trurality (FAR)')
    return merged.drop(columns=['ZIP','far1','far2','far3','far4'])

def rural_in_CBSA(frame=None):
    """Return 'Full Census Tract' for every record whose 'CBSA Level' is not '1' or '2'.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame with 'CBSA Level' and 'Full Census Tract' columns. When omitted,
        the module-level ``df`` is used, which keeps old zero-argument calls
        working.

    Returns
    -------
    list
        One tract value per qualifying record, in row order; duplicates are
        kept. A missing 'CBSA Level' counts as "not '1' or '2'".
    """
    if frame is None:
        frame = df  # legacy behaviour: fall back to the notebook-global frame
    outside_cbsa = ~frame['CBSA Level'].isin(['1','2'])
    return frame.loc[outside_cbsa,'Full Census Tract'].tolist()

def compile_rural_tracts(frame=None):
    """Build the set of census tracts treated as rural.

    The set is the union of the tracts returned by ``rural_in_CBSA`` for
    ``frame`` and the module-level ``hrsa_rural_tracts`` list. The size of each
    input and of the de-duplicated union is written to ``logfile``.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Forwarded to ``rural_in_CBSA``; when omitted that function falls back
        to the module-level ``df``.

    Returns
    -------
    set
        De-duplicated tract identifiers.
    """
    rurtracts = rural_in_CBSA(frame)
    print('not in CBSA:',str(len(rurtracts)),file=logfile)
    print('hrsa_rural_tracts:',str(len(hrsa_rural_tracts)),file=logfile)
    all_tracts = set(rurtracts)
    all_tracts.update(hrsa_rural_tracts)
    print('all tracts deduped:',str(len(all_tracts)),file=logfile)
    return all_tracts

def farlevel(row):
    """Collapse the four FAR indicator flags of one record into a single level.

    ``row`` is any mapping-like object exposing 'far1'..'far4' (for example a
    DataFrame row). The result is the string '0'..'4':

      * '0' - the four flags sum to zero
      * '1' - far1 is 1 and far2..far4 sum to zero
      * '2' - far2 is 1 and far3, far4 sum to zero
      * '3' - far3 is 1 and far4 is 0
      * '4' - far4 is 1

    Any other combination yields ``np.nan``.
    """
    f1, f2, f3, f4 = (row[key] for key in ('far1', 'far2', 'far3', 'far4'))

    # Guard clauses, checked from the lowest level upwards.
    if sum((f1, f2, f3, f4)) == 0:
        return '0'
    if f1 == 1 and sum((f2, f3, f4)) == 0:
        return '1'
    if f2 == 1 and sum((f3, f4)) == 0:
        return '2'
    if f3 == 1 and f4 == 0:
        return '3'
    if f4 == 1:
        return '4'
    return np.nan
    
def showtime(num):
    """Print a progress label followed by the current local time.

    ``num`` is converted with ``str`` and may be any object (callers in this
    notebook pass text labels). The timestamp is formatted as
    day/month/year hour:minute:second and the line goes to standard output.
    """
    stamp = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
    # Joining on a single space reproduces print()'s default separator.
    print(' '.join((str(num), '  ', stamp)))

## Reference datasets and derived data structures

In [None]:
# FORHP list of 2300+ rural census tracts, read from a pre-processed text
# version of a former PDF file.
hrsa_rural_tracts = []
with open('/InfoGroup/data/rurality/tract_data.txt','r') as fin:
    for line in fin:
        # Only lines that begin with a space are candidates; all others are skipped.
        if not line.startswith(' '):
            continue
        line = line.strip()
        # Keep entries whose first character is numeric. A line holding only
        # whitespace is empty after stripping and is ignored.
        if line and line[0].isnumeric():
            hrsa_rural_tracts.append(line)

# ERS Frontier and Remote (FAR) area codes, keyed by ZIP code.
far_file = '/InfoGroup/data/rurality/FARcodesZIPdata2010WithAKandHI.csv'
df_far = pd.read_csv(far_file,dtype=object)
# Restore leading zeros so ZIPs are always five characters wide.
# The earlier condition `len(x) < 5 == 0` was a chained comparison
# (`len(x) < 5 and 5 == 0`), which is always False, so nothing was ever padded
# and short ZIPs could not match in the later merge on ZipCode.
# `.str.zfill` also leaves missing values untouched instead of raising.
df_far['ZIP'] = df_far['ZIP'].str.zfill(5)
# Keep only the join key and the four FAR indicator flags, as integers.
df_zip = df_far[['ZIP','far1','far2','far3','far4']].copy()
df_zip[['far1','far2','far3','far4']] = df_zip[['far1','far2','far3','far4']].astype(int)
# 'FAR Level' depends only on far1..far4, so de-duplicating first gives the
# same table while running the row-wise farlevel() on fewer rows.
df_zip = df_zip.drop_duplicates()
df_zip['FAR Level'] = df_zip.apply(farlevel,axis=1)
# NOTE(review): a ZIP that appears with two different flag combinations would
# still occur twice here and duplicate records in the left merge - confirm
# that ZIP is unique in the source file.

## Main

In [None]:
# Open the step-3 log for writing (an existing file is truncated).
# rurality() and compile_rural_tracts() print their diagnostics to this handle,
# so this cell must run before the main loop.
logfile = open(file='/InfoGroup/data/rurality/logs/step3.log', mode='w')

In [None]:
# Process one year per iteration. The stop value of range() is exclusive, so
# this currently covers 2017 only.
for yr in range(2017,2018):
    step2_csv = f'/InfoGroup/data/rurality/step2_{yr}.csv'
    step3_csv = f'/InfoGroup/data/rurality/InfoGroup_{yr}_step3.csv'

    showtime('start')
    # Every column is read as text (dtype=object). The frame must stay bound
    # to the module-level name `df`: rural_in_CBSA() can read that global.
    df = pd.read_csv(step2_csv, dtype=object)
    showtime('create dataframe')

    df = rurality(df)
    showtime('rurality function')

    df.to_csv(step3_csv, index=None)
    showtime('finished')

In [None]:
# Close the step-3 log handle opened before the main loop so the diagnostics
# printed to it are flushed to disk.
logfile.close()