In [None]:
import pandas as pd
import numpy as np
import re
from datetime import datetime

import warnings
warnings.filterwarnings("ignore")

## Main functions

In [None]:
def create_df(yr):
    """Load the step 1 output for one year into a DataFrame.

    Every column is read with ``dtype=object``, so pandas does not convert
    any values to numeric types.

    Parameters
    ----------
    yr
        Year interpolated into the file name ``step1_{yr}.csv``.

    Returns
    -------
    pandas.DataFrame
        The contents of the step 1 csv file for that year.
    """
    step1_path = f'/InfoGroup/data/rurality/step1_{yr}.csv'
    return pd.read_csv(step1_path, dtype=object)
    
def new_vars(df):
    """Add the 'UA Code', 'UA Type' and 'Full Census Tract' columns to df.

    The frame is modified in place and also returned. Counts and percentage
    shares of each 'UA Type' value are written to the module-level
    ``logfile``; progress timestamps are printed through ``showtime``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'CBSA Code', 'Census Tract' and 'FIPS Code'.

    Returns
    -------
    pandas.DataFrame
        The same frame, with missing 'Census Tract' / 'FIPS Code' values
        replaced by the sentinels '999999' / '99999' and the three new
        columns added.
    """
    print('rural/urban by OMB standard:', file=logfile)
    df['UA Code'] = df['CBSA Code'].apply(find_ua)
    showtime('\tnew_vars UA Code')

    df['UA Type'] = df['UA Code'].apply(get_ua_type)
    showtime('\tnew_vars UA Type')
    print(df['UA Type'].value_counts(), file=logfile)
    print(df['UA Type'].value_counts(normalize=True) * 100, file=logfile)

    # Assign the filled columns back instead of calling fillna(inplace=True)
    # on a column selection: the chained in-place form is deprecated and does
    # not update the parent frame when pandas copy-on-write is active.
    df['Census Tract'] = df['Census Tract'].fillna('999999')
    df['FIPS Code'] = df['FIPS Code'].fillna('99999')
    df['Full Census Tract'] = df.apply(full_tract, axis=1)
    showtime('\tnew_vars Full Census Tract')
    return df


## Helper functions

In [None]:
def find_ua(code):
    """Return the UA code paired with a CBSA code in the census relationship file.

    The module-level ``pairs`` list is scanned in order; each entry is
    indexed as ``[UA, CBSA]``. The UA of the first entry whose CBSA equals
    ``code`` is returned, or None when no entry matches.
    """
    return next((entry[0] for entry in pairs if entry[1] == code), None)

def ua_type(place):
    """Derive the UA Type from a relationship-file UANAME value.

    The type is the last two whitespace-separated words of the name
    (for example 'Urbanized Area' or 'Urban Cluster').

    Parameters
    ----------
    place
        The UANAME value. Normally a string; any non-string value (such as
        the NaN pandas produces for an empty cell) is treated as missing.

    Returns
    -------
    str or float
        The last two words joined by a single space (fewer if the name has
        fewer than two words), or ``np.nan`` when the value is missing or
        contains 'Not in a 2010 urban area'.
    """
    # Guard against non-string input so a missing name yields NaN instead of
    # raising AttributeError.
    if not isinstance(place, str):
        return np.nan
    if 'Not in a 2010 urban area' in place:
        return np.nan
    # Raw string: "\S" in a normal string literal is an invalid escape sequence.
    words = re.findall(r"\S+", place)
    return ' '.join(words[-2:])

def get_ua_type(code):
    """Look up the UA Type for a UA Code.

    Reads the module-level ``ua_type_dict``; a code that is not a key in
    that dict yields ``np.nan``.
    """
    return ua_type_dict.get(code, np.nan)

def full_tract(row):
    """Build the full census tract identifier for one row.

    Parameters
    ----------
    row
        Mapping-like row (for example a pandas Series) with the keys
        'FIPS Code' and 'Census Tract'.

    Returns
    -------
    str or float
        The FIPS code followed by the census tract, as one string, or
        ``np.nan`` when either value is missing or equals its
        missing-value sentinel ('99999' for FIPS, '999999' for tract).
    """
    fips_raw = row['FIPS Code']
    tract_raw = row['Census Tract']
    if pd.isna(fips_raw) or pd.isna(tract_raw):
        return np.nan

    # Compare as strings: the sentinels are filled in as strings, so a
    # comparison against the integers 99999 / 999999 would never match.
    # Converting first also keeps integer sentinels working.
    fips = str(fips_raw)
    tract = str(tract_raw)
    if fips == '99999' or tract == '999999':
        return np.nan
    return fips + tract
        
def showtime(num):
    """Print a label followed by the current local date and time.

    Parameters
    ----------
    num
        Label to print; it is converted with ``str()``. The timestamp is
        formatted as day/month/year hours:minutes:seconds.
    """
    stamp = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
    print(str(num), '  ', stamp)

## Reference datasets and derived data structures

In [None]:
# Census relationship file: CBSAs to counties.
# Missing values become the string '-9', and STCOU is exposed as an
# integer 'FIPS Code' column.
cbsa_df = (
    pd.read_csv(
        '/home/tflory/notebooks/InfoGroup/rurality/Relationship_Files/cbsa-county-relationships-2017.csv',
        usecols=['STCOU', 'CBSA', 'LSAD'],
    )
    .fillna('-9')
    .rename(columns={'STCOU': 'FIPS Code'})
)
cbsa_df['FIPS Code'] = cbsa_df['FIPS Code'].astype(int)

# Census relationship file: Urban Areas to CBSAs, every column read as object.
relationship = '/home/tflory/notebooks/InfoGroup/rurality/Relationship_Files/Urban_Area_to_Metro_Micro_Area_utf-8.csv'
rel = pd.read_csv(relationship, dtype=object)
# UA Type is derived from each row's UANAME by ua_type().
rel['UA Type'] = rel['UANAME'].apply(ua_type)

# List of [UA, CBSA] pairs, one per row of the relationship file.
pairs = rel[['UA', 'CBSA']].values.tolist()
# Dict keyed by UA with the UA Type as value; when a UA appears on several
# rows, the last row's value is kept.
ua_type_dict = {
    ua: kind for ua, kind in rel[['UA', 'UA Type']].values.tolist()
}

## Main

In [None]:
# Open the step 2 log file for writing; mode 'w' truncates any existing log.
logfile = open('/InfoGroup/data/rurality/logs/step2.log', mode='w')

In [None]:
# Process each year from 1997 through 2017: load the step 1 csv, add the
# new variables, and write the result to the step 2 csv for that year.
for yr in range(1997, 2018):
    showtime(f'\nstart {yr}:')
    df = create_df(yr)
    showtime('create_df')
    newvars_df = new_vars(df)
    showtime('new_vars')
    # index=False is the documented way to omit the row index from the csv.
    newvars_df.to_csv(f'/InfoGroup/data/rurality/step2_{yr}.csv', index=False)
    # Flush after every year so the log on disk keeps up with progress and
    # completed years are not lost if a later year fails.
    logfile.flush()
    showtime('finished')

In [None]:
# Close the log file; this also writes any output still buffered to disk.
logfile.close()