In [None]:
import pandas as pd
import numpy as np
from zipfile import ZipFile
import os
from datetime import datetime

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Lookup table from the two-letter code found in the 'State' column to the
# 2-digit state FIPS code (kept as a zero-padded string so it can be
# concatenated with the county code).
state_fips = {
    'AL': '01', 'AK': '02', 'AS': '60', 'AZ': '04', 'AR': '05',
    'CA': '06', 'CO': '08', 'CT': '09', 'DE': '10', 'DC': '11',
    'FL': '12', 'FM': '64', 'GA': '13', 'GU': '66', 'HI': '15',
    'ID': '16', 'IL': '17', 'IN': '18', 'IA': '19', 'KS': '20',
    'KY': '21', 'LA': '22', 'ME': '23', 'MH': '68', 'MD': '24',
    'MA': '25', 'MI': '26', 'MN': '27', 'MS': '28', 'MO': '29',
    'MT': '30', 'NE': '31', 'NV': '32', 'NH': '33', 'NJ': '34',
    'NM': '35', 'NY': '36', 'NC': '37', 'ND': '38', 'MP': '69',
    'OH': '39', 'OK': '40', 'OR': '41', 'PW': '70', 'PA': '42',
    'PR': '72', 'RI': '44', 'SC': '45', 'SD': '46', 'TN': '47',
    'TX': '48', 'UM': '74', 'UT': '49', 'VT': '50', 'VA': '51',
    'VI': '78', 'WA': '53', 'WV': '54', 'WI': '55', 'WY': '56',
}

## Functions

In [None]:
def extract_and_correct(yr, src_dir='/InfoGroup/data/original/', tmp_dir='/tmp/xtrcts/'):
    """Load one annual InfoGroup file and add/correct its geographic identifiers.

    The archive ``{yr}_Business_Academic_QCQ_utf-8.zip`` in ``src_dir`` is
    expected to contain a CSV of the same base name. The CSV is extracted to
    ``tmp_dir``, read with every column as ``object`` and then deleted.

    Parameters
    ----------
    yr : int or str
        Year used to build the archive/CSV file name.
    src_dir : str, optional
        Directory holding the yearly zip archives.
    tmp_dir : str, optional
        Scratch directory the CSV is extracted into.

    Returns
    -------
    pandas.DataFrame
        The file contents with these columns added or overwritten:
        'State Code' (2-digit state FIPS), 'FIPS Code' (state + county code),
        'Full Census Tract' ('FIPS Code' + 'Census Tract') and 'ZipCode'
        (left-padded with zeros to 5 characters).

    Raises
    ------
    KeyError
        If a value in the 'State' column has no entry in ``state_fips``.
    """
    print(f'\n{yr}:')
    print(f'\n{yr}:', file=logfile)
    fname = f'{yr}_Business_Academic_QCQ_utf-8'

    # Extract the annual file from the zip archive; extract() returns the path
    # of the file it created.
    with ZipFile(os.path.join(src_dir, f'{fname}.zip'), 'r') as myzip:
        extracted = myzip.extract(f'{fname}.csv', tmp_dir)

    try:
        df = pd.read_csv(extracted, low_memory=False, dtype=object)
    finally:
        # Delete the temp file even when parsing fails.
        os.remove(extracted)

    # Add 'State Code' column, the 2-digit FIPS code. Unmapped (or missing)
    # states are reported together instead of failing on the first one.
    state_code = df['State'].map(state_fips)
    unmapped = df.loc[state_code.isnull(), 'State']
    if not unmapped.empty:
        raise KeyError(
            f'No FIPS code for State value(s): {sorted(set(map(str, unmapped)))}'
        )
    df['State Code'] = state_code

    # Correct and overwrite the state/county FIPS code.
    df['FIPS Code'] = df['State Code'] + df['County Code']

    # Add Full Census Tract column, the 11-digit census tract identifying
    # a tract uniquely nationwide. The 'Census Tract' variable in InfoGroup
    # is the 6-digit code that identifies a tract only within a county.
    df['Full Census Tract'] = df['FIPS Code'] + df['Census Tract']

    # Zero-fill the ZipCode value. The previous test `len(x) < 5 == 0` is a
    # chained comparison that is always False, so nothing was ever padded, and
    # len() raised TypeError on missing values. The .str accessor pads short
    # strings and leaves missing values as NaN.
    df['ZipCode'] = df['ZipCode'].str.zfill(5)
    return df

def CBSA_partition(df):
    """Split enterprises by CBSA status and repair those lacking a CBSA Code.

    Rows are grouped by which of 'CBSA Level' / 'CBSA Code' are present:

    * urban   -- 'CBSA Level' is present
    * rural   -- 'CBSA Code' is present but 'CBSA Level' is missing
    * unknown -- 'CBSA Code' is missing

    The unknown rows are passed to ``extract_corrections``; the helper
    columns 'CBSA' and 'LSAD' are dropped from its result. Diagnostics are
    written to the module-level ``logfile``.

    Returns a tuple ``(urban, rural, corrected)`` of DataFrames.

    NOTE(review): ``extract_corrections`` inner-joins on 'FIPS Code', so
    unknown rows without a match are in none of the returned frames, and the
    matched 'CBSA' value is dropped rather than copied into 'CBSA Code' --
    confirm both are intended.
    """
    has_level = df['CBSA Level'].notnull()
    has_code = df['CBSA Code'].notnull()

    urban = df[has_level]
    rural = df[has_code & ~has_level]
    unknown = df[~has_code]

    # Sanity check: the three groups should account for every input row.
    total_rows = len(df)
    grouped_rows = len(urban) + len(rural) + len(unknown)
    if grouped_rows != total_rows:
        print('Error in dividing enterprises into categories:', file=logfile)
        print(f'\t{total_rows} != {grouped_rows}', file=logfile)

    # The rename only has an effect if the merge produced a suffixed
    # 'FIPS Code_l' column; otherwise it leaves the frame unchanged.
    corrected = (
        extract_corrections(unknown)
        .drop(columns=['CBSA', 'LSAD'])
        .rename(columns={"FIPS Code_l": "FIPS Code"})
    )

    # Log the distribution of assigned levels and how many remain unassigned.
    assigned_level = corrected['CBSA Level']
    print(assigned_level.value_counts(), file=logfile)
    print(int(assigned_level.isnull().sum()), file=logfile)
    return (urban, rural, corrected)

def extract_corrections(unknowns):
    """Look up CBSA information for enterprises by their state/county FIPS code.

    ``unknowns`` is inner-joined with the module-level ``cbsa_df`` on
    'FIPS Code' (compared as strings), so only rows whose county appears in
    that reference table are returned. The result carries the reference
    columns 'CBSA' and 'LSAD' plus a 'CBSA Level' column derived from the
    'LSAD' text:

    * 2   -- 'LSAD' contains "Metropolitan"
    * 1   -- otherwise, 'LSAD' contains "Micropolitan"
    * NaN -- neither (including a missing 'LSAD')

    The input frame is not modified.

    NOTE(review): 'CBSA Level' is numeric (float) here; confirm that matches
    the representation used for rows that already had a level.
    """
    # Build the string join key on a copy: ``unknowns`` is typically a filtered
    # view of the caller's frame, and assigning into it mutated the caller's data.
    keyed = unknowns.assign(**{'FIPS Code': unknowns['FIPS Code'].astype(str)})
    unk = keyed.merge(cbsa_df, on='FIPS Code', how='inner')

    # Vectorized replacement for the per-row lookup loop. Literal (non-regex)
    # substring tests; a missing LSAD counts as "no match".
    lsad = unk['LSAD']
    is_metro = lsad.str.contains('Metropolitan', regex=False, na=False).to_numpy(dtype=bool)
    is_micro = lsad.str.contains('Micropolitan', regex=False, na=False).to_numpy(dtype=bool)

    # np.select takes the first matching condition, so Metropolitan wins over
    # Micropolitan exactly as the original if/elif did.
    unk['CBSA Level'] = np.select([is_metro, is_micro], [2, 1], default=np.nan)
    return unk
    
def showtime(num):
    """Print ``num`` (a progress label) followed by the current local date and time.

    The timestamp is formatted as day/month/year hour:minute:second.
    """
    stamp = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
    # Four spaces: the two-space filler plus print()'s separator on each side.
    print(f"{num!s}    {stamp}")

## Reference datasets and derived data structures

In [None]:
# Census relationship file cross-referencing CBSA codes with state/county FIPS
# codes. Only three columns are loaded (dtype=object): 'STCOU', the 5-digit
# state/county FIPS code, which is renamed to 'FIPS Code' so it can serve as
# the join key; 'CBSA'; and 'LSAD', the text from which the CBSA Level is inferred.
cbsa_df = (
    pd.read_csv(
        '/InfoGroup/data/rurality/reference/relationships/cbsa-county-relationships-2017.csv',
        usecols=['STCOU', 'CBSA', 'LSAD'],
        dtype=object,
    )
    .rename(columns={'STCOU': 'FIPS Code'})
)
# Force the join key to plain strings.
cbsa_df['FIPS Code'] = cbsa_df['FIPS Code'].astype(str)

## Main

In [None]:
# Open the step-1 log file. Mode 'w' truncates any existing log. The functions
# defined above write to it through the module-level name `logfile`, so this
# cell must run before they are called.
logfile = open('/InfoGroup/data/rurality/logs/step1.log','w')

In [None]:
# Process each annual file: load and correct it, split it by CBSA status,
# recombine the three parts and write the result to a per-year CSV.
# showtime() prints a timestamp after each stage so progress can be followed.
# The loop over all years 1997-2017 is disabled; only 2017 is processed.
#for yr in range(1997,2018):
for yr in range(2017,2018):
    showtime('start')
    df = extract_and_correct(yr)
    showtime('extract_and_correct')
    (urban,rural,corrected) = CBSA_partition(df)
    showtime('CBSA_partition')
    # Stack the three groups into one frame with a fresh 0..n-1 index.
    final_df = pd.concat([urban,rural,corrected],ignore_index=True)
    showtime('inline concat')
    # index=None: the row index is not written to the CSV.
    final_df.to_csv(f'/InfoGroup/data/rurality/step1_{yr}.csv',index=None)
    showtime('finished')

In [None]:
# Close the log file opened earlier so any buffered log output is written out.
logfile.close()