In [None]:
import pandas as pd
from zipfile import ZipFile
import os
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")

## Hardcoded data structures

In [None]:
# Column names requested from each annual CSV (handed to read_csv as `usecols`).
cols = [
    'Company',
    'Address Line 1',
    'City',
    'State',
    'ZipCode',
    'Zip4',
    'County Code',
    'Area Code',
    'IDCode',
    'Location Employee Size Code',
    'Location Sales Volume Code',
    'Primary SIC Code',
    'SIC6_Descriptions',
    'Primary NAICS Code',
    'NAICS8 Descriptions',
    'SIC Code',
    'SIC6_Descriptions (SIC)',
    'SIC Code 1',
    'SIC6_Descriptions (SIC1)',
    'SIC Code 2',
    'SIC6_Descriptions(SIC2)',
    'SIC Code 3',
    'SIC6_Descriptions(SIC3)',
    'SIC Code 4',
    'SIC6_Descriptions(SIC4)',
    'Archive Version Year',
    'Yellow Page Code',
    'Employee Size (5) - Location',
    'Sales Volume (9) - Location',
    'Business Status Code',
    'Industry Specific First Byte',
    'Year Established',
    'Office Size Code',
    'Company Holding Status',
    'ABI',
    'Subsidiary Number',
    'Parent Number',
    'Parent Actual Employee Size',
    'Parent Actual Sales Volume',
    'Parent Employee Size Code',
    'Parent Sales Volume Code',
    'Site Number',
    'Address Type Indicator',
    'Population Code',
    'Census Tract',
    'Census Block',
    'Latitude',
    'Longitude',
    'Match Code',
    'CBSA Code',
    'CBSA Level',
    'CSA Code',
    'FIPS Code',
]

In [None]:
# Two-letter postal abbreviation -> two-digit state FIPS code (kept as text so
# leading zeros survive). The FIPS Code variable in the source data contains
# some data entry errors, so the state part is re-derived from this table
# instead of being trusted.
state_fips = {
    'AL': '01', 'AK': '02', 'AS': '60', 'AZ': '04', 'AR': '05', 'CA': '06',
    'CO': '08', 'CT': '09', 'DE': '10', 'DC': '11', 'FL': '12', 'FM': '64',
    'GA': '13', 'GU': '66', 'HI': '15', 'ID': '16', 'IL': '17', 'IN': '18',
    'IA': '19', 'KS': '20', 'KY': '21', 'LA': '22', 'ME': '23', 'MH': '68',
    'MD': '24', 'MA': '25', 'MI': '26', 'MN': '27', 'MS': '28', 'MO': '29',
    'MT': '30', 'NE': '31', 'NV': '32', 'NH': '33', 'NJ': '34', 'NM': '35',
    'NY': '36', 'NC': '37', 'ND': '38', 'MP': '69', 'OH': '39', 'OK': '40',
    'OR': '41', 'PW': '70', 'PA': '42', 'PR': '72', 'RI': '44', 'SC': '45',
    'SD': '46', 'TN': '47', 'TX': '48', 'UM': '74', 'UT': '49', 'VT': '50',
    'VA': '51', 'VI': '78', 'WA': '53', 'WV': '54', 'WI': '55', 'WY': '56',
}

## Main functions

In [None]:
def extract_and_correct(yr):
    """Load one annual extract and normalise its CBSA, FIPS and ZIP columns.

    The year's CSV is unpacked from its zip archive into a scratch directory,
    read with every column as text, and the scratch copy is removed again
    (also when extraction or parsing fails).

    Parameters
    ----------
    yr : int
        Year used to build the archive/member name
        ``{yr}_Business_Academic_QCQ_utf-8``.

    Returns
    -------
    pandas.DataFrame
        The columns listed in ``cols``, all text except:
        * 'CBSA Code' and 'CBSA Level' are integers (missing values become 0);
        * 'FIPS Code' is rebuilt row by row with ``new_fips``;
        * 'ZipCode' is left-padded with zeros to five characters.

    Side effects: prints the year to stdout and to the module-level ``logfile``.
    """
    print(f'\n{yr}:')
    print(f'\n{yr}:',file=logfile)
    src_dir = '/InfoGroup/data/original/'   # renamed from `dir`, which shadowed the builtin
    xdir = '/tmp/xtrcts/'
    fname = f'{yr}_Business_Academic_QCQ_utf-8'
    csv_path = f'{xdir}{fname}.csv'

    try:
        # Extract the annual file from the zip archive, then read it as text.
        with ZipFile(f'{src_dir}{fname}.zip','r') as myzip:
            myzip.extract(f'{fname}.csv', xdir)
        df = pd.read_csv(csv_path, low_memory=False, usecols=cols, dtype=object)
    finally:
        # Delete the temp file, even if the read above raised.
        if os.path.exists(csv_path):
            os.remove(csv_path)

    # Prepare CBSA Code and CBSA Level for correction. Assign the results back
    # instead of using fillna(inplace=True) on a selected column, which is
    # chained assignment and does not update `df` under copy-on-write.
    df = df.fillna({'CBSA Level': '0', 'CBSA Code': '00000'})
    df = df.astype({'CBSA Code': int, 'CBSA Level': int})

    # Correct and overwrite the state FIPS code.
    df['FIPS Code'] = df.apply(new_fips, axis=1)

    # Zero-fill the ZipCode value. The previous test `len(x) < 5 == 0` was a
    # chained comparison (`len(x) < 5 and 5 == 0`) and therefore never true, so
    # nothing was padded. str.zfill leaves 5+ character values and NaN as is.
    df['ZipCode'] = df['ZipCode'].str.zfill(5)
    return df

def CBSA_partition(df):
    """Split records by CBSA status and assign a CBSA Level to unclassified ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with integer 'CBSA Code' and 'CBSA Level' columns and a
        'FIPS Code' column whose values can be cast to int.

    Returns
    -------
    tuple of (urban, rural, corrected)
        urban     -- rows with CBSA Level > 0
        rural     -- rows with CBSA Code > 0 and CBSA Level == 0
        corrected -- rows with CBSA Code < 0, or with both values 0, after
                     ``extract_corrections`` has set their 'CBSA Level'

    A row-count mismatch between the three groups and ``df`` is written to the
    module-level ``logfile`` but does not stop processing, as are the value
    counts of the corrected 'CBSA Level'.
    """
    level = df['CBSA Level']
    code = df['CBSA Code']

    urban = df[level > 0]
    rural = df[(code > 0) & (level == 0)]
    # NOTE(review): a row with CBSA Code < 0 and CBSA Level > 0 satisfies both
    # the urban and the unknown mask; the size check below would report it.
    # .copy() makes `unknown` independent of `df`, so the column write below
    # is not an assignment to a slice.
    unknown = df[(code < 0) | ((code == 0) & (level == 0))].copy()
    # extract_corrections joins on this column and the CBSA reference table
    # stores its county codes as integers.
    unknown['FIPS Code'] = unknown['FIPS Code'].astype(int)

    nrows = len(df)
    sum_of_parts = len(urban) + len(rural) + len(unknown)
    if sum_of_parts != nrows:
        print('Error in dividing enterprises into categories:',file=logfile)
        print(f'\t{nrows} != {sum_of_parts}',file=logfile)

    corrected = (
        extract_corrections(unknown)
        .drop(columns=['CBSA','FIPS Code_r','LSAD'])
        .rename(columns={"FIPS Code_l": "FIPS Code"})
    )
    # Put the FIPS code back into zero-padded 5-character text so it has the
    # same format as in `urban` and `rural`; otherwise the concatenated output
    # mixes strings such as '01001' with integers such as 1001.
    corrected['FIPS Code'] = corrected['FIPS Code'].astype(str).str.zfill(5)

    print(corrected['CBSA Level'].value_counts(),file=logfile)
    return (urban, rural, corrected)


## Little functions

In [None]:
def state_code(state):
    """Look up the FIPS code string for a state/territory abbreviation.

    Raises KeyError when `state` is not a key of `state_fips`.
    """
    fips = state_fips[state]
    return fips

def new_fips(row):
    """Build a 5-character county FIPS code for one record.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'FIPS Code', 'County Code' and 'State'.

    Returns
    -------
    str
        The state's code from ``state_fips`` followed by the county code
        left-padded to three characters, or '00000' when the original
        'FIPS Code' or the 'County Code' is missing, or when 'State' is
        missing or not present in ``state_fips``.
    """
    if pd.isna(row['FIPS Code']) or pd.isna(row['County Code']):
        return '00000'
    # .get instead of [] so an unmapped or missing state yields the same
    # '00000' placeholder instead of a KeyError that aborts the whole run.
    state_part = state_fips.get(row['State'])
    if state_part is None:
        return '00000'
    # Pad the county part so the result is always state(2) + county(3).
    return state_part + str(row['County Code']).zfill(3)

def extract_corrections(unknowns):
    """Attach CBSA reference data to records and derive their CBSA Level.

    Parameters
    ----------
    unknowns : pandas.DataFrame
        Records with an integer 'FIPS Code' column (5-digit county code).

    Returns
    -------
    pandas.DataFrame
        ``unknowns`` left-joined with ``cbsa_df``. The overlapping 'FIPS Code'
        columns come back as 'FIPS Code_l' (caller) and 'FIPS Code_r'
        (reference); 'CBSA' and 'LSAD' are added from the reference table.
        'CBSA Level' is overwritten: 2 if LSAD contains "Metropolitan",
        1 if it contains "Micropolitan", otherwise 0. 'CBSA Code' is not
        modified here.
    """
    # The reference file cross-references CBSA codes and county FIPS codes;
    # the CBSA Level is inferred from the text in its 'LSAD' column.
    #
    # DataFrame.join(on=...) matches the caller's column against the *index* of
    # the other frame. cbsa_df keeps its default positional index, so key the
    # lookup by county code here; joining against cbsa_df directly would match
    # FIPS codes to row numbers. One row per county keeps the left join from
    # multiplying records.
    lookup = cbsa_df.drop_duplicates(subset='FIPS Code')
    lookup = lookup.set_index(lookup['FIPS Code'].to_numpy())

    unk = unknowns.join(lookup,on='FIPS Code',how='left',lsuffix='_l',rsuffix='_r')

    # Vectorised replacement for a per-row .at loop; Metropolitan is applied
    # last so it takes precedence, as in the original if/elif order.
    lsad_text = unk['LSAD'].astype(str)
    unk['CBSA Level'] = 0
    unk.loc[lsad_text.str.contains("Micropolitan", regex=False), 'CBSA Level'] = 1
    unk.loc[lsad_text.str.contains("Metropolitan", regex=False), 'CBSA Level'] = 2
    return unk
    
def showtime(num):
    """Print `num` (a label for the current step) followed by the current
    local time formatted as dd/mm/YYYY HH:MM:SS."""
    stamp = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
    # Four spaces: the same gap print() produced between its three arguments.
    print(f'{num!s}    {stamp}')

## Reference datasets and derived data structures

In [None]:
# Census relationship file mapping CBSAs to counties. Missing values in the
# three columns read are replaced with -9, 'STCOU' is renamed to 'FIPS Code',
# and that column is stored as an integer.
cbsa_df = (
    pd.read_csv(
        '/home/tflory/notebooks/InfoGroup/rurality/Relationship_Files/cbsa-county-relationships-2017.csv',
        usecols=['STCOU', 'CBSA', 'LSAD'],
    )
    .fillna(-9)
    .rename(columns={'STCOU': 'FIPS Code'})
    .astype({'FIPS Code': int})
)

## Main

In [None]:
# Open the run log for writing; mode 'w' replaces any existing step1.log.
# extract_and_correct and CBSA_partition print to this module-level handle,
# so this cell has to run before the processing loop.
logfile = open('/InfoGroup/data/rurality/logs/step1.log','w')

In [None]:
# Process the annual files for 1997 through 2017, writing one CSV per year.
# showtime() prints a timestamp after each stage.
for yr in range(1997, 2018):
    showtime('start')

    df = extract_and_correct(yr)
    showtime('extract_and_correct')

    urban, rural, corrected = CBSA_partition(df)
    showtime('CBSA_partition')

    # Stack the three groups back into a single frame with a fresh index.
    final_df = pd.concat((urban, rural, corrected), ignore_index=True)
    showtime('inline concat')

    out_path = f'/InfoGroup/data/rurality/step1_{yr}.csv'
    final_df.to_csv(out_path, index=None)
    showtime('finished')

In [None]:
# Close the log handle opened before the processing loop so everything
# printed to it is written out.
logfile.close()