# Validate geo variables in InfoGroup

> Test for consistency across different spatial variables and correct errors when possible.

- Create 11-digit census tract identifier
- Fill missing CBSA codes from FIPS reference

## Possible future work

- Correct 2-digit state part of the FIPS code.
- Correct missing CBSA code and CBSA level, mainly in 2009.
- Validations:
  - codes are valid (i.e. can be found in lookup tables) for fields such as SIC, NAICS, FIPS, CBSA_CODE etc.
  - geo variable consistency: CBSA_LEVEL vs CBSA_CODE, lon-lat, nesting of areas

In [None]:
# This section is not supposed to work and was moved over from step1 for reference.
# Needs revision and integration in overall workflow.

def extract_and_correct(yr):
    """Load one year of InfoGroup data and derive corrected geographic codes.

    The annual CSV is extracted from its zip archive into a scratch
    directory, read with every column kept as a string (``dtype=object``),
    and the scratch copy is removed again, even if reading fails.

    Columns added or overwritten on the returned frame:
      - 'State Code': 2-digit state FIPS looked up from 'State'.
      - 'FIPS Code': 'State Code' + 'County Code' (overwrites the original).
      - 'Full Census Tract': 'FIPS Code' + 'Census Tract', the 11-digit
        identifier that is unique nationwide.
      - 'ZipCode': left-padded with zeros to five characters.

    Relies on the module-level names ``state_fips`` (state abbreviation ->
    2-digit FIPS string) and ``logfile`` (open text file).

    Args:
        yr: Year of the annual file to load.

    Returns:
        pandas.DataFrame with the columns described above.

    Raises:
        KeyError: if a 'State' value is not a key of ``state_fips``.
    """
    print(f'\n{yr}:')
    print(f'\n{yr}:', file=logfile)
    data_dir = '/InfoGroup/data/original/'
    xdir = '/tmp/xtrcts/'
    fname = f'{yr}_Business_Academic_QCQ_utf-8'
    csv_path = f'{xdir}{fname}.csv'

    try:
        # Extract the annual file from the zip archive
        with ZipFile(f'{data_dir}{fname}.zip', 'r') as myzip:
            myzip.extract(f'{fname}.csv', f'{xdir}')
        df = pd.read_csv(csv_path, low_memory=False, dtype=object)
    finally:
        # Delete the temp file, also when extraction or parsing failed
        if os.path.exists(csv_path):
            os.remove(csv_path)

    # Add 'State Code' column, the 2-digit FIPS code
    df['State Code'] = df['State'].apply(lambda s: state_fips[s])
    # Correct and overwrite the state FIPS code.
    df['FIPS Code'] = df['State Code'] + df['County Code']

    # Add Full Census Tract column, the 11-digit census tract identifying
    # a tract uniquely nationwide. The 'Census Tract' variable in InfoGroup
    # is the 6-digit code that identifies a tract only within a county.
    df['Full Census Tract'] = df['FIPS Code'] + df['Census Tract']

    # Zero-fill the ZipCode value. The vectorised string method pads short
    # codes and leaves missing values (NaN) untouched; the previous
    # `len(x) < 5 == 0` test was a chained comparison that was always False,
    # so nothing was ever padded.
    df['ZipCode'] = df['ZipCode'].str.zfill(5)
    return df

def CBSA_partition(df):
    """Split establishments by CBSA information and repair the unknown ones.

    Rows are divided into three groups:
      - urban: 'CBSA Level' is present.
      - rural: 'CBSA Code' is present but 'CBSA Level' is missing.
      - unknown: 'CBSA Code' is missing.

    The unknown rows are passed to ``extract_corrections``, which looks the
    county up in the CBSA reference table. The CBSA code found there is
    written into 'CBSA Code' before the helper columns are dropped.

    A mismatch between the group sizes and the input size is written to the
    module-level ``logfile``; it is reported, not raised.

    Args:
        df: DataFrame with 'CBSA Level', 'CBSA Code' and 'FIPS Code' columns.

    Returns:
        Tuple ``(urban, rural, corrected)`` of DataFrames.
    """
    has_level = df['CBSA Level'].notna()
    has_code = df['CBSA Code'].notna()

    urban = df[has_level]
    rural = df[has_code & ~has_level]
    unknown = df[~has_code]

    nrows = len(df)
    sum_of_parts = len(urban) + len(rural) + len(unknown)
    if sum_of_parts != nrows:
        print('Error in dividing enterprises into categories:', file=logfile)
        print(f'\t{nrows} != {sum_of_parts}', file=logfile)

    # Hand over an independent copy: `unknown` is a slice of `df`, and the
    # helper may assign columns on its argument.
    corrected = extract_corrections(unknown.copy())
    # Every row here had a missing 'CBSA Code'; fill it from the reference
    # table's 'CBSA' column before that column is discarded.
    corrected['CBSA Code'] = corrected['CBSA']
    corrected = corrected.drop(columns=['CBSA', 'LSAD'])
    corrected = corrected.rename(columns={"FIPS Code_l": "FIPS Code"})

    print(corrected['CBSA Level'].value_counts(), file=logfile)
    print(len(corrected[corrected['CBSA Level'].isnull()]), file=logfile)
    return (urban, rural, corrected)

def extract_corrections(unknowns):
    """Look up the CBSA code and derive the CBSA level for InfoGroup FIPS codes.

    The input rows are inner-joined to the module-level ``cbsa_df`` on
    'FIPS Code', so rows whose county is not in the reference table are not
    returned. 'CBSA Level' is then (re)computed from the 'LSAD' text:
    2 when it contains "Metropolitan", otherwise 1 when it contains
    "Micropolitan", otherwise NaN.

    The caller's DataFrame is left unmodified.

    Args:
        unknowns: DataFrame with a 'FIPS Code' column.

    Returns:
        DataFrame of matched rows with the reference columns 'CBSA' and
        'LSAD' attached and a float 'CBSA Level' column.
    """
    # Build the string join key on a new frame instead of assigning into
    # the argument, which may be a slice of a larger frame.
    keyed = unknowns.assign(**{'FIPS Code': unknowns['FIPS Code'].astype(str)})
    unk = keyed.merge(cbsa_df, on='FIPS Code', how='inner')

    # astype(str) makes the match safe for missing or non-string LSAD values.
    lsad = unk['LSAD'].astype(str)
    is_metro = lsad.str.contains('Metropolitan', regex=False, na=False)
    is_micro = lsad.str.contains('Micropolitan', regex=False, na=False)

    # np.select takes the first matching condition, so "Metropolitan" wins
    # over "Micropolitan" exactly as in an if/elif chain.
    unk['CBSA Level'] = np.select([is_metro, is_micro], [2.0, 1.0], default=np.nan)
    return unk
    
def showtime(num):
    """Print ``num`` followed by the current local time (dd/mm/YYYY HH:MM:SS)."""
    stamp = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
    # Label and timestamp are separated by four spaces.
    print(f"{num!s}    {stamp}")

## Reference datasets and derived data structures

# County-to-CBSA crosswalk from the Census relationship file.
# 'STCOU' holds the 5-digit state/county FIPS code and is renamed to
# 'FIPS Code' so it can serve as the join key. The CBSA Level is not stored
# in the file; it is inferred from the text of the 'LSAD' variable.
cbsa_df = pd.read_csv(
    '/InfoGroup/data/rurality/reference/relationships/cbsa-county-relationships-2017.csv',
    usecols=['STCOU', 'CBSA', 'LSAD'],
    dtype=object,
)
cbsa_df = cbsa_df.rename(columns={'STCOU': 'FIPS Code'})
cbsa_df['FIPS Code'] = cbsa_df['FIPS Code'].astype(str)

## Main

# Open a log file. The context manager guarantees it is flushed and closed
# even when a year fails part-way through. `logfile` stays a module-level
# name because extract_and_correct() and CBSA_partition() write to it.
with open('/InfoGroup/data/rurality/logs/step1.log', 'w') as logfile:
    #for yr in range(1997,2018):
    for yr in range(2017, 2018):
        showtime('start')
        df = extract_and_correct(yr)
        showtime('extract_and_correct')
        (urban, rural, corrected) = CBSA_partition(df)
        showtime('CBSA_partition')
        # Recombine the three groups into one frame with a fresh index.
        final_df = pd.concat([urban, rural, corrected], ignore_index=True)
        showtime('inline concat')
        final_df.to_csv(f'/InfoGroup/data/rurality/step1_{yr}.csv', index=False)
        showtime('finished')

# Validation of geographic variables

Geographic variables

- ADDRESS: historical address
- CITY: historical address city
- STATE: historical address state
- ZIP: historical address zip code
- ZIP4: historical address zip code zip + 4
- COUNTY_CODE: county code based upon location address/zip4 (postal)
- AREA_CODE: area code of business
- ADDRESS_TYPE: indicates the type of address. "F": "Firm", "G": "General delivery", "H": "High-rise", "M": "Military", "P": "Post office box", "R": "Rural route or hwy contract", "S": "Street", "N": "Unknown", "": "No match to Zip4".
- CENSUS_TRACT: identifies a small geographic area for the purpose of collecting and compiling population and housing data.  census tracts are unique only within census county, and census counties are unique only within census state.  
- CENSUS_BLOCK: block groups (BGs) are subdivisions of census tracts and are unique only within a specific census tract.  Census tracts/block groups are assigned to address records via a geocoding process.
- LATITUDE: parcel level, assigned via point geocoding.  Half of a pair of coordinates (the other being longitude), provided as a formatted value with decimals or a negative sign. Not available in Puerto Rico & the Virgin Islands.
- LONGITUDE: parcel level, assigned via point geocoding.  Note: longitudes are negative values in the western hemisphere.  Provided as a formatted value with decimals or a negative sign. Not available in Puerto Rico & the Virgin Islands.
- MATCH_CODE: parcel level match code of the business location. "0": "Site level", "2": "Zip+2 centroid", "4": "Zip+4 centroid", "P": "Parcel", "X": "Zip centroid".
- CBSA_CODE: core based statistical area (expanded MSA code)
- CBSA_LEVEL: indicates if an area is a micropolitan or metropolitan area. "1": "Micropolitan", "2": "Metropolitan"
- CSA_CODE: adjoining cbsa's.  combination of metro and micro areas
- FIPS_CODE: first 2 bytes = state code, last 3 bytes = county code (location)

In [None]:
# Geographic columns examined in every annual file.
geo_cols = [
    'ADDRESS', 'CITY', 'STATE', 'ZIP', 'ZIP4', 'COUNTY_CODE', 'AREA_CODE',
    'ADDRESS_TYPE', 'CENSUS_TRACT', 'CENSUS_BLOCK', 'LATITUDE', 'LONGITUDE',
    'MATCH_CODE', 'CBSA_CODE', 'CBSA_LEVEL', 'CSA_CODE', 'FIPS_CODE',
]

# Per-year tallies, keyed by year:
#   total_count[year]      -> number of rows
#   isna_count[year][col]  -> number of missing values in `col`
#   other_count[year][col] -> number of rows holding a placeholder-like value
total_count = {}
isna_count = {}
other_count = {}

for year in range(1997, 2018):
    df = get_df(year, cols=geo_cols)

    total_count[year] = len(df)
    isna_count[year] = {col: df[col].isna().sum() for col in geo_cols}
    other_count[year] = {
        'ADDRESS_TYPE': (df['ADDRESS_TYPE'] == 'N').sum(),
        # is 000000 a valid tract id?
        'CENSUS_TRACT': (df['CENSUS_TRACT'] == '000000').sum(),
        # is 0 a valid block id?
        'CENSUS_BLOCK': (df['CENSUS_BLOCK'] == '0').sum(),
        'CBSA_CODE': (df['CBSA_CODE'] == '00000').sum(),
        'CSA_CODE': (df['CSA_CODE'] == '000').sum(),
    }

## STATE, COUNTY_CODE and FIPS_CODE

- STATE is never missing
- Tiny fraction (0.0001%) have missing COUNTY_CODE or FIPS_CODE
- Until 2012, about 2% have inconsistent codes, and only a few after that
  - Can correct using either STATE or state part of FIPS_CODE as truth

In [None]:
# Two-letter postal abbreviation -> 2-digit FIPS code (as a string) for the
# states, DC and the territories / outlying areas.
state_fips_map = {
    'AL': '01',
    'AK': '02',
    'AS': '60',
    'AZ': '04',
    'AR': '05',
    'CA': '06',
    'CO': '08',
    'CT': '09',
    'DE': '10',
    'DC': '11',
    'FL': '12',
    'FM': '64',
    'GA': '13',
    'GU': '66',
    'HI': '15',
    'ID': '16',
    'IL': '17',
    'IN': '18',
    'IA': '19',
    'KS': '20',
    'KY': '21',
    'LA': '22',
    'ME': '23',
    'MH': '68',
    'MD': '24',
    'MA': '25',
    'MI': '26',
    'MN': '27',
    'MS': '28',
    'MO': '29',
    'MT': '30',
    'NE': '31',
    'NV': '32',
    'NH': '33',
    'NJ': '34',
    'NM': '35',
    'NY': '36',
    'NC': '37',
    'ND': '38',
    'MP': '69',
    'OH': '39',
    'OK': '40',
    'OR': '41',
    'PW': '70',
    'PA': '42',
    'PR': '72',
    'RI': '44',
    'SC': '45',
    'SD': '46',
    'TN': '47',
    'TX': '48',
    'UM': '74',
    'UT': '49',
    'VT': '50',
    'VA': '51',
    'VI': '78',
    'WA': '53',
    'WV': '54',
    'WI': '55',
    'WY': '56',
}

In [None]:
# Send the per-year consistency tables to a log file; `force=True` replaces
# any handlers configured earlier.
logging.basicConfig(
    filename=resources.paths.root/'tmp/geo_valid.log',
    level=logging.INFO,
    format='%(message)s',
    force=True,
)
# logging.basicConfig(stream=sys.stdout, level=logging.INFO, format='%(message)s', force=True)

# crosstabs[year] -> counts of rows by missingness and consistency flags.
crosstabs = {}
for year in range(1997, 2018):
    df = get_df(year, cols=['STATE', 'COUNTY_CODE', 'FIPS_CODE'])

    # State abbreviation -> 2-digit FIPS; missing STATE values are skipped.
    df['_STATE_CODE'] = df['STATE'].map(state_fips_map, 'ignore')

    # Missingness flags. The Series names become the crosstab axis labels.
    state_isna = df['STATE'].isna().rename('STATE is NA')
    county_isna = df['COUNTY_CODE'].isna().rename('COUNTY_CODE is NA')
    fips_isna = df['FIPS_CODE'].isna().rename('FIPS_CODE is NA')

    # Consistency flags: FIPS_CODE is compared piecewise, first two
    # characters against the state code and the remainder against the county.
    county_eq_fips = (df['COUNTY_CODE'] == df['FIPS_CODE'].str[2:]).rename(
        'COUNTY_CODE consistent with FIPS_CODE')
    state_eq_fips = (df['_STATE_CODE'] == df['FIPS_CODE'].str[:2]).rename(
        'STATE consistent with FIPS_CODE')

    crosstab = pd.crosstab(
        index=[state_isna, county_isna, fips_isna],
        columns=[county_eq_fips, state_eq_fips],
    )
    # Move both column levels into the index for later concatenation.
    crosstabs[year] = crosstab.stack([0, 1])
    logging.info(f'''
    ---- {year} ----
    STATE, COUNTY_CODE and FIPS_CODE consistency

    {crosstab}
    ''')

In [None]:
# Stack the yearly tables (dict keys become the outermost index level),
# pivot that level out, and transpose so that it labels the rows.
ct = pd.concat(crosstabs).unstack(0)
ct = ct.fillna(0).astype(int).T
# Keep only columns with at least one non-zero count, then drop the first
# column level.
ct = ct.loc[:, (ct != 0).any()].droplevel(level=0, axis=1)
ct

In [None]:
# Collapse the detailed table `ct` into three categories per row:
# valid, missing, and inconsistent.
ctf = pd.DataFrame()
# Column keyed (False, False, True, True).
# NOTE(review): presumably COUNTY_CODE not NA, FIPS_CODE not NA, and both
# consistency flags True — confirm against the column level order of `ct`.
ctf['Valid'] = ct.loc[:, (False, False, True, True)]
# Sum of every column under first-level key True, plus the first column
# under key (False, True).
# NOTE(review): presumably "COUNTY_CODE is NA" plus "only FIPS_CODE is NA";
# taking `.iloc[:, 0]` assumes a single relevant column there — confirm.
ctf['Missing COUNTY_CODE or FIPS_CODE'] = ct[True].sum(1) + ct[(False, True,)].iloc[:, 0]
# Remainder: row total of `ct` minus the two categories filled in so far.
ctf['Inconsistent codes'] = ct.sum(1) - ctf.sum(1)
# Sanity check: the three categories add up to the row totals of `ct`.
pd.testing.assert_series_equal(ct.sum(1), ctf.sum(1))
# Append a 'Total' row (column sums), then a 'Total' column (row sums).
# The order matters: the 'Total' column must include the new 'Total' row.
ctf.loc['Total', :] = ctf.sum()
ctf['Total'] = ctf.sum(1)
ctf = ctf.astype(int)
# Display with thousands separators.
ctf.style.format('{:,}')

In [None]:
# Express every cell as a share of its row's 'Total' (row-wise division);
# the 'Total' column itself becomes 1.
ctfr = ctf.div(ctf['Total'], axis=0)
ctfr.style.format('{:.3%}')

## CBSA_CODE and CBSA_LEVEL