In [2]:
import pandas as pd

# Input locations: the FFIEC census flat file and the Platform geography extract.
PLATFORM_GEO_FILE = "../data/platform_geo_file_5_31_2019.txt"
FFIEC_GEO_FILE = "../data/census2018.csv"

# Both files are read with dtype=object so pandas applies no numeric type
# inference to any column. The FFIEC file is read with header=None, so its
# columns are labelled by integer position; the Platform file is pipe-delimited.
ffiec_df = pd.read_csv(FFIEC_GEO_FILE, dtype=object, header=None)
platform_geo = pd.read_csv(PLATFORM_GEO_FILE, dtype=object, sep="|")

In [3]:
#set up FFIEC data
#FFIEC Census data includes MD in the MSA field
# Each pair is (column position in the raw FFIEC file, name used in this notebook),
# listed in the order the columns should appear in ffiec_census_data. Keeping the
# positions and the names in a single list means the column selection and the
# rename below cannot drift apart.
ffiec_columns = [
    (0, "hmda_year"),
    (1, "msa"),
    (2, "state"),
    (3, "county"),
    (4, "tract"),
    (13, "ffiec_mfi"),
    (14, "total_persons"),
    (20, "min_pop_pct"),
    (915, "owner_occupied"),
    (899, "1_4_units"),
    (580, "tract_mfi"),
    (12, "tract_to_msa_mfi_pct"),
    (952, "median_age"),
]
# Position -> name mapping, kept under its original name for any later use.
col_names = dict(ffiec_columns)

#get MSA, state, county, tract codes
# ffiec_df is loaded without a header row, so its column labels are the integer
# positions; selecting by position and renaming by label therefore use the same keys.
# The chain builds a new frame instead of renaming in place, so re-running this
# cell always starts from the untouched ffiec_df.
ffiec_census_data = (
    ffiec_df
    .iloc[:, [position for position, _name in ffiec_columns]]
    .rename(columns=col_names)
)
ffiec_census_data.head()

Unnamed: 0,hmda_year,msa,state,county,tract,ffiec_mfi,total_persons,min_pop_pct,owner_occupied,1_4_units,tract_mfi,tract_to_msa_mfi_pct,median_age
0,2018,33860,1,1,20100,62900,1948,12.58,507,724,72727,122.93,36
1,2018,33860,1,1,20200,62900,2156,59.55,433,785,48750,82.4,39
2,2018,33860,1,1,20300,62900,2968,25.47,828,1327,55766,94.26,39
3,2018,33860,1,1,20400,62900,4423,17.21,1345,1806,69114,116.82,46
4,2018,33860,1,1,20500,62900,10763,31.54,2255,3237,75574,127.74,14


In [4]:
# Summarise the FFIEC extract: total row count, then the number of distinct
# values in each geography column (a missing value counts as one distinct value).
ffiec_total = len(ffiec_census_data)
print(ffiec_total, "total records")

for column, label in [
    ("msa", "distinct MSA/MD records"),
    ("state", "distinct State records"),
    ("county", "distinct County records"),
    ("tract", "distinct Tract records"),
]:
    print(ffiec_census_data[column].nunique(dropna=False), label)

ffiec_census_data.head()

75883 total records
411 distinct MSA/MD records
56 distinct State records
329 distinct County records
23970 distinct Tract records


Unnamed: 0,hmda_year,msa,state,county,tract,ffiec_mfi,total_persons,min_pop_pct,owner_occupied,1_4_units,tract_mfi,tract_to_msa_mfi_pct,median_age
0,2018,33860,1,1,20100,62900,1948,12.58,507,724,72727,122.93,36
1,2018,33860,1,1,20200,62900,2156,59.55,433,785,48750,82.4,39
2,2018,33860,1,1,20300,62900,2968,25.47,828,1327,55766,94.26,39
3,2018,33860,1,1,20400,62900,4423,17.21,1345,1806,69114,116.82,46
4,2018,33860,1,1,20500,62900,10763,31.54,2255,3237,75574,127.74,14


In [5]:
# Summarise the Platform extract: total row count, then the number of distinct
# values in each geography column (a missing value counts as one distinct value).
platform_total = len(platform_geo)
print(platform_total, "total records")

for column, label in [
    ("MSA/MD", "distinct MSA/MD records"),
    ("State", "distinct State records"),
    ("County", "distinct county records"),
    ("Census Tract", "distinct census tracts"),
]:
    print(platform_geo[column].nunique(dropna=False), label)

platform_geo.head()

75883 total records
411 distinct MSA/MD records
56 distinct State records
329 distinct county records
23970 distinct census tracts


Unnamed: 0,Collection Year,MSA/MD,State,County,Census Tract,FFIEC Median Family Income,Population,Minority Population %,Number of Owner Occupied Units,Number of 1 to 4 Family Units,Tract MFI,Tract to MSA Income %,Median Age,Small County,MSA/MD Name
0,2018,33860,1,1,20100,62900,1948,12.58,507,724,72727,122.93,36,T,"MONTGOMERY,AL"
1,2018,33860,1,1,20200,62900,2156,59.55,433,785,48750,82.4,39,T,"MONTGOMERY,AL"
2,2018,33860,1,1,20300,62900,2968,25.47,828,1327,55766,94.26,39,T,"MONTGOMERY,AL"
3,2018,33860,1,1,20400,62900,4423,17.21,1345,1806,69114,116.82,46,T,"MONTGOMERY,AL"
4,2018,33860,1,1,20500,62900,10763,31.54,2255,3237,75574,127.74,14,T,"MONTGOMERY,AL"


In [6]:
# Compare the two files level by level: for each geography level, collect the
# distinct codes found on each side and report whether the two sets are equal.

# Distinct codes in the Platform file
platform_msas = set(platform_geo["MSA/MD"])
platform_states = set(platform_geo["State"])
platform_counties = set(platform_geo["County"])
platform_tracts = set(platform_geo["Census Tract"])

# Distinct codes in the FFIEC file
ffiec_msas = set(ffiec_census_data["msa"])
ffiec_states = set(ffiec_census_data["state"])
ffiec_counties = set(ffiec_census_data["county"])
ffiec_tracts = set(ffiec_census_data["tract"])

# One True/False line per level, in MSA/MD -> State -> County -> Tract order
for sets_match, label in [
    (platform_msas == ffiec_msas, "MSA/MD set comparison"),
    (platform_states == ffiec_states, "State set comparison"),
    (platform_counties == ffiec_counties, "County set comparison"),
    (platform_tracts == ffiec_tracts, "Tract set comparison"),
]:
    print(sets_match, label)

True MSA/MD set comparison
True State set comparison
True County set comparison
True Tract set comparison


In [7]:
def print_set_mismatches(platform_key, ffiec_key, platform_member, ffiec_member, message):
    """Print `message` for every key whose member sets differ between the two files.

    For each distinct value of the key column, the set of member-column values
    on the platform_geo rows carrying that key is compared with the set on the
    ffiec_census_data rows carrying that key.

    Parameters
    ----------
    platform_key, ffiec_key : str
        Name of the grouping column in platform_geo / ffiec_census_data.
    platform_member, ffiec_member : str
        Name of the column whose per-key value sets are compared.
    message : str
        Format string containing a single ``{key}`` placeholder.
    """
    # Walk the keys of BOTH files so a key that exists on only one side is
    # reported (its member set on the other side is empty) instead of skipped.
    keys = set(platform_geo[platform_key]) | set(ffiec_census_data[ffiec_key])
    # key=str gives a stable report order even if a missing value is among the keys.
    for key in sorted(keys, key=str):
        platform_members = set(
            platform_geo.loc[platform_geo[platform_key] == key, platform_member]
        )
        ffiec_members = set(
            ffiec_census_data.loc[ffiec_census_data[ffiec_key] == key, ffiec_member]
        )
        if platform_members != ffiec_members:
            # The placeholder is named, so the value must be passed by keyword;
            # passing it positionally raises KeyError as soon as a mismatch is found.
            print(message.format(key=key))


#MSA to state maps
print_set_mismatches("MSA/MD", "msa", "State", "state",
                     "MSA {key} has mismatched state sets")

#MSA to county maps
print_set_mismatches("MSA/MD", "msa", "County", "county",
                     "MSA {key} has mismatched counties sets")

#MSA to tract maps
print_set_mismatches("MSA/MD", "msa", "Census Tract", "tract",
                     "MSA {key} has mismatched tracts sets")

#state to county maps
print_set_mismatches("State", "state", "County", "county",
                     "State {key} has mismatched county sets")

#state to tract maps
print_set_mismatches("State", "state", "Census Tract", "tract",
                     "State {key} has mismatched tract sets")

#county to tract maps
# NOTE(review): this keys on the bare county code. The files hold only 329
# distinct county values, so the code looks like it repeats across states and
# this check pools tracts from same-numbered counties. Consider keying on
# (State, County) for a stricter comparison -- TODO confirm.
print_set_mismatches("County", "county", "Census Tract", "tract",
                     "County {key} has mismatched tract sets")

print("done")

done
