In [1]:
import os
import sys
import re
import csv

import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import shapely.wkt
import usaddress
from number_parser import parse_ordinal

In [2]:
os.chdir('..')
abs_path = os.getcwd()

zcta_folder = os.path.join(abs_path, 'data/tl_2023_us_zcta520_clean')
zcta_file = 'tl_2023_us_zcta520_clean.shp'
zcta_path = os.path.join(zcta_folder, zcta_file)

OMOP_county_full_path = os.path.join(abs_path, 'output', 'OMOP_county_full.csv')
OMOP_failed_county_full_path = os.path.join(abs_path, 'output', 'OMOP_failed_county.csv')

zcta_crosswalk_file = "tab20_zcta520_county20_natl.txt"
zcta_crosswalk_path = os.path.join(abs_path, "data", zcta_crosswalk_file)

In [3]:
def create_dir(save_dir):
    """
    Creates directory if it does not exist
         
    Parameters
    ----------
        save_dir (str): path of desired output directory
    """
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

# ZCTA County Match

**The goal is to match county names for each address validated by two methods**
1) Matching county names
* ZCTA 2023: spatial join polygon to the address lat/lon
* ZCTA 2020 Crosswalk: match GEOID to address zipcode

2) Keep addresses only where the county names match

In [7]:
omop_county_full = pd.read_csv(OMOP_county_full_path)
omop_county_failed = pd.read_csv(OMOP_failed_county_full_path)
zcta_crosswalk = pd.read_csv(zcta_crosswalk_path, sep="|")

  omop_county_full = pd.read_csv(OMOP_county_full_path)


In [8]:
# drop unnecessary columns
zcta_crosswalk = zcta_crosswalk.drop(columns=["OID_ZCTA5_20", "AREALAND_ZCTA5_20", "AREAWATER_ZCTA5_20", "MTFCC_ZCTA5_20", "CLASSFP_ZCTA5_20", "FUNCSTAT_ZCTA5_20", 
                                              "AREALAND_COUNTY_20", "AREAWATER_COUNTY_20", "MTFCC_COUNTY_20", "CLASSFP_COUNTY_20", "FUNCSTAT_COUNTY_20", "AREALAND_PART", "AREAWATER_PART"])
zcta_crosswalk

Unnamed: 0,GEOID_ZCTA5_20,NAMELSAD_ZCTA5_20,OID_COUNTY_20,GEOID_COUNTY_20,NAMELSAD_COUNTY_20
0,,,27590114112812,1003,Baldwin County
1,,,2759099719300,1007,Bibb County
2,,,27590103020886,1015,Calhoun County
3,,,27590336389978,1021,Chilton County
4,,,2759075862059,1025,Clarke County
...,...,...,...,...,...
47858,99923.0,ZCTA5 99923,275903025880880,2198,Prince of Wales-Hyder Census Area
47859,99925.0,ZCTA5 99925,275903025880880,2198,Prince of Wales-Hyder Census Area
47860,99926.0,ZCTA5 99926,275903025880880,2198,Prince of Wales-Hyder Census Area
47861,99927.0,ZCTA5 99927,275903025880880,2198,Prince of Wales-Hyder Census Area


In [9]:
zcta_shapes = gpd.read_file(zcta_path)
zcta_shapes = zcta_shapes.to_crs("EPSG:4326")
zcta_shapes

Unnamed: 0,ZCTA5CE20,GEOID20,GEOIDFQ20,INTPTLAT20,INTPTLON20,geometry
0,47236,47236,860Z200US47236,+39.1517426,-085.7252769,"POLYGON ((-85.73410 39.15597, -85.72794 39.156..."
1,47870,47870,860Z200US47870,+39.3701518,-087.4735141,"POLYGON ((-87.47415 39.37016, -87.47410 39.370..."
2,47851,47851,860Z200US47851,+39.5735839,-087.2459559,"POLYGON ((-87.24769 39.57450, -87.24711 39.574..."
3,47337,47337,860Z200US47337,+39.8027537,-085.4372850,"POLYGON ((-85.44356 39.80328, -85.44345 39.803..."
4,47435,47435,860Z200US47435,+39.2657557,-086.2951577,"POLYGON ((-86.29592 39.26547, -86.29592 39.266..."
...,...,...,...,...,...,...
33786,37932,37932,860Z200US37932,+35.9172993,-084.1987873,"POLYGON ((-84.27347 35.93928, -84.27287 35.940..."
33787,37341,37341,860Z200US37341,+35.2199309,-085.0730025,"POLYGON ((-85.15090 35.11231, -85.15088 35.112..."
33788,37849,37849,860Z200US37849,+36.0540502,-084.0484876,"POLYGON ((-84.14857 36.04234, -84.14240 36.046..."
33789,37754,37754,860Z200US37754,+36.1390993,-084.0298007,"POLYGON ((-84.10549 36.11168, -84.10543 36.111..."


In [None]:
# save ZCTA shapefile after dropping columns
# ONLY RUN ONCE
zcta_shapes = zcta_shapes.drop(columns=["CLASSFP20", "MTFCC20", "FUNCSTAT20", "ALAND20", "AWATER20"])

# shp_file = dict[fname].set_geometry('source_centroid')
save_dir = os.path.join(abs_path, 'data')        
save_path = os.path.join(save_dir, "tl_2023_us_zcta520_clean")
create_dir(save_path)

zcta_shapes.to_file(save_path, driver='ESRI Shapefile')

In [10]:
# remove decimals from float and convert to stringh
zcta_crosswalk['ZCTA5CE20'] = zcta_crosswalk.GEOID_ZCTA5_20.apply(lambda x: str(x).rstrip(".0") if ".0" in str(x) else str(x))

# pad leading zero to zipcode if not 5-digit
zcta_crosswalk['ZCTA5CE20'] = zcta_crosswalk.ZCTA5CE20.apply(lambda x: np.NaN if x == "nan" else x.rjust(5, '0'))

# zcta_crosswalk['GEOID20'] = zcta_crosswalk.GEOID_COUNTY_20.astype(str)
# zcta_crosswalk['GEOID20'] = zcta_crosswalk.zip.apply(lambda x: str(x).zfill(5))
# convert to string without trailing zeros and decimal
zcta_crosswalk

Unnamed: 0,GEOID_ZCTA5_20,NAMELSAD_ZCTA5_20,OID_COUNTY_20,GEOID_COUNTY_20,NAMELSAD_COUNTY_20,ZCTA5CE20
0,,,27590114112812,1003,Baldwin County,
1,,,2759099719300,1007,Bibb County,
2,,,27590103020886,1015,Calhoun County,
3,,,27590336389978,1021,Chilton County,
4,,,2759075862059,1025,Clarke County,
...,...,...,...,...,...,...
47858,99923.0,ZCTA5 99923,275903025880880,2198,Prince of Wales-Hyder Census Area,99923
47859,99925.0,ZCTA5 99925,275903025880880,2198,Prince of Wales-Hyder Census Area,99925
47860,99926.0,ZCTA5 99926,275903025880880,2198,Prince of Wales-Hyder Census Area,99926
47861,99927.0,ZCTA5 99927,275903025880880,2198,Prince of Wales-Hyder Census Area,99927


In [12]:
zcta_crosswalk.loc[zcta_crosswalk.NAMELSAD_COUNTY_20.str.contains("Essex")]

Unnamed: 0,GEOID_ZCTA5_20,NAMELSAD_ZCTA5_20,OID_COUNTY_20,GEOID_COUNTY_20,NAMELSAD_COUNTY_20,ZCTA5CE20
365,,,27590206454990,25009,Essex County,
495,,,27590718981485,36031,Essex County,
1417,1810.0,ZCTA5 01810,27590206454990,25009,Essex County,00181
1420,1826.0,ZCTA5 01826,27590206454990,25009,Essex County,01826
1423,1830.0,ZCTA5 01830,27590206454990,25009,Essex County,00183
...,...,...,...,...,...,...
9105,22476.0,ZCTA5 22476,27590239331220,51057,Essex County,22476
9114,22504.0,ZCTA5 22504,27590239331220,51057,Essex County,22504
9118,22509.0,ZCTA5 22509,27590239331220,51057,Essex County,22509
9140,22560.0,ZCTA5 22560,27590239331220,51057,Essex County,02256


In [11]:
# merge ZCTA TIGER/Line file to the ZCTA crosswalk to match county names to ZCTA5CE20
zcta_county_name = zcta_shapes.merge(zcta_crosswalk, how="left", on="ZCTA5CE20")
zcta_county_name['zip'] = zcta_county_name.ZCTA5CE20
zcta_county_name

Unnamed: 0,ZCTA5CE20,GEOID20,GEOIDFQ20,INTPTLAT20,INTPTLON20,geometry,GEOID_ZCTA5_20,NAMELSAD_ZCTA5_20,OID_COUNTY_20,GEOID_COUNTY_20,NAMELSAD_COUNTY_20,zip
0,47236,47236,860Z200US47236,+39.1517426,-085.7252769,"POLYGON ((-85.73410 39.15597, -85.72794 39.156...",47236.0,ZCTA5 47236,2.759010e+13,18005.0,Bartholomew County,47236
1,47870,47870,860Z200US47870,+39.3701518,-087.4735141,"POLYGON ((-87.47415 39.37016, -87.47410 39.370...",,,,,,47870
2,47851,47851,860Z200US47851,+39.5735839,-087.2459559,"POLYGON ((-87.24769 39.57450, -87.24711 39.574...",47851.0,ZCTA5 47851,2.759035e+13,18167.0,Vigo County,47851
3,47337,47337,860Z200US47337,+39.8027537,-085.4372850,"POLYGON ((-85.44356 39.80328, -85.44345 39.803...",47337.0,ZCTA5 47337,2.759011e+13,18065.0,Henry County,47337
4,47435,47435,860Z200US47435,+39.2657557,-086.2951577,"POLYGON ((-86.29592 39.26547, -86.29592 39.266...",47435.0,ZCTA5 47435,2.759010e+13,18013.0,Brown County,47435
...,...,...,...,...,...,...,...,...,...,...,...,...
46906,37754,37754,860Z200US37754,+36.1390993,-084.0298007,"POLYGON ((-84.10549 36.11168, -84.10543 36.111...",37754.0,ZCTA5 37754,2.759026e+13,47001.0,Anderson County,37754
46907,37754,37754,860Z200US37754,+36.1390993,-084.0298007,"POLYGON ((-84.10549 36.11168, -84.10543 36.111...",37754.0,ZCTA5 37754,2.759023e+13,47093.0,Knox County,37754
46908,37754,37754,860Z200US37754,+36.1390993,-084.0298007,"POLYGON ((-84.10549 36.11168, -84.10543 36.111...",37754.0,ZCTA5 37754,2.759054e+13,47173.0,Union County,37754
46909,37806,37806,860Z200US37806,+36.0846931,-083.7279865,"MULTIPOLYGON (((-83.78542 36.08103, -83.78461 ...",37806.0,ZCTA5 37806,2.759022e+13,47057.0,Grainger County,37806


In [15]:
# add point geometry column that matches expected gpd format
omop_county_full['geometry'] = omop_county_full.apply(lambda x: "POINT (" + str(x.longitude) + " " + str(x.latitude) + ")", axis=1)

# convert to geopandas DataFrame and set `geometry` centroid 
omop_county_gdf = gpd.GeoDataFrame(omop_county_full, geometry=omop_county_full['geometry'].apply(shapely.wkt.loads))
omop_county_gdf.crs = "EPSG:4326"
omop_county_gdf

Unnamed: 0,Location_id,address_1,address_2,city,state,zip,county,location_source_value,latitude,longitude,address_type,state_abbr,flag,geometry
0,1,523 E Broadway,,South Boston,Massachusetts,02127,Suffolk,"523 E BROADWAY, SOUTH BOSTON, MA 02127",42.335472,-71.043522,Street Address,MA,SUCCESSFUL ADDRESS,POINT (-71.04352 42.33547)
1,2,454 Essex St,,Lawrence,Massachusetts,01840,Essex,"454 ESSEX ST, LAWRENCE, MA 01840",42.706213,-71.164940,Street Address,MA,SUCCESSFUL ADDRESS,POINT (-71.16494 42.70621)
2,3,569 Broadway,,Newark,New Jersey,07104,Essex,"569 BROADWAY, NEWARK, NJ 07104",40.769935,-74.162821,Street Address,NJ,SUCCESSFUL ADDRESS,POINT (-74.16282 40.76993)
3,4,3210 Southwestern Blvd,,Orchard Park,New York,14127,Erie,"3210 SOUTHWESTERN BLVD, ORCHARD PARK, NY 14127",42.798535,-78.747816,Street Address,NY,SUCCESSFUL ADDRESS,POINT (-78.74782 42.79853)
4,5,431 Campground Rd,,Livermore Fls,Maine,04254,Androscoggin,"431 CAMPGROUND RD, LIVERMORE FLS, ME 04254",44.428610,-70.113777,Street Address,ME,SUCCESSFUL ADDRESS,POINT (-70.11378 44.42861)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78795,104098,1300 Peachtree Industrial Boulevard,,Suwanee,Georgia,30024,Gwinnett,"1300 PEACHTREE INDUSTRIAL BOULEVARD, SUWANEE, ...",34.041727,-84.095174,Street Address,GA,SUCCESSFUL ADDRESS,POINT (-84.09517 34.04173)
78796,104099,2660 Satellite Boulevard Northwest,,Duluth,Georgia,30096,Gwinnett,"2660 SATELLITE BOULEVARD NORTHWEST, DULUTH, GA...",33.966797,-84.101318,Street Address,GA,SUCCESSFUL ADDRESS,POINT (-84.10132 33.96680)
78797,104100,3685 Braselton Highway,,Dacula,Georgia,30019,Gwinnett,"3685 BRASELTON HIGHWAY, DACULA, GA 30019",34.068832,-83.902215,Street Address,GA,SUCCESSFUL ADDRESS,POINT (-83.90222 34.06883)
78798,104101,1055 Dove Run Road,,Lexington,Kentucky,40502,Fayette,"1055 DOVE RUN ROAD, LEXINGTON, KY 40502",37.996508,-84.494106,Street Address,KY,SUCCESSFUL ADDRESS,POINT (-84.49411 37.99651)


In [16]:
# spatial join omop county file to ZCTA TIGER/Line
omop_county_zcta = omop_county_gdf.sjoin(zcta_county_name, how='left')

# add "County" to the county names
omop_county_zcta["county"] =  omop_county_zcta.county.apply(lambda x: str(x) + " County")
omop_county_zcta

Unnamed: 0,Location_id,address_1,address_2,city,state,zip_left,county,location_source_value,latitude,longitude,...,GEOID20,GEOIDFQ20,INTPTLAT20,INTPTLON20,GEOID_ZCTA5_20,NAMELSAD_ZCTA5_20,OID_COUNTY_20,GEOID_COUNTY_20,NAMELSAD_COUNTY_20,zip_right
0,1,523 E Broadway,,South Boston,Massachusetts,02127,Suffolk County,"523 E BROADWAY, SOUTH BOSTON, MA 02127",42.335472,-71.043522,...,02127,860Z200US02127,+42.3353007,-071.0382163,2127.0,ZCTA5 02127,2.759021e+13,25025.0,Suffolk County,02127
1,2,454 Essex St,,Lawrence,Massachusetts,01840,Essex County,"454 ESSEX ST, LAWRENCE, MA 01840",42.706213,-71.164940,...,01840,860Z200US01840,+42.7067633,-071.1604026,,,,,,01840
2,3,569 Broadway,,Newark,New Jersey,07104,Essex County,"569 BROADWAY, NEWARK, NJ 07104",40.769935,-74.162821,...,07104,860Z200US07104,+40.7677132,-074.1683498,71040.0,ZCTA5 71040,2.759069e+12,22027.0,Claiborne Parish,07104
2,3,569 Broadway,,Newark,New Jersey,07104,Essex County,"569 BROADWAY, NEWARK, NJ 07104",40.769935,-74.162821,...,07104,860Z200US07104,+40.7677132,-074.1683498,7104.0,ZCTA5 07104,2.759043e+13,34013.0,Essex County,07104
3,4,3210 Southwestern Blvd,,Orchard Park,New York,14127,Erie County,"3210 SOUTHWESTERN BLVD, ORCHARD PARK, NY 14127",42.798535,-78.747816,...,14127,860Z200US14127,+42.7528037,-078.7396939,14127.0,ZCTA5 14127,2.759012e+13,36029.0,Erie County,14127
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78796,104099,2660 Satellite Boulevard Northwest,,Duluth,Georgia,30096,Gwinnett County,"2660 SATELLITE BOULEVARD NORTHWEST, DULUTH, GA...",33.966797,-84.101318,...,30096,860Z200US30096,+33.9743018,-084.1453842,30096.0,ZCTA5 30096,2.759044e+13,13135.0,Gwinnett County,30096
78797,104100,3685 Braselton Highway,,Dacula,Georgia,30019,Gwinnett County,"3685 BRASELTON HIGHWAY, DACULA, GA 30019",34.068832,-83.902215,...,30019,860Z200US30019,+33.9756343,-083.8837695,30019.0,ZCTA5 30019,2.759044e+13,13135.0,Gwinnett County,30019
78797,104100,3685 Braselton Highway,,Dacula,Georgia,30019,Gwinnett County,"3685 BRASELTON HIGHWAY, DACULA, GA 30019",34.068832,-83.902215,...,30019,860Z200US30019,+33.9756343,-083.8837695,30019.0,ZCTA5 30019,2.759042e+13,13297.0,Walton County,30019
78798,104101,1055 Dove Run Road,,Lexington,Kentucky,40502,Fayette County,"1055 DOVE RUN ROAD, LEXINGTON, KY 40502",37.996508,-84.494106,...,40502,860Z200US40502,+38.0109529,-084.4831908,40502.0,ZCTA5 40502,2.759010e+13,21067.0,Fayette County,40502


In [24]:
omop_county_zcta.loc[omop_county_zcta.county.str.contains("Essex")]

Unnamed: 0,Location_id,address_1,address_2,city,state,zip_left,county,location_source_value,latitude,longitude,address_type,state_abbr,flag,geometry,ZCTA5CE20,GEOID20,NAMELSAD_COUNTY_20,county_match
1,2,454 Essex St,,Lawrence,Massachusetts,01840,Essex County,"454 ESSEX ST, LAWRENCE, MA 01840",42.706213,-71.164940,Street Address,MA,SUCCESSFUL ADDRESS,POINT (-71.16494 42.70621),01840,01840,,0
2,3,569 Broadway,,Newark,New Jersey,07104,Essex County,"569 BROADWAY, NEWARK, NJ 07104",40.769935,-74.162821,Street Address,NJ,SUCCESSFUL ADDRESS,POINT (-74.16282 40.76993),07104,07104,Claiborne Parish,0
2,3,569 Broadway,,Newark,New Jersey,07104,Essex County,"569 BROADWAY, NEWARK, NJ 07104",40.769935,-74.162821,Street Address,NJ,SUCCESSFUL ADDRESS,POINT (-74.16282 40.76993),07104,07104,Essex County,1
271,436,239 Ampere Pkwy,,Bloomfield,New Jersey,07003,Essex County,"239 AMPERE PKWY, BLOOMFIELD, NJ 07003",40.774346,-74.190551,Street Address,NJ,SUCCESSFUL ADDRESS,POINT (-74.19055 40.77435),07003,07003,Essex County,1
271,436,239 Ampere Pkwy,,Bloomfield,New Jersey,07003,Essex County,"239 AMPERE PKWY, BLOOMFIELD, NJ 07003",40.774346,-74.190551,Street Address,NJ,SUCCESSFUL ADDRESS,POINT (-74.19055 40.77435),07003,07003,St. Charles Parish,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55752,71731,196 Ray Brook Rd,,Ray Brook,New York,12977,Essex County,"196 RAY BROOK RD, RAY BROOK, NY 12977",44.295981,-74.091477,Street Address,NY,SUCCESSFUL ADDRESS,POINT (-74.09148 44.29598),12977,12977,Essex County,1
55760,71743,75 Burhart Lane,,Mineville,New York,12956,Essex County,"75 BURHART LANE, MINEVILLE, NY 12956",44.107500,-73.533249,Street Address,NY,SUCCESSFUL ADDRESS,POINT (-73.53325 44.10750),12956,12956,Essex County,1
64117,82654,5 Calkins Pl,,Ticonderoga,New York,12883,Essex County,"5 CALKINS PL, TICONDEROGA, NY 12883",43.842119,-73.427513,Street Address,NY,SUCCESSFUL ADDRESS,POINT (-73.42751 43.84212),12883,12883,Essex County,1
64117,82654,5 Calkins Pl,,Ticonderoga,New York,12883,Essex County,"5 CALKINS PL, TICONDEROGA, NY 12883",43.842119,-73.427513,Street Address,NY,SUCCESSFUL ADDRESS,POINT (-73.42751 43.84212),12883,12883,Washington County,0


In [17]:
omop_county_zcta["county_match"] = omop_county_zcta.apply(lambda x: 1 if x.county == x.NAMELSAD_COUNTY_20 else 0, axis=1)
omop_county_zcta

Unnamed: 0,Location_id,address_1,address_2,city,state,zip_left,county,location_source_value,latitude,longitude,...,GEOIDFQ20,INTPTLAT20,INTPTLON20,GEOID_ZCTA5_20,NAMELSAD_ZCTA5_20,OID_COUNTY_20,GEOID_COUNTY_20,NAMELSAD_COUNTY_20,zip_right,county_match
0,1,523 E Broadway,,South Boston,Massachusetts,02127,Suffolk County,"523 E BROADWAY, SOUTH BOSTON, MA 02127",42.335472,-71.043522,...,860Z200US02127,+42.3353007,-071.0382163,2127.0,ZCTA5 02127,2.759021e+13,25025.0,Suffolk County,02127,1
1,2,454 Essex St,,Lawrence,Massachusetts,01840,Essex County,"454 ESSEX ST, LAWRENCE, MA 01840",42.706213,-71.164940,...,860Z200US01840,+42.7067633,-071.1604026,,,,,,01840,0
2,3,569 Broadway,,Newark,New Jersey,07104,Essex County,"569 BROADWAY, NEWARK, NJ 07104",40.769935,-74.162821,...,860Z200US07104,+40.7677132,-074.1683498,71040.0,ZCTA5 71040,2.759069e+12,22027.0,Claiborne Parish,07104,0
2,3,569 Broadway,,Newark,New Jersey,07104,Essex County,"569 BROADWAY, NEWARK, NJ 07104",40.769935,-74.162821,...,860Z200US07104,+40.7677132,-074.1683498,7104.0,ZCTA5 07104,2.759043e+13,34013.0,Essex County,07104,1
3,4,3210 Southwestern Blvd,,Orchard Park,New York,14127,Erie County,"3210 SOUTHWESTERN BLVD, ORCHARD PARK, NY 14127",42.798535,-78.747816,...,860Z200US14127,+42.7528037,-078.7396939,14127.0,ZCTA5 14127,2.759012e+13,36029.0,Erie County,14127,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78796,104099,2660 Satellite Boulevard Northwest,,Duluth,Georgia,30096,Gwinnett County,"2660 SATELLITE BOULEVARD NORTHWEST, DULUTH, GA...",33.966797,-84.101318,...,860Z200US30096,+33.9743018,-084.1453842,30096.0,ZCTA5 30096,2.759044e+13,13135.0,Gwinnett County,30096,1
78797,104100,3685 Braselton Highway,,Dacula,Georgia,30019,Gwinnett County,"3685 BRASELTON HIGHWAY, DACULA, GA 30019",34.068832,-83.902215,...,860Z200US30019,+33.9756343,-083.8837695,30019.0,ZCTA5 30019,2.759044e+13,13135.0,Gwinnett County,30019,1
78797,104100,3685 Braselton Highway,,Dacula,Georgia,30019,Gwinnett County,"3685 BRASELTON HIGHWAY, DACULA, GA 30019",34.068832,-83.902215,...,860Z200US30019,+33.9756343,-083.8837695,30019.0,ZCTA5 30019,2.759042e+13,13297.0,Walton County,30019,0
78798,104101,1055 Dove Run Road,,Lexington,Kentucky,40502,Fayette County,"1055 DOVE RUN ROAD, LEXINGTON, KY 40502",37.996508,-84.494106,...,860Z200US40502,+38.0109529,-084.4831908,40502.0,ZCTA5 40502,2.759010e+13,21067.0,Fayette County,40502,1


In [18]:
omop_county_zcta.columns

Index(['Location_id', 'address_1', 'address_2', 'city', 'state', 'zip_left',
       'county', 'location_source_value', 'latitude', 'longitude',
       'address_type', 'state_abbr', 'flag', 'geometry', 'index_right',
       'ZCTA5CE20', 'GEOID20', 'GEOIDFQ20', 'INTPTLAT20', 'INTPTLON20',
       'GEOID_ZCTA5_20', 'NAMELSAD_ZCTA5_20', 'OID_COUNTY_20',
       'GEOID_COUNTY_20', 'NAMELSAD_COUNTY_20', 'zip_right', 'county_match'],
      dtype='object')

In [19]:
omop_county_zcta = omop_county_zcta.drop(columns = ["GEOIDFQ20", "INTPTLAT20", "INTPTLON20", "NAMELSAD_ZCTA5_20", "OID_COUNTY_20", "zip_right" ,"GEOID_ZCTA5_20", "GEOID_COUNTY_20", "index_right"])
omop_county_zcta = omop_county_zcta.rename({"zip_left": "zip"})
omop_county_zcta

Unnamed: 0,Location_id,address_1,address_2,city,state,zip_left,county,location_source_value,latitude,longitude,address_type,state_abbr,flag,geometry,ZCTA5CE20,GEOID20,NAMELSAD_COUNTY_20,county_match
0,1,523 E Broadway,,South Boston,Massachusetts,02127,Suffolk County,"523 E BROADWAY, SOUTH BOSTON, MA 02127",42.335472,-71.043522,Street Address,MA,SUCCESSFUL ADDRESS,POINT (-71.04352 42.33547),02127,02127,Suffolk County,1
1,2,454 Essex St,,Lawrence,Massachusetts,01840,Essex County,"454 ESSEX ST, LAWRENCE, MA 01840",42.706213,-71.164940,Street Address,MA,SUCCESSFUL ADDRESS,POINT (-71.16494 42.70621),01840,01840,,0
2,3,569 Broadway,,Newark,New Jersey,07104,Essex County,"569 BROADWAY, NEWARK, NJ 07104",40.769935,-74.162821,Street Address,NJ,SUCCESSFUL ADDRESS,POINT (-74.16282 40.76993),07104,07104,Claiborne Parish,0
2,3,569 Broadway,,Newark,New Jersey,07104,Essex County,"569 BROADWAY, NEWARK, NJ 07104",40.769935,-74.162821,Street Address,NJ,SUCCESSFUL ADDRESS,POINT (-74.16282 40.76993),07104,07104,Essex County,1
3,4,3210 Southwestern Blvd,,Orchard Park,New York,14127,Erie County,"3210 SOUTHWESTERN BLVD, ORCHARD PARK, NY 14127",42.798535,-78.747816,Street Address,NY,SUCCESSFUL ADDRESS,POINT (-78.74782 42.79853),14127,14127,Erie County,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78796,104099,2660 Satellite Boulevard Northwest,,Duluth,Georgia,30096,Gwinnett County,"2660 SATELLITE BOULEVARD NORTHWEST, DULUTH, GA...",33.966797,-84.101318,Street Address,GA,SUCCESSFUL ADDRESS,POINT (-84.10132 33.96680),30096,30096,Gwinnett County,1
78797,104100,3685 Braselton Highway,,Dacula,Georgia,30019,Gwinnett County,"3685 BRASELTON HIGHWAY, DACULA, GA 30019",34.068832,-83.902215,Street Address,GA,SUCCESSFUL ADDRESS,POINT (-83.90222 34.06883),30019,30019,Gwinnett County,1
78797,104100,3685 Braselton Highway,,Dacula,Georgia,30019,Gwinnett County,"3685 BRASELTON HIGHWAY, DACULA, GA 30019",34.068832,-83.902215,Street Address,GA,SUCCESSFUL ADDRESS,POINT (-83.90222 34.06883),30019,30019,Walton County,0
78798,104101,1055 Dove Run Road,,Lexington,Kentucky,40502,Fayette County,"1055 DOVE RUN ROAD, LEXINGTON, KY 40502",37.996508,-84.494106,Street Address,KY,SUCCESSFUL ADDRESS,POINT (-84.49411 37.99651),40502,40502,Fayette County,1


In [20]:
omop_county_zcta.county_match.value_counts()

county_match
1    69640
0    35287
Name: count, dtype: int64

In [21]:
omop_county_zcta_path = os.path.join(abs_path, 'output', 'omop_county_zcta.csv')
omop_county_zcta.to_csv(omop_county_zcta_path, index=False)

In [37]:
omop_county_zcta_path = os.path.join(abs_path, 'output', 'omop_county_zcta.csv')
omop_county_zcta = pd.read_csv(omop_county_zcta_path)

  omop_county_zcta = pd.read_csv(omop_county_zcta_path)


In [8]:
omop_county_zcta.state.nunique()

57

In [22]:
omop_county_zcta_match = omop_county_zcta.loc[omop_county_zcta.county_match == 1]
# omop_county_zcta_match = omop_county_zcta_match.drop(columns = ["GEOIDFQ20", "INTPTLAT20", "INTPTLON20", "NAMELSAD_ZCTA5_20", "OID_COUNTY_20", "zip_right" ,"Unnamed: 0", "GEOID_ZCTA5_20", "GEOID_COUNTY_20"])
omop_county_zcta_match = omop_county_zcta_match.rename({"zip_left": "zip"})
print(omop_county_zcta_match.columns)
omop_county_zcta_match

Index(['Location_id', 'address_1', 'address_2', 'city', 'state', 'zip_left',
       'county', 'location_source_value', 'latitude', 'longitude',
       'address_type', 'state_abbr', 'flag', 'geometry', 'ZCTA5CE20',
       'GEOID20', 'NAMELSAD_COUNTY_20', 'county_match'],
      dtype='object')


Unnamed: 0,Location_id,address_1,address_2,city,state,zip_left,county,location_source_value,latitude,longitude,address_type,state_abbr,flag,geometry,ZCTA5CE20,GEOID20,NAMELSAD_COUNTY_20,county_match
0,1,523 E Broadway,,South Boston,Massachusetts,02127,Suffolk County,"523 E BROADWAY, SOUTH BOSTON, MA 02127",42.335472,-71.043522,Street Address,MA,SUCCESSFUL ADDRESS,POINT (-71.04352 42.33547),02127,02127,Suffolk County,1
2,3,569 Broadway,,Newark,New Jersey,07104,Essex County,"569 BROADWAY, NEWARK, NJ 07104",40.769935,-74.162821,Street Address,NJ,SUCCESSFUL ADDRESS,POINT (-74.16282 40.76993),07104,07104,Essex County,1
3,4,3210 Southwestern Blvd,,Orchard Park,New York,14127,Erie County,"3210 SOUTHWESTERN BLVD, ORCHARD PARK, NY 14127",42.798535,-78.747816,Street Address,NY,SUCCESSFUL ADDRESS,POINT (-78.74782 42.79853),14127,14127,Erie County,1
4,5,431 Campground Rd,,Livermore Fls,Maine,04254,Androscoggin County,"431 CAMPGROUND RD, LIVERMORE FLS, ME 04254",44.428610,-70.113777,Street Address,ME,SUCCESSFUL ADDRESS,POINT (-70.11378 44.42861),04254,04254,Androscoggin County,1
5,6,105 Harris Ave,,Portland,Maine,04103,Cumberland County,"105 HARRIS AVE, PORTLAND, ME 04103",43.699355,-70.303214,Street Address,ME,SUCCESSFUL ADDRESS,POINT (-70.30321 43.69936),04103,04103,Cumberland County,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78795,104098,1300 Peachtree Industrial Boulevard,,Suwanee,Georgia,30024,Gwinnett County,"1300 PEACHTREE INDUSTRIAL BOULEVARD, SUWANEE, ...",34.041727,-84.095174,Street Address,GA,SUCCESSFUL ADDRESS,POINT (-84.09517 34.04173),30024,30024,Gwinnett County,1
78796,104099,2660 Satellite Boulevard Northwest,,Duluth,Georgia,30096,Gwinnett County,"2660 SATELLITE BOULEVARD NORTHWEST, DULUTH, GA...",33.966797,-84.101318,Street Address,GA,SUCCESSFUL ADDRESS,POINT (-84.10132 33.96680),30096,30096,Gwinnett County,1
78797,104100,3685 Braselton Highway,,Dacula,Georgia,30019,Gwinnett County,"3685 BRASELTON HIGHWAY, DACULA, GA 30019",34.068832,-83.902215,Street Address,GA,SUCCESSFUL ADDRESS,POINT (-83.90222 34.06883),30019,30019,Gwinnett County,1
78798,104101,1055 Dove Run Road,,Lexington,Kentucky,40502,Fayette County,"1055 DOVE RUN ROAD, LEXINGTON, KY 40502",37.996508,-84.494106,Street Address,KY,SUCCESSFUL ADDRESS,POINT (-84.49411 37.99651),40502,40502,Fayette County,1


In [25]:
# capitalize only first letter
omop_county_zcta_match['address_1'] = omop_county_zcta_match.address_1.apply(lambda x: str(x).strip().title())
omop_county_zcta_match['address_2'] = omop_county_zcta_match.address_2.apply(lambda x: str(x).strip().title() if not np.NaN else x)
omop_county_zcta_match['city'] = omop_county_zcta_match.city.apply(lambda x: str(x).strip().title())
# omop_county_zcta_match['location_source_value'] = omop_county_zcta_match.location_source_value.apply(lambda x: str(x).strip().title())
omop_county_zcta_match

Unnamed: 0,Location_id,address_1,address_2,city,state,zip_left,county,location_source_value,latitude,longitude,address_type,state_abbr,flag,geometry,ZCTA5CE20,GEOID20,NAMELSAD_COUNTY_20,county_match
0,1,523 E Broadway,,South Boston,Massachusetts,02127,Suffolk County,"523 E BROADWAY, SOUTH BOSTON, MA 02127",42.335472,-71.043522,Street Address,MA,SUCCESSFUL ADDRESS,POINT (-71.04352 42.33547),02127,02127,Suffolk County,1
2,3,569 Broadway,,Newark,New Jersey,07104,Essex County,"569 BROADWAY, NEWARK, NJ 07104",40.769935,-74.162821,Street Address,NJ,SUCCESSFUL ADDRESS,POINT (-74.16282 40.76993),07104,07104,Essex County,1
3,4,3210 Southwestern Blvd,,Orchard Park,New York,14127,Erie County,"3210 SOUTHWESTERN BLVD, ORCHARD PARK, NY 14127",42.798535,-78.747816,Street Address,NY,SUCCESSFUL ADDRESS,POINT (-78.74782 42.79853),14127,14127,Erie County,1
4,5,431 Campground Rd,,Livermore Fls,Maine,04254,Androscoggin County,"431 CAMPGROUND RD, LIVERMORE FLS, ME 04254",44.428610,-70.113777,Street Address,ME,SUCCESSFUL ADDRESS,POINT (-70.11378 44.42861),04254,04254,Androscoggin County,1
5,6,105 Harris Ave,,Portland,Maine,04103,Cumberland County,"105 HARRIS AVE, PORTLAND, ME 04103",43.699355,-70.303214,Street Address,ME,SUCCESSFUL ADDRESS,POINT (-70.30321 43.69936),04103,04103,Cumberland County,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78795,104098,1300 Peachtree Industrial Boulevard,,Suwanee,Georgia,30024,Gwinnett County,"1300 PEACHTREE INDUSTRIAL BOULEVARD, SUWANEE, ...",34.041727,-84.095174,Street Address,GA,SUCCESSFUL ADDRESS,POINT (-84.09517 34.04173),30024,30024,Gwinnett County,1
78796,104099,2660 Satellite Boulevard Northwest,,Duluth,Georgia,30096,Gwinnett County,"2660 SATELLITE BOULEVARD NORTHWEST, DULUTH, GA...",33.966797,-84.101318,Street Address,GA,SUCCESSFUL ADDRESS,POINT (-84.10132 33.96680),30096,30096,Gwinnett County,1
78797,104100,3685 Braselton Highway,,Dacula,Georgia,30019,Gwinnett County,"3685 BRASELTON HIGHWAY, DACULA, GA 30019",34.068832,-83.902215,Street Address,GA,SUCCESSFUL ADDRESS,POINT (-83.90222 34.06883),30019,30019,Gwinnett County,1
78798,104101,1055 Dove Run Road,,Lexington,Kentucky,40502,Fayette County,"1055 DOVE RUN ROAD, LEXINGTON, KY 40502",37.996508,-84.494106,Street Address,KY,SUCCESSFUL ADDRESS,POINT (-84.49411 37.99651),40502,40502,Fayette County,1


In [41]:
OMOP_county_zcta_path = os.path.join(abs_path, 'output', 'OMOP_county_zcta.csv')
omop_county_zcta.to_csv(OMOP_county_zcta_path, index=False)

In [26]:
OMOP_county_zcta_match_path = os.path.join(abs_path, 'output', 'OMOP_county_zcta_match.csv')
omop_county_zcta_match.to_csv(OMOP_county_zcta_match_path, index=False)

## Merge on ZCTA Crosswalk

In [27]:
# merge on zip using ZCTA crosswalk
omop_county_zcta_zip = omop_county_gdf.merge(zcta_county_name, how='left', on='zip')
omop_county_zcta_zip["county"] =  omop_county_zcta_zip.county.apply(lambda x: str(x) + " County")
omop_county_zcta_zip["county_match"] = omop_county_zcta_zip.apply(lambda x: 1 if x.county == x.NAMELSAD_COUNTY_20 else 0, axis=1)

# omop_county_zcta_zip = omop_county_zcta_zip.loc[omop_county_zcta_zip.county_match == 1]
omop_county_zcta_zip

Unnamed: 0,Location_id,address_1,address_2,city,state,zip,county,location_source_value,latitude,longitude,...,GEOIDFQ20,INTPTLAT20,INTPTLON20,geometry_y,GEOID_ZCTA5_20,NAMELSAD_ZCTA5_20,OID_COUNTY_20,GEOID_COUNTY_20,NAMELSAD_COUNTY_20,county_match
0,1,523 E Broadway,,South Boston,Massachusetts,02127,Suffolk County,"523 E BROADWAY, SOUTH BOSTON, MA 02127",42.335472,-71.043522,...,860Z200US02127,+42.3353007,-071.0382163,"POLYGON ((-71.06355 42.33079, -71.06291 42.331...",2127.0,ZCTA5 02127,2.759021e+13,25025.0,Suffolk County,1
1,2,454 Essex St,,Lawrence,Massachusetts,01840,Essex County,"454 ESSEX ST, LAWRENCE, MA 01840",42.706213,-71.164940,...,860Z200US01840,+42.7067633,-071.1604026,"POLYGON ((-71.17248 42.70787, -71.17261 42.707...",,,,,,0
2,3,569 Broadway,,Newark,New Jersey,07104,Essex County,"569 BROADWAY, NEWARK, NJ 07104",40.769935,-74.162821,...,860Z200US07104,+40.7677132,-074.1683498,"POLYGON ((-74.18521 40.75686, -74.18516 40.757...",7104.0,ZCTA5 07104,2.759043e+13,34013.0,Essex County,1
3,3,569 Broadway,,Newark,New Jersey,07104,Essex County,"569 BROADWAY, NEWARK, NJ 07104",40.769935,-74.162821,...,860Z200US07104,+40.7677132,-074.1683498,"POLYGON ((-74.18521 40.75686, -74.18516 40.757...",71040.0,ZCTA5 71040,2.759069e+12,22027.0,Claiborne Parish,0
4,4,3210 Southwestern Blvd,,Orchard Park,New York,14127,Erie County,"3210 SOUTHWESTERN BLVD, ORCHARD PARK, NY 14127",42.798535,-78.747816,...,860Z200US14127,+42.7528037,-078.7396939,"POLYGON ((-78.80954 42.77168, -78.80828 42.772...",14127.0,ZCTA5 14127,2.759012e+13,36029.0,Erie County,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98932,104098,1300 Peachtree Industrial Boulevard,,Suwanee,Georgia,30024,Gwinnett County,"1300 PEACHTREE INDUSTRIAL BOULEVARD, SUWANEE, ...",34.041727,-84.095174,...,,,,,,,,,,0
98933,104099,2660 Satellite Boulevard Northwest,,Duluth,Georgia,30096,Gwinnett County,"2660 SATELLITE BOULEVARD NORTHWEST, DULUTH, GA...",33.966797,-84.101318,...,,,,,,,,,,0
98934,104100,3685 Braselton Highway,,Dacula,Georgia,30019,Gwinnett County,"3685 BRASELTON HIGHWAY, DACULA, GA 30019",34.068832,-83.902215,...,,,,,,,,,,0
98935,104101,1055 Dove Run Road,,Lexington,Kentucky,40502,Fayette County,"1055 DOVE RUN ROAD, LEXINGTON, KY 40502",37.996508,-84.494106,...,,,,,,,,,,0


In [28]:
# omop_county_zcta_zip_match = omop_county_zcta_zip.loc[omop_county_zcta_zip.county_match == 1]
omop_county_zcta_zip = omop_county_zcta_zip.drop(columns = ["GEOIDFQ20", "INTPTLAT20", "INTPTLON20", "NAMELSAD_ZCTA5_20", "OID_COUNTY_20", 
                                                            "GEOID_ZCTA5_20", "GEOID_COUNTY_20", "geometry_y", "GEOID20"])
omop_county_zcta_zip = omop_county_zcta_zip.rename({"geometry_x": "geometry"})
omop_county_zcta_zip

Unnamed: 0,Location_id,address_1,address_2,city,state,zip,county,location_source_value,latitude,longitude,address_type,state_abbr,flag,geometry_x,ZCTA5CE20,NAMELSAD_COUNTY_20,county_match
0,1,523 E Broadway,,South Boston,Massachusetts,02127,Suffolk County,"523 E BROADWAY, SOUTH BOSTON, MA 02127",42.335472,-71.043522,Street Address,MA,SUCCESSFUL ADDRESS,POINT (-71.04352 42.33547),02127,Suffolk County,1
1,2,454 Essex St,,Lawrence,Massachusetts,01840,Essex County,"454 ESSEX ST, LAWRENCE, MA 01840",42.706213,-71.164940,Street Address,MA,SUCCESSFUL ADDRESS,POINT (-71.16494 42.70621),01840,,0
2,3,569 Broadway,,Newark,New Jersey,07104,Essex County,"569 BROADWAY, NEWARK, NJ 07104",40.769935,-74.162821,Street Address,NJ,SUCCESSFUL ADDRESS,POINT (-74.16282 40.76993),07104,Essex County,1
3,3,569 Broadway,,Newark,New Jersey,07104,Essex County,"569 BROADWAY, NEWARK, NJ 07104",40.769935,-74.162821,Street Address,NJ,SUCCESSFUL ADDRESS,POINT (-74.16282 40.76993),07104,Claiborne Parish,0
4,4,3210 Southwestern Blvd,,Orchard Park,New York,14127,Erie County,"3210 SOUTHWESTERN BLVD, ORCHARD PARK, NY 14127",42.798535,-78.747816,Street Address,NY,SUCCESSFUL ADDRESS,POINT (-78.74782 42.79853),14127,Erie County,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98932,104098,1300 Peachtree Industrial Boulevard,,Suwanee,Georgia,30024,Gwinnett County,"1300 PEACHTREE INDUSTRIAL BOULEVARD, SUWANEE, ...",34.041727,-84.095174,Street Address,GA,SUCCESSFUL ADDRESS,POINT (-84.09517 34.04173),,,0
98933,104099,2660 Satellite Boulevard Northwest,,Duluth,Georgia,30096,Gwinnett County,"2660 SATELLITE BOULEVARD NORTHWEST, DULUTH, GA...",33.966797,-84.101318,Street Address,GA,SUCCESSFUL ADDRESS,POINT (-84.10132 33.96680),,,0
98934,104100,3685 Braselton Highway,,Dacula,Georgia,30019,Gwinnett County,"3685 BRASELTON HIGHWAY, DACULA, GA 30019",34.068832,-83.902215,Street Address,GA,SUCCESSFUL ADDRESS,POINT (-83.90222 34.06883),,,0
98935,104101,1055 Dove Run Road,,Lexington,Kentucky,40502,Fayette County,"1055 DOVE RUN ROAD, LEXINGTON, KY 40502",37.996508,-84.494106,Street Address,KY,SUCCESSFUL ADDRESS,POINT (-84.49411 37.99651),,,0


In [29]:
omop_county_zcta_zip_path = os.path.join(abs_path, 'output', 'omop_county_zcta_zip.csv')
omop_county_zcta_zip.to_csv(omop_county_zcta_zip_path, index=False)

In [30]:
omop_county_zcta_zip_match = omop_county_zcta_zip.loc[omop_county_zcta_zip.county_match == 1]
# omop_county_zcta_zip_match = omop_county_zcta_zip_match.drop(columns = ["GEOIDFQ20", "INTPTLAT20", "INTPTLON20", "NAMELSAD_ZCTA5_20", "OID_COUNTY_20", 
#                                                                         "Unnamed: 0", "GEOID_ZCTA5_20", "GEOID_COUNTY_20", "geometry_y", "GEOID20"])
omop_county_zcta_zip_match = omop_county_zcta_zip_match.rename(columns={"geometry_x": "geometry"})
omop_county_zcta_zip_match

Unnamed: 0,Location_id,address_1,address_2,city,state,zip,county,location_source_value,latitude,longitude,address_type,state_abbr,flag,geometry,ZCTA5CE20,NAMELSAD_COUNTY_20,county_match
0,1,523 E Broadway,,South Boston,Massachusetts,02127,Suffolk County,"523 E BROADWAY, SOUTH BOSTON, MA 02127",42.335472,-71.043522,Street Address,MA,SUCCESSFUL ADDRESS,POINT (-71.04352 42.33547),02127,Suffolk County,1
2,3,569 Broadway,,Newark,New Jersey,07104,Essex County,"569 BROADWAY, NEWARK, NJ 07104",40.769935,-74.162821,Street Address,NJ,SUCCESSFUL ADDRESS,POINT (-74.16282 40.76993),07104,Essex County,1
4,4,3210 Southwestern Blvd,,Orchard Park,New York,14127,Erie County,"3210 SOUTHWESTERN BLVD, ORCHARD PARK, NY 14127",42.798535,-78.747816,Street Address,NY,SUCCESSFUL ADDRESS,POINT (-78.74782 42.79853),14127,Erie County,1
5,5,431 Campground Rd,,Livermore Fls,Maine,04254,Androscoggin County,"431 CAMPGROUND RD, LIVERMORE FLS, ME 04254",44.428610,-70.113777,Street Address,ME,SUCCESSFUL ADDRESS,POINT (-70.11378 44.42861),04254,Androscoggin County,1
6,6,105 Harris Ave,,Portland,Maine,04103,Cumberland County,"105 HARRIS AVE, PORTLAND, ME 04103",43.699355,-70.303214,Street Address,ME,SUCCESSFUL ADDRESS,POINT (-70.30321 43.69936),04103,Cumberland County,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85668,85594,5000 N Bowes Rd,,Tucson,Arizona,85749,Pima County,"5000 N BOWES RD, TUCSON, AZ 85749",32.298797,-110.788349,Street Address,AZ,SUCCESSFUL ADDRESS,POINT (-110.78835 32.29880),85749,Pima County,1
85669,85596,1303 Paseo Del Canon East,,Taos,New Mexico,87571,Taos County,"1303 PASEO DEL CANON EAST, TAOS, NM 87571",36.375292,-105.568573,Street Address,NM,SUCCESSFUL ADDRESS,POINT (-105.56857 36.37529),87571,Taos County,1
85670,85599,12606 E Main St,,Mayer,Arizona,86333,Yavapai County,"12606 E MAIN ST, MAYER, AZ 86333",34.346322,-112.157189,Street Address,AZ,SUCCESSFUL ADDRESS,POINT (-112.15719 34.34632),86333,Yavapai County,1
85671,85600,4300 Blake Rd Sw,,Albuquerque,New Mexico,87121,Bernalillo County,"4300 BLAKE RD SW, ALBUQUERQUE, NM 87121",35.037163,-106.723264,Street Address,NM,SUCCESSFUL ADDRESS,POINT (-106.72326 35.03716),87121,Bernalillo County,1


In [31]:
OMOP_county_zcta_zip_match_path = os.path.join(abs_path, 'output', 'OMOP_county_zcta_zip_match.csv')
omop_county_zcta_zip_match.reset_index().to_csv(OMOP_county_zcta_zip_match_path, index=False)

### Merge for "Failed" Addresses

In [12]:
# add point geometry column that matches expected gpd format
omop_county_failed['geometry'] = omop_county_failed.apply(lambda x: "POINT (" + str(x.longitude) + " " + str(x.latitude) + ")", axis=1)

# convert to geopandas DataFrame and set `geometry` centroid 
omop_county_failed_gdf = gpd.GeoDataFrame(omop_county_failed, geometry=omop_county_failed['geometry'].apply(shapely.wkt.loads))
omop_county_failed_gdf.crs = "EPSG:4326"
omop_county_failed_gdf

Unnamed: 0,Location_id,address_1,address_2,city,state,zip,county,location_source_value,latitude,longitude,address_type,state_abbr,flag,geometry
0,156,Po Box 102,,Deerfield St,New Jersey,08313,Cumberland,"PO BOX 102, DEERFIELD ST, NJ 08313",39.523911,-75.236086,PO Box,NJ,FAILED DUE TO PO BOX ADDRESS,POINT (-75.23609 39.52391)
1,157,Po Box 6149,,China Village,Maine,04926,Kennebec,"PO BOX 6149, CHINA VILLAGE, ME 04926",44.481721,-69.516751,PO Box,ME,FAILED DUE TO PO BOX ADDRESS,POINT (-69.51675 44.48172)
2,158,Po Box 555,,Northeast Hbr,Maine,04662,Hancock,"PO BOX 555, NORTHEAST HBR, ME 04662",44.294140,-68.290211,PO Box,ME,FAILED DUE TO PO BOX ADDRESS,POINT (-68.29021 44.29414)
3,159,Po Box 514,,Bangor,Maine,04402,Penobscot,"PO BOX 514, BANGOR, ME 04402",44.801671,-68.772141,PO Box,ME,FAILED DUE TO PO BOX ADDRESS,POINT (-68.77214 44.80167)
4,160,Po Box 826,,Presque Isle,Maine,04769,Aroostook,"PO BOX 826, PRESQUE ISLE, ME 04769",46.681235,-68.010188,PO Box,ME,FAILED DUE TO PO BOX ADDRESS,POINT (-68.01019 46.68124)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15708,104057,141 Main Street,,South Bound Brook,New Jersey,00888,Somerset,"141 MAIN STREET, SOUTH BOUND BROOK, NJ 08880",40.552276,-74.523419,Street Address,NJ,FAILED DUE TO NON 5-DIGIT ZIPCODE,POINT (-74.52342 40.55228)
15709,104082,441 United States Highway 130,,Hightstown,New Jersey,00852,Mercer,"441 UNITED STATES HIGHWAY 130, HIGHTSTOWN, NJ ...",40.271074,-74.539260,Street Address,NJ,FAILED DUE TO NON 5-DIGIT ZIPCODE,POINT (-74.53926 40.27107)
15710,104083,16 Ethel Road,,Edison,New Jersey,08817,Middlesex,"16 ETHEL ROAD, EDISON, NJ 08817",40.540541,-74.398598,Street Address,NJ,FAILED DUE TO NON 5-DIGIT ZIPCODE,POINT (-74.39860 40.54054)
15711,104085,606 Dowd Avenue,,Elizabeth,New Jersey,07201,Union,"606 DOWD AVENUE, ELIZABETH, NJ 07201",40.666597,-74.188170,Street Address,NJ,FAILED DUE TO NON 5-DIGIT ZIPCODE,POINT (-74.18817 40.66660)


In [13]:
# merge on zip using ZCTA crosswalk
omop_county_failed_zcta_zip = omop_county_failed_gdf.merge(zcta_county_name, how='left', on='zip')
omop_county_failed_zcta_zip["county"] =  omop_county_failed_zcta_zip.county.apply(lambda x: str(x) + " County")
omop_county_failed_zcta_zip["county_match"] = omop_county_failed_zcta_zip.apply(lambda x: 1 if x.county == x.NAMELSAD_COUNTY_20 else 0, axis=1)
omop_county_failed_zcta_zip

Unnamed: 0,Location_id,address_1,address_2,city,state,zip,county,location_source_value,latitude,longitude,...,GEOIDFQ20,INTPTLAT20,INTPTLON20,geometry_y,GEOID_ZCTA5_20,NAMELSAD_ZCTA5_20,OID_COUNTY_20,GEOID_COUNTY_20,NAMELSAD_COUNTY_20,county_match
0,156,Po Box 102,,Deerfield St,New Jersey,08313,Cumberland County,"PO BOX 102, DEERFIELD ST, NJ 08313",39.523911,-75.236086,...,,,,,,,,,,0
1,157,Po Box 6149,,China Village,Maine,04926,Kennebec County,"PO BOX 6149, CHINA VILLAGE, ME 04926",44.481721,-69.516751,...,860Z200US04926,+44.4803190,-069.5152083,"POLYGON ((-69.51823 44.47833, -69.51809 44.478...",4926.0,ZCTA5 04926,2.759040e+13,23011.0,Kennebec County,1
2,158,Po Box 555,,Northeast Hbr,Maine,04662,Hancock County,"PO BOX 555, NORTHEAST HBR, ME 04662",44.294140,-68.290211,...,860Z200US04662,+44.3069258,-068.2872356,"POLYGON ((-68.30695 44.29795, -68.30711 44.299...",4662.0,ZCTA5 04662,2.759010e+13,23009.0,Hancock County,1
3,159,Po Box 514,,Bangor,Maine,04402,Penobscot County,"PO BOX 514, BANGOR, ME 04402",44.801671,-68.772141,...,,,,,,,,,,0
4,160,Po Box 826,,Presque Isle,Maine,04769,Aroostook County,"PO BOX 826, PRESQUE ISLE, ME 04769",46.681235,-68.010188,...,860Z200US04769,+46.6889522,-067.9919140,"POLYGON ((-68.05653 46.77568, -68.05374 46.775...",4769.0,ZCTA5 04769,2.759012e+13,23003.0,Aroostook County,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21493,104083,16 Ethel Road,,Edison,New Jersey,08817,Middlesex County,"16 ETHEL ROAD, EDISON, NJ 08817",40.540541,-74.398598,...,860Z200US08817,+40.5146550,-074.3931035,"POLYGON ((-74.43292 40.51060, -74.43277 40.510...",8817.0,ZCTA5 08817,2.759042e+13,34023.0,Middlesex County,1
21494,104085,606 Dowd Avenue,,Elizabeth,New Jersey,07201,Union County,"606 DOWD AVENUE, ELIZABETH, NJ 07201",40.666597,-74.188170,...,860Z200US07201,+40.6712716,-074.1773578,"POLYGON ((-74.21726 40.66644, -74.21614 40.666...",7201.0,ZCTA5 07201,2.759031e+13,34039.0,Union County,1
21495,104085,606 Dowd Avenue,,Elizabeth,New Jersey,07201,Union County,"606 DOWD AVENUE, ELIZABETH, NJ 07201",40.666597,-74.188170,...,860Z200US07201,+40.6712716,-074.1773578,"POLYGON ((-74.21726 40.66644, -74.21614 40.666...",72010.0,ZCTA5 72010,2.759041e+13,5145.0,White County,0
21496,104086,855 South United States Highway,17-92,Longwood,Florida,03275,Seminole County,"855 SOUTH UNITED STATES HIGHWAY 17-92, LONGWOO...",28.693001,-81.327137,...,860Z200US03275,+43.1700194,-071.4207075,"POLYGON ((-71.50286 43.17673, -71.50274 43.176...",3275.0,ZCTA5 03275,2.759030e+13,33013.0,Merrimack County,0


In [14]:
# omop_county_zcta_zip_match = omop_county_zcta_zip.loc[omop_county_zcta_zip.county_match == 1]
omop_county_failed_zcta_zip = omop_county_failed_zcta_zip.drop(columns = ["GEOIDFQ20", "INTPTLAT20", "INTPTLON20", "NAMELSAD_ZCTA5_20", "OID_COUNTY_20", 
                                                            "GEOID_ZCTA5_20", "GEOID_COUNTY_20", "geometry_y", "GEOID20"])
omop_county_failed_zcta_zip = omop_county_failed_zcta_zip.rename({"geometry_x": "geometry"})

omop_county_failed_zcta_zip_match = omop_county_failed_zcta_zip.loc[omop_county_failed_zcta_zip.county_match == 1]
# omop_county_zcta_zip_match = omop_county_zcta_zip_match.drop(columns = ["GEOIDFQ20", "INTPTLAT20", "INTPTLON20", "NAMELSAD_ZCTA5_20", "OID_COUNTY_20", 
#                                                                         "Unnamed: 0", "GEOID_ZCTA5_20", "GEOID_COUNTY_20", "geometry_y", "GEOID20"])
omop_county_failed_zcta_zip_match = omop_county_failed_zcta_zip_match.rename(columns={"geometry_x": "geometry"})
omop_county_failed_zcta_zip_match

Unnamed: 0,Location_id,address_1,address_2,city,state,zip,county,location_source_value,latitude,longitude,address_type,state_abbr,flag,geometry,ZCTA5CE20,NAMELSAD_COUNTY_20,county_match
1,157,Po Box 6149,,China Village,Maine,04926,Kennebec County,"PO BOX 6149, CHINA VILLAGE, ME 04926",44.481721,-69.516751,PO Box,ME,FAILED DUE TO PO BOX ADDRESS,POINT (-69.51675 44.48172),04926,Kennebec County,1
2,158,Po Box 555,,Northeast Hbr,Maine,04662,Hancock County,"PO BOX 555, NORTHEAST HBR, ME 04662",44.294140,-68.290211,PO Box,ME,FAILED DUE TO PO BOX ADDRESS,POINT (-68.29021 44.29414),04662,Hancock County,1
4,160,Po Box 826,,Presque Isle,Maine,04769,Aroostook County,"PO BOX 826, PRESQUE ISLE, ME 04769",46.681235,-68.010188,PO Box,ME,FAILED DUE TO PO BOX ADDRESS,POINT (-68.01019 46.68124),04769,Aroostook County,1
5,161,Po Box 435,,North Haven,Maine,04853,Knox County,"PO BOX 435, NORTH HAVEN, ME 04853",44.132285,-68.873329,PO Box,ME,FAILED DUE TO PO BOX ADDRESS,POINT (-68.87333 44.13229),04853,Knox County,1
6,162,Po Box 391,,Kennebunkport,Maine,04046,York County,"PO BOX 391, KENNEBUNKPORT, ME 04046",43.404581,-70.411332,PO Box,ME,FAILED DUE TO PO BOX ADDRESS,POINT (-70.41133 43.40458),04046,York County,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21487,104053,90 United States Highway 22 West,,Springfield,New Jersey,07081,Union County,"90 UNITED STATES HIGHWAY 22 WEST, SPRINGFIELD,...",40.687946,-74.315669,Street Address,NJ,FAILED DUE TO NON 5-DIGIT ZIPCODE,POINT (-74.31567 40.68795),07081,Union County,1
21490,104054,891 Tabor Road,,Morris Plains,New Jersey,00795,Morris County,"891 TABOR ROAD, MORRIS PLAINS, NJ 07950",40.860893,-74.469700,Street Address,NJ,FAILED DUE TO NON 5-DIGIT ZIPCODE,POINT (-74.46970 40.86089),00795,Morris County,1
21493,104083,16 Ethel Road,,Edison,New Jersey,08817,Middlesex County,"16 ETHEL ROAD, EDISON, NJ 08817",40.540541,-74.398598,Street Address,NJ,FAILED DUE TO NON 5-DIGIT ZIPCODE,POINT (-74.39860 40.54054),08817,Middlesex County,1
21494,104085,606 Dowd Avenue,,Elizabeth,New Jersey,07201,Union County,"606 DOWD AVENUE, ELIZABETH, NJ 07201",40.666597,-74.188170,Street Address,NJ,FAILED DUE TO NON 5-DIGIT ZIPCODE,POINT (-74.18817 40.66660),07201,Union County,1


In [15]:
OMOP_failed_county_zcta_zip_match_path = os.path.join(abs_path, 'output', 'OMOP_failed_county_zcta_zip_match.csv')
omop_county_failed_zcta_zip_match.reset_index().to_csv(OMOP_failed_county_zcta_zip_match_path, index=False)

# Nominatim Parsing
1. Parse base Nominatim components with `usaddress`
2. Keep only the components needed for Nominatim
3. Merge the Nominatim components to the Publication 28 (OMOP) components
4. Convert all abbreviations for cardinal directions and street type to full strings
5. Join Nominatim components to a full address string

In [16]:
from requests.structures import CaseInsensitiveDict

# for converting cardinal direction abbreviations to full string
cardinal_directions_to_full = CaseInsensitiveDict({"N": "North", "E": "East", "S": "South", "W": "West", 
                                                  "NE": "Northeast", "SE": "Southeast", "NW": "Northwest", "SW": "Southwest"})

# for converting street type abbreviations to full string
street_suffix_to_full = CaseInsensitiveDict({
    'aly': 'Alley',
    'ave': 'Avenue',
    'blvd': 'Boulevard',
    'cir': 'Circle',
    'ct': 'Court',
    'dr': 'Drive',
    'expy': 'Expressway', 
    'grv': 'Grove', 
    'grve': 'Grove',
    'hwy': 'Highway',
    'ln': 'Lane',
    'pkwy': 'Parkway',
    'pl': 'Place', 
    'plz': 'Place',
    'rd': 'Road', 
    'spgs': 'Springs',
    'sq': 'Square',
    'st': 'Street',
    'ter': 'Terrace',
    'trl': 'Trail', 
    'vly': 'Valley',
    'way': 'Way'
})

In [17]:
# function for case insensitive dictionary replacement using Regex
def replace_words(text, word_dict):
    for key, value in word_dict.items():
        # Use case-insensitive regex for replacement
        text = re.sub(r'\b'+key+'\b', value, text, flags=re.IGNORECASE, count=1)
    return text

def multipleReplace(text, wordDict):
    for key in wordDict:
        text = text.replace(key, wordDict[key], 1)
    return text

def cardinal_direction_lambda(x, word_dict):
    """
    Pass cardinal_directions_to_full
    """
    if str(x) == "nan":
        return np.NaN
    elif x in word_dict.keys():
        return word_dict.get(str(x))
    else:
        return x

def street_suffix_lambda(x, word_dict):
    """
    Pass street_suffix_to_full
    """
    if str(x) == "nan" or str(x) == "Nan":
        return np.NaN
    # elif len(str(x)) > 2:
    #     return str(x)
    elif x in word_dict.keys():
        return word_dict.get(str(x))
    else:
        return x

# def replace_ordinal_numbers(text):
#     """
#     Replace ordinal numbers with full string
#     """
#     re_results = re.findall('(\d+(st|nd|rd|th))', text)
#     for enitre_result, suffix in re_results:
#         num = int(enitre_result[:-len(suffix)])
#         text = text.replace(enitre_result, num2words(num, ordinal=True))
#     return text

def word_to_ordinal(x):
    n = parse_ordinal(x)
    return n

def make_ordinal(x):
    '''
    Convert an integer into its ordinal representation::

        make_ordinal(0)   => '0th'
        make_ordinal(3)   => '3rd'
        make_ordinal(122) => '122nd'
        make_ordinal(213) => '213th'
    '''
    n = word_to_ordinal(x)
    if 11 <= (n % 100) <= 13:
        suffix = 'th'
    else:
        suffix = ['th', 'st', 'nd', 'rd', 'th'][min(n % 10, 4)]
    return str(n) + suffix

In [34]:
# import omop county file that has both ZCTA crosswalk merge and TIGER/Line spatial join match by county name
OMOP_county_zcta_zip_match_path = os.path.join(abs_path, 'output', 'OMOP_county_zcta_zip_match.csv')
omop_county_zcta_zip_match = pd.read_csv(OMOP_county_zcta_zip_match_path)
omop_county_zcta_zip_match = omop_county_zcta_zip_match.rename(columns={"geometry_x": "geometry", "Location_id": "location_id"})
omop_county_zcta_zip_match['location_id'] = omop_county_zcta_zip_match.index+1

In [35]:
omop_county_zcta_zip_match

Unnamed: 0,index,location_id,address_1,address_2,city,state,zip,county,location_source_value,latitude,longitude,address_type,state_abbr,flag,geometry,ZCTA5CE20,NAMELSAD_COUNTY_20,county_match
0,0,1,523 E Broadway,,South Boston,Massachusetts,2127,Suffolk County,"523 E BROADWAY, SOUTH BOSTON, MA 02127",42.335472,-71.043522,Street Address,MA,SUCCESSFUL ADDRESS,POINT (-71.04352199999346 42.33547200002667),2127,Suffolk County,1
1,2,2,569 Broadway,,Newark,New Jersey,7104,Essex County,"569 BROADWAY, NEWARK, NJ 07104",40.769935,-74.162821,Street Address,NJ,SUCCESSFUL ADDRESS,POINT (-74.16282099988595 40.76993499987205),7104,Essex County,1
2,4,3,3210 Southwestern Blvd,,Orchard Park,New York,14127,Erie County,"3210 SOUTHWESTERN BLVD, ORCHARD PARK, NY 14127",42.798535,-78.747816,Street Address,NY,SUCCESSFUL ADDRESS,POINT (-78.74781599997843 42.79853499984358),14127,Erie County,1
3,5,4,431 Campground Rd,,Livermore Fls,Maine,4254,Androscoggin County,"431 CAMPGROUND RD, LIVERMORE FLS, ME 04254",44.428610,-70.113777,Street Address,ME,SUCCESSFUL ADDRESS,POINT (-70.11377699970232 44.42860999977688),4254,Androscoggin County,1
4,6,5,105 Harris Ave,,Portland,Maine,4103,Cumberland County,"105 HARRIS AVE, PORTLAND, ME 04103",43.699355,-70.303214,Street Address,ME,SUCCESSFUL ADDRESS,POINT (-70.3032139999053 43.69935500023178),4103,Cumberland County,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44001,85668,44002,5000 N Bowes Rd,,Tucson,Arizona,85749,Pima County,"5000 N BOWES RD, TUCSON, AZ 85749",32.298797,-110.788349,Street Address,AZ,SUCCESSFUL ADDRESS,POINT (-110.7883488373378 32.29879689494836),85749,Pima County,1
44002,85669,44003,1303 Paseo Del Canon East,,Taos,New Mexico,87571,Taos County,"1303 PASEO DEL CANON EAST, TAOS, NM 87571",36.375292,-105.568573,Street Address,NM,SUCCESSFUL ADDRESS,POINT (-105.568573404848 36.37529177977968),87571,Taos County,1
44003,85670,44004,12606 E Main St,,Mayer,Arizona,86333,Yavapai County,"12606 E MAIN ST, MAYER, AZ 86333",34.346322,-112.157189,Street Address,AZ,SUCCESSFUL ADDRESS,POINT (-112.1571889686942 34.34632218671697),86333,Yavapai County,1
44004,85671,44005,4300 Blake Rd Sw,,Albuquerque,New Mexico,87121,Bernalillo County,"4300 BLAKE RD SW, ALBUQUERQUE, NM 87121",35.037163,-106.723264,Street Address,NM,SUCCESSFUL ADDRESS,POINT (-106.7232635088362 35.03716265606487),87121,Bernalillo County,1


In [36]:
%%time

# run `usaddress` parsing with base component names and append to list
repo = []
for ind, each in omop_county_zcta_zip_match.iterrows():
    try:
        obj = usaddress.tag(each.location_source_value)

        # create DataFrame of the `usaddress` OrderedDict with the components as the column names
        tmp = pd.DataFrame(obj[0], columns=obj[0].keys(), index=[ind])
        tmp['Address_type'] = obj[1]

        # assign the county name from omop_county_zcta_zip_match
        tmp['county'] = each.county
        
        # assign the location_source_value from omop_county_zcta_zip_match
        tmp['location_source_value'] = each.location_source_value

        # assign location_id from omop_county_zcta_zip_match
        tmp['location_id'] = each.location_id

        # development
        # OMOP_location.loc[ind, 'AddressNumber'] = [tmp['AddressNumber'].values[0] if len(tmp['AddressNumber'].values[0]) > 0 else np.NaN][0]
        # OMOP_location.loc[ind, 'StreetNamePreDirectional'] = [tmp['StreetNamePreDirectional'].values[0] if len(tmp['AddressNumber'].values[0]) > 0 else np.NaN][0]
        # OMOP_location.loc[ind, 'StreetNamePreType'] = [tmp['StreetNamePreType'].values[0] if len(tmp['AddressNumber'].values[0]) > 0 else np.NaN][0]
        # OMOP_location.loc[ind, 'StreetName'] = [tmp['StreetName'].values[0] if len(tmp['AddressNumber'].values[0]) > 0 else np.NaN][0]
        # OMOP_location.loc[ind, 'StreetNamePostType'] = [tmp['StreetNamePostType'].values[0] if len(tmp['AddressNumber'].values[0]) > 0 else np.NaN][0]
        # OMOP_location.loc[ind, 'StreetNamePostDirectional'] = [tmp['StreetNamePostDirectional'].values[0] if len(tmp['AddressNumber'].values[0]) > 0 else np.NaN][0]
        # OMOP_location.loc[ind, 'PlaceName'] = [tmp['PlaceName'].values[0] if len(tmp['AddressNumber'].values[0]) > 0 else np.NaN][0]
        # OMOP_location.loc[ind, 'StateName'] = [tmp['StateName'].values[0] if len(tmp['AddressNumber'].values[0]) > 0 else np.NaN][0]
        # OMOP_location.loc[ind, 'ZipCode'] = [tmp['ZipCode'].values[0] if len(tmp['AddressNumber'].values[0]) > 0 else np.NaN][0]

        # OMOP_location.loc[ind, 'AddressNumber'] = obj[0]['AddressNumber']
        # OMOP_location.loc[ind, 'StreetNamePreDirectional'] = obj[0]['StreetNamePreDirectional']
        # OMOP_location.loc[ind, 'StreetNamePreType'] = obj[0]['StreetNamePreType']
        # OMOP_location.loc[ind, 'StreetName'] = obj[0]['StreetName']
        # OMOP_location.loc[ind, 'StreetNamePostType'] = obj[0]['StreetNamePostType']
        # OMOP_location.loc[ind, 'StreetNamePostDirectional'] = obj[0]['StreetNamePostDirectional']
        # OMOP_location.loc[ind, 'PlaceName']=obj[0]['PlaceName']
        # OMOP_location.loc[ind, 'StateName']=obj[0]['StateName']
        # OMOP_location.loc[ind, 'ZipCode']=obj[0]['ZipCode']

        repo.append(tmp)

        # StreetNamePreDirectional = tmp['StreetNamePreDirectional'].values[0]
        # if not StreetNamePreDirectional:
        #     OMOP_location.loc[ind, 'StreetNamePreDirectional'] = np.NaN
        # else:
        #     OMOP_location.loc[ind, 'StreetNamePreDirectional'] = StreetNamePreDirectional

    except:
        pass

CPU times: total: 25.4 s
Wall time: 1min 12s


In [37]:
%%time

# concatenate above list to a single dataframe and only keep address components for Nominatim
nominatim_components = pd.concat(repo)
nominatim_components[['location_id','AddressNumber', 'StreetNamePreDirectional', 'StreetNamePreType', 'StreetName', 'StreetNamePostType','StreetNamePostDirectional',
                      'PlaceName', 'county', 'StateName', 'ZipCode', 'location_source_value']]

CPU times: total: 13.3 s
Wall time: 36.4 s


Unnamed: 0,location_id,AddressNumber,StreetNamePreDirectional,StreetNamePreType,StreetName,StreetNamePostType,StreetNamePostDirectional,PlaceName,county,StateName,ZipCode,location_source_value
0,1,523,E,,BROADWAY,,,SOUTH BOSTON,Suffolk County,MA,02127,"523 E BROADWAY, SOUTH BOSTON, MA 02127"
1,2,569,,,BROADWAY,,,NEWARK,Essex County,NJ,07104,"569 BROADWAY, NEWARK, NJ 07104"
2,3,3210,,,SOUTHWESTERN,BLVD,,ORCHARD PARK,Erie County,NY,14127,"3210 SOUTHWESTERN BLVD, ORCHARD PARK, NY 14127"
3,4,431,,,CAMPGROUND,RD,,LIVERMORE FLS,Androscoggin County,ME,04254,"431 CAMPGROUND RD, LIVERMORE FLS, ME 04254"
4,5,105,,,HARRIS,AVE,,PORTLAND,Cumberland County,ME,04103,"105 HARRIS AVE, PORTLAND, ME 04103"
...,...,...,...,...,...,...,...,...,...,...,...,...
44001,44002,5000,N,,BOWES,RD,,TUCSON,Pima County,AZ,85749,"5000 N BOWES RD, TUCSON, AZ 85749"
44002,44003,1303,,,PASEO DEL CANON,,EAST,TAOS,Taos County,NM,87571,"1303 PASEO DEL CANON EAST, TAOS, NM 87571"
44003,44004,12606,E,,MAIN,ST,,MAYER,Yavapai County,AZ,86333,"12606 E MAIN ST, MAYER, AZ 86333"
44004,44005,4300,,,BLAKE,RD,SW,ALBUQUERQUE,Bernalillo County,NM,87121,"4300 BLAKE RD SW, ALBUQUERQUE, NM 87121"


In [38]:
# keep only columns necessary to merge to omop_county_zcta_zip_match that has the USPS Publication 28 Standard components
nominatim_keep_columns = nominatim_components[['location_id','AddressNumber', 'StreetNamePreDirectional', 'StreetNamePreType', 'StreetName', 'StreetNamePostType','StreetNamePostDirectional','county',]]
nominatim_keep_columns

Unnamed: 0,location_id,AddressNumber,StreetNamePreDirectional,StreetNamePreType,StreetName,StreetNamePostType,StreetNamePostDirectional,county
0,1,523,E,,BROADWAY,,,Suffolk County
1,2,569,,,BROADWAY,,,Essex County
2,3,3210,,,SOUTHWESTERN,BLVD,,Erie County
3,4,431,,,CAMPGROUND,RD,,Androscoggin County
4,5,105,,,HARRIS,AVE,,Cumberland County
...,...,...,...,...,...,...,...,...
44001,44002,5000,N,,BOWES,RD,,Pima County
44002,44003,1303,,,PASEO DEL CANON,,EAST,Taos County
44003,44004,12606,E,,MAIN,ST,,Yavapai County
44004,44005,4300,,,BLAKE,RD,SW,Bernalillo County


In [39]:
# create empty dataframe with USPS Publication 28 Standard components and source address string and lat/lon
nominatim_location = pd.DataFrame(columns=['location_id', 'address_1', 'address_2', 'city', 'state', 'zip', 'state_abbr', 'location_source_value','latitude','longitude', 'geometry'])

# fill in address and new location_id from omop_county_zcta_zip_match 
nominatim_location['location_source_value'] = omop_county_zcta_zip_match.location_source_value
nominatim_location['location_id'] = nominatim_location.index+1
# nominatim_location['county'] = omop_county_zcta_zip_match.county

# assign the same columns from omop_county_zcta_zip_match
nominatim_location.address_1 = omop_county_zcta_zip_match.address_1
nominatim_location.address_2 = omop_county_zcta_zip_match.address_2
nominatim_location.city = omop_county_zcta_zip_match.city
nominatim_location.state = omop_county_zcta_zip_match.state
nominatim_location.zip = omop_county_zcta_zip_match.zip
nominatim_location.state_abbr = omop_county_zcta_zip_match.state_abbr
nominatim_location.latitude = omop_county_zcta_zip_match.latitude
nominatim_location.longitude = omop_county_zcta_zip_match.longitude
nominatim_location.geometry = omop_county_zcta_zip_match.geometry
nominatim_location

Unnamed: 0,location_id,address_1,address_2,city,state,zip,state_abbr,location_source_value,latitude,longitude,geometry
0,1,523 E Broadway,,South Boston,Massachusetts,2127,MA,"523 E BROADWAY, SOUTH BOSTON, MA 02127",42.335472,-71.043522,POINT (-71.04352199999346 42.33547200002667)
1,2,569 Broadway,,Newark,New Jersey,7104,NJ,"569 BROADWAY, NEWARK, NJ 07104",40.769935,-74.162821,POINT (-74.16282099988595 40.76993499987205)
2,3,3210 Southwestern Blvd,,Orchard Park,New York,14127,NY,"3210 SOUTHWESTERN BLVD, ORCHARD PARK, NY 14127",42.798535,-78.747816,POINT (-78.74781599997843 42.79853499984358)
3,4,431 Campground Rd,,Livermore Fls,Maine,4254,ME,"431 CAMPGROUND RD, LIVERMORE FLS, ME 04254",44.428610,-70.113777,POINT (-70.11377699970232 44.42860999977688)
4,5,105 Harris Ave,,Portland,Maine,4103,ME,"105 HARRIS AVE, PORTLAND, ME 04103",43.699355,-70.303214,POINT (-70.3032139999053 43.69935500023178)
...,...,...,...,...,...,...,...,...,...,...,...
44001,44002,5000 N Bowes Rd,,Tucson,Arizona,85749,AZ,"5000 N BOWES RD, TUCSON, AZ 85749",32.298797,-110.788349,POINT (-110.7883488373378 32.29879689494836)
44002,44003,1303 Paseo Del Canon East,,Taos,New Mexico,87571,NM,"1303 PASEO DEL CANON EAST, TAOS, NM 87571",36.375292,-105.568573,POINT (-105.568573404848 36.37529177977968)
44003,44004,12606 E Main St,,Mayer,Arizona,86333,AZ,"12606 E MAIN ST, MAYER, AZ 86333",34.346322,-112.157189,POINT (-112.1571889686942 34.34632218671697)
44004,44005,4300 Blake Rd Sw,,Albuquerque,New Mexico,87121,NM,"4300 BLAKE RD SW, ALBUQUERQUE, NM 87121",35.037163,-106.723264,POINT (-106.7232635088362 35.03716265606487)


In [40]:
# merge omop_county_zcta_zip_match addresses to the ones that parsed fully from nominatim_keep_columns_merge_county
nominatim_keep_columns_merge_county = nominatim_keep_columns.merge(nominatim_location, how='left', on='location_id')
print(nominatim_keep_columns_merge_county.dtypes)
nominatim_keep_columns_merge_county

location_id                    int64
AddressNumber                 object
StreetNamePreDirectional      object
StreetNamePreType             object
StreetName                    object
StreetNamePostType            object
StreetNamePostDirectional     object
county                        object
address_1                     object
address_2                     object
city                          object
state                         object
zip                            int64
state_abbr                    object
location_source_value         object
latitude                     float64
longitude                    float64
geometry                      object
dtype: object


Unnamed: 0,location_id,AddressNumber,StreetNamePreDirectional,StreetNamePreType,StreetName,StreetNamePostType,StreetNamePostDirectional,county,address_1,address_2,city,state,zip,state_abbr,location_source_value,latitude,longitude,geometry
0,1,523,E,,BROADWAY,,,Suffolk County,523 E Broadway,,South Boston,Massachusetts,2127,MA,"523 E BROADWAY, SOUTH BOSTON, MA 02127",42.335472,-71.043522,POINT (-71.04352199999346 42.33547200002667)
1,2,569,,,BROADWAY,,,Essex County,569 Broadway,,Newark,New Jersey,7104,NJ,"569 BROADWAY, NEWARK, NJ 07104",40.769935,-74.162821,POINT (-74.16282099988595 40.76993499987205)
2,3,3210,,,SOUTHWESTERN,BLVD,,Erie County,3210 Southwestern Blvd,,Orchard Park,New York,14127,NY,"3210 SOUTHWESTERN BLVD, ORCHARD PARK, NY 14127",42.798535,-78.747816,POINT (-78.74781599997843 42.79853499984358)
3,4,431,,,CAMPGROUND,RD,,Androscoggin County,431 Campground Rd,,Livermore Fls,Maine,4254,ME,"431 CAMPGROUND RD, LIVERMORE FLS, ME 04254",44.428610,-70.113777,POINT (-70.11377699970232 44.42860999977688)
4,5,105,,,HARRIS,AVE,,Cumberland County,105 Harris Ave,,Portland,Maine,4103,ME,"105 HARRIS AVE, PORTLAND, ME 04103",43.699355,-70.303214,POINT (-70.3032139999053 43.69935500023178)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43932,44002,5000,N,,BOWES,RD,,Pima County,5000 N Bowes Rd,,Tucson,Arizona,85749,AZ,"5000 N BOWES RD, TUCSON, AZ 85749",32.298797,-110.788349,POINT (-110.7883488373378 32.29879689494836)
43933,44003,1303,,,PASEO DEL CANON,,EAST,Taos County,1303 Paseo Del Canon East,,Taos,New Mexico,87571,NM,"1303 PASEO DEL CANON EAST, TAOS, NM 87571",36.375292,-105.568573,POINT (-105.568573404848 36.37529177977968)
43934,44004,12606,E,,MAIN,ST,,Yavapai County,12606 E Main St,,Mayer,Arizona,86333,AZ,"12606 E MAIN ST, MAYER, AZ 86333",34.346322,-112.157189,POINT (-112.1571889686942 34.34632218671697)
43935,44005,4300,,,BLAKE,RD,SW,Bernalillo County,4300 Blake Rd Sw,,Albuquerque,New Mexico,87121,NM,"4300 BLAKE RD SW, ALBUQUERQUE, NM 87121",35.037163,-106.723264,POINT (-106.7232635088362 35.03716265606487)


In [41]:
# replace cardinal direction abbreviations with full string and capitalize only first letter
nominatim_keep_columns_merge_county['StreetNamePreDirectional'] = nominatim_keep_columns_merge_county.StreetNamePreDirectional.apply(lambda x: np.NaN if x == np.NaN else cardinal_direction_lambda(x, cardinal_directions_to_full))
nominatim_keep_columns_merge_county['StreetNamePreDirectional'] = nominatim_keep_columns_merge_county.StreetNamePreDirectional.apply(lambda x: str(x).title())

# replace cardinal direction abbreviations with full string and capitalize only first letter
nominatim_keep_columns_merge_county['StreetNamePostDirectional'] = nominatim_keep_columns_merge_county.StreetNamePostDirectional.apply(lambda x: np.NaN if x == np.NaN else cardinal_direction_lambda(x, cardinal_directions_to_full))
nominatim_keep_columns_merge_county['StreetNamePostDirectional'] = nominatim_keep_columns_merge_county.StreetNamePostDirectional.apply(lambda x: str(x).title())

# capitalize only first letters
nominatim_keep_columns_merge_county['StreetName'] = nominatim_keep_columns_merge_county.StreetName.apply(lambda x: str(x).title())
nominatim_keep_columns_merge_county['StreetName'] = nominatim_keep_columns_merge_county.StreetName.apply(lambda x: str(x).title())

# replace street post abbreviations with full string
nominatim_keep_columns_merge_county['StreetNamePreType'] = nominatim_keep_columns_merge_county.StreetNamePreType.apply(lambda x: street_suffix_lambda(str(x).lower(), street_suffix_to_full))
nominatim_keep_columns_merge_county['StreetNamePostType'] = nominatim_keep_columns_merge_county.StreetNamePostType.apply(lambda x: street_suffix_lambda(str(x).lower(), street_suffix_to_full))

# remove ending whitespaces
nominatim_keep_columns_merge_county['AddressNumber'] = nominatim_keep_columns_merge_county.AddressNumber.apply(lambda x: str(x).rstrip())
nominatim_keep_columns_merge_county

Unnamed: 0,location_id,AddressNumber,StreetNamePreDirectional,StreetNamePreType,StreetName,StreetNamePostType,StreetNamePostDirectional,county,address_1,address_2,city,state,zip,state_abbr,location_source_value,latitude,longitude,geometry
0,1,523,East,,Broadway,,Nan,Suffolk County,523 E Broadway,,South Boston,Massachusetts,2127,MA,"523 E BROADWAY, SOUTH BOSTON, MA 02127",42.335472,-71.043522,POINT (-71.04352199999346 42.33547200002667)
1,2,569,Nan,,Broadway,,Nan,Essex County,569 Broadway,,Newark,New Jersey,7104,NJ,"569 BROADWAY, NEWARK, NJ 07104",40.769935,-74.162821,POINT (-74.16282099988595 40.76993499987205)
2,3,3210,Nan,,Southwestern,Boulevard,Nan,Erie County,3210 Southwestern Blvd,,Orchard Park,New York,14127,NY,"3210 SOUTHWESTERN BLVD, ORCHARD PARK, NY 14127",42.798535,-78.747816,POINT (-78.74781599997843 42.79853499984358)
3,4,431,Nan,,Campground,Road,Nan,Androscoggin County,431 Campground Rd,,Livermore Fls,Maine,4254,ME,"431 CAMPGROUND RD, LIVERMORE FLS, ME 04254",44.428610,-70.113777,POINT (-70.11377699970232 44.42860999977688)
4,5,105,Nan,,Harris,Avenue,Nan,Cumberland County,105 Harris Ave,,Portland,Maine,4103,ME,"105 HARRIS AVE, PORTLAND, ME 04103",43.699355,-70.303214,POINT (-70.3032139999053 43.69935500023178)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43932,44002,5000,North,,Bowes,Road,Nan,Pima County,5000 N Bowes Rd,,Tucson,Arizona,85749,AZ,"5000 N BOWES RD, TUCSON, AZ 85749",32.298797,-110.788349,POINT (-110.7883488373378 32.29879689494836)
43933,44003,1303,Nan,,Paseo Del Canon,,East,Taos County,1303 Paseo Del Canon East,,Taos,New Mexico,87571,NM,"1303 PASEO DEL CANON EAST, TAOS, NM 87571",36.375292,-105.568573,POINT (-105.568573404848 36.37529177977968)
43934,44004,12606,East,,Main,Street,Nan,Yavapai County,12606 E Main St,,Mayer,Arizona,86333,AZ,"12606 E MAIN ST, MAYER, AZ 86333",34.346322,-112.157189,POINT (-112.1571889686942 34.34632218671697)
43935,44005,4300,Nan,,Blake,Road,Southwest,Bernalillo County,4300 Blake Rd Sw,,Albuquerque,New Mexico,87121,NM,"4300 BLAKE RD SW, ALBUQUERQUE, NM 87121",35.037163,-106.723264,POINT (-106.7232635088362 35.03716265606487)


In [42]:
# postprocessing clean up
# convert zip to type string
nominatim_keep_columns_merge_county['zip'] = nominatim_keep_columns_merge_county.zip.astype(str)

# convert full string to ordinal numbers
nominatim_keep_columns_merge_county['StreetName'] = nominatim_keep_columns_merge_county.StreetName.apply(lambda x: str(x).lower())
# nominatim_keep_columns_merge_county['StreetName'] = nominatim_keep_columns_merge_county.StreetName.apply(lambda x: replace_ordinal_numbers(str(x)))
# nominatim_keep_columns_merge_county['StreetName'] = nominatim_keep_columns_merge_county.StreetName.apply(lambda x: make_ordinal(str(x)))
nominatim_keep_columns_merge_county['StreetName'] = nominatim_keep_columns_merge_county.StreetName.apply(lambda x: str(x).title() if not str(x)[0].isdigit() else str(x).lower())
nominatim_keep_columns_merge_county['StreetName'] = nominatim_keep_columns_merge_county.StreetName.apply(lambda x: str(x).replace("-", " "))

# capitalize first letter
nominatim_keep_columns_merge_county['StreetNamePreType'] = nominatim_keep_columns_merge_county.StreetNamePreType.apply(lambda x: np.NaN if x == np.NaN else str(x).title())
nominatim_keep_columns_merge_county['StreetNamePostType'] = nominatim_keep_columns_merge_county.StreetNamePostType.apply(lambda x: np.NaN if x == np.NaN else str(x).title())

# join Nominatim components to a single string of AddressNumber, StreetNamePreDirectional + StreetNamePreType + StreetName + StreetNamePostType, city, county, state, zip, United States
nominatim_keep_columns_merge_county['Nominatim_address'] = nominatim_keep_columns_merge_county['AddressNumber'] + ',' + \
                                                            nominatim_keep_columns_merge_county[['StreetNamePreDirectional', 'StreetNamePreType', 'StreetName', 'StreetNamePostType', 'StreetNamePostDirectional']].fillna('').agg(' '.join, axis=1) + ', '\
                                                            + nominatim_keep_columns_merge_county[['city', 'county', 'state', 'zip']].fillna('').agg(', '.join, axis=1) + ', United States'

# remove double whitespaces
nominatim_keep_columns_merge_county['Nominatim_address'] = nominatim_keep_columns_merge_county.Nominatim_address.apply(lambda x: x.replace("  ", " "))

# replace string "nan" with np.NaN
nominatim_keep_columns_merge_county['StreetNamePreDirectional'] = nominatim_keep_columns_merge_county.StreetNamePreDirectional.apply(lambda x: np.NaN if str(x) == "Nan" or str(x) == "nan" else x)
nominatim_keep_columns_merge_county['StreetNamePostDirectional'] = nominatim_keep_columns_merge_county.StreetNamePostDirectional.apply(lambda x: np.NaN if str(x) == "Nan" or str(x) == "nan" else x)

nominatim_keep_columns_merge_county['StreetNamePreType'] = nominatim_keep_columns_merge_county.StreetNamePreType.apply(lambda x: np.NaN if str(x) == "Nan" or str(x) == "nan" else x)
nominatim_keep_columns_merge_county['StreetNamePostType'] = nominatim_keep_columns_merge_county.StreetNamePostType.apply(lambda x: np.NaN if str(x) == "Nan" or str(x) == "nan" else x)

# replace "Nan" string from directionals
nominatim_keep_columns_merge_county['Nominatim_address'] = nominatim_keep_columns_merge_county.Nominatim_address.apply(lambda x: str(x).replace('Nan', ''))

# replace adjacent commas with one comma
nominatim_keep_columns_merge_county['Nominatim_address'] = nominatim_keep_columns_merge_county.Nominatim_address.apply(lambda x: str(x).replace(', ,', ', '))
nominatim_keep_columns_merge_county                                                                                                                      

Unnamed: 0,location_id,AddressNumber,StreetNamePreDirectional,StreetNamePreType,StreetName,StreetNamePostType,StreetNamePostDirectional,county,address_1,address_2,city,state,zip,state_abbr,location_source_value,latitude,longitude,geometry,Nominatim_address
0,1,523,East,,Broadway,,,Suffolk County,523 E Broadway,,South Boston,Massachusetts,2127,MA,"523 E BROADWAY, SOUTH BOSTON, MA 02127",42.335472,-71.043522,POINT (-71.04352199999346 42.33547200002667),"523,East Broadway , South Boston, Suffolk Co..."
1,2,569,,,Broadway,,,Essex County,569 Broadway,,Newark,New Jersey,7104,NJ,"569 BROADWAY, NEWARK, NJ 07104",40.769935,-74.162821,POINT (-74.16282099988595 40.76993499987205),"569, Broadway , Newark, Essex County, New Je..."
2,3,3210,,,Southwestern,Boulevard,,Erie County,3210 Southwestern Blvd,,Orchard Park,New York,14127,NY,"3210 SOUTHWESTERN BLVD, ORCHARD PARK, NY 14127",42.798535,-78.747816,POINT (-78.74781599997843 42.79853499984358),"3210, Southwestern Boulevard , Orchard Park, ..."
3,4,431,,,Campground,Road,,Androscoggin County,431 Campground Rd,,Livermore Fls,Maine,4254,ME,"431 CAMPGROUND RD, LIVERMORE FLS, ME 04254",44.428610,-70.113777,POINT (-70.11377699970232 44.42860999977688),"431, Campground Road , Livermore Fls, Androsc..."
4,5,105,,,Harris,Avenue,,Cumberland County,105 Harris Ave,,Portland,Maine,4103,ME,"105 HARRIS AVE, PORTLAND, ME 04103",43.699355,-70.303214,POINT (-70.3032139999053 43.69935500023178),"105, Harris Avenue , Portland, Cumberland Cou..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43932,44002,5000,North,,Bowes,Road,,Pima County,5000 N Bowes Rd,,Tucson,Arizona,85749,AZ,"5000 N BOWES RD, TUCSON, AZ 85749",32.298797,-110.788349,POINT (-110.7883488373378 32.29879689494836),"5000,North Bowes Road , Tucson, Pima County, ..."
43933,44003,1303,,,Paseo Del Canon,,East,Taos County,1303 Paseo Del Canon East,,Taos,New Mexico,87571,NM,"1303 PASEO DEL CANON EAST, TAOS, NM 87571",36.375292,-105.568573,POINT (-105.568573404848 36.37529177977968),"1303, Paseo Del Canon East, Taos, Taos Count..."
43934,44004,12606,East,,Main,Street,,Yavapai County,12606 E Main St,,Mayer,Arizona,86333,AZ,"12606 E MAIN ST, MAYER, AZ 86333",34.346322,-112.157189,POINT (-112.1571889686942 34.34632218671697),"12606,East Main Street , Mayer, Yavapai Count..."
43935,44005,4300,,,Blake,Road,Southwest,Bernalillo County,4300 Blake Rd Sw,,Albuquerque,New Mexico,87121,NM,"4300 BLAKE RD SW, ALBUQUERQUE, NM 87121",35.037163,-106.723264,POINT (-106.7232635088362 35.03716265606487),"4300, Blake Road Southwest, Albuquerque, Bern..."


In [43]:
nominatim_keep_columns_merge_county.iloc[40352].Nominatim_address

'310,West  Pancake Boulevard , Liberal, Seward County, Kansas, 67901, United States'

In [44]:
nominatim_keep_columns_merge_county_parsed_path = os.path.join(abs_path, 'output', 'nominatim_keep_columns_merge_county_parsed.csv')
nominatim_keep_columns_merge_county.to_csv(nominatim_keep_columns_merge_county_parsed_path, index=False)

# Random Sampling Nominatim Addresses
* 10 per state plus US territories

In [29]:
nominatim_keep_columns_merge_county_parsed_path = os.path.join(abs_path, 'output', 'nominatim_keep_columns_merge_county_parsed.csv')
nominatim_keep_columns_merge_county = pd.read_csv(nominatim_keep_columns_merge_county_parsed_path)
nominatim_keep_columns_merge_county

Unnamed: 0,location_id,AddressNumber,StreetNamePreDirectional,StreetNamePreType,StreetName,StreetNamePostType,StreetNamePostDirectional,county,address_1,address_2,city,state,zip,state_abbr,location_source_value,latitude,longitude,geometry,Nominatim_address
0,1,523,East,,Broadway,,,Suffolk County,523 E Broadway,,South Boston,Massachusetts,2127,MA,"523 E BROADWAY, SOUTH BOSTON, MA 02127",42.335472,-71.043522,POINT (-71.04352199999346 42.33547200002667),"523,East Broadway , South Boston, Suffolk Co..."
1,2,569,,,Broadway,,,Essex County,569 Broadway,,Newark,New Jersey,7104,NJ,"569 BROADWAY, NEWARK, NJ 07104",40.769935,-74.162821,POINT (-74.16282099988595 40.76993499987205),"569, Broadway , Newark, Essex County, New Je..."
2,3,3210,,,Southwestern,Boulevard,,Erie County,3210 Southwestern Blvd,,Orchard Park,New York,14127,NY,"3210 SOUTHWESTERN BLVD, ORCHARD PARK, NY 14127",42.798535,-78.747816,POINT (-78.74781599997843 42.79853499984358),"3210, Southwestern Boulevard , Orchard Park, ..."
3,4,431,,,Campground,Road,,Androscoggin County,431 Campground Rd,,Livermore Fls,Maine,4254,ME,"431 CAMPGROUND RD, LIVERMORE FLS, ME 04254",44.428610,-70.113777,POINT (-70.11377699970232 44.42860999977688),"431, Campground Road , Livermore Fls, Androsc..."
4,5,105,,,Harris,Avenue,,Cumberland County,105 Harris Ave,,Portland,Maine,4103,ME,"105 HARRIS AVE, PORTLAND, ME 04103",43.699355,-70.303214,POINT (-70.3032139999053 43.69935500023178),"105, Harris Avenue , Portland, Cumberland Cou..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43932,44002,5000,North,,Bowes,Road,,Pima County,5000 N Bowes Rd,,Tucson,Arizona,85749,AZ,"5000 N BOWES RD, TUCSON, AZ 85749",32.298797,-110.788349,POINT (-110.7883488373378 32.29879689494836),"5000,North Bowes Road , Tucson, Pima County, ..."
43933,44003,1303,,,Paseo Del Canon,,East,Taos County,1303 Paseo Del Canon East,,Taos,New Mexico,87571,NM,"1303 PASEO DEL CANON EAST, TAOS, NM 87571",36.375292,-105.568573,POINT (-105.568573404848 36.37529177977968),"1303, Paseo Del Canon East, Taos, Taos Count..."
43934,44004,12606,East,,Main,Street,,Yavapai County,12606 E Main St,,Mayer,Arizona,86333,AZ,"12606 E MAIN ST, MAYER, AZ 86333",34.346322,-112.157189,POINT (-112.1571889686942 34.34632218671697),"12606,East Main Street , Mayer, Yavapai Count..."
43935,44005,4300,,,Blake,Road,Southwest,Bernalillo County,4300 Blake Rd Sw,,Albuquerque,New Mexico,87121,NM,"4300 BLAKE RD SW, ALBUQUERQUE, NM 87121",35.037163,-106.723264,POINT (-106.7232635088362 35.03716265606487),"4300, Blake Road Southwest, Albuquerque, Bern..."


In [30]:
territories_drop = ['MP', 'VI', 'PW', 'Of', 'PR', 'GU']

dropped_territories = nominatim_keep_columns_merge_county.loc[nominatim_keep_columns_merge_county.state_abbr.isin(territories_drop)]
nominatim_keep_columns_merge_county_drop = nominatim_keep_columns_merge_county.loc[~nominatim_keep_columns_merge_county.state_abbr.isin(territories_drop)]

print(nominatim_keep_columns_merge_county_drop.shape)

(43937, 19)


In [32]:
nominatim_sample = nominatim_keep_columns_merge_county_drop.groupby('state_abbr').apply(lambda x: x.sample(n=11)).reset_index(drop = True)
nominatim_sample

Unnamed: 0,location_id,AddressNumber,StreetNamePreDirectional,StreetNamePreType,StreetName,StreetNamePostType,StreetNamePostDirectional,county,address_1,address_2,city,state,zip,state_abbr,location_source_value,latitude,longitude,geometry,Nominatim_address
0,25991,820,West,,Washington,Street,,Barbour County,820 West Washington Street,,Eufaula,Alabama,36027,AL,"820 WEST WASHINGTON STREET, EUFAULA, AL 36027",31.888645,-85.156405,POINT (-85.1564054199999 31.8886450800001),"820,West Washington Street , Eufaula, Barbour..."
1,37014,510,,,3rd,Avenue,Southeast,Cullman County,510 3Rd Ave Se,,Cullman,Alabama,35055,AL,"510 3RD AVE SE, CULLMAN, AL 35055",34.173734,-86.837699,POINT (-86.83769857384317 34.17373428494142),"510, 3rd Avenue Southeast, Cullman, Cullman C..."
2,37132,340,,,Ashley,Street,,Wilcox County,340 Ashley St,,Camden,Alabama,36726,AL,"340 ASHLEY ST, CAMDEN, AL 36726",32.002961,-87.306198,POINT (-87.30619807135569 32.00296082568032),"340, Ashley Street , Camden, Wilcox County, A..."
3,31934,2051,South,,Broad,Street,,Mobile County,2051 S Broad St,,Mobile,Alabama,36615,AL,"2051 S Broad St, Mobile, AL 36615",30.645009,-88.069570,POINT (-88.06957000015663 30.64500900008375),"2051,South Broad Street , Mobile, Mobile Coun..."
4,36997,1500,,,Airport,Road,,Calhoun County,1500 Airport Rd,,Oxford,Alabama,36203,AL,"1500 AIRPORT RD, OXFORD, AL 36203",33.589470,-85.872989,POINT (-85.87298872802263 33.58947026309733),"1500, Airport Road , Oxford, Calhoun County, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
523,7814,1810,East,,Cedar,Street,,Carbon County,1810 E Cedar St,,Rawlins,Wyoming,82301,WY,"1810 E CEDAR ST, RAWLINS, WY 82301",41.791724,-107.217112,POINT (-107.217112000311 41.79172399972282),"1810,East Cedar Street , Rawlins, Carbon Coun..."
524,8730,409,,,Park,Street,,Sheridan County,409 Park St,,Sheridan,Wyoming,82801,WY,"409 PARK ST, SHERIDAN, WY 82801",44.794237,-106.949902,POINT (-106.9499019997259 44.79423699987856),"409, Park Street , Sheridan, Sheridan County,..."
525,6740,150,,,Delaney,Court,,Park County,150 Delaney Ct,,Powell,Wyoming,82435,WY,"150 DELANEY CT, POWELL, WY 82435",44.765312,-108.761002,POINT (-108.7610019999442 44.76531200028313),"150, Delaney Court , Powell, Park County, Wyo..."
526,10726,3321,,,Saratoga,Road,,Natrona County,3321 Saratoga Rd,,Casper,Wyoming,82604,WY,"3321 SARATOGA RD, CASPER, WY 82604",42.816866,-106.350932,POINT (-106.3509319997299 42.81686600002254),"3321, Saratoga Road , Casper, Natrona County,..."


In [33]:
nominatim_sample.state_abbr.value_counts()

state_abbr
AL    11
AR    11
NE    11
NH    11
NJ    11
NM    11
NV    11
NY    11
OH    11
OK    11
OR    11
PA    11
RI    11
SC    11
SD    11
TN    11
TX    11
UT    11
VA    11
VT    11
WA    11
WI    11
WV    11
ND    11
NC    11
MT    11
ID    11
AZ    11
CA    11
CO    11
CT    11
DE    11
FL    11
GA    11
HI    11
IA    11
IL    11
MS    11
IN    11
KS    11
KY    11
MA    11
MD    11
ME    11
MI    11
MN    11
MO    11
WY    11
Name: count, dtype: int64

In [51]:
nominatim_sample_path = os.path.join(abs_path, 'output', 'nominatim_sample_v2.csv')
nominatim_sample.to_csv(nominatim_sample_path, index=False)

## Failed Addresses

In [18]:
omop_county_failed_zcta_zip_match = omop_county_failed_zcta_zip_match.rename(columns={"geometry_x": "geometry", "Location_id": "location_id"})
omop_county_failed_zcta_zip_match['location_id'] = omop_county_failed_zcta_zip_match.index+1

In [19]:
%%time

# run `usaddress` parsing with base component names and append to list
repo = []
for ind, each in omop_county_failed_zcta_zip_match.iterrows():
    try:
        obj = usaddress.tag(each.location_source_value)

        # create DataFrame of the `usaddress` OrderedDict with the components as the column names
        tmp = pd.DataFrame(obj[0], columns=obj[0].keys(), index=[ind])
        tmp['Address_type'] = obj[1]

        # assign the county name from omop_county_zcta_zip_match
        tmp['county'] = each.county
        
        # assign the location_source_value from omop_county_zcta_zip_match
        tmp['location_source_value'] = each.location_source_value

        # assign location_id from omop_county_zcta_zip_match
        tmp['location_id'] = each.location_id
        tmp['flag'] = each.flag

        # development
        # OMOP_location.loc[ind, 'AddressNumber'] = [tmp['AddressNumber'].values[0] if len(tmp['AddressNumber'].values[0]) > 0 else np.NaN][0]
        # OMOP_location.loc[ind, 'StreetNamePreDirectional'] = [tmp['StreetNamePreDirectional'].values[0] if len(tmp['AddressNumber'].values[0]) > 0 else np.NaN][0]
        # OMOP_location.loc[ind, 'StreetNamePreType'] = [tmp['StreetNamePreType'].values[0] if len(tmp['AddressNumber'].values[0]) > 0 else np.NaN][0]
        # OMOP_location.loc[ind, 'StreetName'] = [tmp['StreetName'].values[0] if len(tmp['AddressNumber'].values[0]) > 0 else np.NaN][0]
        # OMOP_location.loc[ind, 'StreetNamePostType'] = [tmp['StreetNamePostType'].values[0] if len(tmp['AddressNumber'].values[0]) > 0 else np.NaN][0]
        # OMOP_location.loc[ind, 'StreetNamePostDirectional'] = [tmp['StreetNamePostDirectional'].values[0] if len(tmp['AddressNumber'].values[0]) > 0 else np.NaN][0]
        # OMOP_location.loc[ind, 'PlaceName'] = [tmp['PlaceName'].values[0] if len(tmp['AddressNumber'].values[0]) > 0 else np.NaN][0]
        # OMOP_location.loc[ind, 'StateName'] = [tmp['StateName'].values[0] if len(tmp['AddressNumber'].values[0]) > 0 else np.NaN][0]
        # OMOP_location.loc[ind, 'ZipCode'] = [tmp['ZipCode'].values[0] if len(tmp['AddressNumber'].values[0]) > 0 else np.NaN][0]

        # OMOP_location.loc[ind, 'AddressNumber'] = obj[0]['AddressNumber']
        # OMOP_location.loc[ind, 'StreetNamePreDirectional'] = obj[0]['StreetNamePreDirectional']
        # OMOP_location.loc[ind, 'StreetNamePreType'] = obj[0]['StreetNamePreType']
        # OMOP_location.loc[ind, 'StreetName'] = obj[0]['StreetName']
        # OMOP_location.loc[ind, 'StreetNamePostType'] = obj[0]['StreetNamePostType']
        # OMOP_location.loc[ind, 'StreetNamePostDirectional'] = obj[0]['StreetNamePostDirectional']
        # OMOP_location.loc[ind, 'PlaceName']=obj[0]['PlaceName']
        # OMOP_location.loc[ind, 'StateName']=obj[0]['StateName']
        # OMOP_location.loc[ind, 'ZipCode']=obj[0]['ZipCode']

        repo.append(tmp)

        # StreetNamePreDirectional = tmp['StreetNamePreDirectional'].values[0]
        # if not StreetNamePreDirectional:
        #     OMOP_location.loc[ind, 'StreetNamePreDirectional'] = np.NaN
        # else:
        #     OMOP_location.loc[ind, 'StreetNamePreDirectional'] = StreetNamePreDirectional

    except:
        pass

CPU times: total: 8.34 s
Wall time: 16.1 s


In [20]:
%%time

# concatenate above list to a single dataframe and only keep address components for Nominatim
nominatim_components = pd.concat(repo)
nominatim_components[['location_id','AddressNumber', 'StreetNamePreDirectional', 'StreetNamePreType', 'StreetName', 'StreetNamePostType','StreetNamePostDirectional',
                      'PlaceName', 'county', 'StateName', 'ZipCode', 'location_source_value', 'flag']]

CPU times: total: 5.28 s
Wall time: 8.58 s


Unnamed: 0,location_id,AddressNumber,StreetNamePreDirectional,StreetNamePreType,StreetName,StreetNamePostType,StreetNamePostDirectional,PlaceName,county,StateName,ZipCode,location_source_value,flag
1,2,,,,,,,CHINA VILLAGE,Kennebec County,ME,04926,"PO BOX 6149, CHINA VILLAGE, ME 04926",FAILED DUE TO PO BOX ADDRESS
2,3,,,,,,,NORTHEAST HBR,Hancock County,ME,04662,"PO BOX 555, NORTHEAST HBR, ME 04662",FAILED DUE TO PO BOX ADDRESS
4,5,,,,,,,PRESQUE ISLE,Aroostook County,ME,04769,"PO BOX 826, PRESQUE ISLE, ME 04769",FAILED DUE TO PO BOX ADDRESS
5,6,,,,,,,NORTH HAVEN,Knox County,ME,04853,"PO BOX 435, NORTH HAVEN, ME 04853",FAILED DUE TO PO BOX ADDRESS
6,7,,,,,,,KENNEBUNKPORT,York County,ME,04046,"PO BOX 391, KENNEBUNKPORT, ME 04046",FAILED DUE TO PO BOX ADDRESS
...,...,...,...,...,...,...,...,...,...,...,...,...,...
21485,21486,1814,EAST,,SECOND,STREET,,SCOTCH PLAINS,Union County,NJ,07076,"1814 EAST SECOND STREET, SCOTCH PLAINS, NJ 07076",FAILED DUE TO NON 5-DIGIT ZIPCODE
21490,21491,891,,,TABOR,ROAD,,MORRIS PLAINS,Morris County,NJ,07950,"891 TABOR ROAD, MORRIS PLAINS, NJ 07950",FAILED DUE TO NON 5-DIGIT ZIPCODE
21493,21494,16,,,ETHEL,ROAD,,EDISON,Middlesex County,NJ,08817,"16 ETHEL ROAD, EDISON, NJ 08817",FAILED DUE TO NON 5-DIGIT ZIPCODE
21494,21495,606,,,DOWD,AVENUE,,ELIZABETH,Union County,NJ,07201,"606 DOWD AVENUE, ELIZABETH, NJ 07201",FAILED DUE TO NON 5-DIGIT ZIPCODE


In [21]:
# keep only columns necessary to merge to omop_county_zcta_zip_match that has the USPS Publication 28 Standard components
nominatim_keep_columns = nominatim_components[['location_id','AddressNumber', 'StreetNamePreDirectional', 'StreetNamePreType', 'StreetName', 'StreetNamePostType','StreetNamePostDirectional','county',]]

In [22]:
# create empty dataframe with USPS Publication 28 Standard components and source address string and lat/lon
nominatim_location = pd.DataFrame(columns=['location_id', 'address_1', 'address_2', 'city', 'state', 'zip', 'state_abbr', 'location_source_value','latitude','longitude', 'geometry', 'flag'])

# fill in address and new location_id from omop_county_zcta_zip_match 
nominatim_location['location_source_value'] = omop_county_failed_zcta_zip_match.location_source_value
nominatim_location['location_id'] = nominatim_location.index+1
# nominatim_location['county'] = omop_county_zcta_zip_match.county

# assign the same columns from omop_county_zcta_zip_match
nominatim_location.address_1 = omop_county_failed_zcta_zip_match.address_1
nominatim_location.address_2 = omop_county_failed_zcta_zip_match.address_2
nominatim_location.city = omop_county_failed_zcta_zip_match.city
nominatim_location.state = omop_county_failed_zcta_zip_match.state
nominatim_location.zip = omop_county_failed_zcta_zip_match.zip
nominatim_location.state_abbr = omop_county_failed_zcta_zip_match.state_abbr
nominatim_location.latitude = omop_county_failed_zcta_zip_match.latitude
nominatim_location.longitude = omop_county_failed_zcta_zip_match.longitude
nominatim_location.geometry = omop_county_failed_zcta_zip_match.geometry
nominatim_location.flag = omop_county_failed_zcta_zip_match.flag
nominatim_location

Unnamed: 0,location_id,address_1,address_2,city,state,zip,state_abbr,location_source_value,latitude,longitude,geometry,flag
1,2,Po Box 6149,,China Village,Maine,04926,ME,"PO BOX 6149, CHINA VILLAGE, ME 04926",44.481721,-69.516751,POINT (-69.51675 44.48172),FAILED DUE TO PO BOX ADDRESS
2,3,Po Box 555,,Northeast Hbr,Maine,04662,ME,"PO BOX 555, NORTHEAST HBR, ME 04662",44.294140,-68.290211,POINT (-68.29021 44.29414),FAILED DUE TO PO BOX ADDRESS
4,5,Po Box 826,,Presque Isle,Maine,04769,ME,"PO BOX 826, PRESQUE ISLE, ME 04769",46.681235,-68.010188,POINT (-68.01019 46.68124),FAILED DUE TO PO BOX ADDRESS
5,6,Po Box 435,,North Haven,Maine,04853,ME,"PO BOX 435, NORTH HAVEN, ME 04853",44.132285,-68.873329,POINT (-68.87333 44.13229),FAILED DUE TO PO BOX ADDRESS
6,7,Po Box 391,,Kennebunkport,Maine,04046,ME,"PO BOX 391, KENNEBUNKPORT, ME 04046",43.404581,-70.411332,POINT (-70.41133 43.40458),FAILED DUE TO PO BOX ADDRESS
...,...,...,...,...,...,...,...,...,...,...,...,...
21487,21488,90 United States Highway 22 West,,Springfield,New Jersey,07081,NJ,"90 UNITED STATES HIGHWAY 22 WEST, SPRINGFIELD,...",40.687946,-74.315669,POINT (-74.31567 40.68795),FAILED DUE TO NON 5-DIGIT ZIPCODE
21490,21491,891 Tabor Road,,Morris Plains,New Jersey,00795,NJ,"891 TABOR ROAD, MORRIS PLAINS, NJ 07950",40.860893,-74.469700,POINT (-74.46970 40.86089),FAILED DUE TO NON 5-DIGIT ZIPCODE
21493,21494,16 Ethel Road,,Edison,New Jersey,08817,NJ,"16 ETHEL ROAD, EDISON, NJ 08817",40.540541,-74.398598,POINT (-74.39860 40.54054),FAILED DUE TO NON 5-DIGIT ZIPCODE
21494,21495,606 Dowd Avenue,,Elizabeth,New Jersey,07201,NJ,"606 DOWD AVENUE, ELIZABETH, NJ 07201",40.666597,-74.188170,POINT (-74.18817 40.66660),FAILED DUE TO NON 5-DIGIT ZIPCODE


In [23]:
# merge omop_county_zcta_zip_match addresses to the ones that parsed fully from nominatim_keep_columns_merge_county
nominatim_keep_columns_merge_county = nominatim_keep_columns.merge(nominatim_location, how='left', on='location_id')
print(nominatim_keep_columns_merge_county.dtypes)
nominatim_keep_columns_merge_county

location_id                     int64
AddressNumber                  object
StreetNamePreDirectional       object
StreetNamePreType              object
StreetName                     object
StreetNamePostType             object
StreetNamePostDirectional      object
county                         object
address_1                      object
address_2                      object
city                           object
state                          object
zip                            object
state_abbr                     object
location_source_value          object
latitude                      float64
longitude                     float64
geometry                     geometry
flag                           object
dtype: object


Unnamed: 0,location_id,AddressNumber,StreetNamePreDirectional,StreetNamePreType,StreetName,StreetNamePostType,StreetNamePostDirectional,county,address_1,address_2,city,state,zip,state_abbr,location_source_value,latitude,longitude,geometry,flag
0,2,,,,,,,Kennebec County,Po Box 6149,,China Village,Maine,04926,ME,"PO BOX 6149, CHINA VILLAGE, ME 04926",44.481721,-69.516751,POINT (-69.51675 44.48172),FAILED DUE TO PO BOX ADDRESS
1,3,,,,,,,Hancock County,Po Box 555,,Northeast Hbr,Maine,04662,ME,"PO BOX 555, NORTHEAST HBR, ME 04662",44.294140,-68.290211,POINT (-68.29021 44.29414),FAILED DUE TO PO BOX ADDRESS
2,5,,,,,,,Aroostook County,Po Box 826,,Presque Isle,Maine,04769,ME,"PO BOX 826, PRESQUE ISLE, ME 04769",46.681235,-68.010188,POINT (-68.01019 46.68124),FAILED DUE TO PO BOX ADDRESS
3,6,,,,,,,Knox County,Po Box 435,,North Haven,Maine,04853,ME,"PO BOX 435, NORTH HAVEN, ME 04853",44.132285,-68.873329,POINT (-68.87333 44.13229),FAILED DUE TO PO BOX ADDRESS
4,7,,,,,,,York County,Po Box 391,,Kennebunkport,Maine,04046,ME,"PO BOX 391, KENNEBUNKPORT, ME 04046",43.404581,-70.411332,POINT (-70.41133 43.40458),FAILED DUE TO PO BOX ADDRESS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9237,21486,1814,EAST,,SECOND,STREET,,Union County,1814 East Second Street,,Scotch Plains,New Jersey,07076,NJ,"1814 EAST SECOND STREET, SCOTCH PLAINS, NJ 07076",40.648605,-74.397121,POINT (-74.39712 40.64860),FAILED DUE TO NON 5-DIGIT ZIPCODE
9238,21491,891,,,TABOR,ROAD,,Morris County,891 Tabor Road,,Morris Plains,New Jersey,00795,NJ,"891 TABOR ROAD, MORRIS PLAINS, NJ 07950",40.860893,-74.469700,POINT (-74.46970 40.86089),FAILED DUE TO NON 5-DIGIT ZIPCODE
9239,21494,16,,,ETHEL,ROAD,,Middlesex County,16 Ethel Road,,Edison,New Jersey,08817,NJ,"16 ETHEL ROAD, EDISON, NJ 08817",40.540541,-74.398598,POINT (-74.39860 40.54054),FAILED DUE TO NON 5-DIGIT ZIPCODE
9240,21495,606,,,DOWD,AVENUE,,Union County,606 Dowd Avenue,,Elizabeth,New Jersey,07201,NJ,"606 DOWD AVENUE, ELIZABETH, NJ 07201",40.666597,-74.188170,POINT (-74.18817 40.66660),FAILED DUE TO NON 5-DIGIT ZIPCODE


In [24]:
# replace cardinal direction abbreviations with full string and capitalize only first letter
nominatim_keep_columns_merge_county['StreetNamePreDirectional'] = nominatim_keep_columns_merge_county.StreetNamePreDirectional.apply(lambda x: np.NaN if x == np.NaN else cardinal_direction_lambda(x, cardinal_directions_to_full))
nominatim_keep_columns_merge_county['StreetNamePreDirectional'] = nominatim_keep_columns_merge_county.StreetNamePreDirectional.apply(lambda x: str(x).title())

# replace cardinal direction abbreviations with full string and capitalize only first letter
nominatim_keep_columns_merge_county['StreetNamePostDirectional'] = nominatim_keep_columns_merge_county.StreetNamePostDirectional.apply(lambda x: np.NaN if x == np.NaN else cardinal_direction_lambda(x, cardinal_directions_to_full))
nominatim_keep_columns_merge_county['StreetNamePostDirectional'] = nominatim_keep_columns_merge_county.StreetNamePostDirectional.apply(lambda x: str(x).title())

# capitalize only first letters
nominatim_keep_columns_merge_county['StreetName'] = nominatim_keep_columns_merge_county.StreetName.apply(lambda x: str(x).title())
nominatim_keep_columns_merge_county['StreetName'] = nominatim_keep_columns_merge_county.StreetName.apply(lambda x: str(x).title())

# replace street post abbreviations with full string
nominatim_keep_columns_merge_county['StreetNamePreType'] = nominatim_keep_columns_merge_county.StreetNamePreType.apply(lambda x: street_suffix_lambda(str(x).lower(), street_suffix_to_full))
nominatim_keep_columns_merge_county['StreetNamePostType'] = nominatim_keep_columns_merge_county.StreetNamePostType.apply(lambda x: street_suffix_lambda(str(x).lower(), street_suffix_to_full))

# remove ending whitespaces
nominatim_keep_columns_merge_county['AddressNumber'] = nominatim_keep_columns_merge_county.AddressNumber.apply(lambda x: str(x).rstrip())
nominatim_keep_columns_merge_county

Unnamed: 0,location_id,AddressNumber,StreetNamePreDirectional,StreetNamePreType,StreetName,StreetNamePostType,StreetNamePostDirectional,county,address_1,address_2,city,state,zip,state_abbr,location_source_value,latitude,longitude,geometry,flag
0,2,,Nan,,Nan,,Nan,Kennebec County,Po Box 6149,,China Village,Maine,04926,ME,"PO BOX 6149, CHINA VILLAGE, ME 04926",44.481721,-69.516751,POINT (-69.51675 44.48172),FAILED DUE TO PO BOX ADDRESS
1,3,,Nan,,Nan,,Nan,Hancock County,Po Box 555,,Northeast Hbr,Maine,04662,ME,"PO BOX 555, NORTHEAST HBR, ME 04662",44.294140,-68.290211,POINT (-68.29021 44.29414),FAILED DUE TO PO BOX ADDRESS
2,5,,Nan,,Nan,,Nan,Aroostook County,Po Box 826,,Presque Isle,Maine,04769,ME,"PO BOX 826, PRESQUE ISLE, ME 04769",46.681235,-68.010188,POINT (-68.01019 46.68124),FAILED DUE TO PO BOX ADDRESS
3,6,,Nan,,Nan,,Nan,Knox County,Po Box 435,,North Haven,Maine,04853,ME,"PO BOX 435, NORTH HAVEN, ME 04853",44.132285,-68.873329,POINT (-68.87333 44.13229),FAILED DUE TO PO BOX ADDRESS
4,7,,Nan,,Nan,,Nan,York County,Po Box 391,,Kennebunkport,Maine,04046,ME,"PO BOX 391, KENNEBUNKPORT, ME 04046",43.404581,-70.411332,POINT (-70.41133 43.40458),FAILED DUE TO PO BOX ADDRESS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9237,21486,1814,East,,Second,street,Nan,Union County,1814 East Second Street,,Scotch Plains,New Jersey,07076,NJ,"1814 EAST SECOND STREET, SCOTCH PLAINS, NJ 07076",40.648605,-74.397121,POINT (-74.39712 40.64860),FAILED DUE TO NON 5-DIGIT ZIPCODE
9238,21491,891,Nan,,Tabor,road,Nan,Morris County,891 Tabor Road,,Morris Plains,New Jersey,00795,NJ,"891 TABOR ROAD, MORRIS PLAINS, NJ 07950",40.860893,-74.469700,POINT (-74.46970 40.86089),FAILED DUE TO NON 5-DIGIT ZIPCODE
9239,21494,16,Nan,,Ethel,road,Nan,Middlesex County,16 Ethel Road,,Edison,New Jersey,08817,NJ,"16 ETHEL ROAD, EDISON, NJ 08817",40.540541,-74.398598,POINT (-74.39860 40.54054),FAILED DUE TO NON 5-DIGIT ZIPCODE
9240,21495,606,Nan,,Dowd,avenue,Nan,Union County,606 Dowd Avenue,,Elizabeth,New Jersey,07201,NJ,"606 DOWD AVENUE, ELIZABETH, NJ 07201",40.666597,-74.188170,POINT (-74.18817 40.66660),FAILED DUE TO NON 5-DIGIT ZIPCODE


In [25]:
# postprocessing clean up
# convert zip to type string
nominatim_keep_columns_merge_county['zip'] = nominatim_keep_columns_merge_county.zip.astype(str)

# convert full string to ordinal numbers
nominatim_keep_columns_merge_county['StreetName'] = nominatim_keep_columns_merge_county.StreetName.apply(lambda x: str(x).lower())
# nominatim_keep_columns_merge_county['StreetName'] = nominatim_keep_columns_merge_county.StreetName.apply(lambda x: replace_ordinal_numbers(str(x)))
# nominatim_keep_columns_merge_county['StreetName'] = nominatim_keep_columns_merge_county.StreetName.apply(lambda x: make_ordinal(str(x)))
nominatim_keep_columns_merge_county['StreetName'] = nominatim_keep_columns_merge_county.StreetName.apply(lambda x: str(x).title() if not str(x)[0].isdigit() else str(x).lower())
nominatim_keep_columns_merge_county['StreetName'] = nominatim_keep_columns_merge_county.StreetName.apply(lambda x: str(x).replace("-", " "))

# capitalize first letter
nominatim_keep_columns_merge_county['StreetNamePreType'] = nominatim_keep_columns_merge_county.StreetNamePreType.apply(lambda x: np.NaN if x == np.NaN else str(x).title())
nominatim_keep_columns_merge_county['StreetNamePostType'] = nominatim_keep_columns_merge_county.StreetNamePostType.apply(lambda x: np.NaN if x == np.NaN else str(x).title())

# join Nominatim components to a single string of AddressNumber, StreetNamePreDirectional + StreetNamePreType + StreetName + StreetNamePostType, city, county, state, zip, United States
nominatim_keep_columns_merge_county['Nominatim_address'] = nominatim_keep_columns_merge_county['AddressNumber'] + ',' + \
                                                            nominatim_keep_columns_merge_county[['StreetNamePreDirectional', 'StreetNamePreType', 'StreetName', 'StreetNamePostType', 'StreetNamePostDirectional']].fillna('').agg(' '.join, axis=1) + ', '\
                                                            + nominatim_keep_columns_merge_county[['city', 'county', 'state', 'zip']].fillna('').agg(', '.join, axis=1) + ', United States'

# remove double whitespaces
nominatim_keep_columns_merge_county['Nominatim_address'] = nominatim_keep_columns_merge_county.Nominatim_address.apply(lambda x: x.replace("  ", " "))

# replace string "nan" with np.NaN
nominatim_keep_columns_merge_county['StreetNamePreDirectional'] = nominatim_keep_columns_merge_county.StreetNamePreDirectional.apply(lambda x: np.NaN if str(x) == "Nan" or str(x) == "nan" else x)
nominatim_keep_columns_merge_county['StreetNamePostDirectional'] = nominatim_keep_columns_merge_county.StreetNamePostDirectional.apply(lambda x: np.NaN if str(x) == "Nan" or str(x) == "nan" else x)

nominatim_keep_columns_merge_county['StreetNamePreType'] = nominatim_keep_columns_merge_county.StreetNamePreType.apply(lambda x: np.NaN if str(x) == "Nan" or str(x) == "nan" else x)
nominatim_keep_columns_merge_county['StreetNamePostType'] = nominatim_keep_columns_merge_county.StreetNamePostType.apply(lambda x: np.NaN if str(x) == "Nan" or str(x) == "nan" else x)

# replace "Nan" string from directionals
nominatim_keep_columns_merge_county['Nominatim_address'] = nominatim_keep_columns_merge_county.Nominatim_address.apply(lambda x: str(x).replace('Nan', ''))

# replace adjacent commas with one comma
nominatim_keep_columns_merge_county['Nominatim_address'] = nominatim_keep_columns_merge_county.Nominatim_address.apply(lambda x: str(x).replace(', ,', ', '))
nominatim_keep_columns_merge_county                                                                                                                      

Unnamed: 0,location_id,AddressNumber,StreetNamePreDirectional,StreetNamePreType,StreetName,StreetNamePostType,StreetNamePostDirectional,county,address_1,address_2,city,state,zip,state_abbr,location_source_value,latitude,longitude,geometry,flag,Nominatim_address
0,2,,,,Nan,,,Kennebec County,Po Box 6149,,China Village,Maine,04926,ME,"PO BOX 6149, CHINA VILLAGE, ME 04926",44.481721,-69.516751,POINT (-69.51675 44.48172),FAILED DUE TO PO BOX ADDRESS,"nan, , China Village, Kennebec County, Main..."
1,3,,,,Nan,,,Hancock County,Po Box 555,,Northeast Hbr,Maine,04662,ME,"PO BOX 555, NORTHEAST HBR, ME 04662",44.294140,-68.290211,POINT (-68.29021 44.29414),FAILED DUE TO PO BOX ADDRESS,"nan, , Northeast Hbr, Hancock County, Maine..."
2,5,,,,Nan,,,Aroostook County,Po Box 826,,Presque Isle,Maine,04769,ME,"PO BOX 826, PRESQUE ISLE, ME 04769",46.681235,-68.010188,POINT (-68.01019 46.68124),FAILED DUE TO PO BOX ADDRESS,"nan, , Presque Isle, Aroostook County, Main..."
3,6,,,,Nan,,,Knox County,Po Box 435,,North Haven,Maine,04853,ME,"PO BOX 435, NORTH HAVEN, ME 04853",44.132285,-68.873329,POINT (-68.87333 44.13229),FAILED DUE TO PO BOX ADDRESS,"nan, , North Haven, Knox County, Maine, 048..."
4,7,,,,Nan,,,York County,Po Box 391,,Kennebunkport,Maine,04046,ME,"PO BOX 391, KENNEBUNKPORT, ME 04046",43.404581,-70.411332,POINT (-70.41133 43.40458),FAILED DUE TO PO BOX ADDRESS,"nan, , Kennebunkport, York County, Maine, 0..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9237,21486,1814,East,,Second,Street,,Union County,1814 East Second Street,,Scotch Plains,New Jersey,07076,NJ,"1814 EAST SECOND STREET, SCOTCH PLAINS, NJ 07076",40.648605,-74.397121,POINT (-74.39712 40.64860),FAILED DUE TO NON 5-DIGIT ZIPCODE,"1814,East Second Street , Scotch Plains, Unio..."
9238,21491,891,,,Tabor,Road,,Morris County,891 Tabor Road,,Morris Plains,New Jersey,00795,NJ,"891 TABOR ROAD, MORRIS PLAINS, NJ 07950",40.860893,-74.469700,POINT (-74.46970 40.86089),FAILED DUE TO NON 5-DIGIT ZIPCODE,"891, Tabor Road , Morris Plains, Morris Count..."
9239,21494,16,,,Ethel,Road,,Middlesex County,16 Ethel Road,,Edison,New Jersey,08817,NJ,"16 ETHEL ROAD, EDISON, NJ 08817",40.540541,-74.398598,POINT (-74.39860 40.54054),FAILED DUE TO NON 5-DIGIT ZIPCODE,"16, Ethel Road , Edison, Middlesex County, Ne..."
9240,21495,606,,,Dowd,Avenue,,Union County,606 Dowd Avenue,,Elizabeth,New Jersey,07201,NJ,"606 DOWD AVENUE, ELIZABETH, NJ 07201",40.666597,-74.188170,POINT (-74.18817 40.66660),FAILED DUE TO NON 5-DIGIT ZIPCODE,"606, Dowd Avenue , Elizabeth, Union County, N..."


In [26]:
nominatim_keep_columns_merge_county.state_abbr.value_counts()

state_abbr
CO         844
AZ         819
OR         778
NM         640
NE         518
CA         460
MT         389
MA         346
SD         336
ID         324
ND         288
ME         270
NJ         235
CT         235
NV         235
NY         209
UT         204
WY         160
TX         150
NH         134
NC         131
VT         114
IL         111
FL         111
MI         107
OH         105
HI          91
RI          86
IN          77
MO          75
TN          56
GA          51
PA          51
AL          48
WI          46
MN          46
KS          45
KY          37
WA          37
IA          37
WV          36
OK          29
AR          26
VA          26
MS          25
West VA     21
MD          19
SC          13
New          7
DE           4
Name: count, dtype: int64

In [27]:
nominatim_keep_columns_merge_county_parsed_path = os.path.join(abs_path, 'output', 'nominatim_keep_columns_merge_county_parsed_failedAddresses.csv')
nominatim_keep_columns_merge_county.to_csv(nominatim_keep_columns_merge_county_parsed_path, index=False)

### Random Sampling Failed Addresses

In [28]:
# random sampling
territories_drop = ['MP', 'VI', 'PW', 'Of', 'PR', 'GU', 'New', 'DE']

dropped_territories = nominatim_keep_columns_merge_county.loc[nominatim_keep_columns_merge_county.state_abbr.isin(territories_drop)]
nominatim_keep_columns_merge_county_drop = nominatim_keep_columns_merge_county.loc[~nominatim_keep_columns_merge_county.state_abbr.isin(territories_drop)]

print(nominatim_keep_columns_merge_county_drop.shape)

df_sample = nominatim_keep_columns_merge_county_drop.groupby('state_abbr').apply(lambda x: x.sample(n=10)).reset_index(drop = True)
# concatenate dropped US territories to the other random samples
nominatim_sample = pd.concat([dropped_territories, df_sample])
nominatim_sample

(9231, 20)


Unnamed: 0,location_id,AddressNumber,StreetNamePreDirectional,StreetNamePreType,StreetName,StreetNamePostType,StreetNamePostDirectional,county,address_1,address_2,city,state,zip,state_abbr,location_source_value,latitude,longitude,geometry,flag,Nominatim_address
5223,11241,1200,N.,,Dupont,Highway,,Kent County,1200 N. Dupont Highway,,Dover,Delaware,19901,DE,"1200 N. DUPONT HIGHWAY, DOVER, DE 19901",39.187173,-75.540530,POINT (-75.54053 39.18717),FAILED DUE TO PRESENCE OF SPECIAL CHARACTERS,"1200,N. Dupont Highway , Dover, Kent County, ..."
5684,12236,512,,,"Paseo Del Pueblo Sur, Taos",,,Taos County,512 PASEO DEL PUEBLO SUR,,Taos,New,87571,New,"512 PASEO DEL PUEBLO SUR, Taos, New Mexico 87571",36.397496,-105.578222,POINT (-105.57822 36.39750),FAILED DUE TO INCORRECT STATE FORMAT,"512, Paseo Del Pueblo Sur, Taos , Taos, Taos..."
5688,12253,100,South,,Federal,Place,,Santa Fe County,100 S FEDERAL PL,,Santa Fe,New,87501,New,"100 S FEDERAL PL, Santa Fe, New Mexico 87501",35.690484,-105.937780,POINT (-105.93778 35.69048),FAILED DUE TO INCORRECT STATE FORMAT,"100,South Federal Place , Santa Fe, Santa Fe ..."
5705,12349,707,,,"Paseo Del Pueblo Norte, Taos",,,Taos County,707 PASEO DEL PUEBLO NORTE,,Taos,New,87571,New,"707 PASEO DEL PUEBLO NORTE, Taos, New Mexico 8...",36.418729,-105.571283,POINT (-105.57128 36.41873),FAILED DUE TO INCORRECT STATE FORMAT,"707, Paseo Del Pueblo Norte, Taos , Taos, Ta..."
5708,12364,1790,,,Saint Michaels,Drive,,Santa Fe County,1790 SAINT MICHAELS DR,,Santa Fe,New,87501,New,"1790 SAINT MICHAELS DR, Santa Fe, New Mexico 8...",35.659213,-105.970291,POINT (-105.97029 35.65921),FAILED DUE TO INCORRECT STATE FORMAT,"1790, Saint Michaels Drive , Santa Fe, Santa ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
475,11853,160,,,Morgantown,Street,,Preston County,160 Morgantown St,,Bruceton Mills,West Virginia,26525,West VA,"160 Morgantown St, Bruceton Mills, West Virgin...",39.658873,-79.639849,POINT (-79.63985 39.65887),FAILED DUE TO INCORRECT STATE FORMAT,"160, Morgantown Street , Bruceton Mills, Pres..."
476,12006,200,,,First,Street,,Wyoming County,200 First Street,,Mullens,West Virginia,25882,West VA,"200 First Street, Mullens, West Virginia 25882",37.582173,-81.380575,POINT (-81.38057 37.58217),FAILED DUE TO INCORRECT STATE FORMAT,"200, First Street , Mullens, Wyoming County, ..."
477,12102,111,East,,Washington,Street,,Jefferson County,111 East Washington Street,,Charles Town,West Virginia,25414,West VA,"111 East Washington Street, Charles Town, West...",39.289234,-77.859384,POINT (-77.85938 39.28923),FAILED DUE TO INCORRECT STATE FORMAT,"111,East Washington Street , Charles Town, Je..."
478,11792,95,East,,Main,Street,,Hampshire County,95 East Main Street,,Romney,West Virginia,26757,West VA,"95 East Main Street, Romney, West Virginia 26757",39.342247,-78.757354,POINT (-78.75735 39.34225),FAILED DUE TO INCORRECT STATE FORMAT,"95,East Main Street , Romney, Hampshire Count..."


In [29]:
nominatim_sample.flag.value_counts()

flag
FAILED DUE TO PRESENCE OF SPECIAL CHARACTERS       202
FAILED DUE TO PO BOX ADDRESS                       113
FAILED DUE TO NON 5-DIGIT ZIPCODE                   86
FAILED DUE TO STREET ADDRESS STARTS WITH LETTER     75
FAILED DUE TO INCORRECT STATE FORMAT                15
Name: count, dtype: int64

In [30]:
nominatim_sample_path = os.path.join(abs_path, 'output', 'nominatim_failedAddresses_sample.csv')
nominatim_sample.to_csv(nominatim_sample_path, index=False)