In [1]:
import os
import sys
import re
import csv

import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import shapely.wkt
import usaddress
from num2words import num2words

In [2]:
os.chdir('..')
abs_path = os.getcwd()

zcta_folder = os.path.join(abs_path, 'data/tl_2023_us_zcta520_clean')
zcta_file = 'tl_2023_us_zcta520_clean.shp'
zcta_path = os.path.join(zcta_folder, zcta_file)

OMOP_county_full_path = os.path.join(abs_path, 'output', 'OMOP_county_full.csv')

zcta_crosswalk_file = "tab20_zcta520_county20_natl.txt"
zcta_crosswalk_path = os.path.join(abs_path, "data", zcta_crosswalk_file)

In [3]:
def create_dir(save_dir):
    """
    Creates directory if it does not exist
         
    Parameters
    ----------
        save_dir (str): path of desired output directory
    """
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

# ZCTA County Match
* ZCTA 2023
* ZCTA 2020 Crosswalk

In [4]:
omop_county_full = pd.read_csv(OMOP_county_full_path)
zcta_crosswalk = pd.read_csv(zcta_crosswalk_path, sep="|")

  omop_county_full = pd.read_csv(OMOP_county_full_path)


In [5]:
# drop unnecessary columns
zcta_crosswalk = zcta_crosswalk.drop(columns=["OID_ZCTA5_20", "AREALAND_ZCTA5_20", "AREAWATER_ZCTA5_20", "MTFCC_ZCTA5_20", "CLASSFP_ZCTA5_20", "FUNCSTAT_ZCTA5_20", 
                                              "AREALAND_COUNTY_20", "AREAWATER_COUNTY_20", "MTFCC_COUNTY_20", "CLASSFP_COUNTY_20", "FUNCSTAT_COUNTY_20", "AREALAND_PART", "AREAWATER_PART"])
zcta_crosswalk

Unnamed: 0,GEOID_ZCTA5_20,NAMELSAD_ZCTA5_20,OID_COUNTY_20,GEOID_COUNTY_20,NAMELSAD_COUNTY_20
0,,,27590114112812,1003,Baldwin County
1,,,2759099719300,1007,Bibb County
2,,,27590103020886,1015,Calhoun County
3,,,27590336389978,1021,Chilton County
4,,,2759075862059,1025,Clarke County
...,...,...,...,...,...
47858,99923.0,ZCTA5 99923,275903025880880,2198,Prince of Wales-Hyder Census Area
47859,99925.0,ZCTA5 99925,275903025880880,2198,Prince of Wales-Hyder Census Area
47860,99926.0,ZCTA5 99926,275903025880880,2198,Prince of Wales-Hyder Census Area
47861,99927.0,ZCTA5 99927,275903025880880,2198,Prince of Wales-Hyder Census Area


In [6]:
zcta_shapes = gpd.read_file(zcta_path)
zcta_shapes = zcta_shapes.to_crs("EPSG:4326")
zcta_shapes

Unnamed: 0,ZCTA5CE20,GEOID20,GEOIDFQ20,INTPTLAT20,INTPTLON20,geometry
0,47236,47236,860Z200US47236,+39.1517426,-085.7252769,"POLYGON ((-85.73410 39.15597, -85.72794 39.156..."
1,47870,47870,860Z200US47870,+39.3701518,-087.4735141,"POLYGON ((-87.47415 39.37016, -87.47410 39.370..."
2,47851,47851,860Z200US47851,+39.5735839,-087.2459559,"POLYGON ((-87.24769 39.57450, -87.24711 39.574..."
3,47337,47337,860Z200US47337,+39.8027537,-085.4372850,"POLYGON ((-85.44356 39.80328, -85.44345 39.803..."
4,47435,47435,860Z200US47435,+39.2657557,-086.2951577,"POLYGON ((-86.29592 39.26547, -86.29592 39.266..."
...,...,...,...,...,...,...
33786,37932,37932,860Z200US37932,+35.9172993,-084.1987873,"POLYGON ((-84.27347 35.93928, -84.27287 35.940..."
33787,37341,37341,860Z200US37341,+35.2199309,-085.0730025,"POLYGON ((-85.15090 35.11231, -85.15088 35.112..."
33788,37849,37849,860Z200US37849,+36.0540502,-084.0484876,"POLYGON ((-84.14857 36.04234, -84.14240 36.046..."
33789,37754,37754,860Z200US37754,+36.1390993,-084.0298007,"POLYGON ((-84.10549 36.11168, -84.10543 36.111..."


In [None]:
# save ZCTA shapefile after dropping columns
# ONLY RUN ONCE
zcta_shapes = zcta_shapes.drop(columns=["CLASSFP20", "MTFCC20", "FUNCSTAT20", "ALAND20", "AWATER20"])

# shp_file = dict[fname].set_geometry('source_centroid')
save_dir = os.path.join(abs_path, 'data')        
save_path = os.path.join(save_dir, "tl_2023_us_zcta520_clean")
create_dir(save_path)

zcta_shapes.to_file(save_path, driver='ESRI Shapefile')

In [7]:
zcta_crosswalk['ZCTA5CE20'] = zcta_crosswalk.GEOID_ZCTA5_20.apply(lambda x: str(x).rstrip(".0") if ".0" in str(x) else str(x))
# convert to string without trailing zeros and decimal

zcta_crosswalk

Unnamed: 0,GEOID_ZCTA5_20,NAMELSAD_ZCTA5_20,OID_COUNTY_20,GEOID_COUNTY_20,NAMELSAD_COUNTY_20,ZCTA5CE20
0,,,27590114112812,1003,Baldwin County,
1,,,2759099719300,1007,Bibb County,
2,,,27590103020886,1015,Calhoun County,
3,,,27590336389978,1021,Chilton County,
4,,,2759075862059,1025,Clarke County,
...,...,...,...,...,...,...
47858,99923.0,ZCTA5 99923,275903025880880,2198,Prince of Wales-Hyder Census Area,99923
47859,99925.0,ZCTA5 99925,275903025880880,2198,Prince of Wales-Hyder Census Area,99925
47860,99926.0,ZCTA5 99926,275903025880880,2198,Prince of Wales-Hyder Census Area,99926
47861,99927.0,ZCTA5 99927,275903025880880,2198,Prince of Wales-Hyder Census Area,99927


In [8]:
zcta_county_name = zcta_shapes.merge(zcta_crosswalk, how="left", on="ZCTA5CE20")
zcta_county_name['zip'] = zcta_county_name.ZCTA5CE20
zcta_county_name

Unnamed: 0,ZCTA5CE20,GEOID20,GEOIDFQ20,INTPTLAT20,INTPTLON20,geometry,GEOID_ZCTA5_20,NAMELSAD_ZCTA5_20,OID_COUNTY_20,GEOID_COUNTY_20,NAMELSAD_COUNTY_20,zip
0,47236,47236,860Z200US47236,+39.1517426,-085.7252769,"POLYGON ((-85.73410 39.15597, -85.72794 39.156...",47236.0,ZCTA5 47236,2.759010e+13,18005.0,Bartholomew County,47236
1,47870,47870,860Z200US47870,+39.3701518,-087.4735141,"POLYGON ((-87.47415 39.37016, -87.47410 39.370...",,,,,,47870
2,47851,47851,860Z200US47851,+39.5735839,-087.2459559,"POLYGON ((-87.24769 39.57450, -87.24711 39.574...",47851.0,ZCTA5 47851,2.759035e+13,18167.0,Vigo County,47851
3,47337,47337,860Z200US47337,+39.8027537,-085.4372850,"POLYGON ((-85.44356 39.80328, -85.44345 39.803...",47337.0,ZCTA5 47337,2.759011e+13,18065.0,Henry County,47337
4,47435,47435,860Z200US47435,+39.2657557,-086.2951577,"POLYGON ((-86.29592 39.26547, -86.29592 39.266...",47435.0,ZCTA5 47435,2.759010e+13,18013.0,Brown County,47435
...,...,...,...,...,...,...,...,...,...,...,...,...
45178,37754,37754,860Z200US37754,+36.1390993,-084.0298007,"POLYGON ((-84.10549 36.11168, -84.10543 36.111...",37754.0,ZCTA5 37754,2.759026e+13,47001.0,Anderson County,37754
45179,37754,37754,860Z200US37754,+36.1390993,-084.0298007,"POLYGON ((-84.10549 36.11168, -84.10543 36.111...",37754.0,ZCTA5 37754,2.759023e+13,47093.0,Knox County,37754
45180,37754,37754,860Z200US37754,+36.1390993,-084.0298007,"POLYGON ((-84.10549 36.11168, -84.10543 36.111...",37754.0,ZCTA5 37754,2.759054e+13,47173.0,Union County,37754
45181,37806,37806,860Z200US37806,+36.0846931,-083.7279865,"MULTIPOLYGON (((-83.78542 36.08103, -83.78461 ...",37806.0,ZCTA5 37806,2.759022e+13,47057.0,Grainger County,37806


In [14]:
# add point geometry column that matches expected gpd format
omop_county_full['geometry'] = omop_county_full.apply(lambda x: "POINT (" + str(x.longitude) + " " + str(x.latitude) + ")", axis=1)

# convert to geopandas DataFrame and set `geometry` centroid 
omop_county_gdf = gpd.GeoDataFrame(omop_county_full, geometry=omop_county_full['geometry'].apply(shapely.wkt.loads))
omop_county_gdf.crs = "EPSG:4326"
omop_county_gdf

Unnamed: 0.1,Unnamed: 0,Location_id,address_1,address_2,city,state,zip,county,location_source_value,latitude,longitude,address_type,state_abbr,flag,geometry
0,0,1,523 E Broadway,,South Boston,Massachusetts,02127,Suffolk,"523 E BROADWAY, SOUTH BOSTON, MA 02127",42.335472,-71.043522,Street Address,MA,SUCCESSFUL ADDRESS,POINT (-71.04352 42.33547)
1,1,2,454 Essex St,,Lawrence,Massachusetts,01840,Essex,"454 ESSEX ST, LAWRENCE, MA 01840",42.706213,-71.164940,Street Address,MA,SUCCESSFUL ADDRESS,POINT (-71.16494 42.70621)
2,2,3,569 Broadway,,Newark,New Jersey,07104,Essex,"569 BROADWAY, NEWARK, NJ 07104",40.769935,-74.162821,Street Address,NJ,SUCCESSFUL ADDRESS,POINT (-74.16282 40.76993)
3,3,4,3210 Southwestern Blvd,,Orchard Park,New York,14127,Erie,"3210 SOUTHWESTERN BLVD, ORCHARD PARK, NY 14127",42.798535,-78.747816,Street Address,NY,SUCCESSFUL ADDRESS,POINT (-78.74782 42.79853)
4,4,5,431 Campground Rd,,Livermore Fls,Maine,04254,Androscoggin,"431 CAMPGROUND RD, LIVERMORE FLS, ME 04254",44.428610,-70.113777,Street Address,ME,SUCCESSFUL ADDRESS,POINT (-70.11378 44.42861)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78795,78795,104098,1300 Peachtree Industrial Boulevard,,Suwanee,Georgia,30024,Gwinnett,"1300 PEACHTREE INDUSTRIAL BOULEVARD, SUWANEE, ...",34.041727,-84.095174,Street Address,GA,SUCCESSFUL ADDRESS,POINT (-84.09517 34.04173)
78796,78796,104099,2660 Satellite Boulevard Northwest,,Duluth,Georgia,30096,Gwinnett,"2660 SATELLITE BOULEVARD NORTHWEST, DULUTH, GA...",33.966797,-84.101318,Street Address,GA,SUCCESSFUL ADDRESS,POINT (-84.10132 33.96680)
78797,78797,104100,3685 Braselton Highway,,Dacula,Georgia,30019,Gwinnett,"3685 BRASELTON HIGHWAY, DACULA, GA 30019",34.068832,-83.902215,Street Address,GA,SUCCESSFUL ADDRESS,POINT (-83.90222 34.06883)
78798,78798,104101,1055 Dove Run Road,,Lexington,Kentucky,40502,Fayette,"1055 DOVE RUN ROAD, LEXINGTON, KY 40502",37.996508,-84.494106,Street Address,KY,SUCCESSFUL ADDRESS,POINT (-84.49411 37.99651)


In [36]:
# spatial join omop county file to zcta
omop_county_zcta = omop_county_gdf.sjoin(zcta_county_name, how='left')
omop_county_zcta["county"] =  omop_county_zcta.county.apply(lambda x: str(x) + " County")
omop_county_zcta

Unnamed: 0.1,Unnamed: 0,Location_id,address_1,address_2,city,state,zip_left,county,location_source_value,latitude,...,GEOID20,GEOIDFQ20,INTPTLAT20,INTPTLON20,GEOID_ZCTA5_20,NAMELSAD_ZCTA5_20,OID_COUNTY_20,GEOID_COUNTY_20,NAMELSAD_COUNTY_20,zip_right
0,0,1,523 E Broadway,,South Boston,Massachusetts,02127,Suffolk County,"523 E BROADWAY, SOUTH BOSTON, MA 02127",42.335472,...,02127,860Z200US02127,+42.3353007,-071.0382163,,,,,,02127
1,1,2,454 Essex St,,Lawrence,Massachusetts,01840,Essex County,"454 ESSEX ST, LAWRENCE, MA 01840",42.706213,...,01840,860Z200US01840,+42.7067633,-071.1604026,,,,,,01840
2,2,3,569 Broadway,,Newark,New Jersey,07104,Essex County,"569 BROADWAY, NEWARK, NJ 07104",40.769935,...,07104,860Z200US07104,+40.7677132,-074.1683498,,,,,,07104
3,3,4,3210 Southwestern Blvd,,Orchard Park,New York,14127,Erie County,"3210 SOUTHWESTERN BLVD, ORCHARD PARK, NY 14127",42.798535,...,14127,860Z200US14127,+42.7528037,-078.7396939,14127.0,ZCTA5 14127,2.759012e+13,36029.0,Erie County,14127
4,4,5,431 Campground Rd,,Livermore Fls,Maine,04254,Androscoggin County,"431 CAMPGROUND RD, LIVERMORE FLS, ME 04254",44.428610,...,04254,860Z200US04254,+44.4453367,-070.1380761,,,,,,04254
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78796,78796,104099,2660 Satellite Boulevard Northwest,,Duluth,Georgia,30096,Gwinnett County,"2660 SATELLITE BOULEVARD NORTHWEST, DULUTH, GA...",33.966797,...,30096,860Z200US30096,+33.9743018,-084.1453842,30096.0,ZCTA5 30096,2.759044e+13,13135.0,Gwinnett County,30096
78797,78797,104100,3685 Braselton Highway,,Dacula,Georgia,30019,Gwinnett County,"3685 BRASELTON HIGHWAY, DACULA, GA 30019",34.068832,...,30019,860Z200US30019,+33.9756343,-083.8837695,30019.0,ZCTA5 30019,2.759044e+13,13135.0,Gwinnett County,30019
78797,78797,104100,3685 Braselton Highway,,Dacula,Georgia,30019,Gwinnett County,"3685 BRASELTON HIGHWAY, DACULA, GA 30019",34.068832,...,30019,860Z200US30019,+33.9756343,-083.8837695,30019.0,ZCTA5 30019,2.759042e+13,13297.0,Walton County,30019
78798,78798,104101,1055 Dove Run Road,,Lexington,Kentucky,40502,Fayette County,"1055 DOVE RUN ROAD, LEXINGTON, KY 40502",37.996508,...,40502,860Z200US40502,+38.0109529,-084.4831908,40502.0,ZCTA5 40502,2.759010e+13,21067.0,Fayette County,40502


In [37]:
omop_county_zcta["county_match"] = omop_county_zcta.apply(lambda x: 1 if x.county == x.NAMELSAD_COUNTY_20 else 0, axis=1)
omop_county_zcta

Unnamed: 0.1,Unnamed: 0,Location_id,address_1,address_2,city,state,zip_left,county,location_source_value,latitude,...,GEOIDFQ20,INTPTLAT20,INTPTLON20,GEOID_ZCTA5_20,NAMELSAD_ZCTA5_20,OID_COUNTY_20,GEOID_COUNTY_20,NAMELSAD_COUNTY_20,zip_right,county_match
0,0,1,523 E Broadway,,South Boston,Massachusetts,02127,Suffolk County,"523 E BROADWAY, SOUTH BOSTON, MA 02127",42.335472,...,860Z200US02127,+42.3353007,-071.0382163,,,,,,02127,0
1,1,2,454 Essex St,,Lawrence,Massachusetts,01840,Essex County,"454 ESSEX ST, LAWRENCE, MA 01840",42.706213,...,860Z200US01840,+42.7067633,-071.1604026,,,,,,01840,0
2,2,3,569 Broadway,,Newark,New Jersey,07104,Essex County,"569 BROADWAY, NEWARK, NJ 07104",40.769935,...,860Z200US07104,+40.7677132,-074.1683498,,,,,,07104,0
3,3,4,3210 Southwestern Blvd,,Orchard Park,New York,14127,Erie County,"3210 SOUTHWESTERN BLVD, ORCHARD PARK, NY 14127",42.798535,...,860Z200US14127,+42.7528037,-078.7396939,14127.0,ZCTA5 14127,2.759012e+13,36029.0,Erie County,14127,1
4,4,5,431 Campground Rd,,Livermore Fls,Maine,04254,Androscoggin County,"431 CAMPGROUND RD, LIVERMORE FLS, ME 04254",44.428610,...,860Z200US04254,+44.4453367,-070.1380761,,,,,,04254,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78796,78796,104099,2660 Satellite Boulevard Northwest,,Duluth,Georgia,30096,Gwinnett County,"2660 SATELLITE BOULEVARD NORTHWEST, DULUTH, GA...",33.966797,...,860Z200US30096,+33.9743018,-084.1453842,30096.0,ZCTA5 30096,2.759044e+13,13135.0,Gwinnett County,30096,1
78797,78797,104100,3685 Braselton Highway,,Dacula,Georgia,30019,Gwinnett County,"3685 BRASELTON HIGHWAY, DACULA, GA 30019",34.068832,...,860Z200US30019,+33.9756343,-083.8837695,30019.0,ZCTA5 30019,2.759044e+13,13135.0,Gwinnett County,30019,1
78797,78797,104100,3685 Braselton Highway,,Dacula,Georgia,30019,Gwinnett County,"3685 BRASELTON HIGHWAY, DACULA, GA 30019",34.068832,...,860Z200US30019,+33.9756343,-083.8837695,30019.0,ZCTA5 30019,2.759042e+13,13297.0,Walton County,30019,0
78798,78798,104101,1055 Dove Run Road,,Lexington,Kentucky,40502,Fayette County,"1055 DOVE RUN ROAD, LEXINGTON, KY 40502",37.996508,...,860Z200US40502,+38.0109529,-084.4831908,40502.0,ZCTA5 40502,2.759010e+13,21067.0,Fayette County,40502,1


In [38]:
omop_county_zcta.columns

Index(['Unnamed: 0', 'Location_id', 'address_1', 'address_2', 'city', 'state',
       'zip_left', 'county', 'location_source_value', 'latitude', 'longitude',
       'address_type', 'state_abbr', 'flag', 'geometry', 'index_right',
       'ZCTA5CE20', 'GEOID20', 'GEOIDFQ20', 'INTPTLAT20', 'INTPTLON20',
       'GEOID_ZCTA5_20', 'NAMELSAD_ZCTA5_20', 'OID_COUNTY_20',
       'GEOID_COUNTY_20', 'NAMELSAD_COUNTY_20', 'zip_right', 'county_match'],
      dtype='object')

In [39]:
omop_county_zcta = omop_county_zcta.drop(columns = ["GEOIDFQ20", "INTPTLAT20", "INTPTLON20", "NAMELSAD_ZCTA5_20", "OID_COUNTY_20", "zip_right" ,"Unnamed: 0", "GEOID_ZCTA5_20", "GEOID_COUNTY_20", "index_right"])
omop_county_zcta = omop_county_zcta.rename({"zip_left": "zip"})
omop_county_zcta

Unnamed: 0,Location_id,address_1,address_2,city,state,zip_left,county,location_source_value,latitude,longitude,address_type,state_abbr,flag,geometry,ZCTA5CE20,GEOID20,NAMELSAD_COUNTY_20,county_match
0,1,523 E Broadway,,South Boston,Massachusetts,02127,Suffolk County,"523 E BROADWAY, SOUTH BOSTON, MA 02127",42.335472,-71.043522,Street Address,MA,SUCCESSFUL ADDRESS,POINT (-71.04352 42.33547),02127,02127,,0
1,2,454 Essex St,,Lawrence,Massachusetts,01840,Essex County,"454 ESSEX ST, LAWRENCE, MA 01840",42.706213,-71.164940,Street Address,MA,SUCCESSFUL ADDRESS,POINT (-71.16494 42.70621),01840,01840,,0
2,3,569 Broadway,,Newark,New Jersey,07104,Essex County,"569 BROADWAY, NEWARK, NJ 07104",40.769935,-74.162821,Street Address,NJ,SUCCESSFUL ADDRESS,POINT (-74.16282 40.76993),07104,07104,,0
3,4,3210 Southwestern Blvd,,Orchard Park,New York,14127,Erie County,"3210 SOUTHWESTERN BLVD, ORCHARD PARK, NY 14127",42.798535,-78.747816,Street Address,NY,SUCCESSFUL ADDRESS,POINT (-78.74782 42.79853),14127,14127,Erie County,1
4,5,431 Campground Rd,,Livermore Fls,Maine,04254,Androscoggin County,"431 CAMPGROUND RD, LIVERMORE FLS, ME 04254",44.428610,-70.113777,Street Address,ME,SUCCESSFUL ADDRESS,POINT (-70.11378 44.42861),04254,04254,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78796,104099,2660 Satellite Boulevard Northwest,,Duluth,Georgia,30096,Gwinnett County,"2660 SATELLITE BOULEVARD NORTHWEST, DULUTH, GA...",33.966797,-84.101318,Street Address,GA,SUCCESSFUL ADDRESS,POINT (-84.10132 33.96680),30096,30096,Gwinnett County,1
78797,104100,3685 Braselton Highway,,Dacula,Georgia,30019,Gwinnett County,"3685 BRASELTON HIGHWAY, DACULA, GA 30019",34.068832,-83.902215,Street Address,GA,SUCCESSFUL ADDRESS,POINT (-83.90222 34.06883),30019,30019,Gwinnett County,1
78797,104100,3685 Braselton Highway,,Dacula,Georgia,30019,Gwinnett County,"3685 BRASELTON HIGHWAY, DACULA, GA 30019",34.068832,-83.902215,Street Address,GA,SUCCESSFUL ADDRESS,POINT (-83.90222 34.06883),30019,30019,Walton County,0
78798,104101,1055 Dove Run Road,,Lexington,Kentucky,40502,Fayette County,"1055 DOVE RUN ROAD, LEXINGTON, KY 40502",37.996508,-84.494106,Street Address,KY,SUCCESSFUL ADDRESS,POINT (-84.49411 37.99651),40502,40502,Fayette County,1


In [40]:
omop_county_zcta.county_match.value_counts()

county_match
1    65697
0    36157
Name: count, dtype: int64

In [28]:
omop_county_zcta_match.columns

Index(['Unnamed: 0', 'Location_id', 'address_1', 'address_2', 'city', 'state',
       'zip_left', 'county', 'location_source_value', 'latitude', 'longitude',
       'address_type', 'state_abbr', 'flag', 'geometry', 'index_right',
       'ZCTA5CE20', 'GEOID20', 'GEOIDFQ20', 'INTPTLAT20', 'INTPTLON20',
       'GEOID_ZCTA5_20', 'NAMELSAD_ZCTA5_20', 'OID_COUNTY_20',
       'GEOID_COUNTY_20', 'NAMELSAD_COUNTY_20', 'zip_right', 'county_match'],
      dtype='object')

In [58]:
omop_county_zcta_match = omop_county_zcta.loc[omop_county_zcta.county_match == 1]
# omop_county_zcta_match = omop_county_zcta_match.drop(columns = ["GEOIDFQ20", "INTPTLAT20", "INTPTLON20", "NAMELSAD_ZCTA5_20", "OID_COUNTY_20", "zip_right" ,"Unnamed: 0", "GEOID_ZCTA5_20", "GEOID_COUNTY_20"])
omop_county_zcta_match = omop_county_zcta_match.rename({"zip_left": "zip"})
omop_county_zcta_match

Unnamed: 0,Location_id,address_1,address_2,city,state,zip_left,county,location_source_value,latitude,longitude,address_type,state_abbr,flag,geometry,ZCTA5CE20,GEOID20,NAMELSAD_COUNTY_20,county_match
3,4,3210 Southwestern Blvd,,Orchard Park,New York,14127,Erie County,"3210 SOUTHWESTERN BLVD, ORCHARD PARK, NY 14127",42.798535,-78.747816,Street Address,NY,SUCCESSFUL ADDRESS,POINT (-78.74782 42.79853),14127,14127,Erie County,1
192,263,562 Wingate Dr,,East Meadow,New York,11554,Nassau County,"562 WINGATE DR, EAST MEADOW, NY 11554",40.710113,-73.528389,Street Address,NY,SUCCESSFUL ADDRESS,POINT (-73.52839 40.71011),11554,11554,Nassau County,1
195,279,8 Zoar Ave,,Albany,New York,12209,Albany County,"8 ZOAR AVE, ALBANY, NY 12209",42.639067,-73.788991,Street Address,NY,SUCCESSFUL ADDRESS,POINT (-73.78899 42.63907),12209,12209,Albany County,1
200,300,15402 41St Ave,,Flushing,New York,11354,Queens County,"15402 41ST AVE, FLUSHING, NY 11354",40.763003,-73.810944,Street Address,NY,SUCCESSFUL ADDRESS,POINT (-73.81094 40.76300),11354,11354,Queens County,1
201,305,5110 19Th Ave,,Brooklyn,New York,11204,Kings County,"5110 19TH AVE, BROOKLYN, NY 11204",40.625623,-73.980013,Street Address,NY,SUCCESSFUL ADDRESS,POINT (-73.98001 40.62562),11204,11204,Kings County,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78795,104098,1300 Peachtree Industrial Boulevard,,Suwanee,Georgia,30024,Gwinnett County,"1300 PEACHTREE INDUSTRIAL BOULEVARD, SUWANEE, ...",34.041727,-84.095174,Street Address,GA,SUCCESSFUL ADDRESS,POINT (-84.09517 34.04173),30024,30024,Gwinnett County,1
78796,104099,2660 Satellite Boulevard Northwest,,Duluth,Georgia,30096,Gwinnett County,"2660 SATELLITE BOULEVARD NORTHWEST, DULUTH, GA...",33.966797,-84.101318,Street Address,GA,SUCCESSFUL ADDRESS,POINT (-84.10132 33.96680),30096,30096,Gwinnett County,1
78797,104100,3685 Braselton Highway,,Dacula,Georgia,30019,Gwinnett County,"3685 BRASELTON HIGHWAY, DACULA, GA 30019",34.068832,-83.902215,Street Address,GA,SUCCESSFUL ADDRESS,POINT (-83.90222 34.06883),30019,30019,Gwinnett County,1
78798,104101,1055 Dove Run Road,,Lexington,Kentucky,40502,Fayette County,"1055 DOVE RUN ROAD, LEXINGTON, KY 40502",37.996508,-84.494106,Street Address,KY,SUCCESSFUL ADDRESS,POINT (-84.49411 37.99651),40502,40502,Fayette County,1


In [59]:
# capitalize only first letter
omop_county_zcta_match['address_1'] = omop_county_zcta_match.address_1.apply(lambda x: str(x).strip().title())
omop_county_zcta_match['address_2'] = omop_county_zcta_match.address_2.apply(lambda x: str(x).strip().title() if not np.NaN else x)
omop_county_zcta_match['city'] = omop_county_zcta_match.city.apply(lambda x: str(x).strip().title())
# omop_county_zcta_match['location_source_value'] = omop_county_zcta_match.location_source_value.apply(lambda x: str(x).strip().title())
omop_county_zcta_match

Unnamed: 0,Location_id,address_1,address_2,city,state,zip_left,county,location_source_value,latitude,longitude,address_type,state_abbr,flag,geometry,ZCTA5CE20,GEOID20,NAMELSAD_COUNTY_20,county_match
3,4,3210 Southwestern Blvd,,Orchard Park,New York,14127,Erie County,"3210 SOUTHWESTERN BLVD, ORCHARD PARK, NY 14127",42.798535,-78.747816,Street Address,NY,SUCCESSFUL ADDRESS,POINT (-78.74782 42.79853),14127,14127,Erie County,1
192,263,562 Wingate Dr,,East Meadow,New York,11554,Nassau County,"562 WINGATE DR, EAST MEADOW, NY 11554",40.710113,-73.528389,Street Address,NY,SUCCESSFUL ADDRESS,POINT (-73.52839 40.71011),11554,11554,Nassau County,1
195,279,8 Zoar Ave,,Albany,New York,12209,Albany County,"8 ZOAR AVE, ALBANY, NY 12209",42.639067,-73.788991,Street Address,NY,SUCCESSFUL ADDRESS,POINT (-73.78899 42.63907),12209,12209,Albany County,1
200,300,15402 41St Ave,,Flushing,New York,11354,Queens County,"15402 41ST AVE, FLUSHING, NY 11354",40.763003,-73.810944,Street Address,NY,SUCCESSFUL ADDRESS,POINT (-73.81094 40.76300),11354,11354,Queens County,1
201,305,5110 19Th Ave,,Brooklyn,New York,11204,Kings County,"5110 19TH AVE, BROOKLYN, NY 11204",40.625623,-73.980013,Street Address,NY,SUCCESSFUL ADDRESS,POINT (-73.98001 40.62562),11204,11204,Kings County,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78795,104098,1300 Peachtree Industrial Boulevard,,Suwanee,Georgia,30024,Gwinnett County,"1300 PEACHTREE INDUSTRIAL BOULEVARD, SUWANEE, ...",34.041727,-84.095174,Street Address,GA,SUCCESSFUL ADDRESS,POINT (-84.09517 34.04173),30024,30024,Gwinnett County,1
78796,104099,2660 Satellite Boulevard Northwest,,Duluth,Georgia,30096,Gwinnett County,"2660 SATELLITE BOULEVARD NORTHWEST, DULUTH, GA...",33.966797,-84.101318,Street Address,GA,SUCCESSFUL ADDRESS,POINT (-84.10132 33.96680),30096,30096,Gwinnett County,1
78797,104100,3685 Braselton Highway,,Dacula,Georgia,30019,Gwinnett County,"3685 BRASELTON HIGHWAY, DACULA, GA 30019",34.068832,-83.902215,Street Address,GA,SUCCESSFUL ADDRESS,POINT (-83.90222 34.06883),30019,30019,Gwinnett County,1
78798,104101,1055 Dove Run Road,,Lexington,Kentucky,40502,Fayette County,"1055 DOVE RUN ROAD, LEXINGTON, KY 40502",37.996508,-84.494106,Street Address,KY,SUCCESSFUL ADDRESS,POINT (-84.49411 37.99651),40502,40502,Fayette County,1


In [41]:
OMOP_county_zcta_path = os.path.join(abs_path, 'output', 'OMOP_county_zcta.csv')
omop_county_zcta.to_csv(OMOP_county_zcta_path, index=False)

In [60]:
OMOP_county_zcta_match_path = os.path.join(abs_path, 'output', 'OMOP_county_zcta_match.csv')
omop_county_zcta_match.to_csv(OMOP_county_zcta_match_path, index=False)

In [95]:
# merge on zip
omop_county_zcta_zip = omop_county_gdf.merge(zcta_county_name, how='left', on='zip')
omop_county_zcta_zip["county"] =  omop_county_zcta_zip.county.apply(lambda x: str(x) + " County")
omop_county_zcta_zip["county_match"] = omop_county_zcta_zip.apply(lambda x: 1 if x.county == x.NAMELSAD_COUNTY_20 else 0, axis=1)

# omop_county_zcta_zip = omop_county_zcta_zip.loc[omop_county_zcta_zip.county_match == 1]
omop_county_zcta_zip

Unnamed: 0.1,Unnamed: 0,Location_id,address_1,address_2,city,state,zip,county,location_source_value,latitude,...,GEOIDFQ20,INTPTLAT20,INTPTLON20,geometry_y,GEOID_ZCTA5_20,NAMELSAD_ZCTA5_20,OID_COUNTY_20,GEOID_COUNTY_20,NAMELSAD_COUNTY_20,county_match
0,0,1,523 E Broadway,,South Boston,Massachusetts,02127,Suffolk County,"523 E BROADWAY, SOUTH BOSTON, MA 02127",42.335472,...,860Z200US02127,+42.3353007,-071.0382163,"POLYGON ((-71.06355 42.33079, -71.06291 42.331...",,,,,,0
1,1,2,454 Essex St,,Lawrence,Massachusetts,01840,Essex County,"454 ESSEX ST, LAWRENCE, MA 01840",42.706213,...,860Z200US01840,+42.7067633,-071.1604026,"POLYGON ((-71.17248 42.70787, -71.17261 42.707...",,,,,,0
2,2,3,569 Broadway,,Newark,New Jersey,07104,Essex County,"569 BROADWAY, NEWARK, NJ 07104",40.769935,...,860Z200US07104,+40.7677132,-074.1683498,"POLYGON ((-74.18521 40.75686, -74.18516 40.757...",,,,,,0
3,3,4,3210 Southwestern Blvd,,Orchard Park,New York,14127,Erie County,"3210 SOUTHWESTERN BLVD, ORCHARD PARK, NY 14127",42.798535,...,860Z200US14127,+42.7528037,-078.7396939,"POLYGON ((-78.80954 42.77168, -78.80828 42.772...",14127.0,ZCTA5 14127,2.759012e+13,36029.0,Erie County,1
4,4,5,431 Campground Rd,,Livermore Fls,Maine,04254,Androscoggin County,"431 CAMPGROUND RD, LIVERMORE FLS, ME 04254",44.428610,...,860Z200US04254,+44.4453367,-070.1380761,"POLYGON ((-70.19976 44.47826, -70.19885 44.478...",,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96107,78795,104098,1300 Peachtree Industrial Boulevard,,Suwanee,Georgia,30024,Gwinnett County,"1300 PEACHTREE INDUSTRIAL BOULEVARD, SUWANEE, ...",34.041727,...,,,,,,,,,,0
96108,78796,104099,2660 Satellite Boulevard Northwest,,Duluth,Georgia,30096,Gwinnett County,"2660 SATELLITE BOULEVARD NORTHWEST, DULUTH, GA...",33.966797,...,,,,,,,,,,0
96109,78797,104100,3685 Braselton Highway,,Dacula,Georgia,30019,Gwinnett County,"3685 BRASELTON HIGHWAY, DACULA, GA 30019",34.068832,...,,,,,,,,,,0
96110,78798,104101,1055 Dove Run Road,,Lexington,Kentucky,40502,Fayette County,"1055 DOVE RUN ROAD, LEXINGTON, KY 40502",37.996508,...,,,,,,,,,,0


In [97]:
omop_county_zcta_zip_match.columns

Index(['Unnamed: 0', 'Location_id', 'address_1', 'address_2', 'city', 'state',
       'zip', 'county', 'location_source_value', 'latitude', 'longitude',
       'address_type', 'state_abbr', 'flag', 'geometry_x', 'ZCTA5CE20',
       'GEOID20', 'GEOIDFQ20', 'INTPTLAT20', 'INTPTLON20', 'geometry_y',
       'GEOID_ZCTA5_20', 'NAMELSAD_ZCTA5_20', 'OID_COUNTY_20',
       'GEOID_COUNTY_20', 'NAMELSAD_COUNTY_20', 'county_match'],
      dtype='object')

In [99]:
omop_county_zcta_zip_match = omop_county_zcta_zip.loc[omop_county_zcta_zip.county_match == 1]
omop_county_zcta_zip_match = omop_county_zcta_zip_match.drop(columns = ["GEOIDFQ20", "INTPTLAT20", "INTPTLON20", "NAMELSAD_ZCTA5_20", "OID_COUNTY_20", 
                                                                        "Unnamed: 0", "GEOID_ZCTA5_20", "GEOID_COUNTY_20", "geometry_y", "GEOID20"])
omop_county_zcta_zip_match = omop_county_zcta_zip_match.rename({"geometry_x": "geometry"})
omop_county_zcta_zip_match

Unnamed: 0,Location_id,address_1,address_2,city,state,zip,county,location_source_value,latitude,longitude,address_type,state_abbr,flag,geometry_x,ZCTA5CE20,NAMELSAD_COUNTY_20,county_match
3,4,3210 Southwestern Blvd,,Orchard Park,New York,14127,Erie County,"3210 SOUTHWESTERN BLVD, ORCHARD PARK, NY 14127",42.798535,-78.747816,Street Address,NY,SUCCESSFUL ADDRESS,POINT (-78.74782 42.79853),14127,Erie County,1
192,263,562 Wingate Dr,,East Meadow,New York,11554,Nassau County,"562 WINGATE DR, EAST MEADOW, NY 11554",40.710113,-73.528389,Street Address,NY,SUCCESSFUL ADDRESS,POINT (-73.52839 40.71011),11554,Nassau County,1
195,279,8 Zoar Ave,,Albany,New York,12209,Albany County,"8 ZOAR AVE, ALBANY, NY 12209",42.639067,-73.788991,Street Address,NY,SUCCESSFUL ADDRESS,POINT (-73.78899 42.63907),12209,Albany County,1
200,300,15402 41St Ave,,Flushing,New York,11354,Queens County,"15402 41ST AVE, FLUSHING, NY 11354",40.763003,-73.810944,Street Address,NY,SUCCESSFUL ADDRESS,POINT (-73.81094 40.76300),11354,Queens County,1
201,305,5110 19Th Ave,,Brooklyn,New York,11204,Kings County,"5110 19TH AVE, BROOKLYN, NY 11204",40.625623,-73.980013,Street Address,NY,SUCCESSFUL ADDRESS,POINT (-73.98001 40.62562),11204,Kings County,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82843,85594,5000 N Bowes Rd,,Tucson,Arizona,85749,Pima County,"5000 N BOWES RD, TUCSON, AZ 85749",32.298797,-110.788349,Street Address,AZ,SUCCESSFUL ADDRESS,POINT (-110.78835 32.29880),85749,Pima County,1
82844,85596,1303 Paseo Del Canon East,,Taos,New Mexico,87571,Taos County,"1303 PASEO DEL CANON EAST, TAOS, NM 87571",36.375292,-105.568573,Street Address,NM,SUCCESSFUL ADDRESS,POINT (-105.56857 36.37529),87571,Taos County,1
82845,85599,12606 E Main St,,Mayer,Arizona,86333,Yavapai County,"12606 E MAIN ST, MAYER, AZ 86333",34.346322,-112.157189,Street Address,AZ,SUCCESSFUL ADDRESS,POINT (-112.15719 34.34632),86333,Yavapai County,1
82846,85600,4300 Blake Rd Sw,,Albuquerque,New Mexico,87121,Bernalillo County,"4300 BLAKE RD SW, ALBUQUERQUE, NM 87121",35.037163,-106.723264,Street Address,NM,SUCCESSFUL ADDRESS,POINT (-106.72326 35.03716),87121,Bernalillo County,1


In [100]:
OMOP_county_zcta_zip_match_path = os.path.join(abs_path, 'output', 'OMOP_county_zcta_zip_match.csv')
omop_county_zcta_zip_match.to_csv(OMOP_county_zcta_zip_match_path, index=False)

# Nominatim Parsing
1. Parse base Nominatim components with `usaddress`
2. Keep only the components needed for Nominatim
3. Merge the Nominatim components to the Publication 28 (OMOP) components
4. Convert all abbreviations for cardinal directions and street type to full strings
5. Join Nominatim components to a full address string

In [4]:
from requests.structures import CaseInsensitiveDict

# for converting cardinal direction abbreviations to full string
cardinal_directions_to_full = CaseInsensitiveDict({"N": "North", "E": "East", "S": "South", "W": "West", 
                                                  "NE": "Northeast", "SE": "Southeast", "NW": "Northwest", "SW": "Southwest"})

# for converting street type abbreviations to full string
street_suffix_to_full = CaseInsensitiveDict({
    'aly': 'Alley',
    'ave': 'Avenue',
    'blvd': 'Boulevard',
    'cir': 'Circle',
    'ct': 'Court',
    'dr': 'Drive',
    'expy': 'Expressway',
    'hwy': 'Highway',
    'ln': 'Lane',
    'pkwy': 'Parkway',
    'pl': 'Place',
    'rd': 'Road',
    'sq': 'Square',
    'st': 'Street',
    'ter': 'Terrace',
    'trl': 'Trail',
    'way': 'Way'
})

In [16]:
# function for case insensitive dictionary replacement using Regex
def replace_words(text, word_dict):
    for key, value in word_dict.items():
        # Use case-insensitive regex for replacement
        text = re.sub(r'\b'+key+'\b', value, text, flags=re.IGNORECASE, count=1)
    return text

def multipleReplace(text, wordDict):
    for key in wordDict:
        text = text.replace(key, wordDict[key], 1)
    return text

def cardinal_direction_lambda(x, word_dict):
    """
    Pass cardinal_directions_to_full
    """
    if str(x) == "nan":
        return np.NaN
    elif x in word_dict.keys():
        return word_dict.get(str(x))
    else:
        return x

def street_suffix_lambda(x, word_dict):
    """
    Pass street_suffix_to_full
    """
    if str(x) == "nan" or str(x) == "Nan":
        return np.NaN
    # elif len(str(x)) > 2:
    #     return str(x)
    elif x in word_dict.keys():
        return word_dict.get(str(x))
    else:
        return x

def replace_ordinal_numbers(text):
    """
    Replace ordinal numbers with full string
    """
    re_results = re.findall('(\d+(st|nd|rd|th))', text)
    for enitre_result, suffix in re_results:
        num = int(enitre_result[:-len(suffix)])
        text = text.replace(enitre_result, num2words(num, ordinal=True))
    return text

In [6]:
# import omop county file that has both zcta and spatial join match by county name
OMOP_county_zcta_zip_match_path = os.path.join(abs_path, 'output', 'OMOP_county_zcta_zip_match.csv')
omop_county_zcta_zip_match = pd.read_csv(OMOP_county_zcta_zip_match_path)
omop_county_zcta_zip_match = omop_county_zcta_zip_match.rename(columns={"geometry_x": "geometry", "Location_id": "location_id"})
omop_county_zcta_zip_match['location_id'] = omop_county_zcta_zip_match.index+1

In [7]:
omop_county_zcta_zip_match

Unnamed: 0,location_id,address_1,address_2,city,state,zip,county,location_source_value,latitude,longitude,address_type,state_abbr,flag,geometry,ZCTA5CE20,NAMELSAD_COUNTY_20,county_match
0,1,3210 Southwestern Blvd,,Orchard Park,New York,14127,Erie County,"3210 SOUTHWESTERN BLVD, ORCHARD PARK, NY 14127",42.798535,-78.747816,Street Address,NY,SUCCESSFUL ADDRESS,POINT (-78.74781599997843 42.79853499984358),14127,Erie County,1
1,2,562 Wingate Dr,,East Meadow,New York,11554,Nassau County,"562 WINGATE DR, EAST MEADOW, NY 11554",40.710113,-73.528389,Street Address,NY,SUCCESSFUL ADDRESS,POINT (-73.52838899973285 40.71011299966788),11554,Nassau County,1
2,3,8 Zoar Ave,,Albany,New York,12209,Albany County,"8 ZOAR AVE, ALBANY, NY 12209",42.639067,-73.788991,Street Address,NY,SUCCESSFUL ADDRESS,POINT (-73.78899100027446 42.63906699972181),12209,Albany County,1
3,4,15402 41St Ave,,Flushing,New York,11354,Queens County,"15402 41ST AVE, FLUSHING, NY 11354",40.763003,-73.810944,Street Address,NY,SUCCESSFUL ADDRESS,POINT (-73.81094400044168 40.76300300008896),11354,Queens County,1
4,5,5110 19Th Ave,,Brooklyn,New York,11204,Kings County,"5110 19TH AVE, BROOKLYN, NY 11204",40.625623,-73.980013,Street Address,NY,SUCCESSFUL ADDRESS,POINT (-73.98001299964284 40.62562299970931),11204,Kings County,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40412,40413,5000 N Bowes Rd,,Tucson,Arizona,85749,Pima County,"5000 N BOWES RD, TUCSON, AZ 85749",32.298797,-110.788349,Street Address,AZ,SUCCESSFUL ADDRESS,POINT (-110.7883488373378 32.29879689494836),85749,Pima County,1
40413,40414,1303 Paseo Del Canon East,,Taos,New Mexico,87571,Taos County,"1303 PASEO DEL CANON EAST, TAOS, NM 87571",36.375292,-105.568573,Street Address,NM,SUCCESSFUL ADDRESS,POINT (-105.568573404848 36.37529177977968),87571,Taos County,1
40414,40415,12606 E Main St,,Mayer,Arizona,86333,Yavapai County,"12606 E MAIN ST, MAYER, AZ 86333",34.346322,-112.157189,Street Address,AZ,SUCCESSFUL ADDRESS,POINT (-112.1571889686942 34.34632218671697),86333,Yavapai County,1
40415,40416,4300 Blake Rd Sw,,Albuquerque,New Mexico,87121,Bernalillo County,"4300 BLAKE RD SW, ALBUQUERQUE, NM 87121",35.037163,-106.723264,Street Address,NM,SUCCESSFUL ADDRESS,POINT (-106.7232635088362 35.03716265606487),87121,Bernalillo County,1


In [8]:
%%time

# run `usaddress` parsing with base component names and append to list
repo = []
for ind, each in omop_county_zcta_zip_match.iterrows():
    try:
        obj = usaddress.tag(each.location_source_value)
        tmp = pd.DataFrame(obj[0], columns=obj[0].keys(), index=[ind])
        tmp['Address_type'] = obj[1]
        tmp['county'] = each.county
        tmp['location_source_value'] = each.location_source_value
        tmp['location_id'] = each.location_id

        # development
        # OMOP_location.loc[ind, 'AddressNumber'] = [tmp['AddressNumber'].values[0] if len(tmp['AddressNumber'].values[0]) > 0 else np.NaN][0]
        # OMOP_location.loc[ind, 'StreetNamePreDirectional'] = [tmp['StreetNamePreDirectional'].values[0] if len(tmp['AddressNumber'].values[0]) > 0 else np.NaN][0]
        # OMOP_location.loc[ind, 'StreetNamePreType'] = [tmp['StreetNamePreType'].values[0] if len(tmp['AddressNumber'].values[0]) > 0 else np.NaN][0]
        # OMOP_location.loc[ind, 'StreetName'] = [tmp['StreetName'].values[0] if len(tmp['AddressNumber'].values[0]) > 0 else np.NaN][0]
        # OMOP_location.loc[ind, 'StreetNamePostType'] = [tmp['StreetNamePostType'].values[0] if len(tmp['AddressNumber'].values[0]) > 0 else np.NaN][0]
        # OMOP_location.loc[ind, 'StreetNamePostDirectional'] = [tmp['StreetNamePostDirectional'].values[0] if len(tmp['AddressNumber'].values[0]) > 0 else np.NaN][0]
        # OMOP_location.loc[ind, 'PlaceName'] = [tmp['PlaceName'].values[0] if len(tmp['AddressNumber'].values[0]) > 0 else np.NaN][0]
        # OMOP_location.loc[ind, 'StateName'] = [tmp['StateName'].values[0] if len(tmp['AddressNumber'].values[0]) > 0 else np.NaN][0]
        # OMOP_location.loc[ind, 'ZipCode'] = [tmp['ZipCode'].values[0] if len(tmp['AddressNumber'].values[0]) > 0 else np.NaN][0]

        # OMOP_location.loc[ind, 'AddressNumber'] = obj[0]['AddressNumber']
        # OMOP_location.loc[ind, 'StreetNamePreDirectional'] = obj[0]['StreetNamePreDirectional']
        # OMOP_location.loc[ind, 'StreetNamePreType'] = obj[0]['StreetNamePreType']
        # OMOP_location.loc[ind, 'StreetName'] = obj[0]['StreetName']
        # OMOP_location.loc[ind, 'StreetNamePostType'] = obj[0]['StreetNamePostType']
        # OMOP_location.loc[ind, 'StreetNamePostDirectional'] = obj[0]['StreetNamePostDirectional']
        # OMOP_location.loc[ind, 'PlaceName']=obj[0]['PlaceName']
        # OMOP_location.loc[ind, 'StateName']=obj[0]['StateName']
        # OMOP_location.loc[ind, 'ZipCode']=obj[0]['ZipCode']

        repo.append(tmp)

        # StreetNamePreDirectional = tmp['StreetNamePreDirectional'].values[0]
        # if not StreetNamePreDirectional:
        #     OMOP_location.loc[ind, 'StreetNamePreDirectional'] = np.NaN
        # else:
        #     OMOP_location.loc[ind, 'StreetNamePreDirectional'] = StreetNamePreDirectional

    except:
        pass

CPU times: total: 1min
Wall time: 1min 6s


In [9]:
%%time

# concatenate above list to a single dataframe and only keep address components for Nominatim
nominatim_components = pd.concat(repo)
nominatim_components[['location_id','AddressNumber', 'StreetNamePreDirectional', 'StreetNamePreType', 'StreetName', 'StreetNamePostType','StreetNamePostDirectional',
                      'PlaceName', 'county', 'StateName', 'ZipCode', 'location_source_value']]

CPU times: total: 31.9 s
Wall time: 35.2 s


Unnamed: 0,location_id,AddressNumber,StreetNamePreDirectional,StreetNamePreType,StreetName,StreetNamePostType,StreetNamePostDirectional,PlaceName,county,StateName,ZipCode,location_source_value
0,1,3210,,,SOUTHWESTERN,BLVD,,ORCHARD PARK,Erie County,NY,14127,"3210 SOUTHWESTERN BLVD, ORCHARD PARK, NY 14127"
1,2,562,,,WINGATE,DR,,EAST MEADOW,Nassau County,NY,11554,"562 WINGATE DR, EAST MEADOW, NY 11554"
2,3,8,,,ZOAR,AVE,,ALBANY,Albany County,NY,12209,"8 ZOAR AVE, ALBANY, NY 12209"
3,4,15402,,,41ST,AVE,,FLUSHING,Queens County,NY,11354,"15402 41ST AVE, FLUSHING, NY 11354"
4,5,5110,,,19TH,AVE,,BROOKLYN,Kings County,NY,11204,"5110 19TH AVE, BROOKLYN, NY 11204"
...,...,...,...,...,...,...,...,...,...,...,...,...
40412,40413,5000,N,,BOWES,RD,,TUCSON,Pima County,AZ,85749,"5000 N BOWES RD, TUCSON, AZ 85749"
40413,40414,1303,,,PASEO DEL CANON,,EAST,TAOS,Taos County,NM,87571,"1303 PASEO DEL CANON EAST, TAOS, NM 87571"
40414,40415,12606,E,,MAIN,ST,,MAYER,Yavapai County,AZ,86333,"12606 E MAIN ST, MAYER, AZ 86333"
40415,40416,4300,,,BLAKE,RD,SW,ALBUQUERQUE,Bernalillo County,NM,87121,"4300 BLAKE RD SW, ALBUQUERQUE, NM 87121"


In [10]:
# keep only columns necessary to merge to omop_county_zcta_zip_match
nominatim_keep_columns = nominatim_components[['location_id','AddressNumber', 'StreetNamePreDirectional', 'StreetNamePreType', 'StreetName', 'StreetNamePostType','StreetNamePostDirectional','county',]]
nominatim_keep_columns

Unnamed: 0,location_id,AddressNumber,StreetNamePreDirectional,StreetNamePreType,StreetName,StreetNamePostType,StreetNamePostDirectional,county
0,1,3210,,,SOUTHWESTERN,BLVD,,Erie County
1,2,562,,,WINGATE,DR,,Nassau County
2,3,8,,,ZOAR,AVE,,Albany County
3,4,15402,,,41ST,AVE,,Queens County
4,5,5110,,,19TH,AVE,,Kings County
...,...,...,...,...,...,...,...,...
40412,40413,5000,N,,BOWES,RD,,Pima County
40413,40414,1303,,,PASEO DEL CANON,,EAST,Taos County
40414,40415,12606,E,,MAIN,ST,,Yavapai County
40415,40416,4300,,,BLAKE,RD,SW,Bernalillo County


In [12]:
# create empty dataframe
nominatim_location = pd.DataFrame(columns=['location_id', 'address_1', 'address_2', 'city', 'state', 'zip', 'state_abbr', 'location_source_value','latitude','longitude', 'geometry'])

# fill in address and new location_id from omop_county_zcta_zip_match 
nominatim_location['location_source_value'] = omop_county_zcta_zip_match.location_source_value
nominatim_location['location_id'] = nominatim_location.index+1
# nominatim_location['county'] = omop_county_zcta_zip_match.county

# copy over the same columns from omop_county_zcta_zip_match
nominatim_location.address_1 = omop_county_zcta_zip_match.address_1
nominatim_location.address_2 = omop_county_zcta_zip_match.address_2
nominatim_location.city = omop_county_zcta_zip_match.city
nominatim_location.state = omop_county_zcta_zip_match.state
nominatim_location.zip = omop_county_zcta_zip_match.zip
nominatim_location.state_abbr = omop_county_zcta_zip_match.state_abbr
nominatim_location.latitude = omop_county_zcta_zip_match.latitude
nominatim_location.longitude = omop_county_zcta_zip_match.longitude
nominatim_location.geometry = omop_county_zcta_zip_match.geometry
nominatim_location

Unnamed: 0,location_id,address_1,address_2,city,state,zip,state_abbr,location_source_value,latitude,longitude,geometry
0,1,3210 Southwestern Blvd,,Orchard Park,New York,14127,NY,"3210 SOUTHWESTERN BLVD, ORCHARD PARK, NY 14127",42.798535,-78.747816,POINT (-78.74781599997843 42.79853499984358)
1,2,562 Wingate Dr,,East Meadow,New York,11554,NY,"562 WINGATE DR, EAST MEADOW, NY 11554",40.710113,-73.528389,POINT (-73.52838899973285 40.71011299966788)
2,3,8 Zoar Ave,,Albany,New York,12209,NY,"8 ZOAR AVE, ALBANY, NY 12209",42.639067,-73.788991,POINT (-73.78899100027446 42.63906699972181)
3,4,15402 41St Ave,,Flushing,New York,11354,NY,"15402 41ST AVE, FLUSHING, NY 11354",40.763003,-73.810944,POINT (-73.81094400044168 40.76300300008896)
4,5,5110 19Th Ave,,Brooklyn,New York,11204,NY,"5110 19TH AVE, BROOKLYN, NY 11204",40.625623,-73.980013,POINT (-73.98001299964284 40.62562299970931)
...,...,...,...,...,...,...,...,...,...,...,...
40412,40413,5000 N Bowes Rd,,Tucson,Arizona,85749,AZ,"5000 N BOWES RD, TUCSON, AZ 85749",32.298797,-110.788349,POINT (-110.7883488373378 32.29879689494836)
40413,40414,1303 Paseo Del Canon East,,Taos,New Mexico,87571,NM,"1303 PASEO DEL CANON EAST, TAOS, NM 87571",36.375292,-105.568573,POINT (-105.568573404848 36.37529177977968)
40414,40415,12606 E Main St,,Mayer,Arizona,86333,AZ,"12606 E MAIN ST, MAYER, AZ 86333",34.346322,-112.157189,POINT (-112.1571889686942 34.34632218671697)
40415,40416,4300 Blake Rd Sw,,Albuquerque,New Mexico,87121,NM,"4300 BLAKE RD SW, ALBUQUERQUE, NM 87121",35.037163,-106.723264,POINT (-106.7232635088362 35.03716265606487)


In [13]:
# merge omop_county_zcta_zip_match addresses to the ones that parsed fully from nominatim_keep_columns_merge_county
nominatim_keep_columns_merge_county = nominatim_keep_columns.merge(nominatim_location, how='left', on='location_id')
print(nominatim_keep_columns_merge_county.dtypes)
nominatim_keep_columns_merge_county

location_id                    int64
AddressNumber                 object
StreetNamePreDirectional      object
StreetNamePreType             object
StreetName                    object
StreetNamePostType            object
StreetNamePostDirectional     object
county                        object
address_1                     object
address_2                     object
city                          object
state                         object
zip                            int64
state_abbr                    object
location_source_value         object
latitude                     float64
longitude                    float64
geometry                      object
dtype: object


Unnamed: 0,location_id,AddressNumber,StreetNamePreDirectional,StreetNamePreType,StreetName,StreetNamePostType,StreetNamePostDirectional,county,address_1,address_2,city,state,zip,state_abbr,location_source_value,latitude,longitude,geometry
0,1,3210,,,SOUTHWESTERN,BLVD,,Erie County,3210 Southwestern Blvd,,Orchard Park,New York,14127,NY,"3210 SOUTHWESTERN BLVD, ORCHARD PARK, NY 14127",42.798535,-78.747816,POINT (-78.74781599997843 42.79853499984358)
1,2,562,,,WINGATE,DR,,Nassau County,562 Wingate Dr,,East Meadow,New York,11554,NY,"562 WINGATE DR, EAST MEADOW, NY 11554",40.710113,-73.528389,POINT (-73.52838899973285 40.71011299966788)
2,3,8,,,ZOAR,AVE,,Albany County,8 Zoar Ave,,Albany,New York,12209,NY,"8 ZOAR AVE, ALBANY, NY 12209",42.639067,-73.788991,POINT (-73.78899100027446 42.63906699972181)
3,4,15402,,,41ST,AVE,,Queens County,15402 41St Ave,,Flushing,New York,11354,NY,"15402 41ST AVE, FLUSHING, NY 11354",40.763003,-73.810944,POINT (-73.81094400044168 40.76300300008896)
4,5,5110,,,19TH,AVE,,Kings County,5110 19Th Ave,,Brooklyn,New York,11204,NY,"5110 19TH AVE, BROOKLYN, NY 11204",40.625623,-73.980013,POINT (-73.98001299964284 40.62562299970931)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40349,40413,5000,N,,BOWES,RD,,Pima County,5000 N Bowes Rd,,Tucson,Arizona,85749,AZ,"5000 N BOWES RD, TUCSON, AZ 85749",32.298797,-110.788349,POINT (-110.7883488373378 32.29879689494836)
40350,40414,1303,,,PASEO DEL CANON,,EAST,Taos County,1303 Paseo Del Canon East,,Taos,New Mexico,87571,NM,"1303 PASEO DEL CANON EAST, TAOS, NM 87571",36.375292,-105.568573,POINT (-105.568573404848 36.37529177977968)
40351,40415,12606,E,,MAIN,ST,,Yavapai County,12606 E Main St,,Mayer,Arizona,86333,AZ,"12606 E MAIN ST, MAYER, AZ 86333",34.346322,-112.157189,POINT (-112.1571889686942 34.34632218671697)
40352,40416,4300,,,BLAKE,RD,SW,Bernalillo County,4300 Blake Rd Sw,,Albuquerque,New Mexico,87121,NM,"4300 BLAKE RD SW, ALBUQUERQUE, NM 87121",35.037163,-106.723264,POINT (-106.7232635088362 35.03716265606487)


In [14]:
# replace cardinal direction abbreviations with full string and capitalize only first letter
nominatim_keep_columns_merge_county['StreetNamePreDirectional'] = nominatim_keep_columns_merge_county.StreetNamePreDirectional.apply(lambda x: np.NaN if x == np.NaN else cardinal_direction_lambda(x, cardinal_directions_to_full))
nominatim_keep_columns_merge_county['StreetNamePreDirectional'] = nominatim_keep_columns_merge_county.StreetNamePreDirectional.apply(lambda x: str(x).title())

# replace cardinal direction abbreviations with full string and capitalize only first letter
nominatim_keep_columns_merge_county['StreetNamePostDirectional'] = nominatim_keep_columns_merge_county.StreetNamePostDirectional.apply(lambda x: np.NaN if x == np.NaN else cardinal_direction_lambda(x, cardinal_directions_to_full))
nominatim_keep_columns_merge_county['StreetNamePostDirectional'] = nominatim_keep_columns_merge_county.StreetNamePostDirectional.apply(lambda x: str(x).title())

# capitalize only first letters
nominatim_keep_columns_merge_county['StreetName'] = nominatim_keep_columns_merge_county.StreetName.apply(lambda x: str(x).title())
nominatim_keep_columns_merge_county['StreetName'] = nominatim_keep_columns_merge_county.StreetName.apply(lambda x: str(x).title())

# replace street post abbreviations with full string
nominatim_keep_columns_merge_county['StreetNamePreType'] = nominatim_keep_columns_merge_county.StreetNamePreType.apply(lambda x: street_suffix_lambda(str(x).lower(), street_suffix_to_full))
nominatim_keep_columns_merge_county['StreetNamePostType'] = nominatim_keep_columns_merge_county.StreetNamePostType.apply(lambda x: street_suffix_lambda(str(x).lower(), street_suffix_to_full))

# remove ending whitespaces
nominatim_keep_columns_merge_county['AddressNumber'] = nominatim_keep_columns_merge_county.AddressNumber.apply(lambda x: str(x).rstrip())
nominatim_keep_columns_merge_county

Unnamed: 0,location_id,AddressNumber,StreetNamePreDirectional,StreetNamePreType,StreetName,StreetNamePostType,StreetNamePostDirectional,county,address_1,address_2,city,state,zip,state_abbr,location_source_value,latitude,longitude,geometry
0,1,3210,Nan,,Southwestern,Boulevard,Nan,Erie County,3210 Southwestern Blvd,,Orchard Park,New York,14127,NY,"3210 SOUTHWESTERN BLVD, ORCHARD PARK, NY 14127",42.798535,-78.747816,POINT (-78.74781599997843 42.79853499984358)
1,2,562,Nan,,Wingate,Drive,Nan,Nassau County,562 Wingate Dr,,East Meadow,New York,11554,NY,"562 WINGATE DR, EAST MEADOW, NY 11554",40.710113,-73.528389,POINT (-73.52838899973285 40.71011299966788)
2,3,8,Nan,,Zoar,Avenue,Nan,Albany County,8 Zoar Ave,,Albany,New York,12209,NY,"8 ZOAR AVE, ALBANY, NY 12209",42.639067,-73.788991,POINT (-73.78899100027446 42.63906699972181)
3,4,15402,Nan,,41St,Avenue,Nan,Queens County,15402 41St Ave,,Flushing,New York,11354,NY,"15402 41ST AVE, FLUSHING, NY 11354",40.763003,-73.810944,POINT (-73.81094400044168 40.76300300008896)
4,5,5110,Nan,,19Th,Avenue,Nan,Kings County,5110 19Th Ave,,Brooklyn,New York,11204,NY,"5110 19TH AVE, BROOKLYN, NY 11204",40.625623,-73.980013,POINT (-73.98001299964284 40.62562299970931)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40349,40413,5000,North,,Bowes,Road,Nan,Pima County,5000 N Bowes Rd,,Tucson,Arizona,85749,AZ,"5000 N BOWES RD, TUCSON, AZ 85749",32.298797,-110.788349,POINT (-110.7883488373378 32.29879689494836)
40350,40414,1303,Nan,,Paseo Del Canon,,East,Taos County,1303 Paseo Del Canon East,,Taos,New Mexico,87571,NM,"1303 PASEO DEL CANON EAST, TAOS, NM 87571",36.375292,-105.568573,POINT (-105.568573404848 36.37529177977968)
40351,40415,12606,East,,Main,Street,Nan,Yavapai County,12606 E Main St,,Mayer,Arizona,86333,AZ,"12606 E MAIN ST, MAYER, AZ 86333",34.346322,-112.157189,POINT (-112.1571889686942 34.34632218671697)
40352,40416,4300,Nan,,Blake,Road,Southwest,Bernalillo County,4300 Blake Rd Sw,,Albuquerque,New Mexico,87121,NM,"4300 BLAKE RD SW, ALBUQUERQUE, NM 87121",35.037163,-106.723264,POINT (-106.7232635088362 35.03716265606487)


In [21]:
# postprocessing clean up
# convert zip to type string
nominatim_keep_columns_merge_county['zip'] = nominatim_keep_columns_merge_county.zip.astype(str)

# convert ordinal numbers to full string
nominatim_keep_columns_merge_county['StreetName'] = nominatim_keep_columns_merge_county.StreetName.apply(lambda x: str(x).lower())
nominatim_keep_columns_merge_county['StreetName'] = nominatim_keep_columns_merge_county.StreetName.apply(lambda x: replace_ordinal_numbers(str(x)))
nominatim_keep_columns_merge_county['StreetName'] = nominatim_keep_columns_merge_county.StreetName.apply(lambda x: str(x).title())
nominatim_keep_columns_merge_county['StreetName'] = nominatim_keep_columns_merge_county.StreetName.apply(lambda x: str(x).replace("-", " "))

# capitalize first letter
nominatim_keep_columns_merge_county['StreetNamePreType'] = nominatim_keep_columns_merge_county.StreetNamePreType.apply(lambda x: np.NaN if x == np.NaN else str(x).title())
nominatim_keep_columns_merge_county['StreetNamePostType'] = nominatim_keep_columns_merge_county.StreetNamePostType.apply(lambda x: np.NaN if x == np.NaN else str(x).title())

# join Nominatim components to a single string of AddressNumber, StreetNamePreDirectional + StreetNamePreType + StreetName + StreetNamePostType, city, county, state, zip, United States
nominatim_keep_columns_merge_county['Nominatim_address'] = nominatim_keep_columns_merge_county['AddressNumber'] + ',' + \
                                                            nominatim_keep_columns_merge_county[['StreetNamePreDirectional', 'StreetNamePreType', 'StreetName', 'StreetNamePostType']].fillna('').agg(' '.join, axis=1) + ', '\
                                                            + nominatim_keep_columns_merge_county[['city', 'county', 'state', 'zip']].fillna('').agg(', '.join, axis=1) + ', United States'

# remove double whitespaces
nominatim_keep_columns_merge_county['Nominatim_address'] = nominatim_keep_columns_merge_county.Nominatim_address.apply(lambda x: x.replace("  ", " "))

# replace string "nan" with np.NaN
nominatim_keep_columns_merge_county['StreetNamePreDirectional'] = nominatim_keep_columns_merge_county.StreetNamePreDirectional.apply(lambda x: np.NaN if str(x) == "Nan" or str(x) == "nan" else x)
nominatim_keep_columns_merge_county['StreetNamePostDirectional'] = nominatim_keep_columns_merge_county.StreetNamePostDirectional.apply(lambda x: np.NaN if str(x) == "Nan" or str(x) == "nan" else x)

nominatim_keep_columns_merge_county['StreetNamePreType'] = nominatim_keep_columns_merge_county.StreetNamePreType.apply(lambda x: np.NaN if str(x) == "Nan" or str(x) == "nan" else x)
nominatim_keep_columns_merge_county['StreetNamePostType'] = nominatim_keep_columns_merge_county.StreetNamePostType.apply(lambda x: np.NaN if str(x) == "Nan" or str(x) == "nan" else x)

# replace "Nan" string from directionals
nominatim_keep_columns_merge_county['Nominatim_address'] = nominatim_keep_columns_merge_county.Nominatim_address.apply(lambda x: str(x).replace('Nan', ''))

# replace adjacent commas with one comma
nominatim_keep_columns_merge_county['Nominatim_address'] = nominatim_keep_columns_merge_county.Nominatim_address.apply(lambda x: str(x).replace(', ,', ', '))
nominatim_keep_columns_merge_county                                                                                                                      

Unnamed: 0,location_id,AddressNumber,StreetNamePreDirectional,StreetNamePreType,StreetName,StreetNamePostType,StreetNamePostDirectional,county,address_1,address_2,city,state,zip,state_abbr,location_source_value,latitude,longitude,geometry,Nominatim_address
0,1,3210,,,Southwestern,Boulevard,,Erie County,3210 Southwestern Blvd,,Orchard Park,New York,14127,NY,"3210 SOUTHWESTERN BLVD, ORCHARD PARK, NY 14127",42.798535,-78.747816,POINT (-78.74781599997843 42.79853499984358),"3210, Southwestern Boulevard, Orchard Park, E..."
1,2,562,,,Wingate,Drive,,Nassau County,562 Wingate Dr,,East Meadow,New York,11554,NY,"562 WINGATE DR, EAST MEADOW, NY 11554",40.710113,-73.528389,POINT (-73.52838899973285 40.71011299966788),"562, Wingate Drive, East Meadow, Nassau Count..."
2,3,8,,,Zoar,Avenue,,Albany County,8 Zoar Ave,,Albany,New York,12209,NY,"8 ZOAR AVE, ALBANY, NY 12209",42.639067,-73.788991,POINT (-73.78899100027446 42.63906699972181),"8, Zoar Avenue, Albany, Albany County, New Yo..."
3,4,15402,,,Forty First,Avenue,,Queens County,15402 41St Ave,,Flushing,New York,11354,NY,"15402 41ST AVE, FLUSHING, NY 11354",40.763003,-73.810944,POINT (-73.81094400044168 40.76300300008896),"15402, Forty First Avenue, Flushing, Queens C..."
4,5,5110,,,Nineteenth,Avenue,,Kings County,5110 19Th Ave,,Brooklyn,New York,11204,NY,"5110 19TH AVE, BROOKLYN, NY 11204",40.625623,-73.980013,POINT (-73.98001299964284 40.62562299970931),"5110, Nineteenth Avenue, Brooklyn, Kings Coun..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40349,40413,5000,North,,Bowes,Road,,Pima County,5000 N Bowes Rd,,Tucson,Arizona,85749,AZ,"5000 N BOWES RD, TUCSON, AZ 85749",32.298797,-110.788349,POINT (-110.7883488373378 32.29879689494836),"5000,North Bowes Road, Tucson, Pima County, A..."
40350,40414,1303,,,Paseo Del Canon,,East,Taos County,1303 Paseo Del Canon East,,Taos,New Mexico,87571,NM,"1303 PASEO DEL CANON EAST, TAOS, NM 87571",36.375292,-105.568573,POINT (-105.568573404848 36.37529177977968),"1303, Paseo Del Canon , Taos, Taos County, Ne..."
40351,40415,12606,East,,Main,Street,,Yavapai County,12606 E Main St,,Mayer,Arizona,86333,AZ,"12606 E MAIN ST, MAYER, AZ 86333",34.346322,-112.157189,POINT (-112.1571889686942 34.34632218671697),"12606,East Main Street, Mayer, Yavapai County..."
40352,40416,4300,,,Blake,Road,Southwest,Bernalillo County,4300 Blake Rd Sw,,Albuquerque,New Mexico,87121,NM,"4300 BLAKE RD SW, ALBUQUERQUE, NM 87121",35.037163,-106.723264,POINT (-106.7232635088362 35.03716265606487),"4300, Blake Road, Albuquerque, Bernalillo Cou..."


In [23]:
nominatim_keep_columns_merge_county.iloc[4].Nominatim_address

'5110,  Nineteenth Avenue, Brooklyn, Kings County, New York, 11204, United States'

In [24]:
nominatim_keep_columns_merge_county_parsed_path = os.path.join(abs_path, 'output', 'nominatim_keep_columns_merge_county_parsed.csv')
nominatim_keep_columns_merge_county.to_csv(nominatim_keep_columns_merge_county_parsed_path, index=False)