In [1]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt
import geopandas as gpd, descartes
import itertools
import os

# Compare censusgeomatch vs LBD's batchgeocode site.

LBD has a geocoding site at https://batchgeocode.ihme.washington.edu/index (it costs money to use because it queries Google Maps).
The censusgeocode package is more efficient, but LBD folks indicated that Google Maps is the source they trust the most. I tagged 58 locations with both tools; this notebook compares what percentage of those locations ended up in the same block group under each tool.

RESULTS: out of 58 locations, 12% (7 of 58) were tagged to different block groups by the two tools.

In [2]:
# Load the vetting sample. Each row has an address plus two coordinate pairs:
# lat/lon (used below as the censusgeomatch result) and best_lat/best_long
# (used below as the LBD batchgeocode result).
snfs = pd.read_csv(
    filepath_or_buffer='/homes/beatrixh/repos/geomatch/king_county_03_30_2020/king_county_geocode_for_vetting.csv',
)

In [3]:
# Preview the first five rows to check which columns were loaded.
snfs.head(n=5)

Unnamed: 0.1,Unnamed: 0,address,lat,lon,best_lat,best_long
0,0,"805 FRONT ST SOUTH, ISSAQUAH, WA,",47.522125,-122.034225,47.521655,-122.034809
1,1,"901 12TH AVENUE, SEATTLE, WA",47.610485,-122.31679,47.610229,-122.317629
2,2,"491 SOUTH 338TH STREET, FEDERAL WAY, WA,",47.298603,-122.3267,47.298048,-122.326598
3,3,"1334 TERRY AVENUE, SEATTLE, WA,",47.61138,-122.32854,47.611873,-122.328573
4,4,"135 SOUTH 336TH STREET, FEDERAL WAY, WA,",47.29988,-122.32848,47.298797,-122.331097


In [4]:
# Attach point geometries built from the LBD (batchgeocode) coordinates:
# x is the longitude column (best_long), y is the latitude column (best_lat).
snfs = gpd.GeoDataFrame(
    snfs,
    geometry=gpd.points_from_xy(x=snfs['best_long'], y=snfs['best_lat']),
)

In [5]:
# Read the blockgroup-level shapefile for Washington state, 2018 vintage.
# Source: https://www.census.gov/cgi-bin/geo/shapefiles/index.php?year=2018&layergroup=Block+Groups
wa_bg = gpd.read_file('/home/j/temp/beatrixh/sim_science/census_GIS/tl_2018_53_bg/tl_2018_53_bg.shp')

# Keep only blockgroups whose GEOID starts with '53033'
# (state FIPS 53 = Washington, county FIPS 033 = King County),
# and only the two columns needed for the point-in-polygon lookup.
king_bg = wa_bg.loc[
    wa_bg['GEOID'].str.slice(0, 5).eq('53033'),
    ['GEOID', 'geometry'],
]

In [6]:
# Preview the first five King County blockgroups (GEOID + polygon).
king_bg.head(n=5)

Unnamed: 0,GEOID,geometry
7,530330219042,"POLYGON ((-122.17557 47.72344, -122.17328 47.7..."
8,530330219044,"POLYGON ((-122.17558 47.72581, -122.17323 47.7..."
9,530330219051,"POLYGON ((-122.18723 47.74653, -122.18719 47.7..."
10,530330219052,"POLYGON ((-122.18910 47.72612, -122.18908 47.7..."
11,530330220032,"POLYGON ((-122.20776 47.72030, -122.20754 47.7..."


In [8]:
# Tag each snf (located by its LBD coordinates) with the GEOID of every King
# County blockgroup whose polygon intersects the snf's point.
#
# Output format (unchanged): the matching GEOIDs are concatenated in king_bg row
# order, each followed by a single space, e.g. '530330321043 '. A point with no
# intersecting blockgroup gets ''; a point intersecting several blockgroups
# gets all of their GEOIDs.
#
# The intersection test is evaluated once per snf against the whole blockgroup
# geometry column, rather than once per (snf, blockgroup) pair through .iloc
# row lookups, and the result is assigned by column name instead of relying on
# 'geoid_lbd' being the last column of snfs.
snfs['geoid_lbd'] = [
    ''.join(
        f'{geoid} '
        for geoid in king_bg.loc[king_bg.geometry.intersects(point), 'GEOID']
    )
    for point in snfs.geometry
]

In [9]:
# Build a second GeoDataFrame whose points come from the censusgeomatch
# coordinates (x = lon, y = lat). The LBD-based GEOID tag is carried along so
# the two tags end up side by side for comparison.
snfs_compare_geoids = gpd.GeoDataFrame(
    snfs[['address', 'lat', 'lon', 'geoid_lbd']],
    geometry=gpd.points_from_xy(x=snfs['lon'], y=snfs['lat']),
)

In [10]:
# Tag each snf (located by its censusgeomatch coordinates) with the GEOID of
# every King County blockgroup whose polygon intersects the snf's point.
#
# Output format (unchanged): the matching GEOIDs are concatenated in king_bg row
# order, each followed by a single space, e.g. '530330321043 '. A point with no
# intersecting blockgroup gets ''. Keeping this exact format matters because
# the column is compared string-for-string against 'geoid_lbd'.
#
# The intersection test is evaluated once per snf against the whole blockgroup
# geometry column, rather than once per (snf, blockgroup) pair through .iloc
# row lookups, and the result is assigned by column name instead of relying on
# 'geoid_censusgeomatch' being the last column of snfs_compare_geoids.
snfs_compare_geoids['geoid_censusgeomatch'] = [
    ''.join(
        f'{geoid} '
        for geoid in king_bg.loc[king_bg.geometry.intersects(point), 'GEOID']
    )
    for point in snfs_compare_geoids.geometry
]

In [11]:
# Preview the first five rows with both GEOID tags side by side.
snfs_compare_geoids.head(n=5)

Unnamed: 0,address,lat,lon,geoid_lbd,geometry,geoid_censusgeomatch
0,"805 FRONT ST SOUTH, ISSAQUAH, WA,",47.522125,-122.034225,530330321043,POINT (-122.03423 47.52213),530330321043
1,"901 12TH AVENUE, SEATTLE, WA",47.610485,-122.31679,530330086002,POINT (-122.31679 47.61048),530330086002
2,"491 SOUTH 338TH STREET, FEDERAL WAY, WA,",47.298603,-122.3267,530330303132,POINT (-122.32670 47.29860),530330303132
3,"1334 TERRY AVENUE, SEATTLE, WA,",47.61138,-122.32854,530330083002,POINT (-122.32854 47.61138),530330083002
4,"135 SOUTH 336TH STREET, FEDERAL WAY, WA,",47.29988,-122.32848,530330303092,POINT (-122.32848 47.29988),530330303132


In [15]:
# Summarize how often the two geocoders disagree at the blockgroup level.
# A row counts as "differing" when its two GEOID tag strings are not identical.
n_locs = len(snfs_compare_geoids)
n_differ = int(
    (snfs_compare_geoids['geoid_censusgeomatch'] != snfs_compare_geoids['geoid_lbd']).sum()
)

# Report the share as an actual percentage (it was previously printed as a raw
# fraction under a "pct" label), and label the row count as what it is: the
# sample size, not a percentage.
print(f'pct of geoids that differ: {n_differ / n_locs:.1%}')
print(f'sample size: {n_locs}')

pct of geoids that differ: 0.1206896551724138
pct of sample size: 58
