# Notebook to calculate zipcodes based on lat-lon coordinates

In [6]:
import numpy as np
import pandas as pd
import sys
import json
import os
import urllib.request as ulr
from googleplaces import GooglePlaces, types, lang
# Read the Google API key from the environment so it is never stored in the
# notebook itself. Surrounding whitespace (e.g. a stray newline) is trimmed.
YOUR_API_KEY = os.environ.get('GOOGLE_API_KEY', '').strip()
if not YOUR_API_KEY:
    # Fail fast with an actionable message: a missing key used to surface as a
    # bare KeyError, and an empty one was accepted without any check at all.
    raise KeyError('GOOGLE_API_KEY is not set or is empty; export a '
                   'GooglePlaces-enabled Google API key before running this notebook')
# Alternative for a quick local run (replace the lines above; do not commit the key):
#YOUR_API_KEY = <Put your GooglePlaces-enabled Google API Key here>

In [7]:
# Limit DataFrame displays to 10 rows. The fully qualified option name is used
# because the short form 'max_rows' is only a pattern: newer pandas versions
# have more than one option matching it (e.g. styler.render.max_rows), which
# makes set_option raise OptionError.
pd.set_option('display.max_rows', 10)

## Load crime lat-lon data

In [11]:
# Load the headerless file; columns are addressed by position (0, 1, 2).
# Presumably each line looks like "(lat, lon),count" -- column 0 carries a
# leading '(' and column 1 a trailing ')', which are stripped before parsing.
crime_latlon_df = pd.read_csv('crime_latlon.out', header=None)
crime_latlon_df[0] = pd.to_numeric(crime_latlon_df[0].str.strip(to_strip='('))
crime_latlon_df[1] = pd.to_numeric(crime_latlon_df[1].str.strip(to_strip=')').str.strip())
crime_latlon_df[2] = pd.to_numeric(crime_latlon_df[2])
crime_latlon_df = (
    crime_latlon_df
    .dropna(axis=0)
    .rename(columns={0: 'lat', 1: 'lon', 2: 'crimes'})
    # Renumber the surviving rows 0..n-1 with an integer index. The previous
    # rename(index=str) left string labels, so integer lookups such as
    # crime_latlon_df['lat'][i] only worked through pandas' deprecated
    # positional fallback; with this index, label i and position i coincide.
    .reset_index(drop=True)
)
crime_latlon_df.head()

Unnamed: 0,lat,lon,crimes
0,40.675955,-73.735304,5.0
1,40.804214,-73.964751,188.0
2,40.67046,-73.882559,65.0
3,40.822017,-73.938915,266.0
4,40.749871,-73.898905,63.0


In [8]:
#Grabbing and parsing the JSON data
def GoogPlac(lat,lng,radius,key):
    """Query the Google Places Nearby Search endpoint and return the parsed JSON.

    Parameters
    ----------
    lat, lng
        Coordinates of the search centre; sent as ``location=<lat>,<lng>``.
    radius
        Value sent as the ``radius`` query parameter.
    key
        Google API key sent as the ``key`` query parameter.

    Returns
    -------
    The decoded JSON response body, as produced by ``json.loads``.

    Raises
    ------
    Any error raised by ``urllib.request.urlopen`` for the request, and
    ``json.JSONDecodeError`` if the body is not valid JSON.
    """
    location = str(lat) + "," + str(lng)
    url = ('https://maps.googleapis.com/maps/api/place/nearbysearch/json'
           '?location=%s'
           '&radius=%s'
           '&sensor=false&key=%s') % (location, radius, key)
    # The response is used as a context manager so the underlying connection
    # is closed even if reading or decoding fails (it was never closed before).
    with ulr.urlopen(url) as response:
        payload = response.read().decode("utf-8")
    return json.loads(payload)
def GoogPlacText(loc,radius,key):
    """Query the Google Places Text Search endpoint and return the parsed JSON.

    Parameters
    ----------
    loc : str
        Free-text search string; sent URL-encoded as the ``query`` parameter.
    radius
        Value sent as the ``radius`` query parameter.
    key
        Google API key sent as the ``key`` query parameter.

    Returns
    -------
    The decoded JSON response body, as produced by ``json.loads``.

    Raises
    ------
    Any error raised by ``urllib.request.urlopen`` for the request, and
    ``json.JSONDecodeError`` if the body is not valid JSON.
    """
    from urllib.parse import quote_plus

    # Spaces still become '+', exactly as before, but every other reserved or
    # non-ASCII character ('&', '#', '=', accents, ...) is now percent-encoded
    # instead of corrupting the query string. Note that a literal '+' in `loc`
    # is therefore sent as '%2B' rather than being read as a space.
    encoded_query = quote_plus(loc)
    url = ('https://maps.googleapis.com/maps/api/place/textsearch/json'
           '?query=%s'
           '&radius=%s'
           '&sensor=false&key=%s') % (encoded_query, radius, key)
    # Context manager guarantees the connection is closed after reading.
    with ulr.urlopen(url) as response:
        payload = response.read().decode("utf-8")
    return json.loads(payload)

def GoogPlacID(ID,key):
    """Query the Google Places Details endpoint for one place and return the parsed JSON.

    Parameters
    ----------
    ID
        Place identifier sent as the ``placeid`` query parameter (the callers
        in this notebook pass the ``place_id`` of a search result).
    key
        Google API key sent as the ``key`` query parameter.

    Returns
    -------
    The decoded JSON response body, as produced by ``json.loads``.

    Raises
    ------
    Any error raised by ``urllib.request.urlopen`` for the request, and
    ``json.JSONDecodeError`` if the body is not valid JSON.
    """
    url = ('https://maps.googleapis.com/maps/api/place/details/json'
           '?placeid=%s'
           '&key=%s') % (ID, key)
    # Context manager guarantees the connection is closed after reading
    # (the response object was previously left open).
    with ulr.urlopen(url) as response:
        payload = response.read().decode("utf-8")
    return json.loads(payload)


### Test reverse geocoding on a known address (NYU Stern, zipcode 10012)

In [9]:
# Sanity check: nearby search around the coordinate, then fetch the details of
# the second hit (index 1) and print every postal-code component it carries.
lat, lon = 40.729242, -73.996491
query = GoogPlac(lat, lon, 10, YOUR_API_KEY)
query2 = GoogPlacID(query['results'][1]['place_id'], YOUR_API_KEY)
for component in query2['result']['address_components']:
    # Only components typed exactly ['postal_code'] count as the zipcode.
    if component['types'] == ['postal_code']:
        print(component['long_name'])

10012


### Test reverse geocoding on a known address (NYU Tandon, zipcode 11201)

In [33]:
# Same check for a second known location; again the second hit (index 1) is
# the one whose details are inspected for a postal code.
lat, lon = 40.694187, -73.986558
query = GoogPlac(lat, lon, 10, YOUR_API_KEY)
second_hit_id = query['results'][1]['place_id']
query2 = GoogPlacID(second_hit_id, YOUR_API_KEY)
for component in query2['result']['address_components']:
    if component['types'] == ['postal_code']:
        print(component['long_name'])

11201


### Test reverse geocoding on a known address (Brunswick, ME, zipcode 04011)

In [35]:
# Third check; here the first hit (index 0) is the one inspected for a
# postal code, unlike the two New York checks which use index 1.
lat, lon = 43.899295, -69.964007
query = GoogPlac(lat, lon, 10, YOUR_API_KEY)
first_hit_id = query['results'][0]['place_id']
query2 = GoogPlacID(first_hit_id, YOUR_API_KEY)
for component in query2['result']['address_components']:
    if component['types'] == ['postal_code']:
        print(component['long_name'])

04011


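If the location is in a dense area, the first hit of the Google search is likely to be an administrative area that spans multiple zipcodes, so it carries no single postal code; in that case the second hit usually does. Otherwise the first hit typically already has one. We therefore check the first two hits in order and stop as soon as we have a zipcode.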

Initialize our storage lists:

In [75]:
# Parallel result lists: the lookup loop appends one entry to each per row.
pl_lat, pl_lon, pl_zip = [], [], []
# Coordinates and row indices of lookups that raised an error.
search_failures, failure_index = [], []
# Running count of failed lookups.
ct = 0

Specify our index ranges:

In [83]:
# Half-open row range [start_index, end_index) processed by the lookup loop
# (np.arange) and selected again for the merge (iloc slice).
start_index, end_index = 0, 50

In [76]:
from tqdm import tqdm
import time


def _first_postal_code(place_details):
    """Return the ``long_name`` of the first address component typed exactly
    ``['postal_code']`` in a Place Details response, or ``None`` if there is none."""
    for component in place_details['result']['address_components']:
        if component['types'] == ['postal_code']:
            return component['long_name']
    return None


#Adjust the range as needed for missing values
for i in tqdm(np.arange(start_index,end_index)):
    # Read the row by position, once, before any request is made. The inner
    # scan used to reuse `i`, so after a failure the except branch looked up
    # (and recorded) the coordinates and index of the wrong row.
    lat = crime_latlon_df['lat'].iloc[i]
    lon = crime_latlon_df['lon'].iloc[i]
    try:
        query = GoogPlac(lat,lon, 10, YOUR_API_KEY)
        zipcode = None
        # Check the first two hits in order and stop at the first zipcode.
        # Indexing (rather than slicing) is deliberate: when a needed hit is
        # missing, the IndexError is still recorded as a SEARCH FAILURE.
        for j in range(2):
            query2 = GoogPlacID(query['results'][j]['place_id'],YOUR_API_KEY)
            zipcode = _first_postal_code(query2)
            if zipcode is not None:
                break
    except Exception:
        # `Exception` instead of a bare except, so Ctrl-C / kernel interrupt
        # stops the loop instead of being logged as a failed lookup.
        ct = ct + 1
        pl_lat.append(lat)
        pl_lon.append(lon)
        pl_zip.append('SEARCH FAILURE')
        search_failures.append((lat,lon))
        failure_index.append(i)
        # Report only every 25th failure to keep the output short.
        if ct % 25 == 0:
            print('Failure ' + str(ct) + ': ' + str((lat,lon)))
    else:
        # Exactly one entry per row goes into each list, keeping them aligned.
        pl_zip.append('ZIP_NOT_FOUND' if zipcode is None else zipcode)
        pl_lat.append(lat)
        pl_lon.append(lon)

   

100%|██████████| 50/50 [00:19<00:00,  2.64it/s]


In [77]:
# Display the collected values: one entry per processed row, either a zipcode
# string or one of the sentinels 'ZIP_NOT_FOUND' / 'SEARCH FAILURE'.
pl_zip

['11422',
 '10025',
 '11208',
 '10039',
 '11377',
 '10027',
 '11419',
 '11365',
 '11233',
 '11201',
 '11212',
 '11223',
 '11214',
 '10457',
 '10458',
 '10014',
 '11233',
 '11228',
 '11225',
 '10282',
 '11101',
 '10301',
 '10451',
 '11204',
 '10027',
 '11207',
 '10303',
 '10459',
 '10475',
 '11691',
 '10458',
 '11420',
 '10460',
 '10003',
 '10466',
 '10012',
 '11354',
 '10457',
 '11209',
 '10465',
 '10310',
 '10456',
 '11372',
 '11434',
 '11416',
 '10473',
 '10305',
 '10459',
 '11207',
 '11215']

In [78]:
# The first three lengths should match (one entry per processed row); the
# last two count the lookups that failed.
for collected in (pl_lat, pl_lon, pl_zip, search_failures, failure_index):
    print(len(collected))

50
50
50
0
0


Merge our (partial) zipcode information with the (partial) crime data

In [79]:
# Turn the three parallel lists into a lat / lon / zipcode frame (one row per
# lookup), then attach the crime counts by joining on the coordinate pair
# against the same row range that was geocoded.
df_zips = (
    pd.DataFrame([pl_lat, pl_lon, pl_zip])
    .transpose()
    .rename(index=str, columns={0: 'lat', 1: 'lon', 2: 'zipcode'})
)
df_merged = pd.merge(
    df_zips,
    crime_latlon_df.iloc[start_index:end_index],
    on=['lat', 'lon'],
)
df_merged

Unnamed: 0,lat,lon,zipcode,crimes
0,40.676,-73.7353,11422,5.0
1,40.8042,-73.9648,10025,188.0
2,40.6705,-73.8826,11208,65.0
3,40.822,-73.9389,10039,266.0
4,40.7499,-73.8989,11377,63.0
5,40.8164,-73.9589,10027,133.0
6,40.6859,-73.8264,11419,106.0
7,40.7329,-73.7999,11365,6.0
8,40.6767,-73.9165,11233,131.0
9,40.6886,-73.9851,11201,326.0


In [80]:
# Summary of the zipcode column: count, number of distinct values, most frequent.
df_merged['zipcode'].describe()

count        50
unique       44
top       10027
freq          2
Name: zipcode, dtype: object

Aggregate crime counts at the zipcode level

In [81]:
# Total crimes per zipcode. The column is selected explicitly: a bare
# groupby(...).sum() relied on pandas silently dropping the non-numeric
# lat/lon columns, which pandas 2.0 no longer does (it would add up the
# coordinates instead). Double brackets keep the result a one-column DataFrame.
df_merged.groupby('zipcode')[['crimes']].sum()

Unnamed: 0_level_0,crimes
zipcode,Unnamed: 1_level_1
10003,352.0
10012,368.0
10014,90.0
10025,188.0
10027,1390.0
10039,266.0
10282,135.0
10301,108.0
10303,113.0
10305,13.0


Save our (partial) lat-lon-zip data

In [82]:
# The file name records the inclusive row range covered by this partial run
# (end_index itself is excluded from the range, hence the -1).
index_span = '_'.join([str(start_index), str(end_index-1)])
out_file_name = 'partial_crime_latlon_zips_' + index_span + '.csv'
df_merged.to_csv(out_file_name)