In [None]:
import numpy as np
import pandas as pd
import requests
import os
from functools import lru_cache

In [12]:
df = pd.read_csv("data/Motor_Vehicle_Collisions_-_Crashes_20251117.csv", low_memory=False)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2220334 entries, 0 to 2220333
Data columns (total 29 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   CRASH DATE                     object 
 1   CRASH TIME                     object 
 2   BOROUGH                        object 
 3   ZIP CODE                       object 
 4   LATITUDE                       float64
 5   LONGITUDE                      float64
 6   LOCATION                       object 
 7   ON STREET NAME                 object 
 8   CROSS STREET NAME              object 
 9   OFF STREET NAME                object 
 10  NUMBER OF PERSONS INJURED      float64
 11  NUMBER OF PERSONS KILLED       float64
 12  NUMBER OF PEDESTRIANS INJURED  int64  
 13  NUMBER OF PEDESTRIANS KILLED   int64  
 14  NUMBER OF CYCLIST INJURED      int64  
 15  NUMBER OF CYCLIST KILLED       int64  
 16  NUMBER OF MOTORIST INJURED     int64  
 17  NUMBER OF MOTORIST KILLED      int64  
 18  CO

In [31]:
df.isna().mean()

CRASH DATE                       0.000000
CRASH TIME                       0.000000
borough                          0.306384
zip_code                         0.306508
lat                              0.108249
lon                              0.108249
LOCATION                         0.108254
on_street                        0.217811
cross_street                     0.382118
off_street                       0.823792
NUMBER OF PERSONS INJURED        0.000008
NUMBER OF PERSONS KILLED         0.000014
NUMBER OF PEDESTRIANS INJURED    0.000000
NUMBER OF PEDESTRIANS KILLED     0.000000
NUMBER OF CYCLIST INJURED        0.000000
NUMBER OF CYCLIST KILLED         0.000000
NUMBER OF MOTORIST INJURED       0.000000
NUMBER OF MOTORIST KILLED        0.000000
CONTRIBUTING FACTOR VEHICLE 1    0.003531
CONTRIBUTING FACTOR VEHICLE 2    0.160535
CONTRIBUTING FACTOR VEHICLE 3    0.927642
CONTRIBUTING FACTOR VEHICLE 4    0.983480
CONTRIBUTING FACTOR VEHICLE 5    0.995475
COLLISION_ID                     0

In [29]:
df.rename(columns={
    "LATITUDE": "lat",
    "LONGITUDE": "lon",
    "ZIP CODE": "zip_code",
    "BOROUGH": "borough",
    "ON STREET NAME": "on_street",
    "CROSS STREET NAME": "cross_street",
    "OFF STREET NAME": "off_street",
}, inplace=True)

In [8]:
GEOCLIENT_KEY = "bce991a9fbdd47b3b8130ca2d5c5ce0f"  # recommended to store via .env

BASE_URL = "https://api.nyc.gov/geoclient/v2/search"

In [9]:
def geoclient_search(query):
    headers = {
        "Ocp-Apim-Subscription-Key": GEOCLIENT_KEY
    }

    params = {
        "input": query
    }

    r = requests.get(BASE_URL, headers=headers, params=params, timeout=10)

    if r.status_code != 200:
        print("Error:", r.text)
        return None

    return r.json()


In [10]:
def geocode_freeform(query):
    data = geoclient_search(query)
    if not data:
        return None, None

    results = data.get("results", [])
    if not results:
        return None, None

    resp = results[0].get("response", {})
    lat = resp.get("latitude")
    lon = resp.get("longitude")

    if lat is None or lon is None:
        return None, None

    return float(lat), float(lon)


In [23]:
lat, lon = geocode_freeform("49 street & 5 avenue")
print(lat, lon)

Geocoded '49 street & 5 avenue' to (40.757912, -73.977653)
40.757912 -73.977653


In [20]:
import os
import requests
from functools import lru_cache

GEOCLIENT_KEY = "bce991a9fbdd47b3b8130ca2d5c5ce0f"  # put your key in env
BASE_URL = "https://api.nyc.gov/geoclient/v2/search"


@lru_cache(maxsize=50_000)
def geocode_freeform(query: str):
    """
    Return (lat, lon) for a free-form address or intersection string.
    Uses Geoclient v2 and caches results in memory with LRU cache.
    """
    if not query:
        return None, None

    query_norm = query.strip().lower()

    headers = {
        "Ocp-Apim-Subscription-Key": GEOCLIENT_KEY
    }
    params = {"input": query_norm}

    r = requests.get(BASE_URL, headers=headers, params=params, timeout=8)
    if r.status_code != 200:
        return None, None

    data = r.json()
    results = data.get("results") or []
    if not results:
        return None, None

    resp = results[0].get("response", {})
    lat = resp.get("latitude")
    lon = resp.get("longitude")

    if lat is None or lon is None:
        return None, None
    print(f"Geocoded '{query}' to ({lat}, {lon})")
    return float(lat), float(lon)


In [26]:
def build_query(row):
    on = str(row.get("on_street", "") or "").strip()
    cross = str(row.get("cross_street") or "").strip()
    bor = str(row.get("borough") or "").strip()
    
    if on and cross and bor:
        return f"{on} & {cross}, {bor}, NY"
    if on and bor:
        return f"{on}, {bor}, NY"

    return None

In [30]:
mask = df["lat"].isna() | df["lon"].isna()

for idx, row in df[mask].iterrows():
    q = build_query(row)
    print(f"Geocoding row {idx}: {q}")
    lat, lon = geocode_freeform(q)
    df.at[idx, "lat"] = lat
    df.at[idx, "lon"] = lon

Geocoding row 0: WHITESTONE EXPRESSWAY & 20 AVENUE, nan, NY
Geocoding row 1: QUEENSBORO BRIDGE UPPER & nan, nan, NY
Geocoding row 3: THROGS NECK BRIDGE & nan, nan, NY
Geocoding row 4: BROOKLYN BRIDGE & nan, nan, NY
Geocoding row 5: WEST 54 STREET & nan, nan, NY
Geocoding row 6: HUTCHINSON RIVER PARKWAY & nan, nan, NY
Geocoding row 7: WEST 35 STREET & HENRY HUDSON RIVER, nan, NY
Geocoding row 8: nan & nan, nan, NY
Geocoding row 11: MAJOR DEEGAN EXPRESSWAY RAMP & nan, nan, NY
Geocoding row 18: broadway & west 80 street -west 81 street, nan, NY
Geocoding row 23: MEEKER AVENUE & LORIMER STREET, nan, NY
Geocoding row 38: 35 AVENUE & nan, nan, NY
Geocoding row 39: EASTCHESTER ROAD & PELHAM PARKWAY NORTH, nan, NY
Geocoding row 40: KINGSLAND AVENUE & MEEKER AVENUE, nan, NY
Geocoding row 41: WILLIAMSBURG BRIDGE OUTER ROADWA & nan, nan, NY
Geocoding row 43: nan & nan, BROOKLYN, NY
Geocoding row 44: WILLIAMSBURG BRIDGE OUTER ROADWA & nan, nan, NY
Geocoding row 45: HUTCHINSON RIVER PARKWAY & nan, 

KeyboardInterrupt: 

In [34]:
df.groupby("zip_code")["lat"].median()

zip_code
         40.720103
10000    40.772550
10001    40.750206
10002    40.718555
10003    40.733290
           ...    
11692    40.592728
11693    40.588587
11694    40.581234
11695    40.562128
11697    40.560776
Name: lat, Length: 235, dtype: float64

ZIP Centroid imputation

In [42]:
import geopandas as gpd
# Load ZIP GeoJSON
zips = gpd.read_file("data/raw/Modified Zip Code Tabulation Areas (MODZCTA).geojson")
# Make sure ZIP codes are strings
zips["zip_code"] = zips["zcta"].astype(str).str.zfill(5)
# Project to a metric CRS (NY State Plane)
zips = zips.to_crs(epsg=2263)
# Compute centroids in projected coordinates
zips["centroid"] = zips.geometry.centroid
# Convert centroids back to WGS84 lat/lon
zips_centroids = zips.set_geometry("centroid").to_crs(epsg=4326)
zips["centroid_lat"] = zips_centroids.geometry.y
zips["centroid_lon"] = zips_centroids.geometry.x


In [43]:
zips

Unnamed: 0,modzcta,label,zcta,pop_est,geometry,zip_code,centroid,centroid_lat,centroid_lon
0,10001,"10001, 10118","10001, 10119, 10199",23072,"MULTIPOLYGON (((987646 210360, 987522 210136, ...","10001, 10119, 10199",POINT (985043.105 212772.632),40.750688,-73.997138
1,10002,10002,10002,74993,"MULTIPOLYGON (((984942 199432, 985056 199632, ...",10002,POINT (988082.688 200054.877),40.715780,-73.986174
2,10003,10003,10003,54682,"MULTIPOLYGON (((987400 202660, 987366 202674, ...",10003,POINT (987254.423 205900.824),40.731826,-73.989160
3,10026,10026,10026,39363,"MULTIPOLYGON (((994768 232748, 995304 232456, ...",10026,POINT (997353.028 231609.987),40.802382,-73.952672
4,10004,10004,10004,3028,"MULTIPOLYGON (((981958 197118, 981652 197444, ...",10004,POINT (980636.711 195670.687),40.703747,-74.013032
...,...,...,...,...,...,...,...,...,...
173,11434,11434,"11430, 11434",65989,"MULTIPOLYGON (((1052346 175164, 1052572 174984...","11430, 11434",POINT (1044352.146 178663.113),40.656862,-73.783384
174,11435,11435,11435,59296,"MULTIPOLYGON (((1038120 188146, 1038038 188374...",11435,POINT (1037038.231 194831.875),40.701288,-73.809618
175,11694,11694,11694,21354,"MULTIPOLYGON (((1021770 145744, 1021596 146114...",11694,POINT (1027431.649 149749.784),40.577599,-73.844553
176,11697,11697,11697,3540,"MULTIPOLYGON (((1022340 149180, 1022362 149134...",11697,POINT (1010035.7 142973.959),40.559069,-73.907201


In [45]:
df["zip_code"] = df["zip_code"].astype(str).str.zfill(5)
df = df.merge(
    zips[["zip_code", "centroid_lat", "centroid_lon"]],
    on="zip_code",
    how="left"
)
mask_missing = df["lat"].isna() | df["lon"].isna()
df.loc[mask_missing, "lat"]  = df.loc[mask_missing, "centroid_lat"]
df.loc[mask_missing, "lon"] = df.loc[mask_missing, "centroid_lon"]


In [38]:
zips["zip_code"] = zips["zcta"].astype(str).str.zfill(5)


In [39]:
zips

Unnamed: 0,modzcta,label,zcta,pop_est,geometry,zip_code
0,10001,"10001, 10118","10001, 10119, 10199",23072,"MULTIPOLYGON (((-73.98774 40.74407, -73.98819 ...","10001, 10119, 10199"
1,10002,10002,10002,74993,"MULTIPOLYGON (((-73.9975 40.71407, -73.99709 4...",10002
2,10003,10003,10003,54682,"MULTIPOLYGON (((-73.98864 40.72293, -73.98876 ...",10003
3,10026,10026,10026,39363,"MULTIPOLYGON (((-73.96201 40.80551, -73.96007 ...",10026
4,10004,10004,10004,3028,"MULTIPOLYGON (((-74.00827 40.70772, -74.00937 ...",10004
...,...,...,...,...,...,...
173,11434,11434,"11430, 11434",65989,"MULTIPOLYGON (((-73.75461 40.6472, -73.7538 40...","11430, 11434"
174,11435,11435,11435,59296,"MULTIPOLYGON (((-73.80577 40.68293, -73.80606 ...",11435
175,11694,11694,11694,21354,"MULTIPOLYGON (((-73.86496 40.56663, -73.86558 ...",11694
176,11697,11697,11697,3540,"MULTIPOLYGON (((-73.86289 40.57606, -73.86281 ...",11697


In [40]:
zips["centroid"] = zips.geometry.centroid


  zips["centroid"] = zips.geometry.centroid


In [41]:
zips["centroid"].y

0      40.750688
1      40.715780
2      40.731826
3      40.802382
4      40.703747
         ...    
173    40.656864
174    40.701289
175    40.577599
176    40.559067
177    40.709534
Length: 178, dtype: float64