In [22]:
import numpy as np
import pandas as pd
import geopandas as gpd
import os
import json

In [10]:
# GeoJSON export endpoint for the NYC Modified ZCTA (MODZCTA) polygon layer.
NYC_MODZCTA_URL = (
    "https://data.cityofnewyork.us/api/geospatial/pri4-ifjk"
    "?method=export&format=GeoJSON"
)

print("Downloading NYC ZIP (MODZCTA) polygons…")

# Build the ZIP polygon table in one pass:
#   1. fetch the layer,
#   2. keep only the MODZCTA code and its geometry,
#   3. expose the code under the name 'zipcode',
#   4. store it as a zero-padded 5-character string,
#   5. make sure coordinates are expressed in EPSG:4326.
nyc_zips = (
    gpd.read_file(NYC_MODZCTA_URL)[["modzcta", "geometry"]]
    .rename(columns={"modzcta": "zipcode"})
    .assign(zipcode=lambda zips: zips["zipcode"].astype(str).str.zfill(5))
    .to_crs("EPSG:4326")
)

nyc_zips.head()

Downloading NYC ZIP (MODZCTA) polygons…


Unnamed: 0,zipcode,geometry
0,10001,"MULTIPOLYGON (((-73.98774 40.74407, -73.98819 ..."
1,10002,"MULTIPOLYGON (((-73.9975 40.71407, -73.99709 4..."
2,10003,"MULTIPOLYGON (((-73.98864 40.72293, -73.98876 ..."
3,10026,"MULTIPOLYGON (((-73.96201 40.80551, -73.96007 ..."
4,10004,"MULTIPOLYGON (((-74.00827 40.70772, -74.00937 ..."


In [16]:
# Load the Airbnb listings extract; the preview below shows it carries
# latitude/longitude, a '$'-formatted price string and a dataset_date column.
all_listings = pd.read_csv(filepath_or_buffer="../data/airbnb_one_year.csv")

all_listings.head()

Unnamed: 0,id,name,host_id,neighborhood_cleansed,neighborhood_group_cleansed,latitude,longitude,room_type,price,minimum_nights,availability_365,license,dataset_date
0,40824219,Room close to Manhattan for FEMALE guests,317540555,Sunnyside,Queens,40.74698,-73.91763,Private room,$66.00,30,77,,2025-10-01
1,808629897642520802,Wyndham Midtown 45 Resort | King Bed Studio Suite,442029804,Midtown,Manhattan,40.752656,-73.97248,Entire home/apt,$330.00,30,285,,2025-10-01
2,808629343999219473,Wyndham Midtown 45 Resort | King Bed Studio Suite,442029804,Midtown,Manhattan,40.752656,-73.97248,Entire home/apt,$312.00,30,285,,2025-10-01
3,808629391209329400,Wyndham Midtown 45 Resort | King Bed Studio Suite,442029804,Midtown,Manhattan,40.75266,-73.97248,Entire home/apt,,30,286,,2025-10-01
4,808629522066886810,Wyndham Midtown 45 Resort | King Bed Studio Suite,442029804,Midtown,Manhattan,40.752656,-73.97248,Entire home/apt,$312.00,30,285,,2025-10-01


In [15]:
# Load the crime data (NYPD complaint records, year-to-date export from Open Data).
nypd = pd.read_csv(
    "../data/NYPD_Complaint_Data_Current_(Year_To_Date)_20251217.csv"
)

# Report (rows, columns), then preview the first records.
print(nypd.shape)
nypd.head()


(438556, 36)


Unnamed: 0,CMPLNT_NUM,ADDR_PCT_CD,BORO_NM,CMPLNT_FR_DT,CMPLNT_FR_TM,CMPLNT_TO_DT,CMPLNT_TO_TM,CRM_ATPT_CPTD_CD,HADEVELOPT,HOUSING_PSA,...,TRANSIT_DISTRICT,VIC_AGE_GROUP,VIC_RACE,VIC_SEX,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,Lat_Lon,New Georeferenced Column
0,303250435,13,MANHATTAN,03/20/2025,00:30:00,,(null),COMPLETED,(null),,...,,25-44,WHITE,M,988886,207857,40.737203,-73.983273,"(40.7372030985741, -73.9832725981497)",POINT (-73.9832725981497 40.7372030985741)
1,309137838,14,MANHATTAN,07/05/2025,18:00:00,07/05/2025,18:31:00,COMPLETED,(null),,...,,18-24,BLACK,F,0,0,0.0,0.0,"(0.0, 0.0)",POINT (0 0)
2,298788022,81,BROOKLYN,01/02/2025,01:50:00,01/02/2025,03:00:00,COMPLETED,(null),,...,,18-24,WHITE,F,1005319,190473,40.689464,-73.924029,"(40.6894642952604, -73.9240290899499)",POINT (-73.9240290899499 40.6894642952604)
3,307271594,47,BRONX,03/05/2025,17:00:00,,(null),COMPLETED,(null),,...,,25-44,BLACK,F,1026480,262584,40.887314,-73.847272,"(40.8873136344706, -73.8472717577564)",POINT (-73.8472717577564 40.8873136344706)
4,309265327,121,STATEN ISLAND,07/04/2025,12:00:00,07/04/2025,13:00:00,COMPLETED,(null),,...,,65+,WHITE,M,939629,162183,40.611728,-74.160711,"(40.611728, -74.160711)",POINT (-74.160711 40.611728)


In [17]:
# Assign each complaint to a ZIP polygon and count complaints per ZIP.
crime_df = nypd  # plain alias of the loaded frame; no copy is requested here

# Turn the Longitude/Latitude columns into point geometries (x = lon, y = lat).
crime_geo = gpd.GeoDataFrame(
    crime_df,
    geometry=gpd.points_from_xy(x=crime_df["Longitude"], y=crime_df["Latitude"]),
    crs="EPSG:4326",
)

# Left spatial join: every complaint row is kept; complaints whose point is not
# within any ZIP polygon (e.g. the (0, 0) placeholder coordinates visible in the
# preview of the raw data) end up with a missing 'zipcode'.
crime_zip_join = crime_geo.sjoin(nyc_zips, how="left", predicate="within")

# Number of joined rows per ZIP. groupby skips missing keys by default, so
# complaints without a ZIP are not counted.
# NOTE(review): the column is called 'total_major_crime_reports', but no
# offense-type filter is applied in this cell — confirm the name is accurate.
crime_by_zip = (
    crime_zip_join
    .groupby("zipcode")
    .size()
    .rename("total_major_crime_reports")
    .reset_index()
)
crime_by_zip.head()

Unnamed: 0,zipcode,total_major_crime_reports
0,10001,6108
1,10002,4845
2,10003,3949
3,10004,503
4,10005,271


In [18]:
# Attach a ZIP code to every Airbnb listing row.

# Point geometry per listing, built from longitude (x) / latitude (y) in WGS84.
all_listings_geo = gpd.GeoDataFrame(
    all_listings,
    geometry=gpd.points_from_xy(x=all_listings["longitude"], y=all_listings["latitude"]),
    crs="EPSG:4326",
)

# Bring the ZIP polygons to the same CRS: reproject when a CRS is known,
# otherwise just label the coordinates as EPSG:4326.
nyc_zips = (
    nyc_zips.to_crs("EPSG:4326")
    if nyc_zips.crs is not None
    else nyc_zips.set_crs("EPSG:4326")
)

# Left join keeps every listing row; the join's bookkeeping column
# ('index_right') is removed immediately afterwards.
listings_with_zip = (
    all_listings_geo
    .sjoin(nyc_zips, how="left", predicate="within")
    .drop(columns="index_right")
)

listings_with_zip[["id", "neighborhood_group_cleansed", "latitude", "longitude", "zipcode"]].head()

Unnamed: 0,id,neighborhood_group_cleansed,latitude,longitude,zipcode
0,40824219,Queens,40.74698,-73.91763,11104
1,808629897642520802,Manhattan,40.752656,-73.97248,10017
2,808629343999219473,Manhattan,40.752656,-73.97248,10017
3,808629391209329400,Manhattan,40.75266,-73.97248,10017
4,808629522066886810,Manhattan,40.752656,-73.97248,10017


In [19]:
# Keep only listings that matched a ZIP polygon, and make sure the code is a
# zero-padded 5-character string.
listings_with_zip = (
    listings_with_zip
    .dropna(subset=["zipcode"])
    .assign(zipcode=lambda matched: matched["zipcode"].astype(str).str.zfill(5))
)

# Distinct listing ids per ZIP (an id that appears on several rows counts once).
airbnb_by_zip = (
    listings_with_zip
    .groupby("zipcode")["id"]
    .nunique()
    .rename("airbnb_count")
    .reset_index()
)

airbnb_by_zip.head()

Unnamed: 0,zipcode,airbnb_count
0,10001,982
1,10002,1087
2,10003,888
3,10004,104
4,10005,258


In [20]:
# Convert the price strings (e.g. "$1,234.00") to floats.
# The cast to str makes this safe to re-run once the column is already numeric;
# missing prices become the text "nan", which the final float cast maps back to NaN.
listings_with_zip["price"] = (
    listings_with_zip["price"]
    .astype(str)
    .str.replace(r"[$,]", "", regex=True)  # strip currency symbol and thousands separators
    .astype(float)
)

# Mean price per ZIP; NaN prices are ignored by mean().
# NOTE(review): this averages over every row, while airbnb_count de-duplicates
# on 'id'. If an id can appear on several rows (the data has a dataset_date
# column), ids with more rows weigh more in this average — confirm intended.
avg_price_by_zip = (
    listings_with_zip
    .groupby("zipcode")["price"]
    .mean()
    .rename("average_price")
    .reset_index()
)

# Combine listing count and average price; the left join keeps every ZIP that
# has a listing count.
zip_code_summary = airbnb_by_zip.merge(avg_price_by_zip, on="zipcode", how="left")

# Bare last expression -> rich table display, consistent with the other cells
# (previously this was print(...), which renders plain text).
zip_code_summary.head()

  zipcode  airbnb_count  average_price
0   10001           982    1008.292151
1   10002          1087    1539.947853
2   10003           888     366.814196
3   10004           104     350.283784
4   10005           258     880.087719


In [21]:
# Combine the Airbnb summary with the crime counts. The inner join keeps only
# ZIP codes that appear in both tables.
merged_zip_data = zip_code_summary.merge(crime_by_zip, on="zipcode", how="inner")

In [None]:
# Normalise ZIP codes to zero-padded 5-character strings in both the merged
# table and the polygon table so the two can be matched on 'zipcode'.
merged_zip_data = merged_zip_data.assign(
    zipcode=merged_zip_data["zipcode"].astype(str).str.zfill(5)
)
nyc_zips = nyc_zips.assign(
    zipcode=nyc_zips["zipcode"].astype(str).str.zfill(5)
)

In [29]:
# Persist the results.
# ZIP polygons -> GeoJSON.
# NOTE(review): this file is written to the current working directory, whereas
# the CSV below goes to ../data — confirm the location is intended.
nyc_zips.to_file(filename="nyc_zipcodes.geojson", driver="GeoJSON")

# Per-ZIP Airbnb + crime table -> CSV, without the index column.
merged_zip_data.to_csv(path_or_buf="../data/merged_zip_data.csv", index=False)