# ZAP (Zoning Application Portal) Data Integration

This notebook integrates ZAP BBL and Project data with the HPD multifamily finance dataset.

## Steps:
1. Load HPD data with all dates from output folder
2. Join with ZAP BBL data to get Zoning Project IDs
3. Join with ZAP Project data to get full project details
4. Handle multiple matches by prioritizing ULURP projects


In [1]:
import pandas as pd
import requests
import json
from pathlib import Path
import numpy as np
from datetime import datetime


## Step 1: Load HPD Data


In [2]:
# Load the HPD data with all dates
hpd_df = pd.read_csv('output/hpd_multifamily_finance_new_construction_with_all_dates.csv')

print(f"Loaded {len(hpd_df)} rows from HPD dataset")
print(f"Columns: {len(hpd_df.columns)}")
print(f"\nFirst few columns: {list(hpd_df.columns[:10])}")


Loaded 581 rows from HPD dataset
Columns: 52

First few columns: ['Project ID', 'Project Name', 'Project Start Date', 'Building ID', 'Number', 'Street', 'Borough', 'Postcode', 'BBL', 'BIN']


## Step 2: Normalize BBL for Matching


In [3]:
# Normalize BBL_str to ensure 10-digit zero-padded format
def normalize_bbl(bbl):
    """Convert BBL to 10-digit zero-padded string"""
    if pd.isna(bbl) or bbl is None:
        return None
    try:
        # Convert to int then string, then pad to 10 digits
        return str(int(float(bbl))).zfill(10)
    except (ValueError, TypeError):
        bbl_str = str(bbl).strip()
        if bbl_str in ('None', 'nan', '', 'NaN'):
            return None
        if bbl_str.isdigit() and len(bbl_str) <= 10:
            return bbl_str.zfill(10)
        return None

# Create normalized BBL column
hpd_df['bbl_normalized'] = hpd_df['BBL_str'].apply(normalize_bbl)

# Check how many valid BBLs we have
valid_bbls = hpd_df['bbl_normalized'].notna().sum()
print(f"Rows with valid BBL: {valid_bbls} out of {len(hpd_df)} ({valid_bbls/len(hpd_df)*100:.1f}%)")


Rows with valid BBL: 548 out of 581 (94.3%)


## Step 3: Fetch ZAP BBL Data


In [4]:
# Fetch ZAP BBL data with pagination to get all records
zap_bbl_url = "https://data.cityofnewyork.us/resource/2iga-a6mk.json"

print("Fetching ZAP BBL data (with pagination)...")
zap_bbl_data = []
limit = 50000  # Socrata API max per request
offset = 0

while True:
    response = requests.get(zap_bbl_url, params={"$limit": limit, "$offset": offset})
    response.raise_for_status()
    batch = response.json()
    
    if len(batch) == 0:
        break
    
    zap_bbl_data.extend(batch)
    print(f"  Fetched {len(batch)} records (total: {len(zap_bbl_data)})")
    
    if len(batch) < limit:
        break
    
    offset += limit

print(f"\nFetched {len(zap_bbl_data)} total records from ZAP BBL dataset")

# Convert to DataFrame
zap_bbl_df = pd.DataFrame(zap_bbl_data)

# Normalize BBL in ZAP data
zap_bbl_df['bbl_normalized'] = zap_bbl_df['bbl'].apply(normalize_bbl)

# Filter to only records with valid BBLs
zap_bbl_df = zap_bbl_df[zap_bbl_df['bbl_normalized'].notna()].copy()

print(f"ZAP BBL records with valid BBL: {len(zap_bbl_df)}")
print(f"\nSample ZAP BBL data:")
print(zap_bbl_df[['project_id', 'bbl', 'bbl_normalized', 'validated']].head())


Fetching ZAP BBL data (with pagination)...
  Fetched 50000 records (total: 50000)
  Fetched 50000 records (total: 100000)
  Fetched 31165 records (total: 131165)

Fetched 131165 total records from ZAP BBL dataset
ZAP BBL records with valid BBL: 130021

Sample ZAP BBL data:
   project_id         bbl bbl_normalized validated
0  P2017X0311  2030430008     2030430008      true
1  P2017X0311  2030430030     2030430030      true
2   2025K0154  3019900040     3019900040      true
3   2025K0339  3015460001     3015460001      true
4   2024M0432  1017740001     1017740001      true


## Step 4: Join HPD with ZAP BBL Data


In [5]:
# Check for multiple project_ids per BBL in ZAP data
bbl_project_counts = zap_bbl_df.groupby('bbl_normalized')['project_id'].nunique()
multiple_projects = bbl_project_counts[bbl_project_counts > 1]

print(f"BBLs with multiple project_ids in ZAP: {len(multiple_projects)}")
if len(multiple_projects) > 0:
    print(f"\nSample BBLs with multiple projects:")
    for bbl in multiple_projects.head(10).index:
        projects = zap_bbl_df[zap_bbl_df['bbl_normalized'] == bbl]['project_id'].unique()
        print(f"  BBL {bbl}: {len(projects)} projects - {list(projects)}")
    
    # Check if the specific BBL the user mentioned is in the list
    test_bbl = '3070610027'
    if test_bbl in multiple_projects.index:
        projects = zap_bbl_df[zap_bbl_df['bbl_normalized'] == test_bbl]['project_id'].unique()
        print(f"\n  ✓ BBL {test_bbl} correctly identified with {len(projects)} projects: {list(projects)}")
    else:
        print(f"\n  ✗ BBL {test_bbl} NOT in multiple_projects list (should be checked)")

# Note: We'll handle multiple matches after fetching ZAP Project data
# For now, just note that we need to deduplicate before joining
print(f"\nNote: Will handle multiple project_ids per BBL after fetching ZAP Project data (prioritize ULURP)")


BBLs with multiple project_ids in ZAP: 15252

Sample BBLs with multiple projects:
  BBL 1000000000: 12 projects - ['P2013M0121', 'P2013M0522', 'P2017M0256', 'P2018M0316', 'P2012M0178', 'P2014M0169', 'P2015M0004', 'P2014M0457', 'P2017M0403', 'P2012M0455', 'P2018M0250', 'P2017M0204']
  BBL 1000010010: 7 projects - ['2023M0374', '2025M0365', 'P2013M0181', 'P2015M0142', 'P2015M0559', 'P1996M0440', 'P2018M0255']
  BBL 1000010201: 2 projects - ['P1994M0190', 'P1994M0189']
  BBL 1000020001: 3 projects - ['P1999M0236', 'P1996M0145', 'P2005M0415']
  BBL 1000030001: 11 projects - ['P1997M0356', 'P1985M0723', 'P1996M0145', 'P1984M0806', 'P1985M0442', 'P1988M0337', 'P1988M0486', 'P1999M0236', 'P1981M0644', 'P1986M0971', 'P1989M0432']
  BBL 1000030002: 3 projects - ['P1999M0236', 'P2005M0415', 'P1996M0145']
  BBL 1000030003: 3 projects - ['P1999M0236', 'P1993M0340', 'P2005M0415']
  BBL 1000047501: 3 projects - ['P2002M0202', 'P2014M0256', 'P2003M0306']
  BBL 1000050010: 6 projects - ['2023M0432', '

## Step 5: Fetch ZAP Project Data


In [6]:
# Fetch ZAP Project data with pagination to get all records
zap_project_url = "https://data.cityofnewyork.us/resource/hgx4-8ukb.json"

print("Fetching ZAP Project data (with pagination)...")
zap_project_data = []
limit = 50000  # Socrata API max per request
offset = 0

while True:
    response = requests.get(zap_project_url, params={"$limit": limit, "$offset": offset})
    response.raise_for_status()
    batch = response.json()
    
    if len(batch) == 0:
        break
    
    zap_project_data.extend(batch)
    print(f"  Fetched {len(batch)} records (total: {len(zap_project_data)})")
    
    if len(batch) < limit:
        break
    
    offset += limit

print(f"\nFetched {len(zap_project_data)} total records from ZAP Project dataset")

# Convert to DataFrame
zap_project_df = pd.DataFrame(zap_project_data)

print(f"\nZAP Project columns: {list(zap_project_df.columns)}")
print(f"\nSample ZAP Project data:")
print(zap_project_df[['project_id', 'project_name', 'ulurp_non', 'project_status']].head())


Fetching ZAP Project data (with pagination)...
  Fetched 32836 records (total: 32836)

Fetched 32836 total records from ZAP Project dataset

ZAP Project columns: ['project_id', 'project_name', 'project_brief', 'project_status', 'public_status', 'ulurp_non', 'actions', 'ulurp_numbers', 'ceqr_number', 'primary_applicant', 'applicant_type', 'borough', 'community_district', 'flood_zone_a', 'flood_zone_shadedx', 'current_milestone', 'current_milestone_date', 'app_filed_date', 'certified_referred', 'mih_flag', 'mih_option1', 'mih_option2', 'mih_workforce', 'mih_deepaffordability', 'dcp_visibility', 'completed_date', 'ceqr_type', 'ceqr_leadagency', 'eas_eis', 'noticed_date', 'approval_date', 'cc_district', 'mih_mapped_no_res', 'current_envmilestone', 'current_envmilestone_date']

Sample ZAP Project data:
   project_id                                project_name  ulurp_non  \
0   2026K0149  Coney Island Business Improvement District  Non-ULURP   
1  P1985K0462          DISPOSITION OF CITY-OWNE

## Step 6: Join HPD with ALL ZAP BBL Matches


In [7]:
# Join HPD with ALL ZAP BBL matches (not deduplicated)
# This will create multiple rows for HPD buildings that match multiple ZAP projects
# Strategy: Keep all ZAP project matches, don't deduplicate

# Create a mapping: for each BBL, get all project_ids (for reporting)
bbl_to_projects = zap_bbl_df.groupby('bbl_normalized')['project_id'].apply(list).to_dict()

# Join HPD with ALL ZAP BBL matches (this will create multiple rows for multiple matches)
hpd_with_zap_bbl = pd.merge(
    hpd_df,
    zap_bbl_df[['bbl_normalized', 'project_id']].rename(columns={'project_id': 'Zoning Project ID'}),
    on='bbl_normalized',
    how='left'  # Keep all HPD rows, add ZAP matches where they exist
)

# Count matches
matched_count = hpd_with_zap_bbl['Zoning Project ID'].notna().sum()
total_hpd_rows = len(hpd_df)
total_rows_after_join = len(hpd_with_zap_bbl)

print(f"\n=== ZAP BBL Join Statistics ===")
print(f"Original HPD rows: {total_hpd_rows}")
print(f"Rows after join (with multiple matches): {total_rows_after_join}")
print(f"Rows with ZAP BBL match: {matched_count} ({matched_count/total_hpd_rows*100:.1f}% of original HPD rows)")
print(f"Rows without ZAP BBL match: {total_hpd_rows - (total_rows_after_join - (total_hpd_rows - matched_count))}")

# Count how many HPD buildings have multiple ZAP matches
hpd_buildings_with_multiple = hpd_with_zap_bbl[
    hpd_with_zap_bbl['Zoning Project ID'].notna()
].groupby('Building ID')['Zoning Project ID'].nunique()
hpd_buildings_with_multiple = hpd_buildings_with_multiple[hpd_buildings_with_multiple > 1]

print(f"\n=== Multiple Match Statistics ===")
print(f"HPD buildings with multiple ZAP project matches: {len(hpd_buildings_with_multiple)}")
print(f"Total ZAP matches (including duplicates): {matched_count}")
print(f"Unique HPD buildings matched: {hpd_with_zap_bbl[hpd_with_zap_bbl['Zoning Project ID'].notna()]['Building ID'].nunique()}")

# Show examples of buildings with multiple matches
if len(hpd_buildings_with_multiple) > 0:
    print(f"\nExample HPD buildings with multiple ZAP projects:")
    for building_id in hpd_buildings_with_multiple.head(5).index:
        building_matches = hpd_with_zap_bbl[
            (hpd_with_zap_bbl['Building ID'] == building_id) & 
            (hpd_with_zap_bbl['Zoning Project ID'].notna())
        ]
        bbl = building_matches['bbl_normalized'].iloc[0] if len(building_matches) > 0 else None
        projects = building_matches['Zoning Project ID'].unique().tolist()
        print(f"\n  Building ID {building_id} (BBL {bbl}): {len(projects)} ZAP projects")
        for project_id in projects:
            # Get ULURP status if available
            ulurp_info = zap_project_df[zap_project_df['project_id'] == project_id]['ulurp_non']
            ulurp_status = ulurp_info.iloc[0] if len(ulurp_info) > 0 else 'N/A'
            print(f"    - {project_id} (ulurp_non: {ulurp_status})")



=== ZAP BBL Join Statistics ===
Original HPD rows: 581
Rows after join (with multiple matches): 683
Rows with ZAP BBL match: 270 (46.5% of original HPD rows)
Rows without ZAP BBL match: 209

=== Multiple Match Statistics ===
HPD buildings with multiple ZAP project matches: 35
Total ZAP matches (including duplicates): 270
Unique HPD buildings matched: 167

Example HPD buildings with multiple ZAP projects:

  Building ID 32654.0 (BBL 1007620023): 2 ZAP projects
    - P2016M0041 (ulurp_non: Non-ULURP)
    - 2018M0412 (ulurp_non: Non-ULURP)

  Building ID 525826.0 (BBL 4022480228): 2 ZAP projects
    - P1991Q0589 (ulurp_non: nan)
    - P2016Q0293 (ulurp_non: ULURP)

  Building ID 927150.0 (BBL 1019110061): 3 ZAP projects
    - P2012M0277 (ulurp_non: ULURP)
    - P1994M0328 (ulurp_non: ULURP)
    - P1997M0333 (ulurp_non: ULURP)

  Building ID 927209.0 (BBL 3027850042): 2 ZAP projects
    - P2011K0047 (ulurp_non: ULURP)
    - P1997K0271 (ulurp_non: ULURP)

  Building ID 927737.0 (BBL 301556

In [8]:
# Join HPD data with ZAP Project data
# We'll add all columns from ZAP Project data with a prefix to avoid conflicts

# Prepare ZAP project data for join (rename columns to add prefix)
zap_project_renamed = zap_project_df.copy()
# Keep project_id as is for joining, but rename other columns
columns_to_rename = {col: f'ZAP_{col}' for col in zap_project_renamed.columns if col != 'project_id'}
zap_project_renamed = zap_project_renamed.rename(columns=columns_to_rename)

# Join on Zoning Project ID
hpd_final = pd.merge(
    hpd_with_zap_bbl,
    zap_project_renamed,
    left_on='Zoning Project ID',
    right_on='project_id',
    how='left'
)

# Drop the duplicate project_id column from ZAP (we already have 'Zoning Project ID')
if 'project_id' in hpd_final.columns:
    hpd_final = hpd_final.drop(columns=['project_id'])

print(f"\n=== Final Join Statistics ===")
print(f"Total rows (including multiple ZAP matches): {len(hpd_final)}")
print(f"Original HPD buildings: {hpd_df['Building ID'].nunique()}")
print(f"Rows with Zoning Project ID: {hpd_final['Zoning Project ID'].notna().sum()}")
print(f"Rows with ZAP Project data: {hpd_final['ZAP_project_name'].notna().sum()}")
print(f"Unique HPD buildings with ZAP data: {hpd_final[hpd_final['Zoning Project ID'].notna()]['Building ID'].nunique()}")

# Show new columns added
zap_columns = [col for col in hpd_final.columns if col.startswith('ZAP_')]
print(f"\nZAP columns added: {len(zap_columns)}")
print(f"Sample ZAP columns: {zap_columns[:10]}")



=== Final Join Statistics ===
Total rows (including multiple ZAP matches): 683
Original HPD buildings: 571
Rows with Zoning Project ID: 270
Rows with ZAP Project data: 270
Unique HPD buildings with ZAP data: 167

ZAP columns added: 34
Sample ZAP columns: ['ZAP_project_name', 'ZAP_project_brief', 'ZAP_project_status', 'ZAP_public_status', 'ZAP_ulurp_non', 'ZAP_actions', 'ZAP_ulurp_numbers', 'ZAP_ceqr_number', 'ZAP_primary_applicant', 'ZAP_applicant_type']


## Step 8: Summary Statistics


In [9]:
print("=== ZAP Data Integration Summary ===")
print(f"\nTotal HPD buildings: {len(hpd_final)}")
print(f"Buildings matched with ZAP BBL data: {hpd_final['Zoning Project ID'].notna().sum()} ({hpd_final['Zoning Project ID'].notna().sum()/len(hpd_final)*100:.1f}%)")
print(f"Buildings with full ZAP Project data: {hpd_final['ZAP_project_name'].notna().sum()} ({hpd_final['ZAP_project_name'].notna().sum()/len(hpd_final)*100:.1f}%)")

# Statistics on ULURP vs Non-ULURP
if hpd_final['ZAP_ulurp_non'].notna().any():
    ulurp_counts = hpd_final['ZAP_ulurp_non'].value_counts()
    print(f"\nULURP classification:")
    for ulurp_type, count in ulurp_counts.items():
        print(f"  {ulurp_type}: {count}")

# Show sample of matched data
print(f"\n=== Sample Matched Records ===")
matched_sample = hpd_final[hpd_final['Zoning Project ID'].notna()][
    ['Project ID', 'Building ID', 'Number', 'Street', 'Borough', 'Zoning Project ID', 
     'ZAP_project_name', 'ZAP_ulurp_non', 'ZAP_project_status']
].head(10)
print(matched_sample.to_string())

print(f"\n=== Data Shape ===")
print(f"Original HPD columns: {len(hpd_df.columns)}")
print(f"Final columns: {len(hpd_final.columns)}")
print(f"New columns added: {len(hpd_final.columns) - len(hpd_df.columns)}")


=== ZAP Data Integration Summary ===

Total HPD buildings: 683
Buildings matched with ZAP BBL data: 270 (39.5%)
Buildings with full ZAP Project data: 270 (39.5%)

ULURP classification:
  ULURP: 191
  Non-ULURP: 75

=== Sample Matched Records ===
   Project ID  Building ID Number        Street    Borough Zoning Project ID                ZAP_project_name ZAP_ulurp_non ZAP_project_status
0       44218     987329.0   2319      3 AVENUE  Manhattan         2025M0253    East Harlem 125th Street BID     Non-ULURP           Complete
1       44218     987329.0   2319      3 AVENUE  Manhattan         2025M0253    East Harlem 125th Street BID     Non-ULURP           Complete
2       44218     987329.0   2319      3 AVENUE  Manhattan         2025M0253    East Harlem 125th Street BID     Non-ULURP           Complete
3       44218     987329.0   2319      3 AVENUE  Manhattan         2025M0253    East Harlem 125th Street BID     Non-ULURP           Complete
4       44218     987329.0   2319      3 AVE

## Step 9: Save Results (Optional)


In [10]:
# Uncomment to save the results
output_filename = f'output/hpd_multifamily_finance_new_construction_with_zap_data_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv'
hpd_final.to_csv(output_filename, index=False)
print(f"Saved results to {output_filename}")


Saved results to output/hpd_multifamily_finance_new_construction_with_zap_data_20251203_144611.csv
