# ZAP (Zoning Application Portal) Data Integration

This notebook integrates ZAP BBL and Project data with the HPD multifamily finance dataset.

## Steps:
1. Load HPD data with all dates from output folder
2. Join with ZAP BBL data to get Zoning Project IDs
3. Join with ZAP Project data to get full project details
4. Handle multiple matches by prioritizing ULURP projects


In [14]:
import pandas as pd
import requests
import json
from pathlib import Path
import numpy as np
from datetime import datetime


## Step 1: Load HPD Data


In [15]:
# Load the HPD data with all dates
hpd_df = pd.read_csv('output/hpd_multifamily_finance_new_construction_with_all_dates.csv')

print(f"Loaded {len(hpd_df)} rows from HPD dataset")
print(f"Columns: {len(hpd_df.columns)}")
print(f"\nFirst few columns: {list(hpd_df.columns[:10])}")


Loaded 581 rows from HPD dataset
Columns: 52

First few columns: ['Project ID', 'Project Name', 'Project Start Date', 'Building ID', 'Number', 'Street', 'Borough', 'Postcode', 'BBL', 'BIN']


## Step 2: Normalize BBL for Matching


In [16]:
# Normalize BBL_str to ensure 10-digit zero-padded format
def normalize_bbl(bbl):
    """Convert BBL to 10-digit zero-padded string"""
    if pd.isna(bbl) or bbl is None:
        return None
    try:
        # Convert to int then string, then pad to 10 digits
        return str(int(float(bbl))).zfill(10)
    except (ValueError, TypeError):
        bbl_str = str(bbl).strip()
        if bbl_str in ('None', 'nan', '', 'NaN'):
            return None
        if bbl_str.isdigit() and len(bbl_str) <= 10:
            return bbl_str.zfill(10)
        return None

# Create normalized BBL column
hpd_df['bbl_normalized'] = hpd_df['BBL_str'].apply(normalize_bbl)

# Check how many valid BBLs we have
valid_bbls = hpd_df['bbl_normalized'].notna().sum()
print(f"Rows with valid BBL: {valid_bbls} out of {len(hpd_df)} ({valid_bbls/len(hpd_df)*100:.1f}%)")


Rows with valid BBL: 548 out of 581 (94.3%)


## Step 3: Fetch ZAP BBL Data


In [None]:
# Fetch ZAP BBL data with pagination to get all records
zap_bbl_url = "https://data.cityofnewyork.us/resource/2iga-a6mk.json"

print("Fetching ZAP BBL data (with pagination)...")
zap_bbl_data = []
limit = 50000  # Socrata API max per request
offset = 0

while True:
    response = requests.get(zap_bbl_url, params={"$limit": limit, "$offset": offset})
    response.raise_for_status()
    batch = response.json()
    
    if len(batch) == 0:
        break
    
    zap_bbl_data.extend(batch)
    print(f"  Fetched {len(batch)} records (total: {len(zap_bbl_data)})")
    
    if len(batch) < limit:
        break
    
    offset += limit

print(f"\nFetched {len(zap_bbl_data)} total records from ZAP BBL dataset")

# Convert to DataFrame
zap_bbl_df = pd.DataFrame(zap_bbl_data)

# Normalize BBL in ZAP data
zap_bbl_df['bbl_normalized'] = zap_bbl_df['bbl'].apply(normalize_bbl)

# Filter to only records with valid BBLs
zap_bbl_df = zap_bbl_df[zap_bbl_df['bbl_normalized'].notna()].copy()

print(f"ZAP BBL records with valid BBL: {len(zap_bbl_df)}")
print(f"\nSample ZAP BBL data:")
print(zap_bbl_df[['project_id', 'bbl', 'bbl_normalized', 'validated']].head())


Fetching ZAP BBL data...
Fetched 50000 records from ZAP BBL dataset
ZAP BBL records with valid BBL: 48999

Sample ZAP BBL data:
   project_id         bbl bbl_normalized validated
0  P2017X0311  2030430008     2030430008      true
1  P2017X0311  2030430030     2030430030      true
2   2025K0154  3019900040     3019900040      true
3   2025K0339  3015460001     3015460001      true
4   2024M0432  1017740001     1017740001      true


## Step 4: Join HPD with ZAP BBL Data


In [None]:
# Check for multiple project_ids per BBL in ZAP data
bbl_project_counts = zap_bbl_df.groupby('bbl_normalized')['project_id'].nunique()
multiple_projects = bbl_project_counts[bbl_project_counts > 1]

print(f"BBLs with multiple project_ids in ZAP: {len(multiple_projects)}")
if len(multiple_projects) > 0:
    print(f"\nSample BBLs with multiple projects:")
    for bbl in multiple_projects.head(10).index:
        projects = zap_bbl_df[zap_bbl_df['bbl_normalized'] == bbl]['project_id'].unique()
        print(f"  BBL {bbl}: {len(projects)} projects - {list(projects)}")
    
    # Check if the specific BBL the user mentioned is in the list
    test_bbl = '3070610027'
    if test_bbl in multiple_projects.index:
        projects = zap_bbl_df[zap_bbl_df['bbl_normalized'] == test_bbl]['project_id'].unique()
        print(f"\n  ✓ BBL {test_bbl} correctly identified with {len(projects)} projects: {list(projects)}")
    else:
        print(f"\n  ✗ BBL {test_bbl} NOT in multiple_projects list (should be checked)")

# Note: We'll handle multiple matches after fetching ZAP Project data
# For now, just note that we need to deduplicate before joining
print(f"\nNote: Will handle multiple project_ids per BBL after fetching ZAP Project data (prioritize ULURP)")


BBLs with multiple project_ids in ZAP: 796

Sample BBLs with multiple projects:
  BBL 1000730008: 4 projects - ['2019M0371', '2021M0224', '2021M0422', '2022M0253']
  BBL 1000730010: 5 projects - ['2021M0224', '2019M0371', '2021M0422', '2022M0211', '2022M0253']
  BBL 1000730011: 5 projects - ['2021M0224', '2019M0371', '2021M0422', '2022M0211', '2022M0253']
  BBL 1000980001: 3 projects - ['2021M0224', '2022M0211', '2022M0253']
  BBL 1005317505: 2 projects - ['2018M0375', 'P2012M0582']

Note: Will handle multiple project_ids per BBL after fetching ZAP Project data (prioritize ULURP)


## Step 5: Fetch ZAP Project Data


In [None]:
# Fetch ZAP Project data with pagination to get all records
zap_project_url = "https://data.cityofnewyork.us/resource/hgx4-8ukb.json"

print("Fetching ZAP Project data (with pagination)...")
zap_project_data = []
limit = 50000  # Socrata API max per request
offset = 0

while True:
    response = requests.get(zap_project_url, params={"$limit": limit, "$offset": offset})
    response.raise_for_status()
    batch = response.json()
    
    if len(batch) == 0:
        break
    
    zap_project_data.extend(batch)
    print(f"  Fetched {len(batch)} records (total: {len(zap_project_data)})")
    
    if len(batch) < limit:
        break
    
    offset += limit

print(f"\nFetched {len(zap_project_data)} total records from ZAP Project dataset")

# Convert to DataFrame
zap_project_df = pd.DataFrame(zap_project_data)

print(f"\nZAP Project columns: {list(zap_project_df.columns)}")
print(f"\nSample ZAP Project data:")
print(zap_project_df[['project_id', 'project_name', 'ulurp_non', 'project_status']].head())


Fetching ZAP Project data...
Fetched 32836 records from ZAP Project dataset

ZAP Project columns: ['project_id', 'project_name', 'project_brief', 'project_status', 'public_status', 'ulurp_non', 'actions', 'ulurp_numbers', 'ceqr_number', 'primary_applicant', 'applicant_type', 'borough', 'community_district', 'flood_zone_a', 'flood_zone_shadedx', 'current_milestone', 'current_milestone_date', 'app_filed_date', 'certified_referred', 'mih_flag', 'mih_option1', 'mih_option2', 'mih_workforce', 'mih_deepaffordability', 'dcp_visibility', 'completed_date', 'ceqr_type', 'ceqr_leadagency', 'eas_eis', 'noticed_date', 'approval_date', 'cc_district', 'mih_mapped_no_res', 'current_envmilestone', 'current_envmilestone_date']

Sample ZAP Project data:
   project_id                                project_name  ulurp_non  \
0   2026K0149  Coney Island Business Improvement District  Non-ULURP   
1  P1985K0462          DISPOSITION OF CITY-OWNED PROPERTY      ULURP   
2   2026K0202        130 St. Felix Spec

## Step 6: Handle Multiple Matches and Prioritize ULURP


In [None]:
# Handle multiple project_ids per BBL by prioritizing ULURP projects
# Strategy: For each BBL, if it has multiple project_ids, select the one with ULURP in ulurp_non column

# Create a mapping: for each BBL, get all project_ids
bbl_to_projects = zap_bbl_df.groupby('bbl_normalized')['project_id'].apply(list).to_dict()

# For BBLs with multiple projects, check which ones are ULURP
def get_preferred_project_id(bbl_norm, zap_project_df):
    """
    For a given BBL, if it has multiple project_ids, return the one with ULURP in ulurp_non column.
    If none have ULURP, return the first one.
    """
    if bbl_norm is None or pd.isna(bbl_norm):
        return None
    
    if bbl_norm not in bbl_to_projects:
        return None
    
    project_ids = bbl_to_projects[bbl_norm]
    
    if len(project_ids) == 1:
        return project_ids[0]
    
    # Multiple projects - check for ULURP
    # Get ulurp_non values for these projects
    project_ulurp = zap_project_df[zap_project_df['project_id'].isin(project_ids)][['project_id', 'ulurp_non']].copy()
    
    # Check if any have ULURP (case-insensitive equality - must be exactly "ULURP", not "Non-ULURP")
    # Values can be "ULURP" or "Non-ULURP", so we check for exact match (case-insensitive)
    ulurp_projects = project_ulurp[
        project_ulurp['ulurp_non'].astype(str).str.strip().str.upper() == 'ULURP'
    ]
    
    if len(ulurp_projects) > 0:
        # Return first ULURP project
        return ulurp_projects.iloc[0]['project_id']
    else:
        # No ULURP, return first project
        return project_ids[0]

# Create a deduplicated ZAP BBL mapping with preferred project_id for each BBL
zap_bbl_deduped = zap_bbl_df.copy()
zap_bbl_deduped['preferred_project_id'] = zap_bbl_deduped['bbl_normalized'].apply(
    lambda x: get_preferred_project_id(x, zap_project_df)
)

# Keep only one row per BBL (with the preferred project_id)
zap_bbl_deduped = zap_bbl_deduped[
    zap_bbl_deduped['project_id'] == zap_bbl_deduped['preferred_project_id']
].drop_duplicates(subset=['bbl_normalized'], keep='first')

# Now join HPD with deduplicated ZAP BBL data
hpd_with_zap_bbl = pd.merge(
    hpd_df,
    zap_bbl_deduped[['bbl_normalized', 'preferred_project_id']].rename(columns={'preferred_project_id': 'Zoning Project ID'}),
    on='bbl_normalized',
    how='left'
)

# Count matches
matched_count = hpd_with_zap_bbl['Zoning Project ID'].notna().sum()
total_count = len(hpd_with_zap_bbl)

print(f"\n=== ZAP BBL Join Statistics ===")
print(f"Total HPD rows: {total_count}")
print(f"Rows matched with ZAP BBL: {matched_count} ({matched_count/total_count*100:.1f}%)")
print(f"Rows without ZAP BBL match: {total_count - matched_count} ({(total_count-matched_count)/total_count*100:.1f}%)")

# Count how many had multiple matches and were prioritized
bbls_with_multiple = [bbl for bbl, projs in bbl_to_projects.items() if len(projs) > 1]
hpd_rows_with_multiple = hpd_with_zap_bbl[hpd_with_zap_bbl['bbl_normalized'].isin(bbls_with_multiple)]

print(f"\n=== Multiple Match Handling ===")
print(f"BBLs with multiple ZAP projects: {len(bbls_with_multiple)}")
print(f"HPD rows affected by multiple matches: {len(hpd_rows_with_multiple)}")

# Show examples of prioritization
if len(bbls_with_multiple) > 0:
    print(f"\nExample BBLs with multiple projects:")
    for bbl in list(bbls_with_multiple)[:5]:
        all_projects = bbl_to_projects[bbl]
        preferred = get_preferred_project_id(bbl, zap_project_df)
        project_info = zap_project_df[zap_project_df['project_id'].isin(all_projects)][['project_id', 'ulurp_non']].drop_duplicates()
        print(f"\n  BBL {bbl}:")
        for _, row in project_info.iterrows():
            marker = " <-- SELECTED" if row['project_id'] == preferred else ""
            print(f"    Project {row['project_id']}: ulurp_non = {row['ulurp_non']}{marker}")



=== ZAP BBL Join Statistics ===
Total HPD rows: 581
Rows matched with ZAP BBL: 31 (5.3%)
Rows without ZAP BBL match: 550 (94.7%)

=== Multiple Match Handling ===
BBLs with multiple ZAP projects: 909
HPD rows affected by multiple matches: 4

Example BBLs with multiple projects:

  BBL 1000730008:
    Project 2019M0371: ulurp_non = Non-ULURP
    Project 2022M0253: ulurp_non = Non-ULURP
    Project 2021M0224: ulurp_non = ULURP <-- SELECTED
    Project 2021M0422: ulurp_non = ULURP

  BBL 1000730010:
    Project 2019M0371: ulurp_non = Non-ULURP
    Project 2022M0211: ulurp_non = Non-ULURP
    Project 2022M0253: ulurp_non = Non-ULURP
    Project 2021M0224: ulurp_non = ULURP <-- SELECTED
    Project 2021M0422: ulurp_non = ULURP

  BBL 1000730011:
    Project 2019M0371: ulurp_non = Non-ULURP
    Project 2022M0211: ulurp_non = Non-ULURP
    Project 2022M0253: ulurp_non = Non-ULURP
    Project 2021M0224: ulurp_non = ULURP <-- SELECTED
    Project 2021M0422: ulurp_non = ULURP

  BBL 1000980001:


In [21]:
# Join HPD data with ZAP Project data
# We'll add all columns from ZAP Project data with a prefix to avoid conflicts

# Prepare ZAP project data for join (rename columns to add prefix)
zap_project_renamed = zap_project_df.copy()
# Keep project_id as is for joining, but rename other columns
columns_to_rename = {col: f'ZAP_{col}' for col in zap_project_renamed.columns if col != 'project_id'}
zap_project_renamed = zap_project_renamed.rename(columns=columns_to_rename)

# Join on Zoning Project ID
hpd_final = pd.merge(
    hpd_with_zap_bbl,
    zap_project_renamed,
    left_on='Zoning Project ID',
    right_on='project_id',
    how='left'
)

# Drop the duplicate project_id column from ZAP (we already have 'Zoning Project ID')
if 'project_id' in hpd_final.columns:
    hpd_final = hpd_final.drop(columns=['project_id'])

print(f"\n=== Final Join Statistics ===")
print(f"Total HPD rows: {len(hpd_final)}")
print(f"Rows with Zoning Project ID: {hpd_final['Zoning Project ID'].notna().sum()}")
print(f"Rows with ZAP Project data: {hpd_final['ZAP_project_name'].notna().sum()}")

# Show new columns added
zap_columns = [col for col in hpd_final.columns if col.startswith('ZAP_')]
print(f"\nZAP columns added: {len(zap_columns)}")
print(f"Sample ZAP columns: {zap_columns[:10]}")



=== Final Join Statistics ===
Total HPD rows: 581
Rows with Zoning Project ID: 31
Rows with ZAP Project data: 31

ZAP columns added: 34
Sample ZAP columns: ['ZAP_project_name', 'ZAP_project_brief', 'ZAP_project_status', 'ZAP_public_status', 'ZAP_ulurp_non', 'ZAP_actions', 'ZAP_ulurp_numbers', 'ZAP_ceqr_number', 'ZAP_primary_applicant', 'ZAP_applicant_type']


## Step 8: Summary Statistics


In [22]:
print("=== ZAP Data Integration Summary ===")
print(f"\nTotal HPD buildings: {len(hpd_final)}")
print(f"Buildings matched with ZAP BBL data: {hpd_final['Zoning Project ID'].notna().sum()} ({hpd_final['Zoning Project ID'].notna().sum()/len(hpd_final)*100:.1f}%)")
print(f"Buildings with full ZAP Project data: {hpd_final['ZAP_project_name'].notna().sum()} ({hpd_final['ZAP_project_name'].notna().sum()/len(hpd_final)*100:.1f}%)")

# Statistics on ULURP vs Non-ULURP
if hpd_final['ZAP_ulurp_non'].notna().any():
    ulurp_counts = hpd_final['ZAP_ulurp_non'].value_counts()
    print(f"\nULURP classification:")
    for ulurp_type, count in ulurp_counts.items():
        print(f"  {ulurp_type}: {count}")

# Show sample of matched data
print(f"\n=== Sample Matched Records ===")
matched_sample = hpd_final[hpd_final['Zoning Project ID'].notna()][
    ['Project ID', 'Building ID', 'Number', 'Street', 'Borough', 'Zoning Project ID', 
     'ZAP_project_name', 'ZAP_ulurp_non', 'ZAP_project_status']
].head(10)
print(matched_sample.to_string())

print(f"\n=== Data Shape ===")
print(f"Original HPD columns: {len(hpd_df.columns)}")
print(f"Final columns: {len(hpd_final.columns)}")
print(f"New columns added: {len(hpd_final.columns) - len(hpd_df.columns)}")


=== ZAP Data Integration Summary ===

Total HPD buildings: 581
Buildings matched with ZAP BBL data: 31 (5.3%)
Buildings with full ZAP Project data: 31 (5.3%)

ULURP classification:
  Non-ULURP: 20
  ULURP: 11

=== Sample Matched Records ===
    Project ID  Building ID  Number           Street    Borough Zoning Project ID                                  ZAP_project_name ZAP_ulurp_non ZAP_project_status
0        44218     987329.0    2319         3 AVENUE  Manhattan         2025M0253                      East Harlem 125th Street BID     Non-ULURP           Complete
5        44225     927748.0     469  EAST 147 STREET      Bronx        P2012X0221                                      BROOK AVENUE     Non-ULURP           Complete
6        44225     955261.0     455  EAST 147 STREET      Bronx        P2012X0221                                      BROOK AVENUE     Non-ULURP           Complete
60       49938     947501.0      33     EAGLE STREET   Brooklyn         2021K0402   Greenpoint Land

## Step 9: Save Results (Optional)


In [23]:
# Uncomment to save the results
output_filename = f'output/hpd_multifamily_finance_new_construction_with_zap_data_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv'
hpd_final.to_csv(output_filename, index=False)
print(f"Saved results to {output_filename}")


Saved results to output/hpd_multifamily_finance_new_construction_with_zap_data_20251203_132601.csv
