# Housing Data Workflow Notebook

Modular workflow where you can run individual steps independently.
Run cells in order or skip any steps you don't need.

Each step shows dataframe views and statistics for inspection.

## Quick Start
- Run **Setup** cell first
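- Then run the Step 1-4 cells you need, in order
- Skip cells you don't want to execute, but note that later steps read variables set by earlier ones (for example, Step 3A reads `hpd_csv` from Step 1)
- Each cell prints its own results for inspection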

## 🔧 Setup

Run this cell first to import modules and define helper functions.

In [None]:
import sys
from pathlib import Path
from typing import Optional
import time

import pandas as pd

# Add current directory to path for local imports
sys.path.append(".")

# Import our workflow modules
from fetch_affordable_housing_data import update_local_data, verify_and_fetch_hpd_data
from query_ll44_funding import query_and_add_financing
from query_dob_filings import query_dob_bisweb_bin, query_dob_bisweb_bbl, query_dobnow_bin, query_dobnow_bbl, decompose_bbl, query_condo_lots_for_bbl, query_dob_by_address, pad_block, pad_lot
from query_co_filings import query_co_filings
from HPD_DOB_Join_On_BIN import create_separate_timelines
from create_timeline_chart import create_timeline_chart, create_financing_charts
from data_quality import quality_tracker

print("✅ All imports successful")

# Helper functions
def _normalize_bin(bin_value) -> Optional[str]:
    """Normalize a BIN to a clean string.

    Numeric-looking values (``1234567``, ``1234567.0``, ``"1234567.0"``) are
    returned as their integer string. Values that cannot be parsed as a number
    are returned with surrounding whitespace stripped.

    Returns:
        The normalized BIN, or ``None`` when the value is missing, blank, or a
        non-finite number (NaN / infinity, including the strings ``"nan"``
        and ``"inf"``).
    """
    import math  # local import: only this helper needs it

    if pd.isna(bin_value):
        return None
    try:
        number = float(bin_value)
    except (TypeError, ValueError):
        value = str(bin_value).strip()
        return value or None
    # int() raises ValueError on NaN and OverflowError on infinity, so reject
    # non-finite values explicitly instead of leaking "nan" as a BIN.
    if not math.isfinite(number):
        return None
    return str(int(number))

def _write_bin_file(source_csv: Path, output_txt: Path) -> Path:
    """Write the distinct BINs found in a CSV to a text file, one per line.

    The first column named ``bin`` or ``bin_normalized`` (case-insensitive)
    supplies the values. Each value is normalized with ``_normalize_bin``;
    empty results are dropped and the remainder is sorted before writing.

    Raises:
        SystemExit: if the CSV has no BIN column.

    Returns:
        ``output_txt``, after its parent directory has been created and the
        file written.
    """
    frame = pd.read_csv(source_csv)
    bin_column = next(
        (name for name in frame.columns if name.lower() in ("bin", "bin_normalized")),
        None,
    )
    if bin_column is None:
        raise SystemExit(f"Could not find a BIN column in {source_csv}")

    unique_bins = set()
    for raw_value in frame[bin_column].dropna():
        cleaned = _normalize_bin(raw_value)
        if cleaned:
            unique_bins.add(cleaned)
    bins = sorted(unique_bins)

    output_txt.parent.mkdir(parents=True, exist_ok=True)
    output_txt.write_text("\n".join(bins))
    print(f"Wrote {len(bins)} BINs to {output_txt}")
    return output_txt

print("✅ Helper functions defined")

## 📥 Step 1: Fetch HPD Data

Load or refresh the HPD affordable housing dataset.

**Options:**
- Set `refresh_data = True` to fetch fresh data
- Set `refresh_data = False` to use existing data
- Set `refresh_hpd_projects = True` to force a refresh of the HPD projects cache first

In [None]:
# Step 1 Configuration
refresh_data = False  # Set to True to fetch fresh HPD data
hpd_output_path = "data/raw/Affordable_Housing_Production_by_Building.csv"  # Output path for HPD data
refresh_hpd_projects = False  # Set to True to fetch fresh HPD projects data


def _pct(part, whole):
    """Return part/whole as a percentage, or 0.0 when `whole` is zero."""
    return part / whole * 100 if whole else 0.0


print("=" * 70)
print("STEP 1: FETCH HPD DATA")
print("=" * 70)

# Handle HPD projects cache refresh if requested
if refresh_hpd_projects:
    print("Force refreshing HPD projects cache...")
    from fetch_affordable_housing_data import verify_and_fetch_hpd_projects_data
    hpd_projects_df, hpd_projects_path = verify_and_fetch_hpd_projects_data(use_existing=False)
    print(f"HPD projects cache refreshed: {len(hpd_projects_df)} records\n")

# Start quality tracking
quality_tracker.start_processing()

if refresh_data:
    print("Fetching fresh HPD data from NYC Open Data...")
    hpd_df, hpd_csv = update_local_data(hpd_output_path)
else:
    print("Verifying local HPD data against API...")
    # When the projects cache was just refreshed above, do not reuse the cached copy.
    hpd_df, hpd_csv = verify_and_fetch_hpd_data(output_path=hpd_output_path, use_projects_cache=not refresh_hpd_projects)

if not hpd_csv.exists():
    raise SystemExit(f"HPD dataset not found at {hpd_csv}")

# Totals before filtering, used for the percentages printed below
original_count = len(hpd_df)
original_units = hpd_df['Total Units'].sum()

# Filter to New Construction only
hpd_df = hpd_df[hpd_df["Reporting Construction Type"] == "New Construction"].copy()

filtered_count = len(hpd_df)
filtered_units = hpd_df['Total Units'].sum()
filtered_out = original_count - filtered_count
filtered_units_out = original_units - filtered_units

# _pct() guards against an empty dataset, where the plain divisions would
# raise ZeroDivisionError (or print nan for the unit shares).
print(f"🏗️ Filtered to New Construction only:")
print(f"  Original: {original_count:,} projects, {original_units:,} total units")
print(f"  Filtered: {filtered_count:,} projects ({_pct(filtered_count, original_count):.1f}%), {filtered_units:,} total units ({_pct(filtered_units, original_units):.1f}%)")
print(f"  Removed: {filtered_out:,} non-new construction projects ({_pct(filtered_out, original_count):.1f}%), {filtered_units_out:,} units filtered out ({_pct(filtered_units_out, original_units):.1f}%)")

# NOTE(review): the tracker stage is labelled "raw"/"Full" but is recorded
# after the New Construction filter - confirm this is the intended baseline.
quality_tracker.analyze_hpd_data(hpd_df, "Full_HPD_Dataset")
quality_tracker.record_pipeline_stage("raw_hpd_data", len(hpd_df), "Raw HPD affordable housing dataset")

print(f"✅ Step 1 complete: {len(hpd_df):,} records loaded")
print(f"📁 Data location: {hpd_csv}")

# Display the dataframe
print("\n🔍 HPD Dataset Overview:")
print(f"Shape: {hpd_df.shape}")
print("\nColumns:")
for col in hpd_df.columns:
    print(f"  - {col}")

print("\n📊 Sample Data:")
display(hpd_df.head())
print("\n📈 Basic Statistics:")
display(hpd_df.describe(include="all"))

In [None]:
# Count rows and unique projects (Project ID as primary key) per Program Group.
# Total units are shown in parentheses for the raw row counts only, not for
# the unique project counts.

# Total units per Program Group (all rows)
units_per_group = hpd_df.groupby('Program Group')['Total Units'].sum()

print("Program Group counts (raw rows) (total units in parentheses):")
raw_row_counts = hpd_df['Program Group'].value_counts()
for group, count in raw_row_counts.items():
    units = units_per_group.get(group, 0)
    print(f"{group}: {count} rows ({units} units)")
print()

# Unique Project IDs per Program Group. SeriesGroupBy.unique()/nunique() are
# used directly; the former groupby.apply(lambda ...) form emits a pandas
# deprecation warning for the same result.
projects_by_group = hpd_df.groupby('Program Group')['Project ID']
unique_proj_counts = projects_by_group.nunique().sort_values(ascending=False)
unique_proj_ids = projects_by_group.unique()

print("Program Group counts (unique Project ID as primary key):")
for group, count in unique_proj_counts.items():
    print(f"{group}: {count} projects")
print()

print("\nTax Abatement by Program Group (based on unique Project ID):")

# One row per project. Built unconditionally because every table below reads
# it; previously it was only bound inside the first `if`, so a missing
# 'Planned Tax Benefit' column caused a NameError further down.
# NOTE(review): drop_duplicates keeps only the first row per Project ID, so
# 'Total Units' sums below count one row per project - confirm that is intended
# if the dataset has several building rows per project.
unique_project_rows = hpd_df.drop_duplicates(subset=['Project ID'])
has_tax_benefit = 'Planned Tax Benefit' in unique_project_rows.columns
has_program_group = 'Program Group' in unique_project_rows.columns

if has_tax_benefit:
    tax_abate_ct = (
        unique_project_rows
        .groupby('Program Group')['Planned Tax Benefit']
        .value_counts(dropna=False)
        .unstack(fill_value=0)
        .sort_index(axis=1)
    )
    # Also display total units per Program Group alongside the count table
    units_per_group_project = unique_project_rows.groupby('Program Group')['Total Units'].sum()
    print("Total units (unique Project ID per Program Group):")
    display(units_per_group_project)
    display(tax_abate_ct)
else:
    print("Column 'Planned Tax Benefit' not found in dataset.")

# Unit count by program and tax benefit.
# groupby drops rows whose 'Planned Tax Benefit' is missing, unlike the count
# table above, which uses value_counts(dropna=False).
if has_tax_benefit and has_program_group:
    units_pivot = (
        unique_project_rows
        .groupby(['Program Group', 'Planned Tax Benefit'])['Total Units']
        .sum()
        .unstack(fill_value=0)
        .sort_index(axis=1)
    )
    print("Total units by Program Group and Planned Tax Benefit (unique Project ID only):")
    display(units_pivot)
else:
    print("Required columns not found for unit pivot table.")

# Average units per year by Program Group and Planned Tax Benefit.
# The grouping columns are checked too, since the groupby below needs them.
if (
    has_tax_benefit
    and has_program_group
    and 'Project Start Date' in unique_project_rows.columns
    and 'Total Units' in unique_project_rows.columns
):
    # Extract year from 'Project Start Date' (unparseable dates become NaN)
    unique_project_rows = unique_project_rows.copy()
    unique_project_rows['Project Year'] = pd.to_datetime(unique_project_rows['Project Start Date'], errors='coerce').dt.year

    # Step 1: total units per (program, tax benefit, year)
    avg_units_per_year = (
        unique_project_rows
        .groupby(['Program Group', 'Planned Tax Benefit', 'Project Year'])['Total Units']
        .sum()
        .reset_index()
    )

    # Step 2: average those yearly totals per (program, tax benefit)
    avg_units_table = (
        avg_units_per_year
        .groupby(['Program Group', 'Planned Tax Benefit'])['Total Units']
        .mean()
        .unstack(fill_value=0)
        .sort_index(axis=1)
    )
    print("Average units per year by Program Group and Planned Tax Benefit (unique Project ID only):")
    display(avg_units_table)
else:
    print("Required columns not found for average units per year table.")



In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

# Grouped + stacked bar chart: one pair of bars per start year (Finance vs.
# Incentives program), each bar stacked by Planned Tax Benefit.
# Uses hpd_df as it stands in the notebook, restricted to the two programs below.
if 'Project Start Date' in hpd_df.columns and 'Total Units' in hpd_df.columns:
    hpd_bar_df = hpd_df.copy()
    hpd_bar_df['Project Year'] = pd.to_datetime(hpd_bar_df['Project Start Date'], errors='coerce').dt.year

    # Only focus on desired groups; this order also fixes left/right bar placement
    program_order = ['Multifamily Finance Program', 'Multifamily Incentives Program']
    programs_of_interest = program_order
    mask = hpd_bar_df['Program Group'].isin(programs_of_interest) & hpd_bar_df['Project Year'].notna()
    # .copy() so the column assignment below does not write into a slice
    hpd_bar_df = hpd_bar_df[mask].copy()

    # Missing tax benefits get their own "None" category instead of being dropped
    hpd_bar_df['Planned Tax Benefit'] = hpd_bar_df['Planned Tax Benefit'].fillna('None')

    # Long table: units per (year, program, tax benefit)
    pivot = (
        hpd_bar_df
        .groupby(['Project Year', 'Program Group', 'Planned Tax Benefit'])['Total Units']
        .sum()
        .reset_index()
    )

    years = sorted(pivot['Project Year'].unique())
    tax_benefits = sorted(pivot['Planned Tax Benefit'].unique())

    if not years:
        # Nothing to plot; previously this path failed with IndexError on x[0]
        print("No projects with a valid start year found for the selected program groups.")
    else:
        # bar_data[year][program] -> units per tax benefit, aligned with tax_benefits
        bar_data = {}
        for year in years:
            year_rows = pivot[pivot['Project Year'] == year]
            bar_data[year] = {}
            for prog in program_order:
                prog_rows = year_rows[year_rows['Program Group'] == prog]
                bar_data[year][prog] = (
                    prog_rows.set_index('Planned Tax Benefit')['Total Units']
                    .reindex(tax_benefits, fill_value=0)
                    .values
                )

        x = range(len(years))
        width = 0.35

        fig, ax = plt.subplots(figsize=(14, 7))

        # One color per planned tax benefit. plt.get_cmap replaces
        # matplotlib.cm.get_cmap, which is deprecated/removed in newer releases.
        color_map = plt.get_cmap('tab20', len(tax_benefits))
        colors = [color_map(i) for i in range(len(tax_benefits))]

        bottoms_p1 = [0] * len(years)
        bottoms_p2 = [0] * len(years)

        # For each tax benefit, draw the stack piece for both programs
        for idx, tax in enumerate(tax_benefits):
            values_p1 = [bar_data[year][program_order[0]][idx] for year in years]
            values_p2 = [bar_data[year][program_order[1]][idx] for year in years]

            # Hatched bars = first program, plain bars = second program
            ax.bar(
                [i - width/2 for i in x], values_p1, width,
                bottom=bottoms_p1, color=colors[idx],
                edgecolor='black', hatch='////'
            )
            ax.bar(
                [i + width/2 for i in x], values_p2, width,
                bottom=bottoms_p2, color=colors[idx],
                edgecolor='black'
            )

            bottoms_p1 = [b + v for b, v in zip(bottoms_p1, values_p1)]
            bottoms_p2 = [b + v for b, v in zip(bottoms_p2, values_p2)]

        # Add year labels
        ax.set_xticks(x)
        ax.set_xticklabels([str(int(y)) for y in years], rotation=45)
        ax.set_xlabel("Project Start Year")
        ax.set_ylabel("Total Units Financed")
        ax.set_title("Units Financed by Year: Multifamily Finance and Incentives Programs\nColored by Planned Tax Benefit")

        # Legend 1: tax benefit colors. facecolor= is used rather than color=,
        # because color= also overrides the edge color.
        handles_tax = [
            mpatches.Patch(facecolor=colors[i], edgecolor='black', label=f"{tax_benefits[i]}")
            for i in range(len(tax_benefits))
        ]
        legend1 = ax.legend(handles=handles_tax, title="Planned Tax Benefit", loc='upper right')
        # A second ax.legend() call replaces the first, so re-add it as an artist
        ax.add_artist(legend1)

        # Legend 2: program group (hatched vs. plain)
        ax.legend(
            handles=[
                mpatches.Patch(facecolor='white', hatch='////', edgecolor='black', label='Multifamily Finance Program'),
                mpatches.Patch(facecolor='white', edgecolor='black', label='Multifamily Incentives Program')
            ], title="Program Group", loc='upper left'
        )

        ax.grid(True, which='major', axis='y', alpha=0.3)
        plt.tight_layout()
        plt.show()
else:
    print("Required columns ('Project Start Date', 'Total Units') not found in HPD Data.")



In [None]:
# Count and sample: Planned Tax Benefit '421a' and Project Start Date in 2025
if "Planned Tax Benefit" in hpd_df.columns and "Project Start Date" in hpd_df.columns:
    # Parse the start date the same way the other cells do, so the year test
    # does not depend on the column's string format.
    start_year = pd.to_datetime(hpd_df["Project Start Date"], errors="coerce").dt.year
    mask_421a_2025 = (hpd_df["Planned Tax Benefit"] == "421a") & (start_year == 2025)
    df_421a_2025 = hpd_df[mask_421a_2025]

    # Count unique projects (by Project ID), and total units
    total_projects = df_421a_2025["Project ID"].nunique() if "Project ID" in df_421a_2025.columns else len(df_421a_2025)
    total_units = df_421a_2025["Total Units"].sum() if "Total Units" in df_421a_2025.columns else "N/A"
    # "N/A" cannot take the ',' format spec, so only format real numbers
    total_units_text = total_units if isinstance(total_units, str) else f"{total_units:,}"

    print(f"Total projects with Planned Tax Benefit '421a' and 2025 Start Date: {total_projects:,}")
    print(f"Total units in these projects: {total_units_text}")

    # Show up to 5 sample rows with all columns visible; option_context
    # restores the previous display setting even if display() fails.
    print("\nSample 421a Planned Tax Benefit projects with Project Start Date in 2025:")
    with pd.option_context('display.max_columns', None):
        display(df_421a_2025.head(5))
else:
    print("One or both of the columns 'Planned Tax Benefit' or 'Project Start Date' not found in HPD DataFrame.")



## 🏗️ Step 3A: Query DOB Filings

Search for DOB New Building filings.

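**Depends on:** Step 1 (the cell below reads `hpd_csv` directly; the Step 2 financing step is skipped)

**Options:**
- Set `skip_dob = True` to use existing DOB data
- Set `use_bbl_fallback = False` to disable BBL fallback

Note: the cell below does not currently read `skip_dob` or `use_bbl_fallback`.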

In [None]:
# Step 3A Configuration

# We now explicitly call all 4 DOB query types for transparency

from pathlib import Path
import tempfile
import os

dob_output_path = None  # Optional explicit DOB output path
building_csv = hpd_csv  # Use HPD data directly (skipping financing step)

# Re-read the HPD CSV from disk; this replaces the in-memory hpd_df from Step 1
print(f"Reading HPD data from {building_csv}")
hpd_df = pd.read_csv(building_csv)
original_count = len(hpd_df)
print(f"Loaded {original_count:,} total buildings")

# Apply filters: New Construction + Multifamily Finance Program
filtered_df = hpd_df[
    (hpd_df["Reporting Construction Type"] == "New Construction") &
    (hpd_df["Program Group"] == "Multifamily Finance Program")
]
filtered_count = len(filtered_df)
# Guard against an empty input file (would otherwise raise ZeroDivisionError)
filtered_share = filtered_count / original_count * 100 if original_count else 0.0

print(f"🏗️ Filtered to Multifamily Finance Program:")
print(f"  Original: {original_count:,} buildings")
print(f"  Filtered: {filtered_count:,} buildings ({filtered_share:.1f}%)")

# Save filtered data for DOB processing under a stable name. A mkstemp() file
# was used before, which left a new temp file behind on every run and gave the
# DOB output files below a different random stem each time.
processed_dir = Path("data/processed")
processed_dir.mkdir(parents=True, exist_ok=True)
building_csv = processed_dir / f"{Path(hpd_csv).stem}_multifamily_finance.csv"
filtered_df.to_csv(building_csv, index=False)
print(f"📁 Using filtered dataset: {filtered_count:,} Multifamily Finance Program buildings")

# Base path (without suffix) from which each DOB result file name is derived
dob_output_base = Path(dob_output_path) if dob_output_path else Path(
    f"data/processed/{building_csv.stem}_dob"
)
dob_output_base.parent.mkdir(parents=True, exist_ok=True)

# Collect (borough, block, lot) tuples for every HPD row that has a BBL.
# `bins` starts empty here and is filled further down.
bins = []
from query_dob_filings import decompose_bbl

bbls = []
for idx, row in filtered_df.iterrows():
    # Skip rows without a BBL value
    if pd.isna(row.get("BBL")):
        continue
    bbl_result = decompose_bbl(str(row["BBL"]))
    # Skip BBLs that decompose_bbl could not split into at least three parts
    if not bbl_result or len(bbl_result) < 3:
        continue
    borough, block, lot = bbl_result
    bbls.append((borough, block, lot))


# Filter out bad/placeholder BINs (e.g., 1000000, 2000000, 3000000, 4000000, 5000000)
# These are placeholder values that don't exist in DOB
def is_bad_bin(bin_str):
    """Return True when a BIN is missing or a borough placeholder value.

    Placeholders are the values ``1000000`` through ``5000000`` (one per
    borough). The value may be a string, int or float; surrounding whitespace
    and a trailing ``.0`` (float formatting) are ignored.
    """
    # Test for missing values first: `not pd.NA` raises TypeError, so the
    # truthiness check must come after pd.isna().
    if bin_str is None or pd.isna(bin_str):
        return True
    if not bin_str:
        return True
    bin_str_clean = str(bin_str).strip()
    if not bin_str_clean or bin_str_clean.lower() == 'nan':
        return True
    # Floats and float-formatted strings arrive as e.g. "1000000.0"
    if bin_str_clean.endswith('.0'):
        bin_str_clean = bin_str_clean[:-2]
    # Pattern: [1-5]000000 (borough placeholder BINs)
    return bin_str_clean in {'1000000', '2000000', '3000000', '4000000', '5000000'}

# Collect the distinct, usable BINs to query. Missing and placeholder BINs are
# left out here; those projects are picked up by the BBL fallback instead.
# (The former `elif 'BIN' in ...` branch repeated the `if` condition and could
# never run, so it has been removed.)
bins = []
if 'BIN' in filtered_df.columns:
    _seen_bins = set()
    for _raw_bin in filtered_df['BIN'].dropna():
        _bin_text = _normalize_bin(_raw_bin)
        if _bin_text is None or is_bad_bin(_bin_text) or _bin_text in _seen_bins:
            continue
        _seen_bins.add(_bin_text)
        bins.append(_bin_text)

# `bbls` keeps the tuples already built with decompose_bbl(). The positional
# slicing that used to overwrite it here turned a missing BBL into the fake
# value '0000000nan' and dropped float-formatted BBLs such as '1000010001.0'.

def _banner(title, blank_line_before=False):
    """Print a section title between two 70-character rules."""
    rule = "=" * 70
    print("\n" + rule if blank_line_before else rule)
    print(title)
    print(rule)


def _matched_bins(result_df, bin_column):
    """Return the distinct non-null values of `bin_column` as strings (empty set if unavailable)."""
    if result_df.empty or bin_column not in result_df.columns:
        return set()
    return set(result_df[bin_column].dropna().astype(str).unique())


_banner("STEP 3A: QUERY DOB FILINGS - Explicitly by API and key", blank_line_before=True)

# BISWEB lookup by BIN for every building
_banner("STEP 3A: BISWEB BIN QUERY (ALL BUILDINGS)")
dob_bisweb_bin_path = dob_output_base.with_name(dob_output_base.stem + "_bisweb_bin.csv")
print(f"▶️ Querying BISWEB BIN for {len(bins)} buildings...")
dob_bisweb_bin_df = query_dob_bisweb_bin(bins)
bisweb_bin_matches = _matched_bins(dob_bisweb_bin_df, "bin__")
bisweb_bin_unmatched = [candidate for candidate in bins if candidate not in bisweb_bin_matches]
print(f"BISWEB BIN: {len(bisweb_bin_matches)} matches, {len(bisweb_bin_unmatched)} need BBL fallback")
dob_bisweb_bin_df.to_csv(dob_bisweb_bin_path, index=False)


# DOB NOW lookup by BIN for every building
_banner("STEP 3B: DOB NOW BIN QUERY (ALL BUILDINGS)", blank_line_before=True)
dob_now_bin_path = dob_output_base.with_name(dob_output_base.stem + "_now_bin.csv")
print(f"▶️ Querying DOB NOW BIN for {len(bins)} buildings...")
dob_now_bin_df = query_dobnow_bin(bins)
dobnow_bin_matches = _matched_bins(dob_now_bin_df, "bin")
print(f"DOB NOW BIN: {len(dobnow_bin_matches)} matches")
dob_now_bin_df.to_csv(dob_now_bin_path, index=False)

# A BIN needs the BBL fallback only if neither BIN search returned it
all_bin_matches = bisweb_bin_matches | dobnow_bin_matches
all_bin_unmatched = [candidate for candidate in bins if candidate not in all_bin_matches]
print(f"Combined BIN search: {len(all_bin_matches)} total matches, {len(all_bin_unmatched)} need BBL fallback")

# BBL fallback chain for buildings that no BIN search matched:
#   1. BISWEB by BBL   2. DOB NOW by BBL
#   3. condo billing BBLs   4. address search
BOROUGH_CODES = {'MANHATTAN': '1', 'BRONX': '2', 'BROOKLYN': '3', 'QUEENS': '4', 'STATEN ISLAND': '5'}


def _bbl_keys_from_results(results_df):
    """Return the (BOROUGH, padded block, padded lot) keys present in a DOB result frame."""
    keys = set()
    if results_df.empty:
        return keys
    for _, result_row in results_df.iterrows():
        if pd.notna(result_row.get('borough')) and pd.notna(result_row.get('block')) and pd.notna(result_row.get('lot')):
            keys.add((str(result_row['borough']).upper(), pad_block(result_row['block']), pad_lot(result_row['lot'])))
    return keys


def _base_bbl_string(borough, block, lot):
    """Build the 10-digit BBL (1 borough + 5 block + 4 lot digits); None if not derivable."""
    borough_code = BOROUGH_CODES.get(str(borough).upper())
    if not borough_code:
        return None
    try:
        block_clean = str(int(float(block)))
        lot_clean = str(int(float(lot)))
    except (TypeError, ValueError, OverflowError):
        return None
    return borough_code + block_clean.zfill(5) + lot_clean.zfill(4)


def _bbl_tuples_for_rows(rows):
    """Decompose the BBL of each row; rows with a missing or unusable BBL are skipped."""
    found = []
    for raw_bbl in rows.get('BBL', []):
        if pd.isna(raw_bbl):
            continue
        bbl_result = decompose_bbl(str(raw_bbl))
        if bbl_result and len(bbl_result) >= 3:
            found.append(bbl_result)
    return found


def _clean_text(value):
    """Return a stripped string, or '' for missing values (avoids the literal 'nan')."""
    return '' if pd.isna(value) else str(value).strip()


# Always define the BBL result frames so later cells can use them even when
# no fallback is needed.
dob_bisweb_bbl_df = pd.DataFrame()
dob_now_bbl_df = pd.DataFrame()

# Normalize HPD BINs once. Series.str.replace('.0', '') is avoided because
# older pandas versions treat the pattern as a regex ("any character then 0").
if 'BIN' in filtered_df.columns:
    hpd_bin_text = filtered_df['BIN'].map(_normalize_bin)
else:
    hpd_bin_text = pd.Series(None, index=filtered_df.index, dtype=object)

# Projects with bad/placeholder BINs skip the BIN queries and go straight to BBL
bad_bin_projects = filtered_df[hpd_bin_text.map(is_bad_bin).astype(bool)]

# Run the fallback when any BIN was unmatched OR any project has a bad BIN;
# the latter were previously skipped whenever every good BIN had matched.
if all_bin_unmatched or not bad_bin_projects.empty:
    print("\n" + "=" * 70)
    print("STEP 3C: BISWEB BBL FALLBACK")
    print("=" * 70)
    dob_bisweb_bbl_path = dob_output_base.with_name(dob_output_base.stem + "_bisweb_bbl.csv")
    bbl_tuples = []
    if not bad_bin_projects.empty:
        print(f"Found {len(bad_bin_projects)} projects with bad/placeholder BINs - adding to BBL queries")
        bbl_tuples.extend(_bbl_tuples_for_rows(bad_bin_projects))

    print(f"▶️ Querying BISWEB BBL for {len(all_bin_unmatched)} buildings...")
    # Every HPD row whose BIN went unmatched contributes its BBL
    bbl_tuples.extend(_bbl_tuples_for_rows(filtered_df[hpd_bin_text.isin(set(all_bin_unmatched))]))
    bbl_tuples = list(dict.fromkeys(bbl_tuples))  # de-duplicate, keep order
    print(f"Deduplicated to {len(bbl_tuples)} unique BBLs")
    dob_bisweb_bbl_df = query_dob_bisweb_bbl(bbl_tuples)
    dob_bisweb_bbl_df.to_csv(dob_bisweb_bbl_path, index=False)
    bisweb_rows_saved = len(dob_bisweb_bbl_df)

    # DOB NOW BBL fallback
    print("\n" + "=" * 70)
    print("STEP 3D: DOB NOW BBL FALLBACK")
    print("=" * 70)
    dob_now_bbl_path = dob_output_base.with_name(dob_output_base.stem + "_now_bbl.csv")
    dob_now_bbl_df = query_dobnow_bbl(bbl_tuples)
    dob_now_bbl_df.to_csv(dob_now_bbl_path, index=False)

    # BBLs that produced records in BISWEB or DOB NOW
    matched_bbl_tuples = _bbl_keys_from_results(dob_bisweb_bbl_df) | _bbl_keys_from_results(dob_now_bbl_df)

    # Condo billing BBLs: permits may sit on the billing lot instead of the base lot
    unmatched_bbl_tuples = [bbl for bbl in bbl_tuples if bbl not in matched_bbl_tuples]

    if unmatched_bbl_tuples:
        print(f"\nTrying condo billing BBLs for {len(unmatched_bbl_tuples)} BBLs that didn't match in BISWEB or DOB NOW...")
        condo_results = []
        condo_matched_base_bbls = set()
        for bbl_tuple in unmatched_bbl_tuples:
            borough, block, lot = bbl_tuple
            base_bbl = _base_bbl_string(borough, block, lot)
            if base_bbl is None:
                continue
            condo_df = query_condo_lots_for_bbl(borough, block, lot, base_bbl=base_bbl)
            if not condo_df.empty:
                # Appended once; a second append used to duplicate every condo record
                condo_results.append(condo_df)
                condo_matched_base_bbls.add(bbl_tuple)

        if condo_results:
            condo_df_combined = pd.concat(condo_results, ignore_index=True)
            print(f"Found {len(condo_df_combined)} records on condo billing BBLs")
            # Add to BISWEB results
            if not dob_bisweb_bbl_df.empty:
                dob_bisweb_bbl_df = pd.concat([dob_bisweb_bbl_df, condo_df_combined], ignore_index=True)
            else:
                dob_bisweb_bbl_df = condo_df_combined

            # Billing BBLs found, plus the base BBLs whose condo query returned records
            condo_keys = _bbl_keys_from_results(condo_df_combined)
            matched_bbl_tuples |= condo_keys
            matched_bbl_tuples |= condo_matched_base_bbls

            # Also mark a base BBL as matched when its billing BBL is among the results
            from query_dob_filings import get_condo_billing_bbl
            for bbl_tuple in unmatched_bbl_tuples:
                if bbl_tuple in matched_bbl_tuples:
                    continue
                borough, block, lot = bbl_tuple
                base_bbl = _base_bbl_string(borough, block, lot)
                if base_bbl is None:
                    continue
                billing_tuple = get_condo_billing_bbl(base_bbl)
                if billing_tuple:
                    billing_borough, billing_block, billing_lot = billing_tuple
                    if (billing_borough.upper(), billing_block, billing_lot) in condo_keys:
                        matched_bbl_tuples.add(bbl_tuple)

    # Address search as final fallback for still-unmatched BBLs
    still_unmatched_after_condo = [bbl for bbl in unmatched_bbl_tuples if bbl not in matched_bbl_tuples]

    if still_unmatched_after_condo:
        print(f"\nTrying address search for {len(still_unmatched_after_condo)} BBLs that still didn't match...")

        # Build address list from HPD data for unmatched BBLs
        still_unmatched_keys = set(still_unmatched_after_condo)
        address_list = []
        for _, row in filtered_df.iterrows():
            raw_bbl = row.get('BBL')
            if pd.isna(raw_bbl):
                continue
            bbl_tuple = decompose_bbl(str(raw_bbl))
            if not bbl_tuple or len(bbl_tuple) < 3:
                continue
            bbl_key = (str(bbl_tuple[0]).upper(), pad_block(bbl_tuple[1]), pad_lot(bbl_tuple[2]))
            if bbl_key not in still_unmatched_keys:
                continue
            number = _clean_text(row.get('Number', ''))
            street = _clean_text(row.get('Street', ''))
            borough = str(bbl_tuple[0]).upper()
            if number and street and borough:
                address_list.append((borough, number, street))

        if address_list:
            # Deduplicate addresses
            address_list = list(dict.fromkeys(address_list))
            print(f"Searching {len(address_list)} unique addresses...")

            address_df = query_dob_by_address(address_list)

            if not address_df.empty:
                print(f"Found {len(address_df)} records by address search")
                # Add to BISWEB results
                if not dob_bisweb_bbl_df.empty:
                    dob_bisweb_bbl_df = pd.concat([dob_bisweb_bbl_df, address_df], ignore_index=True)
                else:
                    dob_bisweb_bbl_df = address_df

                # Update matched_bbl_tuples with BBLs found by address
                matched_bbl_tuples |= _bbl_keys_from_results(address_df)
        else:
            print("No addresses available for address search")
    else:
        print("All BBLs matched - no address fallback needed")

    # Re-save when condo/address records were added, so the CSV matches the
    # in-memory frame.
    if len(dob_bisweb_bbl_df) != bisweb_rows_saved:
        dob_bisweb_bbl_df.to_csv(dob_bisweb_bbl_path, index=False)

print(f"DOB NOW BBL: {len(dob_now_bbl_df)} records")

# Preview each dataset: one header and one table per non-empty result frame.
# Frames are looked up by name because the BBL frames may not exist if the
# BBL fallback did not run.
_dob_previews = [
    ("\n📊 BISWEB BIN sample:", "dob_bisweb_bin_df"),
    ("\n📊 BISWEB BBL sample:", "dob_bisweb_bbl_df"),
    ("\n📊 DOB NOW BIN sample:", "dob_now_bin_df"),
    ("\n📊 DOB NOW BBL sample:", "dob_now_bbl_df"),
]
for _title, _frame_name in _dob_previews:
    _frame = globals().get(_frame_name)
    if _frame is not None and len(_frame) > 0:
        print(_title)
        display(_frame.head())

# If needed, merge or explore further:
# dob_df = pd.concat([dob_bisweb_bin_df, dob_bisweb_bbl_df, dob_now_bin_df, dob_now_bbl_df],
#                    ignore_index=True, sort=False)
# print(f"\nCombined DOB Data: {dob_df.shape[0]} records")


In [None]:
# For Multifamily Finance Program (MFP) new construction projects, find those with no DOB match in any table.

# Filter only MFP, new construction projects. The exact-match test mirrors the
# filter used to build the DOB query set, so both cover the same rows; a
# substring match on 'new' also propagates NaN into the mask.
mfp_new_construction = hpd_df[
    (hpd_df['Program Group'] == 'Multifamily Finance Program') &
    (hpd_df['Reporting Construction Type'] == 'New Construction')
]

# Set of unique Project IDs for matching
mfp_project_ids = set(mfp_new_construction['Project ID'].unique())

# Combine all DOB dataframes and normalize their BIN / BBL columns


def _normalize_bbl_value(value):
    """Return a BBL as a 10-character zero-padded digit string, or None if missing/non-numeric."""
    if pd.isna(value):
        return None
    try:
        return str(int(float(value))).zfill(10)
    except (TypeError, ValueError, OverflowError):
        return None


def _prepare_dob_frame(frame, bin_columns):
    """Return a copy of a DOB result frame with `bin_normalized` and a string `bbl`.

    `bin_columns` lists candidate BIN columns in priority order; the first
    non-missing value per row is used. Missing BINs stay missing rather than
    becoming the string 'nan', which would later act as a join key.
    """
    if frame is None:
        return pd.DataFrame()
    if frame.empty:
        return frame
    frame = frame.copy()
    present = [col for col in bin_columns if col in frame.columns]
    if present:
        normalized = frame[present[0]].map(_normalize_bin)
        for extra_col in present[1:]:
            normalized = normalized.fillna(frame[extra_col].map(_normalize_bin))
        frame['bin_normalized'] = normalized
    # Ensure BBL is a string, not a float
    if 'bbl' in frame.columns:
        frame['bbl'] = frame['bbl'].map(_normalize_bbl_value)
    return frame


# globals().get() tolerates frames that were never created (e.g. the BBL
# fallback did not run); those are treated as empty.
dob_bisweb_bin_df = _prepare_dob_frame(globals().get('dob_bisweb_bin_df'), ('bin__',))
dob_bisweb_bbl_df = _prepare_dob_frame(globals().get('dob_bisweb_bbl_df'), ('bin__', 'bin'))
dob_now_bin_df = _prepare_dob_frame(globals().get('dob_now_bin_df'), ('bin',))
dob_now_bbl_df = _prepare_dob_frame(globals().get('dob_now_bbl_df'), ('bin',))

all_dob_dfs = [
    frame
    for frame in (dob_bisweb_bin_df, dob_bisweb_bbl_df, dob_now_bin_df, dob_now_bbl_df)
    if not frame.empty
]

# Combine all DOB records
if all_dob_dfs:
    combined_dob = pd.concat(all_dob_dfs, ignore_index=True)
    print(f'Total DOB records: {len(combined_dob)}')
else:
    combined_dob = pd.DataFrame()
    print('No DOB records found')

# Prepare HPD data for matching - normalize BIN and ensure BBL is string


def _normalize_bbl_value(value):
    """Return a BBL as a 10-character zero-padded digit string, or None if missing/non-numeric."""
    if pd.isna(value):
        return None
    try:
        return str(int(float(value))).zfill(10)
    except (TypeError, ValueError, OverflowError):
        return None


hpd_for_matching = mfp_new_construction.copy()
# Missing BINs stay missing; astype(str) would turn them into the key 'nan'
hpd_for_matching['bin_normalized'] = hpd_for_matching['BIN'].map(_normalize_bin)
hpd_for_matching['bbl_normalized'] = hpd_for_matching['BBL'].map(_normalize_bbl_value)

# Join on BIN first. Null keys are dropped on both sides because pandas merge
# matches null keys to each other, which would report projects without a BIN
# as matched.
if not combined_dob.empty and 'bin_normalized' in combined_dob.columns:
    dob_bin_matches = pd.merge(
        hpd_for_matching.dropna(subset=['bin_normalized']),
        combined_dob[['bin_normalized']].dropna().drop_duplicates(),
        on='bin_normalized',
        how='inner'
    )
    matched_project_ids_bin = set(dob_bin_matches['Project ID'].unique())
    print(f'Projects matched on BIN: {len(matched_project_ids_bin)}')
else:
    matched_project_ids_bin = set()

# Join on BBL for those that didn't match on BIN
unmatched_on_bin = hpd_for_matching[~hpd_for_matching['Project ID'].isin(matched_project_ids_bin)]

# Initialize BBL matching result
matched_project_ids_bbl = set()

# Reconstruct BBL from borough, block, lot for DOB sources that don't carry
# a 'bbl' column of their own (like BISWEB).
def reconstruct_bbl(row):
    """Build a 10-character BBL string from a row's borough, block and lot.

    Args:
        row: Mapping-like record (a DataFrame row or a dict) that may carry
            'borough' (borough name), 'block' and 'lot' entries.

    Returns:
        borough code (1 char) + block (5, zero-padded) + lot (4, zero-padded),
        or None when any part is missing, the borough name is not recognised,
        or block/lot is not a number that fits its field width.
    """
    borough = row.get('borough')
    block = row.get('block')
    lot = row.get('lot')
    if pd.isna(borough) or pd.isna(block) or pd.isna(lot):
        return None

    borough_map = {'MANHATTAN': '1', 'BRONX': '2', 'BROOKLYN': '3', 'QUEENS': '4', 'STATEN ISLAND': '5'}
    # Tolerate stray whitespace and any letter case in the borough name
    borough_code = borough_map.get(str(borough).strip().upper())
    if not borough_code:
        return None

    # int(float(...)) accepts 12, 12.0, '12', '12.0' and '00012' alike. A bad
    # value returns None instead of raising, so one malformed record cannot
    # abort a whole DataFrame.apply over the DOB table.
    try:
        block_num = int(float(block))
        lot_num = int(float(lot))
    except (TypeError, ValueError, OverflowError):
        return None
    # Values that overflow the 5-digit block / 4-digit lot fields (or are
    # negative) cannot form a valid 10-character BBL.
    if not (0 <= block_num <= 99999 and 0 <= lot_num <= 9999):
        return None

    # Reconstruct: borough(1) + block(5) + lot(4) = 10 digits
    return f"{borough_code}{block_num:05d}{lot_num:04d}"

# Always reconstruct BBL from borough/block/lot (BISWEB data doesn't have a bbl column)
combined_dob['bbl_reconstructed'] = combined_dob.apply(reconstruct_bbl, axis=1)

# Normalize BBL in DOB data: prefer the source's own bbl column (padded to
# 10 digits) and fall back to the reconstructed value where it is missing.
if 'bbl' in combined_dob.columns:
    combined_dob['bbl_normalized'] = (
        combined_dob['bbl']
        .apply(lambda x: str(int(float(x))).zfill(10) if pd.notna(x) else None)
        .fillna(combined_dob['bbl_reconstructed'])
    )
else:
    # No bbl column at all: the reconstructed value is the only candidate
    combined_dob['bbl_normalized'] = combined_dob['bbl_reconstructed']

# Now match on BBL.
# pandas merges null keys with each other, so null BBLs are dropped on both
# sides; otherwise every HPD row lacking a BBL would "match" any DOB row
# lacking one and be counted as a matched project.
dob_bbl_keys = combined_dob[['bbl_normalized']].dropna().drop_duplicates()
if not dob_bbl_keys.empty:
    dob_bbl_matches = pd.merge(
        unmatched_on_bin.dropna(subset=['bbl_normalized']),
        dob_bbl_keys,
        on='bbl_normalized',
        how='inner',
    )
    matched_project_ids_bbl = set(dob_bbl_matches['Project ID'].unique())
    print(f'Projects matched on BBL (fallback): {len(matched_project_ids_bbl)}')
else:
    matched_project_ids_bbl = set()
    print('No BBL data available for matching')

# A project counts as matched if it was found via BIN or via the BBL fallback
dob_matched_project_ids = matched_project_ids_bin | matched_project_ids_bbl

# Everything else has no DOB record in any source
mfp_projects_without_dob = mfp_project_ids - dob_matched_project_ids

# Headline counts
for summary_line in (
    f'\nTotal Multifamily Finance Program new construction projects: {len(mfp_project_ids)}',
    f'Projects with DOB matches: {len(dob_matched_project_ids)}',
    f'Number of these with NO DOB row in any table: {len(mfp_projects_without_dob)}',
):
    print(summary_line)

# Debug: up to three example project IDs per category, skipping empty categories
for sample_label, sample_ids in (
    ('\nSample matched on BIN', matched_project_ids_bin),
    ('Sample matched on BBL', matched_project_ids_bbl),
    ('Sample unmatched', mfp_projects_without_dob),
):
    if len(sample_ids) > 0:
        print(f'{sample_label}: {list(sample_ids)[:3]}')

# DEBUG: Analyze a sample project to understand matching.
# Picks one unmatched project, lists its BINs/BBLs, and reports which of them
# (if any) appear in the combined DOB table.
if len(mfp_projects_without_dob) > 0:
    sample_project_id = next(iter(mfp_projects_without_dob))
    sample_project = mfp_new_construction[mfp_new_construction['Project ID'] == sample_project_id]
    print('\n=== DEBUG: Sample unmatched project ===')
    print(f'Project ID: {sample_project_id}')
    print(f'Number of buildings in project: {len(sample_project)}')
    # Normalize with the shared _normalize_bin helper (Setup cell) rather than
    # str.replace('.0', ''), which is a regex on older pandas and can
    # strip digits out of valid BINs.
    sample_bins = sample_project['BIN'].dropna().map(_normalize_bin).tolist()
    sample_bbls = sample_project['BBL'].dropna().apply(lambda x: str(int(float(x))).zfill(10)).tolist()
    print(f'BINs in project: {sample_bins[:5]}')
    print(f'BBLs in project: {sample_bbls[:5]}')

    # Check if these BINs/BBLs exist in DOB data
    if not combined_dob.empty:
        if 'bin_normalized' in combined_dob.columns:
            dob_bins = set(combined_dob['bin_normalized'].dropna().astype(str).unique())
            matching_bins = [b for b in sample_bins if b in dob_bins]
            print(f'BINs found in DOB data: {matching_bins[:5] if matching_bins else "None"}')
        if 'bbl_normalized' in combined_dob.columns:
            # Same column the BBL matching step joins on, so BBLs rebuilt from
            # borough/block/lot are included in the check.
            dob_bbls = set(combined_dob['bbl_normalized'].dropna().astype(str).unique())
            matching_bbls = [b for b in sample_bbls if b in dob_bbls]
            print(f'BBLs found in DOB data: {matching_bbls[:5] if matching_bbls else "None"}')
        elif 'bbl' in combined_dob.columns:
            dob_bbls = set(combined_dob['bbl'].dropna().apply(lambda x: str(int(float(x))).zfill(10)).unique())
            matching_bbls = [b for b in sample_bbls if b in dob_bbls]
            print(f'BBLs found in DOB data: {matching_bbls[:5] if matching_bbls else "None"}')
# Show the head of the table of unmatched projects (project-level)
if len(mfp_projects_without_dob) > 0:
    print("\nHead of unmatched Multifamily Finance Program new construction projects:")
    unmatched_projects_df = mfp_new_construction[mfp_new_construction['Project ID'].isin(mfp_projects_without_dob)].copy()
    # Ensure BBL is displayed as a string, not float
    if 'BBL' in unmatched_projects_df.columns:
        unmatched_projects_df['BBL'] = unmatched_projects_df['BBL'].apply(lambda x: str(int(float(x))).zfill(10) if pd.notna(x) else None)
    # Unit-count columns that add width but little value when eyeballing matches
    hidden_cols = {
        "Lot Area", "Available Units", "Privately Financed Units", "Extremely Low Income Units",
        "Very Low Income Units", "Low Income Units", "Moderate Income Units", "Middle Income Units",
        "Studio Units", "One Bedroom Units", "Two Bedroom Units", "Three Bedroom Units",
        "Four Bedroom Units", "Five Bedroom Units", "Six Bedroom Units", "Unknown Bedroom Units",
    }
    # Limit extra-wide tables in notebook: the 15-column cap applies to the
    # columns that are shown, not to the exclusion list (capping the list
    # silently re-included "Unknown Bedroom Units").
    display_cols = [c for c in unmatched_projects_df.columns if c not in hidden_cols][:15]
    display(unmatched_projects_df[display_cols].head(10))
else:
    print("\nAll Multifamily Finance Program projects matched to DOB data!")


In [None]:
def bbl_to_boro_block_lot_and_name(bbl):
    """
    Split a NYC BBL (Borough-Block-Lot) value into its components.

    The value is stringified and left-padded with zeros to 10 characters,
    then sliced as borough (1 char), block (5 chars) and lot (4 chars).

    Returns a tuple (borough_number, borough_name, block, lot) where
    borough_number is an int, borough_name is "Manhattan", "Bronx",
    "Brooklyn", "Queens" or "Staten Island" for 1-5 ("Unknown" otherwise),
    and block / lot are the zero-padded string slices.

    Example:
        bbl_to_boro_block_lot_and_name('3015560003')
        -> (3, 'Brooklyn', '01556', '0003')
    """
    padded = str(bbl).zfill(10)
    boro_digit, block, lot = padded[0], padded[1:6], padded[6:10]
    borough_num = int(boro_digit)
    # Borough codes are 1-based positions in this tuple
    names_by_code = dict(
        enumerate(("Manhattan", "Bronx", "Brooklyn", "Queens", "Staten Island"), start=1)
    )
    return (borough_num, names_by_code.get(borough_num, "Unknown"), block, lot)

# Example usage: split a sample Brooklyn BBL and print its parts.
# NOTE: this binds the notebook-level names boro_num, boro_name, block and lot.
bbl_example = '3015560003'
boro_num, boro_name, block, lot = bbl_to_boro_block_lot_and_name(bbl_example)
print(f"BBL {bbl_example} -> Borough {boro_num} ({boro_name}), Block {block}, Lot {lot}")


## 🏛️ Step 3B: Query Certificate of Occupancy

Search for Certificate of Occupancy filings.

**Depends on:** Step 2

**Options:**
- Set `skip_co = True` to use existing CO data

In [None]:
# Step 3B Configuration
skip_co = False  # Set to True to use existing CO data
co_output_path = None  # Custom CO output path

print("\n" + "=" * 70)
print("STEP 3B: QUERY CERTIFICATE OF OCCUPANCY")
print("=" * 70)

# Generate BIN file for CO searches
bin_output = Path("data/processed/workflow_bins.txt")
bin_file = _write_bin_file(building_csv, bin_output)

print(f"\n📋 BIN file created: {bin_file}")
print(f"Contains {len(bin_file.read_text().split())} BINs")

# Default output sits next to the other processed files, named after the BIN file
co_output = Path(co_output_path) if co_output_path else Path(
    f"data/processed/{bin_file.stem}_co_filings.csv"
)
co_output.parent.mkdir(parents=True, exist_ok=True)

# Each branch below leaves co_output pointing at a CSV that exists, or None.
if skip_co:
    print("⏭️ Using existing CO data")
    # Look for existing CO files: the configured/default path first, then data/external
    alt_co_path = Path(f"data/external/{bin_file.stem}_co_filings.csv")
    if co_output.exists():
        print(f"📁 Using existing CO data at {co_output}")
    elif alt_co_path.exists():
        print(f"📁 Using existing CO data from external folder: {alt_co_path}")
        co_output = alt_co_path
    else:
        print("⚠️ No existing CO data found")
        co_output = None
else:
    print(f"🏛️ Querying CO APIs using {bin_file} -> {co_output}")
    query_co_filings(str(bin_file), output_path=str(co_output))
    # NOTE(review): query_co_filings is defined outside this notebook; if it
    # writes no file (presumably when nothing is found - confirm), reading it
    # unconditionally would raise and leave co_output pointing at a missing
    # file for later steps.
    if not co_output.exists():
        print("⚠️ CO query produced no output file")
        co_output = None

co_df = pd.read_csv(co_output) if co_output is not None else None

# Display CO data if available
if co_df is not None:
    print(f"📊 Certificate of Occupancy Data: {co_df.shape[0]} records")
    print("Columns:")
    for col in co_df.columns:
        print(f"  - {col}")

    print("\n📊 Sample CO Data:")
    display(co_df.head())

    # Show some statistics
    if "issue_date" in co_df.columns:
        print("\n📈 CO Issue Date Statistics:")
        display(co_df["issue_date"].describe())
else:
    print("⚠️ No CO data available")

## 📊 Step 4: Generate Timelines and Charts

Create timeline visualizations from enriched data.

**Depends on:** Steps 2, 3A (also uses the `co_output` path from Step 3B for the timeline join)

**Options:**
- Set `skip_join = True` to skip timeline creation
- Set `skip_charts = True` to skip chart generation

In [None]:
# Step 4 Configuration
skip_join = False   # Set to True to skip timeline creation
skip_charts = False # Set to True to skip chart generation

print("\n" + "=" * 70)
print("STEP 4: GENERATE TIMELINES AND CHARTS")
print("=" * 70)

# The notebook lets individual steps be skipped, so the DOB (Step 3A) and CO
# (Step 3B) output paths may never have been defined. Look them up defensively
# instead of failing with a NameError; a missing path is treated like None.
dob_csv = globals().get("dob_output")
co_csv = globals().get("co_output")

if skip_join:
    print("⏭️ Skipping timeline join step.")
else:
    if dob_csv is None or not dob_csv.exists():
        print("⚠️ No DOB data available; skipping timeline creation.")
    else:
        print("🔗 Building timelines...")
        create_separate_timelines(
            str(building_csv),
            str(dob_csv),
            str(co_csv) if co_csv else None,
        )

        # Load and display timeline data; these files are expected next to the
        # building CSV with a fixed suffix.
        hpd_timeline = Path(str(building_csv).replace(".csv", "_hpd_financed_timeline.csv"))
        private_timeline = Path(str(building_csv).replace(".csv", "_privately_financed_timeline.csv"))

        if hpd_timeline.exists():
            hpd_timeline_df = pd.read_csv(hpd_timeline)
            print(f"\n📊 HPD Financed Timeline Data ({hpd_timeline_df.shape[0]} records):")
            display(hpd_timeline_df.head())

            # Show event type distribution
            if "event_type" in hpd_timeline_df.columns:
                print("\n📈 Event Types in HPD Timeline:")
                display(hpd_timeline_df["event_type"].value_counts())

        if private_timeline.exists():
            private_timeline_df = pd.read_csv(private_timeline)
            print(f"\n📊 Privately Financed Timeline Data ({private_timeline_df.shape[0]} records):")
            display(private_timeline_df.head())

            # Show event type distribution
            if "event_type" in private_timeline_df.columns:
                print("\n📈 Event Types in Private Timeline:")
                display(private_timeline_df["event_type"].value_counts())

if skip_charts:
    print("⏭️ Skipping chart generation.")
else:
    # Charts
    print("\n📈 Generating charts...")
    default_timeline_stem = "Affordable_Housing_Production_by_Building_with_financing"
    if Path(building_csv).name == f"{default_timeline_stem}.csv":
        # The default building file uses the financing chart helper, called without arguments
        create_financing_charts()
        print("✅ Created financing-specific charts")
    else:
        hpd_timeline = Path(str(building_csv).replace(".csv", "_hpd_financed_timeline.csv"))
        private_timeline = Path(str(building_csv).replace(".csv", "_privately_financed_timeline.csv"))

        if hpd_timeline.exists():
            create_timeline_chart(str(hpd_timeline))
            print("✅ Created HPD financed timeline chart")
        else:
            print("⚠️ No HPD financed timeline found; skipping.")

        if private_timeline.exists():
            create_timeline_chart(str(private_timeline))
            print("✅ Created privately financed timeline chart")
        else:
            print("⚠️ No privately financed timeline found; skipping.")

print("\n✅ Step 4 complete")

## 📋 Final Summary

Generate data quality report and workflow summary.

**Optional:** Run this at the end to see final statistics.

In [None]:
report_rule = "=" * 70
print("\n" + report_rule)
print("📊 FINAL DATA QUALITY REPORT")
print(report_rule)

# Close out the quality tracker, write its report and Sankey diagram, then
# echo the report into the notebook output.
quality_tracker.end_processing()
report_filename = quality_tracker.save_report_to_file("notebook_workflow")
sankey_filename = quality_tracker.generate_sankey_diagram()
quality_tracker.print_report()

print("\n🎉 WORKFLOW COMPLETED!")
print(f"📊 Data quality report: {report_filename}")
if sankey_filename:
    print(f"📊 Sankey diagram: {sankey_filename}")

# Summary of what we accomplished. Each step's result variable only exists if
# that step's cell was run, so a NameError is reported as "step not run".
print("\n📋 WORKFLOW SUMMARY:")

try:
    hpd_record_count = len(hpd_df)
except NameError:
    print("• HPD Records: Step 1 not run")
else:
    print(f"• HPD Records Processed: {hpd_record_count:,}")

try:
    financing_record_count = len(financing_df)
except NameError:
    print("• Records with Financing: Step 2 not run")
else:
    print(f"• Records with Financing: {financing_record_count:,}")

try:
    dob_result = dob_df
except NameError:
    print("• DOB Filings: Step 3A not run")
else:
    if dob_result is None:
        print("• DOB Filings: No data")
    else:
        print(f"• DOB Filings Found: {len(dob_result):,}")

try:
    co_result = co_df
except NameError:
    print("• CO Filings: Step 3B not run")
else:
    if co_result is None:
        print("• CO Filings: No data")
    else:
        print(f"• CO Filings Found: {len(co_result):,}")

print("\n✅ Notebook workflow complete!")
print("Each step showed dataframe views for inspection.")