# UHI Data Download and Processing Pipeline

## 1. Setup and Dependencies

In [1]:
import sys
import os
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
from pathlib import Path
from datetime import datetime
import logging
import requests
import os
from tqdm import tqdm
import subprocess
import zipfile # For more robust unzipping
import time
import pystac_client
import pystac
import planetary_computer
import stackstac

# Imports for tile index processing
import geopandas as gpd
from shapely.geometry import box

# Make the repository root importable so modules under `src` can be loaded from
# this notebook. The notebook is assumed to run from a directory one level below
# the root (e.g. 'notebooks/'), so the root is the parent of the working directory.
project_root = Path.cwd().parent
sys.path.insert(0, os.fspath(project_root))

# Configure the root logger: INFO and above, prefixed with timestamp and level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

In [3]:

# --- Configuration for Data Download ---
# Defines the target city, the imagery time windows and the input/output paths,
# then derives the area-of-interest bounds and a representative observation
# date from the UHI CSV.

# Configure parameters for the target city (e.g., NYC)
city_name = "NYC"

# Time window matching original notebooks (adjust if needed)
sentinel_time_window = "2021-06-01/2021-09-01"
lst_time_window = "2021-06-01/2021-09-01"

# Input files and general settings (relative paths are relative to project_root)
data_dir = Path("data")
abs_output_dir = project_root / data_dir
uhi_csv = data_dir / city_name / "uhi.csv"  # Path to UHI data
abs_uhi_csv = project_root / uhi_csv
# bbox_csv is no longer needed for bounds calculation

if not abs_uhi_csv.exists():
    raise FileNotFoundError(f"UHI data CSV not found at {abs_uhi_csv}. Cannot derive bounds.")
print(f"Loading bounds from UHI data: {abs_uhi_csv}")
uhi_df = pd.read_csv(abs_uhi_csv)

# Validate every column this cell reads. 'datetime' is included because the
# representative date below is taken from it; without this check a missing
# column would surface as a bare KeyError instead of a descriptive error.
required_cols = ['Longitude', 'Latitude', 'datetime']
missing_cols = [col for col in required_cols if col not in uhi_df.columns]
if missing_cols:
    raise ValueError(f"UHI CSV must contain columns: {required_cols} (missing: {missing_cols})")

# An empty table would produce NaN bounds and an IndexError on the first-row lookup.
if uhi_df.empty:
    raise ValueError(f"UHI CSV at {abs_uhi_csv} contains no rows. Cannot derive bounds.")

# Area-of-interest bounds as [min_lon, min_lat, max_lon, max_lat].
# Cast to plain Python floats so the values log and serialise cleanly
# (numpy scalars print as `np.float64(...)` under NumPy 2).
bounds = [
    float(uhi_df['Longitude'].min()),
    float(uhi_df['Latitude'].min()),
    float(uhi_df['Longitude'].max()),
    float(uhi_df['Latitude'].max()),
]

# Load the observation date from the first row ('DD-MM-YYYY HH:MM' timestamps).
first_datetime_obj = pd.to_datetime(uhi_df['datetime'].iloc[0], format='%d-%m-%Y %H:%M')
# Format the date object into 'YYYY-MM-DD' string format
uhi_date_str = first_datetime_obj.strftime('%Y-%m-%d')
print(f"Representative UHI date (from first row): {uhi_date_str}")

Loading bounds from UHI data: /Users/arnav/MLC-Project/data/NYC/uhi.csv
Representative UHI date (from first row): 2021-07-24


### 3. Download + Build DEM / DSM for NYC (AOI‑bounded, ~10 m grid from Planetary Computer 3DEP)

In [4]:
# ─────────────────────────────────────────────────────────────────────────────
# CONFIG
# ─────────────────────────────────────────────────────────────────────────────
# DEM/DSM specific settings
# Both the DEM (bare-earth DTM) and the DSM are fetched from the Planetary
# Computer 3DEP lidar collections and resampled onto the same WGS84 grid.

# --- URLs and Paths ---
CRS_WGS84 = "EPSG:4326"
PC_STAC_URL = "https://planetarycomputer.microsoft.com/api/stac/v1"

# Ensure the 'sat_files' directory exists
sat_files_dir = abs_output_dir / city_name / "sat_files"
sat_files_dir.mkdir(parents=True, exist_ok=True)

# --- Target Resolution --- #
TARGET_ELEV_RESOLUTION_M = 10 # Target resolution in meters
# Approximate 10m in degrees at NYC latitude (approx 40.7 N)
# Latitude: 1 deg ~ 111 km => 10m ~ 0.00009 deg
# Longitude: 1 deg ~ 85 km => 10m ~ 0.000118 deg
# Using the latitude approximation for stackstac resolution parameter
TARGET_ELEV_RESOLUTION_DEG = 0.00009

# --- Define final expected TIF paths --- #
# Filenames reflect the 10m resolution.
# NOTE(review): the 'nyc_' prefix is hard-coded and does not follow `city_name`.
final_dem_path_tif = sat_files_dir / f"nyc_dem_{TARGET_ELEV_RESOLUTION_M}m_pc.tif"
final_dsm_path_tif = sat_files_dir / f"nyc_dsm_{TARGET_ELEV_RESOLUTION_M}m_pc.tif"

# --- Define nodata value --- #
ELEV_NODATA_VALUE = -9999.0 # Use a float value for both


def _download_3dep_elevation(collection, label, item_label, out_path, bbox):
    """Mosaic one 3DEP lidar collection over ``bbox`` and save it as a COG.

    Args:
        collection: Planetary Computer STAC collection id to search
            (e.g. "3dep-lidar-dtm").
        label: Short product name used in log messages ("DEM" or "DSM").
        item_label: Name used for the STAC items in log messages.
        out_path: Destination GeoTIFF path. If it already exists, nothing is
            downloaded.
        bbox: [min_lon, min_lat, max_lon, max_lat] in EPSG:4326.

    Returns:
        True if ``out_path`` already existed or was written successfully,
        False otherwise. Failures are logged rather than raised.
    """
    if out_path.exists():
        logging.info(f"{label} file {out_path.name} already exists. Skipping download.")
        return True

    # Write to a temporary sibling file and rename on success, so an interrupted
    # run never leaves a truncated raster that the exists() guard would accept.
    tmp_path = out_path.with_name(f"{out_path.stem}.partial{out_path.suffix}")
    try:
        # Imported for its side effect: registers the `.rio` accessor on xarray objects.
        import rioxarray  # noqa: F401

        bbox = [float(v) for v in bbox]
        logging.info(f"Using bounding box for {label} query: {bbox}")

        catalog = pystac_client.Client.open(PC_STAC_URL, modifier=planetary_computer.sign_inplace)
        items = catalog.search(collections=[collection], bbox=bbox).item_collection()
        logging.info(f"Found {len(items)} 3DEP {item_label} items for the bounding box.")

        if not items:
            logging.warning(f"No 3DEP {item_label} items found for the specified area. Cannot download {label}.")
            return False

        logging.info(f"Requesting {label} data from stackstac at resolution: {TARGET_ELEV_RESOLUTION_DEG} degrees")
        # Every STAC item becomes one slice along "time". Pixels an item does not
        # cover must be NaN here: mean(skipna=True) only ignores NaN, so filling
        # with the -9999 sentinel at this stage would average the sentinel into
        # every pixel that is not covered by all items.
        stacked = stackstac.stack(
            items,
            assets=["data"],
            epsg=4326,
            resolution=TARGET_ELEV_RESOLUTION_DEG,
            dtype=np.float32,
            fill_value=np.nan,
            rescale=False,
            bounds_latlon=bbox,
        )
        mosaic = stacked.mean("time", skipna=True)

        # Explicitly select the single 'data' band if a 'band' dimension exists,
        # then drop any remaining singleton dimensions.
        if "band" in mosaic.dims:
            mosaic = mosaic.isel(band=0)
        mosaic = mosaic.squeeze()

        # Only after the reduction are the gaps replaced by the on-disk nodata
        # sentinel; the explicit casts keep the raster float32.
        mosaic = mosaic.fillna(np.float32(ELEV_NODATA_VALUE)).astype(np.float32)

        logging.info(f"Created {label} xarray with shape: {mosaic.shape} and dtype: {mosaic.dtype}")

        # Assign CRS if missing
        if mosaic.rio.crs is None:
            logging.warning(f"Assigning CRS EPSG:4326 to {label} data as it was missing after stackstac.")
            mosaic = mosaic.rio.write_crs(CRS_WGS84)

        # Set nodata value explicitly for writing
        mosaic = mosaic.rio.write_nodata(ELEV_NODATA_VALUE)

        logging.info(f"Saving {label} to {out_path}...")
        mosaic.rio.to_raster(tmp_path, driver="COG")
        tmp_path.replace(out_path)
        logging.info(f"Successfully downloaded and saved {label} from Planetary Computer.")
        return True

    except ImportError as e:
        logging.error(f"Missing libraries for Planetary Computer access ({e}). Please install: pystac-client planetary-computer stackstac rioxarray numpy")
    except Exception as e:
        logging.error(f"An error occurred during Planetary Computer {label} download: {e}", exc_info=True)
    finally:
        # Remove a leftover partial file (no-op after a successful rename).
        if tmp_path.exists():
            tmp_path.unlink()
    return False


# --- Download DEM from Planetary Computer --- #
print(f"\n--- Downloading DEM from Planetary Computer at approx {TARGET_ELEV_RESOLUTION_M}m resolution ---")
dem_downloaded_pc = _download_3dep_elevation(
    "3dep-lidar-dtm", "DEM", "DTM (DEM)", final_dem_path_tif, bounds
)

# --- Download DSM from Planetary Computer --- #
print(f"\n--- Downloading DSM from Planetary Computer at approx {TARGET_ELEV_RESOLUTION_M}m resolution ---")
dsm_downloaded_pc = _download_3dep_elevation(
    "3dep-lidar-dsm", "DSM", "DSM", final_dsm_path_tif, bounds
)

# --- Update Relative Paths for Config ---
dem_path_relative = Path("data") / city_name / "sat_files" / final_dem_path_tif.name
dsm_path_relative = Path("data") / city_name / "sat_files" / final_dsm_path_tif.name
print(f"\nRelative paths for config (relative to project root '{project_root}'):")
print(f"  DEM: {dem_path_relative}")
print(f"  DSM: {dsm_path_relative}")

# --- Final Check ---
print("\n--- Final Status ---")
if final_dem_path_tif.exists():
    print(f"DEM file exists: {final_dem_path_tif}")
else:
    print("DEM file NOT found.")
if final_dsm_path_tif.exists():
    print(f"DSM file exists: {final_dsm_path_tif}")
else:
    print("DSM file NOT found.")



--- Downloading DEM/DSM for NYC ---
Querying DEM API: https://elevation.its.ny.gov/arcgis/rest/services/Dem_Indexes/FeatureServer/0/query
Found DEM download URL via API: https://gisdata.ny.gov/elevation/DEM/NYC_TopoBathymetric2017/be_NYC_001.tif
Downloading nyc_dem_1ft_2017.zip from https://gisdata.ny.gov/elevation/DEM/NYC_TopoBathymetric2017/be_NYC_001.tif...


nyc_dem_1ft_2017.zip:  27%|██▋       | 21.0M/77.8M [00:06<00:16, 3.57MiB/s]


KeyboardInterrupt: 