# UHI Data Download and Processing Pipeline

Configured for downloading Sentinel-2b / Landsat-LST mosaics within a bounding box defined by a CSV of UHI measurements.
Designed to work for NYC, but can be extended to other cities. Requires ground-level intraday UHI measurements.

## 1. Setup and Dependencies

In [6]:
import sys
import os
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
from pathlib import Path
from datetime import datetime
import logging
import requests
import os
from tqdm import tqdm
import subprocess
import zipfile # For more robust unzipping
import time
import pystac_client
import pystac
import planetary_computer
import stackstac

# Imports for tile index processing
import geopandas as gpd
from shapely.geometry import box

# Make the project's `src` package importable: the notebook is assumed to run
# from a 'notebooks' subdirectory, so the project root is the parent of the CWD.
project_root = Path.cwd().parent
sys.path.insert(0, os.fspath(project_root))

# Route INFO-and-above log records to the notebook with timestamp and level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

In [7]:

# --- Configuration for Data Download ---
# Derives the download bounding box and a representative observation date
# from the city's UHI measurement CSV.

# Configure parameters for the target city (e.g., NYC)
city_name = "NYC"

# Time window matching original notebooks (adjust if needed)
sentinel_time_window = "2021-06-01/2021-09-01"
lst_time_window = "2021-06-01/2021-09-01"

# Input files and general settings
data_dir = Path("data")
abs_output_dir = project_root / data_dir
uhi_csv = data_dir / city_name / "uhi.csv"  # Path to UHI data (relative to project root)
abs_uhi_csv = project_root / uhi_csv
# bbox_csv is no longer needed for bounds calculation

if not abs_uhi_csv.exists():
    raise FileNotFoundError(f"UHI data CSV not found at {abs_uhi_csv}. Cannot derive bounds.")
print(f"Loading bounds from UHI data: {abs_uhi_csv}")
uhi_df = pd.read_csv(abs_uhi_csv)

# Validate every column this cell reads up front ('datetime' is used below for
# the representative date), so a malformed CSV fails with a clear message
# instead of a bare KeyError further down.
required_cols = ['Longitude', 'Latitude', 'datetime']
if not all(col in uhi_df.columns for col in required_cols):
    raise ValueError(f"UHI CSV must contain columns: {required_cols}")
# A header-only CSV would otherwise yield NaN bounds and an IndexError on iloc[0].
if uhi_df.empty:
    raise ValueError(f"UHI CSV at {abs_uhi_csv} contains no rows. Cannot derive bounds.")

# Bounding box as [min lon, min lat, max lon, max lat]. Cast to plain floats so
# the values print and serialize cleanly rather than as NumPy scalar objects.
bounds = [
    float(uhi_df['Longitude'].min()),
    float(uhi_df['Latitude'].min()),
    float(uhi_df['Longitude'].max()),
    float(uhi_df['Latitude'].max()),
]

# Load observation date: the first row's timestamp is used as the representative day
first_datetime_obj = pd.to_datetime(uhi_df['datetime'].iloc[0], format='%d-%m-%Y %H:%M')
# Format the date object into 'YYYY-MM-DD' string format
uhi_date_str = first_datetime_obj.strftime('%Y-%m-%d')
print(f"Representative UHI date (from first row): {uhi_date_str}")

Loading bounds from UHI data: /home/jupyter/MLC-Project/data/NYC/uhi.csv
Representative UHI date (from first row): 2021-07-24


## 2. Download Satellite Data for Cities

Now we'll download satellite imagery data (Sentinel-2 median composites, Landsat LST medians) for specific cities and time periods derived from the UHI data timestamps. Data is saved locally for use by the dataloader.

In [8]:
# Project-local download helpers: the Sentinel-2 cloudless mosaic builder ...
from src.ingest.get_median import create_and_save_cloudless_mosaic
# ... and the single-file LST median downloader
from src.ingest.create_sat_tensor_files import download_single_lst_median
import pandas as pd
from pathlib import Path
import os

# Cloudless-mosaic settings (matching Sentinel2_GeoTIFF.ipynb)
mosaic_bands = ["B02", "B03", "B04", "B08"] # RGB+NIR for Clay compatibility
mosaic_resolution_m = 10
mosaic_cloud_cover = 30

# LST-median settings (matching Landsat_LST.ipynb)
include_lst = False         # Whether to download LST
lst_resolution_m = 30      # Native resolution for Landsat LST

# Build the expected mosaic filename from the "start/end" Sentinel window,
# with the dashes stripped from each date.
_sentinel_window_parts = sentinel_time_window.split('/')
start_dt_str = _sentinel_window_parts[0].replace('-','')
end_dt_str = _sentinel_window_parts[1].replace('-','')
band_str = "_".join(mosaic_bands)
cloudless_mosaic_filename = f"sentinel_{city_name}_{start_dt_str}_to_{end_dt_str}_cloudless_mosaic.npy"
cloudless_mosaic_path = abs_output_dir.joinpath(city_name, "sat_files", cloudless_mosaic_filename)

# --- Verification ---
# Echo the effective settings so the run is self-describing.
for _summary_line in (
    f"City: {city_name}",
    f"Sentinel-2 Time Window: {sentinel_time_window}",
    f"Sentinel-2 Cloud Cover Threshold: {mosaic_cloud_cover}%",
    f"LST Time Window: {lst_time_window}",
    f"Bounds derived from {uhi_csv.name}: {bounds}",
    f"Target mosaic output path: {cloudless_mosaic_path}",
    f"Include LST: {include_lst}",
):
    print(_summary_line)


City: NYC
Sentinel-2 Time Window: 2021-06-01/2021-09-01
Sentinel-2 Cloud Cover Threshold: 30%
LST Time Window: 2021-06-01/2021-09-01
Bounds derived from uhi.csv: [np.float64(-73.99445667), np.float64(40.75879167), np.float64(-73.87945833), np.float64(40.85949667)]
Target mosaic output path: /home/jupyter/MLC-Project/data/NYC/sat_files/sentinel_NYC_20210601_to_20210901_cloudless_mosaic.npy
Include LST: False


In [None]:
# --- 1. Generate Cloudless Mosaic ---
# Builds (or reuses) the Sentinel-2 mosaic for the configured bounds/window.
# The mosaic is mandatory, so a falsy return aborts the cell.
print(f"\n--- Generating Cloudless Mosaic ({sentinel_time_window}) ---")

mosaic_output_path = create_and_save_cloudless_mosaic(
    city_name=city_name,
    bounds=bounds,
    output_dir=abs_output_dir,
    time_window=sentinel_time_window, # Use the explicit time window
    selected_bands=mosaic_bands,
    resolution_m=mosaic_resolution_m,
    cloud_cover=mosaic_cloud_cover # Use the updated cloud cover
)

if not mosaic_output_path:
    # Stop if mosaic fails, as it's required
    raise RuntimeError("Failed to generate cloudless mosaic.")
print(f"Cloudless mosaic saved/found at: {mosaic_output_path}")

# --- 2. Download Single LST Median (if enabled) ---
# LST is optional: a failure here is reported but does not abort the cell.
print(f"\n--- Downloading Single LST Median (Include: {include_lst}, Window: {lst_time_window}) ---")

single_lst_median_file_path = None # Initialize path variable
if include_lst:
    # No need to check UHI CSV, we provide the time window directly

    # Download the single LST median using the explicit time window
    single_lst_median_file_path = download_single_lst_median(
        city_name=city_name,
        bounds=bounds,
        output_dir=abs_output_dir,
        time_window=lst_time_window, # Provide explicit window
        # uhi_csv_path and averaging_window are omitted/None
        resolution_m=lst_resolution_m
        # lst_cloud_cover is handled internally by load_lst_tensor_from_bbox_median
    )

    if single_lst_median_file_path:
        print(f"Single LST median saved/found at: {single_lst_median_file_path}")
    else:
        print("Failed to generate single LST median.")
else:
    print("Skipping LST median download as include_lst is False.")

# --- Verification ---
sat_files_check_dir = Path(abs_output_dir) / city_name / "sat_files"
print("\nVerifying output files:")
print(f"  Mosaic path ({cloudless_mosaic_path.name}) exists: {cloudless_mosaic_path.exists()}")
# The filename above is reconstructed by the notebook and may not match the
# helper's own naming, so also check the path the helper actually returned.
returned_mosaic_path = Path(mosaic_output_path)
if returned_mosaic_path != cloudless_mosaic_path:
    print(f"  Returned mosaic path ({returned_mosaic_path}) exists: {returned_mosaic_path.exists()}")
if include_lst:
    # Construct expected LST filename based on the explicit window
    lst_start_str = lst_time_window.split('/')[0].replace('-','')
    lst_end_str = lst_time_window.split('/')[1].replace('-','')
    expected_lst_filename = f"lst_{city_name}_median_{lst_start_str}_to_{lst_end_str}.npy"
    expected_lst_path = sat_files_check_dir / expected_lst_filename
    print(f"  Single LST median path ({expected_lst_filename}) exists: {expected_lst_path.exists()}")
    # Update the variable used by later cells if generation was successful.
    # Wrap in Path() for the existence check so a string return value works too.
    if single_lst_median_file_path and not Path(single_lst_median_file_path).exists():
        # This case shouldn't happen if the function worked, but good sanity check
        print(f"Warning: LST download function returned a path but it doesn't exist: {single_lst_median_file_path}")
        single_lst_median_file_path = None # Ensure later cells know it failed
    elif not single_lst_median_file_path and expected_lst_path.exists():
        # File existed previously, update path variable for later cells
        single_lst_median_file_path = expected_lst_path


## 3. Download + Build DEM (DTM) / DSM for NYC from Planetary Computer

In [None]:
# ─────────────────────────────────────────────────────────────────────────────
# CONFIG
# ─────────────────────────────────────────────────────────────────────────────
# DEM/DSM specific settings
# Fetching BOTH DEM and DSM from Planetary Computer

# --- URLs and Paths ---
CRS_WGS84 = "EPSG:4326"

# Ensure the 'sat_files' directory exists
sat_files_dir = abs_output_dir / city_name / "sat_files"
sat_files_dir.mkdir(parents=True, exist_ok=True)

# --- Target Resolution --- #
TARGET_ELEV_RESOLUTION_M = 10 # Target resolution in meters (downloading source 1m files make dataloading to the model extremely slow)
# Approximate 10m in degrees at NYC latitude (approx 40.7 N)
# Latitude: 1 deg ~ 111 km => 10m ~ 0.00009 deg
# Longitude: 1 deg ~ 85 km => 10m ~ 0.000118 deg
# Using the latitude approximation for stackstac resolution parameter
TARGET_ELEV_RESOLUTION_DEG = 0.00009

# --- Define final expected TIF paths --- #
# Filenames reflect the 10m resolution. The prefix is derived from city_name
# ("NYC" -> "nyc", i.e. the same names as before) so other cities get their own files.
_elev_file_prefix = city_name.lower()
final_dem_path_tif = sat_files_dir / f"{_elev_file_prefix}_dem_{TARGET_ELEV_RESOLUTION_M}m_pc.tif"
final_dsm_path_tif = sat_files_dir / f"{_elev_file_prefix}_dsm_{TARGET_ELEV_RESOLUTION_M}m_pc.tif"

# --- Define nodata value --- #
ELEV_NODATA_VALUE = -9999.0 # Use a float value for both


def _download_elevation_from_pc(
    *,
    collection,
    label,
    source_label,
    output_path,
    bbox,
    resolution_deg=TARGET_ELEV_RESOLUTION_DEG,
    nodata=ELEV_NODATA_VALUE,
    crs=CRS_WGS84,
):
    """Fetch one elevation layer from Planetary Computer and save it as a COG.

    Searches `collection` for items intersecting `bbox`, stacks their "data"
    asset on an EPSG:4326 grid, averages over the stacked "time" axis, and
    writes the result to `output_path`. If `output_path` already exists the
    download is skipped. All failures are logged rather than raised.

    Args:
        collection: STAC collection id to search (e.g. "3dep-lidar-dtm").
        label: Short layer name used in log messages ("DEM" or "DSM").
        source_label: Collection description used in the item-count log messages.
        output_path: Destination GeoTIFF path (a `pathlib.Path`).
        bbox: [min lon, min lat, max lon, max lat] in degrees.
        resolution_deg: Output pixel size in degrees.
        nodata: Value written for pixels with no valid source data.
        crs: CRS assigned to the result if none survives the aggregation.

    Returns:
        True if the file already existed or was written successfully, else False.
    """
    if output_path.exists():
        logging.info(f"{label} file {output_path.name} already exists. Skipping download.")
        return True

    try:
        # Imported here so a missing optional dependency is reported by the
        # ImportError handler below instead of failing the whole cell.
        import pystac_client
        import planetary_computer
        import stackstac
        import rioxarray  # noqa: F401 -- importing registers the `.rio` accessor
        import numpy as np

        logging.info(f"Using bounding box for {label} query: {bbox}")

        catalog = pystac_client.Client.open(
            "https://planetarycomputer.microsoft.com/api/stac/v1",
            modifier=planetary_computer.sign_inplace,
        )
        items = catalog.search(collections=[collection], bbox=bbox).item_collection()
        logging.info(f"Found {len(items)} 3DEP {source_label} items for the bounding box.")

        if not items:
            logging.warning(
                f"No 3DEP {source_label} items found for the specified area. Cannot download {label}."
            )
            return False

        logging.info(f"Requesting {label} data from stackstac at resolution: {resolution_deg} degrees")
        # Missing pixels are filled with NaN (not the nodata sentinel) while
        # stacking: `skipna` only ignores NaN, so filling with -9999 here would
        # average the sentinel into real elevations wherever an item does not
        # cover a pixel. The sentinel is applied after the aggregation instead.
        elev = stackstac.stack(
            items,
            assets=["data"],
            epsg=4326,
            resolution=resolution_deg,  # Target ~10m resolution in degrees
            dtype=np.float32,
            fill_value=np.nan,
            rescale=False,
            bounds_latlon=bbox,
        ).mean("time", skipna=True)  # Aggregate time first

        # Explicitly select the single 'data' band if 'band' dimension exists
        if "band" in elev.dims:
            elev = elev.isel(band=0)

        # Squeeze any remaining singleton dimensions
        elev = elev.squeeze()

        # Pixels that no item covered are still NaN; encode them as nodata.
        elev = elev.fillna(nodata).astype(np.float32)

        logging.info(f"Created {label} xarray with shape: {elev.shape} and dtype: {elev.dtype}")

        # Assign CRS if missing
        if elev.rio.crs is None:
            logging.warning(f"Assigning CRS EPSG:4326 to {label} data as it was missing after stackstac.")
            elev = elev.rio.write_crs(crs)

        # Set nodata value explicitly for writing
        elev = elev.rio.write_nodata(nodata)

        logging.info(f"Saving {label} to {output_path}...")
        elev.rio.to_raster(output_path, driver="COG")
        logging.info(f"Successfully downloaded and saved {label} from Planetary Computer.")
        return True

    except ImportError as e:
        logging.error(f"Missing libraries for Planetary Computer access ({e}). Please install: pystac-client planetary-computer stackstac rioxarray numpy")
    except Exception as e:
        # Best-effort download: log with traceback and let the cell continue so
        # the other layer and the final status report still run.
        logging.error(f"An error occurred during Planetary Computer {label} download: {e}", exc_info=True)
    return False


# --- Download DEM from Planetary Computer --- #
print(f"\n--- Downloading DEM from Planetary Computer at approx {TARGET_ELEV_RESOLUTION_M}m resolution ---")
dem_downloaded_pc = _download_elevation_from_pc(
    collection="3dep-lidar-dtm",  # Use DTM collection
    label="DEM",
    source_label="DTM (DEM)",
    output_path=final_dem_path_tif,
    bbox=bounds,  # Use the same bounds calculated earlier
)

# --- Download DSM from Planetary Computer --- #
print(f"\n--- Downloading DSM from Planetary Computer at approx {TARGET_ELEV_RESOLUTION_M}m resolution ---")
dsm_downloaded_pc = _download_elevation_from_pc(
    collection="3dep-lidar-dsm",
    label="DSM",
    source_label="DSM",
    output_path=final_dsm_path_tif,
    bbox=bounds,  # Use the same bounds
)

# --- Update Relative Paths for Config ---
dem_path_relative = Path("data") / city_name / "sat_files" / final_dem_path_tif.name
dsm_path_relative = Path("data") / city_name / "sat_files" / final_dsm_path_tif.name
print(f"\nRelative paths for config (relative to project root '{project_root}'):")
print(f"  DEM: {dem_path_relative}")
print(f"  DSM: {dsm_path_relative}")

# --- Final Check ---
print("\n--- Final Status ---")
if final_dem_path_tif.exists():
    print(f"DEM file exists: {final_dem_path_tif}")
else:
    print("DEM file NOT found.")
if final_dsm_path_tif.exists():
    print(f"DSM file exists: {final_dsm_path_tif}")
else:
    print("DSM file NOT found.")
