# UHI Data Download and Processing Pipeline

## 1. Setup and Dependencies

In [1]:
import sys
import os
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
from pathlib import Path
from datetime import datetime
import logging
import requests
import os
from tqdm import tqdm
import subprocess
import zipfile # For more robust unzipping
import time

# Imports for tile index processing
import geopandas as gpd
from shapely.geometry import box

# Add the project root to the Python path to allow importing from src
project_root = Path(os.getcwd()).parent  # Assumes notebook is in 'notebooks' subdir
sys.path.insert(0, str(project_root))

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [3]:

# --- Configuration for Data Download ---

# Configure parameters for the target city (e.g., NYC)
city_name = "NYC"

# Time window matching original notebooks (adjust if needed)
sentinel_time_window = "2021-06-01/2021-09-01"
lst_time_window = "2021-06-01/2021-09-01"

# Input files and general settings
data_dir = Path("data")
abs_output_dir = project_root / data_dir
uhi_csv = data_dir / city_name / "uhi.csv" # Path to UHI data
abs_uhi_csv = project_root / uhi_csv
# bbox_csv is no longer needed for bounds calculation

if not abs_uhi_csv.exists():
    raise FileNotFoundError(f"UHI data CSV not found at {abs_uhi_csv}. Cannot derive bounds.")
print(f"Loading bounds from UHI data: {abs_uhi_csv}")
uhi_df = pd.read_csv(abs_uhi_csv)
# Check if required columns exist
required_cols = ['Longitude', 'Latitude']
if not all(col in uhi_df.columns for col in required_cols):
     raise ValueError(f"UHI CSV must contain columns: {required_cols}")

# Load bounds
bounds = [
    uhi_df['Longitude'].min(),
    uhi_df['Latitude'].min(),
    uhi_df['Longitude'].max(),
    uhi_df['Latitude'].max()
]

# Load observation da
first_datetime_obj = pd.to_datetime(uhi_df['datetime'].iloc[0], format='%d-%m-%Y %H:%M')
# Format the date object into 'YYYY-MM-DD' string format
uhi_date_str = first_datetime_obj.strftime('%Y-%m-%d')
print(f"Representative UHI date (from first row): {uhi_date_str}")

Loading bounds from UHI data: /Users/arnav/MLC-Project/data/NYC/uhi.csv
Representative UHI date (from first row): 2021-07-24


## 2. Download Satellite Data for Cities

Now we'll download satellite imagery data (Sentinel-2 median composites, Landsat LST medians) for specific cities and time periods derived from the UHI data timestamps. Data is saved locally for use by the dataloader.

In [None]:
# Cell from notebooks/download_data.ipynb (modified)

# Import functions
from src.ingest.get_median import create_and_save_cloudless_mosaic
# Import the modified LST download function
from src.ingest.create_sat_tensor_files import download_single_lst_median
import pandas as pd
from pathlib import Path
import os

# Parameters for Cloudless Mosaic (matching Sentinel2_GeoTIFF.ipynb)
mosaic_bands = ["B02", "B03", "B04", "B08"] # RGB+NIR for Clay compatibility
mosaic_resolution_m = 10
mosaic_cloud_cover = 30

# Parameters for LST Median (matching Landsat_LST.ipynb)
include_lst = False         # Whether to download LST
lst_resolution_m = 30      # Native resolution for Landsat LST

# Generate output path for the mosaic based on the new time window
start_dt_str = sentinel_time_window.split('/')[0].replace('-','')
end_dt_str = sentinel_time_window.split('/')[1].replace('-','')
band_str = "_".join(mosaic_bands)
cloudless_mosaic_filename = f"sentinel_{city_name}_{start_dt_str}_to_{end_dt_str}_cloudless_mosaic.npy"
cloudless_mosaic_path = abs_output_dir / city_name / "sat_files" / cloudless_mosaic_filename
# --- Verification ---
print(f"City: {city_name}")
print(f"Sentinel-2 Time Window: {sentinel_time_window}")
print(f"Sentinel-2 Cloud Cover Threshold: {mosaic_cloud_cover}%")
print(f"LST Time Window: {lst_time_window}")
print(f"Bounds derived from {uhi_csv.name}: {bounds}")
print(f"Target mosaic output path: {cloudless_mosaic_path}")
print(f"Include LST: {include_lst}")

Loading bounds from UHI data: /home/jupyter/UHI/MLC-Project/data/NYC/uhi.csv
City: NYC
Sentinel-2 Time Window: 2021-06-01/2021-09-01
Sentinel-2 Cloud Cover Threshold: 30%
LST Time Window: 2021-06-01/2021-09-01
Bounds derived from uhi.csv: [np.float64(-73.99445667), np.float64(40.75879167), np.float64(-73.87945833), np.float64(40.85949667)]
Target mosaic output path: /home/jupyter/UHI/MLC-Project/data/NYC/sat_files/sentinel_NYC_20210601_to_20210901_cloudless_mosaic.npy
Include LST: False


### 3. Download + Build DEM / DSM for NYC (AOI‑bounded, 1 ft grid)

1. **Tile‑index fetch**  
   * Pull both tile‑index shapefiles (DEM & LAS) from the NYS topo‑bathymetric 2017 FTP mirror.  
   * They live in `/BE_DEM/…zip` and `/LAS/…zip`.  
   * They’re unzipped to a temp folder, read by GeoPandas, then deleted.

2. **Intersect AOI**  
   * Your AOI bounds (given in decimal lat/long) are re‑projected to EPSG 2263.  
   * We select only the DEM `.tif` tiles and LAS `.laz` tiles whose polygons hit that AOI.

3. **Download source tiles**  
   * DEM tiles (`be_NYC_###.tif`) stream straight from FTP to `dem_tiles/`.  
   * LAS tiles (`hh_NYC_###.laz`) stream to `las_tiles/`.  
   * Progress bars show raw byte count; no FTP `SIZE` calls (the server blocks those).

4. **Build rasters**  
   * **DEM** – `gdal_merge.py` mosaics the few BE tiles → `dem_merged_epsg2263.tif`.  
   * **DSM** – PDAL crops the LAS hits, bins highest return (`output_type=max`), writes `dsm_epsg2263.tif` at 0.3048 m (1 ft) resolution.  
   * Both rasters are then re‑projected with `gdalwarp` to EPSG 4326:  
     `dem_epsg4326.tif`, `dsm_epsg4326.tif`.

5. **Clean‑up**  
   * All temp folders (`indices/`, `dem_tiles/`, `las_tiles/`) and intermediate rasters in EPSG 2263 are removed once the 4326 GeoTIFFs are verified non‑empty.  
   * Leftover artefacts = **zero**. Only the two final products remain.

**Data Source**

* Source LiDAR survey: *NYC Topobathymetric LiDAR 2017* FTP server.
* DEM tiles: bare‑earth (`be_…`) rasters published by NYS GIS Clearinghouse.  
* DSM: freshly generated from the raw point cloud because no public HH DSM tiles exist, although the
  state did publish them in 2017, they seem to have been deleted since.

**Outputs**

| File | CRS / Units | Resolution | Description |
|------|-------------|------------|-------------|
| `dem_epsg4326.tif` | EPSG 4326, metres in Z | ≈0.00000274° (~0.3048 m) | Bare‑earth ground elevation |
| `dsm_epsg4326.tif` | EPSG 4326, metres in Z | same grid | Surface elevation (roofs, canopy) |

In [4]:
# ─────────────────────────────────────────────────────────────────────────────
# CONFIG
# ─────────────────────────────────────────────────────────────────────────────
FTP_HOST       = "ftp.gis.ny.gov"
FTP_ROOT       = "/elevation/LIDAR/NYC_TopoBathymetric2017"

# --- URLs and Paths ---
# DEM: Query the ArcGIS REST API to find the download link
dem_api_query_url = "https://elevation.its.ny.gov/arcgis/rest/services/Dem_Indexes/FeatureServer/0/query"
# DSM: Will be fetched from Planetary Computer

CRS_WGS84      = "EPSG:4326"
CRS_NAD83_NY   = "EPSG:2263"

dem_filename_zip = "nyc_dem_1ft_2017.zip" # Assuming API link points to a zip
# DSM filename will be determined by PC query
dem_output_path_zip = sat_files_dir / dem_filename_zip

# Define final expected TIF paths (DSM path updated)
final_dem_path_tif = sat_files_dir / "nyc_dem_1ft_2017.tif"
final_dsm_path_tif = sat_files_dir / "nyc_dsm_1m_pc.tif" # Using 1m resolution from PC

# --- Helper Function to Download (requests, for DEM API) ---
def download_file(url, output_path):
    if not url:
        print(f"Error: No URL provided for {output_path.name}.")
        return False
    if output_path.exists():
        print(f"File {output_path.name} already exists. Skipping download.")
        return True
    try:
        print(f"Downloading {output_path.name} from {url}...")
        response = requests.get(url, stream=True, timeout=120) # Increased timeout
        response.raise_for_status() # Raise an exception for bad status codes
        total_size = int(response.headers.get('content-length', 0))
        block_size = 8192 # Increased block size

        with open(output_path, 'wb') as f, tqdm(
            desc=output_path.name,
            total=total_size,
            unit='iB',
            unit_scale=True,
            unit_divisor=1024,
        ) as bar:
            for data in response.iter_content(block_size):
                size = f.write(data)
                bar.update(size)
        print(f"Successfully downloaded {output_path.name}")
        return True
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {output_path.name}: {e}")
        if output_path.exists(): os.remove(output_path)
        return False
    except Exception as e:
        print(f"An unexpected error occurred during download of {output_path.name}: {e}")
        if output_path.exists(): os.remove(output_path)
        return False

# --- Helper Function to Unzip (for DEM) ---
def unzip_file(zip_path, extract_dir):
    if not zip_path.exists():
        print(f"Zip file not found: {zip_path}")
        return False
    expected_tif_name = zip_path.stem + ".tif"
    expected_tif_path = extract_dir / expected_tif_name
    if expected_tif_path.exists():
        print(f"Expected TIF file {expected_tif_path.name} already exists. Skipping unzip.")
        return True
        
    try:
        print(f"Unzipping {zip_path.name} to {extract_dir}...")
        subprocess.run(['unzip', '-o', str(zip_path), '-d', str(extract_dir)],
                       capture_output=True, text=True, check=True, timeout=300)
        print(f"Successfully unzipped {zip_path.name}")
        if not expected_tif_path.exists():
             print(f"Warning: Expected TIF file {expected_tif_path.name} not found after unzipping.")
             return False
        return True
    except Exception as e:
        print(f"Error during unzipping of {zip_path.name}: {e}")
        return False

# --- Download DEM (Existing Logic) ---
dem_download_url = None
try:
    print(f"Querying DEM API: {dem_api_query_url}")
    params = {'where': '1=1', 'outFields': 'DIRECT_DL', 'f': 'json'}
    api_response = requests.get(dem_api_query_url, params=params, timeout=30)
    api_response.raise_for_status()
    api_data = api_response.json()
    if 'features' in api_data and len(api_data['features']) > 0:
        dem_download_url = api_data['features'][0].get('attributes', {}).get('DIRECT_DL')
        if dem_download_url:
            print(f"Found DEM download URL via API: {dem_download_url}")
        else:
            print("Error: DEM API response missing 'DIRECT_DL'.")
    else:
        print("Error: No features found in DEM API query response.")
dexcept Exception as e:
    print(f"Error querying DEM API: {e}")

dem_downloaded = False
dem_unzipped = False
if dem_download_url:
    dem_downloaded = download_file(dem_download_url, dem_output_path_zip)
    if dem_downloaded:
        dem_unzipped = unzip_file(dem_output_path_zip, sat_files_dir)
else:
    print("Skipping DEM download and unzip.")

# --- Download DSM from Planetary Computer (No Auth Key Needed) --- 
print("\n--- Downloading DSM from Planetary Computer ---")
dsm_downloaded_pc = False
if final_dsm_path_tif.exists():
    print(f"DSM file {final_dsm_path_tif.name} already exists. Skipping download.")
    dsm_downloaded_pc = True
else:
    try:
        # Define bounding box and CRS for STAC query
        bbox = bounds # Use the same bounds calculated earlier
        print(f"Using bounding box for DSM query: {bbox}")

        # Search the 3dep-lidar-dsm collection using the public STAC endpoint
        catalog = pystac_client.Client.open("https://planetarycomputer.microsoft.com/api/stac/v1")
        search = catalog.search(
            collections=["3dep-lidar-dsm"],
            bbox=bbox,
        )
        items = search.item_collection()
        print(f"Found {len(items)} 3DEP DSM items for the bounding box.")
        
        if not items:
            print("No 3DEP DSM items found for the specified area. Cannot download DSM.")
        else:
            # Sign items - this might be needed for direct access depending on collection settings
            # but often works without an explicit key for public data
            signed_items = planetary_computer.sign(items)
            
            # Load items into an xarray DataArray using stackstac
            # We will use the 'data' asset which is the Cloud Optimized GeoTIFF
            # Requesting resolution close to 1m (approx 0.00001 degrees)
            dsm_data = stackstac.stack(
                signed_items, # Use signed items
                assets=["data"],
                resolution=0.00001, # Approx 1m
                dtype=np.float32,
                fill_value=np.nan, # Use NaN for fill
                bounds_latlon=bbox # Ensure stack uses the query bounds
            ).squeeze() # Remove time/band dims if unnecessary
            
            print(f"Created DSM xarray with shape: {dsm_data.shape}")
            # Assign CRS if missing (should be EPSG:4326 from PC)
            if dsm_data.rio.crs is None:
                 dsm_data = dsm_data.rio.write_crs("EPSG:4326")
                 
            # Set nodata value explicitly for writing
            dsm_data.rio.write_nodata(np.nan, inplace=True)
            
            # Save the DataArray to a GeoTIFF
            print(f"Saving DSM to {final_dsm_path_tif}...")
            dsm_data.rio.to_raster(final_dsm_path_tif, driver="COG") # Use Cloud Optimized GeoTIFF driver
            print("Successfully downloaded and saved DSM from Planetary Computer.")
            dsm_downloaded_pc = True
            
    except ImportError as e:
        print(f"Error: Missing libraries for Planetary Computer access ({e}). Install pystac-client, planetary-computer, stackstac, rioxarray.")
    except Exception as e:
        print(f"An error occurred during Planetary Computer DSM download: {e}")

# reproject DEM to EPSG:4326
if not dem4326.exists():
    subprocess.run(["gdalwarp", "-t_srs", CRS_WGS84, "-r", "bilinear",
                    "-dstnodata", "-9999", "-overwrite",
                    merged_dem2263, dem4326], check=True)

# --- Update Relative Paths for Config ---
config_dem_path_relative = Path("data") / city_name / "sat_files" / final_dem_path_tif.name
config_dsm_path_relative = Path("data") / city_name / "sat_files" / final_dsm_path_tif.name # Use the new filename
print(f"\nRelative paths for config:")
print(f"  DEM: {config_dem_path_relative}")
print(f"  DSM: {config_dsm_path_relative}")

# Optional: Clean up DEM zip file
if dem_unzipped and final_dem_path_tif.exists() and dem_output_path_zip.exists():
    print(f"Cleaning up {dem_output_path_zip.name}...")
    # os.remove(dem_output_path_zip) # Uncomment to enable cleanup



--- Downloading DEM/DSM for NYC ---
Querying DEM API: https://elevation.its.ny.gov/arcgis/rest/services/Dem_Indexes/FeatureServer/0/query
Found DEM download URL via API: https://gisdata.ny.gov/elevation/DEM/NYC_TopoBathymetric2017/be_NYC_001.tif
Downloading nyc_dem_1ft_2017.zip from https://gisdata.ny.gov/elevation/DEM/NYC_TopoBathymetric2017/be_NYC_001.tif...


nyc_dem_1ft_2017.zip:  27%|██▋       | 21.0M/77.8M [00:06<00:16, 3.57MiB/s]


KeyboardInterrupt: 