# UHI Data Download and Processing Pipeline

## 1. Setup and Dependencies

In [2]:
import sys
import os
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
from pathlib import Path
from datetime import datetime
import logging
import requests
import os
from tqdm import tqdm
import subprocess

# Add the project root to the Python path to allow importing from src
project_root = Path(os.getcwd()).parent  # Assumes notebook is in 'notebooks' subdir
sys.path.insert(0, str(project_root))

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [15]:

# --- Configuration for Data Download ---

# Configure parameters for the target city (e.g., NYC)
city_name = "NYC"

# Time window matching original notebooks (adjust if needed)
sentinel_time_window = "2021-06-01/2021-09-01"
lst_time_window = "2021-06-01/2021-09-01"

# Input files and general settings
data_dir = Path("data")
abs_output_dir = project_root / data_dir
uhi_csv = data_dir / city_name / "uhi.csv" # Path to UHI data
abs_uhi_csv = project_root / uhi_csv
# bbox_csv is no longer needed for bounds calculation

if not abs_uhi_csv.exists():
    raise FileNotFoundError(f"UHI data CSV not found at {abs_uhi_csv}. Cannot derive bounds.")
print(f"Loading bounds from UHI data: {abs_uhi_csv}")
uhi_df = pd.read_csv(abs_uhi_csv)
# Check if required columns exist
required_cols = ['Longitude', 'Latitude']
if not all(col in uhi_df.columns for col in required_cols):
     raise ValueError(f"UHI CSV must contain columns: {required_cols}")

# Load bounds
bounds = [
    uhi_df['Longitude'].min(),
    uhi_df['Latitude'].min(),
    uhi_df['Longitude'].max(),
    uhi_df['Latitude'].max()
]

# Load observation da
first_datetime_obj = pd.to_datetime(uhi_df['datetime'].iloc[0], format='%d-%m-%Y %H:%M')
# Format the date object into 'YYYY-MM-DD' string format
uhi_date_str = first_datetime_obj.strftime('%Y-%m-%d')
print(f"Representative UHI date (from first row): {uhi_date_str}")

Loading bounds from UHI data: /home/jupyter/UHI/MLC-Project/data/NYC/uhi.csv
Representative UHI date (from first row): 2021-07-24


## 2. Download Satellite Data for Cities

Now we'll download satellite imagery data (Sentinel-2 median composites, Landsat LST medians) for specific cities and time periods derived from the UHI data timestamps. Data is saved locally for use by the dataloader.

In [12]:
# Cell from notebooks/download_data.ipynb (modified)

# Import functions
from src.ingest.get_median import create_and_save_cloudless_mosaic
# Import the modified LST download function
from src.ingest.create_sat_tensor_files import download_single_lst_median
import pandas as pd
from pathlib import Path
import os

# Parameters for Cloudless Mosaic (matching Sentinel2_GeoTIFF.ipynb)
mosaic_bands = ["B02", "B03", "B04", "B08"] # RGB+NIR for Clay compatibility
mosaic_resolution_m = 10
mosaic_cloud_cover = 30

# Parameters for LST Median (matching Landsat_LST.ipynb)
include_lst = False         # Whether to download LST
lst_resolution_m = 30      # Native resolution for Landsat LST

# Generate output path for the mosaic based on the new time window
start_dt_str = sentinel_time_window.split('/')[0].replace('-','')
end_dt_str = sentinel_time_window.split('/')[1].replace('-','')
band_str = "_".join(mosaic_bands)
cloudless_mosaic_filename = f"sentinel_{city_name}_{start_dt_str}_to_{end_dt_str}_cloudless_mosaic.npy"
cloudless_mosaic_path = abs_output_dir / city_name / "sat_files" / cloudless_mosaic_filename
# --- Verification ---
print(f"City: {city_name}")
print(f"Sentinel-2 Time Window: {sentinel_time_window}")
print(f"Sentinel-2 Cloud Cover Threshold: {mosaic_cloud_cover}%")
print(f"LST Time Window: {lst_time_window}")
print(f"Bounds derived from {uhi_csv.name}: {bounds}")
print(f"Target mosaic output path: {cloudless_mosaic_path}")
print(f"Include LST: {include_lst}")

Loading bounds from UHI data: /home/jupyter/UHI/MLC-Project/data/NYC/uhi.csv
City: NYC
Sentinel-2 Time Window: 2021-06-01/2021-09-01
Sentinel-2 Cloud Cover Threshold: 30%
LST Time Window: 2021-06-01/2021-09-01
Bounds derived from uhi.csv: [np.float64(-73.99445667), np.float64(40.75879167), np.float64(-73.87945833), np.float64(40.85949667)]
Target mosaic output path: /home/jupyter/UHI/MLC-Project/data/NYC/sat_files/sentinel_NYC_20210601_to_20210901_cloudless_mosaic.npy
Include LST: False


In [13]:
# --- 1. Generate Cloudless Mosaic --- 
print(f"\n--- Generating Cloudless Mosaic ({sentinel_time_window}) ---")

mosaic_output_path = create_and_save_cloudless_mosaic(
    city_name=city_name,
    bounds=bounds,
    output_dir=abs_output_dir,
    time_window=sentinel_time_window, # Use the explicit time window
    selected_bands=mosaic_bands,
    resolution_m=mosaic_resolution_m,
    cloud_cover=mosaic_cloud_cover # Use the updated cloud cover
)

if mosaic_output_path:
    print(f"Cloudless mosaic saved/found at: {mosaic_output_path}")
else:
    # Stop if mosaic fails, as it's required
    raise RuntimeError("Failed to generate cloudless mosaic.")

# --- 2. Download Single LST Median (if enabled) ---
print(f"\n--- Downloading Single LST Median (Include: {include_lst}, Window: {lst_time_window}) ---")

single_lst_median_file_path = None # Initialize path variable
if include_lst:
    # No need to check UHI CSV, we provide the time window directly
    
    # Download the single LST median using the explicit time window
    single_lst_median_file_path = download_single_lst_median(
        city_name=city_name,
        bounds=bounds,
        output_dir=abs_output_dir,
        time_window=lst_time_window, # Provide explicit window
        # uhi_csv_path and averaging_window are omitted/None
        resolution_m=lst_resolution_m
        # lst_cloud_cover is handled internally by load_lst_tensor_from_bbox_median
    )

    if single_lst_median_file_path:
        print(f"Single LST median saved/found at: {single_lst_median_file_path}")
    else:
        print("Failed to generate single LST median.")
else:
    print("Skipping LST median download as include_lst is False.")

# --- Verification ---
sat_files_check_dir = Path(abs_output_dir) / city_name / "sat_files"
print(f"\nVerifying output files:")
print(f"  Mosaic path ({cloudless_mosaic_path.name}) exists: {cloudless_mosaic_path.exists()}")
if include_lst:
    # Construct expected LST filename based on the explicit window
    lst_start_str = lst_time_window.split('/')[0].replace('-','')
    lst_end_str = lst_time_window.split('/')[1].replace('-','')
    expected_lst_filename = f"lst_{city_name}_median_{lst_start_str}_to_{lst_end_str}.npy"
    expected_lst_path = sat_files_check_dir / expected_lst_filename
    print(f"  Single LST median path ({expected_lst_filename}) exists: {expected_lst_path.exists()}")
    # Update the variable used by later cells if generation was successful
    if single_lst_median_file_path and not single_lst_median_file_path.exists():
         # This case shouldn't happen if the function worked, but good sanity check
         print(f"Warning: LST download function returned a path but it doesn't exist: {single_lst_median_file_path}")
         single_lst_median_file_path = None # Ensure later cells know it failed
    elif not single_lst_median_file_path and expected_lst_path.exists():
         # File existed previously, update path variable for later cells
         single_lst_median_file_path = expected_lst_path

2025-04-27 21:00:41,109 - INFO - Cloudless mosaic /home/jupyter/UHI/MLC-Project/data/NYC/sat_files/sentinel_NYC_20210601_to_20210901_cloudless_mosaic.npy already exists. Skipping generation.



--- Generating Cloudless Mosaic (2021-06-01/2021-09-01) ---
Cloudless mosaic saved/found at: /home/jupyter/UHI/MLC-Project/data/NYC/sat_files/sentinel_NYC_20210601_to_20210901_cloudless_mosaic.npy

--- Downloading Single LST Median (Include: False, Window: 2021-06-01/2021-09-01) ---
Skipping LST median download as include_lst is False.

Verifying output files:
  Mosaic path (sentinel_NYC_20210601_to_20210901_cloudless_mosaic.npy) exists: True


## 3. Download DEM/DSM Data (NYC)

Download Digital Elevation Model (DEM) and Digital Surface Model (DSM) data for the city. These provide ground elevation and surface elevation (including buildings/trees) respectively.

We will use the 1ft resolution data from NYC Open Data (derived from 2017 LiDAR).

In [None]:
print(f"\n--- Downloading DEM/DSM for {city_name} ---")

# --- URLs and Paths ---
# DEM: Query the ArcGIS REST API to find the download link
dem_api_query_url = "https://elevation.its.ny.gov/arcgis/rest/services/Dem_Indexes/FeatureServer/0/query"
# DSM: Use the direct export link, but note it might fail (404 previously)
# If it fails, the correct download needs to be found manually on the dataset page:
# https://data.cityofnewyork.us/City-Government/1-foot-Digital-Surface-Model-DSM-2017-/btfm-ttmn
dsm_direct_url = "https://data.cityofnewyork.us/api/geospatial/btfm-ttmn?method=export&format=Original" # NYC 1ft DSM (2017 LiDAR)

sat_files_dir = abs_output_dir / city_name / "sat_files"
sat_files_dir.mkdir(parents=True, exist_ok=True)

dem_filename_zip = "nyc_dem_1ft_2017.zip" # Assuming API link points to a zip
dsm_filename_zip = "nyc_dsm_1ft_2017.zip"

dem_output_path_zip = sat_files_dir / dem_filename_zip
dsm_output_path_zip = sat_files_dir / dsm_filename_zip

# Define final expected TIF paths (adjust if filename inside zip is different)
final_dem_path_tif = sat_files_dir / "nyc_dem_1ft_2017.tif"
final_dsm_path_tif = sat_files_dir / "nyc_dsm_1ft_2017.tif"

# --- Helper Function to Download ---
def download_file(url, output_path):
    if not url:
        print(f"Error: No URL provided for {output_path.name}.")
        return False
    if output_path.exists():
        print(f"File {output_path.name} already exists. Skipping download.")
        return True
    try:
        print(f"Downloading {output_path.name} from {url}...")
        response = requests.get(url, stream=True, timeout=60) # Added timeout
        response.raise_for_status() # Raise an exception for bad status codes
        total_size = int(response.headers.get('content-length', 0))
        block_size = 8192 # Increased block size

        with open(output_path, 'wb') as f, tqdm(
            desc=output_path.name,
            total=total_size,
            unit='iB',
            unit_scale=True,
            unit_divisor=1024,
        ) as bar:
            for data in response.iter_content(block_size):
                size = f.write(data)
                bar.update(size)
        print(f"Successfully downloaded {output_path.name}")
        return True
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {output_path.name}: {e}")
        if output_path.exists(): # Clean up partial download
            os.remove(output_path)
        return False
    except Exception as e:
        print(f"An unexpected error occurred during download of {output_path.name}: {e}")
        if output_path.exists(): # Clean up partial download
             os.remove(output_path)
        return False

# --- Helper Function to Unzip ---
def unzip_file(zip_path, extract_dir):
    if not zip_path.exists():
        print(f"Zip file not found: {zip_path}")
        return False
    # Check if the final TIF already exists to avoid unnecessary unzipping
    expected_tif_name = zip_path.stem + ".tif" # e.g., nyc_dem_1ft_2017.tif
    expected_tif_path = extract_dir / expected_tif_name
    if expected_tif_path.exists():
        print(f"Expected TIF file {expected_tif_path.name} already exists. Skipping unzip.")
        return True
        
    try:
        print(f"Unzipping {zip_path.name} to {extract_dir}...")
        # Use -o to overwrite existing files without prompting
        result = subprocess.run(['unzip', '-o', str(zip_path), '-d', str(extract_dir)],
                                capture_output=True, text=True, check=True, timeout=300) # Added timeout
        print(f"Successfully unzipped {zip_path.name}")
        
        # Verify the expected TIF file exists after unzipping
        if not expected_tif_path.exists():
             print(f"Warning: Expected TIF file {expected_tif_path.name} not found directly in {extract_dir} after unzipping. Check subdirectories or zip contents.")
             # Look for any .tif file as a fallback check
             tif_files = list(extract_dir.glob('*.tif')) + list(extract_dir.glob('*/*.tif'))
             if tif_files:
                 print(f"  Found other TIF files: {[f.name for f in tif_files]}. Manual check might be needed.")
             else:
                 print(f"  No .tif files found in {extract_dir} or immediate subdirectories.")
             return False # Consider it failed if the *specific* expected file isn't there
        return True
    except FileNotFoundError:
        print("Error: 'unzip' command not found. Please install it.")
        return False
    except subprocess.TimeoutExpired:
        print(f"Error: Unzipping {zip_path.name} timed out.")
        return False
    except subprocess.CalledProcessError as e:
        print(f"Error unzipping {zip_path.name}: {e}")
        print(f"Stderr: {e.stderr}")
        return False
    except Exception as e:
        print(f"An unexpected error occurred during unzipping of {zip_path.name}: {e}")
        return False

# --- Download DEM ---
dem_download_url = None
try:
    print(f"Querying DEM API: {dem_api_query_url}")
    params = {'where': '1=1', 'outFields': 'DIRECT_DL', 'f': 'json'}
    api_response = requests.get(dem_api_query_url, params=params, timeout=30)
    api_response.raise_for_status()
    api_data = api_response.json()
    
    if 'features' in api_data and len(api_data['features']) > 0:
        # Assume the first feature's link is sufficient or representative
        dem_download_url = api_data['features'][0].get('attributes', {}).get('DIRECT_DL')
        if dem_download_url:
            print(f"Found DEM download URL via API: {dem_download_url}")
        else:
            print("Error: Found features in DEM API response, but no 'DIRECT_DL' attribute.")
    else:
        print("Error: No features found in DEM API query response.")
        
except requests.exceptions.RequestException as e:
    print(f"Error querying DEM API: {e}")
except json.JSONDecodeError:
    print("Error: Could not decode JSON response from DEM API.")
except Exception as e:
    print(f"An unexpected error occurred during DEM API query: {e}")

dem_downloaded = False
dem_unzipped = False
if dem_download_url:
    dem_downloaded = download_file(dem_download_url, dem_output_path_zip)
    if dem_downloaded:
        dem_unzipped = unzip_file(dem_output_path_zip, sat_files_dir)
else:
    print("Skipping DEM download and unzip as URL could not be retrieved from API.")

# --- Download DSM ---
print("\nAttempting direct download for DSM (Note: Link may be broken, check dataset page if fails)")
dsm_downloaded = download_file(dsm_direct_url, dsm_output_path_zip)
dsm_unzipped = False
if dsm_downloaded:
    dsm_unzipped = unzip_file(dsm_output_path_zip, sat_files_dir)


# --- Verification ---
print(f"\nVerifying final DEM/DSM TIF files:")
print(f"  Expected DEM TIF: {final_dem_path_tif} -> Exists: {final_dem_path_tif.exists()}")
print(f"  Expected DSM TIF: {final_dsm_path_tif} -> Exists: {final_dsm_path_tif.exists()}")

# Store paths for potential use in later cells or for the training notebook config
config_dem_path_relative = Path("data") / city_name / "sat_files" / final_dem_path_tif.name
config_dsm_path_relative = Path("data") / city_name / "sat_files" / final_dsm_path_tif.name
print(f"\nRelative paths for config:")
print(f"  DEM: {config_dem_path_relative}")
print(f"  DSM: {config_dsm_path_relative}")

# Optional: Clean up zip files after successful unzip and verification
if dem_unzipped and final_dem_path_tif.exists() and dem_output_path_zip.exists():
    print(f"Cleaning up {dem_output_path_zip.name}...")
    # os.remove(dem_output_path_zip)
if dsm_unzipped and final_dsm_path_tif.exists() and dsm_output_path_zip.exists():
     print(f"Cleaning up {dsm_output_path_zip.name}...")
     # os.remove(dsm_output_path_zip) # Uncomment to enable cleanup
