# UHI Data Download and Processing Pipeline

In [1]:
# Your code here

# UHI Data Download and Processing Pipeline

This notebook demonstrates how to:

1. Download and process UHI GeoTIFF files from sources like Fort Lauderdale (FTL) and convert them to CSV format
2. Download satellite imagery for specific cities and time periods and save them locally
3. Use local satellite data files with the dataloader instead of direct API calls

The pipeline is designed to work with different city datasets with the same structure.

## 1. Setup and Dependencies

In [2]:
import sys
import os
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
from pathlib import Path
from datetime import datetime
import logging

# Add the project root to the Python path to allow importing from src
project_root = Path(os.getcwd()).parent  # Assumes notebook is in 'notebooks' subdir
sys.path.insert(0, str(project_root))

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [15]:

# --- Configuration for Data Download ---

# Configure parameters for the target city (e.g., NYC)
city_name = "NYC"

# Time window matching original notebooks (adjust if needed)
sentinel_time_window = "2021-06-01/2021-09-01"
lst_time_window = "2021-06-01/2021-09-01"

# Input files and general settings
data_dir = Path("data")
abs_output_dir = project_root / data_dir
uhi_csv = data_dir / city_name / "uhi.csv" # Path to UHI data
abs_uhi_csv = project_root / uhi_csv
# bbox_csv is no longer needed for bounds calculation

if not abs_uhi_csv.exists():
    raise FileNotFoundError(f"UHI data CSV not found at {abs_uhi_csv}. Cannot derive bounds.")
print(f"Loading bounds from UHI data: {abs_uhi_csv}")
uhi_df = pd.read_csv(abs_uhi_csv)
# Check if required columns exist
required_cols = ['Longitude', 'Latitude']
if not all(col in uhi_df.columns for col in required_cols):
     raise ValueError(f"UHI CSV must contain columns: {required_cols}")

# Load bounds
bounds = [
    uhi_df['Longitude'].min(),
    uhi_df['Latitude'].min(),
    uhi_df['Longitude'].max(),
    uhi_df['Latitude'].max()
]

# Load observation da
first_datetime_obj = pd.to_datetime(uhi_df['datetime'].iloc[0], format='%d-%m-%Y %H:%M')
# Format the date object into 'YYYY-MM-DD' string format
uhi_date_str = first_datetime_obj.strftime('%Y-%m-%d')
print(f"Representative UHI date (from first row): {uhi_date_str}")

Loading bounds from UHI data: /home/jupyter/UHI/MLC-Project/data/NYC/uhi.csv
Representative UHI date (from first row): 2021-07-24


## 2. Download Satellite Data for Cities

Now we'll download satellite imagery data (Sentinel-2 median composites, Landsat LST medians) for specific cities and time periods derived from the UHI data timestamps. Data is saved locally for use by the dataloader.

In [12]:
# Cell from notebooks/download_data.ipynb (modified)

# Import functions
from src.ingest.get_median import create_and_save_cloudless_mosaic
# Import the modified LST download function
from src.ingest.create_sat_tensor_files import download_single_lst_median
import pandas as pd
from pathlib import Path
import os

# Parameters for Cloudless Mosaic (matching Sentinel2_GeoTIFF.ipynb)
mosaic_bands = ["B02", "B03", "B04", "B08"] # RGB+NIR for Clay compatibility
mosaic_resolution_m = 10
mosaic_cloud_cover = 30

# Parameters for LST Median (matching Landsat_LST.ipynb)
include_lst = False         # Whether to download LST
lst_resolution_m = 30      # Native resolution for Landsat LST

# Generate output path for the mosaic based on the new time window
start_dt_str = sentinel_time_window.split('/')[0].replace('-','')
end_dt_str = sentinel_time_window.split('/')[1].replace('-','')
band_str = "_".join(mosaic_bands)
cloudless_mosaic_filename = f"sentinel_{city_name}_{start_dt_str}_to_{end_dt_str}_cloudless_mosaic.npy"
cloudless_mosaic_path = abs_output_dir / city_name / "sat_files" / cloudless_mosaic_filename
# --- Verification ---
print(f"City: {city_name}")
print(f"Sentinel-2 Time Window: {sentinel_time_window}")
print(f"Sentinel-2 Cloud Cover Threshold: {mosaic_cloud_cover}%")
print(f"LST Time Window: {lst_time_window}")
print(f"Bounds derived from {uhi_csv.name}: {bounds}")
print(f"Target mosaic output path: {cloudless_mosaic_path}")
print(f"Include LST: {include_lst}")

Loading bounds from UHI data: /home/jupyter/UHI/MLC-Project/data/NYC/uhi.csv
City: NYC
Sentinel-2 Time Window: 2021-06-01/2021-09-01
Sentinel-2 Cloud Cover Threshold: 30%
LST Time Window: 2021-06-01/2021-09-01
Bounds derived from uhi.csv: [np.float64(-73.99445667), np.float64(40.75879167), np.float64(-73.87945833), np.float64(40.85949667)]
Target mosaic output path: /home/jupyter/UHI/MLC-Project/data/NYC/sat_files/sentinel_NYC_20210601_to_20210901_cloudless_mosaic.npy
Include LST: False


In [13]:
# --- 1. Generate Cloudless Mosaic --- 
print(f"\n--- Generating Cloudless Mosaic ({sentinel_time_window}) ---")

mosaic_output_path = create_and_save_cloudless_mosaic(
    city_name=city_name,
    bounds=bounds,
    output_dir=abs_output_dir,
    time_window=sentinel_time_window, # Use the explicit time window
    selected_bands=mosaic_bands,
    resolution_m=mosaic_resolution_m,
    cloud_cover=mosaic_cloud_cover # Use the updated cloud cover
)

if mosaic_output_path:
    print(f"Cloudless mosaic saved/found at: {mosaic_output_path}")
else:
    # Stop if mosaic fails, as it's required
    raise RuntimeError("Failed to generate cloudless mosaic.")

# --- 2. Download Single LST Median (if enabled) ---
print(f"\n--- Downloading Single LST Median (Include: {include_lst}, Window: {lst_time_window}) ---")

single_lst_median_file_path = None # Initialize path variable
if include_lst:
    # No need to check UHI CSV, we provide the time window directly
    
    # Download the single LST median using the explicit time window
    single_lst_median_file_path = download_single_lst_median(
        city_name=city_name,
        bounds=bounds,
        output_dir=abs_output_dir,
        time_window=lst_time_window, # Provide explicit window
        # uhi_csv_path and averaging_window are omitted/None
        resolution_m=lst_resolution_m
        # lst_cloud_cover is handled internally by load_lst_tensor_from_bbox_median
    )

    if single_lst_median_file_path:
        print(f"Single LST median saved/found at: {single_lst_median_file_path}")
    else:
        print("Failed to generate single LST median.")
else:
    print("Skipping LST median download as include_lst is False.")

# --- Verification ---
sat_files_check_dir = Path(abs_output_dir) / city_name / "sat_files"
print(f"\nVerifying output files:")
print(f"  Mosaic path ({cloudless_mosaic_path.name}) exists: {cloudless_mosaic_path.exists()}")
if include_lst:
    # Construct expected LST filename based on the explicit window
    lst_start_str = lst_time_window.split('/')[0].replace('-','')
    lst_end_str = lst_time_window.split('/')[1].replace('-','')
    expected_lst_filename = f"lst_{city_name}_median_{lst_start_str}_to_{lst_end_str}.npy"
    expected_lst_path = sat_files_check_dir / expected_lst_filename
    print(f"  Single LST median path ({expected_lst_filename}) exists: {expected_lst_path.exists()}")
    # Update the variable used by later cells if generation was successful
    if single_lst_median_file_path and not single_lst_median_file_path.exists():
         # This case shouldn't happen if the function worked, but good sanity check
         print(f"Warning: LST download function returned a path but it doesn't exist: {single_lst_median_file_path}")
         single_lst_median_file_path = None # Ensure later cells know it failed
    elif not single_lst_median_file_path and expected_lst_path.exists():
         # File existed previously, update path variable for later cells
         single_lst_median_file_path = expected_lst_path

2025-04-27 21:00:41,109 - INFO - Cloudless mosaic /home/jupyter/UHI/MLC-Project/data/NYC/sat_files/sentinel_NYC_20210601_to_20210901_cloudless_mosaic.npy already exists. Skipping generation.



--- Generating Cloudless Mosaic (2021-06-01/2021-09-01) ---
Cloudless mosaic saved/found at: /home/jupyter/UHI/MLC-Project/data/NYC/sat_files/sentinel_NYC_20210601_to_20210901_cloudless_mosaic.npy

--- Downloading Single LST Median (Include: False, Window: 2021-06-01/2021-09-01) ---
Skipping LST median download as include_lst is False.

Verifying output files:
  Mosaic path (sentinel_NYC_20210601_to_20210901_cloudless_mosaic.npy) exists: True


## 3. Using Local Satellite Data with the Dataloader

Finally, we'll demonstrate how to use the modified dataloader that works with local satellite data files instead of making API calls directly.

In [14]:
from src.ingest.dataloader import CityDataSet
import matplotlib.pyplot as plt

weather_csv = data_dir / city_name / "weather.csv"


# Construct the *correct* path for the generated mosaic file
start_dt_str = sentinel_time_window.split('/')[0].replace('-','')
end_dt_str = sentinel_time_window.split('/')[1].replace('-','')
actual_mosaic_filename = f"sentinel_{city_name}_{start_dt_str}_to_{end_dt_str}_cloudless_mosaic.npy"
actual_mosaic_path = abs_data_dir / city_name / "sat_files" / actual_mosaic_filename

# Construct the *expected* path for the LST median file (used if include_lst=True)
single_lst_median_file_path = None # Initialize
if include_lst:
    lst_start_str = lst_time_window.split('/')[0].replace('-','')
    lst_end_str = lst_time_window.split('/')[1].replace('-','')
    expected_lst_filename = f"lst_{city_name}_median_{lst_start_str}_to_{lst_end_str}.npy"
    single_lst_median_file_path = abs_data_dir / city_name / "sat_files" / expected_lst_filename
    if not single_lst_median_file_path.exists():
        print(f"Warning: Expected LST median file not found at {single_lst_median_file_path}. LST will likely be disabled by DataLoader.")
        # Keep the path so DataLoader can log the error if needed, or set to None
        # single_lst_median_file_path = None

# --- Check required files before initializing Dataset ---
required_paths_dict_ds = {
    "UHI CSV": abs_uhi_csv,
    "Weather CSV": abs_weather_csv,
    "Cloudless Mosaic": actual_mosaic_path
}
if include_lst and single_lst_median_file_path: # Only require LST path if include_lst is True and path is defined
     required_paths_dict_ds["Single LST Median"] = single_lst_median_file_path

print("Checking existence of required files for CityDataSet initialization:")
all_ds_files_exist = True
for name, path in required_paths_dict_ds.items():
    exists = False
    if path and Path(path).exists():
        exists = True
    else:
        all_ds_files_exist = False
        print(f"  MISSING: {name} at {path}")
    # print(f"  {name}: {path} -> Exists: {exists}") # Optional verbose print

if not all_ds_files_exist:
    print("\nError: Not all required files exist. Cannot initialize CityDataSet.")
else:
    print("\nAll required files found. Initializing dataset...")
    try:
        # Use mosaic_resolution_m for the target resolution
        target_resolution_m = mosaic_resolution_m

        # Use the string path for LST, or None
        lst_path_arg = str(single_lst_median_file_path) if include_lst and single_lst_median_file_path and single_lst_median_file_path.exists() else None

        dataset = CityDataSet(
            bounds=bounds,
            averaging_window=averaging_window_lst, # Pass required arg
            resolution_m=target_resolution_m,
            uhi_csv=str(abs_uhi_csv),
            bbox_csv=str(abs_bbox_csv),
            weather_csv=str(abs_weather_csv),
            cloudless_mosaic_path=str(actual_mosaic_path), # Use correct mosaic path
            data_dir=str(abs_data_dir),
            city_name=city_name,
            include_lst=include_lst,
            single_lst_median_path=lst_path_arg # Pass path or None
        )

        print(f"\nSuccessfully initialized dataset with {len(dataset)} samples.")
        print(f"  LST included in output: {dataset.include_lst}")
        print(f"  Target grid resolution: {dataset.resolution_m}m")
        print(f"  Target grid shape (H, W): ({dataset.sat_H}, {dataset.sat_W})")


        # --- Inspect First Sample ---
        if len(dataset) > 0:
            first_sample = dataset[0]
            print("\nSample keys:", list(first_sample.keys()))

            print("\nTensor shapes in first sample:")
            for key, tensor in first_sample.items():
                if hasattr(tensor, 'shape'):
                     print(f"  {key}: {tensor.shape} (dtype: {tensor.dtype})")
                else:
                     print(f"  {key}: {type(tensor)}") # Should all be tensors

            # Plot the cloudless mosaic (RGB) - Requires bands B04, B03, B02 in that order in the mosaic
            # NOTE: This requires knowing the band order within the saved .npy file.
            # Assuming standard RGB order [Red (B04), Green (B03), Blue (B02)] might be indices [2, 1, 0]
            # if the mosaic was saved with ["B02", "B03", "B04", "B08"] order. Adjust if needed.
            mosaic_tensor = first_sample['cloudless_mosaic']
            if mosaic_tensor.shape[0] >= 3:
                try:
                    # Assuming Red=idx 2, Green=idx 1, Blue=idx 0 based on typical ["B02", "B03", "B04", ...] order
                    rgb_indices = [2, 1, 0]
                    rgb = mosaic_tensor[rgb_indices, :, :]
                    rgb = np.transpose(rgb.numpy(), (1, 2, 0)) # Transpose to H, W, C for imshow

                    # Normalize for display (simple min-max scaling)
                    min_val, max_val = np.percentile(rgb, [2, 98]) # Clip outliers
                    rgb_display = np.clip((rgb - min_val) / (max_val - min_val), 0, 1)

                    plt.figure(figsize=(8, 8))
                    plt.imshow(rgb_display)
                    plt.title(f"Cloudless Mosaic RGB Approx. ({city_name})")
                    plt.axis('off')
                    plt.show()
                except Exception as plot_e:
                    print(f"\nCould not plot mosaic RGB: {plot_e}")


            # Plot the target UHI grid for the first sample
            target_uhi = first_sample['target'].numpy() # Convert to numpy for plotting
            uhi_mask = first_sample['mask'].numpy() > 0.5 # Boolean mask
            plt.figure(figsize=(8, 8))
            valid_uhi = target_uhi[uhi_mask]
            vmin = np.nanmin(valid_uhi) if valid_uhi.size > 0 else 0
            vmax = np.nanmax(valid_uhi) if valid_uhi.size > 0 else 1
            # Display masked areas explicitly (e.g., as white)
            display_uhi = np.where(uhi_mask, target_uhi, np.nan)
            plt.imshow(display_uhi, cmap='viridis', vmin=vmin, vmax=vmax, interpolation='nearest')
            plt.colorbar(label='UHI Index')
            plt.title(f"Target UHI Grid (Sample 0) - Masked areas are NaN/White")
            plt.axis('off')
            plt.show()

            # Plot LST if included
            if dataset.include_lst:
                 lst_grid = first_sample['lst_seq'][0,0,:,:].numpy() # Remove T and C dims
                 plt.figure(figsize=(8, 8))
                 plt.imshow(lst_grid, cmap='plasma') # LST often uses plasma/inferno
                 plt.colorbar(label='Normalized LST')
                 plt.title(f"Static LST Median (Sample 0)")
                 plt.axis('off')
                 plt.show()

    except Exception as e:
        print(f"\nError initializing or inspecting dataset: {e}")
        import traceback
        traceback.print_exc()


Checking existence of required files for CityDataSet initialization:
  MISSING: Weather CSV at /home/jupyter/UHI/MLC-Project/data/NYC/weather_grid.csv

Error: Not all required files exist. Cannot initialize CityDataSet.


In [None]:
# Initialize the dataset if all required files exist
if required_files_exist:
    try:
        # Parameters from config cell (Cell 6)
        target_resolution_m = mosaic_resolution_m # Use mosaic resolution
        print(f"\nInitializing dataset with target resolution: {target_resolution_m}m")

        # Pass the path to the single LST median file if it exists
        lst_path_arg = str(single_lst_median_file_path) if include_lst and single_lst_median_file_path else None

        dataset = CityDataSet(
            bounds=bounds,
            averaging_window=averaging_window_lst, # Still needed by constructor, though not used for LST if path provided
            resolution_m=target_resolution_m,
            uhi_csv=abs_uhi_csv,
            bbox_csv=abs_bbox_csv,
            weather_csv=abs_weather_csv,
            cloudless_mosaic_path=str(cloudless_mosaic_path),
            data_dir=abs_data_dir,
            city_name=city_name,
            include_lst=include_lst,
            single_lst_median_path=lst_path_arg
        )

        print(f"\nSuccessfully initialized dataset with {len(dataset)} samples. LST included: {dataset.include_lst}")

        # --- Inspect First Sample ---
        if len(dataset) > 0:
            first_sample = dataset[0]
            print("\nSample keys:", list(first_sample.keys()))

            print("\nTensor shapes in first sample:")
            for key, tensor in first_sample.items():
                # Only print shape, handle potential non-tensor items gracefully if any added later
                if hasattr(tensor, 'shape'):
                     print(f"  {key}: {tensor.shape} (dtype: {tensor.dtype})")
                else:
                     print(f"  {key}: {type(tensor)}")

            # Plot the cloudless mosaic (RGB)
            mosaic_tensor = first_sample['cloudless_mosaic']
            if mosaic_tensor.shape[0] >= 3:
                rgb_indices = []
                required = ["B04", "B03", "B02"]
                missing = []
                for band in required:
                     try: rgb_indices.append(mosaic_bands.index(band))
                     except ValueError: missing.append(band)

                if not missing:
                    plt.figure(figsize=(10, 8))
                    rgb = mosaic_tensor[rgb_indices, :, :]
                    rgb = np.transpose(rgb, (1, 2, 0))
                    min_val, max_val = rgb.min(), rgb.max()
                    if max_val > min_val: rgb = (rgb - min_val) / (max_val - min_val)
                    else: rgb = np.zeros_like(rgb)
                    plt.imshow(rgb)
                    plt.title(f"Cloudless Mosaic RGB ({city_name}, {mosaic_year})")
                    plt.axis('off')
                    plt.show()
                else:
                    print(f"\nCannot display RGB composite: Missing bands {missing} in mosaic_bands {mosaic_bands}")

            # Plot the target UHI grid for the first sample
            target_uhi = first_sample['target']
            uhi_mask = first_sample['mask']
            plt.figure(figsize=(10, 8))
            # Handle cases where target might be all NaN after masking/grouping
            valid_uhi = target_uhi[uhi_mask > 0.5] # Use mask to select valid points
            vmin = np.nanmin(valid_uhi) if valid_uhi.size > 0 else 0
            vmax = np.nanmax(valid_uhi) if valid_uhi.size > 0 else 1
            plt.imshow(target_uhi, cmap='viridis', vmin=vmin, vmax=vmax)
            plt.colorbar(label='UHI Index')
            plt.title(f"Target UHI Grid (Sample 0) - Masked areas are NaN/White")
            plt.axis('off')
            plt.show()

    except Exception as e:
        print(f"\nError initializing or inspecting dataset: {e}")
        import traceback
        traceback.print_exc()
else:
    print("\nCannot initialize dataset: Not all required files were found. Please run previous steps.")