# UHI Data Download and Processing Pipeline

In [None]:
# Your code here

# UHI Data Download and Processing Pipeline

This notebook demonstrates how to:

1. Download and process UHI GeoTIFF files from sources like Fort Lauderdale (FTL) and convert them to CSV format
2. Download satellite imagery for specific cities and time periods and save them locally
3. Use local satellite data files with the dataloader instead of direct API calls

The pipeline is designed to work with different city datasets with the same structure.

## 1. Setup and Dependencies

In [None]:
import sys
import os
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
from pathlib import Path
from datetime import datetime
import logging

# Add the project root to the Python path to allow importing from src.
# Assumes the notebook is launched from the 'notebooks' subdirectory, so the
# parent of the current working directory is the project root.
project_root = Path(os.getcwd()).parent
_project_root_str = str(project_root)
# Remove any existing entry first: re-running this cell then keeps exactly one
# copy at the front of sys.path instead of stacking a duplicate on every run.
while _project_root_str in sys.path:
    sys.path.remove(_project_root_str)
sys.path.insert(0, _project_root_str)

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

## 2. Download Satellite Data for Cities

Now we'll download satellite imagery data (Sentinel-2 median composites, Landsat LST medians) for a specific city over explicitly configured time windows (set in the configuration cell below). Data is saved locally for use by the dataloader.

In [None]:
# Import functions
from src.ingest.get_median import create_and_save_cloudless_mosaic
# Import the modified LST download function
from src.ingest.create_sat_tensor_files import download_single_lst_median 
import pandas as pd

# --- Configuration for Data Download ---

# Target city; its input files are looked up under data/<city_name>/.
city_name = "NYC"

# Explicit "start/end" acquisition windows for the Sentinel-2 mosaic and the LST median.
sentinel_time_window = "2021-06-01/2021-09-01"
lst_time_window = "2021-06-01/2021-09-01"

# Relative locations (under the project root) and their absolute counterparts.
data_dir = Path("data")
abs_output_dir = project_root / data_dir
# The UHI CSV is only checked for existence below; it does not drive the LST window.
uhi_csv = data_dir / city_name / "uhi_data.csv"
bbox_csv = data_dir / city_name / "bbox.csv"
abs_uhi_csv = project_root / uhi_csv
abs_bbox_csv = project_root / bbox_csv

# The bounding box is mandatory: fail early if the file is missing.
if not abs_bbox_csv.exists():
    raise FileNotFoundError(f"bbox.csv not found at {abs_bbox_csv}")
bbox_df = pd.read_csv(abs_bbox_csv)
_bbox_lons = bbox_df['longitudes']
_bbox_lats = bbox_df['latitudes']
# Order: [min_lon, min_lat, max_lon, max_lat]
bounds = [_bbox_lons.min(), _bbox_lats.min(), _bbox_lons.max(), _bbox_lats.max()]

# Cloudless mosaic settings
mosaic_bands = ["B02", "B03", "B04", "B08"]
mosaic_resolution_m = 10
mosaic_cloud_cover = 30

# LST median settings
include_lst = True     # Toggle the LST download
lst_resolution_m = 30  # Resolution requested for the LST median

# Expected mosaic file name, built from the window's start/end dates with dashes removed.
start_dt_str, end_dt_str = (
    part.replace('-', '') for part in sentinel_time_window.split('/')[:2]
)
cloudless_mosaic_filename = (
    f"sentinel_{city_name}_{start_dt_str}_to_{end_dt_str}_cloudless_mosaic.npy"
)
cloudless_mosaic_path = (
    abs_output_dir / city_name / "sat_files" / cloudless_mosaic_filename
)

# --- Verification ---
for summary_line in (
    f"City: {city_name}",
    f"Sentinel-2 Time Window: {sentinel_time_window}",
    f"Sentinel-2 Cloud Cover Threshold: {mosaic_cloud_cover}%",
    f"LST Time Window: {lst_time_window}",
    f"Bounds from {bbox_csv.name}: {bounds}",
    f"Checking if UHI CSV exists (for context): {abs_uhi_csv.exists()}",
    f"Target mosaic output path: {cloudless_mosaic_path}",
    f"Include LST: {include_lst}",
):
    print(summary_line)

In [None]:
# --- 1. Generate Cloudless Mosaic ---
print(f"\n--- Generating Cloudless Mosaic ({sentinel_time_window}) ---")

mosaic_output_path = create_and_save_cloudless_mosaic(
    city_name=city_name,
    bounds=bounds,
    output_dir=abs_output_dir,
    time_window=sentinel_time_window,  # Use the explicit time window
    selected_bands=mosaic_bands,
    resolution_m=mosaic_resolution_m,
    cloud_cover=mosaic_cloud_cover,
)

if mosaic_output_path:
    print(f"Cloudless mosaic saved/found at: {mosaic_output_path}")
else:
    # Stop if mosaic fails, as it's required
    raise RuntimeError("Failed to generate cloudless mosaic.")

# --- 2. Download Single LST Median (if enabled) ---
print(f"\n--- Downloading Single LST Median (Include: {include_lst}, Window: {lst_time_window}) ---")

single_lst_median_file_path = None  # Stays None when LST is disabled or the download fails
if include_lst:
    # The time window is passed explicitly, so no UHI CSV / averaging window is supplied.
    lst_download_result = download_single_lst_median(
        city_name=city_name,
        bounds=bounds,
        output_dir=abs_output_dir,
        time_window=lst_time_window,
        resolution_m=lst_resolution_m,
    )
    # Normalize to a Path: this cell and later cells call .exists() on the value,
    # which would fail if the helper ever returned a plain string.
    single_lst_median_file_path = Path(lst_download_result) if lst_download_result else None

    if single_lst_median_file_path:
        print(f"Single LST median saved/found at: {single_lst_median_file_path}")
    else:
        print("Failed to generate single LST median.")
else:
    print("Skipping LST median download as include_lst is False.")

# --- Verification ---
sat_files_check_dir = Path(abs_output_dir) / city_name / "sat_files"
print("\nVerifying output files:")
print(f"  Mosaic path ({cloudless_mosaic_path.name}) exists: {cloudless_mosaic_path.exists()}")
if include_lst:
    # Expected LST file name, built from the explicit window with dashes removed
    lst_start_str = lst_time_window.split('/')[0].replace('-', '')
    lst_end_str = lst_time_window.split('/')[1].replace('-', '')
    expected_lst_filename = f"lst_{city_name}_median_{lst_start_str}_to_{lst_end_str}.npy"
    expected_lst_path = sat_files_check_dir / expected_lst_filename
    print(f"  Single LST median path ({expected_lst_filename}) exists: {expected_lst_path.exists()}")
    # Reconcile the path variable consumed by later cells with what is on disk
    if single_lst_median_file_path and not single_lst_median_file_path.exists():
        # A path was returned but nothing is there: treat the download as failed
        print(f"Warning: LST download function returned a path but it doesn't exist: {single_lst_median_file_path}")
        single_lst_median_file_path = None
    elif not single_lst_median_file_path and expected_lst_path.exists():
        # No path was returned but the expected file is present: reuse it
        single_lst_median_file_path = expected_lst_path

## 4. Using Local Satellite Data with the Dataloader

Finally, we'll demonstrate how to use the modified dataloader that works with local satellite data files instead of making API calls directly.

In [None]:
# Import the updated data loader
from src.ingest.dataloader import CityDataSet

# --- Parameters for Dataloader ---
# city_name, data_dir, include_lst, mosaic_resolution_m and cloudless_mosaic_path
# come from the download-configuration cell; single_lst_median_file_path is set
# by the mosaic/LST download cell and may be None.

# Paths relative to project root
uhi_csv_rel = f"data/{city_name}/uhi_data.csv"
bbox_csv_rel = f"data/{city_name}/bbox.csv"
weather_csv_rel = f"data/{city_name}/weather_grid.csv"

# Construct absolute paths
abs_uhi_csv = project_root / uhi_csv_rel
abs_bbox_csv = project_root / bbox_csv_rel
abs_weather_csv = project_root / weather_csv_rel
abs_data_dir = project_root / data_dir

# The LST file counts as present only if a path is known and the file is on disk.
lst_exists = bool(single_lst_median_file_path) and Path(single_lst_median_file_path).exists()
# LST is a requirement only when include_lst is set. When it is set, a missing
# path must fail the check; previously a None path was silently treated as
# satisfied, contradicting the "exists: False" line printed below.
lst_requirement_met = (not include_lst) or lst_exists

# Required files check
required_files_exist = all([
    abs_uhi_csv.exists(),
    abs_bbox_csv.exists(),
    abs_weather_csv.exists(),
    cloudless_mosaic_path.exists(),
    lst_requirement_met,
])

print(f"Target Resolution for Grids: {mosaic_resolution_m}m")  # Mosaic resolution is reused for the grids
print(f"UHI CSV exists: {abs_uhi_csv.exists()}")
print(f"Bbox CSV exists: {abs_bbox_csv.exists()}")
print(f"Weather CSV exists: {abs_weather_csv.exists()}")
print(f"Cloudless Mosaic exists: {cloudless_mosaic_path.exists()}")
if include_lst:
    print(f"Single LST Median (needed={include_lst}) exists: {lst_exists}")
print(f"All required files exist: {required_files_exist}")

In [None]:
# Initialize the dataset if all required files exist, then inspect and plot the first sample
if required_files_exist:
    try:
        # Grids are built at the mosaic resolution set in the configuration cell
        target_resolution_m = mosaic_resolution_m
        print(f"\nInitializing dataset with target resolution: {target_resolution_m}m")

        # Pass the path to the single LST median file if it exists
        lst_path_arg = str(single_lst_median_file_path) if include_lst and single_lst_median_file_path else None

        # averaging_window_lst is not assigned in any earlier cell of this notebook,
        # so referencing it directly raised NameError on a fresh kernel. Honor it
        # if the user defined it; otherwise fall back to 30, the value used in the
        # new-city template later in this notebook.
        averaging_window_arg = globals().get("averaging_window_lst", 30)

        dataset = CityDataSet(
            bounds=bounds,
            averaging_window=averaging_window_arg,  # Required by the constructor call even when an LST path is given
            resolution_m=target_resolution_m,
            uhi_csv=abs_uhi_csv,
            bbox_csv=abs_bbox_csv,
            weather_csv=abs_weather_csv,
            cloudless_mosaic_path=str(cloudless_mosaic_path),
            data_dir=abs_data_dir,
            city_name=city_name,
            include_lst=include_lst,
            single_lst_median_path=lst_path_arg
        )

        print(f"\nSuccessfully initialized dataset with {len(dataset)} samples. LST included: {dataset.include_lst}")

        # --- Inspect First Sample ---
        if len(dataset) > 0:
            first_sample = dataset[0]
            print("\nSample keys:", list(first_sample.keys()))

            print("\nTensor shapes in first sample:")
            for key, tensor in first_sample.items():
                # Print shape/dtype for array-like entries, the type for anything else
                if hasattr(tensor, 'shape'):
                    print(f"  {key}: {tensor.shape} (dtype: {tensor.dtype})")
                else:
                    print(f"  {key}: {type(tensor)}")

            # Plot the cloudless mosaic (RGB)
            mosaic_tensor = first_sample['cloudless_mosaic']
            if mosaic_tensor.shape[0] >= 3:
                # Locate the red, green and blue bands within the configured band order
                rgb_indices = []
                required = ["B04", "B03", "B02"]
                missing = []
                for band in required:
                    try:
                        rgb_indices.append(mosaic_bands.index(band))
                    except ValueError:
                        missing.append(band)

                if not missing:
                    # mosaic_year was never defined in this notebook (NameError);
                    # derive the label from the start of the Sentinel-2 window.
                    mosaic_year = sentinel_time_window.split('/')[0][:4]

                    plt.figure(figsize=(10, 8))
                    rgb = mosaic_tensor[rgb_indices, :, :]
                    rgb = np.transpose(rgb, (1, 2, 0))
                    # Min-max scale to [0, 1] for display; a constant image becomes all zeros
                    min_val, max_val = rgb.min(), rgb.max()
                    if max_val > min_val:
                        rgb = (rgb - min_val) / (max_val - min_val)
                    else:
                        rgb = np.zeros_like(rgb)
                    plt.imshow(rgb)
                    plt.title(f"Cloudless Mosaic RGB ({city_name}, {mosaic_year})")
                    plt.axis('off')
                    plt.show()
                else:
                    print(f"\nCannot display RGB composite: Missing bands {missing} in mosaic_bands {mosaic_bands}")

            # Plot the target UHI grid for the first sample
            target_uhi = first_sample['target']
            uhi_mask = first_sample['mask']
            plt.figure(figsize=(10, 8))
            # Color limits come from the masked-in cells only; fall back to [0, 1] if there are none
            valid_uhi = target_uhi[uhi_mask > 0.5]
            vmin = np.nanmin(valid_uhi) if valid_uhi.size > 0 else 0
            vmax = np.nanmax(valid_uhi) if valid_uhi.size > 0 else 1
            plt.imshow(target_uhi, cmap='viridis', vmin=vmin, vmax=vmax)
            plt.colorbar(label='UHI Index')
            plt.title("Target UHI Grid (Sample 0) - Masked areas are NaN/White")
            plt.axis('off')
            plt.show()

    except Exception as e:
        # Keep the notebook running, but surface the full traceback for debugging
        print(f"\nError initializing or inspecting dataset: {e}")
        import traceback
        traceback.print_exc()
else:
    print("\nCannot initialize dataset: Not all required files were found. Please run previous steps.")

## 5. Adding a New City to the Pipeline

Here's how to add a new city to the data pipeline:

1. Prepare the UHI data CSV (lat;long;uhi) and bounding box CSV
2. Create a directory structure: `data/CITY_NAME/`
3. Run the satellite data download process for the new city
4. Use the local dataloader with the new city's satellite data

In [None]:
# Example for adding a new city (commented out - template for reference)
# The template below is a bare triple-quoted string, so none of it is executed
# when this cell runs.
# NOTE(review): process_uhi_directories and download_data_from_uhi_csv are not
# imported in any cell visible in this notebook, and the CityDataSet call in the
# template does not pass cloudless_mosaic_path / single_lst_median_path the way
# the dataloader cell earlier in this notebook does - confirm against the
# current src.ingest API before turning this into live code.
"""
# Step 1: Set up directory structure
new_city = "MIAMI"
os.makedirs(f"data/{new_city}", exist_ok=True)

# Step 2: If you have UHI GeoTIFFs, convert them to CSV
geotiff_dir = f"data/UHI_Surfaces_{new_city}"
if os.path.exists(geotiff_dir):
    process_uhi_directories(
        input_dirs=[geotiff_dir],
        output_dir="data"
    )

# Step 3: Prepare parameters
city_bounds = [-80.32, 25.70, -80.12, 25.90]  # Example for Miami [min_lon, min_lat, max_lon, max_lat]
uhi_csv = f"data/{new_city}/uhi_data.csv"
bbox_csv = f"data/{new_city}/bbox.csv"
weather_csv = f"data/{new_city}/weather_grid.csv"

# Step 4: Download satellite data
download_data_from_uhi_csv(
    city_name=new_city,
    uhi_csv=uhi_csv,
    bbox_csv=bbox_csv,
    averaging_window=30,
    output_dir="data",
    selected_bands=["B02", "B03", "B04", "B08"],
    resolution_m=10,
    include_lst=True
)

# Step 5: Initialize the dataset with local data
dataset = CityDataSet(
    bounds=city_bounds,
    averaging_window=30,
    selected_bands=["B02", "B03", "B04", "B08"],
    resolution_m=10,
    include_lst=True,
    uhi_csv=uhi_csv,
    bbox_csv=bbox_csv,
    weather_csv=weather_csv,
    data_dir="data",
    city_name=new_city
)
"""

## 3a. Download Weather Grid Data for the City

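Next, we download daily weather data (max/min temperature, precipitation) for a grid covering the city's bounding box. This data is used by the dataloader: the dataloader cell in section 4 checks for `data/<city>/weather_grid.csv`, so run this cell before section 4.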

In [None]:
import requests
import numpy as np
import pandas as pd
import os
from pathlib import Path
import logging

# Function to fetch weather data from Open-Meteo
def get_openmeteo_weather(lat, lon, start_date, end_date, timezone="America/New_York"):
    """Fetch daily weather for one point from the Open-Meteo archive API.

    Args:
        lat: Value sent as the ``latitude`` query parameter.
        lon: Value sent as the ``longitude`` query parameter.
        start_date: Value sent as ``start_date`` (this notebook passes
            "YYYY-MM-DD" strings).
        end_date: Value sent as ``end_date``.
        timezone: Value sent as ``timezone``. Defaults to "America/New_York",
            the value that was previously hard-coded, so existing callers
            are unaffected.

    Returns:
        The ``"daily"`` object of the JSON response, or ``None`` if the
        request or the response processing fails (the error is logged).
    """
    # Let requests build and URL-encode the query string instead of
    # concatenating it by hand.
    query = {
        "latitude": lat,
        "longitude": lon,
        "start_date": start_date,
        "end_date": end_date,
        "daily": "temperature_2m_max,temperature_2m_min,precipitation_sum",
        "timezone": timezone,
    }
    try:
        res = requests.get(
            "https://archive-api.open-meteo.com/v1/archive",
            params=query,
            timeout=30,  # Do not hang forever on a stalled connection
        )
        res.raise_for_status()  # Raise HTTPError for bad responses (4XX, 5XX)
        return res.json()["daily"]
    except requests.exceptions.RequestException as e:
        logging.error(f"API request failed for ({lat},{lon}): {e}")
        return None
    except Exception as e:
        # Covers e.g. a non-JSON body or a payload without a "daily" key
        logging.error(f"Failed processing weather for ({lat},{lon}): {e}")
        return None

# --- Weather Configuration ---
# city_name and bbox_csv are taken from the download-configuration cell.
weather_city_name = city_name
weather_bbox_csv = project_root / bbox_csv  # bbox_csv is relative to the project root

# Grid spacing (in degrees) and the date range requested for every grid point
grid_interval = 0.01
start_date = "2021-06-01"
end_date = "2021-09-01"

# The grid CSV is written to data/<city>/weather_grid.csv under the project root
weather_output_dir = project_root / "data" / weather_city_name
weather_output_file = weather_output_dir / "weather_grid.csv"

# --- Weather Data Download ---
print(f"\nStarting weather data download for {weather_city_name}...")
print(f"Using bbox file: {weather_bbox_csv}")
print(f"Output file: {weather_output_file}")

if os.path.exists(weather_bbox_csv):
    # Bounding-box extent
    bbox_df = pd.read_csv(weather_bbox_csv)
    _lat_col = bbox_df['latitudes']
    _lon_col = bbox_df['longitudes']
    min_lat, max_lat = _lat_col.min(), _lat_col.max()
    min_lon, max_lon = _lon_col.min(), _lon_col.max()

    # Regular lat/lon grid; the stop value is padded by one step so the max edge is covered
    lats = np.arange(min_lat, max_lat + grid_interval, grid_interval)
    lons = np.arange(min_lon, max_lon + grid_interval, grid_interval)
    _rounded_lons = [round(lon_value, 4) for lon_value in lons]
    grid_points = [
        (round(lat_value, 4), lon_value)
        for lat_value in lats
        for lon_value in _rounded_lons
    ]
    total_points = len(grid_points)
    print(f"Generated {total_points} grid points for weather data.")

    # One API call per grid point; each returned day becomes one record
    weather_records = []
    successful_fetches = 0
    for point_no, (lat, lon) in enumerate(grid_points, start=1):
        if point_no % 50 == 0:  # Progress line every 50 points
            print(f"  Fetching weather for point {point_no}/{total_points} ({lat}, {lon})...")

        daily_data = get_openmeteo_weather(lat, lon, start_date, end_date)
        if not daily_data or 'time' not in daily_data:
            # Fetch failed or returned nothing usable for this point
            continue

        successful_fetches += 1
        weather_records.extend(
            {
                "lat": lat,
                "lon": lon,
                "date": date,
                "temp_max": daily_data["temperature_2m_max"][idx],
                "temp_min": daily_data["temperature_2m_min"][idx],
                "precip": daily_data["precipitation_sum"][idx],
            }
            for idx, date in enumerate(daily_data["time"])
        )

    print(f"Successfully fetched data for {successful_fetches} out of {total_points} grid points.")

    if weather_records:
        # Persist all records in a single CSV
        df_weather = pd.DataFrame(weather_records)

        # Make sure the destination directory is there before writing
        weather_output_dir.mkdir(parents=True, exist_ok=True)

        print(f"Saving weather data with {len(df_weather)} records to {weather_output_file}...")
        df_weather.to_csv(weather_output_file, index=False, float_format='%.4f')
        print("Weather data saved successfully.")
    else:
        print("No weather data was successfully downloaded. CSV file not created.")
else:
    print(f"Error: Bounding box file not found at {weather_bbox_csv}. Skipping weather download.")

print("\nWeather data download process finished.")

