# UHI Data Download and Processing Pipeline

In [None]:
# Your code here

# UHI Data Download and Processing Pipeline

This notebook demonstrates how to:

1. Process UHI GeoTIFF files from sources like Fort Lauderdale (FTL) that are stored under the project's `data/` directory and convert them to CSV format
2. Download satellite imagery for specific cities and time periods and save it locally
3. Use local satellite data files with the dataloader instead of direct API calls

The pipeline is designed to work with different city datasets with the same structure.

## 1. Setup and Dependencies

In [None]:
import sys
import os
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
from pathlib import Path
from datetime import datetime
import logging
import s3fs # Add s3fs for GOES download

# Add the project root to the Python path to allow importing from `src`.
# Assumes the notebook is run from the 'notebooks' subdirectory, i.e. the
# project root is the parent of the current working directory.
project_root = Path.cwd().parent

# Guard the insert so that re-running this cell does not keep prepending
# duplicate entries to sys.path.
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

# Setup logging (basicConfig is a no-op if the root logger is already configured)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

## 2. GeoTIFF to CSV Conversion

First, we'll convert UHI GeoTIFF files (like those from Fort Lauderdale) to a structured CSV format with latitude, longitude, UHI values, and time period information.

In [None]:
# GeoTIFF -> CSV converter (project-local module)
from src.ingest.uhi_converter import process_uhi_directories

# Folders holding the UHI GeoTIFFs, given relative to the project root.
# Append further entries for other cities, e.g. "data/UHI_Surfaces_CITY2".
input_directories = ["data/UHI Surfaces_FTL"]

# Destination folder for the generated CSV files (relative to the project root)
output_directory = "data"

# Resolve everything against the project root so the conversion does not
# depend on the notebook's current working directory
absolute_output_dir = str(project_root.joinpath(output_directory))
absolute_input_dirs = [
    str(project_root.joinpath(relative_dir)) for relative_dir in input_directories
]

print(f"Input directories: {input_directories}")
print(f"Output directory: {output_directory}")

# Kick off the conversion
print("\nStarting UHI GeoTIFF to CSV conversion...")
process_uhi_directories(
    input_dirs=absolute_input_dirs,
    output_dir=absolute_output_dir,
)
print("Conversion process finished. Check the output directory for CSV files.")

## 3. Download Satellite Data for Cities

Now we'll download satellite imagery data (Sentinel-2 median composites, Landsat LST medians, and hourly GOES LST) for specific cities and time periods derived from the UHI data timestamps. Data is saved locally for use by the dataloader.

In [None]:
# Entry point for the satellite downloads (project-local module)
from src.ingest.create_sat_tensor_files import download_data_from_uhi_csv

# --- Download configuration ---

# Target city (e.g., NYC) and its input files, relative to the project root
city_name = "NYC"
uhi_csv = f"data/{city_name}/uhi_data.csv"
bbox_csv = f"data/{city_name}/bbox.csv"

# Look-back window (in days) used for the median composites
averaging_window = 30
# Base directory under which all downloaded data (Sentinel, LST, GOES) is saved
output_dir = "data"

# Sentinel-2: bands to use and spatial resolution in meters
selected_bands = ["B02", "B03", "B04", "B08"]
resolution_m = 10

# Landsat: whether to include LST median composites
include_lst = True

# --- Sanity check ---
# Report whether the two required input files are present
print(f"Checking if UHI CSV exists: {os.path.exists(project_root.joinpath(uhi_csv))}")
print(f"Checking if bbox CSV exists: {os.path.exists(project_root.joinpath(bbox_csv))}")


In [None]:
# Download satellite data if input files exist.
# Paths are resolved against the project root once and reused below.
abs_uhi_csv = str(project_root / uhi_csv)
abs_bbox_csv = str(project_root / bbox_csv)
abs_output_dir = str(project_root / output_dir)

if os.path.exists(abs_uhi_csv) and os.path.exists(abs_bbox_csv):
    print(f"Starting satellite data download for {city_name}...")
    print(f"Include Landsat LST: {include_lst}")

    # --- Download Sentinel, Landsat LST, and GOES LST ---
    # The function handles all downloads based on flags and returns the
    # time-window lookup table that is inspected below.
    lookup_table = download_data_from_uhi_csv(
        city_name=city_name,
        uhi_csv=abs_uhi_csv,
        bbox_csv=abs_bbox_csv,
        averaging_window=averaging_window,
        output_dir=abs_output_dir,
        selected_bands=selected_bands,
        resolution_m=resolution_m,
        include_lst=include_lst,
    )

    print(f"\nSatellite data download process complete for {city_name}.")

    # --- Verification of Downloaded Data Directories ---
    # NOTE(review): the "sat_files" / "goes_files" layout is assumed from this
    # notebook; confirm against create_sat_tensor_files if the downloader changes.
    sat_files_check_dir = Path(abs_output_dir) / city_name / "sat_files"
    goes_files_check_dir = Path(abs_output_dir) / city_name / "goes_files"

    print("\nVerifying output directories:")
    print(f"  Sentinel/Landsat LST dir ({sat_files_check_dir}) exists: {sat_files_check_dir.exists()}")
    # The GOES directory was previously computed but never reported.
    print(f"  GOES LST dir ({goes_files_check_dir}) exists: {goes_files_check_dir.exists()}")

    # Show the first few entries in the lookup table (for Sentinel/Landsat medians).
    # Slicing avoids walking the whole table just to print three entries.
    if lookup_table:
        print("\nSample entries from timewindow lookup table (Sentinel/Landsat):")
        for time_window, files in list(lookup_table.items())[:3]:
            print(f"  Time window: {time_window}")
            print(f"    Sentinel file: {files.get('sentinel')}")
            print(f"    LST file: {files.get('lst')}")
else:
    print("Input files (UHI or bbox CSV) not found. Please ensure they exist at the specified paths:")
    print(f"  UHI Path: {project_root / uhi_csv}")
    print(f"  Bbox Path: {project_root / bbox_csv}")

## 4. Using Local Satellite Data with the Dataloader

Finally, we'll demonstrate how to use the modified dataloader that works with local satellite data files instead of making API calls directly.

In [None]:
# Import the local data loader
from src.ingest.dataloaders_local import CityDataSet

# City to load and the base directory of the stored data (relative to the
# project root). Defined first so that every path below follows them instead
# of hard-coding the city name a second time.
city_name = "NYC"
data_dir = "data"

# Parameters for NYC (as an example).
# NOTE: `bounds` is city-specific and must be updated together with `city_name`.
bounds = [-74.01, 40.75, -73.86, 40.88]  # NYC bounding box [min_lon, min_lat, max_lon, max_lat]
averaging_window = 30  # Days to look back
selected_bands = ["B02", "B03", "B04", "B08"]  # Sentinel-2 bands
resolution_m = 10  # Spatial resolution in meters
include_lst = True  # Include Land Surface Temperature data

# CSV file paths (relative to the project root), derived from the city name
uhi_csv = f"{data_dir}/{city_name}/uhi_data.csv"
bbox_csv = f"{data_dir}/{city_name}/bbox.csv"
weather_csv = f"{data_dir}/{city_name}/weather_grid.csv"

# Location of the downloaded satellite files and their time-window lookup table
sat_files_dir = Path(project_root) / data_dir / city_name / "sat_files"
lookup_path = sat_files_dir / "timewindow_lookup.json"

# Verify that the satellite data needed by the dataloader is present
print(f"Checking if satellite data directory exists: {os.path.exists(sat_files_dir)}")
print(f"Checking if timewindow lookup file exists: {os.path.exists(lookup_path)}")

In [None]:
# Initialize the dataset if the satellite data directory and lookup file exist
if os.path.exists(sat_files_dir) and os.path.exists(lookup_path):
    try:
        # Convert paths to absolute
        abs_uhi_csv = str(project_root / uhi_csv)
        abs_bbox_csv = str(project_root / bbox_csv)
        abs_weather_csv = str(project_root / weather_csv)
        abs_data_dir = str(project_root / data_dir)

        # Create the dataset with local satellite data
        dataset = CityDataSet(
            bounds=bounds,
            averaging_window=averaging_window,
            selected_bands=selected_bands,
            resolution_m=resolution_m,
            include_lst=include_lst,
            uhi_csv=abs_uhi_csv,
            bbox_csv=abs_bbox_csv,
            weather_csv=abs_weather_csv,
            data_dir=abs_data_dir,
            city_name=city_name
        )

        print(f"Successfully initialized dataset with {len(dataset)} samples")

        # Show dimensions of the first satellite tensor
        if len(dataset) > 0:
            first_sample = dataset[0]
            satellite_tensor, weather_tensor, meta_tensor = first_sample

            print(f"\nFirst satellite tensor shape: {satellite_tensor.shape}")
            print(f"Weather tensor: {weather_tensor}")
            print(f"Meta tensor: {meta_tensor}")

            # Plot the first satellite image (RGB composite)
            if satellite_tensor.shape[0] >= 3:
                # Channel positions of R, G, B.
                # NOTE(review): assumes the channel order of the satellite tensor
                # matches the order of `selected_bands` - confirm in CityDataSet.
                rgb_idx = [selected_bands.index(b) for b in ["B04", "B03", "B02"] if b in selected_bands]
                if len(rgb_idx) == 3:
                    # Explicit array conversion in case the dataset returns a
                    # tensor-like object rather than a NumPy array.
                    rgb = np.asarray(satellite_tensor)[rgb_idx, :, :]
                    # (channels, height, width) -> (height, width, channels) for imshow
                    rgb = np.transpose(rgb, (1, 2, 0))

                    # Min-max normalize for display; a constant image has zero
                    # range, so show it as black instead of dividing by zero.
                    rgb_min = rgb.min()
                    value_range = rgb.max() - rgb_min
                    if value_range > 0:
                        rgb = (rgb - rgb_min) / value_range
                    else:
                        rgb = np.zeros(rgb.shape, dtype=float)

                    # The figure is created only once the bands are known to be
                    # available, so no empty figure is left behind otherwise.
                    plt.figure(figsize=(10, 8))
                    plt.imshow(rgb)
                    plt.title(f"RGB Composite for {city_name}")
                    plt.axis('off')
                    plt.show()
                else:
                    print("Cannot display RGB composite: required bands not found")
    except Exception as e:
        # Best-effort demo cell: report the problem instead of aborting the notebook
        print(f"Error initializing dataset: {e}")
else:
    print("Satellite data directory or lookup file not found. Please run the satellite data download step first.")

## 5. Adding a New City to the Pipeline

Here's how to add a new city to the data pipeline:

1. Prepare the UHI data CSV (lat;long;uhi) and bounding box CSV
2. Create a directory structure: `data/CITY_NAME/`, containing `uhi_data.csv` and `bbox.csv` (plus `weather_grid.csv` for the dataloader)
3. Run the satellite data download process for the new city
4. Use the local dataloader with the new city's satellite data

In [None]:
# Example for adding a new city (kept inside a string so it is NOT executed - template for reference).
# All paths are anchored at `project_root`, like the executable cells of this notebook,
# so the template does not create or read `data/` relative to the notebook's working directory.
"""
# Step 1: Set up directory structure
new_city = "MIAMI"
data_root = project_root / "data"
os.makedirs(data_root / new_city, exist_ok=True)

# Step 2: If you have UHI GeoTIFFs, convert them to CSV
geotiff_dir = data_root / f"UHI_Surfaces_{new_city}"
if geotiff_dir.exists():
    process_uhi_directories(
        input_dirs=[str(geotiff_dir)],
        output_dir=str(data_root)
    )

# Step 3: Prepare parameters
city_bounds = [-80.32, 25.70, -80.12, 25.90]  # Example for Miami [min_lon, min_lat, max_lon, max_lat]
uhi_csv = str(data_root / new_city / "uhi_data.csv")
bbox_csv = str(data_root / new_city / "bbox.csv")
weather_csv = str(data_root / new_city / "weather_grid.csv")

# Step 4: Download satellite data
download_data_from_uhi_csv(
    city_name=new_city,
    uhi_csv=uhi_csv,
    bbox_csv=bbox_csv,
    averaging_window=30,
    output_dir=str(data_root),
    selected_bands=["B02", "B03", "B04", "B08"],
    resolution_m=10,
    include_lst=True
)

# Step 5: Initialize the dataset with local data
dataset = CityDataSet(
    bounds=city_bounds,
    averaging_window=30,
    selected_bands=["B02", "B03", "B04", "B08"],
    resolution_m=10,
    include_lst=True,
    uhi_csv=uhi_csv,
    bbox_csv=bbox_csv,
    weather_csv=weather_csv,
    data_dir=str(data_root),
    city_name=new_city
)
"""