In [None]:
# python libraries
import os
import json
import requests
import urllib.parse
from pathlib import Path
import subprocess
import tempfile
import shutil
import pprint as pp
import time
import json
import re
from zipfile import ZipFile
import random
from typing import Optional, List, Dict, Tuple, Any

# Geospatial & Data Handling
import pandas as pd
import geopandas as gpd
import duckdb
import h3
import pyarrow.parquet as pq
import pyarrow as pa
import xarray as xr # For ND-Array section
import pystac_client # For STAC section
from shapely.geometry import Point, Polygon, MultiPolygon

# Visualization
import matplotlib.pyplot as plt
import pydeck as pdk
import folium
import lonboard

# Presentation/Notebook Specific
# from IPython.display import display, Markdown, Latex
from IPython.display import display
from IPython.display import clear_output
from tqdm import tqdm
import ipywidgets as widgets
from jupyter_bbox_widget import BBoxWidget
from ipywidgets import Layout, interact

# data
import duckdb
import datahugger
import sciencebasepy
from seedir import seedir

# Import refactored utility functions
from utils.fetch_and_preprocess import (
    fetch_dataset_files, 
    filter_gdf_duplicates, 
    process_vector_geoms, 
    geom_db_consolidate_dataset,
    ddb_filter_duplicates
)
from utils.visualizations import (
    format_dataset_info,
    create_folium_cluster_map,
    create_folium_choropleth,
    create_folium_heatmap,
    create_pydeck_scatterplot,
    create_pydeck_polygons,
    create_pydeck_heatmap
)

from utils.st_context_processing import (
    add_h3_index_to_pv_labels,
    ddb_alter_table_add_h3,
    ddb_save_div_matches,
    ddb_save_subtype_geoms,
    get_duckdb_connection,
    group_pv_by_h3_cells,
    spatial_join_stac_items_with_h3,
    create_h3_stac_fetch_plan,
    fetch_overture_maps_theme,
    spatial_join_pv_overture_duckdb
)

from dotenv import load_dotenv
load_dotenv()

print("Libraries imported.")

# Leveraging Hierarchical Spatial Clustering and DGGS for Planetary-scale surveys of Photovoltaic Solar Panel Arrays

*CCOM6050: Analysis and Design of Algorithms*  
**Alejandro Vega Nogales**  
*Data Scientist @ Maxar Puerto Rico*   
*CCOM MS Student*

## Outline

1.  **Introduction & Background (Earth Observation)**
    * Earth Observation (EO) 
    * Remote Sensing (RS)
    * Geospatial Data
    * Thesis Topic 
2.  **Data & Methodology**
    * Open, Published PV Solar Panel Location Datasets
    * Cloud-Native Geospatial Stack
        - GeoParquet 
        - DuckDB
        - Overture
        - H3
    * Geospatial Data Handling & Analysis
        * STAC Collections
        * Xarray & ND-Arrays
        * Virtualization & Virtual Datasets
3.  **Core Topic: Hierarchical Spatial Clustering**
    * Relevant Algorithms & Papers
    * Application with H3 for PV Cluster Analysis
    * Minimizing STAC queries for multi-sensor and multi-temporal data
4.  **Preliminary Findings & Next Steps**
    * Testing with preliminary datasets and H3
    * Graphs and Network Algorithms?
    * Scaling to the Cloud
5.  **Conclusion**

## Earth Observation (EO): Fundamentals and Background


### What is Earth Observation (EO)?

- Gathering information about Earth's physical, chemical, and biological systems via remote sensing technologies.
- Sensors on satellites, aircraft, drones, etc.
- Key characteristics: Spatial, Temporal, and Spectral Resolution.

<figure style="text-align: right">
<img src="report/assets/figures/schmitt_et_al_fig1_geospatial_data.png" style="width:auto; height:40%;">
<figcaption align = "center"> Illustration of different RS sources, imagery types, and imaging details </figcaption>
</figure>

### Sensor Modalities

- **Optical:** Captures visible and near-infrared light (e.g., satellite imagery).
  - Panchromatic (grayscale), True Color (RGB), Multispectral (4-15 bands), Hyperspectral (100+ bands).
- **Radar (SAR):** Active sensor, penetrates clouds, measures surface properties and elevation.
- **Thermal:** Detects heat emitted/reflected.

### EO Data Complexities

- **Spatial Resolution (GSD):** Size of ground area covered by one pixel.
- **Temporal Resolution:** Time between observations of the same location.
- **Spectral Resolution:** Number and width of electromagnetic bands captured.
- **Challenges:** Clouds, atmospheric distortion, data volume, coordinate systems.

### Geospatial Data Types

- **Raster:**
  - Grid-based data (pixels).
  - Represents continuous phenomena (e.g., elevation, temperature, imagery).
  - Cell values store attribute information.
- **Vector:**
  - Coordinate-based data.
  - Represents discrete features (e.g., points, lines, polygons).
  - Examples: Roads (lines), buildings (polygons), PV panels (points/polygons).

## Proposed Thesis Topic: 

### Training on planetary-scale Datasets for Nation-scale Inventories of PV Solar Panel Datasets

## Data & Methodology

### Open, Published PV Solar Panel Locations

- Aggregated multiple open, published datasets of PV locations worldwide.
- Sources include Zenodo, Figshare, GitHub, ScienceBase.
- Variety of formats (CSV, *GeoJSON* [ideal], Shapefile, etc.).
- Goal: Create a consolidated, deduplicated dataset for analysis.


<!-- prev note for Georectified USA (Note: these correspond to PV _facilities_ rather than individual panel arrays or objects and need filtering of duplicates with other datasets and further processing to extract the PV arrays in the facility) -->

Here we list the dataset titles of publications alongside their first author, DOI links, and their number of labels:
- "Distributed solar photovoltaic array location and extent dataset for remote sensing object identification" - K. Bradbury, 2016 | [paper DOI](https://doi.org/10.1038/sdata.2016.106) | [dataset DOI](https://doi.org/10.6084/m9.figshare.3385780.v4) | polygon annotations for 19,433 PV modules in 4 cities in California, USA
- "A solar panel dataset of very high resolution satellite imagery to support the Sustainable Development Goals" - C. Clark et al, 2023 | [paper DOI](https://doi.org/10.1038/s41597-023-02539-8) | [dataset DOI](https://doi.org/10.6084/m9.figshare.22081091.v3) | 2,542 object labels (per spatial resolution)
- "A harmonised, high-coverage, open dataset of solar photovoltaic installations in the UK" - D. Stowell et al, 2020 | [paper DOI](https://doi.org/10.1038/s41597-020-00739-0) | [dataset DOI](https://zenodo.org/records/4059881) | 265,418 data points (over 255,000 are stand-alone installations, 1067 solar farms, and rest are subcomponents within solar farms)
- "An Artificial Intelligence Dataset for Solar Energy Locations in India" - A. Ortiz, 2022 | [paper DOI](https://doi.org/10.1038/s41597-022-01499-9) | [dataset link 1](https://researchlabwuopendata.blob.core.windows.net/solar-farms/solar_farms_india_2021.geojson) or [dataset link 2](https://raw.githubusercontent.com/microsoft/solar-farms-mapping/refs/heads/main/data/solar_farms_india_2021_merged_simplified.geojson) | 117 geo-referenced points of solar installations across India
- "GloSoFarID: Global multispectral dataset for Solar Farm IDentification in satellite imagery" - Z. Yang, 2024 | [paper DOI](https://doi.org/10.48550/arXiv.2404.05180) | [dataset DOI](https://github.com/yzyly1992/GloSoFarID/tree/main/data_coordinates) | 6,793 PV samples across 3 years (2019-2021) and 3 different spatial resolutions (0.5m, 1m, and 2m)
- "A global inventory of photovoltaic solar energy generating units" - L. Kruitwagen et al, 2021 | [paper DOI](https://doi.org/10.1038/s41586-021-03957-7) | [dataset DOI](https://doi.org/10.5281/zenodo.5005867) | 50,426 for training, cross-validation, and testing; 68,661 predicted polygon labels
- "Harmonised global datasets of wind and solar farm locations and power" - S. Dunnett et al, 2020 | [paper DOI](https://doi.org/10.1038/s41597-020-0469-8) | [dataset DOI](https://doi.org/10.6084/m9.figshare.11310269.v6) | 35272 PV installations

In [None]:
# Load environment variables (e.g. DATA_PATH) from a local .env file, if any.
load_dotenv()

# Fail early with an actionable message: Path(None) would otherwise raise an
# opaque "expected str, bytes or os.PathLike object" TypeError.
_data_path = os.getenv('DATA_PATH')
if _data_path is None:
    raise RuntimeError(
        "Environment variable DATA_PATH is not set; define it in the shell "
        "or in a .env file before running this notebook."
    )
DATASET_DIR = Path(_data_path)

# Read the per-dataset metadata (keyed by dataset name) from the JSON file.
# The encoding is pinned so the result does not depend on the platform locale.
with open('dataset_metadata.json', 'r', encoding='utf-8') as f:
    dataset_metadata = json.load(f)

# Default selection of datasets; commented-out entries are known datasets that
# are currently excluded and can be re-enabled by uncommenting them.
dataset_choices = [
    'global_harmonized_large_solar_farms_2020',
    # 'global_pv_inventory_sent2_2024',
    'global_pv_inventory_sent2_spot_2021',
    # 'fra_west_eur_pv_installations_2023',
    'ind_pv_solar_farms_2022',
    'usa_cali_usgs_pv_2016',
    # 'chn_med_res_pv_2024',
    # 'usa_eia_large_scale_pv_2023',
    'uk_crowdsourced_pv_2020',
    # 'deu_maxar_vhr_2023'   
]

In [None]:
# Working list of datasets to fetch. It starts as a copy of the defaults so
# that later add/remove actions do not mutate `dataset_choices` itself.
# (mostly gen by github copilot with Claude 3.7 model)
selected_datasets = list(dataset_choices)

# One accordion section per selected dataset, each holding the HTML produced
# by `format_dataset_info` for that dataset.
_accordion_sections = [
    widgets.HTML(format_dataset_info(section_name))
    for section_name in selected_datasets
]
dataset_accordion = widgets.Accordion(
    children=_accordion_sections,
    layout=Layout(width='50%', margin='0 auto'),
)

# Label every section with the name of the dataset it describes.
for section_idx, section_name in enumerate(selected_datasets):
    dataset_accordion.set_title(section_idx, section_name)

# Define a function to add or remove datasets
def manage_datasets(action, dataset=None):
    """Add or remove a dataset from the selection and refresh the accordion.

    Args:
        action: ``'add'`` appends ``dataset`` when it is not selected yet;
            ``'remove'`` drops it when it is selected. Any other value
            leaves the selection unchanged.
        dataset: Name of the dataset to add or remove. Falsy values are
            ignored.

    Returns:
        A one-line status string with the number of selected datasets.
        The original code built this string and discarded it; returning it
        is backward compatible because existing callers ignore the result.
    """
    # `selected_datasets` and `dataset_accordion` are only mutated in place
    # (never rebound), so no `global` declaration is needed.
    if dataset:
        if action == 'add' and dataset not in selected_datasets:
            selected_datasets.append(dataset)
        elif action == 'remove' and dataset in selected_datasets:
            selected_datasets.remove(dataset)

    # Rebuild the accordion so it mirrors the current selection.
    dataset_accordion.children = [
        widgets.HTML(format_dataset_info(ds)) for ds in selected_datasets
    ]
    for i, ds in enumerate(selected_datasets):
        dataset_accordion.set_title(i, ds)

    return f"Currently selected datasets: {len(selected_datasets)}"

# Dropdown listing every dataset known to the metadata file.
# NOTE(review): the margin value has five unitless components, which does not
# look like a valid CSS `margin` shorthand -- confirm the intended spacing.
_dropdown_layout = Layout(width='70%', margin='20 20 auto 20 20')
dataset_dropdown = widgets.Dropdown(
    options=list(dataset_metadata),
    description='Dataset:',
    disabled=False,
    layout=_dropdown_layout,
)

# Action buttons for editing the selection.
add_button = widgets.Button(description="Add Dataset", button_style='success')
remove_button = widgets.Button(description="Remove Dataset", button_style='danger')


def _make_selection_handler(action):
    """Build a click handler that applies `action` to the dropdown's value."""
    def _handler(_button):
        manage_datasets(action, dataset_dropdown.value)
    return _handler


# Click handlers (the names are kept so other cells can still reference them).
on_add_clicked = _make_selection_handler('add')
on_remove_clicked = _make_selection_handler('remove')

# Wire each button to its handler.
add_button.on_click(on_add_clicked)
remove_button.on_click(on_remove_clicked)

### Dataset Selection Interface
#### Use the dropdown and buttons below to customize which solar panel datasets will be fetched and processed.
- Select a dataset from the dropdown:
    - Click "Add Dataset" to include it in processing
    - Click "Remove Dataset" to exclude it
- View metadata table for each selected dataset by clicking on its row in the list

In [None]:
# Show the selection controls in a single row, followed by the accordion of
# currently selected datasets.
selection_controls = widgets.HBox(children=[dataset_dropdown, add_button, remove_button])
display(selection_controls)
display(dataset_accordion)

### Simplifying Fetching and Organizing datasets

Using:
- [datahugger](https://j535d165.github.io/datahugger/) to fetch datasets hosted in Zenodo, figshare.
- sciencebasepy for the dataset hosted in the USGS ScienceBase Catalog.
- github datasets will be fetched using ad hoc functions.
<!-- We will use osf-client to fetch datasets hosted in the Open Science Framework (OSF). -->

#### Dataset Fetching UI widget 

- You can sequentially go through the selected datasets above 
- Initiate the download with the fetch button 
- Files that are already downloaded will be skipped unless the force redownload checkbox is checked 
- You can inspect each download's stdout and stderr logs in the output area below

In [None]:
# State shared with the fetch callback: results per dataset, the size limit
# passed to the fetcher, and the position of the next dataset to fetch.
ds_trees = {}
max_mb = int(os.environ.get('MAX_LABEL_MB', 100))
dataset_index = 0

# Output area that captures whatever is printed while a dataset is fetched.
_fetch_output_layout = widgets.Layout(
    width='80%',
    border='1px solid #ddd',
    padding='10px',
    overflow='auto',
)
fetch_output = widgets.Output(layout=_fetch_output_layout)

# Apply direct CSS styling for text wrapping (Note: unvalidated)
_OUTPUT_WRAP_CSS = """
<style>
.jupyter-widgets-output-area pre {
    white-space: pre-wrap !important;       /* CSS3 */
    word-wrap: break-word !important;        /* Internet Explorer 5.5+ */
    overflow-wrap: break-word !important;
    max-width: 100%;
}
</style>
"""
display(widgets.HTML(_OUTPUT_WRAP_CSS))

# Side panel holding the controls; its children are assigned further below.
# NOTE(review): `word_wrap` may not be a recognised Layout attribute --
# confirm that it has any effect.
control_panel = widgets.VBox(
    layout=widgets.Layout(width='20%', padding='10px', overflow='auto', word_wrap='break-word')
)
fetch_button = widgets.Button(description="Fetch Next Dataset", button_style="primary")
progress_label = widgets.HTML("Waiting to start...")

# Function to fetch the next dataset
def fetch_next_dataset(button=None):
    """Fetch the dataset at ``dataset_index`` and move on to the next one.

    Intended as the click callback of ``fetch_button``. Progress text is
    written to ``progress_label`` and everything printed during the fetch is
    captured by ``fetch_output``. On success the returned tree is stored in
    ``ds_trees`` and its ``output_folder``, ``files`` and ``fs_tree`` entries
    are copied into ``dataset_metadata``. Once every selected dataset has
    been processed, the next call reports completion and disables the button.

    Args:
        button: The widget that triggered the callback; unused.
    """
    global dataset_index

    # Nothing left to fetch: report completion and disable the button.
    if dataset_index >= len(selected_datasets):
        with fetch_output:
            print("All datasets have been fetched!")
            progress_label.value = f"<b>Completed:</b> {dataset_index}/{len(selected_datasets)} datasets"
        fetch_button.disabled = True
        return

    dataset = selected_datasets[dataset_index]
    progress_label.value = (
        f"<b>Fetching:</b> {dataset_index+1}/{len(selected_datasets)}"
        f"<br><b>Current:</b> {dataset}"
    )

    with fetch_output:
        # Replace the previous dataset's log with this one.
        clear_output(wait=True)
        ds_info = dataset_metadata[dataset]
        print(f"Fetching dataset files for {dataset} using DOI/URL:\n {ds_info['doi']}")
        ds_tree = fetch_dataset_files(
            dataset,
            ds_info,
            max_mb=max_mb,
            force=force_download_checkbox.value,
        )

        if not ds_tree:
            print(f"Failed to fetch dataset {dataset}")
        else:
            ds_trees[dataset] = ds_tree
            # update metadata dict with local filesystem info
            for tree_key in ('output_folder', 'files', 'fs_tree'):
                dataset_metadata[dataset][tree_key] = ds_tree[tree_key]

    # Advance regardless of the outcome so the next click moves on.
    dataset_index += 1
    if dataset_index < len(selected_datasets):
        upcoming = selected_datasets[dataset_index]
    else:
        upcoming = 'Done'
    progress_label.value = f"<b>Completed:</b> {dataset_index}/{len(selected_datasets)}<br><b>Next:</b> {upcoming}"

# Checkbox whose value is passed as `force` to the fetcher by the callback.
force_download_checkbox = widgets.Checkbox(
    value=False,
    description='Force Download',
    tooltip='If checked, download will be forced even if files exist locally',
    layout=widgets.Layout(width='auto')
)

# Configure the button callback
fetch_button.on_click(fetch_next_dataset)

# Static summary of how many datasets were selected when this cell ran.
dataset_progress = widgets.HTML(f"Datasets selected: {len(selected_datasets)}")
# NOTE(review): `fetch_status` is created but not added to the panel below --
# confirm whether it is used elsewhere or can be dropped.
fetch_status = widgets.HTML(
    "Status: Ready to begin",
    layout=widgets.Layout(margin="10px 0")
)

# Assemble the control panel. The heading uses `text-align`, because `align`
# is not a CSS property and the original inline style therefore had no effect.
control_panel.children = [
    widgets.HTML("<h3 style='text-align:left;'>Fetch Control</h3>"),
    dataset_progress,
    force_download_checkbox,
    widgets.HTML("<hr style='margin:10px 0'>"),
    progress_label,
    fetch_button
]

In [None]:
# Extra CSS applied to the widget classes used by the fetch UI.
# NOTE(review): `left` does not look like a valid `align-items` value, so
# that rule may be ignored by the browser -- confirm the intended alignment.
_FETCH_UI_CSS = """
<style>
.widget-html {
    text-align: left !important;
}
.widget-checkbox {
    align-items: left !important;
}
.widget-button {
    width: 100% !important;
}
</style>
"""
display(widgets.HTML(_FETCH_UI_CSS))

# Output log and control panel, side by side.
fetch_ui = widgets.HBox(children=[fetch_output, control_panel])
display(fetch_ui)

# Initial status text; with an empty selection there is nothing to fetch, so
# the button is disabled.
if not selected_datasets:
    progress_label.value = "<b>No datasets selected</b>"
    fetch_button.disabled = True
else:
    progress_label.value = f"<b>Ready to start:</b><br>First dataset: {selected_datasets[0]}"

In [None]:
# Keep only the metadata entries of the currently selected datasets.
selected_metadata = {ds: dataset_metadata[ds] for ds in selected_datasets}


def get_ds_files(ds):
    """Return the file list stored for dataset `ds` in `dataset_metadata`."""
    return dataset_metadata[ds]['files']


def get_ds_dir(ds):
    """Return the output folder stored for dataset `ds` in `dataset_metadata`."""
    return dataset_metadata[ds]['output_folder']


def is_ds_ftype(ds, fname):
    """Return True when `fname` ends with the dataset's `label_fmt` extension."""
    return fname.endswith(f".{dataset_metadata[ds]['label_fmt']}")


def get_full_ds_path(ds):
    """Return the raw-labels folder of dataset `ds` under `DATASET_DIR`."""
    return DATASET_DIR / 'raw' / 'labels' / ds


fra_ds_folder = 'replication'

# Folder that contains one sub-folder per dataset.
labels_root = DATASET_DIR / 'raw' / 'labels'

# TODO: refactor this to a function as it'll quickly get out of hand with more datasets and pruning required
# Start from every file of each dataset that has the dataset's label format.
selected_ds_files = {ds: [f for f in get_ds_files(ds) if is_ds_ftype(ds, f)] for ds in selected_datasets}

# The ad hoc filters below are guarded by membership tests because the
# selection is editable in the UI: a dataset that was removed must not raise
# a KeyError here.

# ad hoc selection of files for testing (keep files that contain 'solar' and 'WGS84' in filename)
harmonized_ds = 'global_harmonized_large_solar_farms_2020'
if harmonized_ds in selected_ds_files:
    selected_ds_files[harmonized_ds] = [
        f for f in selected_ds_files[harmonized_ds]
        if 'solar' in f.split('/')[-1] and 'WGS84' in f and not os.path.isdir(f)
    ]

# prediction dataset was human verified thoroughly and meant for downstream applications; only use this file for now
inventory_ds = 'global_pv_inventory_sent2_spot_2021'
if inventory_ds in selected_ds_files:
    selected_ds_files[inventory_ds] = [
        f for f in selected_ds_files[inventory_ds]
        if 'predicted' in os.path.basename(f)
    ]
    # Report under the dataset that was actually filtered; the original
    # message used selected_datasets[0], which is a different dataset.
    print(f"Selected {len(selected_ds_files[inventory_ds])} files for {inventory_ds}:\n{selected_ds_files[inventory_ds]}")

# only include files that were not filtered out
include_files = [os.path.basename(f) for ds in selected_datasets for f in selected_ds_files[ds]]
# don't print out unused directories (os.listdir already yields bare names)
exclude_folders = [entry for entry in os.listdir(labels_root) if entry not in selected_datasets]

# build and output tree for selected datasets
selected_ds_dirs = [get_ds_dir(ds) for ds in selected_datasets]
print("All selected datasets have been fetched with the following file tree:\n")
# TODO: fix unwanted dirs in the tree
selected_ds_tree = seedir(labels_root, depthlimit=10, printout=True, regex=False, include_files=include_files, exclude_folders=exclude_folders)

In [None]:
# Put the output widgets into a tabbed view, one tab per selected dataset.
# NOTE(review): `out_widgets` is not defined in the visible part of the
# notebook -- confirm that an earlier cell creates it.
tabs = widgets.Tab(children=out_widgets)
for tab_pos, ds_name in enumerate(selected_datasets):
    # Tab label: the first three underscore-separated tokens of the name.
    name_tokens = ds_name.split('_')
    tabs.set_title(tab_pos, '_'.join(name_tokens[:3]))
display(tabs)

## Cloud-Native Geospatial Stack

- Focus on tools optimized for scalable cloud environments.
- **Goal:** Process and analyze large geospatial datasets efficiently, leveraging cloud storage and compute.

### GeoParquet: Cloud-Optimized Vector Data
<div style="max-width: 80%; margin: 0 auto; padding-left: 1em; padding-right: 1em; text-align: justify;">
<h4 style="text-align: left">GeoParquet: Intro</h4>

<p>GeoParquet is <a href="https://geoparquet.org/">an incubating Open Geospatial Consortium (OGC) standard</a> that simply adds compatible geospatial <a href="https://docs.safe.com/fme/html/FME-Form-Documentation/FME-ReadersWriters/geoparquet/Geometry-Support.htm">geometry types</a> (Point, Line, Polygon, etc) to the mature and widely adopted <a href="https://parquet.apache.org/">Apache Parquet format</a>, a popular columnar storage file format commonly used in big data processing and modern data engineering pipelines and analytics. This is analogous to how the GeoTIFF raster format adds geospatial metadata to the longstanding TIFF standard. GeoParquet is designed to be a simple and efficient way to store geospatial <em>vector</em> data in a columnar format, and is designed to be compatible with existing Parquet tools and libraries to enable Cloud <em>Data Warehouse</em> Interoperability.</p>

<figure style="text-align: center">
<img src="https://miro.medium.com/v2/resize:fit:1400/1*QEQJjtnDb3JQ2xqhzARZZw.png" style="width:70%; height:auto;">
<figcaption align = "center"> Visualization of the layout of a Parquet file </figcaption>
</figure>

<div style="max-width: 80%; margin: 0 auto; padding-left: 1em; padding-right: 1em; text-align: justify;">
<h4 style="text-align: left">GeoParquet: Internal Layout</h4>

<p>These files are organized in a set of file chunks called "row groups". Row groups are logical groups of columns with the same number of rows. Each of these columns is actually a "column chunk" which is a contiguous block of data for that column. The schema across row groups must be consistent, i.e. the data types and number of columns must be the same for every row group. The new geospatial standard adds some relevant additional metadata such as the geometry's Coordinate Reference System (CRS), additional metadata for geometry columns, and <a href="https://medium.com/radiant-earth-insights/geoparquet-1-1-coming-soon-9b72c900fbf2">support for spatial indexing in v1.1</a>.
</div>

<figure style="text-align: center">
<img src="https://guide.cloudnativegeo.org/images/geoparquet_layout.png" style="width:40%; height:auto;">
<figcaption align = "center"> GeoParquet has the same layout with additional metadata </figcaption>
</figure>

<!-- GeoParquet is only the latest in a long line of cloud-native file formats  -->

<div style="max-width: 77%; margin: 0 auto; padding-left: 1em; padding-right: 1em; text-align: justify;">
<h4 style="text-align: left">GeoParquet: Features & Performance</h4>


- Efficient storage and compression: 
    - Internally compressed by default, and can be configured to optimize decompression (time) or storage size (space)
    - columnar format is more efficient for filtering on columns which is common in analytical workloads and results in better compression ratios vs row-based formats
- Scalability and Efficient data access:
    - Spatial indexing, spatial partitioning, and other optimizations enable
        - spatial joins and containment operations like intersection, within, overlaps, etc (ST_*)
        - [spatial predicate pushdowns](https://medium.com/radiant-earth-insights/geoparquet-1-1-coming-soon-9b72c900fbf2)
            - can significantly speed up spatial queries over the network by **applying filters at the storage level**
            - greatly reducing data movement if applied correctly
- Optimized for *read-heavy workflows*: 
    - Parquet itself is an immutable file format, which means taking advantage of cheap reads, and efficient filtering and grouping
    - Popular choice for storing large datasets using *modern cloud-centric DBMS architectures* like data lakes and data warehouses.
    - Designed for analytical workloads that require fast reads and complex queries (but not transactions and frequent updates)
        - ideal for OLAP (Online Analytical Processing) and BI (Business Intelligence) workloads
        - these revolve around historical and aggregated data that don't require high-frequency updates
- Cloud-native format: Optimized for object storage (s3, gcs, abfs, etc.)
    - **designed to be highly compressed**, which reduces storage and data transfer costs and improves RW performance
    - integrates into existing ecosystem of cloud data pipelines and workflows that have been built around the parquet format
    - Broad and fast adoption across the data engineering and geospatial ecosystems

### DuckDB: In-Process SQL OLAP RDBMS

From their ["Why DuckDB?" page](https://duckdb.org/why_duckdb.html):

DuckDB is an **in-process analytical data management system (OLAP RDBMS)**. Unlike traditional client-server databases (like PostgreSQL or MySQL), DuckDB runs directly within the host process (e.g., our Python script or Jupyter kernel), similar to SQLite. However, unlike SQLite which is optimized for transactional workloads (OLTP), DuckDB is specifically designed for **analytical queries (OLAP)** involving complex, long-running queries over potentially huge datasets, typical in big data analytics and scientific computing workflows.

Key benefits for our workflow include:
-   **Simplicity & Portability:** Easy installation (`pip install duckdb`) and no external dependencies or database server management required. Databases are stored as single, portable files (`.duckdb`), making them easy to manage, share, and archive.
-   **Direct Data Access:** Can directly query various file formats, including the **Parquet and GeoParquet files** we are generating and (geo)pandas DataFrames(!), without needing a separate, time-consuming ingestion/copy step. This is highly efficient for consolidating data from multiple files, and remote sources (e.g., S3, GCS).
-   **Powerful SQL:** Offers a rich, modern SQL dialect, including window functions, complex joins, and support for common table expressions (CTEs), allowing sophisticated data manipulation and analysis directly in SQL.
-   **Geospatial Capabilities:** Crucially, DuckDB has a **`spatial` extension** that provides functions for handling and querying geospatial data types (like points, lines, and polygons) using libraries like GEOS. This enables operations such as spatial joins (e.g., `ST_Intersects`, `ST_Contains`), area calculations (`ST_Area`), centroid computation (`ST_Centroid`), and reading/writing WKT/WKB formats directly within the database. This is essential for our tasks like deduplication and integrating PV labels with contextual layers like Overture Maps.
-   **Performance:** Its **column-vectorized query execution engine** is optimized for analytical performance, often *significantly faster than row-based systems* and more optimized than *pure Python/Pandas operations* for large datasets that may not fit into memory. 
-   **Python Integration:** Seamlessly integrates with Python libraries like Pandas and GeoPandas through its client API and tools like `jupysql`, allowing easy data exchange between dataframes and the database directly from our notebooks! 

In this notebook, we use DuckDB to:
1.  Efficiently consolidate multiple GeoParquet files (one per source dataset) into a single database table using its ability to read Parquet directly.
2.  Leverage its `spatial` extension for geospatial indexing, filtering, and performing spatial joins with the [Overture Maps divisions](#Overture-Maps-Divisions) data based on [H3 indices](#H3-Geospatial-Indexing-System-and-Spatial-Clustering).
3.  Provide a persistent, queryable, and portable database (`.duckdb` file) containing the cleaned, consolidated, and spatially enriched PV label data.


### Overture Maps: Adding Geospatial Context

<!-- From their [Division theme guide](https://docs.overturemaps.org/guides/divisions/) and their [brief blog on the history of the project](https://overturemaps.org/blog/2025/overture-maps-foundation-making-open-data-the-winning-choice/): -->

Overture Maps is a collaborative project that aims to create high-quality, open map datasets for the entire world:
    - The project is a collaboration between several organizations, including Meta, Amazon Web Services (AWS), and Microsoft. 
    - Overture distributes its open datasets as GeoParquet files, and can be accessed through CLI, API or downloaded directly from [their S3](https://docs.overturemaps.org/guides/divisions/#data-access-and-retrieval) buckets

The Overture divisions theme: 
- has three feature types (division, **division_area**, and division_boundary) and contains more than 5.45 million point, line, and polygon representations of human settlements, such as countries, regions, states, cities, and even neighborhoods. 
- is derived from a conflation of OpenStreetMap data and geoBoundaries data
- **Used here as contextual layers** (e.g. dividing our data by continent, country, etc) to enrich PV data.
- their `division_area` subset provides a **hierarchical structure of administrative boundaries**, including countries, states, and cities.

<figure style="text-align: center">
<img src="https://docs.overturemaps.org/assets/images/divisions-admin0-admin1-coverage-ff1a8d4c6d68c88047b34d1f9c9109be.png" style="width:65%; height:auto;">
<figcaption align = "center"> Overture divisions data, styled by subtype: countries in purple, region boundaries as green lines. </figcaption>
</figure>

### H3: Geospatial Indexing System and Spatial Clustering

<!-- From their [home page](https://h3geo.org/), [announcement blog](https://www.uber.com/blog/h3/), and [overview page](https://h3geo.org/docs/core-library/overview/): -->

The H3 geospatial indexing system is a discrete global grid system developed at Uber. It was designed for **indexing geographies via multi-precision hexagonal tiling into a hexagonal grid with hierarchical indexes**. Geospatial coordinates can be indexed to *cell IDs* at different levels of resolution (0-15) that each represent a unique cell in the grid at each resolution. The hexagonal grid system is designed to be **hierarchical**, meaning that each cell at a given resolution can be subdivided into smaller cells at higher resolutions, allowing for efficient spatial queries and analysis. The hexagonal grid system is created on the planar faces of [a sphere-circumscribed icosahedron](https://www.researchgate.net/figure/a-Icosahedron-faces-projected-onto-a-sphere-b-Triangular-tessellation-c-Diamond_fig1_339813234) and the grid cells are then projected to the surface of the sphere...
<!-- It is common to use WGS84/EPSG:4326 CRS data with the H3 library. -->

<figure style="text-align: center">
<img src="https://blog.uber-cdn.com/cdn-cgi/image/width=2160,quality=100,onerror=redirect,format=auto/wp-content/uploads/2018/06/Twitter-H3.png" style="width:75%; height:50%;">
<figcaption align = "center"> H3 enables users to partition the globe into hexagons for more accurate analysis. </figcaption>
</figure>

- **Benefits for Spatial Analysis:**
  - *Fast Joins & Aggregation:* Quickly combine data across datasets based on cell ID.
  - *Efficient Neighborhood Queries:* Hexagons have uniform adjacency and H3 provides a built-in Grid Traversal API with distance metrics.
  - *Hierarchical Structure:* Easy aggregation/disaggregation across resolutions (parent/child cells).
  - *Optimized Grid Traversal:* Useful for spatial algorithms.
  - **Foundation for spatial indexing and clustering in this work.**

In [None]:
from IPython.display import IFrame

# Embed the H3 viewer page inside the notebook. The IFrame object is the
# cell's final expression, so the notebook renders it as the cell output.
IFrame(
    src="https://h3.chotard.com",
    width=1080,
    height=540,
)

# Cloud-Native Geospatial Stack: Scalability and Efficiency

## Querying and Searching STAC Collections

- **S**patio**T**emporal **A**sset **C**atalog (STAC).
- Standardized specification for describing geospatial information.
- Enables searching and discovery of EO data (imagery, etc.) across different providers.
- Key concepts: Catalogs, Collections, Items, Assets.
- Libraries like `pystac-client` facilitate programmatic searching based on spatial (bbox, geometry) and temporal criteria.
    - STAC API supports CQL (Common Query Language) for complex queries over catalog fields

### Xarray and ND-arrays in Scientific Computing

- Xarray introduces labels (dimensions, coordinates, attributes) to multi-dimensional arrays (like NumPy's ndarray).
- **Benefits for Geospatial/EO Data:**
  - Handles complex data like satellite image time series (e.g., dimensions: time, band, y, x).
  - Facilitates operations like alignment, indexing, and aggregation based on labels (e.g., time series analysis, band math).
  - Integrates well with Dask for parallel computing on large datasets.
- Common in climate science, oceanography, and remote sensing.

### Importance of Virtualization for Cloud Advances

- **Foundation of Cloud Computing:** Allows abstraction of physical hardware (networks, storage, compute, you name it!) into virtual, ephemeral resources.
- **Resource Pooling & Elasticity:** Enables efficient sharing and dynamic allocation/scaling of compute, storage, and network resources centralized in data centers distributed across the globe.
- **Separation of Concerns:** Decouples applications from underlying infrastructure, allowing developers to focus on building applications without worrying about hardware management and reproducibility across environments.
- **Cost Efficiency:** Pay-as-you-go model for resources, reducing upfront costs and allowing for scaling based on demand.
    - **Lower Upfront Costs:** The pay-as-you-go model reduces upfront capital expenses.
        - Cloud providers offer flexible pricing models (on-demand, reserved, spot instances).
        - Be aware of potential vendor lock-in and hidden costs that can impact long-term economics.
    - **Computational Scale:** Access to dirt-cheap storage and massive computing resources on demand without infrastructure management overhead.
- **Enabler for:**
  - Infrastructure as a Service (IaaS), Platform as a Service (PaaS), Software as a Service (SaaS).
  - Modern data architectures (Data Lakes, Lakehouses)
  - **Serverless computing**
    - particularly relevant for data processing and analysis
    - easier to scale up and to run only for as long as an analysis needs it
        - e.g. no need to keep a high-availability cluster with replicas and failover running 24/7

### Rise of Virtual Datasets in EO 

(Kerchunk, VirtualiZarr, Icechunk)

- **Concept:** Datasets defined by *references* to data assets stored elsewhere (often cloud object storage), rather than containing the data itself.
    - Think of pointers or references, but for terabytes of scientific data.
    - These formats create lightweight indexes that map to specific byte ranges in cloud-stored files.
- **Motivation:** Avoid data duplication and large data transfers; analyze data *in place*.
- **Benefits:**
    - Avoids data duplication and large data transfers.
    - Enables analysis of data *in place*.
    - Facilitates sharing and collaboration without transferring large datasets.
- **Impact:** Enables efficient analysis of massive planetary-scale archives (e.g., climate models, satellite imagery) directly from cloud storage.
- **Examples:**
  - **Kerchunk:** Creates index files mapping logical chunks (e.g., in NetCDF/HDF5) to byte ranges in cloud storage. Allows libraries like Xarray to read cloud data as if it were a single local file.
  - **VirtualiZarr:** Similar concepts for creating virtual Zarr datasets via Kerchunk references.
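  - **Icechunk:** An open-source transactional storage engine for Zarr data on cloud object storage. It adds versioned snapshots and can store virtual references to existing files, enabling efficient reading of large datasets without downloading them entirely.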

## Hierarchical Spatial Clustering Algorithms
- **Hierarchical Clustering:** Groups data points into a hierarchy of clusters.
- **Spatial Clustering:** Groups data points based on their spatial proximity.
- **Hierarchical Spatial Clustering:** Combines both concepts, creating a hierarchy of clusters based on spatial relationships.
- **Benefits:**
  - Captures multi-scale spatial patterns.
  - Provides a hierarchical structure for data exploration and analysis.
  - Useful for large datasets with varying spatial resolutions.

### Research References
- "Fast Parallel Algorithms for Euclidean Minimum Spanning Tree and Hierarchical Spatial Clustering"
- "Optimal Parallel Algorithms for Dendrogram Computation and Single-Linkage Clustering" (resolves and parallelizes sequential bottleneck in algorithm above)
- "PANDORA: A Parallel Dendrogram Construction Algorithm for Single Linkage Clustering on GPU" (same as above but for GPUs)

## Application with H3 for PV Cluster Analysis

- **Goal:** Identify optimal spatial clusters of PV solar panel labels.
- **Approach:**
  - Use Uber's H3 DGGS.
  - Index PV label locations (centroids or polygons) into H3 cells at an appropriate resolution.
  - Treat H3 cells containing PV labels as the nodes/vertices in the clustering algorithms.
- **Leveraging H3 Features:**
  - **Proximity:** Efficiently find neighboring cells (`grid_disk`, named `k_ring` before H3 v4).
  - **Hierarchy:** Quickly compute parent/child cells for potential multi-resolution clustering or aggregation.
  - **Traversal:** Efficient grid traversal algorithms can be adapted.
- **Hypothesis:** H3 provides a performant spatial indexing foundation for implementing and scaling these hierarchical clustering algorithms, especially for distributed/parallel computation (relevant to thesis).

## Demo of Preliminary Work

### Setup and data overview

In [None]:
# Setup cell for the demo section.
# NOTE: most of these modules are already imported by the notebook's first
# cell; importing an already-loaded module again is harmless. Presumably they
# are repeated so the demo can be run without the first cell -- confirm.

# Core Libraries
import os
import json
from pathlib import Path

# Geospatial & Data Handling
import pandas as pd
import geopandas as gpd
import duckdb
import h3
import pyarrow.parquet as pq
import pyarrow as pa
import xarray as xr # For ND-Array section
import pystac_client # For STAC section
from shapely.geometry import Point, Polygon, MultiPolygon

# Visualization
import matplotlib.pyplot as plt
import pydeck as pdk
import folium
import lonboard

# Presentation/Notebook Specific
# Unlike the first cell (which imports only `display`), this also brings in
# the Markdown and Latex display helpers.
from IPython.display import display, Markdown, Latex

# Optional: Load environment variables if needed (e.g., API keys, paths)
# Left disabled here; the notebook's first cell already calls load_dotenv().
# from dotenv import load_dotenv
# load_dotenv()

# Status message confirming that the cell ran to completion.
print("Libraries imported.")

# ¡Gracias! ¿Preguntas?