# EDA of PV Datasets, Visualization, and Joining with Spatial Contexts



### References:
- [DuckLake Documentation](https://ducklake.select/docs/stable/)
- [DuckLake with Ibis Python DataFrames](https://emilsadek.com/blog/ducklake-ibis/)
- [A new data lakehouse with DuckLake and dbt](https://giacomo.coletto.io/blog/ducklake/)

In [1]:
import os
import shutil
import random
import pickle
from pathlib import Path
from collections import Counter

In [2]:
if 'google.colab' in str(get_ipython()):
    print('Running on CoLab')
    from google.colab import drive
    from google.colab import userdata
    import secrets

    # fetch requirements from cloning repo because GDrive encodes as .gdoc reference and Windows keeps encoding as .docx
    !git clone "https://github.com/avega17/Ice-mELT_DuckLake.git" 
    !ls -a
    # note that segment-geospatial[samgeo2] takes quite a while to download all deps and build on colab 2-core cpu
    !pip install -r Ice-mELT_DuckLake/requirements.txt

In [3]:

import ibis
from ibis import _
import ibis.selectors as s
import duckdb
import pandas as pd
# from huggingface_hub import HfFileSystem, login

import geopandas as gpd
import shapely
from shapely import wkt

from tqdm import tqdm
from dotenv import load_dotenv
from pprint import pprint

In [4]:
ibis.options.interactive = True
ibis.options.graphviz_repr=True
random.seed(24765131)

load_dotenv()

# assume we're using prod catalog, but default to local/dev if env var not set
local_default = os.getenv('DUCKLAKE_CONNECTION_STRING_DEV')
DUCKLAKE_CATALOG = os.getenv('DUCKLAKE_CONNECTION_STRING_PROD', local_default)
DUCKLAKE_ATTACH = os.getenv("DUCKLAKE_ATTACH_PROD")
DUCKLAKE_NAME = os.getenv("DUCKLAKE_NAME")
DUCKLAKE_DATA_PATH = os.getenv("DUCKLAKE_DATA_PATH")

# pretty print our connection string info
# TODO: comment out and remove output before commit
print(f"Using DuckLake catalog type: {DUCKLAKE_CATALOG.split(':')[1]}" )
catalog_creds = DUCKLAKE_CATALOG.split(':')[2].strip('()').split(' ')
# skip DATA_PATH at end
for cred in catalog_creds[:-2]:
    key, val = cred.split('=')
    # print(f"  {key}: {val}")
print(f"  DATA_PATH: {DUCKLAKE_CATALOG.split('DATA_PATH ')[1][:-1]}")

Using DuckLake catalog type: postgres
  DATA_PATH: 's3://eo-pv-lakehouse/ducklake_data'


### Connect to our data lake catalog with ibis



In [5]:
# con.load_extension("ducklake")
attach_catalog_sql = f"""ATTACH IF NOT EXISTS '{DUCKLAKE_ATTACH}' AS {DUCKLAKE_NAME}
    (DATA_PATH '{DUCKLAKE_DATA_PATH}');
USE {DUCKLAKE_NAME};
"""

duckdb_config = {
    'threads': 6,
    'memory_limit': '12GB',
    's3_access_key_id': os.getenv('R2_ACCESS_KEY_ID'),
    's3_secret_access_key': os.getenv('R2_SECRET_KEY'),
    's3_endpoint': os.getenv('R2_S3_ENDPOINT'),
    's3_use_ssl': 'true',
    's3_url_style': 'path'
}

extensions_query = """
    INSTALL httpfs;
    LOAD httpfs;
    INSTALL ducklake;
    LOAD ducklake;
    INSTALL spatial;
    LOAD spatial;
    INSTALL h3 FROM community;
    LOAD h3;
"""

# first connect directly via duckdb to described views in DuckLake catalog
con = duckdb.connect(database=':memory:', config=duckdb_config)
con.execute(attach_catalog_sql)
con.execute(extensions_query)

<_duckdb.DuckDBPyConnection at 0x1089ec4f0>

In [6]:
con.execute("show tables;").fetchall()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[('pv_h3_cells',),
 ('pv_h3_grid',),
 ('raw_chn_med_res_pv_2024',),
 ('raw_global_harmonized_large_solar_farms_2020',),
 ('raw_global_pv_inventory_sent2_spot_2021',),
 ('raw_ind_pv_solar_farms_2022',),
 ('raw_uk_crowdsourced_pv_2020',),
 ('raw_usa_cali_usgs_pv_2016',),
 ('stg_chn_med_res_pv_2024',),
 ('stg_global_harmonized_large_solar_farms_2020',),
 ('stg_global_pv_inventory_sent2_spot_2021',),
 ('stg_ind_pv_solar_farms_2022',),
 ('stg_pv_consolidated',),
 ('stg_uk_crowdsourced_pv_2020',),
 ('stg_usa_cali_usgs_pv_2016',)]

In [7]:
# see here for how to output view definition: https://duckdb.org/docs/stable/sql/statements/create_view
view_query = con.execute("""select sql FROM duckdb_views()
        where view_name = 'raw_global_pv_inventory_sent2_spot_2021';""").fetchall()[0][0]
pprint(view_query)

('CREATE VIEW raw_global_pv_inventory_sent2_spot_2021 AS WITH doi_manifest_raw '
 'AS (SELECT * FROM '
 "read_json_objects_auto('r2://eo-pv-lakehouse/pv_metadata/doi_manifest.json')), "
 'doi_manifest_expanded AS (SELECT je."key" AS dataset_name, je."value" AS '
 'metadata_json FROM doi_manifest_raw AS dmr , json_each(dmr."json") AS je), '
 'dataset_metadata AS (SELECT dataset_name, json_extract_string(metadata_json, '
 "'$.doi') AS doi, json_extract_string(metadata_json, '$.repo') AS repo, "
 "json_extract_string(metadata_json, '$.paper_doi') AS paper_doi, "
 "json_extract_string(metadata_json, '$.paper_title') AS paper_title FROM "
 'doi_manifest_expanded WHERE (dataset_name = '
 "'global_pv_inventory_sent2_spot_2021'))SELECT * EXCLUDE (geometry), "
 'st_astext(st_geomfromwkb(geometry)) AS geometry_wkt, '
 'st_aswkb(st_geomfromwkb(geometry)) AS geometry_wkb, dm.doi, dm.repo, '
 'dm.paper_doi, dm.paper_title, CURRENT_TIMESTAMP AS dbt_loaded_at FROM '
 "read_parquet('s3://eo-pv-lakehou

In [8]:
# display the underlying data files that makes up one of the stg tables
con.execute("FROM ducklake_list_files('catalog', 'stg_uk_crowdsourced_pv_2020');")

BinderException: Binder Error: Failed to find attached database "catalog"

LINE 1: FROM ducklake_list_files('catalog', 'stg_uk_crowdsourced_pv_2020...
             ^

In [None]:
# close duckdb connection so we can use ibis to connect
con.close()

In [None]:
# use in-memory/ephemeral db
con = ibis.duckdb.connect(
    extensions=["ducklake", "spatial", "h3", "httpfs"],
    **duckdb_config
    )

In [None]:

con.raw_sql(attach_catalog_sql)
# add community cache_httpfs extension; this causes an "http_init already loaded" error only when loading as part of ibis extensions arg
con.raw_sql("INSTALL cache_httpfs FROM community; LOAD cache_httpfs;")
if 'google.colab' in str(get_ipython()): # only in colab, the extension doesn't seem to load with ibis's extensions arg
    con.raw_sql("INSTALL h3 FROM community; LOAD h3;")
con.list_catalogs()

In [None]:
con.list_tables()

In [None]:
stg_pv = con.table("stg_pv_consolidated")
# exclude the uk dataset as it seems to have no intersects and severely distorts our matching %
# stg_pv = stg_pv.filter(_.dataset_name != 'uk_crowdsourced_pv_2020')

@ibis.udf.scalar.builtin
# signature returns GEOMETRY; use ibis geometry datatype
def ST_GeomFromText(wkt: str) -> ibis.expr.datatypes.GeoSpatial:
    '''Convert WKT to geometry'''
stg_pv = stg_pv.mutate(geom=ST_GeomFromText(_.geometry))

full_pv_dataset = stg_pv.to_pandas()
full_pv_dataset =  gpd.GeoDataFrame(full_pv_dataset, geometry=full_pv_dataset['geometry'].apply(shapely.wkt.loads), crs='EPSG:4326')
full_pv_dataset.describe()
# save to geoparquet
full_pv_dataset.to_parquet('full_pv_dataset.parquet', geometry_encoding='wkb', write_covering_bbox=True)



# sample
stg_pv.sample(0.01)

In [None]:
stg_pv.count()

In [None]:
from ibis import _

# test h3 functionality: https://github.com/isaacbrodsky/h3-duckdb?tab=readme-ov-file#full-list-of-functions
@ibis.udf.scalar.builtin
def h3_latlng_to_cell_string(lat: float, lng: float, resolution: int) -> str:
    '''Convert latitude/longitude coordinate to cell ID'''

# and ibis udf that enables using backend's (duckdb) functions including extensions: https://ibis-project.org/reference/scalar-udfs#ibis.expr.operations.udf.scalar.builtin
h3_test = stg_pv.sample(0.01).select("unified_id", "centroid_lat", "centroid_lon").\
    mutate(h3_cell=h3_latlng_to_cell_string(_.centroid_lat, _.centroid_lon, 8)).head(10)
display(h3_test)


In [None]:
from ibis.expr.visualize import to_graph


to_graph(con.table("stg_pv_consolidated").sample(0.01).select("unified_id", "centroid_lat", "centroid_lon").\
    mutate(h3_cell=h3_latlng_to_cell_string(_.centroid_lat, _.centroid_lon, 8)).head(10))

In [None]:
stg_pv.aggregate(by=["h3_index_8"], pv_count=_.unified_id.count(), pv_area=_.area_m2.sum()).order_by(ibis.desc("pv_count")).head(20)

### Step 1: Convert PV Labels to GeoPandas DataFrame

In [None]:
# Fetched PV labels from DuckLake using ibis
# Convert to pandas first; shuffle the rows (revise as dataset grows)
pv_df = stg_pv.to_pandas().sample(frac=1)

# Select a sample for MVP (e.g., 150-200 labels)
# pv_sample = pv_df.sample(n=200, random_state=42)
pv_sample = pv_df

# Create GeoPandas dataframe and convert WKT geometry strings to shapely geometries
pv_gdf = gpd.GeoDataFrame(pv_sample, geometry=pv_sample['geometry'].apply(shapely.wkt.loads), crs='EPSG:4326')

print(f"Loaded {len(pv_gdf)} PV labels into GeoPandas")
print(f"Columns: {pv_gdf.columns.tolist()}")
pv_gdf.sample(10)

In [None]:
# count how many Point vs Polygon labels we have
point_polygon_counts = pv_gdf['geometry'].geom_type.value_counts()
print("Geometry Type Counts:")
print(point_polygon_counts)