In [1]:
import os
import sys

# Location of the h3-indexer `src` directory. Export H3_INDEXER_SRC before
# starting the kernel to point at a different checkout; the fallback is the
# original developer path, so existing setups behave exactly as before.
H3_INDEXER_SRC = os.environ.get(
    "H3_INDEXER_SRC", "/home/madiejf/github-os/h3-indexer/src"
)

# Put the checkout at the front of sys.path so later imports can resolve
# against it. Skip the insert when it is already first so that re-running this
# cell does not keep prepending duplicate entries.
if not sys.path or sys.path[0] != H3_INDEXER_SRC:
    sys.path.insert(0, H3_INDEXER_SRC)

### Import the package and set environment variables

In [2]:
import h3_indexer
from dotenv import load_dotenv
# Load variables from a .env file into the process environment (python-dotenv).
# This is the cell's last expression, so its return value is what the cell displays.
load_dotenv()

True

### Create a Job Config

Read a Job Config from a JSON input.

This job config provides a single input: a Parquet dataset in S3. The dataset contains line geometries, with a unique ID column called "route_id" and a geometry column called "geom_geojson". The attribute to be resolved to H3 cells at resolution 4 is called "dummy_emissions".

In [3]:
import json

from h3_indexer.utils.config import read_json_config, read_yaml_config

# Raw JSON text of the job definition: a single vector input named "routes"
# whose "dummy_emissions" column is to be resolved at H3 resolution 4.
job_config_json = """{
  "name": "test",
  "version": "1.0.0",
  "h3_resolution": 4,
  "output_s3_path": "wws-air-quality",
  "inputs": {
    "routes": {
      "type": "vector",
      "s3_path": "wws-air-quality/testing/routes_03012025_tx/",
      "unique_id": "route_id",
      "geometry_type": "LINE",
      "geometry_column_name": "geom_geojson",
      "method": "PCT_LENGTH",
      "input_columns": [
        "dummy_emissions"
      ]
    }
  }
}"""

# Decode the JSON text into a plain dict first, then hand that dict to the
# project's config reader.
job_config_dict = json.loads(job_config_json)
job_config = read_json_config(job_config_dict)

In [4]:
from h3_indexer.spark.spark import get_spark_session
# Create the Spark session, passing the H3 resolution from the job config.
# The helper returns two values; only the first (bound to `spark`) is used,
# the second is discarded.
spark, _ = get_spark_session(job_config.h3_resolution)

SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/local/home/madiejf/h3-indexer-env/aws-glue-libs/jarsv1/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/local/home/madiejf/h3-indexer-env/spark-3.3.0-amzn-1-bin-3.3.3-amzn-0/spark/jars/log4j-slf4j-impl-2.17.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/local/home/madiejf/h3-indexer-env/spark-3.3.0-amzn-1-bin-3.3.3-amzn-0/spark/jars/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Reload4jLoggerFactory]
https://artifacts.unidata.ucar.edu/repository/unidata-all added as a remote repository with the name: repo-1
Ivy Default Cache set to: /home/madiejf/.ivy2/cache
The jars for the packages stored in: /home/madiejf/.ivy2/jars
org.apache.sedona#sedona-spark-shaded-3.3_2.

:: loading settings :: url = jar:file:/local/home/madiejf/h3-indexer-env/aws-glue-libs/jarsv1/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found org.apache.sedona#sedona-spark-shaded-3.3_2.12;1.7.1 in central
	found org.datasyslab#geotools-wrapper;1.7.1-28.5 in central
	found org.apache.hadoop#hadoop-aws;3.4.1 in central
	found software.amazon.awssdk#bundle;2.24.6 in central
	found org.wildfly.openssl#wildfly-openssl;1.1.3.Final in central
:: resolution report :: resolve 229ms :: artifacts dl 7ms
	:: modules in use:
	org.apache.hadoop#hadoop-aws;3.4.1 from central in [default]
	org.apache.sedona#sedona-spark-shaded-3.3_2.12;1.7.1 from central in [default]
	org.datasyslab#geotools-wrapper;1.7.1-28.5 from central in [default]
	org.wildfly.openssl#wildfly-openssl;1.1.3.Final from central in [default]
	software.amazon.awssdk#bundle;2.24.6 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------

### Preview the Routes data

In [5]:

# Same bucket/prefix as the "routes" input in the job config, addressed
# through the s3a:// scheme.
s3_path = "s3a://wws-air-quality/testing/routes_03012025_tx/"

# Read the Parquet dataset into a PySpark DataFrame.
parquet_reader = spark.read
routes_df = parquet_reader.parquet(s3_path)

# Report the schema, a five-row sample, and the total row count, in that order.
print("Schema:")
routes_df.printSchema()

print("\nSample data:")
routes_df.show(n=5)

print(f"\nTotal routes: {routes_df.count()}")

                                                                                

Schema:
root
 |-- route_id: string (nullable = true)
 |-- geom: string (nullable = true)
 |-- dummy_emissions: double (nullable = true)
 |-- geom_geojson: string (nullable = true)


Sample data:


                                                                                

+--------------------+--------------------+--------------------+--------------------+
|            route_id|                geom|     dummy_emissions|        geom_geojson|
+--------------------+--------------------+--------------------+--------------------+
|01936312-1499-410...|01020000001a00000...|0.002798218687786...|{"type":"LineStri...|
|08039121-822d-489...|01020000002400000...|  5.3253454404509E-4|{"type":"LineStri...|
|0b7d13a5-a77e-46e...|01020000001000000...|2.571212936000028E-4|{"type":"LineStri...|
|        14619541-111|01020000003400000...|0.002819897809944...|{"type":"LineStri...|
|         14619696-70|01020000005c00000...|0.003055982783492923|{"type":"LineStri...|
+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows


Total routes: 1000


### Validate

In [6]:
from h3_indexer.validator import validate_config

# Validate the job config using the live Spark session. The returned `job`
# object is what the index and resolve steps are called with; the log output
# reports the numeric-column check and the valid/invalid geometry counts.
job = validate_config(job_config, spark)

INFO:root:All specified input columns are numeric.
INFO:root:Total rows in input routes: 1000
INFO:root:Valid geometries in input routes: 1000
INFO:root:Removed invalid geometries in input routes: 0


### Index

In [7]:
from main import index_job

# Run the indexing step for the validated job. The log output shows an export
# to S3; presumably this also populates `h3_indexed_df` on each input
# (TODO confirm against `index_job`).
index_job(job, spark)

INFO:root:Exporting to S3: s3://wws-air-quality/test/indexer/routes             
                                                                                

In [8]:
# Preview five rows of the H3-indexed DataFrame for the "routes" input.
job.inputs['routes'].h3_indexed_df.show(5)

+---------------+-------------+---------------+------------------+-----------+-------------------+------------------+
|       h3_index|h3_resolution|   h3_r3_parent|       h3_area_km2|   route_id|              ratio|   total_length_km|
+---------------+-------------+---------------+------------------+-----------+-------------------+------------------+
|84444d9ffffffff|            4|83444dfffffffff|1940.1190723384705| 14619111-9|0.06684305377665199|192.26525333611414|
|84444dbffffffff|            4|83444dfffffffff|1940.1063931159267|14621075-51|0.21388388573060046| 81.90121737051786|
|84444d9ffffffff|            4|83444dfffffffff|1940.1190723384705| 14619111-8|0.04990429293934751| 181.6751815613184|
|84444dbffffffff|            4|83444dfffffffff|1940.1063931159267|14620524-62|0.23078896914958355| 84.27524627831124|
|8426d0bffffffff|            4|8326d0fffffffff|1987.1533604416047|14620546-40|  0.457986480328869| 72.30157040041772|
+---------------+-------------+---------------+---------

### Resolve

In [9]:
from main import resolve_job

# Run the resolve step for the job. The log output shows an export to S3;
# presumably this also sets `job.h3_resolved_df` (TODO confirm against
# `resolve_job`).
resolve_job(job, spark)

INFO:root:Exporting to S3: s3://wws-air-quality/test/resolver                   
                                                                                

In [10]:
# Preview five rows of the job's resolved DataFrame.
job.h3_resolved_df.show(5)

+---------------+-------------+---------------+------------------+--------------------+
|       h3_index|h3_resolution|   h3_r3_parent|       h3_area_km2| sum_dummy_emissions|
+---------------+-------------+---------------+------------------+--------------------+
|8426c89ffffffff|            4|8326c8fffffffff|1956.4490093197985|0.009890607188183052|
|84446c9ffffffff|            4|83446cfffffffff|1985.9623553970523|0.014567905321624462|
|8426c8bffffffff|            4|8326c8fffffffff| 1957.471829299016|0.004658291294159675|
|84489c9ffffffff|            4|83489cfffffffff| 2044.078627833024|0.007991191580432464|
|8448923ffffffff|            4|834892fffffffff|1992.9078314864244|0.001300773557359...|
+---------------+-------------+---------------+------------------+--------------------+
only showing top 5 rows

