# Colorado Sample Data Generator

Generates ~10,000 geospatial vector features within the state of Colorado and writes them to Iceberg tables via the lakehouse catalog.

| Table | Geometry | Count | Attributes |
|-------|----------|-------|------------|
| `colorado.points` | Point | 5,000 | id, name, category |
| `colorado.lines` | LineString | 3,000 | id, name, length_km |
| `colorado.polygons` | Polygon | 2,000 | id, name, parcel_type, area_sqm |

**Colorado bounding box:** -109.05 to -102.05 lon, 37.0 to 41.0 lat

In [None]:
from sedona.spark import SedonaContext

# Build (or reuse) the Spark session and pass it through SedonaContext.create,
# which registers Sedona's ST_* SQL functions on the session. Without this
# step the later cells only work if the cluster configuration already loads
# the Sedona SQL extension; the call is harmless when it does.
sedona = SedonaContext.create(SedonaContext.builder().getOrCreate())

# Namespace that holds the three sample tables written by the cells below.
sedona.sql("CREATE NAMESPACE IF NOT EXISTS lakehouse.colorado")
print("Namespace lakehouse.colorado ready.")

In [None]:
import random
# Fixed seed so every run of the notebook generates the same features.
random.seed(42)

# Lon/lat extent (degrees) used as the sampling window for Colorado.
LON_MIN, LON_MAX = -109.05, -102.05
LAT_MIN, LAT_MAX = 37.0, 41.0

CATEGORIES = [
    "park",
    "school",
    "hospital",
    "restaurant",
    "gas_station",
    "trailhead",
    "campground",
    "viewpoint",
    "water_tower",
    "fire_station",
]


def _random_point_row(index):
    """Return one (id, name, category, wkt) tuple for a random point.

    The longitude, latitude and category are drawn in that order from the
    module-level ``random`` generator.
    """
    x = random.uniform(LON_MIN, LON_MAX)
    y = random.uniform(LAT_MIN, LAT_MAX)
    kind = random.choice(CATEGORIES)
    return (f"pt_{index:05d}", f"{kind}_{index}", kind, f"POINT({x} {y})")


# 5,000 uniformly distributed points inside the bounding box.
point_rows = [_random_point_row(index) for index in range(5000)]

# Stage the generated rows as a temp view so the WKT column can be
# converted with Sedona SQL.
point_columns = ["id", "name", "category", "wkt"]
df_points = sedona.createDataFrame(point_rows, point_columns)
df_points.createOrReplaceTempView("raw_points")

# Parse the WKT into a geometry and serialise it as WKB, which is the form
# stored in the Iceberg table.
points_query = """
    SELECT id, name, category,
           ST_AsBinary(ST_GeomFromWKT(wkt)) AS geometry
    FROM raw_points
"""
df_points_geo = sedona.sql(points_query)

# Recreate the Iceberg table from scratch on every run.
sedona.sql("DROP TABLE IF EXISTS lakehouse.colorado.points")
points_writer = df_points_geo.writeTo("lakehouse.colorado.points")
points_writer.create()

# Read the row count back from the table as a sanity check.
count_rows = sedona.sql("SELECT count(*) AS n FROM lakehouse.colorado.points").collect()
count = count_rows[0]["n"]
print(f"Wrote {count} points to lakehouse.colorado.points")

In [None]:
# Build 3,000 random linestrings, each a short random walk of 2-5 vertices.
ROAD_TYPES = ["road", "trail", "highway", "path", "creek"]


def _clamp(value, low, high):
    """Restrict ``value`` to the closed interval [low, high]."""
    return max(low, min(high, value))


line_rows = []
for idx in range(3000):
    vertex_count = random.randint(2, 5)
    x = random.uniform(LON_MIN, LON_MAX)
    y = random.uniform(LAT_MIN, LAT_MAX)
    vertices = [f"{x} {y}"]
    while len(vertices) < vertex_count:
        # Step at most 0.05 degrees per axis from the previous vertex, then
        # pull the result back inside the Colorado bounding box.
        x = _clamp(x + random.uniform(-0.05, 0.05), LON_MIN, LON_MAX)
        y = _clamp(y + random.uniform(-0.05, 0.05), LAT_MIN, LAT_MAX)
        vertices.append(f"{x} {y}")
    wkt = f"LINESTRING({', '.join(vertices)})"
    kind = random.choice(ROAD_TYPES)
    line_rows.append((f"ln_{idx:05d}", f"{kind}_{idx}", wkt))

# Stage the generated rows as a temp view so the WKT column can be
# converted with Sedona SQL.
df_lines = sedona.createDataFrame(line_rows, ["id", "name", "wkt"])
df_lines.createOrReplaceTempView("raw_lines")

# length_km is the geodesic length on the WGS84 spheroid. ST_LengthSpheroid
# returns metres for lon/lat (EPSG:4326) input, hence the division by 1000.
# A planar ST_Length in degrees multiplied by 111.32 km/deg would overstate
# east-west distances by up to ~29% at Colorado's latitude, because a degree
# of longitude there spans only ~86 km.
# The WKT is parsed once in the subquery and reused for both output columns.
df_lines_geo = sedona.sql("""
    SELECT id, name,
           ROUND(ST_LengthSpheroid(geom) / 1000.0, 2) AS length_km,
           ST_AsBinary(geom) AS geometry
    FROM (
        SELECT id, name, ST_GeomFromWKT(wkt) AS geom
        FROM raw_lines
    ) AS parsed_lines
""")

# Recreate the Iceberg table from scratch on every run.
sedona.sql("DROP TABLE IF EXISTS lakehouse.colorado.lines")
df_lines_geo.writeTo("lakehouse.colorado.lines").create()

# Read the row count back from the table as a sanity check.
count = sedona.sql("SELECT count(*) AS n FROM lakehouse.colorado.lines").collect()[0]["n"]
print(f"Wrote {count} lines to lakehouse.colorado.lines")

In [None]:
# Generate 2,000 random polygons (small quads simulating parcels/buildings)
PARCEL_TYPES = ["residential", "commercial", "industrial", "agricultural", "public"]

# Half-extent of a quad in degrees (0.001 deg is on the order of 100 m,
# 0.02 deg on the order of 2 km).
MIN_HALF_SIZE, MAX_HALF_SIZE = 0.001, 0.02
# Corner jitter is capped both absolutely and relative to the half-extent.
# The relative cap keeps every corner strictly on its own side of the centre,
# so the ring can never fold over itself, which an unconditional 0.002 deg
# jitter allows for the smallest quads.
MAX_JITTER = 0.002
JITTER_FRACTION = 0.25
# Largest distance a corner can lie from the centre along one axis. Centres
# are sampled at least this far inside the bounding box so that whole
# polygons, not just their centres, stay within it.
CENTER_MARGIN = MAX_HALF_SIZE + MAX_JITTER


def _jittered(half_size):
    """Return ``half_size`` perturbed by a small, bounded random amount."""
    limit = min(MAX_JITTER, JITTER_FRACTION * half_size)
    return half_size + random.uniform(-limit, limit)


poly_rows = []
for i in range(2000):
    cx = random.uniform(LON_MIN + CENTER_MARGIN, LON_MAX - CENTER_MARGIN)
    cy = random.uniform(LAT_MIN + CENTER_MARGIN, LAT_MAX - CENTER_MARGIN)
    dx = random.uniform(MIN_HALF_SIZE, MAX_HALF_SIZE)
    dy = random.uniform(MIN_HALF_SIZE, MAX_HALF_SIZE)
    # Slightly irregular quad, corners listed counter-clockwise: SW, SE, NE, NW.
    corners = [
        (cx - _jittered(dx), cy - _jittered(dy)),
        (cx + _jittered(dx), cy - _jittered(dy)),
        (cx + _jittered(dx), cy + _jittered(dy)),
        (cx - _jittered(dx), cy + _jittered(dy)),
    ]
    # WKT rings must repeat the first vertex at the end.
    ring = corners + [corners[0]]
    coords = [f"{x} {y}" for x, y in ring]
    wkt = f"POLYGON(({', '.join(coords)}))"
    ptype = random.choice(PARCEL_TYPES)
    poly_rows.append((f"pg_{i:05d}", f"{ptype}_{i}", ptype, wkt))

# Stage the generated rows as a temp view so the WKT column can be
# converted with Sedona SQL.
df_polys = sedona.createDataFrame(poly_rows, ["id", "name", "parcel_type", "wkt"])
df_polys.createOrReplaceTempView("raw_polys")

# area_sqm is the geodesic area on the WGS84 spheroid. ST_AreaSpheroid
# returns square metres for lon/lat (EPSG:4326) input. A planar ST_Area in
# square degrees multiplied by 111320^2 would treat a degree of longitude as
# long as a degree of latitude and overstate areas by roughly 25-33% across
# Colorado's latitude range.
# The WKT is parsed once in the subquery and reused for both output columns.
df_polys_geo = sedona.sql("""
    SELECT id, name, parcel_type,
           ROUND(ST_AreaSpheroid(geom), 0) AS area_sqm,
           ST_AsBinary(geom) AS geometry
    FROM (
        SELECT id, name, parcel_type, ST_GeomFromWKT(wkt) AS geom
        FROM raw_polys
    ) AS parsed_polys
""")

# Recreate the Iceberg table from scratch on every run.
sedona.sql("DROP TABLE IF EXISTS lakehouse.colorado.polygons")
df_polys_geo.writeTo("lakehouse.colorado.polygons").create()

# Read the row count back from the table as a sanity check.
count = sedona.sql("SELECT count(*) AS n FROM lakehouse.colorado.polygons").collect()[0]["n"]
print(f"Wrote {count} polygons to lakehouse.colorado.polygons")

In [None]:
# Verification: row counts for each table, followed by five sample rows from
# each with the stored WKB decoded back to WKT for readability.
print("=== Table Counts ===")
for table in ("points", "lines", "polygons"):
    count_row = sedona.sql(f"SELECT count(*) AS n FROM lakehouse.colorado.{table}").collect()[0]
    n = count_row["n"]
    print(f"  {table}: {n}")

POINTS_SAMPLE_SQL = """
    SELECT id, name, category, ST_AsText(ST_GeomFromWKB(geometry)) AS geom_wkt
    FROM lakehouse.colorado.points LIMIT 5
"""

LINES_SAMPLE_SQL = """
    SELECT id, name, length_km, ST_AsText(ST_GeomFromWKB(geometry)) AS geom_wkt
    FROM lakehouse.colorado.lines LIMIT 5
"""

POLYGONS_SAMPLE_SQL = """
    SELECT id, name, parcel_type, area_sqm, ST_AsText(ST_GeomFromWKB(geometry)) AS geom_wkt
    FROM lakehouse.colorado.polygons LIMIT 5
"""

# (heading, query) pairs, shown in order.
sample_sections = [
    ("\n=== Sample Points ===", POINTS_SAMPLE_SQL),
    ("=== Sample Lines ===", LINES_SAMPLE_SQL),
    ("=== Sample Polygons ===", POLYGONS_SAMPLE_SQL),
]
for heading, query in sample_sections:
    print(heading)
    sedona.sql(query).show(truncate=False)

# Spatial query: the 10 points closest to Denver (lon -104.99, lat 39.74)
# that lie within 50 km of it.
# ST_DistanceSphere returns the great-circle distance in metres for lon/lat
# input, so the 50 km cut-off is a true circle on the ground. A planar
# ST_Distance threshold of 0.45 degrees would reach ~50 km north-south but
# only ~38.5 km east-west at this latitude, and would distort dist_km the
# same way.
# The distance is computed once in the subquery and reused for filtering,
# ordering and display.
print("=== Points within ~50km of Denver ===")
sedona.sql("""
    SELECT id, name, category,
           ROUND(dist_m / 1000.0, 1) AS dist_km
    FROM (
        SELECT id, name, category,
               ST_DistanceSphere(ST_GeomFromWKB(geometry), ST_Point(-104.99, 39.74)) AS dist_m
        FROM lakehouse.colorado.points
    ) AS with_distance
    WHERE dist_m < 50000
    ORDER BY dist_m
    LIMIT 10
""").show(truncate=False)