In [1]:
import boto3
import geojson
import pyarrow as pa
import pandas as pd
import geopandas as gpd

from typing import ClassVar, Type
from datetime import datetime
from pyarrow.fs import S3FileSystem
from dataclasses import dataclass

from pyiceberg.catalog import load_catalog
from pyiceberg.catalog.glue import GlueCatalog
from pyiceberg.table import StaticTable, DataScan
from pyiceberg.partitioning import PartitionSpec, PartitionField
from pyiceberg.table.sorting import SortOrder, SortField
from pyiceberg.io.pyarrow import pyarrow_to_schema
from pyiceberg.transforms import IdentityTransform, DayTransform, YearTransform
from pyiceberg.expressions import EqualTo, GreaterThanOrEqual
from pyiceberg.schema import Schema
from pyiceberg.table import Table
from pyiceberg import types as T

from geojson.utils import coords
from shapely.geometry import shape, Polygon, LinearRing, box, mapping

In [2]:
glue_catalog = load_catalog(
    'default',
    type='glue',
    profile='default',
    region_name='us-east-1',
    client={
        'region': 'us-east-1',
    }
)

In [None]:
# glue_catalog.create_namespace_if_not_exists("db_ice")

In [115]:
glue_catalog.list_namespaces()

[('base',), ('db_ice',), ('default',), ('sagemaker_featurestore',)]

In [116]:
DATABASE = "db_ice"

In [117]:
glue_catalog.list_tables(DATABASE)

[]

## Create table

In [128]:
tb_name        = "tb_poligono"
table_location = f"s3://data-us-east-1-891377318910/db_ice/{tb_name}" 

schema = Schema(
    T.NestedField(1, "id_poligono", T.StringType(), required=True),
    T.NestedField(2, "poligono", T.StringType(), required=True),
    T.NestedField(3, "data_plantio", T.StringType(), required=False),
    T.NestedField(4, "anomesdia", T.IntegerType(), required=True),
    identifier_field_ids=[1]
)

In [129]:
partition_field = schema.find_field("anomesdia")

partition_spec = PartitionSpec(
    fields=[
        # PartitionField(source_id=4, field_id=1, transform=DayTransform("anomesdia"), name="anomesdia")
        PartitionField(
            source_id=partition_field.field_id,
            field_id=partition_field.field_id,
            transform=IdentityTransform(),
            name="anomesdia"
        )
    ]
)

In [None]:
table = glue_catalog.create_table(
    identifier=(DATABASE, tb_name),
    schema=schema,
    location=table_location,
    partition_spec=partition_spec,
    sort_order=SortOrder(
        fields=[
            SortField(1, IdentityTransform(), ascending=True),
        ]
    ),
    properties={
        "format-version": "2",
        "write.format.default": "parquet",
        "write.parquet.compression-codec": "snappy",
        "write.parquet.page-size": "1048576",  # 1 MB
        "write.parquet.dictionary-enabled": "true",
        "write.parquet.dictionary-page-size": "1048576",  # 1 MB
        "write.parquet.block-size": "268435456",  # 256 MB
        "write.parquet.enable-encoding": "true",
        "write.parquet.enable-statistics": "true",
        "write.parquet.enable-bloom-filter": "true",
        "write.parquet.bloom-filter-encoding": "utf8",
    }
)

## Load data

In [64]:
def bounds_to_ring(geom):
    minx, miny, maxx, maxy = geom.bounds
    return LinearRing([
        (minx, miny),
        (maxx, miny),
        (maxx, maxy),
        (minx, maxy),
        (minx, miny)
    ])


def load_geosjon(path: str) -> dict:
    with open(path, "r") as f:
        obj = geojson.load(f)
    return obj


def get_polygon_from_geometry(geometry: dict) -> Polygon:
    return Polygon(coords(geometry['coordinates']))

In [60]:
filepath = "../../geospatial/tests_01/data/archive/BR_Municipios_2020.shp"
gdf_municipios = gpd.read_file(filepath)

In [65]:
crs = "EPSG:4674"
gdf_municipios = gdf_municipios.to_crs(crs)

In [66]:
gdf_municipios['boundary'] = gdf_municipios.boundary
gdf_municipios["centroid"] = gdf_municipios.centroid
gdf_municipios["bounds"] = gdf_municipios.geometry.apply(bounds_to_ring)


  gdf_municipios["centroid"] = gdf_municipios.centroid


In [67]:
gdf_municipios.head(5)

Unnamed: 0,CD_MUN,NM_MUN,SIGLA_UF,AREA_KM2,geometry,boundary,centroid,bounds
0,1100015,Alta Floresta D'Oeste,RO,7067.127,"POLYGON ((-62.19465 -11.82746, -62.19332 -11.8...","LINESTRING (-62.19465 -11.82746, -62.19332 -11...",POINT (-62.27466 -12.47013),"LINEARRING (-62.89408 -13.11894, -61.88651 -13..."
1,1100023,Ariquemes,RO,4426.571,"POLYGON ((-62.53648 -9.73222, -62.52765 -9.736...","LINESTRING (-62.53648 -9.73222, -62.52765 -9.7...",POINT (-62.95726 -9.95189),"LINEARRING (-63.61822 -10.28828, -62.42262 -10..."
2,1100031,Cabixi,RO,1314.352,"POLYGON ((-60.37119 -13.36655, -60.37134 -13.3...","LINESTRING (-60.37119 -13.36655, -60.37134 -13...",POINT (-60.63981 -13.47488),"LINEARRING (-60.92476 -13.6937, -60.36529 -13...."
3,1100049,Cacoal,RO,3793.0,"POLYGON ((-61.0008 -11.29737, -61.00103 -11.39...","LINESTRING (-61.0008 -11.29737, -61.00103 -11....",POINT (-61.32473 -11.30129),"LINEARRING (-61.7958 -11.61921, -61.00059 -11...."
4,1100056,Cerejeiras,RO,2783.3,"POLYGON ((-61.49976 -13.00525, -61.49809 -13.0...","LINESTRING (-61.49976 -13.00525, -61.49809 -13...",POINT (-61.26095 -13.20356),"LINEARRING (-61.90804 -13.36371, -60.71665 -13..."


In [70]:
gdf_municipios['AREA_KM2'].nsmallest(10)

2911     3.565
3273     3.612
3812    15.331
3711    17.264
3550    17.449
1525    18.609
3221    19.393
1774    19.793
3855    20.388
1637    25.704
Name: AREA_KM2, dtype: float64

In [86]:
gdf = gdf_municipios.loc[[3812, 3711, 3550, 1774, 3221], ['CD_MUN', 'geometry']].reset_index(drop=True)
gdf.head(100)

Unnamed: 0,CD_MUN,geometry
0,3548807,"POLYGON ((-46.55698 -23.61209, -46.55694 -23.6..."
1,3539806,"POLYGON ((-46.35065 -23.50767, -46.35048 -23.5..."
2,3525003,"POLYGON ((-46.91529 -23.51394, -46.91525 -23.5..."
3,2802502,"POLYGON ((-36.98399 -10.68278, -36.98237 -10.6..."
4,3303203,"POLYGON ((-43.40291 -22.8003, -43.40187 -22.80..."


In [97]:
size = gdf.shape[0]
polygons = {}

for item in range(size):
    polygons[item] = str(gdf['geometry'].values[item].wkt)

In [99]:
polygons.keys()

dict_keys([0, 1, 2, 3, 4])

## Insert data

In [107]:
table = glue_catalog.load_table((DATABASE, tb_name))

In [131]:
arrow_schema = pa.schema(
    [
        pa.field("id_poligono", pa.string(), nullable=False),
        pa.field("poligono", pa.string(), nullable=False),
        pa.field("data_plantio", pa.string(), nullable=True),
        pa.field("anomesdia", pa.uint32(), nullable=False),
    ]
)

In [132]:
table_values = [
    {"id_poligono": "id_001", "poligono": polygons[0], "data_plantio": "2023-06-20", "anomesdia": 20250810},
    {"id_poligono": "id_002", "poligono": polygons[1], "data_plantio": "2023-07-10", "anomesdia": 20250810},
    {"id_poligono": "id_003", "poligono": polygons[2], "data_plantio": "2023-08-01", "anomesdia": 20250810},
    {"id_poligono": "id_004", "poligono": polygons[3], "data_plantio": "2023-08-01", "anomesdia": 20250810},
    {"id_poligono": "id_005", "poligono": polygons[4], "data_plantio": "2023-08-01", "anomesdia": 20250810},
]

In [133]:
pa_table = pa.Table.from_pylist(
    table_values,
    schema=arrow_schema
)

In [134]:
pa_table.to_pandas().head()

Unnamed: 0,id_poligono,poligono,data_plantio,anomesdia
0,id_001,"POLYGON ((-46.556983701 -23.612093836, -46.556...",2023-06-20,20250810
1,id_002,"POLYGON ((-46.350654 -23.507674999, -46.350485...",2023-07-10,20250810
2,id_003,"POLYGON ((-46.915293 -23.513940999, -46.915253...",2023-08-01,20250810
3,id_004,"POLYGON ((-36.983988539 -10.682781162, -36.982...",2023-08-01,20250810
4,id_005,"POLYGON ((-43.402914028 -22.800304262, -43.401...",2023-08-01,20250810


In [135]:
table.append(pa_table)

