In [1]:
import boto3
import geojson
import pyarrow as pa
import pandas as pd
import geopandas as gpd

from typing import ClassVar, Type
from datetime import datetime
from pyarrow.fs import S3FileSystem
from dataclasses import dataclass

from pyiceberg.catalog import load_catalog
from pyiceberg.catalog.glue import GlueCatalog
from pyiceberg.table import StaticTable, DataScan
from pyiceberg.partitioning import PartitionSpec, PartitionField
from pyiceberg.table.sorting import SortOrder, SortField
from pyiceberg.io.pyarrow import pyarrow_to_schema
from pyiceberg.transforms import IdentityTransform, DayTransform, YearTransform
from pyiceberg.expressions import EqualTo, GreaterThanOrEqual
from pyiceberg.schema import Schema
from pyiceberg.table import Table
from pyiceberg import types as T

from geojson.utils import coords
from shapely.geometry import shape, Polygon, LinearRing, box, mapping

In [2]:
# Connection settings for the AWS Glue catalog, gathered in one mapping so
# they can be read (and tuned) in a single place.
# NOTE(review): `profile` and the nested `client` dict do not look like
# property keys PyIceberg's Glue catalog reads (it documents flat string keys
# such as `glue.profile-name` / `glue.region`) -- confirm against the
# installed PyIceberg version before relying on them.
_catalog_settings = {
    'type': 'glue',
    'profile': 'default',
    'region_name': 'us-east-1',
    'client': {
        'region': 'us-east-1',
    },
}

glue_catalog = load_catalog('default', **_catalog_settings)

In [3]:
# glue_catalog.create_namespace_if_not_exists("db_ice")

In [4]:
# Show every namespace reachable through the catalog (sanity check that the
# connection works and that the target namespace exists).
glue_catalog.list_namespaces()

[('base',), ('db_ice',), ('default',), ('sagemaker_featurestore',)]

In [5]:
# Namespace used for every table listed, created and loaded in this notebook.
DATABASE = "db_ice"

In [6]:
# List the tables that already exist in the DATABASE namespace.
glue_catalog.list_tables(DATABASE)

[('db_ice', 'tb_controle_ingestao'), ('db_ice', 'tb_poligono')]

## Create table

In [8]:
# Table name and the S3 prefix under which its data and metadata are stored.
tb_name = "tb_controle_imagem"
table_location = f"s3://data-us-east-1-891377318910/db_ice/{tb_name}"

# Iceberg schema: four string columns. The polygon id and its geometry are
# required; the area and the S3 path are optional.
schema = Schema(
    T.NestedField(field_id=1, name="id_poligono", field_type=T.StringType(), required=True),
    T.NestedField(field_id=2, name="poligono", field_type=T.StringType(), required=True),
    T.NestedField(field_id=3, name="area_poligono", field_type=T.StringType(), required=False),
    T.NestedField(field_id=4, name="caminho_s3", field_type=T.StringType(), required=False),
    # Field id 1 (id_poligono) is declared as the row identifier.
    identifier_field_ids=[1],
)

In [9]:
# Create the Iceberg table, or return the existing one when it is already
# registered. Plain `create_table` raises once the table exists, which makes
# this cell fail on Restart Kernel -> Run All.
table = glue_catalog.create_table_if_not_exists(
    identifier=(DATABASE, tb_name),
    schema=schema,
    location=table_location,
    # Sort by id_poligono (field id 1). SortField has no `ascending`
    # parameter; when no direction is given it defaults to ascending.
    sort_order=SortOrder(
        SortField(source_id=1, transform=IdentityTransform()),
    ),
    properties={
        "format-version": "2",
        "write.format.default": "parquet",
        "write.parquet.compression-codec": "snappy",
        # Property names follow the Iceberg table-properties reference; the
        # previous `page-size` / `dictionary-page-size` keys are not defined
        # there and were stored without being honoured.
        "write.parquet.page-size-bytes": "1048576",  # 1 MB
        "write.parquet.dict-size-bytes": "1048576",  # 1 MB
        # NOTE(review): the former `block-size`, `dictionary-enabled`,
        # `enable-encoding`, `enable-statistics`, `enable-bloom-filter` and
        # `bloom-filter-encoding` keys have no Iceberg equivalent and were
        # dropped. If a 256 MB row group or bloom filters are required for
        # other engines, the spec keys are `write.parquet.row-group-size-bytes`
        # and `write.parquet.bloom-filter-enabled.column.<name>` -- check that
        # the installed PyIceberg writer accepts them before adding.
    },
)

## Insert data

In [10]:
# Fetch the table handle from the catalog by (namespace, table name), so the
# insert section works from whatever the catalog currently has registered.
table = glue_catalog.load_table((DATABASE, tb_name))

In [11]:
# Arrow schema for the rows to insert. Every column is a string; the second
# element of each pair is the column's nullability.
arrow_schema = pa.schema(
    [
        pa.field(column_name, pa.string(), nullable=is_nullable)
        for column_name, is_nullable in (
            ("id_poligono", False),
            ("poligono", False),
            ("area_poligono", True),
            ("caminho_s3", True),
        )
    ]
)

In [15]:
# Sample rows to insert, one dict per row keyed by column name.
table_values = [
    {
        "id_poligono": "id_101",
        # WKT polygon. The ring must list full "x y" pairs and end on its
        # first vertex; the previous literal was truncated (last vertex had no
        # Y value and the ring was left open), which is not valid WKT.
        # NOTE(review): the fourth corner was reconstructed from the other
        # three vertices assuming an axis-aligned rectangle -- confirm.
        "poligono": (
            "POLYGON(("
            "-47.9296875 -15.601025390625,"
            "-47.8125 -15.601025390625,"
            "-47.8125 -15.517578125,"
            "-47.9296875 -15.517578125,"
            "-47.9296875 -15.601025390625"
            "))"
        ),
        "area_poligono": "100",
        "caminho_s3": "s3://data-us-east-1-891377318910/geospatial/mvp/images/id_101.zarr",
    },
]

In [16]:
# Convert the list of row dicts into an Arrow table, enforcing the explicit
# schema (column types and nullability) instead of letting Arrow infer it.
pa_table = pa.Table.from_pylist(mapping=table_values, schema=arrow_schema)

In [17]:
# Preview the first rows of the Arrow table as a pandas DataFrame before writing.
pa_table.to_pandas().head()

Unnamed: 0,id_poligono,poligono,area_poligono,caminho_s3
0,id_101,"POLYGON((-47.9296875 -15.601025390625,-47.8125...",100,s3://data-us-east-1-891377318910/geospatial/mv...


In [18]:
# Append the Arrow rows to the Iceberg table.
# NOTE(review): nothing in this cell deduplicates on id_poligono, so
# re-running it presumably inserts the same row again -- confirm before
# re-executing the notebook end to end.
table.append(pa_table)