In [72]:
from abc import ABC, abstractmethod
from dataclasses import dataclass, asdict
from datetime import datetime
from typing import ClassVar, Optional, Type

import awswrangler as wr
import boto3
import pandas as pd
import pyarrow as pa
from pyarrow.fs import S3FileSystem

from pyiceberg import types as T
from pyiceberg.catalog import load_catalog
from pyiceberg.catalog.glue import GlueCatalog
from pyiceberg.exceptions import (
    NamespaceAlreadyExistsError,
    NoSuchNamespaceError,
    NoSuchTableError,
    NotInstalledError,
    TableAlreadyExistsError,
)
from pyiceberg.expressions import EqualTo, GreaterThanOrEqual
from pyiceberg.partitioning import PartitionSpec, PartitionField
from pyiceberg.schema import Schema
from pyiceberg.table import StaticTable, DataScan
from pyiceberg.table import Table
from pyiceberg.table.sorting import SortOrder, SortField
from pyiceberg.transforms import IdentityTransform

In [73]:
# Load the pyiceberg catalog named 'default', configured with type='glue',
# profile 'default' and region 'us-east-1'.
# NOTE(review): IcebergFactory.glue_catalog further down is built with the same
# arguments, so the notebook holds two equivalent catalog objects.
glue_catalog = load_catalog(
    'default',
    type='glue',
    profile='default',
    region_name='us-east-1',
    client={
        'region': 'us-east-1',
        # Credentials are deliberately not set here; the two keys below are placeholders only.
        # 'access-key-id': 'YOUR_ACCESS_KEY', # Avoid hardcoding in production
        # 'secret-access-key': 'YOUR_SECRET_KEY' # Avoid hardcoding in production
    }
)

In [74]:
# Catalog namespace and the names of the four tables listed under it below.
DATABASE = "db_ice"
TB_POLIGONO = "tb_poligono"
TB_INGESTAO = "tb_controle_ingestao"
TB_CONTROLE_IMAGEM = "tb_controle_imagem"
TB_IMAGEM = "tb_imagem"

In [75]:
# List the namespaces available in the catalog.
glue_catalog.list_namespaces()

[('base',), ('db_ice',), ('default',), ('sagemaker_featurestore',)]

In [76]:
# List the tables registered under the DATABASE namespace.
glue_catalog.list_tables(DATABASE)

[('db_ice', 'tb_controle_imagem'),
 ('db_ice', 'tb_controle_ingestao'),
 ('db_ice', 'tb_imagem'),
 ('db_ice', 'tb_poligono')]

In [77]:
# table = glue_catalog.load_table("base.loan_2018")
# Load the ingestion-control table as a raw pyiceberg table handle.
# NOTE(review): `table` is rebound to tb_imagem in a later cell.
table = glue_catalog.load_table((DATABASE, TB_INGESTAO))

## Insert data

In [80]:
class RecordNotFoundError(Exception):
    """Signals that a lookup by id matched no row in the table."""

class PrimaryKeyDuplicatedError(Exception):
    """Signals that more than one row shares the same id."""

In [81]:
class Record(ABC):
    """Common base for the row dataclasses defined in this notebook."""

    def to_dict(self) -> dict:
        """Return the fields of this record as a dict.

        Delegates to ``dataclasses.asdict``, so it only works on subclasses
        decorated with ``@dataclass``.
        """
        as_mapping = asdict(self)
        return as_mapping

In [82]:
@dataclass
class ControleIngestao(Record):
    """One row of the ingestion-control table (``tb_controle_ingestao``).

    Values are kept as strings, matching the Arrow schema in
    ``PATableControleIngestao``.
    """

    id_poligono: str
    # Nullable in the Arrow schema; the notebook passes None for it.
    data_penultima_ingestao: Optional[str]
    data_ultima_ingestao: str


@dataclass
class ControleImagem(Record):
    """One row of the image-control table (``tb_controle_imagem``).

    Values are kept as strings, matching the Arrow schema in
    ``PATableControleImagem``.
    """

    id_poligono: str
    poligono: str
    # The two fields below are nullable in the Arrow schema.
    area_poligono: Optional[str]
    caminho_s3: Optional[str]
    caminho_s3: str


@dataclass
class Imagem(Record):
    """One row of the image table (``tb_imagem``): a polygon id, an image date and
    one float per pixel class.

    NOTE(review): the class names look like Sentinel-2 scene-classification
    categories — confirm with the data source.
    """

    id_poligono: str
    data_imagem: str
    # Every float column is nullable in PATableImagem.schema, hence Optional.
    no_data: Optional[float]
    saturated_or_defective: Optional[float]
    dark_area_pixels: Optional[float]
    cloud_shadows: Optional[float]
    vegetation: Optional[float]
    bare_soils: Optional[float]
    water: Optional[float]
    clouds_low_probability_or_unclassified: Optional[float]
    clouds_medium_probability: Optional[float]
    clouds_high_probability: Optional[float]
    thin_cirrus: Optional[float]
    snow_or_ice: Optional[float]

In [83]:
class PATable(ABC):
    """Buffer of row dictionaries that can be converted to a PyArrow table.

    Subclasses provide ``schema``; rows are collected in ``self.values`` through
    ``add_record`` / ``add_dataframe`` and materialised by ``to_pyarrow_table``.
    """

    @property
    @abstractmethod
    def schema(self):
        """Schema handed to ``pa.Table.from_pylist``; must be supplied by subclasses."""
        pass

    def __init__(self) -> None:
        # Rows waiting to be converted, one dict per row.
        self.values: list[dict] = []

    def check_unique_id(self, record_id: str) -> None:
        """Raise if a buffered row already uses ``record_id`` as its ``id_poligono``.

        Raises:
            PrimaryKeyDuplicatedError: when ``record_id`` is already in the buffer.
        """
        if any(row['id_poligono'] == record_id for row in self.values):
            raise PrimaryKeyDuplicatedError(f"Record with id '{record_id}' already exists.")

    def add_record(self, record: Record) -> None:
        """Append one record, converted with ``record.to_dict()``, to the buffer.

        Uniqueness of ``id_poligono`` is not enforced here; call
        ``check_unique_id`` beforehand when it is required.
        """
        self.values.append(record.to_dict())

    def add_dataframe(self, df: pd.DataFrame) -> None:
        """Append every row of ``df`` to the buffer as a column -> value dict."""
        # orient="records" yields one dict per row and, unlike orient="index",
        # does not raise when the DataFrame index contains duplicate labels.
        self.values.extend(df.to_dict(orient="records"))

    def clear(self) -> None:
        """Drop all buffered rows by rebinding ``values`` to a new empty list."""
        self.values = []

    def to_pyarrow_table(self) -> pa.Table:
        """Build a ``pa.Table`` from the buffered rows using ``self.schema``."""
        return pa.Table.from_pylist(
            self.values,
            schema=self.schema
        )


In [84]:
class PATableControleIngestao(PATable):
    """Row buffer whose Arrow schema mirrors the fields of ``ControleIngestao``."""

    # Annotated with the Schema type (pa.schema is the factory function, not a type).
    # Only data_penultima_ingestao accepts nulls.
    schema: ClassVar[pa.Schema] = pa.schema([
        pa.field("id_poligono", pa.string(), nullable=False),
        pa.field("data_penultima_ingestao", pa.string(), nullable=True),
        pa.field("data_ultima_ingestao", pa.string(), nullable=False),
    ])


class PATableControleImagem(PATable):
    """Row buffer whose Arrow schema mirrors the fields of ``ControleImagem``."""

    # Annotated with the Schema type (pa.schema is the factory function, not a type).
    # id_poligono and poligono are required; the other two columns accept nulls.
    schema: ClassVar[pa.Schema] = pa.schema([
        pa.field("id_poligono", pa.string(), nullable=False),
        pa.field("poligono", pa.string(), nullable=False),
        pa.field("area_poligono", pa.string(), nullable=True),
        pa.field("caminho_s3", pa.string(), nullable=True),
    ])


class PATableImagem(PATable):
    """Row buffer whose Arrow schema mirrors the fields of ``Imagem``."""

    # Annotated with the Schema type (pa.schema is the factory function, not a type).
    # Two required string columns followed by twelve nullable float32 columns,
    # generated from the name tuple so the type/nullability is stated once.
    schema: ClassVar[pa.Schema] = pa.schema([
        pa.field("id_poligono", pa.string(), nullable=False),
        pa.field("data_imagem", pa.string(), nullable=False),
        *(
            pa.field(column, pa.float32(), nullable=True)
            for column in (
                "no_data",
                "saturated_or_defective",
                "dark_area_pixels",
                "cloud_shadows",
                "vegetation",
                "bare_soils",
                "water",
                "clouds_low_probability_or_unclassified",
                "clouds_medium_probability",
                "clouds_high_probability",
                "thin_cirrus",
                "snow_or_ice",
            )
        ),
    ])


In [85]:
class IcebergTable:
    """Wrapper around a pyiceberg ``Table`` whose rows are looked up by ``id_poligono``.

    Rows read back are converted into instances of ``record_class``; writes take
    a ``PATable`` buffer and send its PyArrow table to the underlying table.
    """

    def __init__(self, table: Table, record_class: Type[Record]) -> None:
        """Store the table handle and the Record subclass used to build results."""
        self.table = table
        self.record_class = record_class

    def load_record(self, record_id: str) -> DataScan:
        """Return a scan restricted to rows whose ``id_poligono`` equals ``record_id``."""
        # A typed expression is used instead of interpolating the id into a filter
        # string, so a quote inside the id cannot break or alter the predicate.
        # str() keeps the comparison a string comparison, as the quoted filter was.
        return self.table.scan(row_filter=EqualTo("id_poligono", str(record_id)))

    def get_record(self, record_id: str) -> Record:
        """Fetch the single row with the given id as a ``record_class`` instance.

        Raises:
            RecordNotFoundError: no row has this id.
            PrimaryKeyDuplicatedError: more than one row has this id.
        """
        tb_arrow = self.load_record(record_id).to_arrow()
        if tb_arrow.num_rows == 0:
            raise RecordNotFoundError(f"No record found with id '{record_id}'")
        if tb_arrow.num_rows > 1:
            raise PrimaryKeyDuplicatedError(f"Polygon Id duplicated with id '{record_id}'")
        # Exactly one row: its column -> value dict becomes the constructor kwargs.
        return self.record_class(**tb_arrow.to_pylist()[0])

    def upsert(self, pyarrow_table: PATable) -> None:
        """Upsert the buffered rows, joining on ``id_poligono``."""
        self.table.upsert(
            df=pyarrow_table.to_pyarrow_table(),
            join_cols=["id_poligono"]
        )

    def append(self, pyarrow_table: PATable) -> None:
        """Append the buffered rows to the table."""
        self.table.append(df=pyarrow_table.to_pyarrow_table())

    def record_exist(self, record_id: str) -> bool:
        """Return True when at least one row has the given id."""
        return self.load_record(record_id).to_arrow().num_rows > 0

# table.delete(delete_filter="id == 3")
# table.refresh()

In [86]:
class IcebergFactory:
    """Builds ``IcebergTable`` wrappers from a catalog shared at class level."""

    # Created once, when the class body is executed, and shared by every call.
    glue_catalog = load_catalog(
        'default',
        type='glue',
        profile='default',
        region_name='us-east-1',
        client={
            'region': 'us-east-1',
        }
    )

    @classmethod
    def load_iceberg_table(cls, database: str, table: str, record_class: Type[Record]) -> IcebergTable:
        """Load ``database.table`` from the catalog and wrap it in an ``IcebergTable``.

        Args:
            database: namespace that contains the table.
            table: table name inside that namespace.
            record_class: Record subclass the wrapper uses for the rows it returns.
        """
        # Separate local name so the `table` string argument is not rebound
        # to a Table object halfway through the method.
        iceberg_table = cls.glue_catalog.load_table((database, table))
        return IcebergTable(iceberg_table, record_class)

In [93]:
# Rebind `table` to the raw pyiceberg handle for tb_imagem
# (an earlier cell pointed it at tb_controle_ingestao).
table = glue_catalog.load_table((DATABASE, TB_IMAGEM))

In [88]:
# table.name_mapping

In [94]:
# Delete the rows of `table` whose id_poligono is 'id_001'.
# NOTE(review): destructive — this executes against the catalog table every time
# the cell runs, including on "Run All"; confirm it should stay in the notebook.
table.delete(delete_filter="id_poligono == 'id_001'")

In [None]:
# table.scan().to_polars().head()

In [15]:
# icerberg_polygon = IcebergTable(table)

### Tabela Controle de Ingestão

In [16]:
# Wrap tb_controle_ingestao so lookups return ControleIngestao records.
# load_iceberg_table is a classmethod, so it is called on the class directly
# instead of on a throwaway IcebergFactory instance.
icerberg_ingestao = IcebergFactory.load_iceberg_table(DATABASE, TB_INGESTAO, ControleIngestao)

In [17]:
# Empty row buffer for the ingestion-control records added below.
pa_table_ingestao = PATableControleIngestao()

In [22]:
# Fetch the single ingestion-control row for this polygon id.
# NOTE(review): the saved output is an error for 'id_002' and names
# RecordNotExistsError, while the class defined in this notebook is
# RecordNotFoundError — the output looks stale; re-run to confirm.
record = icerberg_ingestao.get_record("id_002")

RecordNotExistsError: No record found with id 'id_002'

In [21]:
# Show the fetched record as a plain dict.
# NOTE(review): the saved output is for 'id_001', not the 'id_002' requested in the
# previous cell — presumably left over from an earlier run; confirm.
record.to_dict()

{'id_poligono': 'id_001',
 'data_penultima_ingestao': None,
 'data_ultima_ingestao': '2025-08-29T13:18:59.003000Z'}

In [None]:
# Change the last-ingestion date on the in-memory record only, then read it back
# through to_dict() to check the change is visible there.
record.data_ultima_ingestao = "2023-06-22"
record.to_dict()['data_ultima_ingestao']

'2023-06-22'

In [None]:
# Buffer the modified record and preview the buffer as a pandas DataFrame.
# NOTE(review): the saved output shows columns (poligono, data_plantio, anomesdia)
# that are not in PATableControleIngestao.schema — it appears to be stale.
pa_table_ingestao.add_record(record)
pa_table_ingestao.to_pyarrow_table().to_pandas()

Unnamed: 0,id_poligono,poligono,data_plantio,anomesdia
0,id_001,"POLYGON ((-46.556983701 -23.612093836, -46.556...",2023-06-22,20250810


In [None]:
# pa_table_ingestao.add_record(record)
# pa_table_ingestao.to_pyarrow_table().to_pandas()

In [None]:
# pa_table_ingestao.clear()
# pa_table_ingestao.to_pyarrow_table().to_pandas()

In [20]:
# Build a record by hand (positional order: id_poligono, data_penultima_ingestao,
# data_ultima_ingestao), buffer it and preview the buffer.
record_1 = ControleIngestao("id_001", None, "2025-08-29")
pa_table_ingestao.add_record(record_1)
pa_table_ingestao.to_pyarrow_table().to_pandas()

Unnamed: 0,id_poligono,data_penultima_ingestao,data_ultima_ingestao
0,id_001,,2025-08-29


In [30]:
# Preview the buffer again.
# NOTE(review): the saved output is empty although a record was added in the cell
# above — presumably clear() was run in between; confirm the intended cell order.
pa_table_ingestao.to_pyarrow_table().to_pandas()

Unnamed: 0,id_poligono,data_penultima_ingestao,data_ultima_ingestao


In [31]:
# Build a ControleIngestao from a dict whose keys match the dataclass field names,
# then display it.
record_1_dict = {
    "id_poligono": "id_001",
    "data_penultima_ingestao": None,
    "data_ultima_ingestao": "2025-08-29T13:18:59.003000Z"
}
record_2 = ControleIngestao(**record_1_dict)
record_2

ControleIngestao(id_poligono='id_001', data_penultima_ingestao=None, data_ultima_ingestao='2025-08-29T13:18:59.003000Z')

In [32]:
# Buffer record_2 and preview the buffer.
pa_table_ingestao.add_record(record_2)
pa_table_ingestao.to_pyarrow_table().to_pandas()

Unnamed: 0,id_poligono,data_penultima_ingestao,data_ultima_ingestao
0,id_001,,2025-08-29T13:18:59.003000Z


In [33]:
# Upsert the buffered rows into the Iceberg table (IcebergTable.upsert joins on id_poligono).
icerberg_ingestao.upsert(pa_table_ingestao)

### Tabela Controle de Imagem

In [None]:
# Wrap tb_controle_imagem so lookups return ControleImagem records.
# load_iceberg_table is a classmethod, so it is called on the class directly
# instead of on a throwaway IcebergFactory instance.
icerberg_controle_imagem = IcebergFactory.load_iceberg_table(DATABASE, TB_CONTROLE_IMAGEM, ControleImagem)

In [34]:
# Empty PATableControleImagem row buffer.
# NOTE(review): the name pa_table_imagem is rebound to a PATableImagem buffer
# further down; a distinct name here would avoid mixing the two up.
pa_table_imagem = PATableControleImagem()

In [35]:
# Preview the buffer as a pandas DataFrame (the saved output has no rows yet).
pa_table_imagem.to_pyarrow_table().to_pandas()

Unnamed: 0,id_poligono,poligono,area_poligono,caminho_s3


In [36]:
# Display the PyArrow table itself to see each column's type and nullability.
pa_table_imagem.to_pyarrow_table()

pyarrow.Table
id_poligono: string not null
poligono: string not null
area_poligono: string
caminho_s3: string
----
id_poligono: [[]]
poligono: [[]]
area_poligono: [[]]
caminho_s3: [[]]

In [None]:
# Look up the image-control row for polygon 'id_001'.
# NOTE(review): the saved output is an error that names RecordNotExistsError, while
# the class defined in this notebook is RecordNotFoundError — likely stale output.
icerberg_controle_imagem.get_record("id_001")

RecordNotExistsError: No record found with id 'id_001'

In [None]:
# Check which class get_record returns for this wrapper.
type(icerberg_controle_imagem.get_record("id_101"))

__main__.ControleImagem

### Tabela de Imagens

In [52]:
# Wrap tb_imagem so lookups return Imagem records.
# load_iceberg_table is a classmethod, so it is called on the class directly
# instead of on a throwaway IcebergFactory instance.
icerberg_imagem = IcebergFactory.load_iceberg_table(DATABASE, TB_IMAGEM, Imagem)

In [53]:
# Empty PATableImagem row buffer; this rebinds pa_table_imagem, which held a
# PATableControleImagem buffer in the previous section.
pa_table_imagem = PATableImagem()

In [54]:
# Preview the buffer as a pandas DataFrame (the saved output has no rows yet).
pa_table_imagem.to_pyarrow_table().to_pandas()

Unnamed: 0,id_poligono,data_imagem,no_data,saturated_or_defective,dark_area_pixels,cloud_shadows,vegetation,bare_soils,water,clouds_low_probability_or_unclassified,clouds_medium_probability,clouds_high_probability,thin_cirrus,snow_or_ice


In [59]:
# Build an Imagem record from a dict whose keys match the dataclass field names,
# then display it. This rebinds record_1_dict / record_1 from the ingestion section.
record_1_dict = {
    "id_poligono": "id_001",
    "data_imagem": "2025-09-02T13:18:59.003000Z",
    "no_data": 0.01,
    "saturated_or_defective": 0.01,
    "dark_area_pixels": 0.01,
    "cloud_shadows": 0.01,
    "vegetation": 0.01,
    "bare_soils": 0.01,
    "water": 0.01,
    "clouds_low_probability_or_unclassified": 0.01,
    "clouds_medium_probability": 0.01,
    "clouds_high_probability": 0.01,
    "thin_cirrus": 0.01,
    "snow_or_ice": 0.01,
}
record_1 = Imagem(**record_1_dict)
record_1

Imagem(id_poligono='id_001', data_imagem='2025-09-02T13:18:59.003000Z', no_data=0.01, saturated_or_defective=0.01, dark_area_pixels=0.01, cloud_shadows=0.01, vegetation=0.01, bare_soils=0.01, water=0.01, clouds_low_probability_or_unclassified=0.01, clouds_medium_probability=0.01, clouds_high_probability=0.01, thin_cirrus=0.01, snow_or_ice=0.01)

In [60]:
# Buffer the image record built above.
pa_table_imagem.add_record(record_1)

In [61]:
# Preview the buffer.
# NOTE(review): the saved output has two rows (2025-09-01 and 2025-09-02) but only
# the 2025-09-02 record is added in the visible cells — the other row presumably
# comes from an earlier run of the cells above with a different date; confirm.
pa_table_imagem.to_pyarrow_table().to_pandas()

Unnamed: 0,id_poligono,data_imagem,no_data,saturated_or_defective,dark_area_pixels,cloud_shadows,vegetation,bare_soils,water,clouds_low_probability_or_unclassified,clouds_medium_probability,clouds_high_probability,thin_cirrus,snow_or_ice
0,id_001,2025-09-01T13:18:59.003000Z,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01
1,id_001,2025-09-02T13:18:59.003000Z,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01


In [64]:
# Append the buffered rows to tb_imagem.
# NOTE(review): IcebergTable.append does no key matching, so re-running this cell
# presumably writes the same rows again — confirm before using "Run All".
icerberg_imagem.append(pa_table_imagem)

In [63]:
# icerberg_imagem.upsert(pa_table_imagem)

ValueError: Duplicate rows found in source dataset based on the key columns. No upsert executed

In [None]:
# pa_table_imagem.clear()

In [51]:
# Look up polygon 'id_001' in tb_imagem.
# NOTE(review): get_record raises PrimaryKeyDuplicatedError when several rows share
# the id, which the two-row buffer appended above suggests can happen in this table.
# The saved error output names RecordNotExistsError, a class not defined in this
# notebook — likely stale.
icerberg_imagem.get_record("id_001")

RecordNotExistsError: No record found with id 'id_001'