In [17]:
import boto3
import pyarrow as pa
import pandas as pd

from typing import ClassVar, Type
from datetime import datetime
from pyarrow.fs import S3FileSystem 
from dataclasses import dataclass, asdict
from abc import ABC, abstractmethod

from pyiceberg.catalog import load_catalog
from pyiceberg.catalog.glue import GlueCatalog
from pyiceberg.table import StaticTable, DataScan
from pyiceberg.partitioning import PartitionSpec, PartitionField
from pyiceberg.table.sorting import SortOrder, SortField
from pyiceberg.transforms import IdentityTransform
from pyiceberg.expressions import EqualTo, GreaterThanOrEqual
from pyiceberg.schema import Schema
from pyiceberg.table import Table
from pyiceberg import types as T

from pyiceberg.exceptions import (
    NamespaceAlreadyExistsError,
    NoSuchNamespaceError,
    NoSuchTableError,
    NotInstalledError,
    TableAlreadyExistsError,
)

In [2]:
# Connect to the AWS Glue catalog, registered here under the name 'default'.
# NOTE(review): PyIceberg documents flat property keys (e.g. `glue.region`,
# `glue.profile-name`, `client.region`); confirm that `profile` and the nested
# `client` dict below are actually honoured rather than silently ignored.
glue_catalog = load_catalog(
    'default',
    type='glue',
    profile='default',
    region_name='us-east-1',
    client={
        'region': 'us-east-1',
        # 'access-key-id': 'YOUR_ACCESS_KEY', # Avoid hardcoding in production
        # 'secret-access-key': 'YOUR_SECRET_KEY' # Avoid hardcoding in production
    }
)

In [7]:
# Glue database (Iceberg namespace) and table name used by the cells that follow.
DATABASE = "db_ice"
TB_POLIGONO = "tb_poligono"

In [5]:
# Show every namespace visible through the Glue catalog.
glue_catalog.list_namespaces()

[('base',), ('db_ice',), ('default',), ('sagemaker_featurestore',)]

In [6]:
# Show the tables registered under DATABASE.
glue_catalog.list_tables(DATABASE)

[('db_ice', 'tb_controle_ingestao'), ('db_ice', 'tb_poligono')]

In [9]:
# Confirm the polygon table is registered in the catalog.
glue_catalog.table_exists(f"{DATABASE}.{TB_POLIGONO}")

True

In [8]:
# Load the polygon Iceberg table through the catalog, addressed as (namespace, table).
# table = glue_catalog.load_table("base.loan_2018")
table = glue_catalog.load_table((DATABASE, TB_POLIGONO))

## Insert data

In [131]:
class RecordNotExistsError(Exception):
    """Signals that a lookup matched no record in the table."""

class PrimaryKeyDuplicatedError(Exception):
    """Signals that more than one record carries the same primary key."""

In [100]:
class Record(ABC):
    """Base type for table rows; concrete rows are expected to be dataclasses."""

    def to_dict(self) -> dict:
        """Return this instance's dataclass fields as a plain ``dict``.

        Relies on ``dataclasses.asdict``, so it only works on subclasses that
        are decorated with ``@dataclass``.
        """
        field_values = asdict(self)
        return field_values

In [174]:
@dataclass
class Poligono(Record):
    """One polygon row; the fields mirror the columns of the polygon table schema."""
    # Identifier of the polygon; it is the column upserts are joined on.
    id_poligono: str
    # Polygon geometry as text (the sample record uses a WKT "POLYGON ((...))" string).
    poligono: str
    # Planting date kept as a string; sample values look like "YYYY-MM-DD".
    data_plantio: str
    # Integer date key; sample values look like YYYYMMDD — TODO confirm meaning.
    anomesdia: int


In [187]:
class PATable(ABC):
    """Staging buffer of record dicts that can be turned into a PyArrow table.

    Subclasses provide ``schema`` (the PyArrow schema used to build the table)
    and may override ``primary_key`` when their unique column is not
    ``id_poligono``.
    """

    # Column whose value must be unique among the staged records. The default
    # is the key this class used to hard-code, so existing subclasses keep
    # working unchanged.
    primary_key: ClassVar[str] = "id_poligono"

    @property
    @abstractmethod
    def schema(self):
        """Schema handed to ``pa.Table.from_pylist``; supplied by subclasses."""

    def __init__(self) -> None:
        """Start with no staged records."""
        self.values: list[dict] = []

    def check_unique_id(self, record_id: str) -> None:
        """Ensure no staged record already uses ``record_id`` as its primary key.

        Args:
            record_id: Primary-key value about to be staged.

        Raises:
            PrimaryKeyDuplicatedError: If ``record_id`` is already staged.
        """
        # A generator lets the membership test stop at the first match instead
        # of materialising every staged id on each insert.
        staged_ids = (staged[self.primary_key] for staged in self.values)
        if record_id in staged_ids:
            raise PrimaryKeyDuplicatedError(f"Record with id '{record_id}' already exists.")

    def add_record(self, record: "Record") -> None:
        """Stage ``record`` after checking its primary key is not already staged.

        Args:
            record: Object exposing ``to_dict()``; the resulting dict must
                contain the ``primary_key`` column.

        Raises:
            PrimaryKeyDuplicatedError: If a record with the same key is staged.
            KeyError: If the record has no ``primary_key`` column.
        """
        record_dict = record.to_dict()
        self.check_unique_id(record_id=record_dict[self.primary_key])
        self.values.append(record_dict)

    def clear(self) -> None:
        """Drop every staged record."""
        # Rebind rather than mutate, so a list handed out earlier is not emptied.
        self.values = []

    def to_pyarrow_table(self) -> "pa.Table":
        """Build a PyArrow table from the staged records using ``self.schema``."""
        return pa.Table.from_pylist(
            self.values,
            schema=self.schema
        )


In [176]:
class PATablePolygon(PATable):
    """Staging table for polygon records, fixing the PyArrow schema they are built with."""

    # Column layout for the polygon table; only data_plantio accepts nulls, and
    # anomesdia is stored as an unsigned 32-bit integer.
    schema: pa.Schema = pa.schema([
        pa.field("id_poligono", pa.string(), nullable=False),
        pa.field("poligono", pa.string(), nullable=False),
        pa.field("data_plantio", pa.string(), nullable=True),
        pa.field("anomesdia", pa.uint32(), nullable=False),
    ])


In [177]:
# Sample polygon record; the geometry is given as a WKT POLYGON string.
record_polygon = Poligono(
    id_poligono="1",
    poligono="POLYGON ((-47.9296875 -15.60197258, -47.9296875 -15.79381919, -47.67578125 -15.79381919, -47.67578125 -15.60197258, -47.9296875 -15.60197258))",
    data_plantio="2023-06-01",
    anomesdia=20230601
)

In [None]:
class IcebergTable:
    """Thin wrapper around a PyIceberg table whose rows are keyed by ``id_poligono``."""

    def __init__(self, table: "Table") -> None:
        """Keep the Iceberg table that every method of this wrapper operates on."""
        self.table = table

    def load_record(self, record_id: str) -> "DataScan":
        """Build a scan limited to rows whose ``id_poligono`` equals ``record_id``.

        Args:
            record_id: Primary-key value to look up.

        Returns:
            The (not yet executed) scan over ``self.table``.
        """
        # Bind the id through an expression object rather than splicing it into
        # a filter string, so an id containing a quote cannot break or alter
        # the predicate. str() keeps non-string ids comparing as text, as the
        # previous quoted-literal filter did.
        return self.table.scan(row_filter=EqualTo("id_poligono", str(record_id)))

    def get_record(self, record_id: str) -> "Record":
        """Fetch the single record identified by ``record_id``.

        Args:
            record_id: Primary-key value to look up.

        Returns:
            A ``Poligono`` built from the columns of the matching row.

        Raises:
            RecordNotExistsError: If no row matches.
            PrimaryKeyDuplicatedError: If more than one row matches.
        """
        tb_arrow = self.load_record(record_id).to_arrow()
        if tb_arrow.num_rows == 0:
            raise RecordNotExistsError(f"No record found with id '{record_id}'")
        if tb_arrow.num_rows > 1:
            raise PrimaryKeyDuplicatedError(f"Polygon Id duplicated with id '{record_id}'")
        # Exactly one row is left, so the first entry of each column is that row.
        return Poligono(
            **{
                key: value[0] for key, value
                in tb_arrow.to_pydict().items()
            }
        )

    def upsert(self, pyarrow_table: "PATable") -> None:
        """Upsert the staged records into the wrapped table, matching on ``id_poligono``.

        Args:
            pyarrow_table: Staging table; its ``to_pyarrow_table()`` output is written.
        """
        # Use the wrapped table, not a notebook-level global: the wrapper must
        # write to the table it was constructed with.
        self.table.upsert(
            df=pyarrow_table.to_pyarrow_table(),
            join_cols=["id_poligono"]
        )


# table.delete(delete_filter="id == 3")
# table.refresh()

In [179]:
class IcebergFactory:
    """Creates ``IcebergTable`` wrappers from a Glue catalog held at class level."""

    # Loaded once, when the class body executes, and shared by every call.
    glue_catalog = load_catalog(
        'default',
        type='glue',
        profile='default',
        region_name='us-east-1',
        client={'region': 'us-east-1'},
    )

    @classmethod
    def load_iceberg_table(cls, database: str, table: str) -> IcebergTable:
        """Load ``database``.``table`` from the catalog and wrap it.

        Args:
            database: Namespace that holds the table.
            table: Name of the table inside that namespace.

        Returns:
            An ``IcebergTable`` wrapping the loaded table.
        """
        identifier = (database, table)
        loaded = cls.glue_catalog.load_table(identifier)
        return IcebergTable(loaded)

In [None]:
# table = glue_catalog.load_table((DATABASE, "tb_poligono"))

In [None]:
# icerberg_polygon = IcebergTable(table)

In [180]:
# load_iceberg_table is a classmethod, so it is called on the class directly.
icerberg_polygon = IcebergFactory.load_iceberg_table(
    database=DATABASE,
    table=TB_POLIGONO,
)

In [181]:
# Empty staging table for polygon records.
pa_table_polygon = PATablePolygon()

In [182]:
# Fetch the one record whose id_poligono is "id_001"; raises if it is missing or duplicated.
record = icerberg_polygon.get_record("id_001")

In [183]:
# Planting date as read from the table.
record.to_dict()['data_plantio']

'2023-06-21'

In [184]:
# Change the planting date on the in-memory record only; nothing is written to the table here.
record.data_plantio = "2023-06-22"
record.to_dict()['data_plantio']

'2023-06-22'

In [185]:
# Stage the edited record and preview the resulting Arrow table as a DataFrame.
# Re-running this cell raises PrimaryKeyDuplicatedError because the id is already staged.
pa_table_polygon.add_record(record)
pa_table_polygon.to_pyarrow_table().to_pandas()

Unnamed: 0,id_poligono,poligono,data_plantio,anomesdia
0,id_001,"POLYGON ((-46.556983701 -23.612093836, -46.556...",2023-06-22,20250810


In [188]:
# pa_table_polygon.add_record(record)
# pa_table_polygon.to_pyarrow_table().to_pandas()

In [167]:
# pa_table_polygon.clear()
# pa_table_polygon.to_pyarrow_table().to_pandas()

In [169]:
# Write the staged records to the Iceberg table, matching existing rows on id_poligono.
icerberg_polygon.upsert(pa_table_polygon)



In [None]:
# table.scan().to_polars().head()