In [1]:
from pyiceberg.catalog import load_catalog

# Load the catalog without naming it explicitly. Later cells call
# load_catalog("default") instead -- presumably both resolve to the same
# catalog; TODO confirm against the local PyIceberg configuration.
catalog = load_catalog()

In [2]:
# List the namespaces visible in the catalog; each entry is a tuple of
# name parts (see the printed output below).
ns = catalog.list_namespaces()

print(ns)

[('default',), ('drivestats',), ('ds',)]


In [5]:
from pyiceberg.partitioning import PartitionField, PartitionSpec
from pyiceberg.schema import Schema
from pyiceberg.table.sorting import SortField, SortOrder
from pyiceberg.transforms import DayTransform, IdentityTransform
from pyiceberg.types import (
    DoubleType,
    FloatType,
    NestedField,
    StringType,
    StructType,
    TimestampType,
)

# Schema of the `bids` table. Field IDs must be unique across the whole
# schema, nested struct members included, so `created_by` gets its own ID (6)
# instead of reusing 4, which already belongs to `ask`. This is the ID the
# catalog reports for that field in the recorded output of this cell.
schema = Schema(
    NestedField(field_id=1, name="datetime", field_type=TimestampType(), required=True),
    NestedField(field_id=2, name="symbol", field_type=StringType(), required=True),
    NestedField(field_id=3, name="bid", field_type=FloatType(), required=False),
    NestedField(field_id=4, name="ask", field_type=DoubleType(), required=False),
    NestedField(
        field_id=5,
        name="details",
        field_type=StructType(
            NestedField(
                field_id=6, name="created_by", field_type=StringType(), required=False
            ),
        ),
        required=False,
    ),
)

# Partition by calendar day of the `datetime` column (source field ID 1).
partition_spec = PartitionSpec(
    PartitionField(
        source_id=1, field_id=1000, transform=DayTransform(), name="datetime_day"
    )
)

# Sort on the symbol (source field ID 2), unchanged by the identity transform.
sort_order = SortOrder(SortField(source_id=2, transform=IdentityTransform()))

# Create the table; as the last expression of the cell, the returned table is
# what gets displayed as the cell output.
catalog.create_table(
    identifier="drivestats.bids",
    schema=schema,
    location="s3a://drivestats-iceberg/bids",
    partition_spec=partition_spec,
    sort_order=sort_order,
)

bids(
  1: datetime: required timestamp,
  2: symbol: required string,
  3: bid: optional float,
  4: ask: optional double,
  5: details: optional struct<6: created_by: optional string>
),
partition by: [datetime_day],
sort order: [2 ASC NULLS FIRST],
snapshot: null

In [20]:
import pyarrow as pa

# One dict per row; the keys become the Arrow column names.
city_rows = [
    {"city": "Amsterdam", "lat": 52.371807, "long": 4.896029},
    {"city": "San Francisco", "lat": 37.773972, "long": -122.431297},
    {"city": "Drachten", "lat": 53.11254, "long": 6.0989},
    {"city": "Paris", "lat": 48.864716, "long": 2.349014},
]

# Build the in-memory Arrow table that is appended to the Iceberg table later.
df = pa.Table.from_pylist(city_rows)

In [21]:
from pyiceberg.catalog import load_catalog

# Re-load the catalog, this time by its explicit name. This rebinds the
# `catalog` variable created in the first cell.
catalog = load_catalog("default")

# Load the existing cities table. NOTE(review): this requires the table to
# exist already; the cell that creates it appears further down and has no
# execution count, so a fresh Restart & Run All presumably fails here unless
# the table was created in an earlier session -- confirm.
tbl = catalog.load_table("drivestats.cities")

In [None]:
from pyiceberg.catalog import load_catalog
from pyiceberg.schema import Schema
from pyiceberg.types import DoubleType, NestedField, StringType

catalog = load_catalog("default")

# Schema of the `cities` table: a name plus latitude/longitude, all optional.
# The column names match the keys of the row dicts appended elsewhere in this
# notebook.
schema = Schema(
    NestedField(1, "city", StringType(), required=False),
    NestedField(2, "lat", DoubleType(), required=False),
    NestedField(3, "long", DoubleType(), required=False),
)

# Use the idempotent variant so this cell can be re-run: a plain create_table
# fails once the table exists, whereas create_table_if_not_exists returns the
# existing table instead of raising.
tbl = catalog.create_table_if_not_exists(
    identifier="drivestats.cities",
    schema=schema,
    location="s3a://drivestats-iceberg/cities",
)

In [22]:
# Append the rows currently held in `df` to the table. This is not idempotent:
# re-running the cell appends the same rows again, which is why the recorded
# scan output below lists some cities twice.
tbl.append(df)
# Alternative (left disabled): overwrite the table with `df` rather than appending to it.
# tbl.overwrite(df)

In [23]:
# Scan the table without any filter and materialise the result as a PyArrow
# table. NOTE(review): the recorded output includes a "Groningen" row that is
# only appended in a cell further down (execution count 11), so the saved
# output reflects out-of-order execution.
tbl.scan().to_arrow()

pyarrow.Table
city: large_string
lat: double
long: double
----
city: [["Amsterdam","San Francisco","Drachten","Paris"],["Amsterdam","San Francisco","Drachten"],["Groningen"]]
lat: [[52.371807,37.773972,53.11254,48.864716],[52.371807,37.773972,53.11254],[53.21917]]
long: [[4.896029,-122.431297,6.0989,2.349014],[4.896029,-122.431297,6.0989],[6.56667]]

In [11]:
# A single extra row; rebinding `df` replaces the four-city table built earlier.
groningen_rows = [{"city": "Groningen", "lat": 53.21917, "long": 6.56667}]
df = pa.Table.from_pylist(groningen_rows)

# Add the new row to the Iceberg table.
tbl.append(df)

In [13]:
# Delete every row whose city is 'Paris'. In the recorded snapshot listing
# below, the third snapshot has operation "overwrite" -- presumably the
# snapshot produced by this delete; confirm.
tbl.delete(delete_filter="city == 'Paris'")

In [15]:
# Show the table's snapshots as a PyArrow table: commit time, snapshot and
# parent IDs, operation, manifest list path and summary map.
tbl.inspect.snapshots()

pyarrow.Table
committed_at: timestamp[ms] not null
snapshot_id: int64 not null
parent_id: int64
operation: string
manifest_list: string not null
summary: map<string, string>
  child 0, entries: struct<key: string not null, value: string> not null
      child 0, key: string not null
      child 1, value: string
----
committed_at: [[2024-12-18 02:14:09.175,2024-12-18 02:14:57.526,2024-12-18 02:15:40.163]]
snapshot_id: [[3715489976768252193,6382132783291269597,5555822186573786383]]
parent_id: [[null,3715489976768252193,6382132783291269597]]
operation: [["append","append","overwrite"]]
manifest_list: [["s3a://drivestats-iceberg/cities/metadata/snap-3715489976768252193-0-b29cc744-fbb4-45a6-8c25-e62bb0dff8d7.avro","s3a://drivestats-iceberg/cities/metadata/snap-6382132783291269597-0-6da3aeb0-9fda-4cc5-8a77-86f7c757fcef.avro","s3a://drivestats-iceberg/cities/metadata/snap-5555822186573786383-0-03b24080-0267-4bba-81e7-3ea8517ca850.avro"]]
summary: [[keys:["added-files-size","added-data-files","

In [16]:
# Show record/file counts, delete-file counts and last-update info for the
# table's partitions. The recorded output has a single row -- presumably
# because the cities table is created without a partition spec; confirm.
tbl.inspect.partitions()

pyarrow.Table
record_count: int64 not null
file_count: int32 not null
total_data_file_size_in_bytes: int64 not null
position_delete_record_count: int64 not null
position_delete_file_count: int32 not null
equality_delete_record_count: int64 not null
equality_delete_file_count: int32 not null
last_updated_at: timestamp[ms]
last_updated_snapshot_id: int64
----
record_count: [[4]]
file_count: [[2]]
total_data_file_size_in_bytes: [[2623]]
position_delete_record_count: [[0]]
position_delete_file_count: [[0]]
equality_delete_record_count: [[0]]
equality_delete_file_count: [[0]]
last_updated_at: [[2024-12-18 02:15:40.163]]
last_updated_snapshot_id: [[5555822186573786383]]

In [17]:
# Show the table's entries: status, snapshot ID, sequence numbers and a nested
# `data_file` struct (path, format, record count, size, column metrics).
tbl.inspect.entries()

pyarrow.Table
status: int8 not null
snapshot_id: int64 not null
sequence_number: int64 not null
file_sequence_number: int64 not null
data_file: struct<content: int8 not null, file_path: string not null, file_format: string not null, partition: struct<> not null, record_count: int64 not null, file_size_in_bytes: int64 not null, column_sizes: map<int32, int64>, value_counts: map<int32, int64>, null_value_counts: map<int32, int64>, nan_value_counts: map<int32, int64>, lower_bounds: map<int32, binary>, upper_bounds: map<int32, binary>, key_metadata: binary, split_offsets: list<item: int64>, equality_ids: list<item: int32>, sort_order_id: int32> not null
  child 0, content: int8 not null
  child 1, file_path: string not null
  child 2, file_format: string not null
  child 3, partition: struct<> not null
  child 4, record_count: int64 not null
  child 5, file_size_in_bytes: int64 not null
  child 6, column_sizes: map<int32, int64>
      child 0, entries: struct<key: int32 not null, value: in

In [18]:
# Show the table's named references and their retention settings; the
# recorded output has a single "main" reference of type BRANCH.
tbl.inspect.refs()

pyarrow.Table
name: string not null
type: dictionary<values=string, indices=int32, ordered=0> not null
snapshot_id: int64 not null
max_reference_age_in_ms: int64
min_snapshots_to_keep: int32
max_snapshot_age_in_ms: int64
----
name: [["main"]]
type: [  -- dictionary:
["BRANCH"]  -- indices:
[0]]
snapshot_id: [[5555822186573786383]]
max_reference_age_in_ms: [[null]]
min_snapshots_to_keep: [[null]]
max_snapshot_age_in_ms: [[null]]

In [24]:
# Show the table's files with their format, spec ID, record count, size and
# per-column metric maps (sizes, value/null/NaN counts, lower/upper bounds).
tbl.inspect.files()

pyarrow.Table
content: int8 not null
file_path: string not null
file_format: dictionary<values=string, indices=int32, ordered=0> not null
spec_id: int32 not null
record_count: int64 not null
file_size_in_bytes: int64 not null
column_sizes: map<int32, int64>
  child 0, entries: struct<key: int32 not null, value: int64> not null
      child 0, key: int32 not null
      child 1, value: int64
value_counts: map<int32, int64>
  child 0, entries: struct<key: int32 not null, value: int64> not null
      child 0, key: int32 not null
      child 1, value: int64
null_value_counts: map<int32, int64>
  child 0, entries: struct<key: int32 not null, value: int64> not null
      child 0, key: int32 not null
      child 1, value: int64
nan_value_counts: map<int32, int64>
  child 0, entries: struct<key: int32 not null, value: int64> not null
      child 0, key: int32 not null
      child 1, value: int64
lower_bounds: map<int32, binary>
  child 0, entries: struct<key: int32 not null, value: binary> not n