In [None]:
# Import required libraries
import pyarrow as pa

import pyiceberg
from pyiceberg.catalog import load_catalog

# Show which PyIceberg release this notebook is running against.
pyiceberg_version = pyiceberg.__version__
print(f"PyIceberg version: {pyiceberg_version}")

## Setup: Connecting to a Catalog

Iceberg uses a catalog to organize tables. For this example, we'll use a `SqlCatalog` with SQLite for local testing.

In [None]:
# Import required libraries
import os
import tempfile

import pyarrow.compute as pc

In [None]:
# Create a fresh scratch directory to serve as the warehouse location.
# mkdtemp() leaves cleanup to the caller, so the directory persists after the run.
warehouse_path = tempfile.mkdtemp(prefix="iceberg_warehouse_")
print("Warehouse location:", warehouse_path)

In [None]:
# Configure and load the catalog.
# The SQLite catalog database and the warehouse root both point inside
# the temporary warehouse directory created earlier.
catalog_uri = f"sqlite:///{warehouse_path}/pyiceberg_catalog.db"
warehouse_uri = f"file://{warehouse_path}"
catalog = load_catalog("default", type="sql", uri=catalog_uri, warehouse=warehouse_uri)

print("Catalog loaded successfully!")
known_namespaces = list(catalog.list_namespaces())
print(f"Namespaces: {known_namespaces}")

## Create a Namespace and Table

Let's create a namespace to hold our Iceberg table. The table itself is created in the next section, once we have sample data to take its schema from.

In [None]:
# Create the namespace only when it is missing, so this cell can be re-run
# without failing because "default" already exists.
catalog.create_namespace_if_not_exists("default")
print(f"Available namespaces: {list(catalog.list_namespaces())}")

## Write Data to an Iceberg Table

We'll create a sample dataset and write it to an Iceberg table.

In [None]:
# Build a small, taxi-like sample dataset with PyArrow.
# Each keyword is a column name; each list holds that column's five values.
data = dict(
    vendor_id=[1, 2, 1, 2, 1],
    trip_distance=[1.5, 2.3, 0.8, 5.2, 3.1],
    fare_amount=[10.0, 15.5, 6.0, 22.0, 18.0],
    tip_amount=[2.0, 3.0, 1.0, 4.5, 3.5],
    passenger_count=[1, 2, 1, 3, 2],
)

df = pa.table(data)
print("Sample data:", df, sep="\n")

In [None]:
# Create an Iceberg table with the schema from our dataframe.
# The "if not exists" variant returns the existing table instead of raising,
# so this cell can be re-run safely.
table = catalog.create_table_if_not_exists(
    "default.sample_trips",
    schema=df.schema,
)

print(f"Created table: {table}")
print(f"Table schema: {table.schema()}")

In [None]:
# Append the dataframe to the table, then count the rows by scanning the table back.
# Note: the count reflects everything currently in the table, not just this append.
table.append(df)
rows_in_table = len(table.scan().to_arrow())
print(f"Rows written: {rows_in_table}")

## Read Data from the Table

Let's read back the data we just wrote.

In [None]:
# Run an unfiltered scan and materialize the whole table as an Arrow table.
full_scan = table.scan()
result = full_scan.to_arrow()
print("Table contents:", result, sep="\n")

## Schema Evolution

One of Iceberg's powerful features is schema evolution. Let's add a new computed column.

In [None]:
# Add a new computed column: tip per mile (tip_amount / trip_distance).
# The guard keeps the cell idempotent: re-running it must not append a second
# "tip_per_mile" column to the same dataframe.
if "tip_per_mile" not in df.column_names:
    tip_per_mile = pc.divide(df["tip_amount"], df["trip_distance"])
    df = df.append_column("tip_per_mile", tip_per_mile)
print("Updated dataframe with new column:")
print(df)

In [None]:
# Evolve the table schema so it also covers the new dataframe column.
# The dataframe schema is passed to union_by_name inside the update context.
with table.update_schema() as schema_update:
    schema_update.union_by_name(df.schema)

print("Schema evolved!")
evolved_schema = table.schema()
print(f"Updated table schema: {evolved_schema}")

In [None]:
# Overwrite the table with the dataframe that carries the new column
table.overwrite(df)
print("Data overwritten with new schema")

# Read the table back to verify the new column exists
verification_scan = table.scan()
result = verification_scan.to_arrow()
print(result)

## Filtering Data

PyIceberg supports predicate pushdown for efficient data filtering.

In [None]:
# Keep only the rows where tip_per_mile > 1.0.
# The predicate is handed to the scan as a row_filter string.
tip_filter = "tip_per_mile > 1.0"
filtered_df = table.scan(row_filter=tip_filter).to_arrow()
matching_rows = len(filtered_df)
print(f"Rows with tip_per_mile > 1.0: {matching_rows}")
print(filtered_df)

## Inspect Table Metadata

Iceberg tables maintain rich metadata about their structure and history.

In [None]:
# View table properties
print(f"Table location: {table.location()}")
print(f"Table properties: {table.properties}")

# Print the snapshot's ID rather than the whole snapshot object, so the output
# matches its label. Fall back to None in case the table has no snapshot yet.
current_snapshot = table.current_snapshot()
current_snapshot_id = current_snapshot.snapshot_id if current_snapshot is not None else None
print(f"Current snapshot ID: {current_snapshot_id}")

In [None]:
# Print one line per entry returned by table.history()
history_entries = table.history()
print("Table history:")
for entry in history_entries:
    print(f"  Snapshot: {entry}")

## Explore Data Files

Let's see what files Iceberg created in the warehouse.

In [None]:
# Print the warehouse directory as an indented tree.
# Depth is the number of path separators left after removing the warehouse
# prefix; each level indents by two spaces, and files sit one level deeper
# than the directory that contains them.
for dirpath, _subdirs, filenames in os.walk(warehouse_path):
    depth = dirpath.replace(warehouse_path, "").count(os.sep)
    dir_pad = "  " * depth
    file_pad = "  " * (depth + 1)
    print(f"{dir_pad}{os.path.basename(dirpath)}/")
    for filename in filenames:
        print(f"{file_pad}{filename}")

## Additional Operations

PyIceberg supports many more operations including:
- Time travel queries
- Partition evolution
- Table maintenance (expire snapshots, rewrite data files)
- Integration with pandas, DuckDB, Ray, and more

Check the [PyIceberg documentation](https://py.iceberg.apache.org/) for more details!