In [1]:
import os

import polars as pl
from pyiceberg.catalog import load_catalog
from pyiceberg.schema import Schema
from pyiceberg.types import BooleanType, DoubleType, LongType, StringType, TimestampType, NestedField, IntegerType
from pyiceberg.transforms import MonthTransform

First we need to connect to our Iceberg catalogue - since currently the client is doing the reading and writing, we also set the access credentials. This can also be done in a `.pyiceberg.yaml` file.

Note that normally, the REST catalogue can handle all S3 auth, sending a signed S3 url to the client to upload - but that would require additional auth setup, so we pass the client credentials directly here

In [2]:
# Connect to the Nessie Iceberg REST catalog. Every setting can be overridden
# through an environment variable so that real credentials never have to be
# committed with the notebook; the fallbacks are the values used by this demo.
catalog = load_catalog(
    "nessie",
    **{
        "uri": os.environ.get("NESSIE_ICEBERG_URI", "http://nessie:19120/iceberg"),
        "s3.access-key-id": os.environ.get("S3_ACCESS_KEY_ID", "minio"),
        "s3.secret-access-key": os.environ.get("S3_SECRET_ACCESS_KEY", "minio1234"),
    },
)

Iceberg metadata is organized in `namespaces` which would be the equivalent of a schema in a database.

Here we name our namespace `steam` to represent data that comes from Steam

In [6]:
# Create the namespace only when it is missing, then list what the catalog holds.
catalog.create_namespace_if_not_exists(namespace="steam")
catalog.list_namespaces()

[('steam',)]

We can define a schema using pyiceberg. Note that each field needs a unique id within the schema, as Iceberg handles schema migrations by referencing each field by its id rather than by its name or position, but we'll see an example of that later.

We will create a table containing a small subset for demonstration purposes, as it's a bit tedious to write out the whole schema by hand 😅

In [7]:
# Field definitions for the demo table; each field carries its own unique id.
review_fields = [
    NestedField(id=1, name='recommendationid', type=LongType()),
    NestedField(id=2, name='language', type=StringType()),
    NestedField(id=3, name='timestamp_created', type=TimestampType()),
    NestedField(id=4, name='voted_up', type=BooleanType()),
]
schema = Schema(*review_fields)

Now we're ready to create the table, by passing the schema to the catalog. It will take care of writing a metadata file in the object storage.

In [8]:
# Create the table only when it is not registered yet, mirroring the
# `create_namespace_if_not_exists` call used for the namespace. Re-running this
# cell then loads the existing table instead of raising.
table = catalog.create_table_if_not_exists("steam.languages", schema=schema)

Let's insert some data into the table. Pyiceberg supports Arrow out of the box, so we use Polars to read data in and turn it into Arrow format

In [8]:
# Keep the four columns that match the Iceberg schema, turning the epoch column
# into a datetime and the vote flag into a boolean, then drop the rows that have
# no recommendation id.
review_columns = [
    pl.col('recommendationid'),
    pl.col('language'),
    pl.from_epoch(pl.col('timestamp_created')),
    pl.col('voted_up').cast(pl.Boolean),
]
df = (
    pl.read_csv("data/10.csv")
    .select(review_columns)
    .filter(pl.col('recommendationid').is_not_null())
)

Now we can `overwrite` or `append` this data

In [9]:
# Replace the table contents with the Arrow representation of `df`.
reviews_arrow = df.to_arrow()
table.overwrite(reviews_arrow)

  Expected `TableIdentifier` but got `dict` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_json(


Now that there's data in the table, we can scan the Iceberg table, filtering out the records we don't need. Pyiceberg will use the metadata stored in Iceberg to quickly locate the correct files and read only the relevant parts of those files

In [10]:
# Read the whole table into an Arrow table.
full_scan = table.scan()
t = full_scan.to_arrow()



In [11]:
# Count the rows returned by a full scan of the table.
row_count = table.scan().to_arrow().num_rows
print(f"Total rows in table: {row_count:,}")

Total rows in table: 238,827


In [12]:
# Project two columns and keep only the English reviews.
# Alternatively use expressions -> EqualTo('language', 'english')
english_scan = table.scan(
    row_filter="language == 'english'",
    selected_fields=['language', 'voted_up'],
)
new_df = pl.from_arrow(english_scan.to_arrow())
new_df

language,voted_up
str,bool
"""english""",true
"""english""",true
"""english""",true
"""english""",true
"""english""",true
…,…
"""english""",true
"""english""",true
"""english""",true
"""english""",true


One problem we have now, is that we don't actually have the game id in the table - that would be pretty useful. If we just try to insert data with the added column, that wouldn't work as the Iceberg schema doesn't contain the `game_id` column and will error to protect the data.

In [13]:
# Tag every review with the id of the game it belongs to.
df_game = df.with_columns(game_id=pl.lit("10"))

# The table schema has no `game_id` column yet, so this write is expected to be
# rejected. Catch and display the error instead of letting it escape, so that
# "Run All" keeps going past this demonstration.
try:
    table.overwrite(df_game.to_arrow())
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: PyArrow table contains more columns: game_id. Update the schema first (hint, use union_by_name).

Luckily in Iceberg, we can update the schema without having to rewrite all the physical files, much like in a traditional RDBMS

In [14]:
# Add a string column named `game_id` through a schema update.
with table.update_schema() as schema_update:
    schema_update.add_column(path='game_id', field_type=StringType())

  Expected `TableIdentifier` but got `dict` - serialized value may not be as expected
  lambda x, h: h(x), schema=core_schema.any_schema()


In [15]:
# Display the table's current schema to check the newly added column.
table.schema()

Schema(NestedField(field_id=1, name='recommendationid', field_type=LongType(), required=False), NestedField(field_id=2, name='language', field_type=StringType(), required=False), NestedField(field_id=3, name='timestamp_created', field_type=TimestampType(), required=False), NestedField(field_id=4, name='voted_up', field_type=BooleanType(), required=False), NestedField(field_id=5, name='game_id', field_type=StringType(), required=False), schema_id=1, identifier_field_ids=[])

PyIceberg makes sure to give it a valid ID and puts it at the end of the schema. We can move the fields around as we want by updating the metadata, Iceberg keeps track of the corresponding data positions without having to rewrite the files

In [16]:
# Reorder the schema so that `game_id` becomes the first column.
with table.update_schema() as schema_update:
    schema_update.move_first(path='game_id')

In [17]:
# Display the schema again to check the new column order.
table.schema()

Schema(NestedField(field_id=5, name='game_id', field_type=StringType(), required=False), NestedField(field_id=1, name='recommendationid', field_type=LongType(), required=False), NestedField(field_id=2, name='language', field_type=StringType(), required=False), NestedField(field_id=3, name='timestamp_created', field_type=TimestampType(), required=False), NestedField(field_id=4, name='voted_up', field_type=BooleanType(), required=False), schema_id=2, identifier_field_ids=[])

In [18]:
# Replace the table contents with the frame that includes `game_id`.
game_reviews_arrow = df_game.to_arrow()
table.overwrite(game_reviews_arrow)

`game_id` should now be the first column of the table

In [19]:
# Scan the whole table and show it as a Polars frame.
current_rows = table.scan().to_arrow()
pl.from_arrow(current_rows)

game_id,recommendationid,language,timestamp_created,voted_up
str,i64,str,datetime[μs],bool
"""10""",147937429,"""english""",2023-10-09 18:11:42,true
"""10""",166664841,"""russian""",2024-06-04 14:08:20,true
"""10""",166664763,"""russian""",2024-06-04 14:06:49,true
"""10""",166663001,"""turkish""",2024-06-04 13:36:22,false
"""10""",166658743,"""brazilian""",2024-06-04 12:16:25,true
…,…,…,…,…
"""10""",8227337,"""polish""",2013-12-23 16:17:23,true
"""10""",149330962,"""russian""",2023-11-01 19:58:03,true
"""10""",149284037,"""english""",2023-11-01 00:58:41,true
"""10""",127959835,"""schinese""",2022-12-05 04:31:44,true


Now we're ready to insert some more data using `.append`

In [20]:
# Load the reviews from `data/289070.csv`, tagging every row with that game id
# and converting the epoch and vote columns as they are selected.
civ6_columns = [
    pl.lit('289070').alias('game_id'),
    pl.col('recommendationid'),
    pl.col('language'),
    pl.from_epoch(pl.col('timestamp_created')),
    pl.col('voted_up').cast(pl.Boolean),
]
civ6_df = pl.read_csv('data/289070.csv').select(civ6_columns)
civ6_df.head()

game_id,recommendationid,language,timestamp_created,voted_up
str,i64,str,datetime[μs],bool
"""289070""",166714446,"""schinese""",2024-06-05 05:41:08,True
"""289070""",134583732,"""english""",2023-03-13 05:33:40,True
"""289070""",166713515,"""schinese""",2024-06-05 05:14:50,True
"""289070""",166712974,"""schinese""",2024-06-05 04:59:06,True
"""289070""",166712070,"""english""",2024-06-05 04:34:35,True


In [21]:
# Append the new rows to the existing table contents.
civ6_arrow = civ6_df.to_arrow()
table.append(civ6_arrow)

In [22]:
# Re-read the table, report its size and list the distinct game ids it holds.
t = table.scan().to_arrow()
print(f"Current rows: {t.num_rows:,}")
pl.from_arrow(t).select(pl.col('game_id').unique())

Current rows: 549,712


game_id
str
"""10"""
"""289070"""


Let's add some more data.

In [23]:
# Load the reviews from `data/730.csv`, tagging every row with that game id
# and converting the epoch and vote columns as they are selected.
cs2_columns = [
    pl.lit('730').alias('game_id'),
    pl.col('recommendationid'),
    pl.col('language'),
    pl.from_epoch(pl.col('timestamp_created')),
    pl.col('voted_up').cast(pl.Boolean),
]
cs2 = pl.read_csv('data/730.csv').select(cs2_columns)
cs2.head()

game_id,recommendationid,language,timestamp_created,voted_up
str,i64,str,datetime[μs],bool
"""730""",145242121,"""russian""",2023-08-29 06:38:44,True
"""730""",142695062,"""russian""",2023-07-24 10:19:31,True
"""730""",167219799,"""schinese""",2024-06-12 14:58:47,True
"""730""",167219772,"""english""",2024-06-12 14:58:21,True
"""730""",167219768,"""schinese""",2024-06-12 14:58:17,True


In [24]:
# Append the new rows to the existing table contents.
cs2_arrow = cs2.to_arrow()
table.append(cs2_arrow)

In [25]:
# Count the rows returned by a full scan of the table.
total_rows = table.scan().to_arrow().num_rows
print(f"The table has {total_rows:,} rows")

The table has 8,197,038 rows


What if I find out that the end-users are mainly trying to analyze the upvotes over time? The Dashboard has been built, but it's not performing as they want. Partitioning might be the answer, but with Iceberg, we don't have to physically rewrite all the data:

In [26]:
# Add a partition field derived from `timestamp_created` with a month transform.
with table.update_spec() as spec_update:
    spec_update.add_field(
        source_column_name="timestamp_created",
        transform=MonthTransform(),
        partition_field_name='month_created',
    )

Nothing has happened to the data yet: only the Iceberg metadata has been updated with the new partition spec - no need to rewrite all the files. 

If we insert new data, it will be partitioned by our new partitioning schema

In [27]:
# Load the reviews from `data/578080.csv`, tagging every row with that game id
# and converting the epoch and vote columns as they are selected.
pubg_columns = [
    pl.lit('578080').alias('game_id'),
    pl.col('recommendationid'),
    pl.col('language'),
    pl.from_epoch(pl.col('timestamp_created')),
    pl.col('voted_up').cast(pl.Boolean),
]
pubg = pl.read_csv('data/578080.csv').select(pubg_columns)
pubg.head()

game_id,recommendationid,language,timestamp_created,voted_up
str,i64,str,datetime[μs],bool
"""578080""",167085096,"""schinese""",2024-06-10 12:55:59,True
"""578080""",167085090,"""schinese""",2024-06-10 12:55:53,True
"""578080""",167085008,"""schinese""",2024-06-10 12:54:24,False
"""578080""",167083530,"""russian""",2024-06-10 12:22:08,False
"""578080""",167084941,"""schinese""",2024-06-10 12:52:58,True


In [28]:
# Append the new rows; this write happens after the partition spec was updated.
pubg_arrow = pubg.to_arrow()
table.append(pubg_arrow)

Iceberg knows the partitioning scheme for each of the manifest lists and will generate a plan for each file independently

![Partition Spec Evolution](images/partition_spec_evolution.png)

> https://iceberg.apache.org/docs/latest/evolution/#partition-evolution

# Write-Audit-Publish with Git-for-data branches

Nessie provides the ability to perform a Write-Audit-Publish pattern, through git branching of data. Using the mechanisms of Iceberg, Nessie can keep track of the different branches of data, allowing us git-like semantics for working with data. Pyiceberg doesn't yet support this Nessie-specific syntax, so we're going to switch to another project, Dremio. We could also have chosen a number of other query engines such as Apache Spark, or Trino/Presto as well.

A nice thing about Dremio is that since it's Arrow-backed internally, they also expose FlightSQL endpoints, letting us use ADBC as a generic DB client.

In [9]:
from adbc_driver_flightsql.dbapi import connect
from adbc_driver_flightsql import DatabaseOptions

In [10]:
# Open an ADBC FlightSQL connection to Dremio. The endpoint and the credentials
# can be overridden through environment variables so that real secrets never
# have to be committed with the notebook; the fallbacks are this demo's values.
conn = connect(
    os.environ.get("DREMIO_FLIGHT_URI", "grpc+tcp://dremio:32010"),
    db_kwargs={
        "username": os.environ.get("DREMIO_USERNAME", "dremio"),
        "password": os.environ.get("DREMIO_PASSWORD", "dremio123"),
        DatabaseOptions.WITH_COOKIE_MIDDLEWARE.value: "true",
    },
)



In [31]:
# Smoke-test the connection with a trivial query.
with conn.cursor() as cur:
    cur.execute("SELECT 1")
    first_row = cur.fetchone()
    print(first_row)

(1,)


Dremio has been pre-configured to talk to Nessie, and since Iceberg is client-agnostic, Dremio can read all the Iceberg tables registered in the Nessie catalogue. We're taking advantage of the Arrow-based nature of Dremio, Polars and ADBC to be able to interact directly

In [35]:
# Read the Iceberg table through Dremio straight into a Polars frame.
pl.read_database(query="SELECT * FROM Nessie.steam.languages", connection=conn)

game_id,recommendationid,language,timestamp_created,voted_up
str,i64,str,datetime[ms],bool
"""578080""",30930293,"""german""",2017-04-03 20:24:14,false
"""578080""",31421053,"""english""",2017-04-28 17:27:35,false
"""578080""",31322738,"""german""",2017-04-23 15:43:28,true
"""578080""",31115003,"""english""",2017-04-13 12:02:13,false
"""578080""",31150783,"""koreana""",2017-04-15 07:23:40,true
…,…,…,…,…
"""578080""",163949610,"""russian""",2024-05-01 00:05:27,true
"""578080""",163949553,"""turkish""",2024-05-01 00:04:15,true
"""578080""",163949534,"""english""",2024-05-01 00:03:58,true
"""578080""",163949515,"""koreana""",2024-05-01 00:03:45,true


Where before, `pyiceberg` let us filter the data, a query engine like Dremio can do SQL to do all the analytics we're used to from a database.

In [36]:
# Share of positive votes per review language, highest ratio first.
sql = """
    SELECT "language", 
    SUM(CAST(voted_up as int)) / CAST(COUNT(voted_up) as float) as ratio_positive_votes
    FROM Nessie.steam.languages 
    GROUP BY "language"
    ORDER BY 2 DESC
    """
pl.read_database(query=sql, connection=conn)

language,ratio_positive_votes
str,f32
"""romanian""",0.942233
"""portuguese""",0.936969
"""brazilian""",0.923251
"""danish""",0.918101
"""swedish""",0.907861
…,…
"""japanese""",0.721831
"""koreana""",0.706234
"""schinese""",0.68408
"""tchinese""",0.659531


## Write
In this example, we have our raw extract data in the Extract source - a CSV file for each game. Let's prepare a staging table for the CSV files, as they have a slightly different format than our final table

In [11]:
# DDL for the staging table that receives the raw CSV rows.
create_staging_sql = """ 
    CREATE TABLE IF NOT EXISTS Nessie.steam.staging.languages (
        game_id varchar(50),
        recommendationid int,
        "language" varchar(20),
        timestamp_created int,
        voted_up boolean
    )
    """
with conn.cursor() as cur:
    cur.execute(create_staging_sql)
    status_row = cur.fetchone()
    # The second column of the result row is the status text that gets printed.
    print(status_row[1])

Table created


In the Write stage, we start by creating a new branch where we can stage all our data changes

In [16]:
# Create the branch that all staged changes will be written to.
with conn.cursor() as cur:
    cur.execute("CREATE BRANCH insert_demo AT BRANCH main IN Nessie")
    status_row = cur.fetchone()
    print(status_row[1])

Branch insert_demo2 has been created at branch main in source Nessie.


Now we can `COPY INTO` our staging table from our Extract file source

In [17]:
# Load the raw CSV extract into the staging table on the `insert_demo` branch.
# The branch name must match the one created earlier and queried by every later
# step; the previous `insert_demo2` did not.
with conn.cursor() as c:
    c.execute("""
    COPY INTO Nessie.steam.staging.languages 
    AT BRANCH insert_demo
    FROM '@Extract/extract/550.csv'
    ( EXTRACT_HEADER true, TRIM_SPACE true )
    """)
    # The first column of the result row is the number of rows loaded.
    print(f"Inserted {c.fetchone()[0]:,} rows")

Inserted 64 rows


We can verify that the main branch doesn't see any data

In [43]:
# Query the staging table without naming a branch.
pl.read_database(query="SELECT * FROM Nessie.steam.staging.languages", connection=conn)

game_id,recommendationid,language,timestamp_created,voted_up
str,i32,str,i32,bool


While the `insert_demo` branch does

In [42]:
# Query the staging table as it looks on the `insert_demo` branch.
pl.read_database(
    query="SELECT * FROM Nessie.steam.staging.languages AT BRANCH insert_demo",
    connection=conn,
)

game_id,recommendationid,language,timestamp_created,voted_up
str,i32,str,i32,bool
,166655315,"""english""",1717498886,true
,166655283,"""russian""",1717498848,false
,166655274,"""english""",1717498839,true
,166654220,"""turkish""",1717497314,true
,166653942,"""ukrainian""",1717496923,true
…,…,…,…,…
,2141528,"""english""",1287211584,true
,2187504,"""russian""",1287198543,true
,2645097,"""english""",1287191253,true
,367214,"""english""",1287186381,true


Since the `game_id` is stored in the file name instead of the data itself, we need to update our staged data with the game id.

In [None]:
# Switch the session's version context to the `insert_demo` branch.
with conn.cursor() as cur:
    cur.execute("USE BRANCH insert_demo IN Nessie;")
    status_row = cur.fetchone()
    print(status_row[1])

In [18]:
# Fill in the game id for the staged rows, which were loaded without one because
# the id only exists in the file name.
with conn.cursor() as c:
    c.execute("UPDATE Nessie.steam.staging.languages AT BRANCH insert_demo SET game_id = '550' where game_id is null")
    # This statement updates existing rows, so report them as updated rather
    # than inserted.
    print(f"Updated {c.fetchone()[0]:,} rows")

ProgrammingError: INVALID_ARGUMENT: [FlightSQL] Cannot assign to target field 'D_R_E_M_I_O_D_A_T_A_F_I_L_E_F_I_L_E_P_A_T_H' of type VARCHAR(65536) from source field 'D_R_E_M_I_O_D_A_T_A_F_I_L_E_F_I_L_E_P_A_T_H' of type VARCHAR(65536) NOT NULL
startLine 1
startColumn 8
endLine 1
endColumn 37
SQL Query UPDATE Nessie.steam.staging.languages AT BRANCH insert_demo2 SET game_id = '209850' where game_id is null (InvalidArgument; Prepare)

Still no rows in the main branch

In [47]:
# Show the staged rows for game 550 on each branch in turn.
for branch in ("main", "insert_demo"):
    print(f"Branch {branch}")
    print(pl.read_database(f"SELECT * FROM Nessie.steam.staging.languages AT BRANCH {branch} WHERE game_id = '550'", conn))

Branch main
shape: (0, 5)
┌─────────┬──────────────────┬──────────┬───────────────────┬──────────┐
│ game_id ┆ recommendationid ┆ language ┆ timestamp_created ┆ voted_up │
│ ---     ┆ ---              ┆ ---      ┆ ---               ┆ ---      │
│ str     ┆ i32              ┆ str      ┆ i32               ┆ bool     │
╞═════════╪══════════════════╪══════════╪═══════════════════╪══════════╡
└─────────┴──────────────────┴──────────┴───────────────────┴──────────┘
Branch insert_demo
shape: (858_570, 5)
┌─────────┬──────────────────┬───────────┬───────────────────┬──────────┐
│ game_id ┆ recommendationid ┆ language  ┆ timestamp_created ┆ voted_up │
│ ---     ┆ ---              ┆ ---       ┆ ---               ┆ ---      │
│ str     ┆ i32              ┆ str       ┆ i32               ┆ bool     │
╞═════════╪══════════════════╪═══════════╪═══════════════════╪══════════╡
│ 550     ┆ 166649911        ┆ russian   ┆ 1717490824        ┆ true     │
│ 550     ┆ 166643739        ┆ english   ┆ 1717480561

To finish our Write phase, we can move the staging data into the `languages` table with the correct conversions

In [48]:
# Copy the staged rows into the final table on the branch, converting the epoch
# integer to a timestamp on the way.
insert_sql = """
    INSERT INTO Nessie.steam.languages AT BRANCH insert_demo 
    SELECT 
        game_id,
        recommendationid,
        "language",
        to_timestamp(timestamp_created) as timestamp_created, 
        voted_up
    FROM Nessie.steam.staging.languages AT BRANCH insert_demo
    """
with conn.cursor() as cur:
    cur.execute(insert_sql)
    inserted_rows = cur.fetchone()[0]
    print(f"Inserted {inserted_rows:,} rows")

Inserted 858,570 rows


In [50]:
# Row count of the final table as seen on the `insert_demo` branch.
pl.read_database(
    query="SELECT COUNT(*) as num_rows FROM Nessie.steam.languages AT BRANCH insert_demo",
    connection=conn,
)

num_rows
i64
11342344


Now we can run our Audit step - verifying data to ensure the data quality before consumers get it

In [52]:
# Audit query: for every game, the share of its reviews written in each language.
# Both table references are pinned to the `insert_demo` branch so the audit
# inspects the staged data that is about to be published, not what is already
# on `main`.
sql = """
     WITH num_reviews as (
        select game_id, count(*) as num_reviews
        FROM Nessie.steam.languages AT BRANCH insert_demo
        GROUP BY game_id
    ), num_language_reviews as (
        SELECT game_id, "language", COUNT(*) as num_language_reviews
        FROM Nessie.steam.languages AT BRANCH insert_demo
        GROUP BY game_id, "language"
    )
    SELECT l.game_id, l."language",  num_language_reviews / cast(num_reviews as float) as language_ratio
    FROM num_reviews r join num_language_reviews as l on r.game_id = l.game_id
    """

ratio_df = pl.read_database(sql, conn)
ratio_df

game_id,language,language_ratio
str,str,f32
"""730""","""brazilian""",0.056923
"""730""","""thai""",0.002919
"""730""","""italian""",0.002533
"""730""","""schinese""",0.135074
"""730""","""german""",0.02688
…,…,…
"""578080""","""brazilian""",0.023774
"""289070""","""koreana""",0.033276
"""289070""","""norwegian""",0.001467
"""289070""","""romanian""",0.000187


We can perform sanity checks, such as checking that no game has fewer than 15% of its reviews in English, or comparing data across branches to make sure the difference is correct

In [53]:
# Fail the audit if any game has fewer than 15% of its reviews in English.
# `is_empty` is a method: without the call parentheses the assert would test the
# bound method object, which is always truthy, so the check could never fail.
low_english_share = ratio_df.filter(
    (pl.col('language') == 'english') & (pl.col('language_ratio') < 0.15)
)
assert low_english_share.is_empty(), "English review ratio is below 15% for at least one game"

In [56]:
# The branch must hold exactly as many extra rows as were inserted from staging.
branch_row_diff = pl.read_database("""
SELECT (SELECT COUNT(*) from Nessie.steam.languages AT BRANCH insert_demo) 
     - (SELECT COUNT(*) From Nessie.steam.languages AT BRANCH main)
""", conn).item()
assert branch_row_diff == 858570

Given that we're happy with the new data - it passes all our data quality checks - we're ready for the Publish step. 

In a git-like fashion, we can merge the two branches and all our changes will be visible to the "regular" users

In [57]:
# Publish: merge the audited branch into `main`.
with conn.cursor() as cur:
    cur.execute("MERGE BRANCH insert_demo into main in Nessie")
    status_row = cur.fetchone()
    print(status_row[1])

Branch insert_demo has been merged into main on source Nessie.


In [58]:
# Row count of the final table on `main` after the merge.
pl.read_database(
    query="SELECT COUNT(*) as num_reviews FROM Nessie.steam.languages at branch main",
    connection=conn,
)

num_reviews
i64
11342344


The changes have been merged in, and we can clean up

In [59]:
# Switch back to `main`, then drop the merged branch, printing each status.
with conn.cursor() as cur:
    for statement in ("USE BRANCH main in Nessie", "DROP BRANCH insert_demo in Nessie"):
        cur.execute(statement)
        print(cur.fetchone()[1])

Current version context set to branch main in source Nessie.
Branch insert_demo has been dropped on source Nessie.


# Time Travel

Not only can we do Git branching, we can also do time travel as part of the Iceberg spec. 

We could imagine that after the insert, we want to store a pointer to this version of the data, so we can go back to how data looked at an exact point in time. Iceberg has been keeping snapshots of each of our operations

In [60]:
# List the snapshots recorded for the table.
pl.read_database(
    query="SELECT * FROM TABLE(table_snapshot('Nessie.steam.languages'))",
    connection=conn,
)

committed_at,snapshot_id,parent_id,operation,manifest_list,summary
datetime[ms],i64,i64,str,str,list[struct[2]]
2024-08-03 19:40:34.303,2919915608453998030,,"""append""","""s3://warehouse/steam/languages…","[{""added-files-size"",""27356584""}, {""added-data-files"",""89""}, … {""total-equality-deletes"",""0""}]"
2024-08-03 19:59:51.048,9205576444219515382,2.919915608453998e+18,"""append""","""s3://warehouse/steam/languages…","[{""dremio-job-id"",""195173ca-7b41-4deb-a85b-ff90a6e6c000""}, {""added-data-files"",""166""}, … {""total-equality-deletes"",""0""}]"


I can choose between a snapshot_id:

In [61]:
# Count the rows as of a specific snapshot id.
pl.read_database(
    query="SELECT COUNT(*) FROM Nessie.steam.languages at snapshot '2919915608453998030'",
    connection=conn,
)

EXPR$0
i64
10483774


Or a timestamp:

In [62]:
# Count the rows as of a specific point in time.
pl.read_database(
    query="SELECT COUNT(*) FROM Nessie.steam.languages at TIMESTAMP '2024-08-03 19:50:00'",
    connection=conn,
)

EXPR$0
i64
10483774


After publishing, we may want to store a tag on the data in order to be able to audit how the data looked at a specific point in time

In [64]:
# Tag the current state of `main` so it can be referenced later.
with conn.cursor() as cur:
    cur.execute('CREATE TAG my_report FROM BRANCH main IN Nessie')
    status_row = cur.fetchone()
    print(status_row[1])

Tag my_report has been created at branch main in source Nessie.


In [65]:
# Delete every review of game 550 and report how many rows were removed.
with conn.cursor() as cur:
    cur.execute("DELETE FROM Nessie.steam.languages where game_id = '550'")
    deleted_rows = cur.fetchone()[0]
    print(f"Deleted {deleted_rows:,} rows")

Deleted 858,570 rows


In [67]:
# Row count of the table after the deletion.
pl.read_database(
    query="SELECT COUNT(*) as num_rows FROM Nessie.steam.languages",
    connection=conn,
)

num_rows
i64
10483774


Oops! Good thing I have a tag - my report is unaffected

In [68]:
# Row count of the table as referenced by the `my_report` tag.
pl.read_database(
    query="SELECT COUNT(*) as num_rows FROM Nessie.steam.languages AT TAG my_report",
    connection=conn,
)

num_rows
i64
11342344


Let's roll back the accidental deletion by restoring the snapshot taken just before it

In [69]:
# List the snapshots again to find the one that precedes the deletion.
pl.read_database(
    query="SELECT * FROM TABLE(table_snapshot('Nessie.steam.languages'))",
    connection=conn,
)

committed_at,snapshot_id,parent_id,operation,manifest_list,summary
datetime[ms],i64,i64,str,str,list[struct[2]]
2024-08-03 19:40:34.303,2919915608453998030,,"""append""","""s3://warehouse/steam/languages…","[{""added-files-size"",""27356584""}, {""added-data-files"",""89""}, … {""total-equality-deletes"",""0""}]"
2024-08-03 19:59:51.048,9205576444219515382,2.919915608453998e+18,"""append""","""s3://warehouse/steam/languages…","[{""dremio-job-id"",""195173ca-7b41-4deb-a85b-ff90a6e6c000""}, {""added-data-files"",""166""}, … {""total-equality-deletes"",""0""}]"
2024-08-03 20:07:24.780,7912176480591895159,9.205576444219517e+18,"""overwrite""","""s3://warehouse/steam/languages…","[{""dremio-job-id"",""19517204-d63f-c5cb-6520-17f35ed3c000""}, {""deleted-data-files"",""166""}, … {""total-equality-deletes"",""0""}]"


In [70]:
# Roll the table back to the snapshot taken before the deletion.
with conn.cursor() as cur:
    cur.execute("ROLLBACK TABLE Nessie.steam.languages TO SNAPSHOT '9205576444219515382'")
    status_row = cur.fetchone()
    print(status_row[1])

Table [Nessie.steam.languages] rollbacked


In [72]:
# Row count of the table after the rollback.
pl.read_database(
    query="SELECT COUNT(*) as num_rows FROM Nessie.steam.languages",
    connection=conn,
)

num_rows
i64
11342344


Back to normal!