# Branching with Iceberg

In [7]:
import pyspark
from pyspark.sql import SparkSession
import os


# Iceberg-on-Glue settings, collected as (key, value) pairs and applied to the
# SparkConf in a single batch below.
iceberg_settings = [
    # Packages Spark should fetch: the Iceberg Spark runtime plus two AWS SDK artifacts
    ('spark.jars.packages', 'org.apache.iceberg:iceberg-spark-runtime-3.3_2.12:1.2.0,software.amazon.awssdk:bundle:2.17.178,software.amazon.awssdk:url-connection-client:2.17.178'),

    # SQL extensions to load into the session (Iceberg's session extensions)
    ('spark.sql.extensions', 'org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions'),

    # Register a catalog named "glue", backed by Iceberg's SparkCatalog
    ('spark.sql.catalog.glue', 'org.apache.iceberg.spark.SparkCatalog'),

    # Concrete catalog implementation used behind the "glue" catalog
    ('spark.sql.catalog.glue.catalog-impl', 'org.apache.iceberg.aws.glue.GlueCatalog'),

    # Warehouse location the engine writes to
    ('spark.sql.catalog.glue.warehouse', 's3://ids-sample-iceberg-datasets/sampledb/'),

    # IO implementation of the catalog, mainly relevant for writing data to object storage
    ('spark.sql.catalog.glue.io-impl', 'org.apache.iceberg.aws.s3.S3FileIO'),
]

conf = pyspark.SparkConf().setAppName('app_name').setAll(iceberg_settings)

## Start Spark Session
session_builder = SparkSession.builder.config(conf=conf)
spark = session_builder.getOrCreate()
print("Spark Running")

Spark Running


In [8]:
# CREATE a new Iceberg table 'employees' (skipped if it already exists).
# The call is a plain statement rather than `_ = ...`: a non-final expression is
# never echoed by the notebook, and assigning to `_` would shadow IPython's
# last-output variable.
spark.sql(
    """CREATE TABLE IF NOT EXISTS glue.sampledb.employees
            (id BIGINT, name STRING, role STRING, salary double, some_feature int) USING iceberg"""
)
print('table has been created')

table has been created


In [9]:
# INSERT two sample records into the employees table.
# No `_ = ...` here: the result is not displayed because this is not the cell's
# last expression, and binding `_` would shadow IPython's last-output variable.
# NOTE(review): INSERT INTO appends, so re-running this cell adds the rows again.
spark.sql("INSERT INTO glue.sampledb.employees values (1, 'Steve', 'Clerk', 22000, 20), (2, 'Gary', 'Sales', 18000, 10)")
print('records have been inserted')

records have been inserted


In [10]:
# Read every row of the employees table and render it as a pandas DataFrame.
employees_query = "SELECT * FROM glue.sampledb.employees"
spark.sql(employees_query).toPandas()

Unnamed: 0,id,name,role,salary,some_feature
0,1,Steve,Clerk,22000.0,20
1,2,Gary,Sales,18000.0,10


In [11]:
# CREATE a new branch named 'bobs_branch' on the employees table.
# No `_ = ...` here: the result is not displayed because this is not the cell's
# last expression, and binding `_` would shadow IPython's last-output variable.
spark.sql("ALTER TABLE glue.sampledb.employees CREATE BRANCH bobs_branch")
print('branch has been created')

branch has been created


In [12]:
# List the table's references (branches/tags) from the `refs` metadata table.
refs_query = "SELECT * FROM glue.sampledb.employees.refs"
spark.sql(refs_query).toPandas()

Unnamed: 0,name,type,snapshot_id,max_reference_age_in_ms,min_snapshots_to_keep,max_snapshot_age_in_ms
0,main,BRANCH,1534453947479327056,,,
1,bobs_branch,BRANCH,1534453947479327056,,,


In [13]:
df = spark.sql("SELECT * FROM glue.sampledb.employees")

# Multiply some_feature by 100, then overwrite the table contents through a
# writer whose "branch" option targets bobs_branch.
scaled_employees = df.withColumn('some_feature', df.some_feature * 100)
branch_writer = (
    scaled_employees.write
    .format("iceberg")
    .option("branch", "bobs_branch")
    .mode("overwrite")
)
branch_writer.save("glue.sampledb.employees")

# Check the records in the BRANCH after records were updated
branch_query = "SELECT * FROM glue.sampledb.employees VERSION AS OF 'bobs_branch'"
spark.sql(branch_query).toPandas()

Unnamed: 0,id,name,role,salary,some_feature
0,1,Steve,Clerk,22000.0,2000
1,2,Gary,Sales,18000.0,1000


In [14]:
# Query the table without a branch qualifier, for comparison with the
# branch read in the previous cell.
main_query = "SELECT * FROM glue.sampledb.employees"
spark.sql(main_query).toPandas()

Unnamed: 0,id,name,role,salary,some_feature
0,1,Steve,Clerk,22000.0,20
1,2,Gary,Sales,18000.0,10


In [15]:
# DROP the branch, then list the table's remaining references
drop_branch_stmt = "ALTER TABLE glue.sampledb.employees DROP BRANCH bobs_branch"
spark.sql(drop_branch_stmt)

remaining_refs_query = "SELECT * FROM glue.sampledb.employees.refs"
spark.sql(remaining_refs_query).toPandas()

Unnamed: 0,name,type,snapshot_id,max_reference_age_in_ms,min_snapshots_to_keep,max_snapshot_age_in_ms
0,main,BRANCH,1534453947479327056,,,


### * Branching is also handy for the Write-Audit-Publish (WAP) pattern
### * Iceberg also supports tagging
### * The branching shown here is at the table level; for catalog-level branching, check out [Project Nessie](https://github.com/projectnessie/nessie/)