In [33]:
%load_ext sparksql_magic

In [2]:
import pyspark
from pyspark.sql import SparkSession, Row

from datetime import datetime,date

In [3]:
# Build a Spark session with the Iceberg runtime and a Hadoop-type Iceberg catalog
# (`hdfs_catalog`) whose warehouse is a local relative directory.
# Reference: https://iceberg.apache.org/spark-quickstart/#adding-a-catalog
# TODO: for reference for AWS config: https://www.dremio.com/blog/deep-dive-into-configuring-your-apache-iceberg-catalog-with-apache-spark/
conf = (
    pyspark.SparkConf()
        .setAppName('test')
        # Configure AWS (MinIO). The key/secret below are hard-coded local-dev values;
        # never commit real credentials here.
        .set('spark.hadoop.fs.s3a.endpoint','http://localhost:9000')
        .set('spark.hadoop.fs.s3a.access.key','minioadmin')
        .set('spark.hadoop.fs.s3a.secret.key','minioadmin')
        .set('spark.hadoop.fs.s3a.path.style.access','true')
        # The filesystem class belongs under `fs.s3a.impl`. Setting it on
        # `fs.s3a.path.style.access` again would overwrite the 'true' above.
        .set('spark.hadoop.fs.s3a.impl','org.apache.hadoop.fs.s3a.S3AFileSystem')
        # Packages
        # NOTE(review): hadoop-aws is not listed here; s3a:// paths presumably need it - confirm.
        .set('spark.jars.packages', 'org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.4.3,software.amazon.awssdk:bundle:2.17.178,software.amazon.awssdk:url-connection-client:2.17.178')
        # SQL extensions
        .set('spark.sql.extensions', 'org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions')
        # Configuring the catalog: Hadoop-type catalog, used as the session default
        .set('spark.sql.catalog.hdfs_catalog', 'org.apache.iceberg.spark.SparkCatalog')
        .set('spark.sql.catalog.hdfs_catalog.type', 'hadoop')
        .set('spark.sql.catalog.hdfs_catalog.warehouse', '../datasets/sample-datasets/iceberg/')
        .set('spark.sql.defaultCatalog', 'hdfs_catalog')
)

# Reuse an already-running session if there is one; otherwise start one with `conf`.
spark = SparkSession.builder.config(conf=conf).getOrCreate()
spark

In [4]:
#!aws --endpoint-url http://localhost:9000 s3 ls s3://sample-datasets/

In [5]:
# Create the employee table in the `prod` namespace of the default catalog.
# IF NOT EXISTS keeps this cell re-runnable: the table persists in the on-disk
# warehouse, so a plain CREATE TABLE fails on every run after the first.
sql_create = """
CREATE TABLE IF NOT EXISTS prod.employee (
    id bigint COMMENT 'unique id for employee',
    birth_date date COMMENT 'birth date of employee',
    country string COMMENT 'Country location of employee',
    name string)
"""
spark.sql(sql_create)

DataFrame[]

### The step above creates the `employee` table in the `prod` namespace of `hdfs_catalog`
- You can navigate to `../datasets/sample-datasets/iceberg/prod/employee/`
- You'll see a `metadata` folder under it
  - It contains `version-hint.text` and `v1.metadata.json`
  - `version-hint.text` has the version number of the current metadata file. This file exists because we are using hadoop as our catalog.[1]
  - `v1.metadata.json` has metadata on the table, including its schema information and the location of the data

In [15]:
!cat ../datasets/sample-datasets/iceberg/prod/employee/metadata/version-hint.text

1

In [16]:
!cat ../datasets/sample-datasets/iceberg/prod/employee/metadata/v1.metadata.json

{
  "format-version" : 2,
  "table-uuid" : "9ebb3c61-c9c3-4c50-85ec-fda589212ae9",
  "location" : "../datasets/sample-datasets/iceberg/prod/employee",
  "last-sequence-number" : 0,
  "last-updated-ms" : 1706680277552,
  "last-column-id" : 4,
  "current-schema-id" : 0,
  "schemas" : [ {
    "type" : "struct",
    "schema-id" : 0,
    "fields" : [ {
      "id" : 1,
      "name" : "id",
      "required" : false,
      "type" : "long",
      "doc" : "unique id for employee"
    }, {
      "id" : 2,
      "name" : "birth_date",
      "required" : false,
      "type" : "date",
      "doc" : "birth date of employee"
    }, {
      "id" : 3,
      "name" : "country",
      "required" : false,
      "type" : "string",
      "doc" : "Country location of employee"
    }, {
      "id" : 4,
      "name" : "name",
      "required" : false,
      "type" : "string"
    } ]
  } ],
  "default-spec-id" : 0,
  "partition-specs" : [ {
    "spec-id" : 0,
    "fields" : [ ]
  } ],
  "last-partition-id" : 999

In [20]:
# Sample employee records; each Row names its fields explicitly.
employee_rows = [
    Row(id=1, country='USA', birth_date=date(2000, 8, 1), name="A"),
    Row(id=2, country='IN', birth_date=date(2000, 6, 2), name="B"),
    Row(id=4, country='USA', birth_date=date(2000, 5, 3), name="C"),
]

# Let Spark infer the schema from the Row fields.
df_emp = spark.createDataFrame(employee_rows)
 

In [21]:
# Print the in-memory DataFrame as a text table.
df_emp.show()

+---+-------+----------+----+
| id|country|birth_date|name|
+---+-------+----------+----+
|  1|    USA|2000-08-01|   A|
|  2|     IN|2000-06-02|   B|
|  4|    USA|2000-05-03|   C|
+---+-------+----------+----+



In [22]:
# Append the sample rows to the existing table.
# partitionedBy() is intentionally not used: it only affects create()/replace()/
# createOrReplace() and is ignored by append(), so the table keeps the
# unpartitioned spec it was created with.
df_emp.writeTo("prod.employee").append()

In [None]:
### The above command appends the data; you can see that v2.metadata.json has been added to the metadata folder

In [24]:
!cat ../datasets/sample-datasets/iceberg/prod/employee/metadata/version-hint.text

2

In [25]:
!cat ../datasets/sample-datasets/iceberg/prod/employee/metadata/v1.metadata.json

{
  "format-version" : 2,
  "table-uuid" : "9ebb3c61-c9c3-4c50-85ec-fda589212ae9",
  "location" : "../datasets/sample-datasets/iceberg/prod/employee",
  "last-sequence-number" : 0,
  "last-updated-ms" : 1706680277552,
  "last-column-id" : 4,
  "current-schema-id" : 0,
  "schemas" : [ {
    "type" : "struct",
    "schema-id" : 0,
    "fields" : [ {
      "id" : 1,
      "name" : "id",
      "required" : false,
      "type" : "long",
      "doc" : "unique id for employee"
    }, {
      "id" : 2,
      "name" : "birth_date",
      "required" : false,
      "type" : "date",
      "doc" : "birth date of employee"
    }, {
      "id" : 3,
      "name" : "country",
      "required" : false,
      "type" : "string",
      "doc" : "Country location of employee"
    }, {
      "id" : 4,
      "name" : "name",
      "required" : false,
      "type" : "string"
    } ]
  } ],
  "default-spec-id" : 0,
  "partition-specs" : [ {
    "spec-id" : 0,
    "fields" : [ ]
  } ],
  "last-partition-id" : 999

In [30]:
# Read the table back through Spark SQL and pull all rows to the driver as Row objects.
spark.sql("select * from prod.employee").collect()

[Row(id=1, birth_date=datetime.date(2000, 8, 1), country='USA', name='A'),
 Row(id=2, birth_date=datetime.date(2000, 6, 2), country='IN', name='B'),
 Row(id=4, birth_date=datetime.date(2000, 5, 3), country='USA', name='C')]

In [38]:
%%sparksql
select * from prod.employee

0,1,2,3
id,birth_date,country,name
1,2000-08-01,USA,A
2,2000-06-02,IN,B
4,2000-05-03,USA,C


References:
1. https://www.dremio.com/resources/guides/apache-iceberg-an-architectural-look-under-the-covers/