In [1]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# adding iceberg configs
conf = (
    SparkConf()
    .set("spark.jars.ivy","/apps/.ivy2")
    .set("spark.jars.packages","org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.6.1,org.apache.hadoop:hadoop-aws:3.0.0")
    .set("spark.sql.extensions","org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions") # Use Iceberg with Spark
    .set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog")
    .set("spark.sql.catalog.spark_catalog.type", "hive") # .set("spark.sql.catalogImplementation", "in-memory")
    .set("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.SparkCatalog")
    .set("spark.sql.catalog.iceberg.type", "hadoop") # Iceberg catalog type
    .set("spark.sql.catalog.iceberg.warehouse", "s3a://warehouse/tablespace/external/spark") # Name of the Iceberg catalog
    .set("spark.sql.defaultCatalog", "iceberg") # Name of the Iceberg catalog
    .set("spark.executor.heartbeatInterval", "300000")
    .set("spark.network.timeout", "400000")
    .set("spark.hadoop.fs.s3a.endpoint", "http://minio.sandbox.net:9010")
    .set("spark.hadoop.fs.s3a.access.key", "pgm2H2bR7a5kMc5XCYdO")
    .set("spark.hadoop.fs.s3a.secret.key", "zjd8T0hXFGtfemVQ6AH3yBAPASJNXNbVSx5iddqG")
    .set("spark.hadoop.fs.s3a.path.style.access", "true")
    .set("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    .set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
)

spark = SparkSession.builder.appName("iceberg").config(conf=conf).getOrCreate()
spark.sparkContext.setLogLevel("ERROR")
spark

In [2]:
df = spark.read.parquet("s3a://warehouse/taxi-data/yellow_tripdata_2021-04.parquet")
df.show()

In [3]:
# Default - displays 20 rows and 
# 20 charactes from column value 
df.show()

#Display full column contents
df.show(truncate=False)

# Display 2 rows and full column contents
df.show(2,truncate=False) 

# Display 2 rows & column values 25 characters
df.show(2,truncate=25) 

# Display DataFrame rows & columns vertically
df.show(n=3,truncate=25,vertical=True)

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       1| 2021-04-01 00:00:18|  2021-04-01 00:21:54|            1.0|          8.4|       1.0|                 N|          79|         116|           1|       25.5|  3.0|    0.5|      5.8

In [6]:
spark.sql("show databases").show()

+---------+
|namespace|
+---------+
|      nyc|
+---------+



In [23]:
spark.sql("CREATE DATABASE IF NOT EXISTS nyc").show(truncate=False)

++
||
++
++



In [24]:
spark.sql("USE nyc").show(truncate=False)

++
||
++
++



In [25]:
spark.sql("show tables").show(truncate=False)

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|nyc      |taxis    |false      |
+---------+---------+-----------+



In [26]:
spark.sql("select * from taxis limit 10").show(truncate=False)

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|1       |2021-04-01 00:00:18 |2021-04-01 00:21:54  |1.0            |8.4          |1.0       |N                 |79          |116         |1           |25.5       |3.0  |0.5    |5.85     

In [13]:
## load nyc.taxis from catalog iceberg
spark.sql("select * from iceberg.nyc.taxis limit 10").show()

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       1| 2021-04-01 00:00:18|  2021-04-01 00:21:54|            1.0|          8.4|       1.0|                 N|          79|         116|           1|       25.5|  3.0|    0.5|      5.8

In [14]:
spark.sql("use iceberg.nyc")

DataFrame[]

In [15]:
spark.sql("select * from taxis limit 10").show()

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       1| 2021-04-01 00:00:18|  2021-04-01 00:21:54|            1.0|          8.4|       1.0|                 N|          79|         116|           1|       25.5|  3.0|    0.5|      5.8

In [17]:
spark.sql("SHOW TBLPROPERTIES nyc.taxis ('current-snapshot-id');").show(truncate=False)

+-------------------+-------------------+
|                key|              value|
+-------------------+-------------------+
|current-snapshot-id|6144676154389073102|
+-------------------+-------------------+



In [18]:
spark.sql("DESCRIBE EXTENDED nyc.taxis").show(truncate=False)


+--------------------+-------------+-------+
|            col_name|    data_type|comment|
+--------------------+-------------+-------+
|            VendorID|       bigint|   NULL|
|tpep_pickup_datetime|timestamp_ntz|   NULL|
|tpep_dropoff_date...|timestamp_ntz|   NULL|
|     passenger_count|       double|   NULL|
|       trip_distance|       double|   NULL|
|          RatecodeID|       double|   NULL|
|  store_and_fwd_flag|       string|   NULL|
|        PULocationID|       bigint|   NULL|
|        DOLocationID|       bigint|   NULL|
|        payment_type|       bigint|   NULL|
|         fare_amount|       double|   NULL|
|               extra|       double|   NULL|
|             mta_tax|       double|   NULL|
|          tip_amount|       double|   NULL|
|        tolls_amount|       double|   NULL|
|improvement_surch...|       double|   NULL|
|        total_amount|       double|   NULL|
|congestion_surcharge|       double|   NULL|
|         airport_fee|       double|   NULL|
|         

In [21]:
spark.sql("DESCRIBE FORMATTED nyc.taxis").show(truncate=False)

+---------------------+-------------+-------+
|col_name             |data_type    |comment|
+---------------------+-------------+-------+
|VendorID             |bigint       |NULL   |
|tpep_pickup_datetime |timestamp_ntz|NULL   |
|tpep_dropoff_datetime|timestamp_ntz|NULL   |
|passenger_count      |double       |NULL   |
|trip_distance        |double       |NULL   |
|RatecodeID           |double       |NULL   |
|store_and_fwd_flag   |string       |NULL   |
|PULocationID         |bigint       |NULL   |
|DOLocationID         |bigint       |NULL   |
|payment_type         |bigint       |NULL   |
|fare_amount          |double       |NULL   |
|extra                |double       |NULL   |
|mta_tax              |double       |NULL   |
|tip_amount           |double       |NULL   |
|tolls_amount         |double       |NULL   |
|improvement_surcharge|double       |NULL   |
|total_amount         |double       |NULL   |
|congestion_surcharge |double       |NULL   |
|airport_fee          |double     

In [27]:
%%sql

select * from taxis limit 10


UsageError: Cell magic `%%sql` not found.
