In [None]:
import os
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Spark configuration: Delta Lake SQL extension/catalog plus S3A settings for a MinIO endpoint.
# The S3 credentials are read from the environment so that no secret is stored in the
# notebook: export AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY before starting the kernel.
# A missing variable raises KeyError here instead of failing later inside Spark.
sparkConf = (
    SparkConf()
    # Ivy cache under the current user's home rather than a hard-coded absolute path.
    .set("spark.jars.ivy", os.path.join(os.path.expanduser("~"), ".ivy2"))
    .set("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .set("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # NOTE(review): hadoop-aws normally has to match the Hadoop version bundled with the
    # Spark build in use; 3.0.0 looks old next to delta-spark 3.3.2 — confirm.
    .set("spark.jars.packages","org.apache.hadoop:hadoop-aws:3.0.0,io.delta:delta-spark_2.12:3.3.2")
    # NOTE(review): both values are unit-less; Spark presumably applies a different default
    # unit to each key (ms vs. s) — confirm the intended durations and add explicit suffixes.
    .set("spark.executor.heartbeatInterval", "300000")
    .set("spark.network.timeout", "400000")
    .set("spark.hadoop.fs.s3a.endpoint", "http://minio.sandbox.net:9010")
    .set("spark.hadoop.fs.s3a.access.key", os.environ["AWS_ACCESS_KEY_ID"])
    .set("spark.hadoop.fs.s3a.secret.key", os.environ["AWS_SECRET_ACCESS_KEY"])
    .set("spark.hadoop.fs.s3a.path.style.access", "true")
    .set("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    .set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    #.set("spark.eventLog.enabled", "true")
    #.set("spark.eventLog.dir", "file:///apps/var/logs/spark-events")
)

# Create (or reuse) a local-mode session that carries the configuration built above.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('spark-deltalake')
    .config(conf=sparkConf)
    .getOrCreate()
)

# Keep the notebook output readable: only errors are logged.
spark.sparkContext.setLogLevel('ERROR')

# Bare expression so the notebook renders the session summary.
spark

#### Create a DataFrame

In [None]:
from delta import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit

# Build a small demo frame: ids 1..9 with a constant 'value' column.
df = spark.range(1, 10).withColumn('value', lit('ABC'))
df.show()

# Save as a Delta table. mode('overwrite') keeps this cell re-runnable: the default
# save mode raises when the target path already exists (e.g. on Restart & Run All).
# Re-running therefore adds a new table version instead of failing.
df.write.format('delta').mode('overwrite').save('/deltalake/test_table')

#### Update rows with even id to append the id

In [None]:
from delta import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit

# Open the Delta table stored at the demo path.
test_table = DeltaTable.forPath(spark, "/deltalake/test_table")

# For rows with an even id, rewrite 'value' as "<value>|<id>".
test_table.update(
    expr("id % 2 == 0"),
    {"value": concat("value", lit('|'), "id")},
)

# Show the table contents after the update.
df = test_table.toDF()
df.show()

#### Create a staging DataFrame for merge.

In [None]:
# Open the Delta table that is the merge target.
test_table = DeltaTable.forPath(spark, "/deltalake/test_table")

# Staging rows for the merge: ids 9..14, each with value 'EDF'.
df_stg = spark.range(9, 15).withColumn('value', lit('EDF'))

# Upsert into test_table: matching ids take the staged value, unmatched ids are inserted.
merge = (
    test_table.alias('tgt')
    .merge(df_stg.alias('src'), "src.id = tgt.id")
    .whenMatchedUpdate(set={"value": col("src.value")})
    .whenNotMatchedInsert(values={"id": col("src.id"), "value": col("src.value")})
)
merge.execute()

# Show the table contents after the merge.
df = test_table.toDF()
df.show()

#### delete rows where id = 6

In [None]:
# Open the Delta table stored at the demo path.
test_table = DeltaTable.forPath(spark, "/deltalake/test_table")

# Remove every row whose id equals 6.
test_table.delete(expr("id == 6"))

# Show the table contents after the delete.
df = test_table.toDF()
df.show()

#### Overwrite the Delta table

In [None]:
# Write the current contents of `df` back to the table path in overwrite mode.
df.write.save('/deltalake/test_table', format='delta', mode='overwrite')

#### Time Travel

In [None]:
# Time travel: load the table as it was at version 1 and display it.
df_v1 = (
    spark.read
    .format('delta')
    .options(versionAsOf=1)
    .load("/deltalake/test_table")
)
df_v1.show()