In [None]:
import os
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Spark configuration for Delta Lake on an S3-compatible MinIO endpoint.
#
# The S3A access/secret keys are read from the standard AWS environment
# variables rather than being stored in the notebook. Export
# AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY before starting the kernel;
# a missing variable raises KeyError right here instead of surfacing later
# as an authentication failure on the first S3 access.
sparkConf = (
    SparkConf()
    .set("spark.jars.ivy","/home/brijeshdhaker/.ivy2")
    # Enable Delta Lake SQL extensions and make Delta the session catalog.
    .set("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .set("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .set("spark.jars.packages","org.apache.hadoop:hadoop-aws:3.0.0,io.delta:delta-spark_2.12:3.3.2")
    # NOTE(review): both values carry no unit suffix; confirm which default
    # unit Spark applies to each setting and consider writing it explicitly.
    .set("spark.executor.heartbeatInterval", "300000")
    .set("spark.network.timeout", "400000")
    # Unqualified paths such as '/deltalake/peoples' resolve against this bucket.
    .set("spark.hadoop.fs.defaultFS", "s3a://defaultfs/")
    .set("spark.hadoop.fs.s3a.endpoint", "http://minio.sandbox.net:9010")
    .set("spark.hadoop.fs.s3a.access.key", os.environ["AWS_ACCESS_KEY_ID"])
    .set("spark.hadoop.fs.s3a.secret.key", os.environ["AWS_SECRET_ACCESS_KEY"])
    .set("spark.hadoop.fs.s3a.path.style.access", "true")
    .set("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    .set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    #.set("spark.eventLog.enabled", "true")
    #.set("spark.eventLog.dir", "file:///apps/var/logs/spark-events")
)

# Build (or reuse) a local-mode SparkSession that carries the Delta/S3A
# settings collected in sparkConf.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('spark-deltalake')
    .config(conf=sparkConf)
    .getOrCreate()
)

# Restrict Spark's own logging to ERROR level.
spark.sparkContext.setLogLevel('ERROR')
spark

#
# 
#
def display(df):
    """Print ``df`` via its ``show`` method with cell values left untruncated.

    Args:
        df: Any object exposing ``show(truncate=...)``; the notebook passes
            Spark DataFrames.

    Returns:
        None.
    """
    df.show(truncate=False)
    return None

In [None]:
%%bash

# Remove any existing copy of the peoples Delta table from the MinIO bucket.
aws --endpoint-url http://minio.sandbox.net:9010 \
    s3 rm --recursive s3://defaultfs/deltalake/peoples


#### Create Deltatable

In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType

# Explicit schema for the CSV file; every column is declared nullable.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in [
        ("id", IntegerType()),
        ("firstName", StringType()),
        ("middleName", StringType()),
        ("lastName", StringType()),
        ("gender", StringType()),
        ("birthDate", TimestampType()),
        ("ssn", StringType()),
        ("salary", IntegerType()),
    ]
])

# Read the headered CSV with the schema above instead of inferring types.
df = spark.read.load(
    "s3a://datasets/peoples.csv",
    format="csv",
    schema=schema,
    header=True,
)
df.printSchema()
display(df)

# Write the frame out in Delta format under /deltalake/peoples.
df.write.save('/deltalake/peoples', format='delta')

# Create the table if it does not exist. Otherwise, replace the existing table.
#df.writeTo("spark_catalog.default.peoples").createOrReplace()

# If you know the table does not already exist, you can call this instead:
#df.write.saveAsTable("spark_catalog.default.peoples")

#### Python Create Deltatable

In [None]:
from delta import *

# Define a table named "peoples" column by column through the Delta table
# builder; createIfNotExists is used so an existing table is left alone.
(
    DeltaTable.createIfNotExists(spark)
    .tableName("peoples")
    .addColumn("id", "INT")
    .addColumn("firstName", "STRING")
    .addColumn("middleName", "STRING")
    .addColumn("lastName", "STRING", comment="surname")
    .addColumn("gender", "STRING")
    .addColumn("birthDate", "TIMESTAMP")
    .addColumn("ssn", "STRING")
    .addColumn("salary", "INT")
    .execute()
)

#### Upsert to a Deltatable

In [None]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType
from datetime import date

# Schema of the update batch; birthDate is a DateType here.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in [
        ("id", IntegerType()),
        ("firstName", StringType()),
        ("middleName", StringType()),
        ("lastName", StringType()),
        ("gender", StringType()),
        ("birthDate", DateType()),
        ("ssn", StringType()),
        ("salary", IntegerType()),
    ]
])

# Rows to upsert, in the column order of the schema above.
data = [
    (9999998, 'Billy', 'Tommie', 'Luppitt', 'M', date(1992, 9, 17), '953-38-9452', 55250),
    (9999999, 'Elias', 'Cyril', 'Leadbetter', 'M', date(1984, 5, 22), '906-51-2137', 48500),
    (10000000, 'Joshua', 'Chas', 'Broggio', 'M', date(1968, 7, 22), '988-61-6247', 90000),
    (20000001, 'John', '', 'Doe', 'M', date(1978, 1, 14), '345-67-8901', 55500),
    (20000002, 'Mary', '', 'Smith', 'F', date(1982, 10, 29), '456-78-9012', 98250),
    (20000003, 'Jane', '', 'Doe', 'F', date(1981, 6, 25), '567-89-0123', 89900),
]

# Materialise the batch and also expose it to SQL as a temp view.
people_10m_updates = spark.createDataFrame(data=data, schema=schema)
people_10m_updates.createOrReplaceTempView("people_10m_updates")

# ...

from delta.tables import DeltaTable

deltaTable = DeltaTable.forPath(spark, '/deltalake/peoples')

# Upsert keyed on id: rows whose id matches are updated with every source
# column, rows without a match are inserted.
deltaTable.alias("people_10m").merge(
    source=people_10m_updates.alias("people_10m_updates"),
    condition="people_10m.id = people_10m_updates.id",
).whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()

In [None]:
# Re-read the Delta table and keep only rows whose id is at least 9999998.
df = spark.read.load("/deltalake/peoples", format='delta')
df_filtered = df.where(df["id"] >= 9999998)
display(df_filtered)

#### Read a Deltatable

In [None]:
# Load the Delta table at /deltalake/peoples and print it.
people_df = spark.read.load("/deltalake/peoples", format='delta')
display(people_df)

#### Write to a Deltatable

In [None]:
# df.write.mode("append").saveAsTable("main.default.people_10m")

# Append the rows of df to the Delta table at /deltalake/delta-table.
df.write.save('/deltalake/delta-table', format='delta', mode='append')

In [None]:
# df.write.mode("overwrite").saveAsTable("main.default.people_10m")

# Replace the contents of the Delta table at /deltalake/delta-table with df.
df.write.save('/deltalake/delta-table', format='delta', mode='overwrite')

#### Update Deltatable Rows

In [None]:
from delta import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

deltaTable = DeltaTable.forPath(spark, '/deltalake/peoples')

# Variant 1: predicate and replacement value are SQL expression strings
# (note the inner quotes that make 'Female' a SQL string literal).
deltaTable.update("gender = 'F'", {"gender": "'Female'"})

# Variant 2: the same kind of update built from Column expressions.
deltaTable.update(col('gender') == 'M', {'gender': lit('Male')})

#### Delete Rows 

In [None]:
from delta.tables import *
from pyspark.sql.functions import *

deltaTable = DeltaTable.forPath(spark, '/deltalake/peoples')

# Variant 1: the predicate is given as a SQL string.
deltaTable.delete(condition="birthDate < '1955-01-01'")

# Variant 2: the predicate is given as a Column expression.
deltaTable.delete(condition=col('birthDate') < '1960-01-01')

#### Display table history

In [None]:
from delta.tables import *

deltaTable = DeltaTable.forPath(spark, '/deltalake/peoples')
# Print the table's commit history without truncating column values.
deltaTable.history().show(truncate=False)

#### Overwrite

In [None]:
# Replace the contents of the Delta table at /deltalake/delta-table with df.
df.write.save('/deltalake/delta-table', format='delta', mode='overwrite')

#### Time Travel

In [None]:
# Look up individual commits in the table history.
from delta.tables import *

deltaTable = DeltaTable.forPath(spark, '/deltalake/peoples')
deltaHistory = deltaTable.history()

# Select the history entry for version 0 ...
display(deltaHistory.filter("version == 0"))
# ... or select an entry by its exact commit timestamp (the literal below is
# an example value; substitute one taken from the history output).
display(deltaHistory.filter("timestamp == '2024-05-15T22:43:15.000+00:00'"))

In [None]:
# Time travel: load the table as it was at commit version 0.
df = spark.read.load("/deltalake/peoples", format='delta', versionAsOf=0)
# Alternative: pin the snapshot by timestamp instead of by version, e.g.
#df = spark.read.format('delta').option('timestampAsOf', '2025-10-14T18:45:03.000+00:00').load("/deltalake/peoples")

display(df)

#### Display table history

In [None]:
# Open the table by path and print its commit history with show()'s defaults.
deltaTable = DeltaTable.forPath(spark, path="/deltalake/peoples")
print("######## Describe history for the table ######")
deltaTable.history(limit=None).show()

#### Vacuum

In [None]:
# Open the table by path and vacuum it without overriding the retention
# period (no explicit hours are passed).
deltaTable = DeltaTable.forPath(spark, path="/deltalake/peoples")
print("######## Vacuum the table ########")
deltaTable.vacuum(retentionHours=None)

#### Describe details for the table

In [None]:
# Relies on the deltaTable handle bound by an earlier cell; this cell does
# not create one itself.
print("######## Describe details for the table ######")
# Print the table's detail output via show().
deltaTable.detail().show()

#### Generating manifest 

In [None]:
# Ask Delta to generate a manifest for the table in symlink-format mode.
# Uses the deltaTable handle bound by an earlier cell.
print("######## Generating manifest ######")
deltaTable.generate(mode="SYMLINK_FORMAT_MANIFEST")

In [36]:
# Same vacuum operation issued as a SQL statement, with an explicit
# retention window of 169 hours.
print("####### SQL Vacuum #######")
spark.sql(
    "VACUUM '%s' RETAIN 169 HOURS" % "/deltalake/peoples"
).collect()

####### SQL Vacuum #######


                                                                                

Deleted 0 files and directories in a total of 1 directories.


[Row(path='s3a://defaultfs/deltalake/peoples')]

In [35]:
# Commit history fetched through SQL; the collected Row list is printed as-is.
print("####### SQL Describe History ########")
print(
    spark.sql(
        "DESCRIBE HISTORY delta.`%s`" % "/deltalake/peoples"
    ).collect()
)

####### SQL Describe History ########
[Row(version=5, timestamp=datetime.datetime(2025, 10, 14, 18, 57, 41), userId=None, userName=None, operation='DELETE', operationParameters={'predicate': '["(birthDate#6062 < 1960-01-01 00:00:00)"]'}, job=None, notebook=None, clusterId=None, readVersion=4, isolationLevel='Serializable', isBlindAppend=False, operationMetrics={'numDeletionVectorsUpdated': '0', 'numAddedFiles': '1', 'executionTimeMs': '1316', 'numDeletionVectorsRemoved': '0', 'numRemovedFiles': '1', 'rewriteTimeMs': '197', 'numRemovedBytes': '44629', 'scanTimeMs': '1119', 'numCopiedRows': '833', 'numDeletionVectorsAdded': '0', 'numAddedChangeFiles': '0', 'numDeletedRows': '104', 'numAddedBytes': '40093'}, userMetadata=None, engineInfo='Apache-Spark/3.5.3 Delta-Lake/3.3.2'), Row(version=4, timestamp=datetime.datetime(2025, 10, 14, 18, 57, 39), userId=None, userName=None, operation='DELETE', operationParameters={'predicate': '["(birthDate#6062 < 1955-01-01 00:00:00)"]'}, job=None, notebo

In [None]:
import shutil

# Cleanup of a local scratch directory.
# No cell in this notebook writes to /tmp/delta-table (all writes target
# /deltalake/... paths), so the directory is normally absent. A missing
# directory is therefore tolerated instead of aborting a Run All; any other
# failure (e.g. a permission error) still propagates.
try:
    shutil.rmtree("/tmp/delta-table")
except FileNotFoundError:
    pass

In [37]:
%%bash

# Remove the peoples Delta table (data files and _delta_log) from the bucket.
# NOTE(review): cells further down this notebook still open /deltalake/peoples;
# run them before this cleanup, or move this cell to the end.
aws --endpoint-url http://minio.sandbox.net:9010 \
    s3 rm --recursive s3://defaultfs/deltalake/peoples

delete: s3://defaultfs/deltalake/peoples/_delta_log/00000000000000000000.crc
delete: s3://defaultfs/deltalake/peoples/_delta_log/00000000000000000000.json
delete: s3://defaultfs/deltalake/peoples/_delta_log/00000000000000000001.crc
delete: s3://defaultfs/deltalake/peoples/_delta_log/00000000000000000001.json
delete: s3://defaultfs/deltalake/peoples/_delta_log/00000000000000000002.crc
delete: s3://defaultfs/deltalake/peoples/_delta_log/00000000000000000002.json
delete: s3://defaultfs/deltalake/peoples/_delta_log/00000000000000000003.json
delete: s3://defaultfs/deltalake/peoples/_delta_log/00000000000000000004.crc
delete: s3://defaultfs/deltalake/peoples/_delta_log/00000000000000000003.crc
delete: s3://defaultfs/deltalake/peoples/_delta_log/00000000000000000005.crc
delete: s3://defaultfs/deltalake/peoples/_delta_log/00000000000000000005.json
delete: s3://defaultfs/deltalake/peoples/_delta_log/00000000000000000004.json
delete: s3://defaultfs/deltalake/peoples/_delta_log/_commits/
delete: 

#### Optimize a table
After you have performed multiple changes to a table, you might have a lot of small files. To improve the speed of read queries, you can use the optimize operation to collapse small files into larger ones:

In [None]:
from delta.tables import *

# Compact small files. The table is opened by path; the catalog-name variant
# is kept below for reference.
# NOTE(review): an earlier cell deletes s3://defaultfs/deltalake/peoples, so
# this needs to run while the table still exists.
#deltaTable = DeltaTable.forName(spark, "main.default.people_10m")
deltaTable = DeltaTable.forPath(spark, path="/deltalake/peoples")
(
    deltaTable
    .optimize()
    .executeCompaction()
)

#### Z-order by columns
To improve read performance further, you can collocate related information in the same set of files by z-ordering. Delta Lake data-skipping algorithms use this collocation to dramatically reduce the amount of data that needs to be read. To z-order data, you specify the columns to order on in the z-order by operation. For example, to collocate by gender, run:

In [None]:
from delta.tables import *

# Z-order the table's files by the gender column. The table is opened by
# path; the catalog-name variant is kept below for reference.
#deltaTable = DeltaTable.forName(spark, "main.default.people_10m")
deltaTable = DeltaTable.forPath(spark, path="/deltalake/peoples")
(
    deltaTable
    .optimize()
    .executeZOrderBy("gender")
)

#### Clean up snapshots with VACUUM
Delta Lake provides snapshot isolation for reads, which means that it is safe to run an optimize operation even while other users or jobs are querying the table. Eventually however, you should clean up old snapshots. You can do this by running the vacuum operation:

In [None]:
from delta.tables import *

# Vacuum the table without overriding the retention period. The table is
# opened by path; the catalog-name variant is kept below for reference.
#deltaTable = DeltaTable.forName(spark, "main.default.people_10m")
deltaTable = DeltaTable.forPath(spark, path="/deltalake/peoples")
deltaTable.vacuum(retentionHours=None)

#### How do I find the last commit's version in the Spark session?

In [None]:
# Read the session conf key that presumably holds the version of the last
# Delta commit made through this SparkSession — TODO confirm what is returned
# when no commit has happened in the session yet.
spark.conf.get("spark.databricks.delta.lastCommitVersionInSession")