## Delta with PySpark

In [0]:
# Build a local Spark session with the Delta Lake package and its SQL settings

from pyspark.sql import SparkSession

session_builder = (
    SparkSession.builder
    .appName("Delta with PySpark")
    # Delta Lake package coordinates plus the extension/catalog settings for Delta SQL
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.1.1")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
    # Managed tables are stored under this (relative) warehouse directory
    .config("spark.sql.warehouse.dir", "spark-warehouse")
    .master("local[*]")
    .enableHiveSupport()
)
spark = session_builder.getOrCreate()

# Display the session summary as the cell output
spark

In [0]:
# Prerequisite: the sparksql-magic package (install with: pip install sparksql-magic)
# Load the extension so the %%sparksql cell magic used in the cells below is available
%load_ext sparksql_magic

In [0]:
%%sparksql
-- List the tables registered in the default database
show tables in default;

In [0]:
# Let's load the Sales dataset from its Parquet part files and take a first look

sales_parquet_glob = "dataset/sales.parquet/*parquet"
df_sales = spark.read.parquet(sales_parquet_glob)

# Print the schema, then the first 10 rows without truncating column values
df_sales.printSchema()
df_sales.show(n=10, truncate=False)

In [0]:
# Let's create a managed Delta table for the sales data
from pyspark.sql.functions import to_timestamp, expr

# Normalise column types before persisting: parse transacted_at into a
# timestamp and cast amount to decimal(14,2).
df_formatted = (
    df_sales
    .withColumn("transacted_at", to_timestamp("transacted_at"))
    .withColumn("amount", expr("CAST(amount as decimal(14,2))"))
)

# saveAsTable defaults to mode "errorifexists", so re-running this cell
# (e.g. Restart Kernel -> Run All against an existing warehouse) would fail
# because the table already exists. Overwrite mode keeps the cell re-runnable.
(
    df_formatted.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("sales_delta_managed")
)

In [0]:
%%sparksql

-- Show the extended description (columns plus detailed table metadata) of the managed Delta table
describe extended default.sales_delta_managed;

In [0]:
%%sparksql

-- Preview up to 10 rows of the managed Delta table
select * from default.sales_delta_managed limit 10;

In [0]:
# Let's inspect the table history to see its current version

from delta import DeltaTable

# Look the table up by its catalog name
dt = DeltaTable.forName(spark, "sales_delta_managed")

# Only the version number and timestamp of each history entry are shown
history_df = dt.history()
history_df.select("version", "timestamp").show(truncate=False)

In [0]:
%%sparksql

-- Set amount to 450.56 for the row(s) whose trx_id is '1995601912'
update default.sales_delta_managed set amount = 450.56 where trx_id = '1995601912';

In [0]:
# Let's check the table history again to see its current version

version_log = dt.history().select("version", "timestamp")
version_log.show(truncate=False)

In [0]:
%%sparksql

-- Preview up to 10 rows of the managed Delta table
select * from default.sales_delta_managed limit 10;

In [0]:
# Check, for each warehouse location, whether it holds a Delta table

for table_path in (
    "spark-warehouse/sales_managed/",
    "spark-warehouse/sales_delta_managed/",
):
    print(DeltaTable.isDeltaTable(spark, table_path))

In [0]:
# Shortcut for converting a Parquet location into a Delta table
# Here the sales_managed table's location is converted to Delta

parquet_location = "parquet.`spark-warehouse/sales_managed`"
DeltaTable.convertToDelta(spark, parquet_location)

In [0]:
# Re-check whether each warehouse location holds a Delta table

for warehouse_dir in (
    "spark-warehouse/sales_managed/",
    "spark-warehouse/sales_delta_managed/",
):
    print(DeltaTable.isDeltaTable(spark, warehouse_dir))

In [0]:
%%sparksql

-- Show the extended description (columns plus detailed table metadata) of default.sales_managed
describe extended default.sales_managed;

In [0]:
%%sparksql

-- Convert the table to Delta, this time referencing it by its catalog name
CONVERT TO DELTA default.sales_managed;

In [0]:
%%sparksql

-- Describe default.sales_managed again, following the CONVERT TO DELTA statement in the previous cell
describe extended default.sales_managed;