In [None]:
import warnings

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import functions as F
from pyspark.sql import Window
from pyspark.sql.utils import AnalysisException
from delta.tables import DeltaTable

# Hide FutureWarning messages so the cell outputs below stay readable.
warnings.filterwarnings("ignore", category=FutureWarning)

# Start (or reuse) a Spark session configured for Delta Lake:
#   - spark.jars.packages names the delta-core 2.3.0 artifact to load,
#   - spark.sql.extensions registers Delta's SQL session extension,
#   - spark.sql.catalog.spark_catalog points the session catalog at DeltaCatalog.
spark = (
    SparkSession.builder
    .appName("DeltaSession")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.3.0")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

## Parquet tables don't have schema enforcement

In [None]:
# Sample rows: a first name paired with an age.
data = [
    ("bob", 47),
    ("li", 23),
    ("leonard", 51),
]
columns = ["first_name", "age"]

df = spark.createDataFrame(data, schema=columns)

# Show the small frame as a pandas table.
df.toPandas()

In [None]:
# Write the frame as plain Parquet files, replacing whatever is already at this path.
(
    df.write
    .format("parquet")
    .mode("overwrite")
    .save("extract/03schema_parquet")
)

In [None]:
# Rows with a different second column: favorite_color instead of age.
data = [
    ("sal", "red"),
    ("cat", "pink"),
]
columns = ["first_name", "favorite_color"]

df = spark.createDataFrame(data, schema=columns)

# Show the small frame as a pandas table.
df.toPandas()

In [None]:
# Append the current frame to the same Parquet directory written earlier.
# Per this section's heading, plain Parquet files are not expected to reject
# rows whose columns differ from what is already stored there.
df.write.save("extract/03schema_parquet", format="parquet", mode="append")

In [None]:
# Read the Parquet directory back and display it as a pandas table.
spark.read.load("extract/03schema_parquet", format="parquet").toPandas()

## Parquet tables with metastore have schema enforcement


In [None]:
# Same (first_name, age) sample rows, rebuilt for the metastore-table demo.
data = [
    ("bob", 47),
    ("li", 23),
    ("leonard", 51),
]
columns = ["first_name", "age"]

df = spark.createDataFrame(data, schema=columns)

# Show the small frame as a pandas table.
df.toPandas()

In [None]:
# Save the frame as a Parquet-backed table named "03schema_parquet2".
# With no explicit mode the writer errors when the table already exists, so
# re-running this cell would fail. "overwrite" replaces the table instead, and
# the demo always starts from this frame's (first_name, age) columns.
# NOTE(review): on a fresh kernel, leftover warehouse files from an earlier
# session may still need manual cleanup — confirm in your environment.
(
    df.write
    .format("parquet")
    .mode("overwrite")
    .saveAsTable("03schema_parquet2")
)

In [None]:
# Rows whose second column is favorite_color rather than age.
data = [
    ("sal", "red"),
    ("cat", "pink"),
]
columns = ["first_name", "favorite_color"]

df = spark.createDataFrame(data, schema=columns)

# Show the small frame as a pandas table.
df.toPandas()

In [None]:
# Try to append the current frame to the "03schema_parquet2" table.
# Per this section's heading the table enforces its schema, so the append is
# expected to be rejected when the frame's columns do not match. That rejection
# is the point of the demo: catch and display it so "Run All" can continue
# instead of stopping on an unhandled exception.
try:
    df.write.mode("append").format("parquet").saveAsTable("03schema_parquet2")
except AnalysisException as exc:
    print(f"Append rejected by schema enforcement:\n{exc}")
else:
    # Not the outcome this demo is meant to show; make it visible.
    print("Append succeeded - the table did not reject the mismatched schema.")

## Delta Lake schema enforcement is built-in

In [None]:
# (first_name, age) sample rows for the Delta schema-enforcement demo.
data = [
    ("bob", 47),
    ("li", 23),
    ("leonard", 51),
]
columns = ["first_name", "age"]

df = spark.createDataFrame(data, schema=columns)

# Show the small frame as a pandas table.
df.toPandas()

In [None]:
# Create the Delta table at extract/03schema_enf1.
# With no explicit mode the writer errors when the path already holds data, so
# re-running the notebook would fail here. mode("overwrite") keeps the cell
# re-runnable, and overwriteSchema resets the table to this frame's columns
# even if an earlier run left a different schema behind.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save("extract/03schema_enf1")
)

In [None]:
# Rows with favorite_color in place of age, to be appended to the Delta table.
data = [
    ("sal", "red"),
    ("cat", "pink"),
]
columns = ["first_name", "favorite_color"]

df = spark.createDataFrame(data, schema=columns)

# Show the small frame as a pandas table.
df.toPandas()

In [None]:
# Try to append the current frame to the Delta table at extract/03schema_enf1.
# Per this section's heading, Delta Lake enforces the table schema, so an
# append with mismatched columns is expected to be rejected. That rejection is
# the point of the demo: catch and display it so "Run All" can continue
# instead of stopping on an unhandled exception.
try:
    df.write.mode("append").format("delta").save("extract/03schema_enf1")
except AnalysisException as exc:
    print(f"Append rejected by Delta schema enforcement:\n{exc}")
else:
    # Can happen if schema auto-merge was already enabled in this session.
    print("Append succeeded - the schema mismatch was not rejected.")

## Allow schema evolution on a single write with option("mergeSchema", "true")

In [None]:
# (first_name, age) sample rows used to seed the mergeSchema demo table.
data = [
    ("bob", 47),
    ("li", 23),
    ("leonard", 51),
]
columns = ["first_name", "age"]

df = spark.createDataFrame(data, schema=columns)

# Show the small frame as a pandas table.
df.toPandas()

In [None]:
# Create the Delta table at extract/03schema_enf2.
# With no explicit mode the writer errors when the path already holds data, so
# re-running the notebook would fail here. mode("overwrite") keeps the cell
# re-runnable. overwriteSchema resets the table to this frame's columns, which
# matters because the mergeSchema append later in this section widens the
# table's schema on every run.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save("extract/03schema_enf2")
)

In [None]:
# Rows carrying a favorite_color column that the seeded table does not have.
data = [
    ("sal", "red"),
    ("cat", "pink"),
]
columns = ["first_name", "favorite_color"]

df = spark.createDataFrame(data, schema=columns)

# Show the small frame as a pandas table.
df.toPandas()

In [None]:
# Append to the Delta table with the per-write mergeSchema option switched on,
# so this write is allowed to bring along the frame's extra column.
df.write.save(
    "extract/03schema_enf2",
    format="delta",
    mode="append",
    mergeSchema="true",
)

In [None]:
# Read the Delta table back and display it as a pandas table.
spark.read.load("extract/03schema_enf2", format="delta").toPandas()

## Enable schema merging for the whole session (spark.databricks.delta.schema.autoMerge.enabled)

In [None]:
# Switch on Delta's schema auto-merge at the session level; the append later in
# this section omits the per-write .option("mergeSchema", "true").
spark.conf.set(
    "spark.databricks.delta.schema.autoMerge.enabled",
    "true",
)

In [None]:
# (first_name, age) sample rows used to seed the session-level autoMerge demo.
data = [
    ("bob", 47),
    ("li", 23),
    ("leonard", 51),
]
columns = ["first_name", "age"]

df = spark.createDataFrame(data, schema=columns)

# Show the small frame as a pandas table.
df.toPandas()

In [None]:
# Create the Delta table at extract/03schema_enf3.
# With no explicit mode the writer errors when the path already holds data, so
# re-running the notebook would fail here. mode("overwrite") keeps the cell
# re-runnable. overwriteSchema resets the table to this frame's columns, which
# matters because the append later in this section widens the table's schema
# on every run.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save("extract/03schema_enf3")
)

In [None]:
# Rows carrying a favorite_color column that the seeded table does not have.
data = [
    ("sal", "red"),
    ("cat", "pink"),
]
columns = ["first_name", "favorite_color"]

df = spark.createDataFrame(data, schema=columns)

# Show the small frame as a pandas table.
df.toPandas()

In [None]:
# Append to the Delta table with no per-write mergeSchema option; this write
# relies on the session-level autoMerge setting enabled earlier in the section.
df.write.save(
    "extract/03schema_enf3",
    format="delta",
    mode="append",
)

In [None]:
# Read the Delta table back and display it as a pandas table.
spark.read.load("extract/03schema_enf3", format="delta").toPandas()