## Dynamic Partition Overwrite

In [0]:
# Start (or reuse) a Spark session for this demo
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("Partition Overwrite")
    .master("local[2]")  # local mode with 2 worker threads
    .getOrCreate()       # returns the already-running session if there is one
)

# Last expression of the cell: display the session object
spark

In [0]:
# Baseline orders dataset: five orders spread over four distinct order dates
# NOTE(review): `cast` is imported here but not used by this cell.
from pyspark.sql.functions import cast, to_date

_cols = ["order_id", "prod_id", "qty", "order_date"]

_data = [
    ["ORD1001", "P003", 70, "01-21-2022"],
    ["ORD1004", "P033", 12, "01-24-2022"],
    ["ORD1005", "P036", 10, "01-20-2022"],
    ["ORD1002", "P016", 2, "01-10-2022"],
    ["ORD1003", "P012", 6, "01-10-2022"],
]

# Build the DataFrame and parse order_date from an "MM-dd-yyyy" string into a date
df = (
    spark.createDataFrame(data=_data, schema=_cols)
    .withColumn("order_date", to_date("order_date", "MM-dd-yyyy"))
)

df.printSchema()
df.show()

In [0]:
# Pin the partition overwrite mode to "static" (Spark's documented default) and
# display it.
#
# Why set it instead of only reading it: the session is obtained with
# getOrCreate(), which reuses an already-running session, and a later cell
# switches this setting to "dynamic". Re-running the notebook from the top in
# the same kernel would otherwise leave "dynamic" in effect, and the first
# (static) overwrite demonstration below would silently behave like the second.
spark.conf.set("spark.sql.sources.partitionOverwriteMode", "static")
spark.conf.get("spark.sql.sources.partitionOverwriteMode")

In [0]:
# Let's repartition the data by order_date and write it out as parquet,
# one output directory per order_date value (partitionBy).
(
    df.repartition("order_date")
    .write
    .format("parquet")
    .partitionBy("order_date")
    .mode("overwrite")
    .save("dataset/orders_partitioned")
)

In [0]:
%%sh

# Long listing of the output directory after the baseline write
# (-t sorts by modification time, -r reverses it, so the oldest entry comes first).
ls -ltr dataset/orders_partitioned/

In [0]:
# Validate data: read the dataset back and count rows per order_date
from pyspark.sql.functions import count, lit

(
    spark.read.parquet("dataset/orders_partitioned/")
    .groupBy("order_date")
    .agg(count(lit(1)))
    .show()
)

In [0]:
# Let's create the incremental ("delta") batch used for the overwrite.
# It holds two new orders, for order dates 01-24-2022 and 01-20-2022.
_cols = ["order_id", "prod_id", "qty", "order_date"]

_data = [
    ["ORD1010", "P053", 78, "01-24-2022"],
    ["ORD1011", "P076", 21, "01-20-2022"],
]

# Build the delta DataFrame and parse order_date from an "MM-dd-yyyy" string into a date
delta_df = (
    spark.createDataFrame(data=_data, schema=_cols)
    .withColumn("order_date", to_date("order_date", "MM-dd-yyyy"))
)

delta_df.printSchema()
delta_df.show()

In [0]:
# Let's write the delta batch to the same location as the partitioned orders.
# With partitionOverwriteMode at "static", mode("overwrite") replaces the whole
# dataset at this path, not just the partitions present in delta_df.
(
    delta_df.repartition("order_date")
    .write
    .format("parquet")
    .partitionBy("order_date")
    .mode("overwrite")
    .save("dataset/orders_partitioned")
)

In [0]:
%%sh

# Long listing of the output directory after the delta overwrite, oldest entry
# first (-t sorts by modification time, -r reverses), to see which partition
# directories remain.
ls -ltr dataset/orders_partitioned/

In [0]:
# Validate data: row count per order_date after the delta overwrite
from pyspark.sql.functions import count, lit

(
    spark.read.parquet("dataset/orders_partitioned/")
    .groupBy("order_date")
    .agg(count(lit(1)))
    .show()
)

### Let's follow the same example, but this time with partitionOverwriteMode set to "DYNAMIC"

In [0]:
# Switch the session's partition overwrite mode to dynamic, then read the
# setting back so the cell output confirms the change.
_overwrite_mode_key = "spark.sql.sources.partitionOverwriteMode"

spark.conf.set(_overwrite_mode_key, "dynamic")
spark.conf.get(_overwrite_mode_key)

In [0]:
# Baseline orders dataset again (same five rows as in the static walkthrough)
# NOTE(review): `cast` is imported here but not used by this cell.
from pyspark.sql.functions import cast, to_date

_cols = ["order_id", "prod_id", "qty", "order_date"]

_data = [
    ["ORD1001", "P003", 70, "01-21-2022"],
    ["ORD1004", "P033", 12, "01-24-2022"],
    ["ORD1005", "P036", 10, "01-20-2022"],
    ["ORD1002", "P016", 2, "01-10-2022"],
    ["ORD1003", "P012", 6, "01-10-2022"],
]

# Build the DataFrame and parse order_date from an "MM-dd-yyyy" string into a date
df = (
    spark.createDataFrame(data=_data, schema=_cols)
    .withColumn("order_date", to_date("order_date", "MM-dd-yyyy"))
)

df.printSchema()
df.show()

In [0]:
# Let's repartition the baseline data by order_date and write it out again,
# one output directory per order_date value (partitionBy).
(
    df.repartition("order_date")
    .write
    .format("parquet")
    .partitionBy("order_date")
    .mode("overwrite")
    .save("dataset/orders_partitioned")
)

In [0]:
%%sh

# Long listing of the output directory after re-writing the baseline data
# (-t sorts by modification time, -r reverses it, so the oldest entry comes first).
ls -ltr dataset/orders_partitioned/

In [0]:
# Validate data: read the dataset back and count rows per order_date
from pyspark.sql.functions import count, lit

(
    spark.read.parquet("dataset/orders_partitioned/")
    .groupBy("order_date")
    .agg(count(lit(1)))
    .show()
)

In [0]:
# Let's create the incremental ("delta") batch used for the overwrite.
# It holds two new orders, for order dates 01-24-2022 and 01-10-2022.
_cols = ["order_id", "prod_id", "qty", "order_date"]

_data = [
    ["ORD1010", "P053", 78, "01-24-2022"],
    ["ORD1011", "P076", 21, "01-10-2022"],
]

# Build the delta DataFrame and parse order_date from an "MM-dd-yyyy" string into a date
delta_df = (
    spark.createDataFrame(data=_data, schema=_cols)
    .withColumn("order_date", to_date("order_date", "MM-dd-yyyy"))
)

delta_df.printSchema()
delta_df.show()

In [0]:
# Let's write the delta batch to the same location as the partitioned orders.
# With partitionOverwriteMode set to "dynamic", mode("overwrite") only replaces
# the partitions that delta_df actually writes to; other partitions are kept.
(
    delta_df.repartition("order_date")
    .write
    .format("parquet")
    .partitionBy("order_date")
    .mode("overwrite")
    .save("dataset/orders_partitioned")
)

In [0]:
%%sh

# Long listing of the output directory after the delta overwrite, oldest entry
# first (-t sorts by modification time, -r reverses), to see which partition
# directories were rewritten.
ls -ltr dataset/orders_partitioned/

In [0]:
# Validate data: row count per order_date after the delta overwrite
from pyspark.sql.functions import count, lit

(
    spark.read.parquet("dataset/orders_partitioned/")
    .groupBy("order_date")
    .agg(count(lit(1)))
    .show()
)