In [0]:
from pyspark.sql.functions import *

In [0]:
# Look up the earliest and latest purchase timestamps in silver.orders; they
# anchor the calendar range that the date dimension has to cover.
# NOTE: `min` / `max` here are the Spark aggregate functions brought in by the
# wildcard import from pyspark.sql.functions, not the Python builtins.
date_bounds = (
    spark.table("silver.orders")
    .select(
        min("order_purchase_timestamp").alias("min_date"),
        max("order_purchase_timestamp").alias("max_date")
    )
    .collect()[0]
)

# Aggregating an empty table (or an all-NULL column) yields NULL bounds; fail
# with a clear message instead of an AttributeError on None further down.
if date_bounds["min_date"] is None or date_bounds["max_date"] is None:
    raise ValueError(
        "silver.orders has no non-null order_purchase_timestamp values; "
        "cannot derive the dim_date range"
    )


def _shift_years(base_date, years):
    """Return `base_date` moved by `years` calendar years.

    date.replace(year=...) raises ValueError for 29 February when the target
    year is not a leap year, so that one case falls back to 28 February.
    """
    target_year = base_date.year + years
    try:
        return base_date.replace(year=target_year)
    except ValueError:
        # Only reachable for 29 Feb -> non-leap year; an out-of-range year
        # raises again here and propagates unchanged.
        return base_date.replace(year=target_year, day=28)


# Pad the observed range by one year on each side.
start_date = _shift_years(date_bounds["min_date"].date(), -1)
end_date = _shift_years(date_bounds["max_date"].date(), 1)


In [0]:
# Date spine: one row per calendar day from start_date through end_date,
# exposed as a single `full_date` column. The bounds are interpolated as
# ISO-formatted dates (str() of a datetime.date) and parsed by to_date.
calendar_days_query = f"""
        SELECT explode(
            sequence(
                to_date('{start_date}'),
                to_date('{end_date}'),
                interval 1 day
            )
        ) AS full_date
    """
dim_date = spark.sql(calendar_days_query)

In [0]:
# Calendar attributes derived from full_date, listed in the order they are
# appended to the frame. Order matters: is_weekend reads the day_of_week
# column that is added just before it.
derived_columns = [
    # Integer surrogate key in yyyyMMdd form, e.g. 20180131.
    ("date_key", date_format("full_date", "yyyyMMdd").cast("int")),
    ("year", year("full_date")),
    ("month", month("full_date")),
    ("month_name", date_format("full_date", "MMMM")),
    ("day", dayofmonth("full_date")),
    # Spark numbers weekdays 1 = Sunday ... 7 = Saturday.
    ("day_of_week", dayofweek("full_date").cast("int")),
    ("day_name", date_format("full_date", "EEEE")),
    # weekofyear follows ISO-8601 weeks, so the first days of January can
    # carry week 52/53 even though `year` is the calendar year.
    ("week_of_year", weekofyear("full_date")),
    # 1 for Sunday (1) or Saturday (7), otherwise 0.
    ("is_weekend", when(col("day_of_week").isin(1, 7), 1).otherwise(0)),
]

for column_name, column_expr in derived_columns:
    dim_date = dim_date.withColumn(column_name, column_expr)


In [0]:
# Render the finished dimension as a table for a visual sanity check.
# NOTE(review): DataFrame.display() looks like a notebook-runtime extension
# rather than core PySpark — confirm before running this outside the notebook.
dim_date.display()

In [0]:
# Total rows in the dimension (one per generated calendar day); shown as the
# cell output so it can be compared with the distinct-key count.
dim_date_row_count = dim_date.count()
dim_date_row_count

In [0]:
# date_key is the dimension's key, so it must be unique. Enforce that here
# instead of only displaying the number: a failed assertion stops a Run All /
# job run before any later cell executes.
distinct_date_keys = dim_date.select("date_key").distinct().count()
assert distinct_date_keys == dim_date.count(), (
    "dim_date.date_key contains duplicate values"
)
distinct_date_keys

In [0]:
# A NULL date_key would make the row unjoinable from fact tables. Enforce the
# check rather than only displaying the count; the count is still shown as
# the cell output when the check passes.
null_date_keys = dim_date.filter(col("date_key").isNull()).count()
assert null_date_keys == 0, (
    f"dim_date has {null_date_keys} row(s) with a NULL date_key"
)
null_date_keys

In [0]:
# Publish the dimension as a Delta table at the gold-layer path.
# mode("overwrite") replaces whatever is already stored there, and
# overwriteSchema=true allows the stored schema to be replaced as well.
gold_dim_date_path = "abfss://olist-data@retailds.dfs.core.windows.net/gold/dim_date"

(
    dim_date.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(gold_dim_date_path)
)