In [0]:
# Make bronze_store the session's current database so that the unqualified
# table name "payment" used further down is looked up there.
use_bronze_db_sql = "USE DATABASE bronze_store"
spark.sql(use_bronze_db_sql)

In [0]:
# Read date column from payment table
from pyspark.sql.functions import *

# Load the "payment" table from the current database and keep only its
# "date" column, cast to the date type.
payment_table_df = spark.read.table("payment")
payment_dt_df = payment_table_df.select(col("date").cast('date'))

# Render the single-column result in the notebook.
display(payment_dt_df)

date
2019-05-01
2019-06-01
2019-07-01
2019-08-01
2019-09-01
2019-10-01
2019-11-01
2019-12-01
2020-01-01
2020-02-01


In [0]:
# Print the schema of payment_dt_df; its only column, "date", was cast to the
# date type when the DataFrame was built.
payment_dt_df.printSchema()

In [0]:
# Calculate the minimum payment date and pull it back to the driver as a
# plain Python value (the first field of the single aggregated row).
min_dt = (
    payment_dt_df
        # `min` is pyspark.sql.functions.min (brought in by the wildcard
        # import), not the Python builtin.
        .select(min(col("date")))
        .first()
        [0]
)

# A global aggregate over an empty table, or over a column that is entirely
# null, still returns one row, but that row holds None. Fail here with a
# clear message instead of letting the date arithmetic in a later cell break
# with an opaque TypeError.
if min_dt is None:
    raise ValueError(
        "payment.date has no non-null values; cannot determine the minimum payment date"
    )

print(type(min_dt))
print(min_dt)

In [0]:
# Calculate the maximum payment date and pull it back to the driver as a
# plain Python value (the first field of the single aggregated row).
max_dt = (
    payment_dt_df
        # `max` is pyspark.sql.functions.max (brought in by the wildcard
        # import), not the Python builtin.
        .select(max(col("date")))
        .first()
        [0]
)

# A global aggregate over an empty table, or over a column that is entirely
# null, still returns one row, but that row holds None. Fail here with a
# clear message instead of letting the date arithmetic in a later cell break
# with an opaque TypeError.
if max_dt is None:
    raise ValueError(
        "payment.date has no non-null values; cannot determine the maximum payment date"
    )

print(type(max_dt))
print(max_dt)

In [0]:
# Generate a continuous list of dates, one per day, from the minimum payment
# date through the date 10 calendar years after the maximum payment date
# (both ends inclusive).

from datetime import timedelta

# How many calendar years past the last payment date the calendar extends.
CALENDAR_YEARS_AHEAD = 10

# Add whole calendar years rather than 365-day blocks: 365 * 10 days ignores
# leap days and would stop two or three days short of the 10-year mark.
try:
    calendar_end_dt = max_dt.replace(year = max_dt.year + CALENDAR_YEARS_AHEAD)
except ValueError:
    # max_dt is Feb 29 and the target year is not a leap year; fall back to
    # Feb 28 of the target year.
    calendar_end_dt = max_dt.replace(year = max_dt.year + CALENDAR_YEARS_AHEAD, day = 28)

# The end date is computed once, above; the "+ 1" makes the range include
# calendar_end_dt itself.
calendar_list = [
    min_dt + timedelta(days = day_offset)
    for day_offset in range((calendar_end_dt - min_dt).days + 1)
]

# Print a summary rather than the whole list, which holds thousands of dates.
print(f"{len(calendar_list)} dates generated: {calendar_list[0]} to {calendar_list[-1]}")

In [0]:
# Convert the list of dates generated earlier to calendar dataframe by extending the date column to multiple required attributes: month, year, day of week, etc.
from pyspark.sql.types import DateType

# One row per generated date. The single column produced by createDataFrame
# is renamed from "value" to "date_value".
calendar_dates_df = (
    spark
        .createDataFrame(calendar_list, DateType())
        .withColumnRenamed("value", "date_value")
)

# Shared reference to the date column, reused by every derived attribute.
date_col = col("date_value")

# Integer key in yyyymmdd form, e.g. 2013-02-01 -> 20130201.
date_id_col = year(date_col) * 10000 + month(date_col) * 100 + dayofmonth(date_col)

calendar_df = calendar_dates_df.select(
    date_id_col.alias("date_id"),
    date_col,
    month(date_col).alias("month"),
    # 'MMMM' yields the full month name, e.g. "February".
    date_format(date_col, 'MMMM').alias("month_name"),
    year(date_col).alias("year"),
    quarter(date_col).alias("quarter"),
    # 'E' yields the abbreviated day name, e.g. "Fri".
    date_format(date_col, 'E').alias("day_of_week"),
)

display(calendar_df)

date_id,date_value,month,month_name,year,quarter,day_of_week
20130201,2013-02-01,2,February,2013,1,Fri
20130202,2013-02-02,2,February,2013,1,Sat
20130203,2013-02-03,2,February,2013,1,Sun
20130204,2013-02-04,2,February,2013,1,Mon
20130205,2013-02-05,2,February,2013,1,Tue
20130206,2013-02-06,2,February,2013,1,Wed
20130207,2013-02-07,2,February,2013,1,Thu
20130208,2013-02-08,2,February,2013,1,Fri
20130209,2013-02-09,2,February,2013,1,Sat
20130210,2013-02-10,2,February,2013,1,Sun


In [0]:
# Create the Gold_Store database if it does not exist yet, so the gold
# tables have somewhere to be written.
create_gold_db_sql = "CREATE DATABASE IF NOT EXISTS Gold_Store"
spark.sql(create_gold_db_sql)

# Make Gold_Store the current database; the star schema's dimension and fact
# tables saved with unqualified names after this point are created there.
use_gold_db_sql = "USE DATABASE Gold_Store"
spark.sql(use_gold_db_sql)

In [0]:
# Persist the calendar DataFrame as the Delta table "dim_date", overwriting
# whatever the table held before. The table name is unqualified, so it is
# created in whichever database is current when this cell runs.
dim_date_writer = calendar_df.write.format("delta").mode("overwrite")
dim_date_writer.saveAsTable("dim_date")