A date dimension is not 1-to-1 with fact rows: many fact rows can share the same calendar date.
Instead, you build a master calendar (the date dimension) once, and every fact row references the matching date in it.

In [None]:
from datetime import datetime
import pandas as pd

from pyspark.sql import functions as F
from pyspark.sql import Window
from pyspark.sql.types import DateType

In [None]:
# 1. Calendar span covered by the date dimension (both endpoints included)
start_date = datetime(year=2015, month=1, day=1)
end_date = datetime(year=2030, month=12, day=31)

# One pandas Timestamp per calendar day from start_date through end_date
date_list = pd.date_range(start_date, end_date, freq="D")

In [None]:
# flights_df = spark.read.table("unikargo_dev.01_bronze.unikargo_flights_bronze")

In [None]:
# Create dim_date table from data taken from the flights bronze data

# select only date-related fields and drop duplicates
# dim_date = (
#     flights_df
#     .select("YEAR", "MONTH", "DAY", "DAY_OF_WEEK")
#     .distinct()
#     # create a proper date field
#     .withColumn(
#         "full_date", 
#         F.to_date(F.concat_ws("-", flights_df.YEAR, flights_df.MONTH, flights_df.DAY))
#     )
# )

# # add surrogate key (date_sk)
# dim_date = dim_date.withColumn("date_sk", F.monotonically_increasing_id())

In [None]:
# 2. Build one dictionary (one future table row) per calendar date.
#
# - date_sk is a 1-based sequential surrogate key, assigned in calendar order.
# - full_date is emitted as an ISO 'YYYY-MM-DD' string; it is cast to a Spark
#   date after the DataFrame is created.
# - day_name / month_name use pandas' day_name() / month_name(), which return
#   English names by default. strftime('%A') / strftime('%B') follow the
#   process locale, so the stored names could change with the cluster's
#   LC_TIME setting.
date_data = [
    {
        'date_sk': i,
        'full_date': date.strftime('%Y-%m-%d'),  # convert to string
        'year': date.year,
        'month': date.month,
        'day': date.day,
        'day_of_week': date.isoweekday(),  # 1=Monday, 7=Sunday
        'day_name': date.day_name(),
        'month_name': date.month_name(),
        'quarter': (date.month - 1) // 3 + 1,
        'is_weekend': 1 if date.isoweekday() >= 6 else 0,  # Saturday or Sunday
    }
    for i, date in enumerate(date_list, 1)
]

In [None]:
# 3. Hand the Python rows to Spark; no schema is passed, so Spark infers one
dim_date_extended = spark.createDataFrame(date_data)

# full_date is still a string here, so print the inferred schema for reference
print("Schema before casting:")
dim_date_extended.printSchema()

In [None]:
# Replace the string full_date column with a real DateType column
full_date_as_date = F.col("full_date").cast(DateType())
dim_date_extended = dim_date_extended.withColumn("full_date", full_date_as_date)

print("\nSchema after casting:")
dim_date_extended.printSchema()

# Peek at the first few rows
print("\nSample data:")
dim_date_extended.show(5)

# Earliest and latest date in the dimension, to check it spans the flight data
print("\nNew date range:")
date_bounds = dim_date_extended.select(F.min("full_date"), F.max("full_date"))
date_bounds.show()

In [None]:
# Persist the date dimension, replacing any existing table contents and schema
print("Saving date dimension...")
(
    dim_date_extended.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("unikargo_dev.02_silver.unikargo_dim_date_silver")
)

In [None]:
# Read the table back to confirm the write: row count plus earliest/latest date
saved_dim_date = spark.table("unikargo_dev.02_silver.unikargo_dim_date_silver")
saved_row_count = saved_dim_date.count()
print("Saved table count:", saved_row_count)

saved_date_bounds = saved_dim_date.select(F.min("full_date"), F.max("full_date"))
saved_date_bounds.show()