In [0]:
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql.types import IntegerType


In [0]:
# Creating dim_date
def create_dim_date(start_date, end_date):
    """Build the ``dim_date`` Delta table with one row per calendar day.

    Args:
        start_date: First day of the range, as a string that Spark SQL
            ``to_date`` can parse (e.g. ``'2023-01-01'``).
        end_date: Last day of the range, in the same format.

    Returns:
        None. The rows are written to the ``dim_date`` table in ``overwrite`` mode.

    Raises:
        ValueError: If the generated date range contains no rows (for example
            because one of the bounds could not be parsed into a date).

    Note:
        Both arguments are interpolated directly into a SQL string, so they
        must come from trusted code, not from end-user input.
    """
    # Generate the daily sequence and explode it into rows inside Spark, instead of
    # collecting the array to the driver and rebuilding a DataFrame from a Python list.
    date_df = spark.sql(
        f"select explode(sequence(to_date('{start_date}'), to_date('{end_date}'), interval 1 day)) "
        "as date_alt_key"
    )

    # explode() of a NULL array yields zero rows; fail loudly rather than silently
    # overwriting dim_date with an empty table.
    if not date_df.head(1):
        raise ValueError(
            f"No dates generated for start_date={start_date!r}, end_date={end_date!r}"
        )

    # Derive the calendar attributes from the date itself.
    date_df = (
        date_df
        .withColumn('date_id', date_format(col("date_alt_key"), "yyyyMMdd"))  # string key, e.g. '20230101'
        .withColumn('year', year("date_alt_key"))
        .withColumn('quarter', quarter("date_alt_key"))
        .withColumn('month', month("date_alt_key"))
        .withColumn('day', dayofmonth("date_alt_key"))
        .withColumn('day_of_week', date_format(col("date_alt_key"), 'EEEE'))  # full day name
    )

    # Fix the column order of the dimension table.
    date_df = date_df.select('date_id', 'date_alt_key', 'year', 'quarter', 'month', 'day', 'day_of_week')

    # Writing to dim_date
    date_df.write.format('delta').mode('overwrite').saveAsTable('dim_date')

In [0]:
# Creating dim_time
def create_dim_time(start_time, end_time, interval):
    """Build the ``dim_time`` Delta table with one row per time-of-day step.

    Args:
        start_time: First time of the range as an ``'HH:mm:ss'`` string.
        end_time: Upper bound of the range as an ``'HH:mm:ss'`` string.
        interval: Step between consecutive rows, written as the body of a
            Spark SQL interval literal (e.g. ``'1 hour'``, ``'15 minutes'``).

    Returns:
        None. The rows are written to the ``dim_time`` table in ``overwrite`` mode.

    Raises:
        ValueError: If the generated time range contains no rows (for example
            because one of the bounds could not be parsed).

    Note:
        All three arguments are interpolated directly into a SQL string, so
        they must come from trusted code, not from end-user input.
    """
    # Generate the sequence of timestamps from start_time to end_time, stepping by the
    # caller-supplied interval, and explode it into rows inside Spark (no round-trip
    # through the driver).
    time_df = spark.sql(
        f"select explode(sequence(to_timestamp('{start_time}', 'HH:mm:ss'), "
        f"to_timestamp('{end_time}', 'HH:mm:ss'), interval '{interval}')) as time_alt"
    )

    # explode() of a NULL array yields zero rows; fail loudly rather than silently
    # overwriting dim_time with an empty table.
    if not time_df.head(1):
        raise ValueError(
            f"No times generated for start_time={start_time!r}, "
            f"end_time={end_time!r}, interval={interval!r}"
        )

    # Keep only the time-of-day part as the natural key.
    time_df = time_df.withColumn('time_alt_key', date_format(col('time_alt'), 'HH:mm:ss'))

    # Surrogate key: 1-based position when ordered by the 'HH:mm:ss' string.
    # The window has no partitionBy, so Spark processes all rows in a single partition.
    windowSpec = Window.orderBy('time_alt_key')
    time_df = time_df.withColumn('time_id', row_number().over(windowSpec).cast(IntegerType()))
    time_df = time_df.select('time_id', 'time_alt_key')

    # Writing to dim_time
    time_df.write.format('delta').mode('overwrite').saveAsTable('dim_time')

In [0]:
# Creating the dim_city
def create_dim_city(source_path='dbfs:/FileStore/shared_uploads/export_deltatable', max_rows=5):
    """Build the ``dim_city`` Delta table from an existing Delta export.

    Args:
        source_path: Location of the source Delta table. Defaults to the path
            this notebook has always read from.
        max_rows: Maximum number of rows to keep. Defaults to 5, the
            original hard-coded value. Pass ``None`` to keep every row.

    Returns:
        None. The rows are written to the ``dim_city`` table in ``overwrite`` mode.

    Note:
        No ordering is applied before the limit, so which rows are kept is
        whatever Spark returns first. NOTE(review): add an explicit ``orderBy``
        if a specific, repeatable set of cities is required.
    """
    city_df = spark.read.format('delta').load(source_path)

    # Keep only the columns the dimension exposes.
    city_df = city_df.select('id', 'name', 'lat', 'lon')
    if max_rows is not None:
        city_df = city_df.limit(max_rows)

    city_df.write.format('delta').mode('overwrite').saveAsTable('dim_city')