In [0]:
import ast

from delta.tables import DeltaTable
from pyspark.sql.functions import col

In [0]:
# Notebook parameters: (widget name, default value) pairs, registered in order.
# The list-like defaults are stored as text and parsed when the widgets are read.
_WIDGET_DEFAULTS=(
    ("catalog","flights"),
    ("source_schema","silver"),
    ("source_object","silver_bookings"),
    ("target_schema","gold"),
    ("target_object","fact_bookings"),
    ("cdc_col","modified_date"),
    ("fact_columns","['amount','booking_date','modified_date']"),
    ("fact_key_cols","['dims_passengers_key','dims_flights_key','dims_airports_key']"),
    ("backdated_refresh",""),
    ("fact_table_alias","fact"),
)

for _widget_name,_widget_default in _WIDGET_DEFAULTS:
    dbutils.widgets.text(_widget_name,_widget_default)


In [0]:

def _parse_str_list(raw_value,widget_name):
    """Parse a widget value such as "['a','b']" into a list of strings.

    Uses ast.literal_eval instead of eval so that widget text (which is
    caller-supplied job input) can only ever be a Python literal and is
    never executed as code.

    Args:
        raw_value: the raw widget text.
        widget_name: widget name, used only in error messages.

    Returns:
        A new list of strings.

    Raises:
        ValueError: if the text is not a list/tuple literal of strings.
    """
    try:
        parsed=ast.literal_eval(raw_value)
    except (ValueError,SyntaxError) as exc:
        raise ValueError(
            f"Widget '{widget_name}' must be a Python list literal of strings, got: {raw_value!r}"
        ) from exc
    if not isinstance(parsed,(list,tuple)) or not all(isinstance(item,str) for item in parsed):
        raise ValueError(
            f"Widget '{widget_name}' must be a Python list literal of strings, got: {raw_value!r}"
        )
    return list(parsed)


#catalog
catalog=dbutils.widgets.get("catalog")

#source schema
source_schema=dbutils.widgets.get("source_schema")

#source object
source_object=dbutils.widgets.get("source_object")

#target schema
target_schema=dbutils.widgets.get("target_schema")

#target object
target_object=dbutils.widgets.get("target_object")

#cdc column name (used for incremental loading and as the merge update condition)
cdc_col=dbutils.widgets.get("cdc_col")

#fully qualified source / target table names
fact_source_table=f"{catalog}.{source_schema}.{source_object}"
fact_target_table=f"{catalog}.{target_schema}.{target_object}"

#fact column names (selected from the fact source table)
fact_columns=_parse_str_list(dbutils.widgets.get("fact_columns"),"fact_columns")

#backdated refresh (empty string means "derive the watermark from the target table")
backdated_refresh=dbutils.widgets.get("backdated_refresh")

#fact table alias
fact_alias=dbutils.widgets.get("fact_table_alias")

#fact key columns (selected unqualified from the joined dimensions)
fact_key_cols=_parse_str_list(dbutils.widgets.get("fact_key_cols"),"fact_key_cols")

(fact_columns,fact_key_cols)

In [0]:
# Dimension tables joined onto the fact source.
# Each entry provides:
#   "table"     - fully qualified dimension table name
#   "alias"     - SQL alias used for the dimension in the generated query
#   "fact_keys" - [dimension-side column, fact-side column] used in the join condition
# Written as a plain list literal: the f-strings are resolved directly, so no
# eval() of a source-code string is needed.
dimensions=[
    {
        "table":f"{catalog}.{target_schema}.dim_passengers",
        "alias":"dim_passengers",
        "fact_keys":["passenger_id","passenger_id"]
    },
    {
        "table":f"{catalog}.{target_schema}.dim_flights",
        "alias":"dim_flights",
        "fact_keys":["flight_id","flight_id"]
    },
    {
        "table":f"{catalog}.{target_schema}.dim_airports",
        "alias":"dim_airports",
        "fact_keys":["airport_id","airport_id"]
    }
]

(dimensions)

In [0]:
#Compute Last load date
# Precedence: explicit backdated_refresh widget value > max(cdc_col) already in
# the target table > fixed "beginning of time" default.
DEFAULT_LAST_LOAD_DATE="1900-01-01 00:00:00"

# A whitespace-only widget value is treated the same as an empty one.
last_load_date=backdated_refresh.strip()

if not last_load_date:
    last_load_date=DEFAULT_LAST_LOAD_DATE
    if spark.catalog.tableExists(f"{fact_target_table}"):
        max_cdc_value=spark.sql(f"""
                                 SELECT max({cdc_col}) From {fact_target_table}
                                 """).collect()[0][0]
        # max() over an empty table is NULL (None in Python). Keeping None here
        # would render DATE('None') in the generated filter and load no rows,
        # so only override the default when a real watermark exists.
        if max_cdc_value is not None:
            last_load_date=max_cdc_value

#checking last_load_date
(last_load_date)

In [0]:
def generateQuery(dimensions,fact_columns,fact_source_table,fact_target_table,
                  alias=None,key_cols=None,cdc_column=None,since=None):
    """Build the incremental SELECT joining the fact source to its dimensions.

    Args:
        dimensions: list of dicts with "table", "alias" and "fact_keys"
            ([dimension-side column, fact-side column]) describing each LEFT JOIN.
        fact_columns: column names selected from the fact source, qualified
            with the fact alias.
        fact_source_table: fully qualified name of the fact source table.
        fact_target_table: accepted for interface compatibility; not used
            when building the query.
        alias: SQL alias for the fact source. Defaults to the notebook-level
            ``fact_alias``.
        key_cols: key column names selected unqualified. Defaults to the
            notebook-level ``fact_key_cols``.
        cdc_column: change-tracking column used in the WHERE filter. Defaults
            to the notebook-level ``cdc_col``.
        since: lower bound for the filter. Defaults to the notebook-level
            ``last_load_date``.

    Returns:
        The SQL query string.
    """
    # Fall back to the notebook globals only when a value is not passed in,
    # so existing four-argument calls keep working unchanged.
    alias=fact_alias if alias is None else alias
    key_cols=fact_key_cols if key_cols is None else key_cols
    cdc_column=cdc_col if cdc_column is None else cdc_column
    since=last_load_date if since is None else since

    #creating select clause
    # One flat join, so an empty fact_columns / key_cols list cannot leave a
    # dangling comma in the SELECT list.
    select_clause=", ".join([f"{alias}.{name}" for name in fact_columns]+list(key_cols))

    join_clause=" ".join(
        [f"LEFT JOIN {dim['table']} AS {dim['alias']} ON {dim['alias']}.{dim['fact_keys'][0]}={alias}.{dim['fact_keys'][1]}" for dim in dimensions]
        )

    where_clause=f"""
        {alias}.{cdc_column}>=DATE('{since}')
    """

    # The FROM alias must be the same alias used to qualify the columns above;
    # a hard-coded "fact" breaks the query for any other alias value.
    query=(f"SELECT {select_clause} FROM {fact_source_table} AS {alias} {join_clause} WHERE {where_clause}").strip()

    return query

In [0]:
# Build the incremental-load SQL from the notebook configuration, then run it.
query=generateQuery(
    dimensions=dimensions,
    fact_columns=fact_columns,
    fact_source_table=fact_source_table,
    fact_target_table=fact_target_table,
)
df_fact=spark.sql(query)

# Preview the rows that will be upserted into the fact table.
df_fact.display()

In [0]:
#creating merge condition
# Build a NEW list rather than appending to fact_key_cols: aliasing the list and
# appending would add "booking_date" to fact_key_cols again on every re-run of
# this cell and leak it into any later query generation.
merge_list=[*fact_key_cols,"booking_date"]
on_merge_condition=" AND ".join([f"tgt.{key_col}=src.{key_col}" for key_col in merge_list])


#Upserting in the gold_fact table
if spark.catalog.tableExists(f"{fact_target_table}"):
    delta_table=DeltaTable.forName(spark,f"{fact_target_table}")

    # Matched rows are updated only when the source row is at least as recent as
    # the target row (per cdc_col); unmatched source rows are inserted.
    delta_table.alias("tgt").merge(df_fact.alias("src"),f"{on_merge_condition}")\
                    .whenMatchedUpdateAll(condition=f"src.{cdc_col}>=tgt.{cdc_col}")\
                    .whenNotMatchedInsertAll()\
                    .execute()
else:
    # First load: the target table does not exist yet, so create it from the data.
    df_fact.write.format("delta").mode("append")\
        .saveAsTable(f"{fact_target_table}")


In [0]:
%sql
-- Preview of the loaded fact table. The table name is hard-coded and matches the
-- default widget values (catalog=flights, target_schema=gold, target_object=fact_bookings).
SELECT * FROM flights.gold.fact_bookings