In [0]:
# Resolve the storage URL of the `silver` and `gold` external locations.
# Each DESCRIBE result is narrowed to its `url` column and the first row's value is kept.
# NOTE(review): later cells append folder names directly (e.g. f"{silver}dh_transactions"),
# so these URLs are presumably expected to end with a path separator - confirm.
silver_url_rows = spark.sql(''' describe external location `silver` ''').select('url').collect()
silver = silver_url_rows[0][0]
gold_url_rows = spark.sql(''' describe external location `gold` ''').select('url').collect()
gold = gold_url_rows[0][0]

In [0]:
from pyspark.sql.functions import col, row_number, sha2, concat_ws
from pyspark.sql.window import Window
from delta.tables import DeltaTable

In [0]:
def create_sales_fact():
    """Build the gold `sales_fact` table from silver transactions and gold dimensions.

    Reads `dh_transactions` from the silver external location, left-joins it to
    the `dim_time`, `dim_product`, `dim_household`, `dim_store` and `dim_trade`
    tables in `carbo_catalog`.`gold` to pick up their key columns, derives a
    SHA-256 `transaction_key`, and writes the result to
    `carbo_catalog`.`gold`.sales_fact:

    * if the table already exists, a Delta MERGE on `transaction_key`
      (update all columns on match, insert otherwise);
    * otherwise an initial overwrite registered at `{gold}sales_fact`.

    Relies on the notebook globals `spark`, `silver`, `gold` and `display`.

    Returns:
        None. Progress is reported with print(); the fact frame is shown with
        display() after a successful write. Returns early (without writing)
        when there are no rows to process.

    Raises:
        Exception: any failure is printed and then re-raised unchanged.
    """
    silver_path = f"DELTA.`{silver}dh_transactions`"
    gold_fact_path = f"{gold}sales_fact"
    table_name = "`carbo_catalog`.`gold`.sales_fact"

    print("Starting pipeline for sales_fact")
    try:
        # Step 1: Read silver data
        df_silver = spark.sql(f"SELECT * FROM {silver_path}")

        # Step 2: Read dimension tables
        df_time = spark.sql("SELECT * FROM `carbo_catalog`.`gold`.dim_time")
        df_product = spark.sql("SELECT * FROM `carbo_catalog`.`gold`.dim_product")
        df_household = spark.sql("SELECT * FROM `carbo_catalog`.`gold`.dim_household")
        df_store = spark.sql("SELECT * FROM `carbo_catalog`.`gold`.dim_store")
        df_trade = spark.sql("SELECT * FROM `carbo_catalog`.`gold`.dim_trade")

        # Step 3: Join silver data with dimension tables.
        # Every join is a LEFT join, so a transaction with no matching dimension
        # row is kept and simply carries a NULL for that dimension's key.
        df_fact = df_silver.join(
            df_time,
            (df_silver.day == df_time.day) &
            (df_silver.week == df_time.week) &
            (df_silver.time_of_transaction == df_time.time_of_transaction),
            "left"
        ).join(
            df_product,
            df_silver.upc == df_product.upc,
            "left"
        ).join(
            df_household,
            df_silver.household == df_household.household,
            "left"
        ).join(
            df_store,
            df_silver.store == df_store.store,
            "left"
        ).join(
            df_trade,
            (df_silver.upc == df_trade.upc) &
            (df_silver.store == df_trade.store) &
            (df_silver.week == df_trade.week),
            "left"
        ).select(
            df_silver.day,
            df_silver.time_of_transaction,
            df_silver.store,
            df_silver.household,
            df_silver.basket,
            df_silver.upc,
            df_silver.dollar_sales,
            df_silver.units,
            df_silver.coupon,
            df_time.time_key,
            df_product.product_key,
            df_household.household_key,
            df_store.store_key,
            df_trade.trade_key
        )

        # Step 4: Add transaction_key (SHA-256 over the "-"-joined natural key columns).
        # NOTE(review): concat_ws skips NULL inputs, so two rows that differ only in
        # which of these columns is NULL can hash to the same key - confirm the
        # silver columns are non-null before relying on key uniqueness.
        df_fact = df_fact.withColumn(
            "transaction_key",
            sha2(concat_ws("-", "day", "time_of_transaction", "store", "household", "basket", "upc"), 256)
        )

        # Step 5: Select final columns
        df_fact = df_fact.select(
            "transaction_key",
            "time_key",
            "product_key",
            "household_key",
            "store_key",
            "trade_key",
            "dollar_sales",
            "units",
            "coupon",
            "basket"
        )

        # Step 6: Write to Delta Lake with incremental load.
        # Probe for a single row instead of calling count(): count() would run the
        # whole join pipeline over every row just to learn whether anything exists,
        # and the same plan is executed again by the write below.
        if not df_fact.take(1):
            print("No data to process for sales_fact")
            return

        if spark.catalog.tableExists(table_name):
            print("Performing incremental load for sales_fact")
            # NOTE(review): a Delta MERGE fails when several source rows match the
            # same target row; nothing here enforces that transaction_key is unique
            # in df_fact - verify against the silver data / dim_trade grain.
            delta_table = DeltaTable.forPath(spark, gold_fact_path)
            delta_table.alias("trg").merge(
                df_fact.alias("src"),
                "trg.transaction_key = src.transaction_key"
            ).whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()
            print("Incremental update completed for sales_fact")
        else:
            print("Performing initial load for sales_fact")
            df_fact.write.format("delta")\
                .mode("overwrite")\
                .option("path", gold_fact_path)\
                .saveAsTable(table_name)
            print("Initial load completed for sales_fact")

        display(df_fact)

    except Exception as e:
        # Log for the notebook output, then re-raise so the cell/job still fails.
        print(f"Pipeline failed for sales_fact: {e}")
        raise

    print("Pipeline completed for sales_fact")

In [0]:
# Run the sales_fact pipeline: initial load if the gold table does not exist yet,
# otherwise a MERGE on transaction_key. Any failure is re-raised by the function.
create_sales_fact()

In [0]:
# Row count of the gold sales_fact table.
# To count the silver source instead, point the query at
# `carbo_catalog`.`silver`.dh_transactions.
fact_count_query = "SELECT count(*) FROM `carbo_catalog`.`gold`.sales_fact"
df = spark.sql(fact_count_query)
display(df)

In [0]:
# Preview the silver transactions by reading the Delta folder under the
# `silver` external-location URL directly (path-based, not via a catalog table).
silver_transactions_ref = f"DELTA.`{silver}dh_transactions`"
df_silver = spark.sql(f"SELECT * FROM {silver_transactions_ref}")
display(df_silver)