In [None]:
--- a/spark_scripts/03_load_to_bigquery.py
+++ b/spark_scripts/03_load_to_bigquery.py
@@ -1,52 +1,133 @@
 # Assuming you are running this in a Databricks notebook
-from pyspark.sql import SparkSession
-from pyspark.sql.functions import col
-
-# spark = SparkSession.builder.appName("EcomDataToBigQuery").getOrCreate()
-
-silver_data_path = "gs://your-project-id-processed-data/silver/"
-bigquery_project = "your-gcp-project-id" # Replace with your GCP project ID
-bigquery_dataset = "ecom_data_warehouse"
-
-# Configure BigQuery connector
-# You might need to set up service account key for your Databricks cluster
-# to access BigQuery if not already configured at workspace level.
-# For example, by passing options in the write command or cluster settings.
-# Ensure your Databricks cluster has the BigQuery connector installed (usually comes pre-installed).
-
-# Read silver sales data
-print("Reading silver sales data...")
-df_silver_sales = spark.read.parquet(f"{silver_data_path}sales_silver")
-print("Silver sales data read.")
-
-# Write to BigQuery
-# Ensure your BigQuery table exists or let Spark create it if you use 'createIfNotExist'
-table_name = "sales_fact"
-print(f"Writing sales data to BigQuery table: {bigquery_dataset}.{table_name}")
-df_silver_sales.write.format("bigquery") \
-    .option("table", f"{bigquery_project}:{bigquery_dataset}.{table_name}") \
-    .option("temporaryGcsBucket", "your-project-id-processed-data") # A temporary bucket for BigQuery connector
-    .mode("overwrite") \
-    .save()
-print(f"Sales data loaded into BigQuery table: {bigquery_dataset}.{table_name}")
-
-# Read silver products data
-print("Reading silver products data...")
-df_silver_products = spark.read.parquet(f"{silver_data_path}products_silver")
-print("Silver products data read.")
-
-# Write products to BigQuery
-table_name = "products_dim"
-print(f"Writing products data to BigQuery table: {bigquery_dataset}.{table_name}")
-df_silver_products.write.format("bigquery") \
-    .option("table", f"{bigquery_project}:{bigquery_dataset}.{table_name}") \
-    .option("temporaryGcsBucket", "your-project-id-processed-data")
-    .mode("overwrite") \
-    .save()
-print(f"Products data loaded into BigQuery table: {bigquery_dataset}.{table_name}")
-
-print("All silver data loaded to BigQuery.")
+# spark_scripts/03_load_to_bigquery.py
+# This script is intended to be run as a Databricks job (e.g., from Airflow using DatabricksSubmitRunOperator)
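+# It reads from Delta tables (output of DLT), writes them to BigQuery staging tables, and prints
+# the MERGE statement that a downstream task must execute to complete the upsert.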
+
+import sys
+import random
+from datetime import datetime
+
+from pyspark.sql import SparkSession
+from pyspark.sql.functions import col, lit, current_timestamp
+from pyspark.sql.types import IntegerType, DoubleType, StringType, DateType, BooleanType
+
+def _build_merge_sql(target_table: str, staging_table: str, columns: list, key_columns: list) -> str:
+    """
+    Builds the BigQuery MERGE statement that upserts a staging table into a target table.
+
+    Every column identifier is wrapped in backticks in the ON, UPDATE SET and INSERT clauses,
+    so column names that collide with SQL keywords are quoted the same way everywhere.
+
+    Args:
+        target_table (str): Fully qualified target table, "project.dataset.table".
+        staging_table (str): Fully qualified staging table, "project.dataset.table".
+        columns (list): All column names present in the staging table.
+        key_columns (list): Column names used to match target rows (T) to staging rows (S).
+
+    Returns:
+        str: The MERGE statement, including the surrounding newlines/indentation of the template.
+    """
+    # Match names case-insensitively so a key given as 'Order_ID' still pairs with an
+    # 'order_id' column and is left out of the UPDATE SET list.
+    key_lookup = {key.lower() for key in key_columns}
+
+    merge_join_conditions = " AND ".join(f"T.`{key}` = S.`{key}`" for key in key_columns)
+
+    # Only the key columns are skipped here; every other column, including
+    # bq_load_timestamp, is overwritten on a match.
+    # If a column represents an SCD Type 2 attribute, you'd handle it differently (e.g., end_date, new row)
+    update_set_statement = ", ".join(
+        f"T.`{name}` = S.`{name}`" for name in columns if name.lower() not in key_lookup
+    )
+
+    insert_columns = ", ".join(f"`{name}`" for name in columns)
+    insert_values = ", ".join(f"S.`{name}`" for name in columns)
+
+    return f"""
+        MERGE INTO `{target_table}` T
+        USING `{staging_table}` S
+        ON {merge_join_conditions}
+        WHEN MATCHED THEN
+            UPDATE SET {update_set_statement}
+        WHEN NOT MATCHED THEN
+            INSERT ({insert_columns}) VALUES ({insert_values});
+    """
+
+
+def upsert_to_bigquery(
+    spark: SparkSession,
+    source_delta_table_path: str, # Path to the Delta table, e.g., 'gs://<bucket>/dlt_storage/tables/silver_sales'
+    target_bigquery_project: str,
+    target_bigquery_dataset: str,
+    target_bigquery_table: str,
+    unique_key_columns: list, # List of column names that form the unique key for matching rows
+    temporary_gcs_bucket: str # A GCS bucket for temporary BigQuery staging
+):
+    """
+    Stages a Delta table in BigQuery and prints the MERGE statement that completes the upsert.
+
+    The function reads the Delta table, adds a ``bq_load_timestamp`` column, writes the result
+    to a uniquely named staging table in the target dataset, and prints both the staging table
+    name and the MERGE statement. It does NOT execute the MERGE and does NOT drop the staging
+    table; a downstream task (e.g. an Airflow BigQuery operator) is expected to do both.
+    Assumes the BigQuery connector is properly configured in the Spark environment.
+
+    Args:
+        spark (SparkSession): The active SparkSession.
+        source_delta_table_path (str): Path to the source Delta table.
+        target_bigquery_project (str): GCP project ID for BigQuery.
+        target_bigquery_dataset (str): BigQuery dataset name.
+        target_bigquery_table (str): BigQuery table name to upsert into.
+        unique_key_columns (list): List of column names that form the unique key for matching rows.
+        temporary_gcs_bucket (str): A GCS bucket for temporary BigQuery staging.
+
+    Raises:
+        ValueError: If ``unique_key_columns`` is empty, or names a column that the source
+            Delta table does not contain.
+    """
+    # Without a key the MERGE would have an empty ON clause; fail before touching any data.
+    if not unique_key_columns:
+        raise ValueError("unique_key_columns must contain at least one column name.")
+
+    print(f"Starting upsert for {target_bigquery_table}...")
+    print(f"Reading data from Delta table: {source_delta_table_path}")
+
+    # Read the data from the Delta table
+    source_df = spark.read.format("delta").load(source_delta_table_path)
+
+    if source_df.isEmpty():
+        print(f"No new data found in Delta table: {source_delta_table_path}. Skipping upsert.")
+        return
+
+    # Reject unknown key columns before a staging table is created for a MERGE that cannot work.
+    available_columns = {name.lower() for name in source_df.columns}
+    missing_keys = [key for key in unique_key_columns if key.lower() not in available_columns]
+    if missing_keys:
+        raise ValueError(
+            f"Unique key column(s) {missing_keys} not found in Delta table: {source_delta_table_path}"
+        )
+
+    # NOTE: count() is a separate Spark action that exists only for this log line.
+    print(f"Read {source_df.count()} rows from {source_delta_table_path}.")
+
+    # Add a timestamp to the source_df to track when it was loaded into the staging area
+    # This helps with debugging and lineage if needed in BigQuery.
+    source_df = source_df.withColumn("bq_load_timestamp", current_timestamp())
+
+    # Define the BigQuery target table in the format "project.dataset.table"
+    bq_full_table_name = f"{target_bigquery_project}.{target_bigquery_dataset}.{target_bigquery_table}"
+
+    # Create a unique temporary staging table name for BigQuery
+    temp_bq_table_name = f"{target_bigquery_table}_staging_{datetime.now().strftime('%Y%m%d%H%M%S')}_{random.randint(0, 9999)}"
+    temp_bq_full_table_name = f"{target_bigquery_project}.{target_bigquery_dataset}.{temp_bq_table_name}"
+
+    print(f"Writing data to temporary BigQuery staging table: {temp_bq_full_table_name}")
+    # Write to a temporary staging table in BigQuery.
+    # This requires the BigQuery connector and GCS access configured for your Spark cluster.
+    source_df.write.format("bigquery") \
+        .option("table", temp_bq_full_table_name) \
+        .option("temporaryGcsBucket", temporary_gcs_bucket) \
+        .mode("overwrite") \
+        .save()
+    print("Data written to staging table successfully.")
+
+    # Generate the MERGE statement for BigQuery.
+    # The statement is not executed here: it is printed so that a separate task
+    # (e.g. Airflow's BigQueryExecuteQueryOperator) can run the MERGE and then drop
+    # the staging table.
+    merge_sql_query = _build_merge_sql(
+        bq_full_table_name,
+        temp_bq_full_table_name,
+        source_df.columns,
+        unique_key_columns,
+    )
+
+    print("\nBigQuery MERGE statement to be executed:")
+    print(merge_sql_query)
+
+    # Print the MERGE SQL and the temp table name for Airflow to use.
+    # A robust solution might return these values or write them to a task XCom.
+    print(f"\nTemporary BigQuery staging table: {temp_bq_full_table_name}")
+    print(f"MERGE SQL: {merge_sql_query.strip()}")
+
+
+if __name__ == "__main__":
+    # Validate the command line first so a bad invocation exits before a
+    # SparkSession is started.
+    # These parameters would typically be passed from Airflow
+    if len(sys.argv) != 5:
+        print("Usage: 03_load_to_bigquery.py <dlt_storage_path> <gcp_project_id> <bq_dataset> <temp_gcs_bucket>")
+        sys.exit(1)
+
+    # Trailing slashes are dropped so the table paths built below never contain "//".
+    dlt_storage_path = sys.argv[1].rstrip("/") # e.g., 'gs://your-project-id-processed-data/dlt_storage'
+    gcp_project_id = sys.argv[2]
+    bq_dataset = sys.argv[3]
+    temp_gcs_bucket = sys.argv[4]
+
+    # One entry per upsert, processed in order:
+    #   (Delta table name under <dlt_storage_path>/tables, BigQuery target table, unique key columns)
+    # Each key column is assumed to be the primary key of its target table.
+    upsert_jobs = [
+        ("silver_sales", "sales_fact", ["order_id"]),
+        ("silver_products", "products_dim", ["product_id"]),
+        ("silver_customers", "customers_dim", ["customer_id"]),
+    ]
+
+    spark = SparkSession.builder \
+        .appName("EcomDataUpsertToBigQuery") \
+        .getOrCreate()
+
+    try:
+        for delta_table_name, bq_table_name, key_columns in upsert_jobs:
+            upsert_to_bigquery(
+                spark,
+                f"{dlt_storage_path}/tables/{delta_table_name}", # Path where DLT writes this table
+                gcp_project_id,
+                bq_dataset,
+                bq_table_name,
+                key_columns,
+                temp_gcs_bucket
+            )
+    finally:
+        # Stop the session even when one of the upserts raises.
+        spark.stop()
+
+    print("All BigQuery upsert processes initiated.")
