In [0]:
# Drop any widgets left over from a previous run, then recreate the three
# text widgets this notebook reads, each with its default value.
dbutils.widgets.removeAll()
for widget_name, default_value in (
    ("table", "powerPlay"),
    ("dst_table_name", "powerPlay_rebuild"),
    ("checkpoint_suffix", "_rebuild"),
):
    dbutils.widgets.text(widget_name, default_value)

In [0]:
# Workaround: add the parent folder to sys.path so its modules can be imported below.
import os
import sys
sys.path.append(f"{os.getcwd()}/..")

from pathlib import Path
import json
from delta.tables import DeltaTable
from pyspark.sql.functions import struct, to_json, sha2, col, current_timestamp
from pyspark.sql.functions import to_timestamp, concat, regexp_extract, lit, date_format
from functions import create_table_if_not_exists, rename_columns, cast_data_types
from layer_02_silver import silver_upsert

# Name of the table being rebuilt; it selects the settings file read below.
table = dbutils.widgets.get("table")

# Load ../layer_02_silver/<table>.json. Fail with an explicit message when the
# file does not exist instead of leaking a bare StopIteration from next().
settings_path = next(Path().glob(f"../layer_02_silver/{table}.json"), None)
if settings_path is None:
    raise FileNotFoundError(
        f"No settings file found for table '{table}' "
        f"(looked for ../layer_02_silver/{table}.json)"
    )
settings = json.loads(settings_path.read_text())

# Variables (json file)
# dst_table_name is intentionally not taken from the JSON here: the widget
# value read further down is the one this notebook uses.
src_table_name          = settings.get("src_table_name")
readStreamOptions       = settings.get("readStreamOptions")
writeStreamOptions      = settings.get("writeStreamOptions")
composite_key           = settings.get("composite_key")
business_key            = settings.get("business_key")
column_map              = settings.get("column_map")
data_type_map           = settings.get("data_type_map")

# Cheat sheet for the widgets:
# table: The table you are rebuilding.
# dst_table_name: The name of the rebuilt table (reusing the existing name
#     overwrites that table).
# checkpoint_suffix: Appended to the configured checkpoint location, because
#     the rebuild stream needs a fresh checkpoint folder (alternatively,
#     delete the current one).
dst_table_name = dbutils.widgets.get("dst_table_name")
# NOTE(review): only this local name is overridden; `settings` still holds the
# dst_table_name from the JSON file. If silver_upsert takes the destination
# from `settings`, it will not see the widget value -- confirm.
checkpoint_suffix = dbutils.widgets.get("checkpoint_suffix")
checkpoint_location = writeStreamOptions["checkpointLocation"]
checkpoint_location_rebuild = checkpoint_location.rstrip("/") + checkpoint_suffix
# writeStreamOptions is the same dict object that `settings` references, so
# this in-place update is also visible through `settings`.
writeStreamOptions["checkpointLocation"] = checkpoint_location_rebuild


# Empty the destination table before replaying the source's change data.
spark.sql(f"TRUNCATE TABLE {dst_table_name}")

# Commit history of the source table, oldest first. Only the version number
# and the operation name are needed to cut the replay into version ranges.
history = (
    spark.sql(f"DESCRIBE HISTORY {src_table_name}")
    .select("version", "operation")
    .orderBy("version")
    .collect()
)

# Find SET TBLPROPERTIES (CDF enabled). The loop keeps overwriting the index,
# so the LAST matching commit wins.
# NOTE(review): any SET TBLPROPERTIES commit matches, not only one that turns
# on the change data feed -- confirm this is the intended starting point.
cdf_enable_index = None
for i, row in enumerate(history):
    if row["operation"] == "SET TBLPROPERTIES":
        cdf_enable_index = i

if cdf_enable_index is None:
    # ValueError is still an Exception, so existing handlers keep working.
    raise ValueError("CDF not enabled on this table")

# Replay the history one pair of consecutive commits at a time, skipping every
# pair whose later commit is a schema change.
schema_change_ops = {"ADD COLUMNS", "DROP COLUMNS", "ALTER COLUMN"}
commit_pairs = zip(history[cdf_enable_index:], history[cdf_enable_index + 1:])
for current_commit, next_commit in commit_pairs:
    if next_commit["operation"] in schema_change_ops:
        continue
    start_version = current_commit["version"]
    end_version = next_commit["version"]
    # NOTE(review): every range reuses one checkpoint location; confirm that
    # startingVersion/endingVersion are still honoured for later ranges once
    # offsets have been recorded in that checkpoint.
    query = (
        spark.readStream
        .options(**readStreamOptions)
        .option("readChangeData", "true")
        .option("startingVersion", start_version)
        .option("endingVersion", end_version)
        .table(src_table_name)
        .writeStream
        .queryName(dst_table_name)
        .options(**writeStreamOptions)
        .trigger(availableNow=True)
        .foreachBatch(silver_upsert(spark, settings))
        .outputMode("update")
        .start()
    )
    # Every range uses the same query name and checkpoint location, and Spark
    # requires query names to be unique among active queries. Wait for this
    # range to finish before starting the next so ranges are applied in order.
    query.awaitTermination()

In [0]:
# Compare two tables row by row, ignoring the ingest_time column.
df1 = spark.table("edsm.silver.powerplay_test")
df2 = spark.table("edsm.silver.powerplay")

# Compare on df1's columns only; ingest_time is excluded from the comparison.
cols = [c for c in df1.columns if c != "ingest_time"]

# exceptAll keeps duplicate rows (EXCEPT ALL), whereas subtract is
# EXCEPT DISTINCT and would report tables that differ only in how many times a
# row occurs as identical.
only_in_df1 = df1.select(cols).exceptAll(df2.select(cols))
only_in_df2 = df2.select(cols).exceptAll(df1.select(cols))
diff = only_in_df1.union(only_in_df2)

if diff.count() == 0:
    print("Tables are the same")
else:
    print("Tables are different")

    # Show one sample row from each side to make the mismatch easy to inspect.
    print("Only in df1:")
    only_in_df1.show(1, vertical=True)

    print("Only in df2:")
    only_in_df2.show(1, vertical=True)
