In [0]:
from datetime import datetime, timedelta

# Input files land under <container>/csv/YYYY/MM/DD/. This cell reads
# yesterday's folder, on the assumption that the job runs the day after the
# files arrive.
# NOTE(review): datetime.today() uses the cluster's local clock/time zone;
# confirm that matches the dates used in the folder names.
process_date = (datetime.today() - timedelta(days=1)).strftime("%Y/%m/%d")
# process_date = datetime.today().strftime("%Y/%m/%d")  # use today's folder instead

path = f"abfss://input@adlssource0001.dfs.core.windows.net/csv/{process_date}/*.parquet"

# Parquet files carry their own schema, so the CSV-reader options
# "header" / "inferSchema" do nothing here and have been dropped.
raw_df = spark.read.parquet(path)

display(raw_df)

In [0]:
# ADLS Gen2 folder that holds the curated Delta table.
# NOTE(review): this is the same folder the SCD1 cell further down uses as
# delta_path, so running this cell replaces that table wholesale.
target_path = "abfss://output@adlstarget0001.dfs.core.windows.net/curated_csv/order_payments_delta"

# Full refresh: overwrite the data and the schema at target_path with raw_df.
raw_df.write.save(
    target_path,
    format="delta",
    mode="overwrite",
    overwriteSchema="true",
)


In [0]:
%sql
-- Register a metastore table over the Delta files at the given LOCATION.
-- IF NOT EXISTS makes this a no-op when the table is already registered.
-- NOTE(review): the table is named order_items_delta but the folder is
-- order_payments_delta; confirm which name is intended.
CREATE TABLE IF NOT EXISTS indiametastore.default.order_items_delta
USING DELTA
LOCATION "abfss://output@adlstarget0001.dfs.core.windows.net/curated_csv/order_payments_delta"

In [0]:
%sql
-- Discovery helpers: list the available catalogs, then the schemas inside
-- the indiametastore catalog. Nothing is modified.
SHOW CATALOGS;
SHOW SCHEMAS IN indiametastore;


In [0]:
# Load the registered table as a DataFrame by its three-part name
# (catalog.schema.table).
df = spark.table("indiametastore.default.order_items_delta")

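## SCD Type 1: upsert the latest file into the Delta table (changed rows are overwritten, no history is kept)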

In [0]:
from datetime import datetime
from pyspark.sql.functions import current_date, lit, row_number
from pyspark.sql.window import Window
from delta.tables import DeltaTable

# ==============================
# CONFIG
# ==============================
# Root folder of the incoming parquet files; the partition lookup in this
# cell expects YYYY/MM/DD sub-folders underneath it.
# NOTE(review): the original comment called these "Snowflake parquet exports"
# although the folder is named "csv"; confirm the actual upstream source.
csv_base = "abfss://input@adlssource0001.dfs.core.windows.net/csv"

# ADLS folder holding the SCD1 Delta table (the merge target).
# NOTE(review): identical to target_path in the earlier full-overwrite cell.
delta_path = "abfss://output@adlstarget0001.dfs.core.windows.net/curated_csv/order_payments_delta"

# Business-key columns: used to deduplicate the incoming rows and to build
# the MERGE join condition.
key_cols = ["ORDER_ID"]    # adjust if the table is keyed differently

# ==============================
# STEP 1: Find latest partition folder (YYYY/MM/DD)
# ==============================

def _latest_numeric_dir(parent_path, level):
    """Return the name of the numerically largest all-digit sub-folder.

    Args:
        parent_path: Folder to list (with or without a trailing slash).
        level: Human-readable level name ("year", "month", "day"), used only
            in the error message.

    Returns:
        The sub-folder name exactly as stored (zero padding preserved).

    Raises:
        ValueError: If parent_path contains no all-digit sub-folder.
    """
    listing = dbutils.fs.ls(parent_path.rstrip("/") + "/")
    # Directory names come back with a trailing slash; strip it.
    dir_names = [entry.name.rstrip("/") for entry in listing if entry.isDir()]
    # Ignore non-date folders (e.g. "archive", "_tmp"): as plain strings they
    # would sort after any digit and wrongly win a max().
    numeric_names = [n for n in dir_names if n.isascii() and n.isdigit()]
    if not numeric_names:
        raise ValueError(f"No {level} folders found under {parent_path}.")
    # Compare as integers so un-padded names order correctly ("9" < "10");
    # the name itself only breaks ties such as "1" vs "01".
    return max(numeric_names, key=lambda n: (int(n), n))


latest_year = _latest_numeric_dir(csv_base, "year")
latest_month = _latest_numeric_dir(f"{csv_base}/{latest_year}", "month")
latest_day = _latest_numeric_dir(f"{csv_base}/{latest_year}/{latest_month}", "day")

latest_path = f"{csv_base}/{latest_year}/{latest_month}/{latest_day}/*.parquet"
print(f"Reading from: {latest_path}")

# Folder date as text, built from the folder names as-is (no re-padding).
file_date = f"{latest_year}-{latest_month}-{latest_day}"

# ==============================
# STEP 2: Read raw data and add columns
# ==============================
raw_df = spark.read.parquet(latest_path)

# Schema auto-merge is left disabled, as in the original.
# NOTE(review): if the existing Delta table lacks ingest_date / file_date,
# the MERGE below may not write them unless this is enabled; confirm.
#spark.conf.set("spark.databricks.delta.schema.autoMerge.enabled", "true")

new_df = (raw_df
          .withColumn("ingest_date", current_date())   # load date
          .withColumn("file_date", lit(file_date)))    # folder date, stored as a string

# ==============================
# STEP 2.1: Deduplicate on key columns
# ==============================
# MERGE needs at most one source row per key. The previous row_number()
# window ordered by ingest_date, which is the same value on every row, so it
# kept an arbitrary row per key while reading as "keep latest".
# dropDuplicates states that contract directly and avoids the sort.
# NOTE(review): if the source has a real recency column (e.g. an updated-at
# timestamp), dedupe by that instead to make the surviving row deterministic.
new_df = new_df.dropDuplicates(key_cols)

# ==============================
# STEP 3: Merge into Delta (SCD1)
# ==============================
if DeltaTable.isDeltaTable(spark, delta_path):
    print("Delta table exists. Performing SCD1 merge.")

    # One equality predicate per key column, AND-ed together.
    # NOTE(review): plain "=" never matches NULL keys, so rows with a NULL
    # key would be inserted again on every run; confirm keys are non-null.
    key_predicates = [f"t.{c} = s.{c}" for c in key_cols]
    merge_condition = " AND ".join(key_predicates)

    target = DeltaTable.forPath(spark, delta_path).alias("t")
    merge_builder = target.merge(new_df.alias("s"), merge_condition)
    # SCD1: matched keys are overwritten in place, unseen keys are inserted.
    merge_builder = merge_builder.whenMatchedUpdateAll().whenNotMatchedInsertAll()
    merge_builder.execute()
else:
    print("Delta table path does not exist yet. Performing initial load.")
    # First run: nothing to merge into, so write new_df as the table.
    new_df.write.save(
        delta_path,
        format="delta",
        mode="overwrite",
        mergeSchema="true",
    )

# ==============================
# STEP 4: Reload and quick check
# ==============================
# Re-read the Delta folder so the sample reflects what was just written.
# NOTE(review): mergeSchema is kept exactly as before; it is normally a
# write-side / Parquet-reader option, so confirm it is needed on this read.
updated_df = spark.read.load(delta_path, format="delta", mergeSchema="true")

# Print ten untruncated rows of the source, then of the reloaded table.
for caption, frame in (
    ("Sample from source (raw_df):", raw_df),
    ("Sample from SCD1 Delta (updated_df):", updated_df),
):
    print(caption)
    frame.show(10, truncate=False)
