In [1]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.utils import *
from delta import *
import hashlib
import pandas as pd
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', 1000)  

In [2]:
import pyspark
from delta import *

builder = pyspark.sql.SparkSession.builder.appName("NB_DIM_Category") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

# Spark will automatically use the master specified in spark-defaults.conf
spark = configure_spark_with_delta_pip(builder).getOrCreate()

spark

In [3]:
trgt_path_processed = "/mnt/Category/Category_Parquet"
trgt_path_csv = "/mnt/Category/Category_Processed"
mount_d ="/data/"
trgt_path = '/mnt/'
source_path = mount_d+"Template.xlsm"
sheet_name ='SPENDING_HISTORY'


In [4]:
# Read the Excel file (use Spark-Excel library)
df = pd.read_excel(source_path, sheet_name = sheet_name)
df = spark.createDataFrame(df)
df= df.withColumn("Date", to_date(df["Date"],"yyyy-MM-dd"))

In [5]:
df.createOrReplaceTempView("vw_src")
query = "select distinct Category from vw_src"
print(query)
df_src = spark.sql(query)
df_src.show()

select distinct Category from vw_src
+------------------+
|          Category|
+------------------+
|            Chitty|
|        Travelling|
|   Mobile Recharge|
|              Food|
|        Investment|
|              Rent|
|        House Hold|
|Internet Recharges|
+------------------+



In [6]:
df_output = \
df_src.withColumn("categorysk",xxhash64("category"))\
        .withColumn("UpdateTimeStamp", date_format(current_timestamp(), "yyyy-MM-dd HH:mm:ss").cast("timestamp"))\
            .withColumn("RowSK",xxhash64(concat_ws("|", *[col(c) for c in df_src.columns])))

In [7]:
df_output.createOrReplaceTempView("vw_source")

In [8]:
column_name = df_output.columns
set_clause = ", ".join([f"target.{i} = source.{i}" for i in column_name])
query = f"MERGE INTO delta.`{trgt_path_processed}` AS target USING vw_source AS source ON target.categorysk = source.categorysk AND target.RowSK <> source.RowSK WHEN MATCHED THEN UPDATE SET {set_clause}"
print(query)

MERGE INTO delta.`/mnt/Category/Category_Parquet` AS target USING vw_source AS source ON target.categorysk = source.categorysk AND target.RowSK <> source.RowSK WHEN MATCHED THEN UPDATE SET target.Category = source.Category, target.categorysk = source.categorysk, target.UpdateTimeStamp = source.UpdateTimeStamp, target.RowSK = source.RowSK


In [9]:
if DeltaTable.isDeltaTable(spark, trgt_path_processed):
    spark.sql(query)
else :
    spark.sql(f"CREATE TABLE delta.`{trgt_path_processed}` USING DELTA AS SELECT * FROM vw_source")

In [10]:
# Save the DataFrame to a CSV file
spark.read.format("delta").load(trgt_path_processed).coalesce(1).write.format("csv").option("header","true").mode("overwrite").save(trgt_path_csv)

In [11]:
#spark.read.format("delta").load(trgt_path_processed).show()