In [1]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.utils import *
from delta import *
import hashlib
import pandas as pd
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', 1000)  

In [2]:
import pyspark
from delta import *

builder = pyspark.sql.SparkSession.builder.appName("NB_DIM_Wallet") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

# Spark will automatically use the master specified in spark-defaults.conf
spark = configure_spark_with_delta_pip(builder).getOrCreate()

spark

In [3]:
trgt_path_processed = "/mnt/Wallet/Wallet_Parquet"
trgt_path_csv = "/mnt/Wallet/Wallet_Processed"
mount_d ="/data/"
trgt_path = '/mnt/'
source_path = mount_d+"Template.xlsm"
sheet_name ='SPENDING_HISTORY'


In [4]:
# Read the Excel file (use Spark-Excel library)
df = pd.read_excel(source_path, sheet_name = sheet_name)
df = spark.createDataFrame(df)
df= df.withColumn("Date", to_date(df["Date"],"yyyy-MM-dd"))

In [5]:
df.createOrReplaceTempView("vw_src")
query = "select distinct `Wallet used` from vw_src"
print(query)
df_src = spark.sql(query)
df_src.show()

select distinct `Wallet used` from vw_src
+--------------------+
|         Wallet used|
+--------------------+
|Credit Card - Amazon|
|               G-pay|
|       Amazon Wallet|
|           ICICI Pay|
+--------------------+



In [6]:
df_output = \
df_src.withColumn("Walletsk",xxhash64("Wallet used")).withColumnRenamed("Wallet used","WalletUsed")

In [7]:
df_output.show()

+--------------------+--------------------+
|          WalletUsed|            Walletsk|
+--------------------+--------------------+
|Credit Card - Amazon| 4962050552325147656|
|               G-pay| 1782127853235431588|
|       Amazon Wallet|-2996829708924805941|
|           ICICI Pay| 6156857810389859050|
+--------------------+--------------------+



In [8]:
df_output.createOrReplaceTempView("vw_source")

In [9]:
column_name = df_output.columns
set_clause = ", ".join([f"target.{i} = source.{i}" for i in column_name])
query = f"MERGE INTO delta.`{trgt_path_processed}` AS target USING vw_source AS source ON target.Walletsk = source.Walletsk WHEN MATCHED THEN UPDATE SET {set_clause}"
print(query)

MERGE INTO delta.`/mnt/Wallet/Wallet_Parquet` AS target USING vw_source AS source ON target.Walletsk = source.Walletsk WHEN MATCHED THEN UPDATE SET target.WalletUsed = source.WalletUsed, target.Walletsk = source.Walletsk


In [10]:
if DeltaTable.isDeltaTable(spark, trgt_path_processed):
    spark.sql(query)
else :
    spark.sql(f"CREATE TABLE delta.`{trgt_path_processed}` USING DELTA AS SELECT * FROM vw_source")

In [11]:
# Save the DataFrame to a CSV file
spark.read.format("delta").load(trgt_path_processed).coalesce(1).write.format("csv").option("header","true").mode("overwrite").save(trgt_path_csv)

In [12]:
spark.read.format("delta").load(trgt_path_processed).show()

+--------------------+--------------------+
|          WalletUsed|            Walletsk|
+--------------------+--------------------+
|           ICICI Pay| 6156857810389859050|
|       Amazon Wallet|-2996829708924805941|
|               G-pay| 1782127853235431588|
|Credit Card - Amazon| 4962050552325147656|
+--------------------+--------------------+

