In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Obtain the active SparkSession, or create one, under this notebook's application name.
# Note: no S3-specific options are configured on the builder here.
spark = (
    SparkSession.builder
    .appName("AWS S3 Integration and Data Processing")
    .getOrCreate()
)

# AWS S3 credentials must never be written into the notebook, not even inside comments.
# If S3 access is needed, pull the keys from a Databricks secret scope (or environment variables), e.g.:
# spark._jsc.hadoopConfiguration().set("fs.s3a.access.key", dbutils.secrets.get(scope="<scope>", key="<access-key-name>"))
# spark._jsc.hadoopConfiguration().set("fs.s3a.secret.key", dbutils.secrets.get(scope="<scope>", key="<secret-key-name>"))
# NOTE(review): a literal key pair was previously embedded here; treat it as compromised and rotate it.

# Define S3 bucket and file paths (S3 variant kept for reference; the DBFS path below is the one in use)
# bucket_name = "iiht-ntt-24"
# sale_order_file_path = f"s3a://{bucket_name}/data.csv/SaleOrder.csv"
sale_order_file_path = "dbfs:/FileStore/tables/SalesOrder.csv"

# CSV reader options, passed as strings to DataFrameReader.option()
infer_schema = "true"          # let Spark infer column types from the data
first_row_is_header = "true"   # first line of the file holds the column names
delimiter = ","                # field separator

# Load the sales CSV, preview it, and (re)create SalesTable as a Parquet table stored at table_path.
# Any failure is reported with a printed message; the cell itself does not raise.
try:
    # Load the CSV at sale_order_file_path into a DataFrame using the options defined above
    dfSales = (
        spark.read.format("csv")
        .option("inferSchema", infer_schema)
        .option("header", first_row_is_header)
        .option("sep", delimiter)
        .load(sale_order_file_path)
    )

    # Display the DataFrame to verify contents
    dfSales.show()

    table_name = "SalesTable"
    # The previous code called spark.conf.get() with this path as the *config key*, which in
    # practice always fell through to the default value; assign the location directly instead.
    warehouse_location = "dbfs:/FileStore/tables"
    table_path = f"{warehouse_location}/{table_name.lower()}"

    # Clear any files left from an earlier run. Call rm unconditionally rather than probing with
    # dbutils.fs.ls first: ls raises when the path does not exist, which made a first run fail
    # before the table was ever written. rm on a missing path is expected to just return False.
    dbutils.fs.rm(table_path, recurse=True)

    # IF EXISTS already makes the drop a no-op when the table is absent, so no separate
    # existence check is needed.
    spark.sql(f"DROP TABLE IF EXISTS {table_name}")

    # Write the data as Parquet under table_path and register it in the metastore
    dfSales.write.format("parquet").mode("overwrite").option("path", table_path).saveAsTable(table_name)
    print(f"Table {table_name} created successfully.")

except Exception as e:
    print(f"An error occurred while processing Sales data: {e}")

# Read the SilverCustomer table and keep only the rows that have a CustomerKey.
# Produces silver_customers_df, which the following cell writes out as a Delta table.
try:
    # Load the table registered under the name "SilverCustomer"
    # NOTE(review): nothing in this notebook creates SilverCustomer; it is assumed to exist already — confirm.
    permanent_table_df = spark.table("SilverCustomer")

    # Filter rows where CustomerKey is not null
    silver_customers_df = permanent_table_df.filter(col("CustomerKey").isNotNull())

    silver_customers_df.show()

except Exception as e:
    print(f"An error occurred during Delta table operations: {e}")
    # Re-raise after reporting: if this step fails, silver_customers_df is never defined and the
    # next cell would otherwise die with an unrelated NameError that hides the real cause.
    raise



+-------+----------+----------+------+
|OrderID|CustomerID| OrderDate|Amount|
+-------+----------+----------+------+
|   1001|   CUST001|2024-01-15| 250.0|
|   1002|   CUST002|2024-01-17| 150.5|
|   1003|   CUST003|2024-01-19|300.75|
|   1004|   CUST001|2024-02-10|450.25|
|   1005|   CUST004|2024-02-12| 500.0|
|   1006|   CUST002|2024-03-05| 125.8|
|   1007|   CUST005|2024-03-07|  60.0|
|   1008|   CUST006|2024-03-09| 220.9|
|   1009|   CUST003|2024-04-01|  75.3|
|   1010|   CUST001|2024-04-10| 350.0|
+-------+----------+----------+------+

Table SalesTable created successfully.
+-----------+-------------+-----------+--------------+--------+-----------+--------------+----------+----------+----------+----------+
|CustomerKey|WWICustomerID|   Customer|BillToCustomer|Category|BuyingGroup|PrimaryContact|PostalCode| ValidFrom|   ValidTo|LineageKey|
+-----------+-------------+-----------+--------------+--------+-----------+--------------+----------+----------+----------+----------+
|        

In [0]:
# Persist silver_customers_df as a Delta table on DBFS, then read it back and display it.

# Register the DataFrame as a temporary view so it can be queried with Spark SQL
silver_customers_df.createOrReplaceTempView("temp_silver_customers_table")

# Define the Delta table path
delta_table_location = "/FileStore/tables/temp_silver_customers_table_delta"
# Remove any previous contents recursively so the write starts from an empty directory
dbutils.fs.rm(delta_table_location, True)

# Save the DataFrame as a Delta table
silver_customers_df.write.format("delta").mode("overwrite").save(delta_table_location)
print(f"Delta table saved at {delta_table_location}")

# Access and display the saved Delta table
delta_table_df = spark.read.format("delta").load(delta_table_location)
# The reloaded DataFrame was previously never shown; display it so the write can be verified
delta_table_df.show()


Delta table saved at /FileStore/tables/temp_silver_customers_table_delta
