AUTHENTICATION AND DATA READING

In [0]:
# Enumerate the secret keys stored in the scope used by this notebook.
# (dbutils.secrets.listScopes() can be used to discover the scope names.)
# The bare call is the last expression, so its result is the cell output.
secret_scope = 'databricks-keyvault-scope'
dbutils.secrets.list(secret_scope)

[SecretMetadata(key='ApplicationId'),
 SecretMetadata(key='directoryid'),
 SecretMetadata(key='ServiceCredential')]

In [0]:
# Build the OAuth client-credentials settings for Azure Data Lake Storage Gen2.
# The client id, client secret and directory id are read from the Databricks
# secret scope instead of being written into the notebook.
secret_scope = 'databricks-keyvault-scope'

application_id = dbutils.secrets.get(secret_scope, 'ApplicationId')
service_credential = dbutils.secrets.get(secret_scope, 'ServiceCredential')
directory_id = dbutils.secrets.get(secret_scope, 'directoryid')

# Token endpoint for the directory identified by directory_id.
oauth_token_endpoint = f"https://login.microsoftonline.com/{directory_id}/oauth2/token"

configs = {
    "fs.azure.account.auth.type": "OAuth",
    "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    "fs.azure.account.oauth2.client.id": application_id,
    "fs.azure.account.oauth2.client.secret": service_credential,
    "fs.azure.account.oauth2.client.endpoint": oauth_token_endpoint,
}


In [0]:
# (Re)mount the "project040processed" container at a fixed DBFS path.
mount_point = "/mnt/project040datalake/project040processed"
container_uri = "abfss://project040processed@project040datalake.dfs.core.windows.net/"

# Drop any existing mount at this path first, so the mount call below does
# not collide with a mount that is already registered there.
mounted_paths = {info.mountPoint for info in dbutils.fs.mounts()}
if mount_point in mounted_paths:
    dbutils.fs.unmount(mount_point)
    print(f"Unmounted existing mount at {mount_point}")

# A <directory-name> may optionally be appended to the source URI to mount
# a sub-directory instead of the container root.
dbutils.fs.mount(
    source=container_uri,
    mount_point=mount_point,
    extra_configs=configs,
)
print(f"Mounted successfully at {mount_point}")

/mnt/project040datalake/project040processed has been unmounted.
Unmounted existing mount at /mnt/project040datalake/project040processed
Mounted successfully at /mnt/project040datalake/project040processed


In [0]:
# (Re)mount the "project040gold" container at a fixed DBFS path.
mount_point = "/mnt/project040datalake/project040gold"
container_uri = "abfss://project040gold@project040datalake.dfs.core.windows.net/"

# Drop any existing mount at this path first, so the mount call below does
# not collide with a mount that is already registered there.
mounted_paths = {info.mountPoint for info in dbutils.fs.mounts()}
if mount_point in mounted_paths:
    dbutils.fs.unmount(mount_point)
    print(f"Unmounted existing mount at {mount_point}")

# A <directory-name> may optionally be appended to the source URI to mount
# a sub-directory instead of the container root.
dbutils.fs.mount(
    source=container_uri,
    mount_point=mount_point,
    extra_configs=configs,
)
print(f"Mounted successfully at {mount_point}")

Mounted successfully at /mnt/project040datalake/project040gold


In [0]:
# List the mounts currently registered with dbutils.fs. As the last expression
# in the cell, the returned list of MountInfo entries is the cell output.
dbutils.fs.mounts()

[MountInfo(mountPoint='/databricks-datasets', source='databricks-datasets', encryptionType=''),
 MountInfo(mountPoint='/mnt/project040datalake/project040raw', source='abfss://project040raw@project040datalake.dfs.core.windows.net/', encryptionType=''),
 MountInfo(mountPoint='/Volumes', source='UnityCatalogVolumes', encryptionType=''),
 MountInfo(mountPoint='/databricks/mlflow-tracking', source='databricks/mlflow-tracking', encryptionType=''),
 MountInfo(mountPoint='/databricks-results', source='databricks-results', encryptionType=''),
 MountInfo(mountPoint='/mnt/project040datalake/project040gold', source='abfss://project040gold@project040datalake.dfs.core.windows.net/', encryptionType=''),
 MountInfo(mountPoint='/databricks/mlflow-registry', source='databricks/mlflow-registry', encryptionType=''),
 MountInfo(mountPoint='/Volume', source='DbfsReserved', encryptionType=''),
 MountInfo(mountPoint='/volumes', source='DbfsReserved', encryptionType=''),
 MountInfo(mountPoint='/mnt/project040d

In [0]:
# Load the accounts dataset (Parquet) from the silver folder of the mounted
# processed container, then preview the first five rows.
accounts_path = "/mnt/project040datalake/project040processed/silver/accounts_parquet_out/"
accounts_df = spark.read.parquet(accounts_path)
accounts_df.show(5)

+----------+-----------+------------+-------+
|account_id|customer_id|account_type|balance|
+----------+-----------+------------+-------+
|         1|         45|     Savings| 1000.5|
|         2|         12|    Checking|2500.75|
|         3|         78|     Savings| 1500.0|
|         4|         34|    Checking|3000.25|
|         5|         56|     Savings|  500.0|
+----------+-----------+------------+-------+
only showing top 5 rows



In [0]:
# Load the customers dataset (Parquet) from the silver folder of the mounted
# processed container, then preview the first five rows.
customers_path = "/mnt/project040datalake/project040processed/silver/customers_parquet_out/"
customers_df = spark.read.parquet(customers_path)
customers_df.show(5)

+-----------+----------+---------+--------------+---------+-----+------+
|customer_id|first_name|last_name|       address|     city|state|   zip|
+-----------+----------+---------+--------------+---------+-----+------+
|          1|      John|      Doe|    123 Elm St|  Toronto|   ON|M4B1B3|
|          2|      Jane|    Smith| 456 Maple Ave|   Ottawa|   ON|K1A0B1|
|          3|   Michael|  Johnson|    789 Oak Dr| Montreal|   QC|H1A1A1|
|          4|     Emily|    Davis|   101 Pine Rd|  Calgary|   AB|T2A0A1|
|          5|     David|   Wilson|202 Birch Blvd|Vancouver|   BC|V5K0A1|
+-----------+----------+---------+--------------+---------+-----+------+
only showing top 5 rows



In [0]:
# Spark's `sum` is imported under an alias so it does not replace Python's
# built-in sum() for the rest of the notebook session. `col` is not used in
# this cell but stays importable for any other cells that rely on it.
from pyspark.sql.functions import col, sum as spark_sum

# Step 1: attach customer attributes to each account. The inner join keeps
# only rows whose customer_id appears in both DataFrames.
joined_df = accounts_df.join(customers_df, on="customer_id", how="inner")

# Step 2: one row per customer with the sum of that customer's balances.
# first_name / last_name are part of the grouping key so they are carried
# through to the result alongside customer_id.
total_balance_df = (
    joined_df
    .groupBy("customer_id", "first_name", "last_name")
    .agg(spark_sum("balance").alias("total_balance"))
)

# Preview the aggregated result.
total_balance_df.show(5)


+-----------+----------+---------+-------------+
|customer_id|first_name|last_name|total_balance|
+-----------+----------+---------+-------------+
|         44|    Amelia|   Howard|       9300.0|
|         64|  Isabella| Gonzalez|       3200.5|
|         12|  Isabella|      Lee|      9000.75|
|         63| Alexander|   Foster|       425.75|
|         53|     James|  Jenkins|       300.25|
+-----------+----------+---------+-------------+
only showing top 5 rows



In [0]:
# Persist the per-customer totals to the Refined (Gold) container as Parquet,
# replacing whatever was previously written at this location.
gold_parquet_path = "/mnt/project040datalake/project040gold/total_balance/total_balance_parquet_out/"
(
    total_balance_df.write
    .format("parquet")
    .mode("overwrite")
    .save(gold_parquet_path)
)


In [0]:
# Persist the per-customer totals to the Refined (Gold) container in Delta
# format, replacing whatever was previously written at this location.
gold_delta_path = "/mnt/project040datalake/project040gold/total_balance/delta_out/"
delta_writer = total_balance_df.write.mode("overwrite").format("delta")
delta_writer.save(gold_delta_path)
