### Setup

Make sure the files from the previous demos (`transactions_data.parquet` and `customers_data.parquet`) are available in the storage container.

In [0]:
# Configure Spark to authenticate to Azure Data Lake Storage Gen2 with a
# service principal (OAuth 2.0 client-credentials flow).
# Replace the <...> placeholders and the masked values with your own settings.
storage_account = "<account_name>"  # presumably the account used in the abfss:// paths below -- confirm
account_host = f"{storage_account}.dfs.core.windows.net"

# Never paste the client secret into the notebook: read it from a Databricks
# secret scope so it is not stored in the source and is redacted in outputs.
client_secret = dbutils.secrets.get(scope="<secret_scope>", key="<client_secret_key>")

spark.conf.set(f"fs.azure.account.auth.type.{account_host}", "OAuth")
spark.conf.set(
    f"fs.azure.account.oauth.provider.type.{account_host}",
    "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
)
spark.conf.set(f"fs.azure.account.oauth2.client.id.{account_host}", "****************************")
spark.conf.set(f"fs.azure.account.oauth2.client.secret.{account_host}", client_secret)
spark.conf.set(
    f"fs.azure.account.oauth2.client.endpoint.{account_host}",
    "https://login.microsoftonline.com/************************/oauth2/token",
)

Verify that cloud storage is accessible

In [0]:
# Smoke test: list the root of the container to check that storage is reachable.
container_root = "abfss://etl1@dbstoragebbpbs73u57xmm.dfs.core.windows.net/"
root_listing = dbutils.fs.ls(container_root)
display(root_listing)

Let's load our datasets

In [0]:
from pyspark.sql import functions as F

# Transactions: Parquet location and the DataFrame read from it
transactions_path = "abfss://etl1@dbstoragebbpbs73u57xmm.dfs.core.windows.net/transactions_data.parquet"
df_transactions = spark.read.parquet(transactions_path)

# Customers: Parquet location and the DataFrame read from it
customers_path = "abfss://etl1@dbstoragebbpbs73u57xmm.dfs.core.windows.net/customers_data.parquet"
df_customers = spark.read.parquet(customers_path)

# Preview the first five rows of each DataFrame
display(df_transactions.limit(5))
display(df_customers.limit(5))



Let's join the tables


In [0]:
# Inner-join transactions to customers on the shared key.
# Joining on the column *name* (instead of an equality expression between the
# two DataFrames) keeps a single `customer_id` column in the result, so later
# references to that column are not ambiguous.
df_join = df_transactions.join(df_customers, on="customer_id", how="inner")

# count() is an action: it runs the join and returns the number of joined rows.
df_join.count()

In [0]:

# Total transaction amount per country.
# The alias is `total_amount`, matching the aggregation that is re-run after
# caching, so both results expose the same column name.
df_join_sum = (
    df_join
    .groupBy("country")
    .agg(F.sum("amount").alias("total_amount"))
)

# Render at most 500 result rows, then print the formatted query plan.
df_join_sum.limit(500).display()
df_join_sum.explain(mode="formatted")

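If we are going to keep working with the joined data, we can cache it to avoid re-reading the source files and recomputing the join each time. Note that `cache()` is lazy: the data is only cached once an action such as `count()` runs.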

In [0]:
# cache() returns the same DataFrame, so the count() action can be chained
# directly onto it; running the action is what executes the join.
df_join.cache().count()


And run the aggregation again

In [0]:
# Same per-country aggregation, run again after df_join was cached.
amount_total = F.sum("amount").alias("total_amount")
df_join_sum = df_join.groupBy("country").agg(amount_total)

# Show the aggregated rows
display(df_join_sum)

# Print the formatted query plan for the aggregation
df_join_sum.explain(mode="formatted")