### Setup

Make sure you have the files available from previous demos.

In [0]:
# Configure OAuth (service principal) access to Azure Data Lake Storage Gen2.
# Replace <account_name>, <secret_scope>, <client_secret_key> and the masked
# values with the ones for your environment.

# Read the service principal's client secret from a Databricks secret scope
# instead of pasting it into the notebook, so it is never saved with the source.
client_secret = dbutils.secrets.get(scope="<secret_scope>", key="<client_secret_key>")

spark.conf.set("fs.azure.account.auth.type.<account_name>.dfs.core.windows.net", "OAuth")
spark.conf.set("fs.azure.account.oauth.provider.type.<account_name>.dfs.core.windows.net", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider")
spark.conf.set("fs.azure.account.oauth2.client.id.<account_name>.dfs.core.windows.net", "****************************")
spark.conf.set("fs.azure.account.oauth2.client.secret.<account_name>.dfs.core.windows.net", client_secret)
spark.conf.set("fs.azure.account.oauth2.client.endpoint.<account_name>.dfs.core.windows.net", "https://login.microsoftonline.com/************************/oauth2/token")

Verify that cloud storage is accessible

In [0]:
# List the root of the container; a successful listing confirms the
# storage configuration is working.
container_root = "abfss://etl1@dbstoragebbpbs73u57xmm.dfs.core.windows.net/"
root_listing = dbutils.fs.ls(container_root)
display(root_listing)

Let's load our datasets

In [0]:
from pyspark.sql import functions as F

# Locations of the two Parquet datasets in the Data Lake container
transactions_path = "abfss://etl1@dbstoragebbpbs73u57xmm.dfs.core.windows.net/transactions_data.parquet"
customers_path = "abfss://etl1@dbstoragebbpbs73u57xmm.dfs.core.windows.net/customers_data.parquet"

# Read each Parquet dataset into a Spark DataFrame
df_transactions = spark.read.format("parquet").load(transactions_path)
df_customers = spark.read.format("parquet").load(customers_path)

# Preview the first five rows of each DataFrame
for preview_source in (df_transactions, df_customers):
    display(preview_source.limit(5))



Let's join the dataframes and calculate some aggregations

In [0]:
# Join customers to their transactions on customer_id. The inner join keeps
# only customers that have at least one matching transaction.
df_joined = df_customers.join(df_transactions, on="customer_id", how="inner")

# Aggregate the joined data per customer:
#   - Count the number of transactions
#   - Calculate the average transaction amount
#   - Determine the first and last transaction dates
df_agg = df_joined.groupBy("customer_id", "email").agg(
    F.count("transaction_id").alias("total_transactions"),
    F.avg("amount").alias("avg_transaction_amount"),
    F.min("transaction_date").alias("first_transaction_date"),
    F.max("transaction_date").alias("last_transaction_date")
)

# Keep the 10 customers with the most transactions. The grouping keys are
# added as secondary sort keys so that ties on total_transactions are broken
# the same way every time this DataFrame is evaluated (displaying it,
# converting it to pandas and exporting it each trigger a separate evaluation).
df_top_customers = df_agg.orderBy(
    F.desc("total_transactions"),
    F.asc("customer_id"),
    F.asc("email"),
).limit(10)

# Already capped at 10 rows, so no further limit is needed before displaying
df_top_customers.display()

To use popular Python visualization libraries, we convert the Spark DataFrame to a pandas DataFrame

In [0]:
# Convert the Spark DataFrame to a pandas DataFrame.
# toPandas() collects all rows onto the driver; df_top_customers was capped
# at 10 rows when it was built, so the resulting frame is small.
df_pandas = df_top_customers.toPandas()


Now let's create a visual with matplotlib

In [0]:
import matplotlib.pyplot as plt

# Bar chart of the transaction count for each of the top customers
plt.figure(figsize=(10, 6))
plt.bar(df_pandas['email'], df_pandas['total_transactions'])
# The x-axis plots the email column, so label it accordingly
plt.xlabel('Customer Email')
plt.ylabel('Total Transactions')
plt.title('Total Transactions per Customer')
# Right-align the rotated tick labels so each label ends under its own bar
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()


We can try with Plotly Express

In [0]:
import plotly.express as px

# Interactive Plotly Express scatter: one point per customer, labelled with
# the customer's email and with the label placed above the marker.
fig = px.scatter(
    data_frame=df_pandas,
    x="total_transactions",
    y="avg_transaction_amount",
    text="email",
    title="Avg Transaction Amount vs Total Transactions",
).update_traces(textposition="top center")

fig.show()


Let's do another one with seaborn

In [0]:
import seaborn as sns
import matplotlib.pyplot as plt

# Cast both plotted columns to float so the regression fit receives plain
# numeric values. The cast is written back onto df_pandas itself.
for numeric_col in ("total_transactions", "avg_transaction_amount"):
    df_pandas[numeric_col] = df_pandas[numeric_col].astype(float)

# Scatter plot with a fitted regression line drawn in red
reg_fig, reg_ax = plt.subplots(figsize=(10, 6))
sns.regplot(
    data=df_pandas,
    x="total_transactions",
    y="avg_transaction_amount",
    scatter_kws={"s": 100, "alpha": 0.7},
    line_kws={"color": "red"},
    ax=reg_ax,
)
reg_ax.set_xlabel("Total Transactions")
reg_ax.set_ylabel("Average Transaction Amount")
reg_ax.set_title("Relationship Between Transaction Volume and Average Amount")
reg_fig.tight_layout()
plt.show()

And you can of course export any of the dataframes back out to cloud storage

In [0]:
# Write the top-customers DataFrame to cloud storage as a Delta table,
# replacing any data already at the target path.
# NOTE(review): this target uses a different storage account and container
# than the datasets read earlier in the notebook - confirm that access to it
# is configured, or point the export at the intended location.
export_path = "abfss://pyspark@warnerdatalake.dfs.core.windows.net/exports/top_customers"
(
    df_top_customers.write
    .format("delta")
    .mode("overwrite")
    .save(export_path)
)