### Setup

Make sure you have the files available from previous demos.

In [0]:
# Configure OAuth (service principal) access to Azure Data Lake Storage Gen2.
# Credentials are read from a Databricks secret scope instead of being pasted
# into the notebook, so they never end up in source control or exported output.
STORAGE_ACCOUNT = "warnerdatalake"  # same account as the abfss:// paths used in this notebook
SECRET_SCOPE = "adls-oauth"  # NOTE(review): placeholder scope/key names - adjust to your workspace

client_id = dbutils.secrets.get(scope=SECRET_SCOPE, key="client-id")
client_secret = dbutils.secrets.get(scope=SECRET_SCOPE, key="client-secret")
tenant_id = dbutils.secrets.get(scope=SECRET_SCOPE, key="tenant-id")

# Every setting is scoped to this one storage account via the key suffix.
account_host = f"{STORAGE_ACCOUNT}.dfs.core.windows.net"
spark.conf.set(f"fs.azure.account.auth.type.{account_host}", "OAuth")
spark.conf.set(f"fs.azure.account.oauth.provider.type.{account_host}", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider")
spark.conf.set(f"fs.azure.account.oauth2.client.id.{account_host}", client_id)
spark.conf.set(f"fs.azure.account.oauth2.client.secret.{account_host}", client_secret)
spark.conf.set(f"fs.azure.account.oauth2.client.endpoint.{account_host}", f"https://login.microsoftonline.com/{tenant_id}/oauth2/token")

Verify that cloud storage is accessible

In [0]:
# List the container root; a successful listing shows the storage is reachable.
container_root = "abfss://pyspark@warnerdatalake.dfs.core.windows.net/"
dbutils.fs.ls(container_root)

[FileInfo(path='abfss://pyspark@warnerdatalake.dfs.core.windows.net/exports/', name='exports/', size=0, modificationTime=1740581924000),
 FileInfo(path='abfss://pyspark@warnerdatalake.dfs.core.windows.net/imports/', name='imports/', size=0, modificationTime=1740581918000)]

Let's load our datasets (transactions and customers).

In [0]:
from pyspark.sql import functions as F

# Folder that holds the source files. Building both paths from a single base
# keeps them in sync and removes the doubled "//" separators.
imports_root = "abfss://pyspark@warnerdatalake.dfs.core.windows.net/imports"

# Paths to datasets
transactions_path = f"{imports_root}/transactions_data.parquet"
customers_path = f"{imports_root}/customers_data.parquet"

# Load DataFrames
df_transactions = spark.read.parquet(transactions_path)
df_customers = spark.read.parquet(customers_path)

# Display sample data (limit keeps the preview to five rows per DataFrame)
df_transactions.limit(5).display()
df_customers.limit(5).display()



transaction_id,customer_id,transaction_date,amount,category
1,3065,2025-03-17,76.1,Clothes
2,3274,2025-02-18,91.91,Clothes
3,130,2025-01-10,11.81,Accessories
4,320,2025-03-06,20.37,Furniture
5,6480,2025-03-22,12.31,Beauty


customer_id,first_name,last_name,email,age,country
1,First_1,Last_1,First_1.Last_1@example.com,40,Canada
2,First_2,Last_2,First_2.Last_2@example.com,55,USA
3,First_3,Last_3,First_3.Last_3@example.com,59,USA
4,First_4,Last_4,First_4.Last_4@example.com,49,Canada
5,First_5,Last_5,First_5.Last_5@example.com,58,Canada


We can create temporary views on top of the DataFrames so that they can be queried with SQL.


In [0]:
# Register each DataFrame under a temporary view name so it can be queried with SQL
for view_name, frame in (("customers", df_customers), ("transactions", df_transactions)):
    frame.createOrReplaceTempView(view_name)

We can then run SQL commands in different ways, for example through `spark.sql()`:

In [0]:
# Inner join of customers to transactions on customer_id, keeping the customer's
# email alongside each transaction's id, amount and date.
customer_transactions_sql = """
    SELECT 
        c.customer_id,
        c.email,
        t.transaction_id,
        t.amount,
        t.transaction_date
    FROM customers c
    JOIN transactions t 
      ON c.customer_id = t.customer_id
"""
denormalized_df = spark.sql(customer_transactions_sql)

# Preview five joined rows
preview_df = denormalized_df.limit(5)
preview_df.display()

customer_id,email,transaction_id,amount,transaction_date
3065,First_3065.Last_3065@example.com,1,76.1,2025-03-17
3274,First_3274.Last_3274@example.com,2,91.91,2025-02-18
130,First_130.Last_130@example.com,3,11.81,2025-01-10
320,First_320.Last_320@example.com,4,20.37,2025-03-06
6480,First_6480.Last_6480@example.com,5,12.31,2025-03-22


You can also take advantage of notebook magic commands such as `%sql`:

In [0]:
%sql
-- Number of transactions per calendar day, showing the 10 earliest days
SELECT
  DATE(transaction_date) AS transaction_day,
  COUNT(*) AS transaction_count
FROM transactions
GROUP BY DATE(transaction_date)
ORDER BY transaction_day
LIMIT 10

transaction_day,transaction_count
2025-01-01,11085
2025-01-02,10973
2025-01-03,11124
2025-01-04,11085
2025-01-05,11066
2025-01-06,11240
2025-01-07,11061
2025-01-08,11095
2025-01-09,11072
2025-01-10,11071


You can also use the Spark metastore (or the Unity Catalog) to create permanent databases, schemas and tables

In [0]:
%sql
-- IF NOT EXISTS keeps this cell re-runnable: a plain CREATE DATABASE fails once the database already exists
CREATE DATABASE IF NOT EXISTS marketing

In [0]:
%sql
-- Make marketing the current database so unqualified table and view names resolve inside it
USE marketing

And we can store the dataframe as a permanent table

In [0]:
# Persist the joined data as a permanent table. The name is qualified with the
# database so the overwrite always targets marketing, even if the session's
# current database is something else (e.g. the USE cell was skipped).
denormalized_df.write.mode("overwrite").saveAsTable("marketing.denormalized_customer_transactions")

We can even create permanent views on top of that table.

In [0]:
# Create (or replace) a permanent view with one summary row per customer_id/email
# pair: transaction count, average amount, and first/last transaction dates.
# Both the view and its source table are qualified with the marketing database
# so the statement does not depend on the session's current database.
spark.sql("""
    CREATE OR REPLACE VIEW marketing.customer_transaction_summary AS
    SELECT
        customer_id,
        email,
        COUNT(transaction_id) AS total_transactions,
        AVG(amount) AS avg_transaction_amount,
        MIN(transaction_date) AS first_transaction_date,
        MAX(transaction_date) AS last_transaction_date
    FROM marketing.denormalized_customer_transactions
    GROUP BY customer_id, email
""")


DataFrame[]

In [0]:
%sql
-- Peek at five rows of the per-customer summary view
SELECT *
FROM customer_transaction_summary
LIMIT 5;

customer_id,email,total_transactions,avg_transaction_amount,first_transaction_date,last_transaction_date
5855,First_5855.Last_5855@example.com,111,45.86027,2025-01-01,2025-03-30
6035,First_6035.Last_6035@example.com,112,50.152768,2025-01-01,2025-03-31
1968,First_1968.Last_1968@example.com,86,47.356047,2025-01-05,2025-03-31
8206,First_8206.Last_8206@example.com,106,53.628868,2025-01-01,2025-03-31
6963,First_6963.Last_6963@example.com,100,48.6231,2025-01-01,2025-03-30
