### Setup

Make sure you have the files available from previous demos.

In [0]:
# This cell sets all the configuration parameters to connect to Azure Data Lake
# Storage using OAuth 2.0 client credentials.
# Replace every <...> placeholder and the masked client/tenant IDs before running;
# <account_name> must be the storage account used in the abfss:// URIs of this notebook.
account_host = "<account_name>.dfs.core.windows.net"

# Fetch the client secret from a Databricks secret scope instead of pasting it
# into the notebook, where it would be saved and shared along with the source.
client_secret = dbutils.secrets.get(scope="<secret_scope>", key="<client_secret_key>")

spark.conf.set(f"fs.azure.account.auth.type.{account_host}", "OAuth")
spark.conf.set(f"fs.azure.account.oauth.provider.type.{account_host}", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider")
spark.conf.set(f"fs.azure.account.oauth2.client.id.{account_host}", "****************************")
spark.conf.set(f"fs.azure.account.oauth2.client.secret.{account_host}", client_secret)
spark.conf.set(f"fs.azure.account.oauth2.client.endpoint.{account_host}", "https://login.microsoftonline.com/************************/oauth2/token")

Verify that cloud storage is accessible

In [0]:
# List the root of the etl1 container; a successful listing confirms access
container_root = "abfss://etl1@dbstoragebbpbs73u57xmm.dfs.core.windows.net/"
display(dbutils.fs.ls(container_root))

Let's load our datasets

In [0]:
from pyspark.sql import functions as F

# Locations of the two Parquet datasets in the etl1 container
transactions_path = "abfss://etl1@dbstoragebbpbs73u57xmm.dfs.core.windows.net/transactions_data.parquet"
customers_path = "abfss://etl1@dbstoragebbpbs73u57xmm.dfs.core.windows.net/customers_data.parquet"

# Read both datasets into DataFrames
df_transactions = spark.read.parquet(transactions_path)
df_customers = spark.read.parquet(customers_path)

# Show the first five rows of each DataFrame, transactions first
for preview_df in (df_transactions, df_customers):
    display(preview_df.limit(5))



We can create temporary views on top of the DataFrames so that they can be queried with SQL


In [0]:

# Register each DataFrame as a temporary view so it can be referenced by name in SQL
temp_views = {
    "transactions": df_transactions,
    "customers": df_customers,
}
for view_name, source_df in temp_views.items():
    source_df.createOrReplaceTempView(view_name)

We can then run SQL commands in different ways, for example from Python with `spark.sql()`

In [0]:
# Join each customer to their transactions on customer_id, keeping only the
# columns needed downstream
customer_transactions_sql = """
    SELECT 
        c.customer_id,
        c.email,
        t.transaction_id,
        t.amount,
        t.transaction_date
    FROM customers c
    JOIN transactions t 
      ON c.customer_id = t.customer_id
"""
denormalized_df = spark.sql(customer_transactions_sql)

# Preview the first five joined rows
display(denormalized_df.limit(5))

You can also take advantage of notebook magic commands

In [0]:
%sql
-- Count transactions per calendar day and return the first ten days in ascending order
SELECT
  DATE(transaction_date) AS transaction_day,
  COUNT(*) AS transaction_count
FROM transactions
GROUP BY DATE(transaction_date)
ORDER BY transaction_day
LIMIT 10

You can also use the Spark metastore (or the Unity Catalog) to create permanent databases, schemas and tables

In [0]:
%sql
-- Create the sales.marketing schema (database).
-- IF NOT EXISTS keeps this cell re-runnable: without it, running the notebook
-- a second time fails because the schema already exists.
CREATE DATABASE IF NOT EXISTS sales.marketing

In [0]:
%sql
-- Make sales.marketing the current schema so that unqualified table and view
-- names in the statements that follow resolve there
USE sales.marketing;

And we can store the dataframe as a permanent table

In [0]:
# Persist the joined DataFrame as a table in the current schema,
# replacing the table's contents if it already exists
(
    denormalized_df.write
    .mode("overwrite")
    .saveAsTable("denormalized_customer_transactions")
)

We can even create permanent views on top of that table

In [0]:
# Define (or redefine) a permanent view with one row per customer: transaction
# count, average amount, and first/last transaction dates, computed from the
# denormalized_customer_transactions table
summary_view_ddl = """
    CREATE OR REPLACE VIEW customer_transaction_summary AS
    SELECT
        customer_id,
        email,
        COUNT(transaction_id) AS total_transactions,
        AVG(amount) AS avg_transaction_amount,
        MIN(transaction_date) AS first_transaction_date,
        MAX(transaction_date) AS last_transaction_date
    FROM denormalized_customer_transactions
    GROUP BY customer_id, email
"""
spark.sql(summary_view_ddl)


In [0]:
%sql
-- Select the schema that holds the summary view, then show the view's column metadata
USE sales.marketing;

-- To preview rows instead of the column metadata, run:
-- SELECT * FROM customer_transaction_summary
-- LIMIT 5;

DESCRIBE TABLE customer_transaction_summary