### Setup

Make sure you have the files available from previous demos.

In [0]:
# Set the Spark configuration entries used to reach Azure Data Lake through
# the ABFS driver with OAuth client-credentials authentication.
# NOTE(review): the client id, client secret and tenant segment are masked
# literals here; presumably the real values should be read from a secret
# store rather than typed into the notebook - confirm.
adls_oauth_settings = [
    ("fs.azure.account.auth.type.<account_name>.dfs.core.windows.net",
     "OAuth"),
    ("fs.azure.account.oauth.provider.type.<account_name>.dfs.core.windows.net",
     "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider"),
    ("fs.azure.account.oauth2.client.id.<account_name>.dfs.core.windows.net",
     "****************************"),
    ("fs.azure.account.oauth2.client.secret.<account_name>.dfs.core.windows.net",
     "*******************************"),
    ("fs.azure.account.oauth2.client.endpoint.<account_name>.dfs.core.windows.net",
     "https://login.microsoftonline.com/************************/oauth2/token"),
]

# Apply every entry to the active Spark session, in the order listed above.
for setting_name, setting_value in adls_oauth_settings:
    spark.conf.set(setting_name, setting_value)

Verify that cloud storage is accessible

In [0]:
# List the root of the storage container; the listing is the cell's output,
# so a successful result shows the storage is reachable.
container_root = "abfss://etl1@dbstoragebbpbs73u57xmm.dfs.core.windows.net/"
dbutils.fs.ls(container_root)

Let's load the transactions dataframe

In [0]:
# Where the transactions parquet data lives in the data lake
parquet_path = "abfss://etl1@dbstoragebbpbs73u57xmm.dfs.core.windows.net/transactions_data.parquet"

# Read the parquet data into a DataFrame
df_transactions = spark.read.parquet(parquet_path)

# Show the schema, then a five-row preview, then the total row count
df_transactions.printSchema()
display(df_transactions.limit(5))
transaction_count = df_transactions.count()
print(transaction_count)

Let's do some grouping and aggregation

In [0]:
from pyspark.sql import functions as F

# Total amount per category, largest total first.
total_amount = F.sum("amount").alias("total_amount")

df_category_total = (
    df_transactions
        .groupBy("category")
        .agg(total_amount)
        .orderBy(F.col("total_amount").desc())
)

df_category_total.display()


In [0]:
# Average, maximum and minimum amount per category, highest average first.
amount_stats = [
    F.avg("amount").alias("avg_amount"),
    F.max("amount").alias("max_amount"),
    F.min("amount").alias("min_amount"),
]

df_category_stats = (
    df_transactions
        .groupBy("category")
        .agg(*amount_stats)
        .orderBy(F.col("avg_amount").desc())
)

display(df_category_stats)


Let's do some Exploratory Data Analysis (EDA)

In [0]:
# Summary statistics for the single `amount` column.
amount_only = df_transactions.select("amount")
display(amount_only.summary())


In [0]:
# Distribution of transactions across categories, most frequent first.
#
# Rows are counted with count("*") instead of count("category"): counting the
# column itself skips NULL values, so a group whose category is NULL would be
# reported with a count of 0 instead of its real number of rows.
df_category_distribution = (
    df_transactions
        .groupBy("category")
        .agg(F.count("*").alias("count_of_categories"))
        .orderBy(F.col("count_of_categories").desc())
)

# The DataFrame used to be bound to a misspelled name; keep that name as an
# alias so any cell still referring to it continues to work.
df_category_distributiion = df_category_distribution

df_category_distribution.display()


In [0]:
# Several aggregations in one result: per-category total, average and
# transaction count, ranked by total amount and then by average amount.
category_metrics = [
    F.sum("amount").alias("total_amount"),
    F.avg("amount").alias("avg_amount"),
    F.count("*").alias("transaction_count"),
]
category_ranking = [F.desc("total_amount"), F.desc("avg_amount")]

df_top_categories = (
    df_transactions
        .groupBy("category")
        .agg(*category_metrics)
        .orderBy(*category_ranking)
)

# Show only the five top-ranked categories
display(df_top_categories.limit(5))

# Read the customers parquet data from the same container
customers_parquet_path = "abfss://etl1@dbstoragebbpbs73u57xmm.dfs.core.windows.net/customers_data.parquet"
df_customers = spark.read.parquet(customers_parquet_path)


In [0]:

# Preview the first 50 rows of the customers data, then of the transactions data
display(df_customers.limit(50))
display(df_transactions.limit(50))


# # Calculate the average transaction amount by age group

# # Load the customers data
# customers_path = "abfss://pyspark@warnerdatalake.dfs.core.windows.net//imports//customers_data.parquet"
# df_customers = spark.read.parquet(customers_path)

# # Join transactions with customers
# df_joined = df_transactions.join(df_customers, on="customer_id", how="inner")

# # Add age group column and calculate average spending by age group
# df_age_group_spending = (
#     df_joined
#         .withColumn(
#             "age_group", 
#             F.when(F.col("age") < 30, "Under 30")
#              .when((F.col("age") >= 30) & (F.col("age") < 50), "30-49")
#              .otherwise("50 and Above")
#         )
#         .groupBy("age_group")
#         .agg(F.avg("amount").alias("avg_spending"))
#         .orderBy(F.col("avg_spending").desc())
# )

# df_age_group_spending.limit(5).display()


# Join Examples

In [0]:
from pyspark.sql import functions as F
from pyspark.sql import Row

# Small in-memory DataFrames used by the join examples.

# df: name + age
df = spark.createDataFrame([
    Row(name='Alice', age=2),
    Row(name='Bob', age=5),
])

# df2: name + height
df2 = spark.createDataFrame([
    Row(name='Tom', height=80),
    Row(name='Bob', height=85),
])

# df3: name + age + height, including rows with missing values.
# The third row's name is 'Tom' (previously misspelled 'Tome') so it is
# consistent with the 'Tom' row in df2.
df3 = spark.createDataFrame([
    Row(name='Alice', age=10, height=80),
    Row(name='Bob', age=5, height=None),
    Row(name='Tom', age=None, height=None),
    Row(name=None, age=None, height=None),
])

# Show each frame in turn
for example_frame in (df, df2, df3):
    example_frame.display()
                    

In [0]:
# Join df with df2 where the names are equal; no join type is passed, so
# Spark's default join type applies.
names_match = df.name == df2.name
display(df.join(df2, on=names_match))

In [0]:
# Inner join of df and df3 that requires both name and age to be equal,
# keeping name and age from df and height from df3.
name_and_age_match = (df.name == df3.name) & (df.age == df3.age)
df_name_age_matched = df.join(df3, on=name_and_age_match, how='inner')
display(df_name_age_matched.select(df.name, df.age, df3.height))

In [0]:
# Outer join of df (aliased 'a') and df2 (aliased 'b') on equal names,
# sorted by df's name column in descending order.
df_a, df2_b = df.alias('a'), df2.alias('b')
alias_names_match = F.col('a.name') == F.col('b.name')
by_name_desc = F.desc(df.name)
display(df_a.join(df2_b, on=alias_names_match, how='outer').orderBy(by_name_desc))

In [0]:
# Left outer join of df (aliased 'a') and df2 (aliased 'b') on equal names,
# sorted by df's name column in descending order.
df_a, df2_b = df.alias('a'), df2.alias('b')
alias_names_match = F.col('a.name') == F.col('b.name')
by_name_desc = F.desc(df.name)
display(df_a.join(df2_b, on=alias_names_match, how='left_outer').orderBy(by_name_desc))

In [0]:
# Right outer join of df (aliased 'a') and df2 (aliased 'b') on equal names,
# sorted by df's name column in descending order.
df_a, df2_b = df.alias('a'), df2.alias('b')
alias_names_match = F.col('a.name') == F.col('b.name')
by_name_desc = F.desc(df.name)
display(df_a.join(df2_b, on=alias_names_match, how='right_outer').orderBy(by_name_desc))