In [0]:
# Source table for every cell below: the bakehouse sample sales transactions.
SALES_TABLE = "samples.bakehouse.sales_transactions"
df = spark.read.table(SALES_TABLE)

# Preview: head(10) pulls the first 10 rows back to the driver for display.
preview_rows = df.head(10)
display(preview_rows)

transactionID,customerID,franchiseID,dateTime,product,quantity,unitPrice,totalPrice,paymentMethod,cardNumber
1002961,2000253,3000047,2024-05-14T12:17:01.495Z,Golden Gate Ginger,8,3,24,amex,378154478982993
1003007,2000226,3000047,2024-05-10T23:10:10.239Z,Austin Almond Biscotti,36,3,108,mastercard,2244626981238094
1003017,2000108,3000047,2024-05-16T16:34:10.613Z,Austin Almond Biscotti,40,3,120,mastercard,2490570234487424
1003068,2000173,3000047,2024-05-02T04:31:51.612Z,Pearly Pies,28,3,84,amex,343808569426192
1003103,2000075,3000047,2024-05-04T23:44:26.902Z,Pearly Pies,28,3,84,visa,4377080942201798
1003147,2000295,3000047,2024-05-15T16:17:06.259Z,Austin Almond Biscotti,32,3,96,amex,371093774812677
1003196,2000237,3000047,2024-05-07T11:13:22.469Z,Tokyo Tidbits,40,3,120,mastercard,5538807345848392
1003329,2000272,3000047,2024-05-06T03:32:16.017Z,Outback Oatmeal,28,3,84,visa,4872480716880043
1001264,2000209,3000047,2024-05-16T17:32:28.547Z,Pearly Pies,28,3,84,mastercard,5287105980593305
1001287,2000120,3000047,2024-05-15T08:41:28.406Z,Austin Almond Biscotti,40,3,120,amex,376211012259783


In [0]:
import pyspark.sql.functions as F

# One output row per franchise with three summary columns:
#   total_products          - sum of `quantity`
#   avg_price_by_franchise  - mean of `totalPrice`
#   products_per_franchise  - distinct `product` values collected into an array
franchise_metrics = [
    F.sum("quantity").alias("total_products"),
    F.mean("totalPrice").alias("avg_price_by_franchise"),
    F.collect_set("product").alias("products_per_franchise"),
]
df_franchise = df.groupBy("franchiseID").agg(*franchise_metrics)

## Using explain()

In [0]:
# Default ("simple") mode: prints only the physical plan as a single operator tree.
df_franchise.explain(mode="simple")

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- == Initial Plan ==
   ColumnarToRow
   +- PhotonResultStage
      +- PhotonGroupingAgg(keys=[franchiseID#13567L], functions=[finalmerge_sum(merge sum#13581L) AS sum(quantity)#13575L, finalmerge_avg(merge sum#13584, count#13585L) AS avg(totalPrice)#13576, collect_list(distinct product#13569, 0, 0)])
         +- PhotonGroupingAgg(keys=[franchiseID#13567L, product#13569], functions=[merge_sum(merge sum#13581L) AS sum#13581L, merge_avg(merge sum#13584, count#13585L) AS (sum#13584, count#13585L)])
            +- PhotonGroupingAgg(keys=[franchiseID#13567L, product#13569], functions=[partial_sum(quantity#13570L) AS sum#13581L, partial_avg(totalPrice#13572L) AS (sum#13584, count#13585L)])
               +- PhotonScan parquet samples.bakehouse.sales_transactions[franchiseID#13567L,product#13569,quantity#13570L,totalPrice#13572L] DataFilters: [], DictionaryFilters: [], Format: parquet, Location: PreparedDeltaFileIndex(1 paths)[s3://syste

In [0]:
# "formatted" mode: a numbered operator outline followed by a detail section per node.
df_franchise.explain(mode="formatted")

== Physical Plan ==
AdaptiveSparkPlan (7)
+- == Initial Plan ==
   ColumnarToRow (6)
   +- PhotonResultStage (5)
      +- PhotonGroupingAgg (4)
         +- PhotonGroupingAgg (3)
            +- PhotonGroupingAgg (2)
               +- PhotonScan parquet samples.bakehouse.sales_transactions (1)


(1) PhotonScan parquet samples.bakehouse.sales_transactions
Output [4]: [franchiseID#13617L, product#13619, quantity#13620L, totalPrice#13622L]
Location: PreparedDeltaFileIndex [s3://system-tables-prod-us-east-2-uc-metastore-bucket/metastore/b622f329-0f39-444f-9dbb-5733ba3e2021/tables/e922f6b4-1ca7-495e-b661-a5e73dc5f5d5]
ReadSchema: struct<franchiseID:bigint,product:string,quantity:bigint,totalPrice:bigint>

(2) PhotonGroupingAgg
Input [4]: [franchiseID#13617L, product#13619, quantity#13620L, totalPrice#13622L]
Arguments: [franchiseID#13617L, product#13619], [partial_sum(quantity#13620L) AS sum#13631L, partial_avg(totalPrice#13622L) AS (sum#13634, count#13635L)], [sum(quantity)#13631L, avg(total

## Heavy Query

In [0]:
# Keep only the rows of a single franchise, then aggregate them.
TARGET_FRANCHISE_ID = 3000047
df_filtered = df.where(F.col("franchiseID") == TARGET_FRANCHISE_ID)

franchise_totals = df_filtered.groupBy("franchiseID").agg(
    F.sum("quantity"),
    F.mean("totalPrice"),
)

# extended=True prints the parsed, analyzed, and optimized logical plans
# in addition to the physical plan.
franchise_totals.explain(extended=True)

== Parsed Logical Plan ==
'Aggregate ['franchiseID], ['franchiseID, unresolvedalias('sum('quantity)), unresolvedalias('avg('totalPrice))]
+- 'Filter '`==`('franchiseID, 3000047)
   +- 'UnresolvedRelation [samples, bakehouse, sales_transactions], [], false

== Analyzed Logical Plan ==
franchiseID: bigint, sum(quantity): bigint, avg(totalPrice): double
Aggregate [franchiseID#13667L], [franchiseID#13667L, sum(quantity#13670L) AS sum(quantity)#13678L, avg(totalPrice#13672L) AS avg(totalPrice)#13679]
+- Filter (franchiseID#13667L = cast(3000047 as bigint))
   +- SubqueryAlias samples.bakehouse.sales_transactions
      +- Relation samples.bakehouse.sales_transactions[transactionID#13665L,customerID#13666L,franchiseID#13667L,dateTime#13668,product#13669,quantity#13670L,unitPrice#13671L,totalPrice#13672L,paymentMethod#13673,cardNumber#13674L] parquet

== Optimized Logical Plan ==
Aggregate [franchiseID#13667L], [franchiseID#13667L, sum(quantity#13670L) AS sum(quantity)#13678L, avg(totalPrice#1