# Bucketing

In [22]:
import warnings

# Silence every Python warning for the rest of the session to keep cell output clean.
warnings.filterwarnings(action="ignore")

In [23]:
from IPython.core.interactiveshell import InteractiveShell
# 'all' makes IPython display the result of every top-level expression in a
# cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'

In [24]:
from pyspark.storagelevel import StorageLevel
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

In [25]:
# Start (or reuse) a local Spark session on all available cores with
# spark.driver.memory set to 10g.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.driver.memory", "10g")
    .getOrCreate()
)
sc = spark.sparkContext

# A threshold of -1 turns off automatic broadcast joins for this session.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)

# Only surface ERROR-level Spark log messages.
sc.setLogLevel("ERROR")

In [26]:
# Orders CSV: the first row is the header; column types are inferred by Spark.
orders_file = "../data/bucketing/orders.csv"
df_orders = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(orders_file)
)

In [27]:
# Preview the first 5 orders without truncating cell values, then the inferred schema.
df_orders.show(n=5, truncate=False)
df_orders.printSchema()

+--------+----------+-----------+--------+----------+------------+
|order_id|product_id|customer_id|quantity|order_date|total_amount|
+--------+----------+-----------+--------+----------+------------+
|1       |80        |10         |4       |2023-3-20 |1003        |
|2       |69        |30         |3       |2023-12-11|780         |
|3       |61        |20         |4       |2023-4-26 |1218        |
|4       |62        |44         |3       |2023-8-26 |2022        |
|5       |78        |46         |4       |2023-8-5  |1291        |
+--------+----------+-----------+--------+----------+------------+
only showing top 5 rows

root
 |-- order_id: integer (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- total_amount: integer (nullable = true)



In [28]:
# Products CSV: the first row is the header; column types are inferred by Spark.
products_file = "../data/bucketing/products.csv"
df_products = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(products_file)
)

In [29]:
# Preview the first 5 products without truncating cell values, then the inferred schema.
df_products.show(n=5, truncate=False)
df_products.printSchema()

+----------+------------+-----------+-------+-----+-----+
|product_id|product_name|category   |brand  |price|stock|
+----------+------------+-----------+-------+-----+-----+
|1         |Product_1   |Electronics|Brand_4|26   |505  |
|2         |Product_2   |Apparel    |Brand_4|489  |15   |
|3         |Product_3   |Apparel    |Brand_4|102  |370  |
|4         |Product_4   |Groceries  |Brand_1|47   |433  |
|5         |Product_5   |Groceries  |Brand_3|244  |902  |
+----------+------------+-----------+-------+-----+-----+
only showing top 5 rows

root
 |-- product_id: integer (nullable = true)
 |-- product_name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- stock: integer (nullable = true)



In [30]:
# Number of distinct product_id values in the products data.
(
    df_products
    .select("product_id")
    .distinct()
    .count()
)

100

In [31]:
# Number of distinct order_id values in the orders data.
(
    df_orders
    .select("order_id")
    .distinct()
    .count()
)

                                                                                

1000

## Bucketing In Joins

In [32]:
# Inner-join the CSV-backed orders to products on product_id (no bucketing involved).
df_orders_product_details = df_orders.join(df_products, "product_id", "inner")

In [33]:
# Print the physical plan for the join on the CSV-backed DataFrames. In the
# captured output, an Exchange hashpartitioning(product_id, 200) step sits
# under the SortMergeJoin.
df_orders_product_details.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [product_id#245, order_id#244, customer_id#246, quantity#247, order_date#248, total_amount#249, product_name#304, category#305, brand#306, price#307, stock#308]
   +- SortMergeJoin [product_id#245], [product_id#303], Inner
      :- Sort [product_id#245 ASC NULLS FIRST], false, 0
      :  +- Exchange hashpartitioning(product_id#245, 200), ENSURE_REQUIREMENTS, [id=#760]
      :     +- Filter isnotnull(product_id#245)
      :        +- FileScan csv [order_id#244,product_id#245,customer_id#246,quantity#247,order_date#248,total_amount#249] Batched: false, DataFilters: [isnotnull(product_id#245)], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/Users/afaqueahmad/Documents/YouTube/spark-experiments/data/bucke..., PartitionFilters: [], PushedFilters: [IsNotNull(product_id)], ReadSchema: struct<order_id:int,product_id:int,customer_id:int,quantity:int,order_date:string,total_amount:int>
      +- Sort [product_id#303 ASC N

In [34]:
# Execute the join and report how many rows it produces.
df_orders_product_details.count()

1000

In [35]:
# Save products as the table "products_bucketed", split into 4 buckets by
# product_id, replacing any existing table of that name.
(
    df_products.write
    .mode("overwrite")
    .bucketBy(4, "product_id")
    .saveAsTable("products_bucketed")
)

                                                                                

In [36]:
# Save orders as the table "orders_bucketed", split into 4 buckets by
# product_id, replacing any existing table of that name.
(
    df_orders.write
    .mode("overwrite")
    .bucketBy(4, "product_id")
    .saveAsTable("orders_bucketed")
)

                                                                                

In [37]:
# Load the two bucketed tables back from the catalog as DataFrames.
df_orders_bucketed = spark.read.table("orders_bucketed")
df_products_bucketed = spark.read.table("products_bucketed")

In [38]:
# Same inner join on product_id as before, this time between the two bucketed tables.
df_orders_product_details_bucketed = df_orders_bucketed.join(
    df_products_bucketed, "product_id", "inner"
)

In [39]:
# Print the physical plan for the join on the bucketed tables. In the captured
# output, the orders_bucketed side goes FileScan -> Filter -> Sort with no
# Exchange step in between.
df_orders_product_details_bucketed.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [product_id#416, order_id#415, customer_id#417, quantity#418, order_date#419, total_amount#420, product_name#428, category#429, brand#430, price#431, stock#432]
   +- SortMergeJoin [product_id#416], [product_id#427], Inner
      :- Sort [product_id#416 ASC NULLS FIRST], false, 0
      :  +- Filter isnotnull(product_id#416)
      :     +- FileScan parquet default.orders_bucketed[order_id#415,product_id#416,customer_id#417,quantity#418,order_date#419,total_amount#420] Batched: true, DataFilters: [isnotnull(product_id#416)], Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/Users/afaqueahmad/Documents/YouTube/spark-experiments/spark/spar..., PartitionFilters: [], PushedFilters: [IsNotNull(product_id)], ReadSchema: struct<order_id:int,product_id:int,customer_id:int,quantity:int,order_date:string,total_amount:int>, SelectedBucketsCount: 4 out of 4
      +- Sort [product_id#427 ASC NULLS FIRST], false, 0
         +-

In [40]:
# Execute the bucketed join and report how many rows it produces.
df_orders_product_details_bucketed.count()

                                                                                

1000

## Bucketing In Aggregations

In [41]:
# Re-display the first 5 orders (untruncated) as a reminder of the columns being aggregated.
df_orders.show(n=5, truncate=False)

+--------+----------+-----------+--------+----------+------------+
|order_id|product_id|customer_id|quantity|order_date|total_amount|
+--------+----------+-----------+--------+----------+------------+
|1       |80        |10         |4       |2023-3-20 |1003        |
|2       |69        |30         |3       |2023-12-11|780         |
|3       |61        |20         |4       |2023-4-26 |1218        |
|4       |62        |44         |3       |2023-8-26 |2022        |
|5       |78        |46         |4       |2023-8-5  |1291        |
+--------+----------+-----------+--------+----------+------------+
only showing top 5 rows



In [42]:
# Total sales per product for orders dated 2023-12-11, computed on the CSV-backed orders.
df_product_sales = (
    df_orders
    .where(F.col("order_date") == "2023-12-11")
    .groupBy("product_id")
    .agg(
        F.sum("total_amount").alias("sales")
    )
)

In [43]:
# Print the physical plan for the aggregation on the CSV-backed orders. In the
# captured output, an Exchange hashpartitioning(product_id, 200) step sits
# between the partial and final HashAggregate.
df_product_sales.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[product_id#245], functions=[sum(total_amount#249)])
   +- Exchange hashpartitioning(product_id#245, 200), ENSURE_REQUIREMENTS, [id=#1155]
      +- HashAggregate(keys=[product_id#245], functions=[partial_sum(total_amount#249)])
         +- Project [product_id#245, total_amount#249]
            +- Filter (isnotnull(order_date#248) AND (order_date#248 = 2023-12-11))
               +- FileScan csv [product_id#245,order_date#248,total_amount#249] Batched: false, DataFilters: [isnotnull(order_date#248), (order_date#248 = 2023-12-11)], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/Users/afaqueahmad/Documents/YouTube/spark-experiments/data/bucke..., PartitionFilters: [], PushedFilters: [IsNotNull(order_date), EqualTo(order_date,2023-12-11)], ReadSchema: struct<product_id:int,order_date:string,total_amount:int>




In [44]:
# Save orders as the table "orders_2_bucketed", partitioned by order_date and
# split into 4 buckets by product_id.
# mode("overwrite") keeps this cell re-runnable: saveAsTable's default mode
# raises an error when the table already exists (e.g. on a second "Run All").
# It also matches the two bucketed writes earlier in the notebook.
(
    df_orders
    .write.partitionBy("order_date")
    .bucketBy(4, col="product_id")
    .mode("overwrite")
    .saveAsTable("orders_2_bucketed")
)

                                                                                

In [45]:
# Load the partitioned and bucketed orders table back from the catalog.
df_orders_bucketed_2 = (
    spark.read
    .table("orders_2_bucketed")
)

In [46]:
# Same per-product sales total for 2023-12-11, computed on the partitioned and bucketed table.
df_product_sales_bucketed = (
    df_orders_bucketed_2
    .where(F.col("order_date") == "2023-12-11")
    .groupBy("product_id")
    .agg(
        F.sum("total_amount").alias("sales")
    )
)

In [47]:
# Print the physical plan for the aggregation on the bucketed table. In the
# captured output, the two HashAggregate steps have no Exchange between them
# and the order_date predicate appears under PartitionFilters.
df_product_sales_bucketed.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[product_id#535], functions=[sum(total_amount#538)])
   +- HashAggregate(keys=[product_id#535], functions=[partial_sum(total_amount#538)])
      +- Project [product_id#535, total_amount#538]
         +- FileScan parquet default.orders_2_bucketed[product_id#535,total_amount#538,order_date#539] Batched: true, DataFilters: [], Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/Users/afaqueahmad/Documents/YouTube/spark-experiments/spark/spar..., PartitionFilters: [isnotnull(order_date#539), (order_date#539 = 2023-12-11)], PushedFilters: [], ReadSchema: struct<product_id:int,total_amount:int>, SelectedBucketsCount: 4 out of 4


