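# Bucketing

This notebook compares Spark physical plans for joins and aggregations on plain CSV-backed DataFrames against the same data saved as tables bucketed by `product_id`, and then shows bucket pruning on a filtered query.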

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
from IPython.core.interactiveshell import InteractiveShell

# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [3]:
from pyspark.storagelevel import StorageLevel
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

In [4]:
# Create (or reuse) a local Spark session with a 10g driver.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.driver.memory", "10g")
    .getOrCreate()
)

# Keep a handle on the SparkContext and only log errors from here on.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

# -1 turns off automatic broadcast joins, so the join strategy in the plans
# below is not replaced by a broadcast of the small products table.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/15 13:36:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Orders dataset: first line is the header, column types are inferred from the data.
orders_file = "../data/bucketing/orders.csv"
df_orders = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(orders_file)
)

                                                                                

In [6]:
# Preview the first five orders without truncating values, then the inferred schema.
df_orders.show(n=5, truncate=False)
df_orders.printSchema()

+--------+----------+-----------+--------+----------+------------+
|order_id|product_id|customer_id|quantity|order_date|total_amount|
+--------+----------+-----------+--------+----------+------------+
|1       |80        |10         |4       |2023-3-20 |1003        |
|2       |69        |30         |3       |2023-12-11|780         |
|3       |61        |20         |4       |2023-4-26 |1218        |
|4       |62        |44         |3       |2023-8-26 |2022        |
|5       |78        |46         |4       |2023-8-5  |1291        |
+--------+----------+-----------+--------+----------+------------+
only showing top 5 rows

root
 |-- order_id: integer (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- total_amount: integer (nullable = true)



In [7]:
# Products dataset: first line is the header, column types are inferred from the data.
products_file = "../data/bucketing/products.csv"
df_products = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(products_file)
)

In [8]:
# Preview the first five products without truncating values, then the inferred schema.
df_products.show(n=5, truncate=False)
df_products.printSchema()

+----------+------------+-----------+-------+-----+-----+
|product_id|product_name|category   |brand  |price|stock|
+----------+------------+-----------+-------+-----+-----+
|1         |Product_1   |Electronics|Brand_4|26   |505  |
|2         |Product_2   |Apparel    |Brand_4|489  |15   |
|3         |Product_3   |Apparel    |Brand_4|102  |370  |
|4         |Product_4   |Groceries  |Brand_1|47   |433  |
|5         |Product_5   |Groceries  |Brand_3|244  |902  |
+----------+------------+-----------+-------+-----+-----+
only showing top 5 rows

root
 |-- product_id: integer (nullable = true)
 |-- product_name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- stock: integer (nullable = true)



In [9]:
# How many distinct product ids does the products dataset contain?
(
    df_products
    .select("product_id")
    .distinct()
    .count()
)

100

In [10]:
# How many distinct order ids does the orders dataset contain?
(
    df_orders
    .select("order_id")
    .distinct()
    .count()
)

1000

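## Bucketing In Joins

Join orders with products on `product_id`, first from the CSV-backed DataFrames and then from tables bucketed on the join key, and compare the physical plans: the CSV plan shuffles the data (`Exchange hashpartitioning`) ahead of the sort-merge join, whereas the visible part of the bucketed plan scans the bucketed table with no `Exchange`.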

In [11]:
# Enrich each order with its product attributes (inner join on the shared product_id column).
df_orders_product_details = df_orders.join(df_products, "product_id", "inner")

In [12]:
# Print the physical plan of the non-bucketed join.
df_orders_product_details.explain(extended=False)

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [product_id#17, order_id#16, customer_id#18, quantity#19, order_date#20, total_amount#21, product_name#76, category#77, brand#78, price#79, stock#80]
   +- SortMergeJoin [product_id#17], [product_id#75], Inner
      :- Sort [product_id#17 ASC NULLS FIRST], false, 0
      :  +- Exchange hashpartitioning(product_id#17, 200), ENSURE_REQUIREMENTS, [id=#259]
      :     +- Filter isnotnull(product_id#17)
      :        +- FileScan csv [order_id#16,product_id#17,customer_id#18,quantity#19,order_date#20,total_amount#21] Batched: false, DataFilters: [isnotnull(product_id#17)], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/Users/afaqueahmad/Documents/YouTube/spark-experiments/data/bucke..., PartitionFilters: [], PushedFilters: [IsNotNull(product_id)], ReadSchema: struct<order_id:int,product_id:int,customer_id:int,quantity:int,order_date:string,total_amount:int>
      +- Sort [product_id#75 ASC NULLS FIRST], false, 0
  

In [13]:
# Row count of the non-bucketed join result.
(
    df_orders_product_details
    .count()
)

1000

In [15]:
# Save products as table `products_bucketed`, split into 4 buckets by product_id.
# An existing table with that name is overwritten.
(
    df_products.write
    .mode("overwrite")
    .bucketBy(4, "product_id")
    .saveAsTable("products_bucketed")
)

                                                                                

In [16]:
# Save orders as table `orders_bucketed`, split into 4 buckets by product_id.
# An existing table with that name is overwritten.
(
    df_orders.write
    .mode("overwrite")
    .bucketBy(4, "product_id")
    .saveAsTable("orders_bucketed")
)

In [17]:
# Load both bucketed tables back by name.
df_products_bucketed = spark.table("products_bucketed")
df_orders_bucketed = spark.table("orders_bucketed")

In [18]:
# Same inner join on product_id as before, this time between the two bucketed tables.
df_orders_product_details_bucketed = df_orders_bucketed.join(
    df_products_bucketed, "product_id", "inner"
)

In [19]:
# Print the physical plan of the join between the bucketed tables.
df_orders_product_details_bucketed.explain(extended=False)

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [product_id#194, order_id#193, customer_id#195, quantity#196, order_date#197, total_amount#198, product_name#206, category#207, brand#208, price#209, stock#210]
   +- SortMergeJoin [product_id#194], [product_id#205], Inner
      :- Sort [product_id#194 ASC NULLS FIRST], false, 0
      :  +- Filter isnotnull(product_id#194)
      :     +- FileScan parquet default.orders_bucketed[order_id#193,product_id#194,customer_id#195,quantity#196,order_date#197,total_amount#198] Batched: true, DataFilters: [isnotnull(product_id#194)], Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/Users/afaqueahmad/Documents/YouTube/spark-experiments/spark/spar..., PartitionFilters: [], PushedFilters: [IsNotNull(product_id)], ReadSchema: struct<order_id:int,product_id:int,customer_id:int,quantity:int,order_date:string,total_amount:int>, SelectedBucketsCount: 4 out of 4
      +- Sort [product_id#205 ASC NULLS FIRST], false, 0
         +-

In [20]:
# Row count of the bucketed join result.
(
    df_orders_product_details_bucketed
    .count()
)

1000

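## Bucketing In Aggregations

Sum `total_amount` per `product_id` with and without bucketing: the plan over the CSV data has an `Exchange` between the partial and final `HashAggregate`, while the plan over `orders_bucketed` has none.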

In [21]:
# Reminder of the orders columns used by the aggregations below.
df_orders.show(n=5, truncate=False)

+--------+----------+-----------+--------+----------+------------+
|order_id|product_id|customer_id|quantity|order_date|total_amount|
+--------+----------+-----------+--------+----------+------------+
|1       |80        |10         |4       |2023-3-20 |1003        |
|2       |69        |30         |3       |2023-12-11|780         |
|3       |61        |20         |4       |2023-4-26 |1218        |
|4       |62        |44         |3       |2023-8-26 |2022        |
|5       |78        |46         |4       |2023-8-5  |1291        |
+--------+----------+-----------+--------+----------+------------+
only showing top 5 rows



In [36]:
# WITHOUT BUCKETING
# Total sales per product, computed from the CSV-backed orders DataFrame.

df_product_sales = df_orders.groupBy("product_id").agg(
    F.sum("total_amount").alias("sales")
)

# Print the physical plan of the aggregation.
df_product_sales.explain(extended=False)

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[product_id#17], functions=[sum(total_amount#21)])
   +- Exchange hashpartitioning(product_id#17, 200), ENSURE_REQUIREMENTS, [id=#763]
      +- HashAggregate(keys=[product_id#17], functions=[partial_sum(total_amount#21)])
         +- FileScan csv [product_id#17,total_amount#21] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/Users/afaqueahmad/Documents/YouTube/spark-experiments/data/bucke..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<product_id:int,total_amount:int>




In [37]:
# WITH BUCKETING
# Total sales per product, computed from the orders table bucketed by product_id.

df_product_sales = df_orders_bucketed.groupBy("product_id").agg(
    F.sum("total_amount").alias("sales")
)

# Print the physical plan of the aggregation.
df_product_sales.explain(extended=False)

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[product_id#194], functions=[sum(total_amount#198)])
   +- HashAggregate(keys=[product_id#194], functions=[partial_sum(total_amount#198)])
      +- FileScan parquet default.orders_bucketed[product_id#194,total_amount#198] Batched: true, DataFilters: [], Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/Users/afaqueahmad/Documents/YouTube/spark-experiments/spark/spar..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<product_id:int,total_amount:int>, SelectedBucketsCount: 4 out of 4




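## Bucket Pruning

Filtering the bucketed table on its bucketing column with an equality predicate lets Spark scan only the matching bucket; the plan below reports `SelectedBucketsCount: 1 out of 4`.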

In [30]:
# Total sales for product 1 only, read from the orders table bucketed by product_id.
df_product_sales_bucket_pruning = (
    df_orders_bucketed.filter(F.col("product_id") == 1)
    .groupBy("product_id")
    .agg(
        F.sum("total_amount").alias("sales")
    )
)

In [31]:
# Print the physical plan; the FileScan line shows how many buckets were selected.
df_product_sales_bucket_pruning.explain(extended=False)

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[product_id#194], functions=[sum(total_amount#198)])
   +- HashAggregate(keys=[product_id#194], functions=[partial_sum(total_amount#198)])
      +- Filter (isnotnull(product_id#194) AND (product_id#194 = 1))
         +- FileScan parquet default.orders_bucketed[product_id#194,total_amount#198] Batched: true, DataFilters: [isnotnull(product_id#194), (product_id#194 = 1)], Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/Users/afaqueahmad/Documents/YouTube/spark-experiments/spark/spar..., PartitionFilters: [], PushedFilters: [IsNotNull(product_id), EqualTo(product_id,1)], ReadSchema: struct<product_id:int,total_amount:int>, SelectedBucketsCount: 1 out of 4


