In [20]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [14]:
# Create (or reuse) the SparkSession shared by every cell below.
# NOTE: getOrCreate() returns an already-running session when one exists; in that
# case launch-time settings (executor/driver sizing, memory fractions) keep the
# values they were started with, so restart the kernel after editing them.
spark = (
    SparkSession.builder
    .appName('Olist Ecommerce Performance Optmization')
    # Resource sizing for executors and the driver.
    .config('spark.executor.memory', '6g')
    .config('spark.executor.cores', '4')
    .config('spark.executor.instances', '2')
    .config('spark.driver.memory', '4g')
    .config('spark.driver.maxResultSize', '2g')
    # Partition counts for SQL shuffles and for RDD operations.
    .config('spark.sql.shuffle.partitions', '64')
    .config('spark.default.parallelism', '64')
    # Adaptive Query Execution. The coalescing switch is spelled
    # "coalescePartitions" (plural); a misspelled key is accepted silently
    # and has no effect.
    .config('spark.sql.adaptive.enabled', 'true')
    .config('spark.sql.adaptive.coalescePartitions.enabled', 'true')
    # Join sides up to 20 MiB are broadcast automatically.
    .config('spark.sql.autoBroadcastJoinThreshold', 20 * 1024 * 1024)
    # File-scan split sizing.
    .config('spark.sql.files.maxPartitionBytes', '64MB')
    .config('spark.sql.files.openCostInBytes', '2MB')
    # Unified memory region and the share of it reserved for storage.
    .config('spark.memory.fraction', 0.8)
    .config('spark.memory.storageFraction', 0.2)
    .getOrCreate()
)

In [15]:
# HDFS directory that holds the cleaned per-table parquet datasets.
hdfs_path = "hdfs://localhost:9000/user/winardi/data/olist_proc/"

In [16]:
def _read_cleaned(file_name):
    """Read one parquet dataset located directly under ``hdfs_path``."""
    return spark.read.parquet(hdfs_path + file_name)


# One DataFrame per cleaned source table.
customers_df = _read_cleaned("customers_cleaned.parquet")
orders_df = _read_cleaned("orders_cleaned.parquet")
order_items_df = _read_cleaned("order_items_cleaned.parquet")
payments_df = _read_cleaned("payments_cleaned.parquet")
reviews_df = _read_cleaned("reviews_cleaned.parquet")
# NOTE(review): "product_cleaned" is singular while the other files use plural
# names -- confirm this matches the dataset name on HDFS.
products_df = _read_cleaned("product_cleaned.parquet")
sellers_df = _read_cleaned("sellers_cleaned.parquet")
geolocation_df = _read_cleaned("geolocation_cleaned.parquet")
category_translation_df = _read_cleaned("category_translation_cleaned.parquet")

In [10]:
# HDFS directory of the integrated (pre-joined) orders dataset.
processed_path = 'hdfs://localhost:9000/user/winardi/data/olist/processed/'

In [17]:
# Load the integrated orders dataset (parquet) from the processed directory.
full_orders_df = spark.read.format("parquet").load(processed_path)

In [18]:
# Print the column names, data types and nullability of the integrated orders DataFrame.
full_orders_df.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_timestamp: timestamp (nullable = true)
 |-- order_approved_at: timestamp (nullable = true)
 |-- order_delivered_carrier_date: timestamp (nullable = true)
 |-- order_delivered_customer_date: timestamp (nullable = true)
 |-- order_estimated_delivery_date: date (nullable = true)
 |-- order_purchase_date: date (nullable = true)
 |-- order_delivered_date: date (nullable = true)
 |-- order_item_id: integer (nullable = true)
 |-- shipping_limit_date: timestamp (nullable = true)
 |-- price: double (nullable = true)
 |-- freight_value: double (nullable = true)
 |-- payment_sequential: integer (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- payment_installments: integer (nullable = true)
 |-- payment_value: double (nullable = true)
 |-- customer_

In [22]:
# Broadcast join: hint that the customers table is small enough to be copied to
# every executor, then inner-join the orders onto it by customer_id.
# NOTE(review): the printSchema output suggests full_orders_df may already carry
# customer_* columns -- confirm this join does not duplicate them.
customers_broadcast_df = broadcast(customers_df)
optimized_broadcast_join = full_orders_df.join(
    customers_broadcast_df,
    on='customer_id',
)

In [23]:
# Sort-merge join on customer_id.
# Pre-sorting each side within its partitions does not, by itself, select the
# join strategy: the planner decides, and with the session's auto-broadcast
# threshold a small customers table may be joined with a broadcast hash join
# instead. The explicit "merge" hint asks Spark to use a sort-merge join.
# NOTE(review): the shuffle Spark inserts for the join re-partitions the rows,
# so the sortWithinPartitions() calls are kept only to preserve the existing
# variables -- consider dropping them if nothing else relies on that ordering.
sorted_customers_df = customers_df.sortWithinPartitions('customer_id')
sorted_orders_df = full_orders_df.sortWithinPartitions('customer_id')
optimized_merge_full_orders_df = sorted_orders_df.join(
    sorted_customers_df.hint('merge'),
    'customer_id',
)

In [24]:
# "Bucket" join: hash-repartition both inputs on customer_id into the same
# number of partitions (10), then inner-join them on that key.
# NOTE(review): repartition() is an in-memory shuffle, not persistent table
# bucketing (DataFrameWriter.bucketBy) -- confirm which one was intended.
bucketed_orders_df = full_orders_df.repartition(10, col('customer_id'))
bucketed_customers_df = customers_df.repartition(10, col('customer_id'))
bucket_join_df = bucketed_orders_df.join(bucketed_customers_df, on='customer_id')

In [25]:
# Skew join handling: rely on Adaptive Query Execution to split oversized
# (skewed) shuffle partitions at join time. AQE is enabled in the session
# config; the skew-join switch is set explicitly here so this cell does not
# silently depend on the environment's default for it.
spark.conf.set('spark.sql.adaptive.skewJoin.enabled', 'true')
skew_handled_join = full_orders_df.join(customers_df, 'customer_id')

In [27]:
# Caching
# Mark the integrated orders DataFrame for caching. cache() is lazy and returns
# the same DataFrame, so the count() action chained onto it is what actually
# populates the cache while reporting the row total.
print("Total records:", full_orders_df.cache().count())

Total records: 18060583
