In [None]:
import sys
from pyspark.sql import SparkSession

print("PySpark Script: ", sys.argv[0])

# Build (or reuse) a SparkSession named "Demo" and report a few facts about
# the SparkContext that backs it.
spark: SparkSession = SparkSession.builder.appName("Demo").getOrCreate()

context = spark.sparkContext
for detail in (context.version, context.pythonVer, context.master):
    print(detail)

# Shut the session down so later cells start from a clean state.
spark.stop()
print("Spark Successfully Stopped!")

In [1]:
import time
import random
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
import pyspark.sql.functions as F


# Row count passed to run_large_spark_job by the benchmark cells.
DEFAULT_DATA_SIZE = 10_000_000

# Short tags for the known row counts; the cells embed them in the Spark app name.
data_size_labels = {
    50_000_000: 'large',
    DEFAULT_DATA_SIZE: 'small',
}

def generate_data(size):
    """Return a list of ``size`` random ``(key, value)`` tuples.

    ``key`` is an integer drawn with ``random.randint(1, 10000)`` and
    ``value`` is a float drawn with ``random.random()``.
    """
    rows = []
    for _ in range(size):
        key = random.randint(1, 10000)
        rows.append((key, random.random()))
    return rows


def run_large_spark_job(spark: SparkSession, data_size: int, app_name: str, generate_rdd: bool = False):
    '''
    Run a shuffle-heavy benchmark on ``spark``, print its timing, then stop the session.

    The effective SparkConf is printed first. A DataFrame of ``data_size``
    deterministic rows ``(i, i % 100, i % 1000)`` is then built from a driver-side
    list, grouped by ``group * 10``, aggregated (average of ``subgroup``, count of
    ``id``) and ordered by the average. The reported time spans data generation
    through ``show()``.

    The session is stopped in a ``finally`` block so that it is released even if
    the job is interrupted (e.g. ``KeyboardInterrupt``, which is not an
    ``Exception``). Otherwise a later ``SparkSession.builder...getOrCreate()``
    would return the still-active session rather than one built from the new
    configuration, and the next experiment would measure the wrong settings.

    Args:
        spark: Session to run the job on. It is always stopped before returning.
        data_size: Number of rows to generate.
        app_name: Not used by the job; kept so existing callers keep working.
        generate_rdd: Not used by the job; kept for backward compatibility.

    Returns:
        None. Errors derived from ``Exception`` are reported on stdout, not raised.
    '''
    try:
        print("Running spark job for SparkConf:")
        for key, value in spark.sparkContext.getConf().getAll():
            print(f"{key} = {value}")
        print()

        start_time = time.perf_counter()
        spark.sparkContext.setLogLevel("ERROR")

        # Deterministic synthetic rows: 100 distinct groups, 1000 distinct subgroups.
        data = [(i, i % 100, i % 1000) for i in range(data_size)]
        columns = ["id", "group", "subgroup"]
        large_df = spark.createDataFrame(data, columns)

        # groupBy and orderBy each force a shuffle, which is what the tuned
        # settings in the calling cells are meant to influence.
        result_df = (large_df
                    .withColumn("group_id", F.col("group") * 10)
                    .groupBy("group_id")
                    .agg(F.avg("subgroup").alias("avg_subgroup"),
                        F.count("id").alias("count"))
                    .orderBy("avg_subgroup"))
        result_df.show()

        print("Execution Time (secs):", time.perf_counter() - start_time)
    except Exception as e:
        # Best-effort benchmark: report the failure instead of aborting the notebook.
        print('Stopping context with error', e)
    finally:
        # Single shutdown point for the success, failure and interrupt paths.
        spark.stop()

## Baseline: Default Spark Config

- `spark.serializer` = org.apache.spark.serializer.JavaSerializer

- `spark.reducer.maxSizeInFlight` = 48m

- `spark.shuffle.compress` = true

- `spark.shuffle.spill.compress` = true

- `spark.rdd.compress` = false

- `spark.shuffle.file.buffer` = 32k

- `spark.shuffle.io.preferDirectBufs` = true

- `spark.io.compression.codec` = lz4



Source: https://spark.apache.org/docs/latest/tuning.html


In [2]:
# Baseline run: each tuning property is set explicitly to the value this
# notebook treats as Spark's default, with event logging to /tmp/spark-events.
data_size_label = data_size_labels.get(DEFAULT_DATA_SIZE, DEFAULT_DATA_SIZE)
app_name = f'Baseline: Default Config (data_size={data_size_label})'
spark: SparkSession = (
    SparkSession.builder
    .master("local[*]")
    .appName(app_name)
    .config("spark.reducer.maxSizeInFlight", "48m")
    .config("spark.shuffle.compress", "true")
    .config("spark.io.compression.codec", "lz4")
    .config("spark.shuffle.io.preferDirectBufs", "true")
    .config("spark.serializer", "org.apache.spark.serializer.JavaSerializer")
    .config("spark.shuffle.file.buffer", "32k")
    .config("spark.shuffle.spill.compress", "true")
    .config("spark.eventLog.compress", "true")
    .config("spark.eventLog.enabled", "true")
    .config("spark.eventLog.dir", "/tmp/spark-events")
    .getOrCreate()
)
run_large_spark_job(spark=spark, data_size=DEFAULT_DATA_SIZE, app_name=app_name)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/20 17:39:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Running spark job for SparkConf:
spark.eventLog.enabled = true
spark.reducer.maxSizeInFlight = 48m
spark.serializer = org.apache.spark.serializer.JavaSerializer
spark.driver.extraJavaOptions = -Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-U

                                                                                

+--------+------------+------+
|group_id|avg_subgroup| count|
+--------+------------+------+
|       0|       450.0|100000|
|      10|       451.0|100000|
|      20|       452.0|100000|
|      30|       453.0|100000|
|      40|       454.0|100000|
|      50|       455.0|100000|
|      60|       456.0|100000|
|      70|       457.0|100000|
|      80|       458.0|100000|
|      90|       459.0|100000|
|     100|       460.0|100000|
|     110|       461.0|100000|
|     120|       462.0|100000|
|     130|       463.0|100000|
|     140|       464.0|100000|
|     150|       465.0|100000|
|     160|       466.0|100000|
|     170|       467.0|100000|
|     180|       468.0|100000|
|     190|       469.0|100000|
+--------+------------+------+
only showing top 20 rows

Execution Time (secs): 100.58739495277405


## Experiment 1: Changing the serializer

spark.serializer = JavaSerializer -> KryoSerializer

In [3]:
# Experiment 1: baseline settings, except spark.serializer is KryoSerializer.
data_size_label = data_size_labels.get(DEFAULT_DATA_SIZE, DEFAULT_DATA_SIZE)
app_name = f'Experiment 1: KryoSerializer (data_size={data_size_label})'
spark: SparkSession = (
    SparkSession.builder
    .master("local[*]")
    .appName(app_name)
    .config("spark.reducer.maxSizeInFlight", "48m")
    .config("spark.shuffle.compress", "true")
    .config("spark.io.compression.codec", "lz4")
    .config("spark.shuffle.io.preferDirectBufs", "true")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.shuffle.file.buffer", "32k")
    .config("spark.shuffle.spill.compress", "true")
    .config("spark.eventLog.compress", "true")
    .config("spark.eventLog.enabled", "true")
    .config("spark.eventLog.dir", "/tmp/spark-events")
    .getOrCreate()
)
run_large_spark_job(spark=spark, data_size=DEFAULT_DATA_SIZE, app_name=app_name)

Running spark job for SparkConf:
spark.eventLog.enabled = true
spark.reducer.maxSizeInFlight = 48m
spark.driver.extraJavaOptions = -Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED -Djdk.reflect.useDirectMethodHandle=false
spark.app.id 

                                                                                

+--------+------------+------+
|group_id|avg_subgroup| count|
+--------+------------+------+
|       0|       450.0|100000|
|      10|       451.0|100000|
|      20|       452.0|100000|
|      30|       453.0|100000|
|      40|       454.0|100000|
|      50|       455.0|100000|
|      60|       456.0|100000|
|      70|       457.0|100000|
|      80|       458.0|100000|
|      90|       459.0|100000|
|     100|       460.0|100000|
|     110|       461.0|100000|
|     120|       462.0|100000|
|     130|       463.0|100000|
|     140|       464.0|100000|
|     150|       465.0|100000|
|     160|       466.0|100000|
|     170|       467.0|100000|
|     180|       468.0|100000|
|     190|       469.0|100000|
+--------+------------+------+
only showing top 20 rows

Execution Time (secs): 99.09791493415833


## Experiment 2.1: Decreasing the Shuffle File Buffer
- spark.shuffle.file.buffer = 32k -> 16k

In [4]:
# Experiment 2.1: baseline settings, except spark.shuffle.file.buffer is 16k.
data_size_label = data_size_labels.get(DEFAULT_DATA_SIZE, DEFAULT_DATA_SIZE)
app_name = f'Experiment 2.1: Decrease file_buffer to 16k (data_size={data_size_label})'
spark: SparkSession = (
    SparkSession.builder
    .master("local[*]")
    .appName(app_name)
    .config("spark.reducer.maxSizeInFlight", "48m")
    .config("spark.shuffle.compress", "true")
    .config("spark.io.compression.codec", "lz4")
    .config("spark.shuffle.io.preferDirectBufs", "true")
    .config("spark.serializer", "org.apache.spark.serializer.JavaSerializer")
    .config("spark.shuffle.file.buffer", "16k")
    .config("spark.shuffle.spill.compress", "true")
    .config("spark.eventLog.compress", "true")
    .config("spark.eventLog.enabled", "true")
    .config("spark.eventLog.dir", "/tmp/spark-events")
    .getOrCreate()
)
run_large_spark_job(spark=spark, data_size=DEFAULT_DATA_SIZE, app_name=app_name)

Running spark job for SparkConf:
spark.eventLog.enabled = true
spark.reducer.maxSizeInFlight = 48m
spark.serializer = org.apache.spark.serializer.JavaSerializer
spark.app.startTime = 1710971067455
spark.driver.extraJavaOptions = -Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.s

                                                                                

+--------+------------+------+
|group_id|avg_subgroup| count|
+--------+------------+------+
|       0|       450.0|100000|
|      10|       451.0|100000|
|      20|       452.0|100000|
|      30|       453.0|100000|
|      40|       454.0|100000|
|      50|       455.0|100000|
|      60|       456.0|100000|
|      70|       457.0|100000|
|      80|       458.0|100000|
|      90|       459.0|100000|
|     100|       460.0|100000|
|     110|       461.0|100000|
|     120|       462.0|100000|
|     130|       463.0|100000|
|     140|       464.0|100000|
|     150|       465.0|100000|
|     160|       466.0|100000|
|     170|       467.0|100000|
|     180|       468.0|100000|
|     190|       469.0|100000|
+--------+------------+------+
only showing top 20 rows

Execution Time (secs): 97.77148795127869


## Experiment 2.2: Increasing the Shuffle File Buffer
- spark.shuffle.file.buffer = 32k -> 64k

In [5]:
# Experiment 2.2: baseline settings, except spark.shuffle.file.buffer is 64k.
data_size_label = data_size_labels.get(DEFAULT_DATA_SIZE, DEFAULT_DATA_SIZE)
app_name = f'Experiment 2.2: Increase file buffer to 64k (data_size={data_size_label})'
spark: SparkSession = (
    SparkSession.builder
    .master("local[*]")
    .appName(app_name)
    .config("spark.reducer.maxSizeInFlight", "48m")
    .config("spark.shuffle.compress", "true")
    .config("spark.io.compression.codec", "lz4")
    .config("spark.shuffle.io.preferDirectBufs", "true")
    .config("spark.serializer", "org.apache.spark.serializer.JavaSerializer")
    .config("spark.shuffle.file.buffer", "64k")
    .config("spark.shuffle.spill.compress", "true")
    .config("spark.eventLog.compress", "true")
    .config("spark.eventLog.enabled", "true")
    .config("spark.eventLog.dir", "/tmp/spark-events")
    .getOrCreate()
)
run_large_spark_job(spark=spark, data_size=DEFAULT_DATA_SIZE, app_name=app_name)

Running spark job for SparkConf:
spark.eventLog.enabled = true
spark.reducer.maxSizeInFlight = 48m
spark.serializer = org.apache.spark.serializer.JavaSerializer
spark.driver.extraJavaOptions = -Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-U

                                                                                

+--------+------------+------+
|group_id|avg_subgroup| count|
+--------+------------+------+
|       0|       450.0|100000|
|      10|       451.0|100000|
|      20|       452.0|100000|
|      30|       453.0|100000|
|      40|       454.0|100000|
|      50|       455.0|100000|
|      60|       456.0|100000|
|      70|       457.0|100000|
|      80|       458.0|100000|
|      90|       459.0|100000|
|     100|       460.0|100000|
|     110|       461.0|100000|
|     120|       462.0|100000|
|     130|       463.0|100000|
|     140|       464.0|100000|
|     150|       465.0|100000|
|     160|       466.0|100000|
|     170|       467.0|100000|
|     180|       468.0|100000|
|     190|       469.0|100000|
+--------+------------+------+
only showing top 20 rows

Execution Time (secs): 98.58964681625366


## Experiment 3.1: Turn off shuffle compress
- spark.shuffle.compress = true -> false


In [6]:
# Experiment 3.1: baseline settings, except spark.shuffle.compress is false.
data_size_label = data_size_labels.get(DEFAULT_DATA_SIZE, DEFAULT_DATA_SIZE)
app_name = f'Experiment 3.1: No Shuffle Compression (data_size={data_size_label})'
spark: SparkSession = (
    SparkSession.builder
    .master("local[*]")
    .appName(app_name)
    .config("spark.reducer.maxSizeInFlight", "48m")
    .config("spark.shuffle.compress", "false")
    .config("spark.io.compression.codec", "lz4")
    .config("spark.shuffle.io.preferDirectBufs", "true")
    .config("spark.serializer", "org.apache.spark.serializer.JavaSerializer")
    .config("spark.shuffle.file.buffer", "32k")
    .config("spark.shuffle.spill.compress", "true")
    .config("spark.eventLog.compress", "true")
    .config("spark.eventLog.enabled", "true")
    .config("spark.eventLog.dir", "/tmp/spark-events")
    .getOrCreate()
)
run_large_spark_job(spark=spark, data_size=DEFAULT_DATA_SIZE, app_name=app_name)

Running spark job for SparkConf:
spark.eventLog.enabled = true
spark.reducer.maxSizeInFlight = 48m
spark.serializer = org.apache.spark.serializer.JavaSerializer
spark.driver.extraJavaOptions = -Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-U

                                                                                

+--------+------------+------+
|group_id|avg_subgroup| count|
+--------+------------+------+
|       0|       450.0|100000|
|      10|       451.0|100000|
|      20|       452.0|100000|
|      30|       453.0|100000|
|      40|       454.0|100000|
|      50|       455.0|100000|
|      60|       456.0|100000|
|      70|       457.0|100000|
|      80|       458.0|100000|
|      90|       459.0|100000|
|     100|       460.0|100000|
|     110|       461.0|100000|
|     120|       462.0|100000|
|     130|       463.0|100000|
|     140|       464.0|100000|
|     150|       465.0|100000|
|     160|       466.0|100000|
|     170|       467.0|100000|
|     180|       468.0|100000|
|     190|       469.0|100000|
+--------+------------+------+
only showing top 20 rows

Execution Time (secs): 102.70968198776245


## Experiment 3.2: Turn off shuffle.spill compress
- spark.shuffle.spill.compress = true -> false


In [7]:
# Experiment 3.2: baseline settings, except spark.shuffle.spill.compress is false.
data_size_label = data_size_labels.get(DEFAULT_DATA_SIZE, DEFAULT_DATA_SIZE)
app_name = f'Experiment 3.2: No Shuffle Spill Compression (data_size={data_size_label})'
spark: SparkSession = (
    SparkSession.builder
    .master("local[*]")
    .appName(app_name)
    .config("spark.reducer.maxSizeInFlight", "48m")
    .config("spark.shuffle.compress", "true")
    .config("spark.io.compression.codec", "lz4")
    .config("spark.shuffle.io.preferDirectBufs", "true")
    .config("spark.serializer", "org.apache.spark.serializer.JavaSerializer")
    .config("spark.shuffle.file.buffer", "32k")
    .config("spark.shuffle.spill.compress", "false")
    .config("spark.eventLog.compress", "true")
    .config("spark.eventLog.enabled", "true")
    .config("spark.eventLog.dir", "/tmp/spark-events")
    .getOrCreate()
)
run_large_spark_job(spark=spark, data_size=DEFAULT_DATA_SIZE, app_name=app_name)

Running spark job for SparkConf:
spark.driver.port = 63867
spark.eventLog.enabled = true
spark.reducer.maxSizeInFlight = 48m
spark.serializer = org.apache.spark.serializer.JavaSerializer
spark.driver.extraJavaOptions = -Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jg

                                                                                

+--------+------------+------+
|group_id|avg_subgroup| count|
+--------+------------+------+
|       0|       450.0|100000|
|      10|       451.0|100000|
|      20|       452.0|100000|
|      30|       453.0|100000|
|      40|       454.0|100000|
|      50|       455.0|100000|
|      60|       456.0|100000|
|      70|       457.0|100000|
|      80|       458.0|100000|
|      90|       459.0|100000|
|     100|       460.0|100000|
|     110|       461.0|100000|
|     120|       462.0|100000|
|     130|       463.0|100000|
|     140|       464.0|100000|
|     150|       465.0|100000|
|     160|       466.0|100000|
|     170|       467.0|100000|
|     180|       468.0|100000|
|     190|       469.0|100000|
+--------+------------+------+
only showing top 20 rows

Execution Time (secs): 98.37648487091064


## Experiment 4.1: Change compression codec to: zstd
- spark.io.compression.codec = lz4 -> zstd

In [8]:
# Experiment 4.1: baseline settings, except spark.io.compression.codec is zstd.
data_size_label = data_size_labels.get(DEFAULT_DATA_SIZE, DEFAULT_DATA_SIZE)
app_name = f'Experiment 4.1: compression=zstd (data_size={data_size_label})'
spark: SparkSession = (
    SparkSession.builder
    .master("local[*]")
    .appName(app_name)
    .config("spark.reducer.maxSizeInFlight", "48m")
    .config("spark.shuffle.compress", "true")
    .config("spark.io.compression.codec", "zstd")
    .config("spark.shuffle.io.preferDirectBufs", "true")
    .config("spark.serializer", "org.apache.spark.serializer.JavaSerializer")
    .config("spark.shuffle.file.buffer", "32k")
    .config("spark.shuffle.spill.compress", "true")
    .config("spark.eventLog.compress", "true")
    .config("spark.eventLog.enabled", "true")
    .config("spark.eventLog.dir", "/tmp/spark-events")
    .getOrCreate()
)
run_large_spark_job(spark=spark, data_size=DEFAULT_DATA_SIZE, app_name=app_name)

Running spark job for SparkConf:
spark.app.name = Experiment 4.1: compression=zstd (data_size=small)
spark.eventLog.enabled = true
spark.reducer.maxSizeInFlight = 48m
spark.serializer = org.apache.spark.serializer.JavaSerializer
spark.driver.extraJavaOptions = -Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calenda

                                                                                

+--------+------------+------+
|group_id|avg_subgroup| count|
+--------+------------+------+
|       0|       450.0|100000|
|      10|       451.0|100000|
|      20|       452.0|100000|
|      30|       453.0|100000|
|      40|       454.0|100000|
|      50|       455.0|100000|
|      60|       456.0|100000|
|      70|       457.0|100000|
|      80|       458.0|100000|
|      90|       459.0|100000|
|     100|       460.0|100000|
|     110|       461.0|100000|
|     120|       462.0|100000|
|     130|       463.0|100000|
|     140|       464.0|100000|
|     150|       465.0|100000|
|     160|       466.0|100000|
|     170|       467.0|100000|
|     180|       468.0|100000|
|     190|       469.0|100000|
+--------+------------+------+
only showing top 20 rows

Execution Time (secs): 96.88545989990234


## Experiment 4.2: Change compression codec to: snappy
- spark.io.compression.codec = lz4 -> snappy

In [9]:
# Experiment 4.2: baseline settings, except spark.io.compression.codec is snappy.
data_size_label = data_size_labels.get(DEFAULT_DATA_SIZE, DEFAULT_DATA_SIZE)
app_name = f'Experiment 4.2: compression=snappy (data_size={data_size_label})'
spark: SparkSession = (
    SparkSession.builder
    .master("local[*]")
    .appName(app_name)
    .config("spark.reducer.maxSizeInFlight", "48m")
    .config("spark.shuffle.compress", "true")
    .config("spark.io.compression.codec", "snappy")
    .config("spark.shuffle.io.preferDirectBufs", "true")
    .config("spark.serializer", "org.apache.spark.serializer.JavaSerializer")
    .config("spark.shuffle.file.buffer", "32k")
    .config("spark.shuffle.spill.compress", "true")
    .config("spark.eventLog.compress", "true")
    .config("spark.eventLog.enabled", "true")
    .config("spark.eventLog.dir", "/tmp/spark-events")
    .getOrCreate()
)
run_large_spark_job(spark=spark, data_size=DEFAULT_DATA_SIZE, app_name=app_name)

Running spark job for SparkConf:
spark.eventLog.enabled = true
spark.reducer.maxSizeInFlight = 48m
spark.serializer = org.apache.spark.serializer.JavaSerializer
spark.app.startTime = 1710971877679
spark.driver.extraJavaOptions = -Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.s

                                                                                

+--------+------------+------+
|group_id|avg_subgroup| count|
+--------+------------+------+
|       0|       450.0|100000|
|      10|       451.0|100000|
|      20|       452.0|100000|
|      30|       453.0|100000|
|      40|       454.0|100000|
|      50|       455.0|100000|
|      60|       456.0|100000|
|      70|       457.0|100000|
|      80|       458.0|100000|
|      90|       459.0|100000|
|     100|       460.0|100000|
|     110|       461.0|100000|
|     120|       462.0|100000|
|     130|       463.0|100000|
|     140|       464.0|100000|
|     150|       465.0|100000|
|     160|       466.0|100000|
|     170|       467.0|100000|
|     180|       468.0|100000|
|     190|       469.0|100000|
+--------+------------+------+
only showing top 20 rows

Execution Time (secs): 97.54408407211304


## Experiment 4.3: Change compression codec to: lzf
- spark.io.compression.codec = lz4 -> lzf

In [10]:
# Experiment 4.3: baseline settings, except spark.io.compression.codec is lzf.
data_size_label = data_size_labels.get(DEFAULT_DATA_SIZE, DEFAULT_DATA_SIZE)
app_name = f'Experiment 4.3: compression=lzf (data_size={data_size_label})'
spark: SparkSession = (
    SparkSession.builder
    .master("local[*]")
    .appName(app_name)
    .config("spark.reducer.maxSizeInFlight", "48m")
    .config("spark.shuffle.compress", "true")
    .config("spark.io.compression.codec", "lzf")
    .config("spark.shuffle.io.preferDirectBufs", "true")
    .config("spark.serializer", "org.apache.spark.serializer.JavaSerializer")
    .config("spark.shuffle.file.buffer", "32k")
    .config("spark.shuffle.spill.compress", "true")
    .config("spark.eventLog.compress", "true")
    .config("spark.eventLog.enabled", "true")
    .config("spark.eventLog.dir", "/tmp/spark-events")
    .getOrCreate()
)
run_large_spark_job(spark=spark, data_size=DEFAULT_DATA_SIZE, app_name=app_name)

Running spark job for SparkConf:
spark.eventLog.enabled = true
spark.reducer.maxSizeInFlight = 48m
spark.serializer = org.apache.spark.serializer.JavaSerializer
spark.driver.extraJavaOptions = -Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-U

                                                                                

+--------+------------+------+
|group_id|avg_subgroup| count|
+--------+------------+------+
|       0|       450.0|100000|
|      10|       451.0|100000|
|      20|       452.0|100000|
|      30|       453.0|100000|
|      40|       454.0|100000|
|      50|       455.0|100000|
|      60|       456.0|100000|
|      70|       457.0|100000|
|      80|       458.0|100000|
|      90|       459.0|100000|
|     100|       460.0|100000|
|     110|       461.0|100000|
|     120|       462.0|100000|
|     130|       463.0|100000|
|     140|       464.0|100000|
|     150|       465.0|100000|
|     160|       466.0|100000|
|     170|       467.0|100000|
|     180|       468.0|100000|
|     190|       469.0|100000|
+--------+------------+------+
only showing top 20 rows

Execution Time (secs): 96.0404200553894


## Final Optimization

TODO: Figure out the final optimization

- spark.serializer = JavaSerializer -> KryoSerializer

- spark.shuffle.compress = true -> false

- spark.shuffle.spill.compress = true -> false

- spark.shuffle.file.buffer = 32k -> 64k

- spark.io.compression.codec = lz4 -> zstd


In [None]:
# Final run: combine the changes planned for the final optimization:
# Kryo serializer, shuffle and shuffle-spill compression off, 64k shuffle
# file buffer, and the zstd codec. All other properties keep their baseline values.
data_size_label = data_size_labels.get(DEFAULT_DATA_SIZE, DEFAULT_DATA_SIZE)
app_name = f'Final Optimization (data_size={data_size_label})'
spark: SparkSession = (
    SparkSession.builder
    .master("local[*]")
    .appName(app_name)
    .config("spark.reducer.maxSizeInFlight", "48m")
    .config("spark.shuffle.compress", "false")
    .config("spark.io.compression.codec", "zstd")
    .config("spark.shuffle.io.preferDirectBufs", "true")
    # The class is spelled "KryoSerializer"; a misspelled class name cannot be loaded.
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.shuffle.file.buffer", "64k")
    # Disabled to match the planned change spark.shuffle.spill.compress = true -> false.
    .config("spark.shuffle.spill.compress", "false")
    .config("spark.eventLog.compress", "true")
    .config("spark.eventLog.enabled", "true")
    .config("spark.eventLog.dir", "/tmp/spark-events")
    .getOrCreate()
)
run_large_spark_job(spark=spark, data_size=DEFAULT_DATA_SIZE, app_name=app_name)