In [None]:
import sys
from pyspark.sql import SparkSession

# NOTE: under a Jupyter kernel sys.argv[0] is typically the kernel launcher
# path rather than this notebook; it is the script path only when run as a script.
print("PySpark Script: ", sys.argv[0])

# Start (or reuse) a SparkSession and grab its underlying SparkContext.
spark: SparkSession = SparkSession.builder.appName("Demo").getOrCreate()
try:
    sc = spark.sparkContext
    sc.setLogLevel("WARN")

    # Print the context's version, pythonVer and master attributes.
    print(sc.version)
    print(sc.pythonVer)
    print(sc.master)
finally:
    # Stop PySpark even if one of the calls above raises, so a failed run
    # does not leave a live session behind in the kernel.
    spark.stop()
print("Spark Successfully Stopped!")

In [1]:
from pyspark import SparkConf, SparkContext
import random
import time

# Function to generate a large dataset of key-value pairs
def generate_data(size, seed=None):
    """
    Generate a list of random (key, value) pairs.

    Args:
        size: Number of pairs to generate. Zero or a negative value yields
            an empty list.
        seed: Optional seed. When given, a private ``random.Random(seed)``
            generator is used so the same seed always produces the same
            data without touching the global random state. When ``None``
            (the default) the module-level ``random`` functions are used.

    Returns:
        A list of ``size`` tuples ``(key, value)`` where ``key`` is an int
        in ``[1, 10000]`` and ``value`` is a float in ``[0.0, 1.0)``.
    """
    rng = random if seed is None else random.Random(seed)
    return [(rng.randint(1, 10000), rng.random()) for _ in range(size)]

def run_spark_job(serializer, data_size):
    """
    Run a reduceByKey job on random data with the given Spark serializer.

    Creates a local SparkContext with ``spark.serializer`` set to
    ``serializer``, parallelizes ``data_size`` random (key, value) pairs,
    sums the values per key, and prints the number of resulting keys, the
    elapsed wall-clock time and a rough estimate of the input size.

    Args:
        serializer: Value for the ``spark.serializer`` setting, e.g.
            "org.apache.spark.serializer.KryoSerializer".
        data_size: Number of (key, value) pairs to generate.

    Returns:
        None. All results are printed.

    Note:
        The timer starts before the SparkContext is created and before the
        data is generated, so the reported time includes both of those
        steps, not only the shuffle.
    """
    start_time = time.time()
    # Configure Spark to use the specified serializer
    conf = SparkConf() \
        .setAppName(f"SerializationTest-{serializer}") \
        .setMaster("local[*]") \
        .set("spark.serializer", serializer)

    sc = SparkContext(conf=conf)
    try:
        sc.setLogLevel("WARN")

        # Generate a large random dataset and parallelize it
        data = generate_data(data_size)
        rdd = sc.parallelize(data)

        # Perform reduceByKey (requires shuffling)
        result = rdd.reduceByKey(lambda a, b: a + b).count()

        print(f"Result count with {serializer}: {result}")
        end_time = time.time()
        print("Execution Time (secs):", (end_time-start_time))

        # Calculate and print the estimated data size (MB)
        # NOTE(review): 24 bytes per tuple is an assumed figure, not a
        # measurement of the actual serialized size - confirm if it matters.
        bytes_per_element = 24  # Size per tuple
        data_size_bytes = data_size * bytes_per_element
        data_size_megabytes = data_size_bytes / (1024 * 1024)
        print(f"Estimated data size: {data_size_megabytes:.2f} MB")
    finally:
        # Always release the context: if the job raises and the context is
        # left running, the next call cannot create its own SparkContext.
        sc.stop()

## JavaSerializer (Default)

Source: https://spark.apache.org/docs/latest/tuning.html

In [2]:
# Benchmark the shuffle job using the JavaSerializer
run_spark_job(
    serializer="org.apache.spark.serializer.JavaSerializer",
    data_size=10_500_000,
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/16 13:33:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/03/16 13:33:50 WARN TaskSetManager: Stage 0 contains a task of very large size (14359 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

Result count with org.apache.spark.serializer.JavaSerializer: 10000
Execution Time (secs): 10.769093990325928
Estimated data size: 240.33 MB


## KryoSerializer

Source: https://spark.apache.org/docs/latest/tuning.html

In [3]:
# Benchmark the shuffle job using the KryoSerializer
run_spark_job(
    serializer="org.apache.spark.serializer.KryoSerializer",
    data_size=10_500_000,
)

24/03/16 13:34:06 WARN TaskSetManager: Stage 0 contains a task of very large size (14422 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

Result count with org.apache.spark.serializer.KryoSerializer: 10000
Execution Time (secs): 6.932398796081543
Estimated data size: 240.33 MB
