In [1]:
import csv
import time
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import col, count, max, avg, explode, split, length, lower, struct, row_number, desc, countDistinct

# Initialization
# Host of the Spark standalone master; the HDFS endpoint below uses the same host.
spark_master = "192.168.2.157"
# Input dataset on HDFS. Built from spark_master so the host address is defined
# in exactly one place and the two cannot drift apart.
data_path = f"hdfs://{spark_master}:9000/user/ubuntu/input/corpus-webis-tldr-17.json"
# CSV file that receives one row per timed task (see log_performance).
results_path = "performance_test_3_workers.csv"

# Functions
def create_spark_session(worker_count):
    """Build (or reuse) a SparkSession connected to the standalone master.

    Args:
        worker_count: Number of workers under test. It is embedded in the
            application name and passed as ``spark.executor.instances``.

    Returns:
        The session returned by ``SparkSession.builder.getOrCreate()``.
    """
    # NOTE(review): with a standalone ``spark://`` master the executor count may
    # be governed by ``spark.cores.max`` rather than ``spark.executor.instances``
    # -- confirm that this setting really limits the run to `worker_count`.
    settings = (
        ("spark.executor.instances", str(worker_count)),
        ("spark.executor.cores", "8"),
        ("spark.executor.memory", "8g"),
        ("spark.driver.port", 9999),
        ("spark.blockManager.port", 10005),
    )

    builder = (
        SparkSession.builder
        .master(f"spark://{spark_master}:7077")
        .appName(f"PerformanceTest{worker_count}Workers")
    )
    # Apply the options in the order listed above.
    for option, value in settings:
        builder = builder.config(option, value)
    return builder.getOrCreate()

def log_performance(test_id, task, duration):
    """Append one timing record to the results CSV.

    Args:
        test_id: Label of the benchmark run.
        task: Name of the task that was timed.
        duration: Elapsed time in seconds; written with two decimal places.
    """
    with open(results_path, 'a', newline='') as handle:
        record = [test_id, task, f"{duration:.2f}"]
        csv.writer(handle).writerow(record)

def measure_time(func):
    """Call ``func`` with no arguments and time how long it takes.

    Args:
        func: Zero-argument callable to execute.

    Returns:
        A ``(duration, result)`` tuple: the elapsed time in seconds as a float
        and whatever ``func`` returned.

    Raises:
        Any exception raised by ``func`` propagates unchanged.
    """
    # perf_counter() is monotonic and high-resolution. time.time() follows the
    # wall clock, which can jump (NTP sync, manual change) and distort or even
    # negate a measured interval.
    start_time = time.perf_counter()
    result = func()
    duration = time.perf_counter() - start_time
    return duration, result

def run_tasks(spark, worker_count):
    """Run every benchmark task once, printing and logging its duration.

    The dataset at ``data_path`` is read as JSON, cached, and materialized with
    a ``count()`` before any task is timed. Each task is timed with
    ``measure_time`` and recorded with ``log_performance``.

    Args:
        spark: SparkSession used to read the dataset.
        worker_count: Used only to label the printed message and the CSV test
            id (``"<worker_count>_workers"``).

    Raises:
        Any exception raised by a task propagates and aborts the remaining
        tasks; the cached DataFrame is still released first.
    """
    df = spark.read.json(data_path).cache()
    try:
        df.count()  # Trigger cache

        # Define tasks within this function to ensure access to the DataFrame.
        # Every task ends in an action (count/collect) so the lazy plan is
        # actually executed inside the timed call.
        # NOTE(review): 'aggregate_subreddit_stats' and 'agg' run the same query;
        # the first entry also runs first, so its timing may include warm-up cost.
        tasks = {
            'aggregate_subreddit_stats': lambda: df.groupBy('subreddit').agg(count("*").alias("count"), max("content_len"), avg("summary_len")).count(),
            'agg': lambda: df.groupBy('subreddit').agg(count("*").alias("count"), max("content_len"), avg("summary_len")).count(),
            'join': lambda: df.alias("df1").join(df.alias("df2"), col("df1.id") == col("df2.id")).count(),
            'complex_transformation': lambda: df.withColumn("new_content_len", col("content_len") * 2).groupBy('subreddit').agg(avg("new_content_len")).collect(),
            'window_function': lambda: df.withColumn("rank", row_number().over(Window.partitionBy("subreddit").orderBy(desc("content_len")))).filter(col("rank") <= 10).count(),
            'explode_split': lambda: df.withColumn("words", explode(split(col("body"), " "))).groupBy("words").count().orderBy(desc("count")).limit(10).collect(),
            'avg_summary_by_author': lambda: df.groupBy('author').agg(avg('summary_len').alias('avg_summary_length')).count(),
            'distinct_subreddit_count': lambda: df.select('subreddit').distinct().count(),
            'max_summary_length_per_subreddit': lambda: df.groupBy('subreddit').agg(max('summary_len').alias('max_summary_length')).count(),
            'top_title_per_subreddit': lambda: df.withColumn("title_length", length(col("title"))).groupBy('subreddit').agg(max(struct(col('title_length'), col('title'))).alias('top_title')).count(),
            'word_count_in_titles': lambda: df.withColumn("word", explode(split(lower(col("title")), "\\s+"))).groupBy('word').count().orderBy(desc('count')).limit(10).collect(),
            'self_join_shuffle': lambda: df.alias("df1").join(df.alias("df2"), col("df1.subreddit") == col("df2.subreddit")).agg(count("*")).collect(),
            'cross_join': lambda: df.crossJoin(df.limit(10)).agg(count("*")).collect(),
            'complex_aggregation': lambda: df.groupBy('subreddit').agg(count("*"), avg("summary_len"), max("content_len"), countDistinct("author")).collect(),
            'sort_by_large_dataset': lambda: df.orderBy(desc("summary_len")).limit(1000).collect(),
        }

        for task_name, task_func in tasks.items():
            # The task's result is discarded; only the elapsed time is recorded.
            duration, _ = measure_time(task_func)
            print(f"Task {task_name} with {worker_count} workers completed in {duration:.2f} seconds.")
            log_performance(f"{worker_count}_workers", task_name, duration)
    finally:
        # Release the cached blocks even when a task fails; otherwise they stay
        # on the executors after this function has lost its only reference.
        df.unpersist()


In [None]:
# Main
if __name__ == "__main__":
    # Setup CSV for results: truncate the file and write only the header row.
    # log_performance() appends one row per task afterwards.
    with open(results_path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['TestID', 'Task', 'Duration'])

    # Run tasks for 3 workers.
    # These statements belong under the guard together with the CSV setup, so
    # the benchmark never runs without a freshly initialised results file.
    worker_counts = 3

    spark_session = create_spark_session(worker_counts)
    run_tasks(spark_session, worker_counts)


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/14 22:07:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

Task aggregate_subreddit_stats with 3 workers completed in 28.66 seconds.


                                                                                

Task agg with 3 workers completed in 7.26 seconds.


                                                                                

Task join with 3 workers completed in 7.12 seconds.


                                                                                

Task complex_transformation with 3 workers completed in 7.63 seconds.


                                                                                

Task window_function with 3 workers completed in 8.22 seconds.


                                                                                

Task explode_split with 3 workers completed in 24.93 seconds.


                                                                                

Task avg_summary_by_author with 3 workers completed in 9.53 seconds.


                                                                                

Task distinct_subreddit_count with 3 workers completed in 5.41 seconds.


                                                                                

Task max_summary_length_per_subreddit with 3 workers completed in 4.02 seconds.


24/03/14 22:11:50 WARN TaskSetManager: Lost task 11.0 in stage 56.0 (TID 2068) (192.168.2.22 executor 0): java.lang.OutOfMemoryError: Java heap space
	at java.base/java.lang.reflect.Array.newInstance(Array.java:78)
	at java.base/java.io.ObjectInputStream.readArray(ObjectInputStream.java:2121)
	at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1721)
	at java.base/java.io.ObjectInputStream.readArray(ObjectInputStream.java:2157)
	at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1721)
	at java.base/java.io.ObjectInputStream$FieldValues.<init>(ObjectInputStream.java:2606)
	at java.base/java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2457)
	at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2257)
	at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1733)
	at java.base/java.io.ObjectInputStream.readObject(ObjectInputStream.java:509)
	at java.base/java.io.ObjectInputStream.readO

Task top_title_per_subreddit with 3 workers completed in 28.34 seconds.


                                                                                

Task word_count_in_titles with 3 workers completed in 21.11 seconds.


                                                                                

Task self_join_shuffle with 3 workers completed in 2427.85 seconds.


24/03/14 22:53:09 WARN TaskSetManager: Lost task 139.0 in stage 74.0 (TID 2833) (192.168.2.221 executor 2): java.lang.OutOfMemoryError: Java heap space
	at java.base/java.lang.reflect.Array.newInstance(Array.java:78)
	at java.base/java.io.ObjectInputStream.readArray(ObjectInputStream.java:2121)
	at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1721)
	at java.base/java.io.ObjectInputStream.readArray(ObjectInputStream.java:2157)
	at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1721)
	at java.base/java.io.ObjectInputStream$FieldValues.<init>(ObjectInputStream.java:2606)
	at java.base/java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2457)
	at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2257)
	at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1733)
	at java.base/java.io.ObjectInputStream.readObject(ObjectInputStream.java:509)
	at java.base/java.io.ObjectInputStream.rea

Task cross_join with 3 workers completed in 224.25 seconds.




In [None]:
# Stop the Spark session created in the main cell once the benchmark is done.
spark_session.stop()