In [1]:
import csv
import time
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import col, count, max, avg, explode, split, length, lower, struct, row_number, desc, countDistinct

# Initialization
# CSV file that receives one timing row per benchmark task.
results_path = "performance_test_1_worker_swarm.csv"
# HDFS location of the JSON corpus read by the benchmark.
data_path = "hdfs://192.168.2.157:9000/user/ubuntu/input/corpus-webis-tldr-17.json"
# Host name of the Spark master; the session builder adds the scheme and port.
spark_master = "sparkmaster"

# Functions
def create_spark_session(worker_count):
    """Return a SparkSession configured for a benchmark run.

    The builder targets ``spark://<spark_master>:7077``, names the application
    after ``worker_count`` and applies the executor/driver settings listed
    below before calling ``getOrCreate()``.

    Args:
        worker_count: Requested number of executors; also embedded in the
            application name.

    Returns:
        Whatever ``SparkSession.builder.getOrCreate()`` returns.
    """
    # NOTE(review): verify that the cluster manager in use honours
    # spark.executor.instances; otherwise worker_count only affects the app name.
    settings = (
        ("spark.executor.instances", str(worker_count)),
        ("spark.executor.cores", "2"),
        ("spark.executor.memory", "2g"),
        # Fixed ports for the driver and the block manager.
        ("spark.driver.port", 9999),
        ("spark.blockManager.port", 10005),
    )

    builder = SparkSession.builder.master(f"spark://{spark_master}:7077")
    builder = builder.appName(f"PerformanceTest{worker_count}Workers")
    for option, value in settings:
        builder = builder.config(option, value)
    return builder.getOrCreate()

def log_performance(test_id, task, duration):
    """Append a single timing record to the results CSV.

    Args:
        test_id: Label identifying the benchmark configuration.
        task: Name of the task that was timed.
        duration: Elapsed time; written with two decimal places.
    """
    # Append mode keeps the rows written by earlier calls.
    with open(results_path, mode='a', newline='') as results_file:
        formatted_duration = format(duration, ".2f")
        csv.writer(results_file).writerow((test_id, task, formatted_duration))

def measure_time(func, *args, **kwargs):
    """Time one call of ``func`` and return its duration and result.

    Args:
        func: Callable to execute.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        A ``(duration, result)`` tuple where ``duration`` is the elapsed time
        in seconds (float) and ``result`` is whatever ``func`` returned.

    Raises:
        Any exception raised by ``func`` propagates unchanged.
    """
    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # (or turn negative) when the system wall clock is adjusted mid-run.
    start_time = time.perf_counter()
    result = func(*args, **kwargs)
    duration = time.perf_counter() - start_time
    return duration, result

def run_tasks(spark, worker_count):
    """Run every benchmark query against the dataset and record its timing.

    The JSON file at ``data_path`` is read, marked for caching and counted once
    so the cache is populated before any query is timed. Each query is then run
    through ``measure_time``; its duration is printed and appended to the
    results CSV via ``log_performance``. The cached data is released when the
    function exits, whether the queries succeeded or not.

    Args:
        spark: Active SparkSession used to read the dataset.
        worker_count: Number of workers under test; used only to label the
            printed message and the CSV row.

    Returns:
        None.

    Raises:
        Any exception raised by a query propagates after the cache is released.
    """
    df = spark.read.json(data_path).cache()
    try:
        df.count()  # Action that triggers the cache before timing starts

        # NOTE: `max` below is pyspark.sql.functions.max (imported at the top
        # of the file), not the Python builtin.
        def subreddit_stats():
            """Per-subreddit row count, max content length and mean summary length."""
            return df.groupBy('subreddit').agg(
                count("*").alias("count"), max("content_len"), avg("summary_len")
            ).count()

        # Tasks are defined here so the lambdas close over the DataFrame.
        tasks = {
            'aggregate_subreddit_stats': subreddit_stats,
            # NOTE(review): 'agg' runs exactly the same query as the entry
            # above. It is kept so the CSV rows stay comparable with earlier
            # runs - confirm whether the repetition is intentional.
            'agg': subreddit_stats,
            'join': lambda: df.alias("df1").join(df.alias("df2"), col("df1.id") == col("df2.id")).count(),
            'complex_transformation': lambda: df.withColumn("new_content_len", col("content_len") * 2).groupBy('subreddit').agg(avg("new_content_len")).collect(),
            'window_function': lambda: df.withColumn("rank", row_number().over(Window.partitionBy("subreddit").orderBy(desc("content_len")))).filter(col("rank") <= 10).count(),
            'explode_split': lambda: df.withColumn("words", explode(split(col("body"), " "))).groupBy("words").count().orderBy(desc("count")).limit(10).collect(),
            'avg_summary_by_author': lambda: df.groupBy('author').agg(avg('summary_len').alias('avg_summary_length')).count(),
            'distinct_subreddit_count': lambda: df.select('subreddit').distinct().count(),
            'max_summary_length_per_subreddit': lambda: df.groupBy('subreddit').agg(max('summary_len').alias('max_summary_length')).count(),
            'top_title_per_subreddit': lambda: df.withColumn("title_length", length(col("title"))).groupBy('subreddit').agg(max(struct(col('title_length'), col('title'))).alias('top_title')).count(),
            'word_count_in_titles': lambda: df.withColumn("word", explode(split(lower(col("title")), "\\s+"))).groupBy('word').count().orderBy(desc('count')).limit(10).collect(),
            # Disabled task, left for reference (reason not recorded in this file):
            # 'self_join_shuffle': lambda: df.alias("df1").join(df.alias("df2"), col("df1.subreddit") == col("df2.subreddit")).agg(count("*")).collect(),
            'cross_join': lambda: df.crossJoin(df.limit(10)).agg(count("*")).collect(),
            'complex_aggregation': lambda: df.groupBy('subreddit').agg(count("*"), avg("summary_len"), max("content_len"), countDistinct("author")).collect(),
        }

        for task_name, task_func in tasks.items():
            duration, _ = measure_time(task_func)
            print(f"Task {task_name} with {worker_count} workers completed in {duration:.2f} seconds.")
            log_performance(f"{worker_count}_workers", task_name, duration)
    finally:
        # Release the cached data so it does not keep occupying executor
        # storage after the benchmark, including when a task raised.
        df.unpersist()

In [2]:
# Main
# The whole entry-point sequence lives under the guard so that importing this
# code never starts a benchmark; in a notebook __name__ is always "__main__".
if __name__ == "__main__":
    # Start from a fresh results file that contains only the header row.
    with open(results_path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['TestID', 'Task', 'Duration'])

    # Run tasks for 1 worker
    worker_counts = 1

    # An `if` block adds no scope, so spark_session remains a module-level
    # name that a later cell can use to stop the session.
    spark_session = create_spark_session(worker_counts)
    run_tasks(spark_session, worker_counts)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/15 13:41:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/03/15 13:42:18 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
                                                                                

Task aggregate_subreddit_stats with 1 workers completed in 658.27 seconds.


                                                                                

Task agg with 1 workers completed in 964.41 seconds.


                                                                                

Task join with 1 workers completed in 2091.66 seconds.


                                                                                

Task complex_transformation with 1 workers completed in 763.23 seconds.


                                                                                

Task window_function with 1 workers completed in 1112.51 seconds.


                                                                                

Task explode_split with 1 workers completed in 671.76 seconds.


                                                                                

Task avg_summary_by_author with 1 workers completed in 609.29 seconds.


                                                                                

Task distinct_subreddit_count with 1 workers completed in 555.86 seconds.


                                                                                

Task max_summary_length_per_subreddit with 1 workers completed in 625.32 seconds.


                                                                                

Task top_title_per_subreddit with 1 workers completed in 630.10 seconds.


                                                                                

Task word_count_in_titles with 1 workers completed in 331.24 seconds.


                                                                                

Task cross_join with 1 workers completed in 431.44 seconds.


                                                                                

Task complex_aggregation with 1 workers completed in 315.27 seconds.


In [3]:
# Stop the Spark session that the benchmark cell created.
spark_session.stop()