In [2]:
#### Driver program

import sys
from pyspark import SparkContext, SparkConf
import time
from pyspark.sql import functions as F
from pyspark.sql import SQLContext

# Start Spark in local mode with a single worker thread.
# getOrCreate() reuses an already-running SparkContext, so re-executing this
# cell in a live kernel no longer fails with
# "Cannot run multiple SparkContexts at once". If a context already exists it
# is returned as-is and the master setting below is not applied to it.
sc = SparkContext.getOrCreate(SparkConf().setMaster("local[1]"))
sc.setLogLevel("ERROR")  # keep Spark's console logging down to errors
sqlContext = SQLContext(sc)  # entry point used below to read the CSV files





In [16]:
### Group by scheduling class in equal-width buckets (num_buckets = 50) and print the eviction rate per bucket

# Task-event shards 265..269 (inclusive) of the 500-part trace.
input_paths = [
    "google-dataset/task_events/part-{:05d}-of-00500.csv.gz".format(part_no)
    for part_no in range(265, 270)
]

# The files have no header row, so Spark names the columns _c0, _c1, ...
# (0-based). Give readable names to the four columns used below:
#   _c2 = job ID, _c3 = task index, _c5 = event type, _c7 = scheduling class
raw_events = sqlContext.read.csv(input_paths, header=False, inferSchema=True)
df = (
    raw_events
    .withColumnRenamed("_c2", "job_id")
    .withColumnRenamed("_c3", "task_id")
    .withColumnRenamed("_c5", "event_type")
    .withColumnRenamed("_c7", "scheduling_class")
)

# Largest scheduling_class value over all loaded rows; it is the upper edge
# of the last bucket.
max_sched = df.agg(F.max("scheduling_class").alias("max_sc")).collect()[0]["max_sc"]
if max_sched is None:
    # F.max yields NULL when there are no rows or every value is NULL; fail
    # with a clear message instead of a TypeError in the division below.
    raise ValueError("scheduling_class has no non-null values; cannot build buckets")

# Number of equal-width buckets and the width of each one.
num_buckets = 50
bucket_size = max_sched / float(num_buckets)

# Bucket ID for each row (0 .. num_buckets - 1)
def bucket_index_expr(sc_col):
    """Build a Column expression mapping a scheduling-class column to its bucket ID.

    Reads the module-level ``max_sched``, ``num_buckets`` and ``bucket_size``.

    Args:
        sc_col: Column holding the scheduling class.

    Returns:
        Column that is NULL where ``sc_col`` is NULL, ``num_buckets - 1`` where
        ``sc_col >= max_sched``, and ``sc_col / bucket_size`` cast to int otherwise.
    """
    # Plain equal-width bucketing; the int cast drops the fractional part.
    regular_bucket = (sc_col / bucket_size).cast("int")
    # The maximum itself would otherwise compute to index num_buckets, so it
    # is assigned to the last bucket instead.
    clamped_bucket = F.when(sc_col >= max_sched, num_buckets - 1).otherwise(regular_bucket)
    return F.when(sc_col.isNull(), None).otherwise(clamped_bucket)

# Attach the bucket ID of each event row's scheduling class.
df = df.withColumn("sched_bucket", bucket_index_expr(F.col("scheduling_class")))

# Event-type code that marks an EVICT event
EVICT_CODE = 2

# One row per (job, task, bucket): "evicted" is the maximum of the per-event
# 0/1 eviction flags, i.e. 1 if any of the task's events is an EVICT event.
task_key = ["job_id", "task_id", "sched_bucket"]
is_eviction = (F.col("event_type") == EVICT_CODE).cast("int").alias("is_eviction")
task_eviction = (
    df.select(*task_key, is_eviction)
      .groupBy(*task_key)
      .agg(F.max("is_eviction").alias("evicted"))
)

# Per bucket: number of tasks, number of evicted tasks, eviction rate in
# percent, and the bucket's lower/upper scheduling_class bounds.
bucket_counts = task_eviction.groupBy("sched_bucket").agg(
    F.count("*").alias("total_tasks"),
    F.sum("evicted").alias("evicted_tasks"),
)

# SUM is NULL when every "evicted" flag in the bucket is NULL; report 0 instead.
evicted_or_zero = F.coalesce(F.col("evicted_tasks"), F.lit(0))
# Share of evicted tasks in percent; falls back to 0.0 instead of dividing
# when total_tasks is not positive.
rate_percent = F.when(
    F.col("total_tasks") > 0,
    F.col("evicted_tasks") / F.col("total_tasks") * 100.0,
).otherwise(F.lit(0.0))
lower_edge = F.col("sched_bucket") * bucket_size
upper_edge = (F.col("sched_bucket") + 1) * bucket_size

stats = (
    bucket_counts
    .withColumn("evicted_tasks", evicted_or_zero)
    .withColumn("eviction_rate", rate_percent)
    .withColumn("bucket_min", lower_edge)
    .withColumn("bucket_max", upper_edge)
    .orderBy("sched_bucket")
)


# Rows with a NULL scheduling class form an extra NULL bucket, so there can be
# up to num_buckets + 1 rows. NULL sorts first in ascending order, so showing
# only num_buckets rows would cut off the highest bucket.
stats.show(num_buckets + 1, truncate=False)




+------------+-----------+-------------+------------------+----------+----------+
|sched_bucket|total_tasks|evicted_tasks|eviction_rate     |bucket_min|bucket_max|
+------------+-----------+-------------+------------------+----------+----------+
|0           |181916     |31428        |17.276105455265068|0.0       |0.06      |
|16          |30270      |4672         |15.434423521638585|0.96      |1.02      |
|33          |9599       |832          |8.667569538493593 |1.98      |2.04      |
|49          |2136       |209          |9.784644194756554 |2.94      |3.0       |
+------------+-----------+-------------+------------------+----------+----------+



In [8]:
# Print the 10 rows of `stats` with the highest eviction rate, then the 10
# rows with the lowest one.
for heading, ordering in (
    ("Top 10 scheduling classes mit höchster eviction_rate:", F.col("eviction_rate").desc()),
    ("Top 10 scheduling classes mit niedrigster eviction_rate:", F.col("eviction_rate").asc()),
):
    print(heading)
    stats.orderBy(ordering).show(10, truncate=False)



Top 10 scheduling classes mit höchster eviction_rate:
+----------------+-----------+-------------+-----------------+
|scheduling_class|total_tasks|evicted_tasks|eviction_rate    |
+----------------+-----------+-------------+-----------------+
|353619378       |2          |2            |100.0            |
|4820854840      |1          |1            |100.0            |
|501421958       |1          |1            |100.0            |
|1436287887      |1          |1            |100.0            |
|38744730        |1          |1            |100.0            |
|4246147567      |26         |21           |80.76923076923077|
|1335141         |3          |2            |66.66666666666666|
|4802086443      |37         |24           |64.86486486486487|
|351605900       |8          |5            |62.5             |
|450119653       |5          |3            |60.0             |
+----------------+-----------+-------------+-----------------+
only showing top 10 rows

Top 10 scheduling classes mit niedrigs