In [2]:
#### Driver program

import sys
from pyspark import SparkContext, SparkConf
import time
from pyspark.sql import functions as F
from pyspark.sql import SQLContext

# Reuse the already-running SparkContext when this cell is executed again.
# Constructing a second SparkContext in the same kernel raises
# "Cannot run multiple SparkContexts at once", which made the cell fail on re-run.
# On a fresh kernel this still starts a local context with 8 worker threads.
sc = SparkContext.getOrCreate(SparkConf().setMaster("local[8]"))
sc.setLogLevel("ERROR")  # only emit ERROR-level Spark log messages
sqlContext = SQLContext(sc)







In [3]:
### Group by scheduling class and print eviction rate per class 

# Five consecutive gzip-compressed task_events shards used as input.
input_paths = [
    "google-dataset/task_events/part-00265-of-00500.csv.gz",
    "google-dataset/task_events/part-00266-of-00500.csv.gz",
    "google-dataset/task_events/part-00267-of-00500.csv.gz",
    "google-dataset/task_events/part-00268-of-00500.csv.gz",
    "google-dataset/task_events/part-00269-of-00500.csv.gz",
]

# The files are read with header=False, so Spark assigns positional column
# names (_c0, _c1, ...). Only the columns used below get readable names:
#   _c2 -> job_id, _c3 -> task_id, _c5 -> event_type, _c7 -> scheduling_class
column_names = {
    "_c2": "job_id",
    "_c3": "task_id",
    "_c7": "scheduling_class",
    "_c5": "event_type",
}

raw_events = sqlContext.read.csv(input_paths, header=False, inferSchema=True)

# Apply the renames one by one, in the order listed above; all other columns
# keep their positional names.
df = raw_events
for positional_name, readable_name in column_names.items():
    df = df.withColumnRenamed(positional_name, readable_name)

# event_type value that this analysis counts as an eviction
EVICT_CODE = 2

# Integer flag per event row: 1 if the row is an eviction event, 0 otherwise
# (null when event_type itself is null).
eviction_flag = (F.col("event_type") == EVICT_CODE).cast("int")

# One row per (job_id, task_id, scheduling_class). `evicted` is the maximum of
# the flag over the group's events, i.e. 1 if at least one of them is an eviction.
task_keys = ["job_id", "task_id", "scheduling_class"]
task_eviction = df.groupBy(*task_keys).agg(F.max(eviction_flag).alias("evicted"))

# Per scheduling class: number of tasks, number of tasks evicted at least once,
# and the evicted share expressed as a percentage.
per_class_counts = task_eviction.groupBy("scheduling_class").agg(
    F.count("*").alias("total_tasks"),
    F.sum("evicted").alias("evicted_tasks"),
)

# Fraction of evicted tasks; scaled to percent when the column is added below.
eviction_share = F.col("evicted_tasks") / F.col("total_tasks")

stats = (
    per_class_counts
    .withColumn("eviction_rate", eviction_share * 100.0)
    .orderBy("scheduling_class")
)

# Print the whole table without shortening long cell values.
stats.show(truncate=False)



+----------------+-----------+-------------+------------------+
|scheduling_class|total_tasks|evicted_tasks|eviction_rate     |
+----------------+-----------+-------------+------------------+
|0               |181916     |31428        |17.276105455265068|
|1               |30270      |4672         |15.434423521638585|
|2               |9599       |832          |8.667569538493593 |
|3               |2136       |209          |9.784644194756554 |
+----------------+-----------+-------------+------------------+

