In [2]:
#### Driver program

import sys
from pyspark import SparkContext, SparkConf
import time
from pyspark.sql import functions as F
from pyspark.sql import SQLContext

# Start Spark locally with a single worker thread.
# getOrCreate() hands back the context that is already running in this kernel
# (if any) instead of constructing a second one, so re-executing this cell no
# longer fails with "Cannot run multiple SparkContexts at once".
# Note: when a context already exists, its original master setting is kept.
sc = SparkContext.getOrCreate(SparkConf().setMaster("local[1]"))
sc.setLogLevel("ERROR")  # only show errors from Spark in the cell output
sqlContext = SQLContext(sc)







In [3]:
### Group by scheduling class and print eviction rate per class

input_paths = [
        "google-dataset/task_events/part-00265-of-00500.csv.gz",
        "google-dataset/task_events/part-00266-of-00500.csv.gz",
        "google-dataset/task_events/part-00267-of-00500.csv.gz",
        "google-dataset/task_events/part-00268-of-00500.csv.gz",
        "google-dataset/task_events/part-00269-of-00500.csv.gz",
]

# The files are read without a header row, so Spark names the columns
# _c0, _c1, ...  Columns used below (0-based positions):
#   _c2 = job_id, _c3 = task_id, _c5 = event_type, _c7 = scheduling_class
df = sqlContext.read.csv(input_paths, header=False, inferSchema=True) \
    .withColumnRenamed("_c2", "job_id") \
    .withColumnRenamed("_c3", "task_id") \
    .withColumnRenamed("_c7", "scheduling_class") \
    .withColumnRenamed("_c5", "event_type")

# event_type value that is counted as an eviction
EVICT_CODE = 2

# One row per (job_id, task_id, scheduling_class): was the task ever evicted?
task_eviction = (
    df
    .select("job_id", "task_id", "scheduling_class",
            (F.col("event_type") == EVICT_CODE).cast("int").alias("is_eviction"))
    .groupBy("job_id", "task_id", "scheduling_class")
    .agg(F.max("is_eviction").alias("evicted"))  # 1 if at least one evict event
)

# Per scheduling class: number of tasks, number of evicted tasks and the
# eviction rate in percent.
stats = (
    task_eviction
    .groupBy("scheduling_class")
    .agg(
        F.count("*").alias("total_tasks"),
        F.sum("evicted").alias("evicted_tasks")
    )
    .withColumn(
        "eviction_rate",
        (F.col("evicted_tasks") / F.col("total_tasks") * 100.0)
    )
    .orderBy("scheduling_class")
    # The result is small (one row per scheduling class) but its lineage
    # re-reads every input file. Cache it so that each further show()/orderBy()
    # on `stats` reuses the aggregated rows instead of rescanning the data.
    .cache()
)

stats.show(truncate=False)



+----------------+-----------+-------------+------------------+
|scheduling_class|total_tasks|evicted_tasks|eviction_rate     |
+----------------+-----------+-------------+------------------+
|0               |181916     |31428        |17.276105455265068|
|1               |30270      |4672         |15.434423521638585|
|2               |9599       |832          |8.667569538493593 |
|3               |2136       |209          |9.784644194756554 |
+----------------+-----------+-------------+------------------+



In [8]:
# Print the (up to) 10 rows of `stats` with the highest eviction rate,
# followed by the (up to) 10 rows with the lowest eviction rate.
for label, ordering in (
    ("Top 10 scheduling classes mit höchster eviction_rate:",
     F.col("eviction_rate").desc()),
    ("Top 10 scheduling classes mit niedrigster eviction_rate:",
     F.col("eviction_rate").asc()),
):
    print(label)
    stats.orderBy(ordering).show(10, truncate=False)



Top 10 scheduling classes mit höchster eviction_rate:
+----------------+-----------+-------------+-----------------+
|scheduling_class|total_tasks|evicted_tasks|eviction_rate    |
+----------------+-----------+-------------+-----------------+
|353619378       |2          |2            |100.0            |
|4820854840      |1          |1            |100.0            |
|501421958       |1          |1            |100.0            |
|1436287887      |1          |1            |100.0            |
|38744730        |1          |1            |100.0            |
|4246147567      |26         |21           |80.76923076923077|
|1335141         |3          |2            |66.66666666666666|
|4802086443      |37         |24           |64.86486486486487|
|351605900       |8          |5            |62.5             |
|450119653       |5          |3            |60.0             |
+----------------+-----------+-------------+-----------------+
only showing top 10 rows

Top 10 scheduling classes mit niedrigs