In [None]:
#### Driver program

import sys
from pyspark import SparkContext, SparkConf
import time
from pyspark.sql import functions as F
from pyspark.sql import SQLContext

# Create the local Spark driver (8 worker threads) for this notebook.
# getOrCreate() hands back the already-running context when this cell is
# executed again in the same kernel; constructing SparkContext directly a
# second time raises "Cannot run multiple SparkContexts at once".
# The SparkConf is only applied when a new context is actually created.
sc = SparkContext.getOrCreate(SparkConf().setMaster("local[8]"))
sc.setLogLevel("ERROR")

# SQL entry point used by the following cells to read the CSV traces.
sqlContext = SQLContext(sc)

print("OK")



In [6]:
### combine task_event and task_usage and compare

# Shards 265-269 (of 500) are analysed; the same part numbers are read from
# both the task_events and the task_usage tables.
_part_numbers = range(265, 270)

task_events_paths = [
    "google-dataset/task_events/part-%05d-of-00500.csv.gz" % part
    for part in _part_numbers
]

task_usage_paths = [
    "google-dataset/task_usage/part-%05d-of-00500.csv.gz" % part
    for part in _part_numbers
]


# The CSV shards have no header row, so Spark names the columns _c0, _c1, ...
# These maps give readable names to the positional columns used below.
_te_renames = {
    "_c2": "job_id",
    "_c3": "task_index",
    "_c9": "cpu_request",
    "_c10": "mem_request",
    "_c11": "disk_request",
}
_tu_renames = {
    "_c2": "job_id",
    "_c3": "task_index",
    "_c5": "cpu_rate",
    "_c7": "assigned_memory_usage",
    "_c10": "max_mem_usage",
    "_c12": "local_disk_usage",
    "_c13": "max_cpu_rate",
}

# task_events: requested resources per task.
te = sqlContext.read.csv(task_events_paths, header=False, inferSchema=True)
for _old_name, _new_name in _te_renames.items():
    te = te.withColumnRenamed(_old_name, _new_name)

# task_usage: measured resource usage per task.
tu = sqlContext.read.csv(task_usage_paths, header=False, inferSchema=True)
for _old_name, _new_name in _tu_renames.items():
    tu = tu.withColumnRenamed(_old_name, _new_name)

# Quick visual check of both frames (first 20 rows each).
te.show()
tu.show()

# Collapse task_events to ONE row per task before joining.
# task_events can hold several rows for the same (job_id, task_index); joining
# it row-by-row would repeat that task in `task_joined`, inflating the task
# count and weighting every downstream percentage/average by how many rows the
# task happens to have. The largest request seen for the task is kept, which is
# the fair baseline for comparison with the peak usage computed below.
# The cast makes the aggregate numeric even if schema inference produced a
# string column; F.max ignores nulls.
te_requests = (
    te.groupBy("job_id", "task_index")
      .agg(
          F.max(F.col("cpu_request").cast("double")).alias("cpu_request"),
          F.max(F.col("mem_request").cast("double")).alias("mem_request"),
          F.max(F.col("disk_request").cast("double")).alias("disk_request")
      )
)

# aggregate usage over time window: peak observed value per task
# NOTE(review): the CPU peak is taken from `cpu_rate` (_c5); the column the
# loader names `max_cpu_rate` (_c13) is not used here - confirm which one is
# intended.
tu_agg = (
    tu.groupBy("job_id", "task_index")
      .agg(
          F.max("cpu_rate").alias("max_cpu_rate"),
          F.max("max_mem_usage").alias("max_mem_usage_overall"),
          F.max("local_disk_usage").alias("max_local_disk_usage")
      )
)


# Join: requests + Usage per task (inner join: only tasks present in both
# tables). Resulting columns: job_id, task_index, cpu_request, mem_request,
# disk_request, max_cpu_rate, max_mem_usage_overall, max_local_disk_usage.
task_joined = te_requests.join(tu_agg, on=["job_id", "task_index"], how="inner")

task_joined.show()

# handle null values: every request/usage column is cast to double and a
# missing value is replaced by 0.0.
# NOTE(review): a missing request therefore becomes 0.0, so any positive usage
# counts as "over" for that task - confirm this is the intended treatment.
_numeric_columns = [
    "cpu_request", "max_cpu_rate",
    "mem_request", "max_mem_usage_overall",
    "disk_request", "max_local_disk_usage",
]

tj = task_joined
for _column in _numeric_columns:
    tj = tj.withColumn(_column, F.coalesce(F.col(_column).cast("double"), F.lit(0.0)))

# Flag (1/0) per resource: did the observed peak exceed the request?
# Entries are (flag column, usage column, request column).
_over_flags = [
    ("cpu_over", "max_cpu_rate", "cpu_request"),
    ("mem_over", "max_mem_usage_overall", "mem_request"),
    ("disk_over", "max_local_disk_usage", "disk_request"),
]
for _flag, _usage, _request in _over_flags:
    tj = tj.withColumn(_flag, F.when(F.col(_usage) > F.col(_request), 1).otherwise(0))



# aggregate statistics in one table: total number, counts, averages
# Per resource: (name prefix, request column, usage column, alias of the usage average).
_resources = [
    ("cpu", "cpu_request", "max_cpu_rate", "avg_max_cpu_rate"),
    ("mem", "mem_request", "max_mem_usage_overall", "avg_max_mem_usage"),
    ("disk", "disk_request", "max_local_disk_usage", "avg_max_local_disk_usage"),
]

# Row count first, then for every resource: over-count, over-percentage,
# average request, average observed peak.
_stat_columns = [F.count("*").alias("total_tasks")]
for _prefix, _request, _usage, _usage_alias in _resources:
    _over = _prefix + "_over"
    _stat_columns.extend([
        F.sum(_over).alias(_prefix + "_over_count"),
        (F.sum(_over) / F.count("*") * 100).alias(_prefix + "_over_pct"),
        F.avg(_request).alias("avg_" + _request),
        F.avg(_usage).alias(_usage_alias),
    ])

stats = tj.agg(*_stat_columns)


# Single-row summary table, printed without truncating wide values.
print("Aggregated statistics: Percentage of tasks using more than requested (CPU/MEM/DISK):")
stats.show(truncate=False)

# The 50 rows with the highest observed CPU value, with all request/usage
# columns and the three over-flags for context.
print("Examples: Tasks with high observed CPU (Top 50):")
_example_columns = [
    "job_id", "task_index",
    "cpu_request", "max_cpu_rate",
    "mem_request", "max_mem_usage_overall",
    "disk_request", "max_local_disk_usage",
    "cpu_over", "mem_over", "disk_over",
]
_top_cpu = tj.select(*_example_columns).orderBy(F.col("max_cpu_rate").desc())
_top_cpu.show(50, truncate=False)


+-------------+----+----------+----------+----------+---+--------------------+---+---+-----------+-----------+------------+----+
|          _c0| _c1|    job_id|task_index|       _c4|_c5|                 _c6|_c7|_c8|cpu_request|mem_request|disk_request|_c12|
+-------------+----+----------+----------+----------+---+--------------------+---+---+-----------+-----------+------------+----+
|1343602104020|NULL|6375646109|        70|  56896039|  1|XXdY557FQk791swgp...|  2|  4|     0.0625|     0.0636|    5.817E-5|   0|
|1343602104030|NULL|6375646109|       256|2107285354|  1|XXdY557FQk791swgp...|  2|  4|     0.0625|     0.0636|    5.817E-5|   0|
|1343602104043|NULL| 515042969|         5| 351621284|  1|/fk1fVcVxZ6iM6gHZ...|  2|  0|    0.01562|    0.01553|    2.155E-4|   0|
|1343602104051|NULL|6336594489|       562| 974375799|  1|EWR2JX3J0jkvMGNMI...|  0|  0|     0.0125|     0.0159|    4.044E-4|   0|
|1343602104060|NULL|6336594489|      6073| 257413113|  1|EWR2JX3J0jkvMGNMI...|  0|  0|     0.0125

In [None]:
# Optional: correlation between each requested resource and its observed peak
# (only meaningful if the sample contains enough data).
# NOTE(review): this reads `task_joined` (before null-filling), not `tj` -
# confirm how rows with missing values should be treated here.
_correlate = task_joined.stat.corr

corr_cpu = _correlate("cpu_request", "max_cpu_rate")
corr_mem = _correlate("mem_request", "max_mem_usage_overall")
corr_disk = _correlate("disk_request", "max_local_disk_usage")

_report = (
    ("Correlation CPU request vs. max CPU rate: ", corr_cpu),
    ("Correlation MEM request vs. max MEM usage: ", corr_mem),
    ("Correlation DISK request vs. max local disk usage: ", corr_disk),
)
for _label, _value in _report:
    print(_label, _value)
