In [2]:
#### Driver program

import sys
from pyspark import SparkContext, SparkConf
import time
from pyspark.sql import functions as F
from pyspark.sql import SQLContext

# Reuse the already-running SparkContext when this cell is executed again:
# constructing a second SparkContext in the same kernel raises
# "ValueError: Cannot run multiple SparkContexts at once".
# On a fresh kernel this still starts Spark with the "local[8]" master.
sc = SparkContext.getOrCreate(SparkConf().setMaster("local[8]"))
sc.setLogLevel("ERROR")  # only log errors, to keep the notebook output readable
sqlContext = SQLContext(sc)  # entry point for reading DataFrames





In [3]:
### Group by job id and count machine IDs to get different machines

# Parts 00265 through 00269 (inclusive) of the 500-part task_events table.
input_paths = [
    f"google-dataset/task_events/part-{part_number:05d}-of-00500.csv.gz"
    for part_number in range(265, 270)
]

# The files are read without a header row, so columns are addressed by their
# positional names. Mapping according to the task_events entry of schema.csv:
#   _c2 -> job ID      (field 3)
#   _c3 -> task index  (field 4)
#   _c4 -> machine ID  (field 5)
df = (
    sqlContext.read
    .csv(input_paths, header=False, inferSchema=True)
    .withColumnRenamed("_c2", "job_id")
    .withColumnRenamed("_c3", "task_id")
    .withColumnRenamed("_c4", "machine_id")
)

# For each (job_id, task_id) pair, gather the set of distinct machines the task
# appears on (a task may show up on several machines, e.g. if it migrated while
# running). In general, collecting per job_id alone should be sufficient.
job_task_machines = (
    df.na.drop(subset=["machine_id"])  # rows with a null machine_id are discarded
      .select(F.col("job_id"), F.col("task_id"), F.col("machine_id"))
      .groupBy(F.col("job_id"), F.col("task_id"))
      .agg(F.collect_set(F.col("machine_id")).alias("machines_for_task"))
)

# Number of distinct machines per job: flatten every task's machine set back to
# one row per machine, then count the distinct machine IDs for each job_id.
# The same machine can appear in the sets of several tasks of one job, hence
# countDistinct rather than a plain count.
jobs_machine_stats = (
    job_task_machines
    .select(
        F.col("job_id"),
        F.explode(F.col("machines_for_task")).alias("machine_id"),
    )
    .groupBy(F.col("job_id"))
    .agg(F.countDistinct(F.col("machine_id")).alias("num_machines_for_job"))
    .orderBy(F.desc("num_machines_for_job"))  # jobs spread over the most machines first
)

# print for debug purposes
#print("Jobs with most different machines:")
#jobs_machine_stats.show(20, truncate=False)
#print("jobs on one machine:")
#jobs_machine_stats.filter(F.col("num_machines_for_job") == 1).show(20, truncate=False)


# jobs_machine_stats feeds two separate Spark actions: the count() right below
# and the later evaluation of `distribution`. Without caching, each action
# re-runs the whole lineage, including reading the compressed input files.
# cache() only marks the DataFrame; the count() materialises it.
jobs_machine_stats.cache()

# Total number of jobs (jobs_machine_stats holds one row per job_id).
total_jobs = jobs_machine_stats.count()

# For every machine count: how many jobs ran on exactly that many distinct
# machines, and which fraction of all jobs that represents.
distribution = (
    jobs_machine_stats
    .groupBy("num_machines_for_job")
    .agg(F.count("*").alias("num_jobs"))
    .withColumn(
        "fraction_of_jobs",
        F.col("num_jobs") / F.lit(total_jobs)
    )
    .orderBy("num_machines_for_job")
)

# Show the first 20 rows. truncate=False prints the fraction values in full;
# by default show() cuts cell text longer than 20 characters and appends "...".
print("Distribution:")
distribution.show(20, truncate=False)



Distribution:
+--------------------+--------+--------------------+
|num_machines_for_job|num_jobs|    fraction_of_jobs|
+--------------------+--------+--------------------+
|                   1|    4267|  0.6703849175176748|
|                   2|     532| 0.08358208955223881|
|                   3|     121|  0.0190102120974077|
|                   4|      93|0.014611154752553025|
|                   5|      35|0.005498821681068...|
|                   6|      44|0.006912804399057345|
|                   7|      23|0.003613511390416...|
|                   8|      15|0.002356637863315004|
|                   9|      47|0.007384131971720346|
|                  10|      19|0.002985074626865...|
|                  11|      50|0.007855459544383346|
|                  12|      34|0.005341712490180676|
|                  13|       8|0.001256873527101...|
|                  14|      35|0.005498821681068...|
|                  15|     240|0.037706205813040065|
|                  16|      31|0