In [5]:
import sys
from pyspark import SparkContext, SparkConf
import time
from pyspark.sql import functions as F
from pyspark.sql import SQLContext



# Finds out the index of "name" in the array firstLine 
# returns -1 if it cannot find it
def findCol(firstLine, name):
	if name in firstLine:
		return firstLine.index(name)
	else:
		return -1





In [6]:

#### Driver program

# start spark with 1 worker thread
sc = SparkContext("local[1]")
sc.setLogLevel("ERROR")
sqlContext = SQLContext(sc)




ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=pyspark-shell, master=local[1]) created by __init__ at /tmp/ipykernel_9753/2735925061.py:4 

In [None]:
### Task Events: evicted und killed getrennt

input_paths = [
    "google-dataset/task_events/part-00265-of-00500.csv.gz",
    "google-dataset/task_events/part-00266-of-00500.csv.gz",
    "google-dataset/task_events/part-00267-of-00500.csv.gz",
    "google-dataset/task_events/part-00268-of-00500.csv.gz",
    "google-dataset/task_events/part-00269-of-00500.csv.gz",
]

sqlContext = SQLContext(sc)

# no header so header=False
df = sqlContext.read.csv(input_paths, header=False, inferSchema=True)

df = df.withColumnRenamed("_c2", "job_id") \
       .withColumnRenamed("_c3", "task_id") \
       .withColumnRenamed("_c5", "event_type")

total_tasks = df.select("job_id", "task_id").distinct().count()

# evicted = 2, killed = 5
evicted_tasks = (
    df.filter(df.event_type == 2)
      .select("job_id", "task_id")
      .distinct()
      .count()
)

killed_tasks = (
    df.filter(df.event_type == 5)
      .select("job_id", "task_id")
      .distinct()
      .count()
)

combined_tasks = (
    df.filter(df.event_type.isin([2,5]))
      .select("job_id", "task_id")
      .distinct()
      .count()
)

def pct(part, whole):
    return (part / whole * 100.0) if whole > 0 else 0.0

print("=== Task Events Summary ===")
print("Total number of tasks: ", total_tasks)
print("Evicted tasks (code=2): {} ({:.2f}%)".format(evicted_tasks, pct(evicted_tasks, total_tasks)))
print("Killed tasks   (code=5): {} ({:.2f}%)".format(killed_tasks, pct(killed_tasks, total_tasks)))
print("Evicted or Killed (combined): {} ({:.2f}%)".format(combined_tasks, pct(combined_tasks, total_tasks)))



In [None]:
### Job events: evicted und killed getrennt

input_paths = [
    "google-dataset/job_events/part-00265-of-00500.csv.gz",
    "google-dataset/job_events/part-00266-of-00500.csv.gz",
    "google-dataset/job_events/part-00267-of-00500.csv.gz",
    "google-dataset/job_events/part-00268-of-00500.csv.gz",
    "google-dataset/job_events/part-00269-of-00500.csv.gz",
]

# no header
df = sqlContext.read.csv(input_paths, header=False, inferSchema=True)

df = df.withColumnRenamed("_c2", "job_id") \
       .withColumnRenamed("_c3", "event_type")

total_jobs = df.select("job_id").distinct().count()

evicted_jobs = (
    df.filter(df.event_type == 2)
      .select("job_id")
      .distinct()
      .count()
)

killed_jobs = (
    df.filter(df.event_type == 5)
      .select("job_id")
      .distinct()
      .count()
)

combined_jobs = (
    df.filter(df.event_type.isin([2,5]))
      .select("job_id")
      .distinct()
      .count()
)

print("=== Job Events Summary ===")
print("Total number of jobs: ", total_jobs)
print("Evicted jobs (code=2): {} ({:.2f}%)".format(evicted_jobs, pct(evicted_jobs, total_jobs)))
print("Killed jobs   (code=5): {} ({:.2f}%)".format(killed_jobs, pct(killed_jobs, total_jobs)))
print("Evicted or Killed (combined): {} ({:.2f}%)".format(combined_jobs, pct(combined_jobs, total_jobs)))
