In [3]:
import sys
from pyspark import SparkContext, SparkConf
import time
from pyspark.sql import functions as F
from pyspark.sql import SQLContext
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType, StringType, BooleanType


In [4]:
#### Driver program

sc = SparkContext("local[8]")
sc.setLogLevel("ERROR")
sqlContext = SQLContext(sc)

print("OK")



OK


In [9]:
### combine task_event and task_usage and compare


# Define Task Events Schema
task_events_schema = StructType([
    StructField("time", IntegerType(), True),
    StructField("missing_info", IntegerType(), True),
    StructField("job_id", IntegerType(), False),
    StructField("task_index", IntegerType(), False),
    StructField("machine_id", IntegerType(), True),
    StructField("event_type", IntegerType(), False),
    StructField("user", StringType(), True),
    StructField("scheduling_class", IntegerType(), True),
    StructField("priority", IntegerType(), False),
    StructField("cpu_request", FloatType(), True),
    StructField("memory_request", FloatType(), True),
    StructField("disk_space_request", FloatType(), True),
    StructField("different_machines_restriction", BooleanType(), True)
])

# Define Task Usage Schema
task_usage_schema = StructType([
    StructField("start_time", IntegerType(), False),
    StructField("end_time", IntegerType(), False),
    StructField("job_id", IntegerType(), False),
    StructField("task_index", IntegerType(), False),
    StructField("machine_id", IntegerType(), False),
    StructField("cpu_rate", FloatType(), True),
    StructField("canonical_memory_usage", FloatType(), True),
    StructField("assigned_memory_usage", FloatType(), True),
    StructField("unmapped_page_cache", FloatType(), True),
    StructField("total_page_cache", FloatType(), True),
    StructField("maximum_memory_usage", FloatType(), True),
    StructField("disk_io_time", FloatType(), True),
    StructField("local_disk_space_usage", FloatType(), True),
])




task_events_paths = [
    "google-dataset/task_events/part-00265-of-00500.csv.gz",
    "google-dataset/task_events/part-00266-of-00500.csv.gz",
    "google-dataset/task_events/part-00267-of-00500.csv.gz",
    "google-dataset/task_events/part-00268-of-00500.csv.gz",
    "google-dataset/task_events/part-00269-of-00500.csv.gz",
]

task_usage_paths = [
    "google-dataset/task_usage/part-00265-of-00500.csv.gz",
    "google-dataset/task_usage/part-00266-of-00500.csv.gz",
    "google-dataset/task_usage/part-00267-of-00500.csv.gz",
    "google-dataset/task_usage/part-00268-of-00500.csv.gz",
    "google-dataset/task_usage/part-00269-of-00500.csv.gz",
]



# load data
df_events = sqlContext.read.csv("google-dataset/task_events/part-00265-of-00500.csv.gz", 
                           schema=task_events_schema, 
                           header=False)

df_usage = sqlContext.read.csv("google-dataset/task_usage/part-00265-of-00500.csv.gz",
                          schema=task_usage_schema,
                          header=False)


# only keep valid data, remove NULL
task_events = df_events.filter(
    (df_events.cpu_request.isNotNull()) |
    (df_events.memory_request.isNotNull()) |
    (df_events.disk_space_request.isNotNull())
)

task_usage = df_usage.filter(
    (df_usage.cpu_rate.isNotNull()) |
    (df_usage.canonical_memory_usage.isNotNull()) |
    (df_usage.local_disk_space_usage.isNotNull())
)


print("task_events")
task_events.show()

print("task_usage")
task_usage.show()



# Cache because we access it multiple times
task_events.cache()
task_usage.cache()


task_events
+----+------------+---------+----------+----------+----------+--------------------+----------------+--------+-----------+--------------+------------------+------------------------------+
|time|missing_info|   job_id|task_index|machine_id|event_type|                user|scheduling_class|priority|cpu_request|memory_request|disk_space_request|different_machines_restriction|
+----+------------+---------+----------+----------+----------+--------------------+----------------+--------+-----------+--------------+------------------+------------------------------+
|NULL|        NULL|     NULL|         3|   6640654|         1|Jb7cY5N5TFHGGBnnI...|               1|       4|     0.0625|        0.0318|          4.635E-4|                          NULL|
|NULL|        NULL|515042969|        13| 294934177|         1|/fk1fVcVxZ6iM6gHZ...|               2|       0|    0.01562|       0.01553|          2.155E-4|                          NULL|
|NULL|        NULL|     NULL|      5479|      NULL|  

DataFrame[start_time: int, end_time: int, job_id: int, task_index: int, machine_id: int, cpu_rate: float, canonical_memory_usage: float, assigned_memory_usage: float, unmapped_page_cache: float, total_page_cache: float, maximum_memory_usage: float, disk_io_time: float, local_disk_space_usage: float]

In [7]:

# Descriptive analysis: Evicted vs. non-evicted
# Compare average usage for evicted vs. non-evicted tasks

desc_stats = (
    task_level
    .groupBy("was_evicted")
    .agg(
        F.avg("avg_cpu_rate").alias("mean_avg_cpu_rate"),
        F.avg("max_cpu_rate").alias("mean_max_cpu_rate"),
        F.avg("avg_canon_mem").alias("mean_avg_canon_mem"),
        F.avg("max_canon_mem").alias("mean_max_canon_mem"),
        F.avg("avg_local_disk").alias("mean_avg_local_disk"),
        F.avg("max_local_disk").alias("mean_max_local_disk"),
        F.count("*").alias("num_tasks")
    )
)

desc_stats.show()



# Correlation between usage and eviction flag (treated as numeric 0/1)
corr_avg_cpu = task_level.stat.corr("avg_cpu_rate", "was_evicted")
corr_max_cpu = task_level.stat.corr("max_cpu_rate", "was_evicted")
corr_avg_mem = task_level.stat.corr("avg_canon_mem", "was_evicted")
corr_max_mem = task_level.stat.corr("max_canon_mem", "was_evicted")

print("Correlation avg_cpu_rate vs was_evicted:", corr_avg_cpu)
print("Correlation max_cpu_rate vs was_evicted:", corr_max_cpu)
print("Correlation avg_canon_mem vs was_evicted:", corr_avg_mem)
print("Correlation max_canon_mem vs was_evicted:", corr_max_mem)




+-----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------+
|was_evicted|   mean_avg_cpu_rate|   mean_max_cpu_rate|  mean_avg_canon_mem|  mean_max_canon_mem| mean_avg_local_disk| mean_max_local_disk|num_tasks|
+-----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------+
|          1|0.001352120105942...|0.008651086071040481|0.002140481629648868|  0.0029882720077876|1.495592038767064...|2.250575173093238...|       25|
|          0|0.006526975098758615|0.052900681443475844|0.006804221501569972|0.025395527801841365|4.635580097586943E-5|1.778847737615823...|    22163|
+-----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------+

Correlation avg_cpu_rate vs was_evicted: -0.01903622392340978
Correlation max_cpu_rate vs was_evict

In [12]:
### Analysis: Correlation between machine-level resource peaks and evictions
print("\n=== Machine-Level Resource Peaks vs. Eviction Events ===")

# 1. Aggregate resource usage per machine and time window
machine_usage = (
    task_usage
    .groupBy("machine_id", "start_time")
    .agg(
        F.sum("cpu_rate").alias("total_cpu"),
        F.sum("canonical_memory_usage").alias("total_memory"),
        F.sum("local_disk_space_usage").alias("total_disk"),
        F.count("*").alias("num_tasks_on_machine")
    )
)

# 2. Get eviction events with machine_id and timestamp
eviction_events = (
    task_events
    .filter(F.col("event_type") == EVICTED_CODE)
    .select("time", "machine_id", "job_id", "task_index")
    .withColumnRenamed("time", "eviction_time")
)

# 3. Join evictions with machine usage in same or nearby time window
# Assuming time window tolerance of +/- 300 seconds (5 minutes)
TIME_WINDOW = 300
evictions_with_usage = (
    eviction_events
    .join(
        machine_usage,
        (eviction_events.machine_id == machine_usage.machine_id) &
        (F.abs(eviction_events.eviction_time - machine_usage.start_time) <= TIME_WINDOW),
        "inner"    )
)

print("\nEvictions with concurrent machine resource usage:")
evictions_with_usage = (
    eviction_events
    .join(
        machine_usage,
        (eviction_events.machine_id == machine_usage.machine_id) &
        (F.abs(eviction_events.eviction_time - machine_usage.start_time) <= TIME_WINDOW),
        "inner"    )
    .select(
        eviction_events.eviction_time,
        eviction_events.machine_id,
        eviction_events.job_id,
        eviction_events.task_index,
        machine_usage.total_cpu,
        machine_usage.total_memory,
        machine_usage.total_disk,
        machine_usage.num_tasks_on_machine
    )
)

# 4. Compare resource levels: machines with evictions vs. without
machines_with_evictions = evictions_with_usage.select("machine_id").distinct()

machine_stats = (
    machine_usage
    .withColumn(
        "has_evictions",
        F.when(F.col("machine_id").isin([row.machine_id for row in machines_with_evictions.collect()]), 1).otherwise(0)
    )
    .groupBy("has_evictions")
    .agg(
        F.avg("total_cpu").alias("avg_cpu"),
        F.max("total_cpu").alias("max_cpu"),
        F.avg("total_memory").alias("avg_memory"),
        F.max("total_memory").alias("max_memory"),
        F.avg("num_tasks_on_machine").alias("avg_tasks"),
        F.count("*").alias("num_samples")
    )
)

print("\nResource comparison: Machines with vs. without evictions:")
machine_stats.show()

# 5. Detect resource peaks (e.g., top 10% usage)
cpu_threshold = machine_usage.approxQuantile("total_cpu", [0.9], 0.01)[0]
mem_threshold = machine_usage.approxQuantile("total_memory", [0.9], 0.01)[0]

print(f"\nResource peak thresholds (90th percentile):")
print(f"CPU: {cpu_threshold}")
print(f"Memory: {mem_threshold}")

high_resource_periods = (
    machine_usage
    .filter(
        (F.col("total_cpu") >= cpu_threshold) |
        (F.col("total_memory") >= mem_threshold)
    )
)

# 6. Count evictions during high resource periods
evictions_during_peaks = (
    eviction_events
    .join(
        high_resource_periods,
        (eviction_events.machine_id == high_resource_periods.machine_id) &
        (F.abs(eviction_events.eviction_time - high_resource_periods.start_time) <= TIME_WINDOW),
        "inner"    )
    .select(eviction_events["*"])
)

total_evictions = eviction_events.count()
evictions_at_peaks = evictions_during_peaks.count()
percentage = (evictions_at_peaks / total_evictions * 100) if total_evictions > 0 else 0
print(f"\n=== Peak Correlation Results ===")
print(f"Total evictions: {total_evictions:,}")
print(f"Evictions during resource peaks: {evictions_at_peaks:,} ({percentage:.2f}%)")

# 7. Visualization
import matplotlib.pyplot as plt

# Plot resource distribution for machines with/without evictions
stats_data = machine_stats.collect()
if len(stats_data) >= 2:
    categories = ['Avg CPU', 'Max CPU', 'Avg Memory', 'Max Memory']
    with_evictions = [stats_data[1]['avg_cpu'], stats_data[1]['max_cpu'], 
                      stats_data[1]['avg_memory'], stats_data[1]['max_memory']]
    without_evictions = [stats_data[0]['avg_cpu'], stats_data[0]['max_cpu'],
                        stats_data[0]['avg_memory'], stats_data[0]['max_memory']]
    
    x = range(len(categories))
    width = 0.35        
    plt.figure(figsize=(12, 6))
    plt.bar([i - width/2 for i in x], without_evictions, width, label='Without Evictions', color='green', alpha=0.7)
    plt.bar([i + width/2 for i in x], with_evictions, width, label='With Evictions', color='red', alpha=0.7)
    plt.xlabel('Resource Metric', fontsize=12)
    plt.ylabel('Resource Units', fontsize=12)
    plt.title('Resource Usage: Machines with vs. without Evictions', fontsize=14, fontweight='bold')
    plt.xticks(x, categories)
    plt.legend()
    plt.grid(axis='y', alpha=0.3)
    plt.tight_layout()
    plt.savefig('machine_resources_evictions.png', dpi=300, bbox_inches='tight')
    plt.show()



=== Machine-Level Resource Peaks vs. Eviction Events ===

Evictions with concurrent machine resource usage:

Resource comparison: Machines with vs. without evictions:
+-------------+----------------+------------------+-----------------+------------------+------------------+-----------+
|has_evictions|         avg_cpu|           max_cpu|       avg_memory|        max_memory|         avg_tasks|num_samples|
+-------------+----------------+------------------+-----------------+------------------+------------------+-----------+
|            0|4.36484280905721|11861.348092917331|5.209083381925109|15042.822055560684|265.38396978598405|       9532|
+-------------+----------------+------------------+-----------------+------------------+------------------+-----------+


Resource peak thresholds (90th percentile):
CPU: 5.373521113866445
Memory: 6.072802803348168

=== Peak Correlation Results ===
Total evictions: 473
Evictions during resource peaks: 0 (0.00%)
