In [11]:
from pyspark import SparkContext, SparkConf

# Event-type codes compared against the event_type column of machine_events records.
MACHINE_EVENT_ADD, MACHINE_EVENT_REMOVE, MACHINE_EVENT_UPDATE = 0, 1, 2

# Event-type codes compared against the event_type column of task_events records.
TASK_EVENT_SCHEDULE = 1
TASK_EVENT_EVICT = 2
TASK_EVENT_FAIL = 3
TASK_EVENT_FINISH = 4
TASK_EVENT_KILL = 5
TASK_EVENT_LOST = 6

# Every event code that closes a running period opened by a SCHEDULE event.
TASK_TERMINATION_EVENTS = {
    TASK_EVENT_EVICT,
    TASK_EVENT_FAIL,
    TASK_EVENT_FINISH,
    TASK_EVENT_KILL,
    TASK_EVENT_LOST,
}

print("OK")


OK


In [None]:
# Spark context
conf = SparkConf().setAppName("OvercommitmentAnalysis")
# getOrCreate returns the already-running context when this cell is executed
# again in the same kernel; calling the SparkContext constructor a second time
# raises because only one context may be active at once.
sc = SparkContext.getOrCreate(conf=conf)


In [25]:
# DEFINITIONS

def parse_machine_event(line):
    """Turn one CSV line of machine_events into a keyed record.

    Columns read: 0 = timestamp, 1 = machine_id, 2 = event_type,
    4 = cpu_capacity, 5 = memory_capacity (column 3 is ignored).

    Returns:
        (machine_id, (timestamp, event_type, cpu_capacity, memory_capacity)),
        or None when the line has fewer than 6 columns, machine_id or
        event_type is empty, a numeric column fails to parse, or event_type
        is not 0, 1 or 2. An empty timestamp becomes 0 and empty capacities
        become 0.0.
    """
    columns = line.strip().split(',')
    if len(columns) < 6:
        return None

    ts_raw, machine_raw, event_raw = columns[0], columns[1], columns[2]
    cpu_raw, mem_raw = columns[4], columns[5]

    # Both identifiers are mandatory; every other column has a default.
    if not machine_raw or not event_raw:
        return None

    try:
        machine_id = int(machine_raw)
        event_type = int(event_raw)
        timestamp = int(ts_raw) if ts_raw else 0
        cpu_capacity = float(cpu_raw) if cpu_raw else 0.0
        memory_capacity = float(mem_raw) if mem_raw else 0.0
    except ValueError:
        # Any malformed number invalidates the whole record.
        return None

    if event_type not in (0, 1, 2):
        return None

    return (machine_id, (timestamp, event_type, cpu_capacity, memory_capacity))


def parse_task_event(line):
    """Turn one CSV line of task_events into a record keyed by task.

    Columns read: 0 = timestamp, 2 = job_id, 3 = task_index, 4 = machine_id,
    5 = event_type, 9 = cpu_request, 10 = memory_request.

    Returns:
        ((job_id, task_index),
         (timestamp, machine_id, event_type, cpu_request, memory_request)),
        or None when the line has fewer than 12 columns, job_id / task_index /
        event_type is empty, a numeric column fails to parse, or event_type
        is greater than 8. An empty machine_id is kept as None, an empty
        timestamp becomes 0 and empty requests become 0.0.
    """
    try:
        columns = line.strip().split(',')
        if len(columns) < 12:
            return None

        def optional(cast, index, default=None):
            # Empty CSV cells fall back to the default instead of being cast.
            raw = columns[index]
            return cast(raw) if raw else default

        job_id = optional(int, 2)
        task_index = optional(int, 3)
        event_type = optional(int, 5)
        record = (
            optional(int, 0, 0),
            optional(int, 4),
            event_type,
            optional(float, 9, 0.0),
            optional(float, 10, 0.0),
        )
    except (ValueError, IndexError):
        return None

    if None in (job_id, task_index, event_type):
        return None
    if event_type > 8:
        return None

    return ((job_id, task_index), record)


def get_latest_machine_capacity(events):
    """Replay one machine's events in timestamp order and report its final state.

    Args:
        events: iterable of (timestamp, event_type, cpu, memory) tuples.

    Returns:
        (cpu_capacity, memory_capacity, is_active). ADD marks the machine
        active and REMOVE marks it inactive; ADD and UPDATE overwrite a
        capacity only when the reported value is positive, so the last
        positive value seen is kept.
    """
    cpu_capacity, memory_capacity = 0.0, 0.0
    is_active = False

    timeline = sorted(events, key=lambda ev: ev[0])
    for _, kind, cpu, memory in timeline:
        if kind == MACHINE_EVENT_ADD:
            is_active = True
        elif kind == MACHINE_EVENT_REMOVE:
            # Removal changes availability only; capacities are left as they were.
            is_active = False
            continue
        elif kind != MACHINE_EVENT_UPDATE:
            continue

        # ADD and UPDATE share the same capacity refresh.
        if cpu > 0:
            cpu_capacity = cpu
        if memory > 0:
            memory_capacity = memory

    return cpu_capacity, memory_capacity, is_active


def compute_task_running_periods(task_id, events):
    """Derive the intervals during which one task was running.

    Args:
        task_id: identifier of the task; not used by the computation.
        events: iterable of (timestamp, machine_id, event_type, cpu_request,
            memory_request) tuples for that task.

    Returns:
        List of (machine_id, start, end, cpu_request, memory_request). A period
        opens at a SCHEDULE event and closes at the next termination event;
        machine and requests are taken from the opening SCHEDULE event.
        A SCHEDULE seen while a period is already open is ignored, as is a
        termination with no open period. A period still open after the last
        event is not reported.
    """
    completed = []
    active = None  # (machine, start, cpu, memory) while a period is open

    for ts, machine, kind, cpu, memory in sorted(events, key=lambda ev: ev[0]):
        if kind == TASK_EVENT_SCHEDULE:
            if active is None:
                active = (machine, ts, cpu, memory)
        elif kind in TASK_TERMINATION_EVENTS:
            if active is not None:
                run_machine, run_start, run_cpu, run_memory = active
                completed.append((run_machine, run_start, ts, run_cpu, run_memory))
                active = None

    return completed


def analyze_machine_overcommitment(machine_id, task_periods, machine_capacities):
    """Sweep a machine's task periods and count how often requests exceed capacity.

    Args:
        machine_id: key looked up in machine_capacities.
        task_periods: iterable of (start_time, end_time, cpu_request,
            memory_request) tuples for tasks placed on this machine.
        machine_capacities: mapping machine_id -> (cpu_capacity, memory_capacity).

    Returns:
        (machine_id, overcommit_count, total_snapshots, details).
        One snapshot is taken after every start/end event. A snapshot counts
        as overcommitted when the summed CPU or memory requests exceed the
        corresponding capacity; a capacity of 0 or less is treated as unknown
        and never flags. details holds the capacities and the peak summed
        requests observed during the sweep. For a machine missing from
        machine_capacities the result is (machine_id, 0, 0, {}).
    """
    if machine_id not in machine_capacities:
        return (machine_id, 0, 0, {})

    machine_cpu_capacity, machine_memory_capacity = machine_capacities[machine_id]

    events = []
    for start_time, end_time, cpu_req, memory_req in task_periods:
        events.append((start_time, 'start', cpu_req, memory_req))
        events.append((end_time, 'end', cpu_req, memory_req))

    # Tuple ordering puts 'end' before 'start' at equal timestamps, so a task
    # ending exactly when another begins does not create a spurious overlap.
    events.sort()

    current_cpu = 0.0
    current_memory = 0.0
    max_cpu_used = 0.0
    max_memory_used = 0.0
    overcommit_count = 0
    total_snapshots = 0

    for timestamp, event_type, cpu_req, memory_req in events:
        if event_type == 'start':
            current_cpu += cpu_req
            current_memory += memory_req
        else:
            current_cpu -= cpu_req
            current_memory -= memory_req

        total_snapshots += 1

        # Track the real peaks; these were previously always reported as 0.
        max_cpu_used = max(max_cpu_used, current_cpu)
        max_memory_used = max(max_memory_used, current_memory)

        # Check overcommitment (skipped for a resource whose capacity is unknown).
        cpu_overcommit = machine_cpu_capacity > 0 and current_cpu > machine_cpu_capacity
        memory_overcommit = machine_memory_capacity > 0 and current_memory > machine_memory_capacity

        if cpu_overcommit or memory_overcommit:
            overcommit_count += 1

    details = {
        'cpu_capacity': machine_cpu_capacity,
        'memory_capacity': machine_memory_capacity,
        'max_cpu_used': max_cpu_used,
        'max_memory_used': max_memory_used,
    }

    return (machine_id, overcommit_count, total_snapshots, details)

# Marker output: reaching this line means every definition in this cell was evaluated.
print("OK")



OK


In [26]:

# Input locations (glob patterns over the gzipped CSV shards).
machine_events_path = "google-dataset/machine_events/part-*-of-*.csv.gz"
task_events_path = "google-dataset/task_events/part-*-of-*.csv.gz"


# ========== STEP 1: Machine Capacities ==========
print("\n[1] Loading machine events...")
machine_events_raw = sc.textFile(machine_events_path)

# Parse each line and drop the ones the parser rejected (returned None).
parsed_events = (
    machine_events_raw
    .map(parse_machine_event)
    .filter(lambda record: record is not None)
)

# Gather all events of the same machine_id under a single key.
grouped_events = parsed_events.groupByKey()

#Calculate capacities and filter
def process_machine(item):
    """Reduce one (machine_id, events) group to (machine_id, (cpu, mem)).

    Returns None when the machine is not active after replaying its events,
    or when neither its CPU nor its memory capacity is positive.
    """
    machine_id, events = item
    cpu, mem, is_active = get_latest_machine_capacity(list(events))

    has_capacity = cpu > 0 or mem > 0
    if not (is_active and has_capacity):
        return None
    return (machine_id, (cpu, mem))

machine_events = grouped_events \
    .map(process_machine) \
    .filter(lambda x: x is not None)

# Collect once and derive the count from the resulting dict. Calling count()
# and then collectAsMap() on this uncached RDD would run the whole
# read/parse/group lineage twice. Keys are unique after groupByKey, so the
# dict size equals the number of records.
machine_capacities = machine_events.collectAsMap()

num_machines = len(machine_capacities)
print(f"Active machines: {num_machines}")
print(f"Machine capacities: {len(machine_capacities)} machines")

# Broadcast the lookup table so executors can read it inside later transformations.
bc_machine_capacities = sc.broadcast(machine_capacities)
print(f"Machine capacities extracted: {len(machine_capacities)} machines")




[1] Loading machine events...
Active machines: 12486
Machine capacities: 12486 machines
Machine capacities extracted: 12486 machines


In [28]:


# ===== STEP 2: Task Events =====
print("\n[2] Loading task events...")
task_events_raw = sc.textFile(task_events_path)

# Persisted because this RDD feeds two separate jobs: the count() just below
# and the groupByKey() in step 3. Without cache() the compressed input would be
# read and parsed once per job.
task_events = task_events_raw \
    .map(parse_task_event) \
    .filter(lambda x: x is not None) \
    .cache()

num_task_events = task_events.count()
print(f"Task events loaded: {num_task_events} events")

# ======= STEP 3: Task Running Periods =========
print("\n[3] Computing task running periods...")

# All events of one (job_id, task_index) are gathered so its lifecycle can be replayed.
task_lifecycle = task_events.groupByKey()

# Persisted because the periods are counted here and consumed again by the
# overcommitment analysis; without cache() the per-task replay would run twice.
task_running_periods = task_lifecycle \
    .flatMap(lambda x: compute_task_running_periods(x[0], list(x[1]))) \
    .cache()

num_running_periods = task_running_periods.count()
print(f"Task running periods: {num_running_periods} periods")

# ========= STEP 4: Overcommitment Analysis ==========
print("\n[4] Analyzing overcommitment...")

# Re-key each period by machine, discarding periods with no machine_id:
# (machine, start, end, cpu, mem) -> (machine, (start, end, cpu, mem)).
machine_task_periods = (
    task_running_periods
    .filter(lambda period: period[0] is not None)
    .map(lambda period: (period[0], (period[1], period[2], period[3], period[4])))
)

# One list of periods per machine.
machine_tasks = machine_task_periods.groupByKey().mapValues(list)

# Only transformations are declared here; the analysis itself executes when an
# action is invoked on this RDD.
overcommitment_analysis = machine_tasks.map(
    lambda entry: analyze_machine_overcommitment(entry[0], entry[1], bc_machine_capacities.value)
)

print("Overcommitment analysis complete")

# ======= STEP 5: Aggregate Results ========
print("\n[5] Aggregating results...")

# Each row is (machine_id, overcommit_count, total_snapshots, details).
results = overcommitment_analysis.collect()

overcommit_counts = [row[1] for row in results]
snapshot_counts = [row[2] for row in results]

total_machines_analyzed = len(results)
machines_with_overcommit = sum(count > 0 for count in overcommit_counts)
total_snapshots = sum(snapshot_counts)
total_overcommit_snapshots = sum(overcommit_counts)

print("Results aggregated")

# ======== OUTPUT ========
banner = "=" * 80

print("\n" + banner)
print("RESULTS")
print(banner)

# Share of machines that were overcommitted at least once.
print("\n[Machines]")
print(f"  Total analyzed: {total_machines_analyzed}")
print(f"  With overcommit: {machines_with_overcommit}")

if total_machines_analyzed > 0:
    pct_machines = 100.0 * machines_with_overcommit / total_machines_analyzed
    print(f"  Percentage: {pct_machines:.2f}%")

# Share of snapshots (one per task start/end event) that were overcommitted.
print("\n[Time]")
print(f"  Total snapshots: {total_snapshots}")
print(f"  With overcommit: {total_overcommit_snapshots}")

if total_snapshots > 0:
    pct_time = 100.0 * total_overcommit_snapshots / total_snapshots
    print(f"  Percentage: {pct_time:.2f}%")

# Top 10 machines, ranked by absolute number of overcommitted snapshots.
if machines_with_overcommit > 0:
    print("\n[Top 10 Machines]")
    top_machines = sorted(results, key=lambda row: row[1], reverse=True)[:10]

    for rank, (machine_id, oc_count, total_snaps, details) in enumerate(top_machines, 1):
        if oc_count <= 0:
            continue
        if total_snaps > 0:
            pct = 100.0 * oc_count / total_snaps
        else:
            pct = 0
        print(f"  {rank}. Machine {machine_id}: {oc_count}/{total_snaps} ({pct:.2f}%)")

print("\n" + banner)



[2] Loading task events...
Task events loaded: 1603148 events

[3] Computing task running periods...
Task running periods: 500618 periods

[4] Analyzing overcommitment...
✓ Overcommitment analysis complete

[5] Aggregating results...
✓ Results aggregated

RESULTS

[Machines]
  Total analyzed: 12206
  With overcommit: 148
  Percentage: 1.21%

[Time]
  Total snapshots: 996044
  With overcommit: 2045
  Percentage: 0.21%

[Top 10 Machines]
  1. Machine 1436488167: 68/152 (44.74%)
  2. Machine 6201459631: 67/196 (34.18%)
  3. Machine 288787745: 58/118 (49.15%)
  4. Machine 3422416755: 56/142 (39.44%)
  5. Machine 904166: 54/126 (42.86%)
  6. Machine 564457288: 51/104 (49.04%)
  7. Machine 2787457444: 50/132 (37.88%)
  8. Machine 647596483: 46/124 (37.10%)
  9. Machine 3676415236: 45/160 (28.12%)
  10. Machine 63691529: 43/118 (36.44%)

