In [None]:
import os

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.kaleido.scope.mathjax = None

In [None]:
# Root directory for every exported figure. It is created up front because
# fig.write_image() does not create missing parent directories.
FIG_PATH = "./figures/"
os.makedirs(FIG_PATH, exist_ok=True)

In [None]:
# Directory prefix prepended to the .npy input file names loaded below.
DATA_PATH = "./"

In [None]:
# Job table. Later cells index it by field name ("state", "job_id", "node_id", ...),
# so it is presumably a NumPy structured array — TODO confirm.
summary_jobs = np.load(DATA_PATH + "summary_jobs.npy")

In [None]:
# Distinct "state" values and how many rows carry each one, counted over every
# row of summary_jobs (i.e. before de-duplicating by job_id).
all_unique_states, all_state_counts = np.unique(summary_jobs["state"], return_counts=True)

In [None]:
# Keep one row per job_id: return_index yields the index of each id's first
# occurrence, and sorting those indices preserves the original row order.
_, idx = np.unique(summary_jobs["job_id"], return_index=True)
unique_jobs = summary_jobs[np.sort(idx)]

## 1.) Job Breakdown By Status

In [None]:
# Distinct states and the number of de-duplicated jobs in each state.
unique_states, unique_counts = np.unique(unique_jobs["state"], return_counts=True)

In [None]:
# Pie chart of de-duplicated jobs per state; sort=False keeps the slice order of unique_states.
fig = go.Figure(data=[go.Pie(labels=unique_states, values=unique_counts, text=unique_states, textposition="outside", sort=False)])
# N is derived from the data so the title cannot drift from the plotted counts.
fig.update_layout(title = f"Distribution of All Jobs by Status (N = {len(unique_jobs)})")
fig.write_image(FIG_PATH + "jobs_by_status.pdf")

## 2.) Job Breakdown By Status + Duration

In [None]:
def create_duration_chart(unique_jobs, job_state):
    """Build a pie chart of run durations for every job in one state.

    Args:
        unique_jobs: Record array indexed by field name; the "state" and
            "elapsed_time" fields are read. "elapsed_time" is divided by 1e9
            to obtain seconds (presumably it is stored in nanoseconds —
            TODO confirm).
        job_state: Value matched against ``unique_jobs["state"]``.

    Returns:
        A plotly ``go.Figure`` with a single pie trace whose slices are fixed
        duration buckets, drawn clockwise in bucket order (not sorted by size).
        The title reports the number of jobs in the selected state.
    """
    jobs_by_state = unique_jobs[unique_jobs["state"] == job_state]
    elapsed_time_sec = (jobs_by_state["elapsed_time"] / 1e9).astype("int")

    mins = 60
    hour = 60 * mins
    day = 24 * hour
    week = 7 * day
    bin_edges = [0, mins, 10 * mins, 30 * mins, hour, 2 * hour, 6 * hour, day, week]
    bin_names = [" < 1 min", "1 - 10 min", "10 - 30 min", "30 min - 1 hour", "1 - 2 hours", "2 - 6 hours", "6 hours to 1 day", "1 day to 1 week"]
    elapsed_hist, _ = np.histogram(elapsed_time_sec, bins=bin_edges)

    # np.histogram ignores values beyond the last edge, so jobs longer than a
    # week would vanish from the chart while still being counted in N below.
    # Give them their own slice, but only when such jobs exist so that charts
    # without them are unchanged.
    n_over_week = int(np.count_nonzero(elapsed_time_sec > week))
    if n_over_week:
        elapsed_hist = np.append(elapsed_hist, n_over_week)
        bin_names = bin_names + ["> 1 week"]

    fig = go.Figure(data=[go.Pie(labels=bin_names, values=elapsed_hist, text=bin_names, direction="clockwise", textposition="outside", sort=False)])
    fig.update_layout(title = 
                          {
                            "text": f"{job_state} Jobs by Duration<br><br>N = {len(jobs_by_state)}<br>", 
                            "yanchor": "top"
                          },
                      # extra top margin leaves room for the multi-line title
                      margin =
                         {
                             "t": 200
                         })
    return fig

In [None]:
# One duration pie chart per job state, keyed by the state value.
duration_figs = {s: create_duration_chart(unique_jobs, s) for s in unique_states}

In [None]:
# Output directory for the per-state duration charts; created here because
# fig.write_image() does not create missing directories.
DURATION_FIG_PATH = FIG_PATH + "jobs_by_duration/"
os.makedirs(DURATION_FIG_PATH, exist_ok=True)

In [None]:
# Export every per-state duration chart as "<state>.pdf".
for state in duration_figs:
    duration_figs[state].write_image(f"{DURATION_FIG_PATH}{state}.pdf")

## 3.) Job Breakdown By Requested Resources

In [None]:
# Output directory for the requested-resource bar charts; created here because
# fig.write_image() does not create missing directories.
REQUESTED_RESOURCE_FIG_PATH = FIG_PATH + "jobs_by_req_resources/"
os.makedirs(REQUESTED_RESOURCE_FIG_PATH, exist_ok=True)

In [None]:
# Distinct "n_nodes" values and how many de-duplicated jobs have each value.
n_nodes, n_jobs_by_req_nodes = np.unique(unique_jobs["n_nodes"], return_counts=True)

In [None]:
# Bar chart of job counts per requested node count. The x values are turned
# into strings so each count is drawn as its own category.
by_req_nodes_fig = go.Figure(
    go.Bar(
        x=list(map(str, n_nodes)),
        y=n_jobs_by_req_nodes,
        text=n_jobs_by_req_nodes,
        marker_color="black",
    )
)
by_req_nodes_fig.update_layout(
    title="Job Breakdown by Requested # Nodes",
    xaxis_title="# Nodes",
    yaxis_title="# Jobs",
)
by_req_nodes_fig.write_image(f"{REQUESTED_RESOURCE_FIG_PATH}number_nodes.pdf")

In [None]:
# Total GPUs per job, computed as "n_nodes" * "n_gpus" elementwise
# (presumably n_gpus is a per-node count — TODO confirm).
# NOTE(review): the product keeps the fields' integer dtype; if that dtype is
# narrow (e.g. uint8) large requests would wrap silently — confirm the dtype.
n_total_gpus = unique_jobs["n_nodes"] * unique_jobs["n_gpus"]
total_gpus, n_jobs_by_req_total_gpus = np.unique(n_total_gpus, return_counts=True)

In [None]:
# Bar chart of job counts per total requested GPU count (categorical x axis).
by_req_total_gpus_fig = go.Figure(
    go.Bar(
        x=list(map(str, total_gpus)),
        y=n_jobs_by_req_total_gpus,
        text=n_jobs_by_req_total_gpus,
        marker_color="goldenrod",
    )
)
by_req_total_gpus_fig.update_layout(
    title="Job Breakdown by Requested Total # GPUs",
    xaxis_title="# GPUs",
    yaxis_title="# Jobs",
)
by_req_total_gpus_fig.write_image(f"{REQUESTED_RESOURCE_FIG_PATH}total_gpus.pdf")

## 4.) Cluster-Wide Metric Analysis

In [None]:
# GPU sample table; later cells read fields such as "timestamp", "node_id",
# "device_id" and the metric columns listed in `fields`.
summary_gpu = np.load(DATA_PATH + "summary_gpu.npy")

In [None]:
# CPU sample table; later cells filter it by "timestamp" and "node_id".
summary_cpu = np.load(DATA_PATH + "summary_cpu.npy")

In [None]:
def _edge_labels(edges):
    """Return one text label per bin edge, unique across all edges.

    Edges are shown rounded to integers when that keeps them distinct;
    otherwise decimals are added until every edge has its own label.
    """
    labels = [str(round(edge)) for edge in edges]
    decimals = 0
    while len(set(labels)) < len(labels) and decimals < 12:
        decimals += 1
        labels = [f"{float(edge):.{decimals}f}" for edge in edges]
    return labels


def generate_histogram(values, title, bins=[0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100], color="blue", density=False, xaxis_title=None, yaxis_title=None):
    """Bin ``values`` and return the result as a plotly bar chart.

    Args:
        values: 1-D collection of numbers to histogram.
        title: Chart title; "(N = <len(values)>)" is appended on a new line.
        bins: Bin edges (or any other ``np.histogram`` bin spec). ``None`` or
            an empty sequence lets ``np.histogram`` choose its default bins.
            The list default is only read, never mutated.
        color: Bar colour.
        density: When true, bar heights are fractions of the binned total,
            rounded to 2 decimals, instead of raw counts.
        xaxis_title: Optional x-axis label.
        yaxis_title: Optional y-axis label.

    Returns:
        A plotly ``go.Figure`` with one bar per bin, labelled "<low>-<high>".

    Note:
        N in the title counts every supplied value, including any that fall
        outside the bin range and are therefore not counted by np.histogram.
    """
    # Explicit None/empty test: `if bins:` raises for NumPy-array bin edges.
    if bins is None or np.size(bins) == 0:
        hist_values, edges = np.histogram(values)
    else:
        hist_values, edges = np.histogram(values, bins)

    if density:
        total = hist_values.sum()
        # With nothing binned there is nothing to normalise; keep the zeros
        # rather than dividing by zero.
        if total > 0:
            hist_values = np.round(hist_values / total, 2)

    # Integer-rounded edges can collapse ("0-0", "0-1", ...) when the data span
    # a narrow range, which would merge distinct bins into one bar category.
    edge_labels = _edge_labels(edges)
    bin_names = [low + "-" + high for low, high in zip(edge_labels, edge_labels[1:])]
    fig = go.Figure(go.Bar(x=bin_names, y=hist_values, text=hist_values, marker_color=color))
    fig.update_layout(title = 
                          {
                            "text": f"{title}<br>(N = {len(values)})", 
                            "yanchor": "top"
                          })
    if xaxis_title:
        fig.update_layout(xaxis_title=xaxis_title)
    if yaxis_title:
        fig.update_layout(yaxis_title=yaxis_title)
    
    return fig

In [None]:
# GPU metric columns analysed below, in plotting order.
fields = [
    "gpu_mem_used",
    "smi_util",
    "sm_active",
    "sm_occupancy",
    "tensor_active",
    "dram_active",
    "diff_util_metrics",
]

# Human-readable label for each metric column (used in titles and axis labels).
field_text = {
    "gpu_mem_used": "GPU Memory Usage",
    "smi_util": "SMI Utilization",
    "sm_active": "SM Active Utilization",
    "sm_occupancy": "SM Occupancy",
    "tensor_active": "Tensor Core Utilization",
    "dram_active": "Memory BW Utilization",
    "diff_util_metrics": "Difference between SMI vs. SM Active Util.",
}

# Bar colour for each metric column.
field_colors = {
    "gpu_mem_used": "purple",
    "smi_util": "red",
    "sm_active": "blue",
    "sm_occupancy": "orange",
    "dram_active": "green",
    "tensor_active": "brown",
    "diff_util_metrics": "pink",
}

#### Just Taking the Whole Cluster's GPU Data with no Filtering

In [None]:
# Output directory for the unfiltered cluster-wide histograms; created here
# because fig.write_image() does not create missing directories.
FIELDS_NO_FILTER = FIG_PATH + "fields_whole_cluster/"
os.makedirs(FIELDS_NO_FILTER, exist_ok=True)

In [None]:
# Density histogram of every GPU sample, one figure (and one PDF) per metric.
whole_cluster_figures = {}
for f in fields:
    fig = generate_histogram(
        summary_gpu[f],
        title=f"{field_text[f]} Cluster-Wide -- No Filtering",
        color=field_colors[f],
        density=True,
        xaxis_title=field_text[f],
        yaxis_title="Density",
    )
    fig.write_image(f"{FIELDS_NO_FILTER}{f}.pdf")
    whole_cluster_figures[f] = fig

#### Whole Cluster GPU Data Non-Idle Times

In [None]:
# Output directory for the non-idle cluster-wide histograms; created here
# because fig.write_image() does not create missing directories.
FIELDS_NON_IDLE = FIG_PATH + "fields_whole_cluster/non_idle_samples/"
os.makedirs(FIELDS_NON_IDLE, exist_ok=True)

In [None]:
# Keep only GPU samples with smi_util > 0; these are what the notebook treats as non-idle.
non_idle_summary_gpu = summary_gpu[summary_gpu["smi_util"] > 0]

In [None]:
# Per-metric density histograms restricted to the non-idle samples.
# They are stored in their own dict so the unfiltered figures held in
# whole_cluster_figures are not overwritten.
non_idle_cluster_figures = {}
for f in fields:
    xaxis_title = field_text[f]
    yaxis_title = "Density"
    fig_title = field_text[f] + " Cluster-Wide -- Filtering for Non-Idle 100ms Intervals"
    fig = generate_histogram(non_idle_summary_gpu[f], title = fig_title, color=field_colors[f], density=True, xaxis_title=xaxis_title, yaxis_title=yaxis_title)
    fig.write_image(FIELDS_NON_IDLE + f + ".pdf")
    non_idle_cluster_figures[f] = fig

## 5.) Metric-Analysis on a Per-Job Basis
- Only looking at non-overlapping, completed jobs

In [None]:
# job_id of every de-duplicated job whose state is "COMPLETED".
completed_job_ids = unique_jobs[unique_jobs["state"] == "COMPLETED"]["job_id"]

In [None]:
# Display how many completed jobs the per-job loop below will process.
len(completed_job_ids)

#### Note:
- Not a very efficient implementation. Querying the corresponding CPU/GPU data for each job could be done better
- Should take 20-30 sec per job

In [None]:
# Build job_data[job_id] = {"job_info", "cpu_data", "gpu_data"} for each
# completed job. Every iteration scans the full CPU and GPU tables.
job_data = {}
i = 0
for job_id in completed_job_ids:
    print(f"On Job #{i}")

    ## scheduler info, read from the job's row in the de-duplicated table
    job_info_row = unique_jobs[unique_jobs["job_id"] == job_id]
    ## every node the job has a row for in the full (non de-duplicated) table
    all_nodes = summary_jobs[summary_jobs["job_id"] == job_id]["node_id"]
    job_info = {
        "user": job_info_row["user"][0],
        "group": job_info_row["group"][0],
        "submit_time": job_info_row["submit_time"][0],
        "queue_time": job_info_row["start_time"][0] - job_info_row["submit_time"][0],
        "req_nodes": job_info_row["n_nodes"][0],
        "req_cpus": job_info_row["n_cpus"][0],
        "req_gpus": job_info_row["n_gpus"][0],
        "req_mem_mb": job_info_row["mem_mb"][0],
        "start_time": job_info_row["start_time"][0],
        "end_time": job_info_row["end_time"][0],
        "elapsed_time": job_info_row["elapsed_time"][0],
        "alloc_nodes_list": sorted(list(all_nodes)),
    }

    ## cpu samples taken on the job's nodes between its start and end (inclusive)
    cpu_in_window = (summary_cpu["timestamp"] >= job_info["start_time"]) & (summary_cpu["timestamp"] <= job_info["end_time"])
    cpu_on_job_nodes = np.isin(summary_cpu["node_id"], job_info["alloc_nodes_list"])
    cpu_data = summary_cpu[cpu_in_window & cpu_on_job_nodes]

    ## gpu samples selected the same way; narrowed to specific devices later
    gpu_in_window = (summary_gpu["timestamp"] >= job_info["start_time"]) & (summary_gpu["timestamp"] <= job_info["end_time"])
    gpu_on_job_nodes = np.isin(summary_gpu["node_id"], job_info["alloc_nodes_list"])
    gpu_data = summary_gpu[gpu_in_window & gpu_on_job_nodes]

    job_data[job_id] = {"job_info": job_info, "cpu_data": cpu_data, "gpu_data": gpu_data}
    i += 1

#### Only Analyzing Completed Jobs

In [None]:
# All rows (not de-duplicated) of completed jobs, ordered by end_time, then
# start_time, then node_id. The isolation check below relies on this ordering.
all_completed = summary_jobs[summary_jobs["state"] == "COMPLETED"]
sort_ind = np.argsort(all_completed,order=['end_time', 'start_time', 'node_id'])
all_completed_sorted = all_completed[sort_ind]

In [None]:
# Ids of the jobs for which job_data was collected (dict keys, insertion order).
gpu_job_ids_collected = list(job_data)

In [None]:
# Sorted completed-job rows restricted to the jobs present in job_data.
gpu_job_collected = all_completed_sorted[np.isin(all_completed_sorted["job_id"], gpu_job_ids_collected)]

#### Only Keeping Jobs that did not Overlap on the node(s) they ran on

In [None]:
def determine_gpus(job_data, job_id):
    """Work out which GPU device ids a job used on each of its nodes.

    Args:
        job_data: Mapping of job id to an entry holding "job_info" (with
            "req_gpus" and "alloc_nodes_list") and "gpu_data" (records with
            "node_id", "device_id" and "smi_util" fields).
        job_id: Key of the job to inspect.

    Returns:
        Dict mapping node id to a sorted list of device ids, or ``None`` when
        some node shows fewer active devices than the job requested.
    """
    record = job_data[job_id]
    requested = record["job_info"]["req_gpus"]
    nodes = record["job_info"]["alloc_nodes_list"]

    # A request for 8 GPUs is mapped to devices 0-7 on every node without
    # looking at the samples.
    if requested == 8:
        return {node: list(range(8)) for node in nodes}

    samples = record["gpu_data"]
    gpus_by_node = {}
    for node in nodes:
        # Devices on this node with at least one sample showing smi_util > 0.
        busy_on_node = (samples["node_id"] == node) & (samples["smi_util"] > 0)
        active_ids = np.unique(samples["device_id"][busy_on_node])
        if len(active_ids) < requested:
            return None
        gpus_by_node[node] = sorted(active_ids)
    return gpus_by_node

#### Augmenting the "Job Data" Dictionary Summarizing Each Job

In [None]:
# Set of (job_id, node_id) pairs, filled by the next cell with the job/node
# combinations judged to be isolated.
iso_job_ids = set()

In [None]:
# Record every (job_id, node_id) pair whose time on that node overlapped no
# other collected job on the same node.
# NOTE(review): 37 is hard-coded, presumably the cluster's node count with ids
# 0..36 — confirm; rows with other node ids are never examined.
for i in range(37):
    node_jobs = gpu_job_collected[gpu_job_collected["node_id"] == i]
    start_times = node_jobs["start_time"]
    end_times = node_jobs["end_time"]
    # overlaps[a, b] is True when jobs a and b share at least one instant.
    # Comparing each job only with the next row (in end-time order) misses a
    # job overlapped by its predecessor, and can never clear the last job.
    overlaps = (start_times[:, None] <= end_times[None, :]) & (start_times[None, :] <= end_times[:, None])
    np.fill_diagonal(overlaps, False)  # a job does not overlap itself
    isolated_job_inds = ~overlaps.any(axis=1)
    iso_jobs = node_jobs[isolated_job_inds]["job_id"]
    for j in iso_jobs:
        iso_job_ids.add((j, i))

In [None]:
# A job counts as isolated only if it was isolated on every node it ran on.
# The verdict is also stored on each job_data entry.
true_isolated_jobs = []
for job_id, info in job_data.items():
    all_nodes = info["job_info"]["alloc_nodes_list"]
    is_isolated = all((job_id, n) in iso_job_ids for n in all_nodes)
    info["is_isolated"] = is_isolated
    if is_isolated:
        true_isolated_jobs.append(job_id)

In [None]:
# Node -> device-id mapping for each isolated job; jobs for which
# determine_gpus() returns None are left out.
jobs_with_detected_dev_ids = {}
for job_id in true_isolated_jobs:
    dev_ids = determine_gpus(job_data, job_id)
    if dev_ids is not None:
        jobs_with_detected_dev_ids[job_id] = dev_ids

In [None]:
# Attach the detected mapping to every job; jobs without one get an empty dict.
for job_id, entry in job_data.items():
    entry["job_info"]["node_to_gpu_mapping"] = jobs_with_detected_dev_ids.get(job_id, {})

In [None]:
# For jobs with a detected mapping, drop GPU samples from devices the job did
# not use: on each mapped node only the listed device ids are kept.
for job_id, entry in job_data.items():
    node_to_gpu_mapping = entry["job_info"]["node_to_gpu_mapping"]
    if not node_to_gpu_mapping:
        continue
    cur_gpu_data = entry["gpu_data"]
    for node_id, dev_ids in node_to_gpu_mapping.items():
        on_other_node = cur_gpu_data["node_id"] != node_id
        on_listed_device = np.isin(cur_gpu_data["device_id"], dev_ids)
        # rows from other nodes pass through untouched
        cur_gpu_data = cur_gpu_data[on_other_node | on_listed_device]
    entry["gpu_data"] = cur_gpu_data

In [None]:
# Per-job summary statistics (rounded to 2 decimals) for every GPU metric.
# Jobs without a device mapping or without samples get an empty summary.
fields = ["gpu_mem_used", "smi_util", "sm_active", "sm_occupancy", "tensor_active", "dram_active", "diff_util_metrics"]
summary_stats = {"mean": np.mean, "var": np.var, "std": np.std, "median": np.median}
for job_id, entry in job_data.items():
    gpu_summary = {}
    if entry["job_info"]["node_to_gpu_mapping"] and len(entry["gpu_data"]) > 0:
        gpu_data = entry["gpu_data"]
        for f in fields:
            gpu_summary[f] = {
                stat: round(compute(gpu_data[f]), 2)
                for stat, compute in summary_stats.items()
            }
    entry["summary_gpu_data"] = gpu_summary

#### Building Compacted Structure with Relevant Job + Summary of Metrics

In [None]:
# Leading columns of the per-job summary record: job id, duration and the
# requested resources. The per-metric statistic columns are appended below.
# NOTE(review): nodes/gpus/cpus are uint8, so values above 255 cannot be
# represented — confirm the requested counts never exceed that.
gpu_summary_dtype = [("job_id", np.uint32), ("job_duration", "timedelta64[ns]"), ("nodes", np.uint8), ("gpus", np.uint8), ("cpus", np.uint8), ("sys_mem_mb", np.uint32)]

In [None]:
# Add one float32 column per (metric, statistic) pair, named "<metric>_<stat>".
# Names already present are skipped so that re-running this cell does not
# append duplicate fields, which would make the dtype invalid.
existing_columns = {column[0] for column in gpu_summary_dtype}
for f in fields:
    for v in ["mean", "var", "std", "median"]:
        column_name = f + "_" + v
        if column_name not in existing_columns:
            gpu_summary_dtype.append((column_name, np.float32))
            existing_columns.add(column_name)

In [None]:
# One row per job that has a GPU summary: the job's basic info followed by
# every metric statistic, in the same column order as gpu_summary_dtype.
all_rows = []
for job_id, entry in job_data.items():
    summary_gpu_data = entry["summary_gpu_data"]
    if len(summary_gpu_data) == 0:
        continue
    job_info = entry["job_info"]
    row = [
        job_id,
        job_info["elapsed_time"],
        job_info["req_nodes"],
        job_info["req_gpus"],
        job_info["req_cpus"],
        job_info["req_mem_mb"],
    ]
    row.extend(summary_gpu_data[f][v] for f in fields for v in ["mean", "var", "std", "median"])
    all_rows.append(row)

In [None]:
# A structured-array constructor expects each record as a tuple, not a list.
all_row_tuples = list(map(tuple, all_rows))

In [None]:
# Compact structured array with one record per summarised job.
gpu_job_summary = np.array(all_row_tuples, dtype=gpu_summary_dtype)

#### Generating Figures for Fields Broken Down by Job

In [None]:
# Output directory for the per-job metric histograms; created here because
# fig.write_image() does not create missing directories.
FIELDS_BY_JOB_FIG_PATH = FIG_PATH + "fields_by_job/"
os.makedirs(FIELDS_BY_JOB_FIG_PATH, exist_ok=True)

In [None]:
# Histogram of each per-job statistic for each metric, saved as "<metric>_<stat>.pdf".
stat_text = {"mean": "Average", "var": "Variance", "std": "Standard Deviation", "median": "Median"}
field_by_job_figures = {}
for f in fields:
    field_by_job_figures[f] = {}
    for v in ("mean", "var", "std", "median"):
        xaxis_title = stat_text[v] + " " + field_text[f]
        hist_kwargs = {"color": field_colors[f], "xaxis_title": xaxis_title, "yaxis_title": "# Jobs"}
        if v in ("var", "std"):
            ## not in 0-100 scale, so let the histogram pick its own bins
            hist_kwargs["bins"] = None
        fig = generate_histogram(gpu_job_summary[f + "_" + v], xaxis_title + " by Job", **hist_kwargs)
        fig.write_image(f"{FIELDS_BY_JOB_FIG_PATH}{f}_{v}.pdf")
        field_by_job_figures[f][v] = fig