# Data visualization

In this notebook, we create sample visualizations of our data to illustrate how data science tools can be applied to performance analysis in HPC.

## Import packages

Most of our data wrangling is done with `polars`.  
We use `matplotlib` and `seaborn` to create our plots.

In [None]:
import polars as pl
import matplotlib as mlp
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.collections import PatchCollection
import seaborn as sns

We define a custom color map (a white-to-red gradient) that is used by the heatmaps below.

In [None]:
from matplotlib.colors import LinearSegmentedColormap
# Two-stop linear gradient from white (#ffffff) to pure red (#ff0000), named "my_cmap".
# It is passed as `cmap` to the seaborn heatmaps further down.
custom_cmap = LinearSegmentedColormap.from_list("my_cmap", ["#ffffff", "#ff0000"])

## Data reading

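We read our data from a Parquet file and clean it up: rows containing nulls are dropped, surrounding whitespace is stripped, the `pe` prefix is removed from the processing-element identifiers, the numeric columns are cast to numeric types, and events with a non-positive duration are filtered out.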

In [None]:
# Alternative input with the same column selection: "output_80_greedylb_196761.pq"
df = (
    pl.read_parquet("output_196946.pq", columns=["Parent", "Start", "End", "Duration", "Value"])
    .drop_nulls()
    # Every selected column is treated as text first: trim surrounding whitespace.
    .with_columns(pl.all().str.strip_chars())
    .with_columns(
        # "Parent" loses a leading "pe" and becomes a 32-bit integer id.
        pl.col("Parent").str.replace(r"^pe", "").cast(pl.Int32),
        # The remaining non-"Value" columns become 32-bit floats.
        pl.col("Start", "End", "Duration").cast(pl.Float32),
    )
    .rename({"Value": "Chare", "Parent": "Processing Element"})
    # Keep only events with a strictly positive duration.
    .filter(pl.col("Duration") > 0.0)
)

## Chare frequency chart
In this section we show a chart of the frequency of chares in this execution of the simulation.

In [None]:
def create_frequency_plot(dataframe: pl.DataFrame, filename="freq_plot.png"):
    """Save a horizontal bar chart of the number of rows recorded per chare.

    Args:
        dataframe: Frame with a "Chare" column; every row counts as one occurrence.
        filename: Target path handed to ``Figure.savefig``.
    """
    # Row count per chare (polars names the count column "len"), ascending by count.
    counts = dataframe.group_by("Chare").agg(pl.len())
    counts = counts.sort(by="len")

    figure, axes = plt.subplots()
    axes.barh(y=counts["Chare"], width=counts["len"])
    axes.set(xlabel="Frequency", ylabel="Chare")
    figure.savefig(filename)

In [None]:
# Draw the chare frequency chart for the cleaned trace; with the default argument it is saved as "freq_plot.png".
create_frequency_plot(df)

## Duration of each chare across all Processing Elements

In [None]:
def create_total_duration_plot(dataframe: pl.DataFrame, filename="duration_plot.png"):
    """Save a horizontal bar chart of the summed "Duration" per chare.

    Args:
        dataframe: Frame with "Chare" and "Duration" columns.
        filename: Target path handed to ``Figure.savefig``.
    """
    # Total duration per chare over all rows (i.e. across every processing element),
    # ascending by that total.
    totals = dataframe.group_by("Chare").agg(pl.sum("Duration"))
    totals = totals.sort(by="Duration")

    figure, axes = plt.subplots()
    axes.barh(y=totals["Chare"], width=totals["Duration"])
    # NOTE(review): the sums are plotted unscaled here, whereas the heatmap functions
    # multiply the summed duration by 0.001 — confirm that "ms" is the right unit.
    axes.set(xlabel="Duration (ms)", ylabel="Chare")
    figure.savefig(filename)

In [None]:
# Draw the per-chare total duration chart for the cleaned trace; with the default argument it is saved as "duration_plot.png".
create_total_duration_plot(df)

## Chare activity per Processing Element heatmap

In [None]:
def create_chare_activity_per_pe_heatmap(dataframe: pl.DataFrame, num_nodes=4, pes_per_node=20, filename="activity_per_pe_hm.png"):
    """Save a heatmap of the summed duration of each chare on each processing element.

    Rows are processing elements (ascending id), columns are chares, and each cell is
    the summed "Duration" of that (processing element, chare) pair multiplied by 0.001.

    Args:
        dataframe: Frame with "Processing Element", "Chare" and "Duration" columns.
        num_nodes: Number of nodes; with ``pes_per_node`` it gives the total number of
            processing elements used to place the node separators.
        pes_per_node: Processing elements per node; a dashed blue line is drawn after
            every ``pes_per_node`` rows.
        filename: Target path handed to ``Figure.savefig``.
    """
    # Aggregate the frame that was passed in (not the notebook-level `df`), so the
    # function works for any trace the caller supplies.
    agg_pe_chare_df = (
        dataframe.group_by(["Processing Element", "Chare"])
        .agg(pl.sum("Duration") * 0.001)
        .sort(["Processing Element", "Duration"])
    )
    heatmap_pe_chare_df = agg_pe_chare_df.pivot(index="Processing Element", on="Chare", values="Duration")
    # The id column is only the row key; drop it so only durations are coloured.
    heatmap_pe_chare_df = heatmap_pe_chare_df.drop("Processing Element")

    fig, ax = plt.subplots(figsize=(12, 8))
    sns.heatmap(
        heatmap_pe_chare_df,
        cmap=custom_cmap,
        cbar_kws={'label': 'Duration'},
        linecolor='none',
        xticklabels=heatmap_pe_chare_df.columns,
        ax=ax,
    )

    # Separate nodes visually.
    # NOTE(review): this assumes row i of the matrix is processing element i, i.e. ids
    # are contiguous from 0 and every processing element appears in the data — confirm.
    total_pes = num_nodes * pes_per_node
    for i in range(pes_per_node, total_pes, pes_per_node):
        ax.axhline(i, color='blue', linestyle='--', linewidth=2)

    ax.set_title("Processing Element Activity Heatmap")
    ax.set_xlabel("Chare")
    ax.set_ylabel("Processing Element")
    fig.savefig(filename)

In [None]:
# Draw the chare-per-processing-element heatmap with the default layout (4 nodes x 20 processing elements); saved as "activity_per_pe_hm.png".
create_chare_activity_per_pe_heatmap(df)

## Processing Element Activity across time

In [None]:
def bin_dataframe(dataframe: pl.DataFrame, bin_size = 1_000_000.0):
    """Assign each event's duration to fixed-width time bins.

    An event whose "Start" and "End" fall into the same bin contributes its whole
    "Duration" to that bin. An event that crosses a bin boundary contributes two rows:
    ``End % bin_size`` to the bin containing "End", and the rest of its "Duration" to
    the bin containing "Start".

    Limitation: an event is split across at most two bins (its start bin and its end
    bin). For an event spanning more than two bins, everything before the end bin is
    attributed to the start bin and the bins in between receive nothing.

    Args:
        dataframe: Frame with "Processing Element", "Chare", "Start", "End" and
            "Duration" columns.
        bin_size: Width of a time bin, in the same unit as "Start"/"End".

    Returns:
        A frame with the columns "Processing Element", "Chare", "Time Bin" (Int32 bin
        index) and "Duration" (Float64 share of the event inside that bin). Rows for
        the start-bin shares come first, followed by the end-bin shares.

    Raises:
        ValueError: If ``bin_size`` is not strictly positive.
    """
    if bin_size <= 0:
        raise ValueError("bin_size must be strictly positive")

    start_bin = pl.col("Start") // bin_size
    end_bin = pl.col("End") // bin_size
    # Different bin indices mean the event spills over a bin boundary.
    spills_over = start_bin != end_bin

    # Part of the event that lies inside the bin containing "End".
    end_share = pl.col("End") % bin_size
    # Whatever is left belongs to the bin containing "Start"; without spill-over that
    # is the full duration.
    start_share = (
        pl.when(spills_over)
        .then(pl.col("Duration") - end_share)
        .otherwise(pl.col("Duration"))
    )

    # One row per event for the start bin. Durations are widened to Float64 so that
    # later sums do not accumulate 32-bit rounding error.
    start_rows = dataframe.select(
        pl.col("Processing Element"),
        pl.col("Chare"),
        start_bin.cast(pl.Int32).alias("Time Bin"),
        start_share.cast(pl.Float64).alias("Duration"),
    )
    # One extra row for the end bin, only for events that actually spill over. No
    # sentinel value is needed, so the end share keeps its fractional part.
    end_rows = dataframe.filter(spills_over).select(
        pl.col("Processing Element"),
        pl.col("Chare"),
        end_bin.cast(pl.Int32).alias("Time Bin"),
        end_share.cast(pl.Float64).alias("Duration"),
    )
    return pl.concat([start_rows, end_rows])

In [None]:
def create_timeline_plot(dataframe: pl.DataFrame, num_nodes=4, pes_per_node=20, filename="timepline.png"):
    """Save a heatmap of processing element activity over time.

    The events are binned with ``bin_dataframe`` using a bin width of 50_000, then the
    binned durations are summed per (processing element, time bin) and multiplied by
    0.001. Rows are processing elements (ascending id) and columns are time bins in
    ascending order; bins in which no processing element was active do not appear.

    Args:
        dataframe: Frame with "Processing Element", "Chare", "Start", "End" and
            "Duration" columns.
        num_nodes: Number of nodes; with ``pes_per_node`` it gives the total number of
            processing elements used to place the node separators.
        pes_per_node: Processing elements per node; a dashed blue line is drawn after
            every ``pes_per_node`` rows.
        filename: Target path handed to ``Figure.savefig``. The default spelling
            ("timepline.png") is kept so existing output paths do not change.
    """
    binned_df = bin_dataframe(dataframe, bin_size = 50_000.0)
    agg_df = (
        binned_df.group_by(["Processing Element", "Time Bin"])
        .agg(pl.sum("Duration") * 0.001)
        # pivot() creates its columns in order of first appearance, so order the rows
        # by time bin first to get a chronological x axis.
        .sort(["Time Bin", "Processing Element"])
    )
    heatmap_df = (
        agg_df.pivot(index="Processing Element", on="Time Bin", values="Duration")
        # Restore ascending processing element order for the rows ...
        .sort("Processing Element")
        # ... and drop the id column so it is not drawn as if it were a time bin.
        .drop("Processing Element")
    )

    fig, ax = plt.subplots(figsize=(16, 8))
    sns.heatmap(
        heatmap_df,
        cmap=custom_cmap,
        cbar_kws={'label': 'Activity in bin'},
        linecolor='none',
        ax=ax,
    )

    # Separate nodes visually.
    # NOTE(review): this assumes row i of the matrix is processing element i, i.e. ids
    # are contiguous from 0 and every processing element appears in the data — confirm.
    total_pes = num_nodes * pes_per_node
    for i in range(pes_per_node, total_pes, pes_per_node):
        ax.axhline(i, color='blue', linestyle='--', linewidth=2)

    ax.set_title("Processing Element Activity Heatmap")
    ax.set_xlabel("Time Bin")
    ax.set_ylabel("Processing Element")
    fig.savefig(filename)

In [None]:
# Draw the activity-over-time heatmap with the default layout (4 nodes x 20 processing elements); saved as "timepline.png".
create_timeline_plot(df)

In [None]:
# Open a new, empty figure with a single axes.
# NOTE(review): presumably the canvas for the rectangle-based timeline assembled in the following cells — confirm.
fig, ax = plt.subplots()

In [None]:
# Preview the first rows of the cleaned frame.
df.head()

In [None]:
# Pull the per-event columns out of the cleaned frame as NumPy arrays.
pe, start, end, durations = (
    df[column_name].to_numpy()
    for column_name in ("Processing Element", "Start", "End", "Duration")
)

In [None]:
# One rectangle per event: lower-left corner at (start time, processing element),
# as wide as the event's duration and 0.9 tall.
rects = [
    patches.Rectangle((left, lane), width, 0.9)
    for left, lane, width in zip(start, pe, durations)
]

In [None]:
# Give every rectangle a face colour determined by its chare, so equal chares share a
# colour. `rects` was built row by row from `df`, so the "Chare" column lines up with it.
# NOTE(review): colouring by chare is a choice made here because no `colors` definition
# is visible in the preceding cells — adjust if a different scheme was intended.
chare_names = df["Chare"].to_list()
unique_chares = sorted(set(chare_names))
chare_palette = dict(zip(unique_chares, sns.color_palette("husl", len(unique_chares))))
colors = [chare_palette[name] for name in chare_names]
collection = PatchCollection(rects, facecolor=colors, edgecolor='black')

## Imbalance Heatmap