# SageMaker Debugger profiling report
This report summarizes the execution of the profiler's built-in rules. The profiler runs a set of rules while the training job is in progress, and each rule identifies a specific kind of performance issue. This notebook describes each rule and gives details about its execution.

In [None]:
import json
import pandas as pd
import glob
import matplotlib.pyplot as plt
import numpy as np
from smdebug.profiler.utils import us_since_epoch_to_human_readable_time, ns_since_epoch_to_human_readable_time


In [None]:
import bokeh
from bokeh.io import output_notebook, show
from bokeh.layouts import column, row
from bokeh.plotting import figure
from bokeh.models.widgets import DataTable, DateFormatter, TableColumn
from bokeh.models import ColumnDataSource, PreText
from math import pi
from bokeh.transform import cumsum
import warnings
from bokeh.models.widgets import Paragraph
from bokeh.models import Legend
from bokeh.util.warnings import BokehDeprecationWarning, BokehUserWarning
warnings.simplefilter('ignore', BokehDeprecationWarning)
warnings.simplefilter('ignore', BokehUserWarning)

output_notebook(hide_banner=True)

In [None]:
def create_piechart(data_dict, title=None, height=400, width=400, x1=0, x2=0.1, radius=0.4, toolbar_location='right'):
    """Build a Bokeh pie chart from a mapping of slice labels to numeric values.

    Args:
        data_dict: Mapping of label -> value. Each wedge spans the fraction of
            the full circle that its value contributes to the sum of all values.
        title: Optional figure title.
        height: Figure height, forwarded as ``plot_height``.
        width: Figure width, forwarded as ``plot_width``.
        x1: Extra horizontal room left of the pie (x range starts at ``-radius - x1``).
        x2: Extra horizontal room right of the pie (x range ends at ``radius + x2``).
        radius: Radius of the wedges.
        toolbar_location: Forwarded to ``figure``; ``None`` hides the toolbar.

    Returns:
        The configured Bokeh figure (not yet shown).
    """
    # One row per slice: the label lands in 'phase', the number in 'value'.
    frame = pd.Series(data_dict).reset_index(name='value').rename(columns={'index':'phase'})
    share = frame['value']/frame['value'].sum()
    frame['angle'] = share * 2*pi
    frame['color'] = bokeh.palettes.viridis(len(data_dict))

    chart = figure(
        plot_height=height,
        plot_width=width,
        toolbar_location=toolbar_location,
        tools="hover,wheel_zoom,reset,pan",
        tooltips="@phase:@value",
        title=title,
        x_range=(-radius-x1, radius+x2),
    )

    # Consecutive wedges: each one starts where the running sum of the
    # previous angles ends.
    chart.wedge(
        x=0,
        y=0.,
        radius=radius,
        start_angle=cumsum('angle', include_zero=True),
        end_angle=cumsum('angle'),
        line_color="white",
        source=frame,
        fill_color='color',
        legend='phase',
    )

    # Strip axes, grid and outline so only the pie and its legend remain.
    chart.legend.label_text_font_size = "8pt"
    chart.legend.location = 'center_right'
    chart.axis.axis_label = None
    chart.axis.visible = False
    chart.grid.grid_line_color = None
    chart.outline_line_color = "white"

    return chart

In [None]:
from IPython.display import display, HTML, Markdown, Image
def pretty_print(df):
    """Display *df* in the notebook as an HTML table with left-aligned rows.

    The literal two-character sequence backslash + ``n`` found in the
    generated HTML is replaced by ``<br>``, and every plain ``<tr>`` tag gets
    a left-align style.

    Returns:
        Whatever ``display`` returns for the rendered HTML object.
    """
    markup = df.to_html()
    markup = markup.replace("\\n", "<br>")
    markup = markup.replace('<tr>', '<tr style="text-align: left;">')
    return display(HTML(markup))

## Training job summary

In [None]:
def load_report(rule_name):
    """Load the JSON report written by a profiler rule.

    For every rule except 'MaxInitializationTime', 'OverallSystemUsage' and
    'OverallFrameworkMetrics', the report's 'RuleTriggered' and 'Datapoints'
    values are also displayed as Markdown.

    Args:
        rule_name: Name of the rule; the report is read from
            ``/opt/ml/processing/outputs/profiler-reports/<rule_name>.json``.

    Returns:
        The parsed report (a dict), or ``None`` when the report file does not
        exist (a "<rule_name> not triggered" line is printed in that case).
    """
    path = '/opt/ml/processing/outputs/profiler-reports/' + rule_name + '.json'
    try:
        # Context manager so the file handle is closed even if parsing fails.
        with open(path) as report_file:
            report = json.load(report_file)
    except FileNotFoundError:
        print(rule_name + ' not triggered')
        return None

    if rule_name not in ('MaxInitializationTime', 'OverallSystemUsage', 'OverallFrameworkMetrics'):
        triggered = report['RuleTriggered']
        datapoints = report['Datapoints']
        display(Markdown(f"""The number of times the {rule_name} rule triggered: {triggered}"""))
        display(Markdown(f"""The number of events processed by {rule_name} rule: {datapoints}"""))
    return report

In [None]:

# Collect job-level timing statistics from the MaxInitializationTime report.
# job_start / job_end are multiplied by 1e6 before being passed to the
# microsecond formatter, while the step timestamps are passed to it unchanged.
job_statistics = {}
report = load_report('MaxInitializationTime')
if report:
    job_details = report['Details']
    job_start = job_details['job_start']
    job_end = job_details['job_end']
    total_seconds = job_end - job_start

    job_statistics["start_time"] = us_since_epoch_to_human_readable_time(job_start * 1000000)
    job_statistics["end_time"] = us_since_epoch_to_human_readable_time(job_end * 1000000)
    job_statistics["job_duration_in_seconds"] = total_seconds

    # Step information is only present when "first" was recorded under step_num.
    if "first" in job_details["step_num"]:
        first_step = job_details["step_num"]["first"]
        last_step = job_details["step_num"]["last"]

        job_statistics["training_loop_start"] = us_since_epoch_to_human_readable_time(first_step)
        job_statistics["training_loop_end"] = us_since_epoch_to_human_readable_time(last_step)
        job_statistics["training_loop_duration_in_seconds"] = (last_step - first_step) / 1000000
        job_statistics["initialization_in_seconds"] = first_step/1000000 - job_start
        job_statistics["finalization_in_seconds"] = np.abs(job_end - last_step/1000000)

        # Share of the total job duration spent in each of the three phases.
        job_statistics["initialization_%"] = job_statistics["initialization_in_seconds"] / total_seconds * 100
        job_statistics["training_loop_%"] = job_statistics["training_loop_duration_in_seconds"] / total_seconds * 100
        job_statistics["finalization_%"] = job_statistics["finalization_in_seconds"] / total_seconds * 100


In [None]:
import datetime

# Build the introductory paragraph for the job summary and clamp negative
# phase percentages to zero.
text =  """The following table gives a summary about the training job. The tables includes information about when the training job started and ended, how much time initialization, training loop and finalization took."""
if job_statistics:
    df = pd.DataFrame.from_dict(job_statistics, orient='index')
    start_time = df[0]['start_time']
    # The format string must match the string stored under 'start_time'
    # (note the ':' before the fractional seconds).
    date = datetime.datetime.strptime(start_time, '%Y-%m-%dT%H:%M:%S:%f')
    day = date.date().strftime("%m/%d/%Y")
    hour = date.time().strftime("%H:%M:%S")
    duration = int(df[0]['job_duration_in_seconds'])
    # The paragraph is built exactly once; the former `else` branch formatted
    # this Paragraph object into a second f-string, which embedded the
    # widget's repr in the text instead of the sentence.
    text = Paragraph(text=f"""{text} \n Your training job started on {day} at {hour} and ran for {duration} seconds.""")

    if "first" in report['Details']["step_num"]:
        # Negative shares are clamped to zero.
        for share_key in ("finalization_%", "training_loop_%", "initialization_%"):
            if job_statistics[share_key] < 0:
                job_statistics[share_key] = 0


In [None]:
# Show the job statistics table and, when step information was recorded,
# a pie chart with the initialization / training loop / finalization shares.
plot = None
# df, day, hour and duration are only defined by the previous cell when
# job_statistics is non-empty, so nothing is rendered without them.
if job_statistics:
    df2 = df.reset_index()
    df2.columns = ["0", "1"]
    source = ColumnDataSource(data=df2)
    columns = [TableColumn(field='0', title=""),
               TableColumn(field='1', title="Job Statistics"),]
    table = DataTable(source=source, columns=columns, width=450, height=380)

    if "initialization_%" in job_statistics:
        piechart_data = {}
        piechart_data["initialization_%"] = job_statistics["initialization_%"]
        piechart_data["training_loop_%"] = job_statistics["training_loop_%"]
        piechart_data["finalization_%"] = job_statistics["finalization_%"]

        plot = create_piechart(piechart_data,
                               height=350,
                               width=500,
                               x1=0.15,
                               x2=0.15,
                               radius=0.15,
                               toolbar_location=None)

    text = f"""Your training job started on {day} at {hour} and ran for {duration} seconds."""
    if plot is not None:
        text = Paragraph(text=f"""{text}""", width = 800)
        show(column(text, row(table, plot)))
    else:
        # The sentence above already ends with a period; do not add another one.
        text = Paragraph(text=f"""{text} No step information was recorded in your training job, so initialization and finalization time cannot be computed.""" , width = 800)
        show(column(text, row(table)))

## System usage statistics

In [None]:
# Load the OverallSystemUsage report; load_report returns None when the
# report file does not exist.
report = load_report('OverallSystemUsage')

In [None]:
# Build one utilization verdict per node from the 95th percentiles in the
# OverallSystemUsage report. Messages are collected across all nodes instead
# of being reset on every iteration, and a missing report yields no messages.
node_messages = []
usage_details = report["Details"] if report else {}
if "GPU" in usage_details:
    for node_id in usage_details["GPU"]:
        gpu_p95 = usage_details["GPU"][node_id]["p95"]
        cpu_p95 = usage_details["CPU"][node_id]["p95"]
        if gpu_p95 < 70 and cpu_p95 < 70:
            node_messages.append(f"""The 95th quantile of the total GPU utilization on node {node_id} is only {int(gpu_p95)}%. 
            The 95th quantile of the total CPU utilization is only {int(cpu_p95)}%. Node {node_id} is under-utilized. 
            You may want to consider switching to a smaller instance type.""")
        elif gpu_p95 < 70 and cpu_p95 > 70:
            node_messages.append(f"""The 95th quantile of the total GPU utilization on node {node_id} is only {int(gpu_p95)}%. 
            However, the 95th quantile of the total CPU utilization is {int(cpu_p95)}%. GPUs on node {node_id} are under-utilized 
            likely because of CPU bottlenecks""")
        elif gpu_p95 > 70:
            node_messages.append(f"""The 95th quantile of the total GPU utilization on node {node_id} is {int(gpu_p95)}%. 
            GPUs on node {node_id} are well utilized""")
        else:
            # Reached when a percentile is exactly 70.
            node_messages.append(f"""The 95th quantile of the total GPU utilization on node {node_id} is {int(gpu_p95)}%. 
            The 95th quantile of the total CPU utilization is {int(cpu_p95)}%.""")
else:
    # No GPU metrics in the report: only CPU utilization can be judged.
    for node_id in usage_details.get("CPU", {}):
        cpu_p95 = usage_details["CPU"][node_id]["p95"]
        if cpu_p95 > 70:
            node_messages.append(f"""The 95th quantile of the total CPU utilization on node {node_id} is {int(cpu_p95)}%. CPUs on node {node_id} are well utilized""")
text1 = Paragraph(text=" ".join(node_messages), width=1100)
text2 = Paragraph(text=f"""The following table shows usage statistics per worker node such as total CPU and GPU 
utilization, total CPU and memory footprint. The table also include total IO wait time and total sent/received bytes.
The table shows min and max values as well as p99, p95 and p50 percentiles.""", width=900)


In [None]:
# Flatten the system usage report into one table row per (metric, node).
pd.set_option('display.float_format', lambda x: '%.2f' % x)
rows = []
units = {"CPU": "percentage", "CPU memory": "percentage", "GPU": "percentage", "Network": "bytes", "GPU memory": "percentage", "I/O": "percentage"}
if report:
    for metric, nodes in report['Details'].items():
        for node_id, values in nodes.items():
            # Metrics without a known unit get an empty unit instead of raising KeyError.
            rows.append([node_id, metric, units.get(metric, ""), values['max'], values['p99'], values['p95'], values['p50'], values['min']])

# Passing the column names to the constructor also works for zero rows
# (assigning df.columns afterwards raises a length mismatch on an empty frame),
# and building the frame unconditionally avoids reusing a stale df from an
# earlier cell when the report is missing.
df = pd.DataFrame(rows, columns=['Node', 'metric', 'unit', 'max', 'p99', 'p95', 'p50', 'min'])

In [None]:
# Render the per-node usage statistics as a Bokeh data table beneath the two
# explanatory paragraphs. The table height is 30 units per data row.
df2 = df.reset_index()
source = ColumnDataSource(data=df2)
column_specs = (
    ('Node', "node"),
    ('metric', "metric"),
    ('unit', "unit"),
    ('max', "max"),
    ('p99', "p99"),
    ('p95', "p95"),
    ('p50', "p50"),
    ('min', "min"),
)
columns = [TableColumn(field=field_name, title=header) for field_name, header in column_specs]
table = DataTable(source=source, columns=columns, width=800, height=df2.shape[0]*30)

show(column(text1, text2, row(table)))

In [None]:
def display_image(image_name):
    """Display every image in the profiler-reports directory matching *image_name*.

    Args:
        image_name: File name or glob pattern, appended to
            ``/opt/ml/processing/outputs/profiler-reports/``.
    """
    pattern = '/opt/ml/processing/outputs/profiler-reports/' + image_name
    for image_path in glob.glob(pattern):
        display(Image(filename=image_path))

In [None]:
# Framework metrics summary: pie charts for the phase ratio, forward/backward
# pass, CPU/GPU operator ratio, general framework metrics and Horovod metrics.
report = load_report('OverallFrameworkMetrics')
# load_report returns None when the report file is missing, so check the
# report itself before looking for 'Details'.
if report and 'Details' in report:
    
    display(Markdown(f"""## Framework metrics summary"""))
    plots = []
    text = ''
    if 'phase' in report['Details']:
        text = f"""The following piecharts show how much time your training job spent in "training", "validation" phase or "others".
        Latter one is the accumulated time between steps, so when one step has finished but the new step has not started yet.
        Ideally most time should be spent in training steps."""

        if 'others' in report['Details']['phase']:
            others = float(report['Details']['phase']['others'])

            if others > 25:
                text = f"""{text} Your training job spent quite a significant amount of time ({round(others,2)}%) in phase "others".
                You should check what is happening in between the steps."""

            plot = create_piechart(report['Details']['phase'], 
                                height=350,
                                width=600,
                                x1=0.2,
                                x2=0.6,
                                radius=0.3, 
                                title="Ratio between TRAIN/EVAL phase and others")
            plots.append(plot)

    # .get(...) truthiness skips both a missing key and an empty dict; max()
    # on an empty dict would raise ValueError.
    if report['Details'].get('forward_backward'):

        event = max(report['Details']['forward_backward'], key=report['Details']['forward_backward'].get)
        perc = report['Details']['forward_backward'][event]

        text = f"""{text} The piechart on the right shows a more detailed breakdown. 
        It shows that {int(perc)}% of the time was spent in event {event}"""

        if perc > 70:
            text = f"""{text} The following piecharts shows that {int(perc)}% of your training 
            was spent in "{event}". There is quite a significant difference between the time spent in forward and backward
            pass."""
        else:
            text = f"""{text} The following piecharts shows that {int(perc)}% of your training 
            was spent in "{event}"."""

        plot = create_piechart(report['Details']['forward_backward'], 
                            height=350,
                            width=600,
                            x1=0.2,
                            x2=0.6,
                            radius=0.3, 
                            title="Ratio between forward and backward pass") 
        plots.append(plot)

    if len(plots) > 0:
        paragraph = Paragraph(text=text, width=1100)
        show(column(paragraph, row(plots)))

    plots = []
    text=''
    if report['Details'].get('ratio'):

        # The text reports the first entry of the ratio dict.
        key = list(report['Details']['ratio'].keys())[0]
        ratio = report['Details']['ratio'][key]

        text = f"""The following piechart shows a breakdown of the CPU/GPU operators. 
            It shows that {int(ratio)}% of the time was spent in executing operators on "{key}"."""

        plot = create_piechart(report['Details']['ratio'], 
                                height=350,
                                width=600,
                                x1=0.2,
                                x2=0.6,
                                radius=0.3, 
                                title="Ratio between CPU/GPU operators")
        plots.append(plot)


    if report['Details'].get('general'):
        event = max(report['Details']['general'], key=report['Details']['general'].get)
        perc = report['Details']['general'][event]
        # The former unused lookup of report['Details']['phase']['others'] was
        # removed here: it raised KeyError whenever phase data was absent.

        plot = create_piechart(report['Details']['general'], 
                            height=350,
                            width=600,
                            x1=0.2,
                            x2=0.6,
                            radius=0.3, 
                            title="General metrics recorded in framework ")
        plots.append(plot)

    if len(plots) > 0:
        paragraph = Paragraph(text=text, width=1100)
        show(column(paragraph, row(plots)))

    plots = []
    text = ''
    if report['Details'].get('horovod'):
        display(Markdown(f"""#### Overview: Horovod metrics"""))
        event = max(report['Details']['horovod'], key=report['Details']['horovod'].get)
        perc = report['Details']['horovod'][event]
        text = f"""{text} The following piechart shows a detailed breakdown of the Horovod metrics that have been recorded
        in your training job. The most expensive function was "{event}" with {int(perc)}%"""

        plot = create_piechart(report['Details']['horovod'], 
                            height=350,
                            width=600,
                            x1=0.2,
                            x2=0.6,
                            radius=0.3, 
                            title="Horovod metrics ")

        paragraph = Paragraph(text=text, width=1100)
        show(column(paragraph, row(plot)))


In [None]:
# Overview of CPU operators: table of percentage / cumulative time per
# operator next to a pie chart of the percentages.
pd.set_option('display.float_format', lambda x: '%.2f' % x)
rows = [] 
values = []
# Skip the section when the report is missing (load_report returned None) or
# has no CPU operator data; max() below would fail on an empty dict.
if report and 'CPU_total' in report['Details'] and report['Details'].get('CPU'):
    display(Markdown(f"""#### Overview: CPU operators"""))
    cpu_percentages = report['Details']['CPU']
    event = max(cpu_percentages, key=cpu_percentages.get)
    perc = cpu_percentages[event]

    for function in cpu_percentages:
        percentage = round(cpu_percentages[function],2)
        time = report['Details']['CPU_total'][function]
        rows.append([percentage, time, function])

    df = pd.DataFrame(rows, columns=['percentage', 'time', 'operator'])

    # Most expensive operators first.
    df = df.sort_values(by=['percentage'], ascending=False)
    source = ColumnDataSource(data=df)
    columns = [TableColumn(field='percentage', title="Percentage"),
               TableColumn(field='time', title="Cumulative time"),
               TableColumn(field='operator', title="CPU operator"),]

    table = DataTable(source=source, columns=columns, width=550, height=350)

    text = Paragraph(text=f"""The following table shows a list of operators that your training job run on CPU.
    The most expensive operator on CPU was "{event}" with {int(perc)} %""")

    plot = create_piechart(cpu_percentages,
                            height=350,
                            width=600,
                            x1=0.2,
                            x2=0.6,
                            radius=0.3, 
                           )

    show(column(text, row(table, plot)))


In [None]:
# Overview of GPU operators: table of percentage / cumulative time per
# operator next to a pie chart of the percentages.
pd.set_option('display.float_format', lambda x: '%.2f' % x)
rows = [] 
values = []
# Skip the section when the report is missing (load_report returned None) or
# has no GPU operator data; max() below would fail on an empty dict.
if report and 'GPU_total' in report['Details'] and report['Details'].get('GPU'):
    display(Markdown(f"""#### Overview: GPU operators"""))
    gpu_percentages = report['Details']['GPU']
    event = max(gpu_percentages, key=gpu_percentages.get)
    perc = gpu_percentages[event]

    for function in gpu_percentages:
        percentage = round(gpu_percentages[function],2)
        time = report['Details']['GPU_total'][function]
        rows.append([percentage, time, function])

    df = pd.DataFrame(rows, columns=['percentage', 'time', 'operator'])

    # Most expensive operators first.
    df = df.sort_values(by=['percentage'], ascending=False)
    source = ColumnDataSource(data=df)
    columns = [TableColumn(field='percentage', title="Percentage"),
               TableColumn(field='time', title="Cumulative time"),
               TableColumn(field='operator', title="GPU operator"),]
    table = DataTable(source=source, columns=columns, width=450, height=350)

    text = Paragraph(text=f"""The following table shows a list of operators that your training job run on GPU.
    The most expensive operator on GPU was "{event}" with {int(perc)} %""")

    plot = create_piechart(gpu_percentages,
                            height=350,
                            width=600,
                            x1=0.2,
                            x2=0.6,
                            radius=0.3, 
                           )

    show(column(text, row(table, plot)))

## Rules summary

In [None]:
# Human-readable description of each profiler rule, keyed by rule name.
description = {
    'CPUBottleneck': 'Checks if CPU usage is high but GPU usage is low at the same time, it may indicate a CPU bottleneck where GPU is waiting for data to arrive from CPU. The rule triggers if number of CPU bottlenecks exceeds a predefined threshold.',
    'IOBottleneck': 'If IO wait time is high but at the same time GPU usage is low, it may indicate an IO bottleneck where GPU is waiting for data to arrive from disk. The rule triggers if number of IO bottlenecks exceeds a predefined threshold.',
    # 'Dataloaders' is intentionally left without an entry:
    # 'Dataloaders': 'Checks how many data-loader processes are running in parallel and whether the total number is equal the number of available CPU cores. The rule triggers if number is much smaller or larger than the number of available cores. If too small, it may lead to low GPU utilization. If too large it may lead to too many context switches on CPU.',
    'GPUMemoryIncrease': 'If model and/or batch size is too large then training will run out of memory and crash.',
    'BatchSize': 'Checks if GPU is under-utilized because of the batch size being too small. To detect this the rule analyzes the average GPU memory footprint, CPU and GPU utilization. ',
    'LowGPUUtilization': 'Checks if GPU utilization is low or suffers from fluctuations. This can happen if there are bottlenecks, many blocking calls due to synchronizations or batch size too small.',
    'MaxInitializationTime': 'Checks if the training initialization is taking too much time. The rule waits until first step is available. This can happen if you are running in File mode and a lot of data needs to be downloaded from Amazon S3.',
    'LoadBalancing': 'Detect issues in workload balancing between multiple GPUs. Workload imbalance can for instance occur in data parallel training when gradients are accumulated on primary GPU so this GPU will be overused with regards to other GPUs limiting the effect of parallelization.  ',
    'StepOutlier': 'Detect outliers in step duration. Time for forward and backward pass should be roughly the same throughout the training. If there are significant outliers it would indicate an issue due to a system stall or a bottleneck.',
}

In [None]:
# Suggested remedy for each profiler rule, keyed by rule name.
recommendation = {
    'CPUBottleneck': 'CPU bottlenecks can happen when data preprocessing is very compute intensive. You should consider increasing the number of data-loader processes or apply pre-fetching.',
    'IOBottleneck': 'Pre-fetch data or choose different file formats such as binary formats which improves read performance.',
    # 'Dataloaders' is intentionally left without an entry:
    # 'Dataloaders': 'Increase or decrease the number of data-loader processes',
    'GPUMemoryIncrease': 'Choose a larger instance type with more memory (if it is not a memory leak) or apply model parallelism (Rubik)',
    'BatchSize': 'Run on a smaller instance type or increase batch size',
    'LowGPUUtilization': 'Check for bottlenecks, minimize blocking calls, change distributed training strategy, increase batch-size.',
    'MaxInitializationTime': 'Switch from File to Pipe mode',
    'LoadBalancing': 'Choose different distributed training strategy or different distributed training framework',
    'StepOutlier': 'Check for bottlenecks',
}

In [None]:
# Rules summary: one table row per rule report found on disk, sorted by how
# often the rule triggered.
files = glob.glob('/opt/ml/processing/outputs/profiler-reports/*json')
summary = {}
for i in files:
    rule_name = i.split('/')[-1].replace('.json','')
    # The two aggregate reports are not rules and are left out of the table.
    if rule_name in ("OverallSystemUsage", "OverallFrameworkMetrics"):
        continue
    # Context manager so each report file is closed after parsing.
    with open(i) as rule_file:
        rule_report = json.load(rule_file)
    summary[rule_name] = {
        # Rules without an entry in description / recommendation get an empty
        # cell instead of aborting the whole summary with a KeyError.
        'Description': description.get(rule_name, ''),
        'Recommendation': recommendation.get(rule_name, ''),
        'Number of times rule triggered': rule_report['RuleTriggered'],
        'Number of datapoints': rule_report['Datapoints'],
        'Rule parameters': rule_report['RuleParameters'],
    }

if summary:
    df = pd.DataFrame.from_dict(summary, orient='index')
    df = df.sort_values(by=['Number of times rule triggered'], ascending=False)

    # Look the top row's values up by column name rather than by position.
    top_rule = df.iloc[0]
    display(Markdown(f"""The following table shows a summary of the executed profiler rules. 
The table is sorted by the rules that triggered most frequently. In your training job this was the case
for rule {df.index[0]}. It has processed {top_rule['Number of datapoints']} datapoints and triggered {top_rule['Number of times rule triggered']} times."""))
    with pd.option_context('display.colheader_justify','left'):    
        pretty_print(df)
else:
    # Without any rule report there is no column to sort by and no first row.
    display(Markdown("""No profiler rule reports were found."""))

In [None]:
# Decide which phase the rest of the report analyses: "initialization" when
# it lasted longer than the training loop, otherwise "training" (also the
# default when no step information is available).
analyse_phase = "training"
if job_statistics and "initialization_in_seconds" in job_statistics:
    if job_statistics["initialization_in_seconds"] > job_statistics["training_loop_duration_in_seconds"]:
        analyse_phase = "initialization"
        time = job_statistics["initialization_in_seconds"]
        perc = job_statistics["initialization_%"]
        # The message names the phase that actually dominated; previously it
        # claimed the training loop took the most time and was followed by the
        # contradictory training-loop message below.
        display(Markdown(f"""The initialization phase lasted for {int(time)} seconds which is {int(perc)}%
        of the training job time. Since the initialization has taken the most time, 
        we dive deep into the events occurring during this phase"""))
        display(Markdown("""### Analyzing initialization\n\n"""))
    else:
        time = job_statistics["training_loop_duration_in_seconds"]
        perc = job_statistics["training_loop_%"]
        display(Markdown(f"""The training loop lasted for {int(time)} seconds which is {int(perc)}% of the training job time.
                    Since the training loop has taken the most time, we dive deep into the events occurring during this phase"""))
if analyse_phase == 'training':
    display(Markdown("""### Analyzing the training loop\n\n"""))

In [None]:
# Only shown when the initialization phase was selected for analysis above.
if analyse_phase == "initialization":
    display(Markdown("""### MaxInitializationTime\n\nThis rule helps to detect if the training initialization is taking too much time. \nThe rule waits until first step is available. The rule takes the parameter `threshold` that defines how many minutes to wait for the first step to become available. Default is 20 minutes.\nYou can run the rule locally in the following way:
    """))
    
    # The return value is discarded; the report is loaded without being used here.
    _ = load_report("MaxInitializationTime")

In [None]:
# Step duration analysis based on the StepOutlier rule report: rule
# parameters, per-node step statistics, a histogram of step durations and,
# when the rule triggered, pie charts of the framework metrics recorded
# during the outliers.
if analyse_phase == "training":
    display(Markdown("""#### Step duration analysis"""))
    report = load_report('StepOutlier')

# load_report returns None when the report file is missing; everything below
# indexes into the report, so it only runs when one was loaded.
if analyse_phase == "training" and report:
    # Start from an empty list rather than whatever an earlier cell left in `plots`.
    plots = []
    parameters = report['RuleParameters']
    # NOTE(review): the parameters are picked by line position in the
    # newline-separated 'RuleParameters' string -- confirm the order is stable.
    params = parameters.split('\n')
    stddev = params[3].split(':')[1]
    mode = params[1].split(':')[1]
    n_outlier = params[2].split(':')[1]
    display(Markdown(f"""The StepOutlier rule measures step durations and checks for outliers.\nThe rule \
    returns `True` if duration is larger than {stddev} times the standard deviation. The rule \
    also takes the parameter `mode`, that specifies whether steps from training or validation phase \
    should be checked. In your processing job `mode` was specified as {mode}. \
    Typically the first step is taking significantly more time and to avoid the \
    rule triggering immediately, one can use `n_outliers` to specify the number of outliers to ignore. \
    `n_outliers` was set to {n_outlier}.\n 
    """))

    if len(report['Details']['step_details']) > 0:
        for node_id in report['Details']['step_details']:
            tmp = report['RuleParameters'].split('threshold:')
            threshold = tmp[1].split('\n')[0]
            n_outliers = report['Details']['step_details'][node_id]['number_of_outliers']
            mean = report['Details']['step_details'][node_id]['step_stats']['mean']
            stddev = report['Details']['step_details'][node_id]['stddev']
            phase = report['Details']['step_details'][node_id]['phase']
            display(Markdown(f"""**Step durations on node {node_id}:**"""))
            display(Markdown(f"""The following table is a summary of the statistics of step durations measured on node {node_id}.
            The rule has analyzed the step duration from {phase} phase.
            The average step duration on node {node_id} was {round(mean, 2)}s. 
            The rule detected {n_outliers} outliers, where step duration was larger than {threshold} times the standard deviation of {stddev}s
                             \n"""))
            step_stats_df = pd.DataFrame.from_dict(report['Details']['step_details'][node_id]['step_stats'], orient='index').T
            step_stats_df.index = ['Step Durations in [s]']
            pretty_print(step_stats_df)
        
        display(Markdown(f"""The following histogram shows the step durations measured on the different nodes. 
            You can enable/disable the visualization of a histogram by clicking on the label in the legend."""))

        plot = figure(plot_height=450, 
                          plot_width=850, 
                          x_range=(-1,100),
                          title=f"""Step durations""")  
        
        colors = bokeh.palettes.viridis(len(report['Details']['step_details']))
        
        for index, node_id in enumerate(report['Details']['step_details']):
            probs = report['Details']['step_details'][node_id]['probs']
            # NOTE(review): binedges is read but the bars are drawn on fixed
            # 2-unit bins from 0 to 98 -- confirm the rule emits matching bins.
            binedges = report['Details']['step_details'][node_id]['binedges']
           
            plot.quad( top=probs,
                    bottom=0,
                    left=np.arange(0,98,2),
                    right=np.arange(2,100,2),
                    line_color="white",
                    fill_color=colors[index],
                    fill_alpha=0.7,
                    legend=node_id)
            
        plot.add_layout(Legend(), 'right')    
        plot.y_range.start = 0
        # The histogram shows step durations, not utilization.
        plot.xaxis.axis_label = f"""Step durations in [s]"""
        plot.yaxis.axis_label = "Occurrences"
        plot.grid.grid_line_color = "white"
        plot.legend.click_policy="hide"
        plot.legend.location = 'center_right'
        show(plot)
        
    if report['RuleTriggered'] > 0:
                        
        text=f"""To get a better understanding of what may have caused those outliers,
        we correlate the timestamps of step outliers with other framework metrics that happened at the same time.
        The left chart shows how much time was spent in the different framework
        metrics aggregated by event phase. The chart on the right shows the histogram of normal step durations (without
        outliers). The following chart shows how much time was spent in the different 
        framework metrics when step outliers occurred. In this chart framework metrics are not aggregated by event phase."""
        
        if 'phase' in report['Details']:
            text = f"""{text} The chart (in the middle) shows whether step outliers mainly happened during CPU bottlenecks
            """

            plot = create_piechart(report['Details']['phase'], 
                                height=350,
                                width=600,
                                x1=0.2,
                                x2=0.6,
                                radius=0.3, 
                                title="Ratio between TRAIN/EVAL phase and others")
            plots.append(plot)

            if 'forward_backward' in report['Details'] and  len(report['Details']['forward_backward']) > 0:

                event = max(report['Details']['forward_backward'], key=report['Details']['forward_backward'].get)
                perc = report['Details']['forward_backward'][event]

                text = f"""{text} The piecharts on the right shows a more detailed breakdown. 
                It shows that {int(perc)}% of the time was spent in event {event}"""

                plot = create_piechart(report['Details']['forward_backward'], 
                                    height=350,
                                    width=600,
                                    x1=0.2,
                                    x2=0.6,
                                    radius=0.3, 
                                    title="Ratio between forward and backward pass") 
                plots.append(plot)

            if len(plots) > 0:
                paragraph = Paragraph(text=text, width=900)
                show(column(paragraph, row(plots)))

            plots = []
            text = ""
            if 'ratio' in report['Details'] and len(report['Details']['ratio']) > 0:

                key = list(report['Details']['ratio'].keys())[0]
                ratio = report['Details']['ratio'][key]

                text = f"""The following piechart shows a breakdown of the CPU/GPU operators that happened during step outliers. 
                    It shows that {int(ratio)}% of the time was spent in executing operators in {key}."""

                plot = create_piechart(report['Details']['ratio'], 
                                        height=350,
                                        width=600,
                                        x1=0.2,
                                        x2=0.6,
                                        radius=0.3, 
                                        title="Ratio between CPU/GPU operators")
                plots.append(plot)


            if 'general' in report['Details'] and len(report['Details']['general']) > 0:

                event = max(report['Details']['general'], key=report['Details']['general'].get)
                perc = report['Details']['general'][event]

                plot = create_piechart(report['Details']['general'], 
                                    height=350,
                                    width=600,
                                    x1=0.2,
                                    x2=0.6,
                                    radius=0.3, 
                                    title="General metrics recorded in framework ")
                plots.append(plot)

            if len(plots) > 0:
                paragraph = Paragraph(text=text, width=900)
                show(column(paragraph, row(plots)))

            plots = []
            text = ""
            if 'horovod' in report['Details'] and len(report['Details']['horovod']) > 0:

                event = max(report['Details']['horovod'], key=report['Details']['horovod'].get)
                perc = report['Details']['horovod'][event]
                text = f"""The following piechart shows a detailed breakdown of the Horovod metrics that have been
                recorded when step outliers happened. The most expensive function was {event} with {int(perc)}%"""

                # Title matches the Horovod chart in the framework metrics summary
                # (it previously carried the copy-pasted "General metrics" title).
                plot = create_piechart(report['Details']['horovod'], 
                                    height=350,
                                    width=600,
                                    x1=0.2,
                                    x2=0.6,
                                    radius=0.3, 
                                    title="Horovod metrics ")

                paragraph = Paragraph(text=text, width=900)
                show(column(paragraph, row(plot)))      

In [None]:
if analyse_phase == "training":
    # Report section for the LowGPUUtilization rule: a summary paragraph,
    # followed (when the rule recorded details) by one boxplot per node.
    display(Markdown("""#### GPU utilization analysis\n\n"""))
    report = load_report('LowGPUUtilization')
    if report:
        # RuleParameters holds one "name:value" entry per line; the values are
        # read by position: p95 threshold, p5 threshold, window, patience.
        params = report['RuleParameters'].split('\n')
        threshold_p95 = params[0].split(':')[1]
        threshold_p5 = params[1].split(':')[1]
        window = params[2].split(':')[1]
        patience = params[3].split(':')[1]
        violations = report['Violations']
        text = Paragraph(text=f"""The LowGPUUtilization rule checks for low and fluctuating GPU usage. If usage is 
        consistently low, it might be caused by bottlenecks or if batch size/model is too small. If usage is heavily 
        fluctuating it can be caused by bottlenecks or blocking calls. The rule computed the 95th and 5th 
        quantile of GPU utilization on {window} continuous datapoints and found {violations} cases where 
        p95 was above {threshold_p95}% and p5 was below {threshold_p5}%. If p95 is high and p5 is low it would indicate that the 
        usage is highly fluctuating. If both values are very low it would mean that the machine 
        is under-utilized. During initialization utilization is likely 0, so the rule skipped the first {patience} datapoints.
        """, width=800)
        show(text)

        if len(report['Details']) > 0:

            timestamp = us_since_epoch_to_human_readable_time(report['Details']['last_timestamp'])
            date = datetime.datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S:%f')
            day = date.date().strftime("%m/%d/%Y")
            hour = date.time().strftime("%H:%M:%S")
            text = Paragraph(text=f"""Your training job is under-utilizing the instance. You may want to consider
            switching to a smaller instance type. 
            The last time the LowGPUUtilization rule triggered in your training job was on {day} at {hour}.
            The following boxplots are a snapshot from this timestamp that show for each node the total 
            CPU utilization and the utilization and memory usage per GPU (without outliers).""", width=800)
            show(text)

            # Drop the timestamp so that only per-node entries remain in Details.
            del report['Details']['last_timestamp']

            for node_id in report['Details']:
                node_details = report['Details'][node_id]
                n_boxes = len(node_details)

                # Boxes are drawn at x = 1..n_boxes; the axis keeps its previous
                # width of 17 and only grows when a node has more than 16 entries.
                plot = figure(plot_height=350,
                              plot_width=1000,
                              toolbar_location='right',
                              tools="hover,wheel_zoom,reset,pan",
                              title=f"Node {node_id}",
                              x_range=(0, max(17, n_boxes + 1)),
                              )

                # Styling does not depend on the individual entries, so set it once.
                plot.xgrid.grid_line_color = None
                plot.ygrid.grid_line_color = "white"
                plot.grid.grid_line_width = 0
                plot.xaxis.major_label_text_font_size = "10px"

                for index, key in enumerate(node_details):
                    display(Markdown(f"""**GPU utilization of {key} on node {node_id}:**"""))
                    stats = node_details[key]
                    gpu_max = stats['gpu_max']
                    p_95 = stats['gpu_95']
                    p_5 = stats['gpu_5']

                    # Build the per-entry commentary incrementally.
                    text = f""" The max utilization of {key} on node {node_id} was {gpu_max}%"""
                    if p_95 < int(threshold_p95):
                        text = f"""{text} and the 95th percentile was only {p_95}%. 
                        {key} on node {node_id} is under-utilized"""
                    if p_5 < int(threshold_p5):
                        text = f"""{text} and the 5th percentile was only {p_5}%"""
                    if p_95 - p_5 > 50:
                        text = f"""{text} The difference between 5th quantile {p_5}% and 95th quantile {p_95}% is quite 
                        significant, which means that utilization on {key} is fluctuating quite a lot.\n"""

                    upper = stats['upper']
                    lower = stats['lower']
                    p75 = stats['p75']
                    p25 = stats['p25']
                    p50 = stats['p50']
                    xpos = index + 1

                    # Whiskers from the box edges to the upper/lower bounds.
                    plot.segment(xpos, upper, xpos, p75, line_color="black")
                    plot.segment(xpos, lower, xpos, p25, line_color="black")

                    # Box halves: p50..p75 and p25..p50.
                    plot.vbar(xpos, 0.7, p50, p75, fill_color="#FDE725", line_color="black")
                    plot.vbar(xpos, 0.7, p25, p50, fill_color="#440154", line_color="black")

                    # Whisker caps.
                    plot.rect(xpos, lower, 0.2, 0.01, line_color="black")
                    plot.rect(xpos, upper, 0.2, 0.01, line_color="black")

                    plot.xaxis.major_label_overrides[xpos] = key

                    show(Paragraph(text=text, width=900))

                # One tick per box (plus 0). Derived from the entry count rather than
                # the loop variable so an empty node cannot hit an unbound or stale index.
                plot.xaxis.ticker = np.arange(n_boxes + 1)

                show(plot)

In [None]:
if analyse_phase == "training":
    # Report section for the BatchSize rule: a summary paragraph, followed
    # (when the rule recorded details) by one boxplot per node.
    display(Markdown(""" #### Batch size"""))
    report = load_report('BatchSize')
    if report:
        # RuleParameters holds one "name:value" entry per line; values are read by position.
        params = report['RuleParameters'].split('\n')
        cpu_threshold_p95 = int(params[0].split(':')[1])
        gpu_threshold_p95 = int(params[1].split(':')[1])
        gpu_memory_threshold_p95 = int(params[2].split(':')[1])
        patience = int(params[3].split(':')[1])
        window = int(params[4].split(':')[1])
        violations = report['Violations']

        text = Paragraph(text=f"""The BatchSize rule helps to detect if GPU is under-utilized because of the batch size being 
        too small. To detect this the rule analyzes the GPU memory footprint, CPU and GPU utilization. The rule checked if the 95th quantile of CPU utilization is below cpu_threshold_p95 of 
        {cpu_threshold_p95}%, the 95th quantile of GPU utilization is below gpu_threshold_p95 of {gpu_threshold_p95}% and the 95th quantile of memory footprint is \
        below gpu_memory_threshold_p95 of {gpu_memory_threshold_p95}%. In your training job this happened {violations} times. \
        The rule skipped the first {patience} datapoints. The rule computed the quantiles over window size of {window} continuous datapoints.\n
        """, width=800)
        show(text)
        if len(report['Details']) > 0:
            timestamp = us_since_epoch_to_human_readable_time(report['Details']['last_timestamp'])
            date = datetime.datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S:%f')
            day = date.date().strftime("%m/%d/%Y")
            hour = date.time().strftime("%H:%M:%S")
            # Drop the timestamp so that only per-node entries remain in Details.
            del report['Details']['last_timestamp']
            text = Paragraph(text=f"""Your training job is under-utilizing the instance. You may want to consider
            to either switch to a smaller instance type or to increase batch size of your model training. 
            The last time the BatchSize rule triggered in your training job was on {day} at {hour}.
            The following boxplots are a snapshot from this timestamp that show for each node the total 
            CPU utilization and the utilization and memory usage per GPU.""",
            width=800)
            show(text)

            for node_id in report['Details']:
                node_details = report['Details'][node_id]
                n_boxes = len(node_details)
                # Boxes are drawn at x = 1..n_boxes, so the axis must reach n_boxes + 1
                # for the last box not to sit on the right edge.
                xmax = max(20, n_boxes + 1)

                plot = figure(plot_height=350,
                              plot_width=1000,
                              toolbar_location='right',
                              tools="hover,wheel_zoom,reset,pan",
                              title=f"Node {node_id}",
                              x_range=(0, xmax)
                              )

                # Styling does not depend on the individual entries, so set it once.
                plot.xgrid.grid_line_color = None
                plot.ygrid.grid_line_color = "white"
                plot.grid.grid_line_width = 0
                plot.xaxis.major_label_text_font_size = "10px"

                for index, key in enumerate(node_details):
                    stats = node_details[key]
                    upper = stats['upper']
                    lower = stats['lower']
                    p75 = stats['p75']
                    p25 = stats['p25']
                    p50 = stats['p50']
                    xpos = index + 1

                    # Whiskers from the box edges to the upper/lower bounds.
                    plot.segment(xpos, upper, xpos, p75, line_color="black")
                    plot.segment(xpos, lower, xpos, p25, line_color="black")

                    # Box halves: p50..p75 and p25..p50.
                    plot.vbar(xpos, 0.7, p50, p75, fill_color="#FDE725", line_color="black")
                    plot.vbar(xpos, 0.7, p25, p50, fill_color="#440154", line_color="black")

                    # Whisker caps.
                    plot.rect(xpos, lower, 0.2, 0.01, line_color="black")
                    plot.rect(xpos, upper, 0.2, 0.01, line_color="black")

                    plot.xaxis.major_label_overrides[xpos] = key

                # One tick per box (plus 0). Derived from the entry count rather than
                # the loop variable so an empty node cannot hit an unbound or stale index.
                plot.xaxis.ticker = np.arange(n_boxes + 1)

                show(plot)

In [None]:
if analyse_phase == "training":
    # Report section for the CPUBottleneck rule: a summary paragraph, then
    # (when the rule triggered) up to three rows of piecharts built from Details.
    display(Markdown("""#### CPU bottlenecks\n\n"""))

    report = load_report('CPUBottleneck')
    if report:
        # RuleParameters holds one "name:value" entry per line; values are read by position.
        params = report['RuleParameters'].split('\n')
        threshold = int(params[0].split(':')[1])
        cpu_threshold = int(params[1].split(':')[1])
        gpu_threshold = int(params[2].split(':')[1])
        patience = int(params[3].split(':')[1])
        datapoints = report['Datapoints']
        violations = report['Violations']
        # Share of datapoints flagged as violations; guarded so that a report
        # without violations never divides.
        perc = int(violations / datapoints * 100) if violations > 0 else 0
        string = 'below' if perc < threshold else 'above'
        text = f"""The CPUBottleneck rule checked when CPU utilization was above cpu_threshold of {cpu_threshold}% 
        and GPU utilization was below gpu_threshold of {gpu_threshold}%. 
        During initialization utilization is likely 0, so the rule skipped the first {patience} datapoints.
        With this configuration the rule found {violations} CPU bottlenecks which is {perc}% of the total time. This is {string} the threshold of {threshold}%"""
        # Display the summary here; `text` is reset below for the chart captions.
        show(Paragraph(text=text, width=900))

        # Layout arguments shared by every piechart in this section.
        piechart_layout = dict(height=350, width=600, x1=0.2, x2=0.6, radius=0.3)

        plots = []
        text = ""
        if report['RuleTriggered'] > 0:
            details = report['Details']

            # --- Row 1: GPU usage split, phase split, forward/backward split ---
            low_gpu = details['low_gpu_utilization']
            # Count the recorded bottleneck entries, not the number of keys in Details.
            n_bottleneck_points = len(details['bottlenecks'])
            cpu_bottleneck = {
                "GPU usage above threshold": datapoints - low_gpu,
                "GPU usage below threshold": low_gpu - n_bottleneck_points,
                "Low GPU usage due to CPU bottlenecks": n_bottleneck_points,
            }

            n_bottlenecks = round(n_bottleneck_points / datapoints * 100, 2)
            text = f"""The following chart (left) shows how many datapoints were below the gpu_threshold of {gpu_threshold}%
            and how many of those datapoints were likely caused by a CPU bottleneck. The rule found {low_gpu} out of {datapoints} datapoints which had a GPU utilization 
            below {gpu_threshold}%. Out of those datapoints {n_bottlenecks}% were likely caused by CPU bottlenecks. 
            """
            plots.append(create_piechart(cpu_bottleneck,
                                         title="Low GPU usage caused by CPU bottlenecks",
                                         **piechart_layout))

            if 'phase' in details:
                text = f"""{text} The chart (in the middle) shows whether CPU bottlenecks mainly happened during training or validation phase.
                """
                plots.append(create_piechart(details['phase'],
                                             title="Ratio between TRAIN/EVAL phase and others",
                                             **piechart_layout))

            if 'forward_backward' in details and len(details['forward_backward']) > 0:
                # Event with the largest share.
                event = max(details['forward_backward'], key=details['forward_backward'].get)
                perc = details['forward_backward'][event]

                text = f"""{text} The piecharts on the right shows a more detailed breakdown. 
                It shows that {int(perc)}% of the time was spent in event {event}"""
                plots.append(create_piechart(details['forward_backward'],
                                             title="Ratio between forward and backward pass",
                                             **piechart_layout))

            if len(plots) > 0:
                paragraph = Paragraph(text=text, width=900)
                show(column(paragraph, row(plots)))

            # --- Row 2: CPU/GPU operator ratio and general framework metrics ---
            plots = []
            text = ""
            if 'ratio' in details and len(details['ratio']) > 0:
                # The caption reports the first entry of the ratio mapping.
                key = next(iter(details['ratio']))
                ratio = details['ratio'][key]

                text = f"""The following piechart shows a breakdown of the CPU/GPU operators that happened during CPU bottlenecks. 
                    It shows that {int(ratio)}% of the time was spent in executing operators in {key}."""
                plots.append(create_piechart(details['ratio'],
                                             title="Ratio between CPU/GPU operators",
                                             **piechart_layout))

            if 'general' in details and len(details['general']) > 0:
                plots.append(create_piechart(details['general'],
                                             title="General metrics recorded in framework ",
                                             **piechart_layout))

            if len(plots) > 0:
                paragraph = Paragraph(text=text, width=900)
                show(column(paragraph, row(plots)))

            # --- Row 3: Horovod metrics ---
            plots = []
            text = ""
            if 'horovod' in details and len(details['horovod']) > 0:
                event = max(details['horovod'], key=details['horovod'].get)
                perc = details['horovod'][event]
                text = f"""The following piechart shows a detailed breakdown of the Horovod metrics that have been
                recorded when CPU bottleneck happened. The most expensive function was {event} with {int(perc)}%"""

                plot = create_piechart(details['horovod'],
                                       title="Horovod metrics recorded in framework",
                                       **piechart_layout)

                paragraph = Paragraph(text=text, width=900)
                show(column(paragraph, row(plot)))

In [None]:
if analyse_phase == "training":
    # Report section for the IOBottleneck rule: a summary paragraph, then
    # (when the rule triggered) up to three rows of piecharts built from Details.
    display(Markdown("""#### I/O bottlenecks\n\n"""))

    report = load_report('IOBottleneck')
    if report:
        # RuleParameters holds one "name:value" entry per line; values are read by position.
        params = report['RuleParameters'].split('\n')
        threshold = int(params[0].split(':')[1])
        io_threshold = int(params[1].split(':')[1])
        gpu_threshold = int(params[2].split(':')[1])
        patience = int(params[3].split(':')[1])
        # Read the datapoint count from THIS report; it must not depend on a
        # variable left behind by another notebook cell.
        datapoints = report['Datapoints']
        violations = report['Violations']
        # Share of datapoints flagged as violations; guarded so that a report
        # without violations never divides.
        perc = int(violations / datapoints * 100) if violations > 0 else 0
        string = 'below' if perc < threshold else 'above'
        text = f"""The IOBottleneck rule checked when I/O wait time was above io_threshold of {io_threshold}% 
        and GPU utilization was below gpu_threshold of {gpu_threshold}%. During initialization utilization is likely 0, so the rule skipped the first {patience} datapoints. 
        With this configuration the rule found {violations} I/O bottlenecks which is {perc}% of the total time. This is {string} the threshold of {threshold}%"""
        # Display the summary here; `text` is reset below for the chart captions.
        show(Paragraph(text=text, width=900))

        # Layout arguments shared by every piechart in this section.
        piechart_layout = dict(height=350, width=600, x1=0.2, x2=0.6, radius=0.3)

        plots = []
        text = ""
        if report['RuleTriggered'] > 0:
            details = report['Details']

            # --- Row 1: GPU usage split, phase split, forward/backward split ---
            low_gpu = details['low_gpu_utilization']
            # Count the recorded bottleneck entries, not the number of keys in Details.
            n_bottleneck_points = len(details['bottlenecks'])
            # Name kept from the original cell; the values describe I/O bottlenecks.
            cpu_bottleneck = {
                "GPU usage above threshold": datapoints - low_gpu,
                "GPU usage below threshold": low_gpu - n_bottleneck_points,
                "Low GPU usage due to I/O bottlenecks": n_bottleneck_points,
            }

            n_bottlenecks = round(n_bottleneck_points / datapoints * 100, 2)
            text = f"""The following chart (left) shows how many datapoints were below the gpu_threshold of {gpu_threshold}%
            and how many of those datapoints were likely caused by an I/O bottleneck. The rule found {low_gpu} out of {datapoints} datapoints which had a GPU utilization 
            below {gpu_threshold}%. Out of those datapoints {n_bottlenecks}% were likely caused by I/O bottlenecks. 
            """
            plots.append(create_piechart(cpu_bottleneck,
                                         title="Low GPU usage caused by I/O bottlenecks",
                                         **piechart_layout))

            if 'phase' in details:
                text = f"""{text} The chart (in the middle) shows whether I/O bottlenecks mainly happened during training or validation phase.
                """
                plots.append(create_piechart(details['phase'],
                                             title="Ratio between TRAIN/EVAL phase and others",
                                             **piechart_layout))

            if 'forward_backward' in details and len(details['forward_backward']) > 0:
                # Event with the largest share.
                event = max(details['forward_backward'], key=details['forward_backward'].get)
                perc = details['forward_backward'][event]

                text = f"""{text} The piecharts on the right shows a more detailed breakdown. 
                It shows that {int(perc)}% of the time was spent in event {event}"""
                plots.append(create_piechart(details['forward_backward'],
                                             title="Ratio between forward and backward pass",
                                             **piechart_layout))

            if len(plots) > 0:
                paragraph = Paragraph(text=text, width=900)
                show(column(paragraph, row(plots)))

            # --- Row 2: CPU/GPU operator ratio and general framework metrics ---
            plots = []
            text = ""
            if 'ratio' in details and len(details['ratio']) > 0:
                # The caption reports the first entry of the ratio mapping.
                key = next(iter(details['ratio']))
                ratio = details['ratio'][key]

                text = f"""The following piechart shows a breakdown of the CPU/GPU operators that happened 
                during I/O bottlenecks. It shows that {int(ratio)}% of the time was spent in executing operators in {key}."""
                plots.append(create_piechart(details['ratio'],
                                             title="Ratio between CPU/GPU operators",
                                             **piechart_layout))

            if 'general' in details and len(details['general']) > 0:
                plots.append(create_piechart(details['general'],
                                             title="General metrics recorded in framework ",
                                             **piechart_layout))

            if len(plots) > 0:
                paragraph = Paragraph(text=text, width=900)
                show(column(paragraph, row(plots)))

            # --- Row 3: Horovod metrics ---
            plots = []
            text = ""
            if 'horovod' in details and len(details['horovod']) > 0:
                event = max(details['horovod'], key=details['horovod'].get)
                perc = details['horovod'][event]
                text = f"""The following piechart shows a detailed breakdown of the Horovod metrics that have been
                recorded when I/O bottleneck happened. The most expensive function was {event} with {int(perc)}%"""

                plot = create_piechart(details['horovod'],
                                       title="Horovod metrics recorded in framework",
                                       **piechart_layout)

                paragraph = Paragraph(text=text, width=900)
                show(column(paragraph, row(plot)))


In [None]:
 
if analyse_phase == "training":
    # Report section for the LoadBalancing rule: a summary paragraph, then one
    # overlaid histogram per node together with the recorded workload distances.
    display(Markdown("""#### LoadBalancing in multi-GPU training\n\n"""))
    report = load_report('LoadBalancing')
    if report:
        # RuleParameters holds one "name:value" entry per line; the first two
        # values are the threshold and the patience.
        params = report['RuleParameters'].split('\n')
        threshold, patience = (params[position].split(':')[1] for position in (0, 1))
        paragraph = Paragraph(
            text=f"""The LoadBalancing rule helps to detect issues in workload balancing 
        between multiple GPUs. 
        It computes a histogram of GPU utilization values for each GPU and compares then the 
        similarity between histograms. The rule checked if the distance of histograms is larger than the 
        threshold of {threshold}%.
        During initialization utilization is likely 0, so the rule skipped the first {patience} datapoints.
        """,
            width=900,
        )
        show(paragraph)

        if len(report['Details']) > 0:
            for node_id, node_report in report['Details'].items():

                # Caption; one sentence per recorded distance is appended below.
                text = f"""The following histogram shows the workloads on node {node_id} 
                which differ by more than threshold {threshold}. 
                You can enable/disable the visualization of a workload by clicking on the label in the legend."""

                plot = figure(
                    plot_height=350,
                    plot_width=350,
                    x_range=(-1, 100),
                    title=f"""Workloads on node {node_id}""",
                )
                colors = bokeh.palettes.viridis(len(node_report['workloads']))

                for index, gpu_id2 in enumerate(node_report['workloads']):

                    # Distances are optional per workload; list every recorded one.
                    pairwise = node_report['distances']
                    if gpu_id2 in pairwise:
                        for gpu_id1, raw_distance in pairwise[gpu_id2].items():
                            distance = round(raw_distance, 2)
                            text = f"""{text} The difference of workload between {gpu_id2} and {gpu_id1} is: {distance}."""

                    # Histogram bars: bins of width 2 with left edges 0, 2, ..., 96.
                    probs = node_report['workloads'][gpu_id2]
                    plot.quad(
                        top=probs,
                        bottom=0,
                        left=np.arange(0, 98, 2),
                        right=np.arange(2, 100, 2),
                        line_color="white",
                        fill_color=colors[index],
                        fill_alpha=0.8,
                        legend=gpu_id2,
                    )

                    plot.y_range.start = 0
                    plot.xaxis.axis_label = "Utilization"
                    plot.yaxis.axis_label = "Occurrences"
                    plot.grid.grid_line_color = "white"
                    # Clicking a legend entry toggles the matching histogram.
                    plot.legend.click_policy = "hide"

                paragraph = Paragraph(text=text, width=900)
                show(column(paragraph, plot))

In [None]:
if analyse_phase == "training":
    # Report section for the GPUMemoryIncrease rule: a summary paragraph, followed
    # (when the rule recorded details) by one memory boxplot per node.
    display(Markdown("""#### GPU memory analysis\n\n"""))

    report = load_report('GPUMemoryIncrease')
    if report:
        # RuleParameters holds one "name:value" entry per line; the values are
        # read by position: increase, patience, window.
        params = report['RuleParameters'].split('\n')
        increase = float(params[0].split(':')[1])
        patience = params[1].split(':')[1]
        window = params[2].split(':')[1]
        violations = report['Violations']

        text=Paragraph(text=f"""The GPUMemoryIncrease rule helps to detect large increase in memory usage on GPUs. 
        The rule checked if the moving average of memory increased by more than {increase}%. 
        So if the moving average increased for instance from 10% to {11+increase}%, 
        the rule would have triggered. During initialization utilization  is likely 0, so the rule skipped the first {patience} datapoints.
        The moving average was computed on a window size of {window} continuous datapoints. The rule detected {violations} violations
        where the moving average between previous and current time window increased by more than {increase}%""",
                       width=900)
        show(text)

        if len(report['Details']) > 0:

            timestamp = us_since_epoch_to_human_readable_time(report['Details']['last_timestamp'])
            date = datetime.datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S:%f')
            day = date.date().strftime("%m/%d/%Y")
            hour = date.time().strftime("%H:%M:%S")
            text = Paragraph(text=f"""Your training job triggered memory spikes. 
            The last time the GPUMemoryIncrease rule triggered in your training job was on {day} at {hour}.
            The following boxplots are a snapshot from this timestamp that show for each node and GPU the corresponding
            memory utilization (without outliers).""", width=900)
            show(text)

            # Drop the timestamp so that only per-node entries remain in Details.
            del report['Details']['last_timestamp']

            for node_id in report['Details']:
                node_details = report['Details'][node_id]
                n_boxes = len(node_details)

                # Boxes are drawn at x = 1..n_boxes; the axis keeps its previous
                # width of 17 and only grows when a node has more than 16 entries.
                plot = figure(plot_height=350,
                              plot_width=1000,
                              toolbar_location='right',
                              tools="hover,wheel_zoom,reset,pan",
                              title=f"Node {node_id}",
                              x_range=(0, max(17, n_boxes + 1)),
                              )

                # Styling does not depend on the individual entries, so set it once.
                plot.xgrid.grid_line_color = None
                plot.ygrid.grid_line_color = "white"
                plot.grid.grid_line_width = 0
                plot.xaxis.major_label_text_font_size = "10px"

                for index, key in enumerate(node_details):
                    display(Markdown(f"""**Memory utilization of {key} on node {node_id}:**"""))
                    stats = node_details[key]
                    gpu_max = stats['gpu_max']
                    text = f""" The max memory utilization of {key} on node {node_id} was {gpu_max}%."""

                    p_95 = int(stats['p95'])
                    p_5 = stats['p05']
                    if p_95 < 50:
                        text = f"""{text} The 95th percentile was only {p_95}%."""
                    if p_5 < 5:
                        text = f"""{text} The 5th percentile was only {p_5}%."""
                    if p_95 - p_5 > 50:
                        text = f"""{text} The difference between 5th quantile {p_5}% and 95th quantile {p_95}% is quite 
                        significant, which means that memory utilization on {key} is fluctuating quite a lot."""

                    show(Paragraph(text=text, width=900))

                    upper = stats['upper']
                    lower = stats['lower']
                    p75 = stats['p75']
                    p25 = stats['p25']
                    p50 = stats['p50']
                    xpos = index + 1

                    # Whiskers from the box edges to the upper/lower bounds.
                    plot.segment(xpos, upper, xpos, p75, line_color="black")
                    plot.segment(xpos, lower, xpos, p25, line_color="black")

                    # Box halves: p50..p75 and p25..p50.
                    plot.vbar(xpos, 0.7, p50, p75, fill_color="#FDE725", line_color="black")
                    plot.vbar(xpos, 0.7, p25, p50, fill_color="#440154", line_color="black")

                    # Whisker caps.
                    plot.rect(xpos, lower, 0.2, 0.01, line_color="black")
                    plot.rect(xpos, upper, 0.2, 0.01, line_color="black")

                    plot.xaxis.major_label_overrides[xpos] = key

                # One tick per box (plus 0). Derived from the entry count rather than
                # the loop variable so an empty node cannot hit an unbound or stale index.
                plot.xaxis.ticker = np.arange(n_boxes + 1)

                show(plot)