# Profiler Report
This report summarizes the execution of the profiler built-in rules. The profiler runs a set of rules while training is in progress, and each rule identifies certain performance issues. This notebook gives a description of each rule and details about its execution, e.g. how often the rule condition was met.


In [None]:
import json
import pandas as pd
import glob
import matplotlib.pyplot as plt
import numpy as np

In [None]:
from IPython.display import display, HTML, Markdown, Code, Image
def pretty_print(df):
    """Render a DataFrame as a left-aligned HTML table in the notebook output.

    Literal backslash-n sequences in the generated HTML are turned into
    ``<br>`` tags and every bare ``<tr>`` tag gets a left-alignment style.
    Returns whatever ``display`` returns.
    """
    html_table = df.to_html()
    # The pattern is the two characters backslash + "n", not a real newline.
    html_table = html_table.replace("\\n", "<br>")
    html_table = html_table.replace('<tr>', '<tr style="text-align: left;">')
    return display(HTML(html_table))

## Training job summary

The following table gives a summary of the training job. The table includes information about when the training job started and ended, and how much time initialization, the training loop and finalization took.

In [None]:
def load_report(rule_name):
    """Load the JSON report written for a profiler rule.

    Parameters
    ----------
    rule_name : str
        Name of the rule; the report is read from
        ``/opt/ml/processing/outputs/profiler-reports/<rule_name>.json``.

    Returns
    -------
    dict or None
        The parsed report, or ``None`` when no report file exists (a notice
        is printed in that case).

    For every rule except ``MaxInitializationTime`` and ``OverallSystemUsage``
    the trigger count and the number of processed events are also displayed.
    """
    report_path = '/opt/ml/processing/outputs/profiler-reports/' + rule_name + '.json'
    try:
        # The context manager closes the file handle as soon as parsing is done.
        with open(report_path) as report_file:
            report = json.load(report_file)
    except FileNotFoundError:
        print (rule_name + ' not triggered')
        return None

    if rule_name not in ('MaxInitializationTime', 'OverallSystemUsage'):
        triggered = report['RuleTriggered']
        datapoints = report['Datapoints']
        display(Markdown(f"""The number of times the {rule_name} rule triggerd: {triggered}"""))
        display(Markdown(f"""The number of events processed by {rule_name} rule: {datapoints}"""))
    return report

In [None]:
from smdebug.profiler.utils import us_since_epoch_to_human_readable_time, ns_since_epoch_to_human_readable_time

# Collect job-level timing statistics from the MaxInitializationTime report.
# NOTE(review): judging by the scaling factors, job_start/job_end look like
# seconds and the step timestamps like microseconds -- confirm with the rule.
job_statistics = {}
report = load_report('MaxInitializationTime')
if report:
    details = report['Details']
    step_info_available = "first" in details["step_num"]
    if step_info_available:
        first_step = details["step_num"]["first"]
        last_step = details["step_num"]["last"]

    job_statistics["start_time"] = us_since_epoch_to_human_readable_time(details['job_start'] * 1000000)
    job_statistics["end_time"] = us_since_epoch_to_human_readable_time(details['job_end'] * 1000000)
    job_statistics["job_duration_in_seconds"] = details['job_end'] - details['job_start']

    if step_info_available:
        job_statistics["training_loop_start"] = us_since_epoch_to_human_readable_time(first_step)
        job_statistics["training_loop_end"] = us_since_epoch_to_human_readable_time(last_step)
        job_statistics["training_loop_duration_in_seconds"] = (last_step - first_step) / 1000000
        job_statistics["initialization_in_seconds"] = first_step / 1000000 - details['job_start']
        job_statistics["finalization_in_seconds"] = np.abs(details['job_end'] - last_step / 1000000)

        # Express each phase as a percentage of the whole job duration.
        for share_key, seconds_key in (
            ("initialization_%", "initialization_in_seconds"),
            ("training_loop_%", "training_loop_duration_in_seconds"),
            ("finalization_%", "finalization_in_seconds"),
        ):
            job_statistics[share_key] = job_statistics[seconds_key] / job_statistics["job_duration_in_seconds"] * 100


In [None]:
import datetime
# Show when the job started, how long it ran, the statistics table and, when
# step information exists, a pie chart of the time share per phase.
if job_statistics:
    df = pd.DataFrame.from_dict(job_statistics, orient='index')
    # Read the scalars straight from the dict instead of chained DataFrame indexing.
    date = datetime.datetime.strptime(job_statistics['start_time'], '%Y-%m-%dT%H:%M:%S:%f')
    day = date.date().strftime("%m/%d/%Y")
    hour = date.time().strftime("%H:%M:%S")
    duration = int(job_statistics['job_duration_in_seconds'])
    display(Markdown(f"""Your training job started on **{day}** at **{hour}** and ran for **{duration}** seconds."""))

    pretty_print(df)
    # The percentage keys are only written when step information was available,
    # so test for them directly rather than re-reading the mutable global `report`.
    if "initialization_%" in job_statistics:
        # Negative shares are clamped to zero for plotting only; the collected
        # statistics themselves are left untouched.
        phase_shares = [
            max(job_statistics[key], 0)
            for key in ("initialization_%", "training_loop_%", "finalization_%")
        ]

        fig, ax = plt.subplots()
        ax.pie(phase_shares, autopct='%1.1f%%')
        ax.set_title('Share of job duration per phase')
        ax.legend(['initialization', 'training_loop', 'finalization'], bbox_to_anchor=(0.8, 0))
        plt.show()
    else:
        display(Markdown("""No step information available. Cannot calculate initialization and finalization time."""))
        

## System usage statistics

In [None]:
# Load the OverallSystemUsage report; load_report returns None when the report
# file does not exist, so later cells must cope with a missing report.
report = load_report('OverallSystemUsage')

In [None]:
# Summarise per-node utilization from the OverallSystemUsage report.
# Nothing is shown when the report is missing (load_report returned None).
if report:
    usage_details = report["Details"]
    if "GPU" in usage_details:
        for node_id in usage_details["GPU"]:
            gpu_p95 = usage_details["GPU"][node_id]["p95"]
            cpu_p95 = usage_details["CPU"][node_id]["p95"]

            if gpu_p95 < 70 and cpu_p95 < 70:
                display(Markdown(f"""The 95th quantile of the total GPU utilization on node {node_id} is only **{int(gpu_p95)}%**. 
            The 95th quantile of the total CPU utilization is only **{int(cpu_p95)}%**. Node {node_id} is under-utilized. 
            You may want to consider switching to a smaller instance type."""))
            elif gpu_p95 < 70 and cpu_p95 > 70:
                display(Markdown(f"""The 95th quantile of the total GPU utilization on node {node_id} is only **{int(gpu_p95)}%**. 
            However, the 95th quantile of the total CPU utilization is **{int(cpu_p95)}%**. GPUs on node {node_id} are under-utilized 
            likely because of CPU bottlenecks"""))
            elif gpu_p95 > 70:
                display(Markdown(f"""The 95th quantile of the total GPU utilization on node {node_id} is **{int(gpu_p95)}%**. 
            GPUs on node {node_id} are well utilized"""))
            else:
                # Reached when a value sits exactly on the 70% boundary.
                display(Markdown(f"""The 95th quantile of the total GPU utilization on node {node_id} is **{int(gpu_p95)}%**. 
            The 95th quantile of the total CPU utilization is {int(cpu_p95)}%."""))
    else:
        # CPU-only job: there are no GPU figures to report.
        for node_id in usage_details.get("CPU", {}):
            cpu_p95 = usage_details["CPU"][node_id]["p95"]
            if cpu_p95 > 70:
                # The original wrote `int**(cpu_p95)`, which raises TypeError,
                # and described the CPUs as GPUs.
                display(Markdown(f"""The 95th quantile of the total CPU utilization on node {node_id} is **{int(cpu_p95)}%**. CPUs on node {node_id} are well utilized"""))

    # The table rendered by the next cell has the columns max, p99, p95, p50 and min.
    display(Markdown("""The following table shows usage statistics per worker node such as total CPU and GPU 
utilization, total CPU and memory footprint. The table also include total IO wait time and total sent/received bytes.
The table shows min and max values as well as p99, p95 and p50 percentiles.
"""))


In [None]:
# Build and display one table row per (metric, node) from the OverallSystemUsage report.
pd.set_option('display.float_format', lambda x: '%.2f' % x)
rows = []
units = {"CPU": "percentage", "CPU memory": "percentage", "GPU": "percentage", "Network": "bytes", "GPU memory": "percentage", "I/O": "percentage"}
if report:
    for metric, per_node in report['Details'].items():
        for node_id, values in per_node.items():
            # Metrics without a known unit are still listed instead of raising KeyError.
            rows.append([node_id, metric, units.get(metric, 'n/a'), values['max'], values['p99'], values['p95'], values['p50'], values['min']])

    # Passing the column names to the constructor also works when `rows` is
    # empty; assigning `df.columns` afterwards raises ValueError in that case.
    df = pd.DataFrame(rows, columns=['Node', 'metric', 'unit', 'max', 'p99', 'p95', 'p50', 'min'])
    with pd.option_context('display.colheader_justify','left'):
        pretty_print(df)

## Rules summary

In [None]:
# Human-readable explanation of each profiler rule, keyed by rule name.
# There is deliberately no 'Dataloaders' entry (it was disabled in this notebook).
description = {
    'CPUBottleneck': 'Checks if CPU usage is high but GPU usage is low at the same time, it may indicate a CPU bottleneck where GPU is waiting for data to arrive from CPU. The rule triggers if number of CPU bottlenecks exceeds a predefined threshold.',
    'IOBottleneck': 'If IO wait time is high but at the same time GPU usage is low, it may indicate an IO bottleneck where GPU is waiting for data to arrive from disk. The rule triggers if number of IO bottlenecks exceeds a predefined threshold.',
    'GPUMemoryIncrease': 'If model and/or batch size is too large then training will run out of memory and crash.',
    'BatchSize': 'Checks if GPU is underulitized because of the batch size being too small. To detect this the rule analyzes the average GPU memory footprint, CPU and GPU utilization. ',
    'LowGPUUtilization': 'Checks if GPU utilization is low or suffers from fluctuations. This can happen if there are bottlenecks, many blocking calls due to synchroniziations or batch size too small.',
    'MaxInitializationTime': 'Checks if the training intialization is taking too much time. The rule waits until first step is available. This can happen if you are running in File mode and a lot of data needs to be downloaded from Amazon S3.',
    'LoadBalancing': 'Detect issues in workload balancing between multiple GPUs. Workload imbalance can for instance occur in data parallel training when gradients are accumulated on primary GPU so this GPU will be overused with regards to other GPUs limiting the effect of parallelization.  ',
    'StepOutlier': 'Detect outliers in step duration. Time for forward and backward pass should be roughly the same throughout the training. If there are significant outliers it would indicate an issue due to a system stall or a bottleneck.',
}

In [None]:
# Suggested remedy for each profiler rule, keyed by rule name.
# There is deliberately no 'Dataloaders' entry (it was disabled in this notebook).
recommendation = {
    'CPUBottleneck': 'CPU bottlenecks can happen when data preprocessing is very compute intensive. You should consider increasing the number of dataloaders or apply prefetching.',
    'IOBottleneck': 'Prefetch data or choose different file formats such as binary formats which improves read performance.',
    'GPUMemoryIncrease': 'Choose a larger instance type with more memory (if it is not a memory leak) or apply model parallelism (Rubik)',
    'BatchSize': 'Run on a smaller instance type or increase batch size',
    'LowGPUUtilization': 'Check for bottlenecks, minimize blocking calls, change distributed training strategy, increase batchsize.',
    'MaxInitializationTime': 'Switch from File to Pipe mode',
    'LoadBalancing': 'Choose different distributed training strategy or different distributed training framework',
    'StepOutlier': 'Check for bottlenecks',
}

In [None]:
# Build one summary row per rule report found on disk and show the rules
# ordered by how often they triggered.
files = glob.glob('/opt/ml/processing/outputs/profiler-reports/*.json')
summary = {}
# Sorted so that the row order does not depend on filesystem enumeration order.
for report_path in sorted(files):
    rule_name = report_path.split('/')[-1].replace('.json','')
    if rule_name == "OverallSystemUsage":
        # System usage is presented in its own section and is not a rule summary row.
        continue
    with open(report_path) as report_file:
        rule_report = json.load(report_file)
    summary[rule_name] = {
        # Rules without a prepared text get an empty cell instead of a KeyError.
        'Description': description.get(rule_name, ''),
        'Recommendation': recommendation.get(rule_name, ''),
        'Number of times rule triggered': rule_report['RuleTriggered'],
        'Number of violations': rule_report['Violations'],
        'Number of datapoints': rule_report['Datapoints'],
        'Rule parameters': rule_report['RuleParameters'],
    }

if summary:
    df = pd.DataFrame.from_dict(summary, orient='index')
    df = df.sort_values(by=['Number of times rule triggered'], ascending=False)

    # Look the headline figures up by column name rather than by position.
    top_rule = df.index[0]
    top_triggered = df['Number of times rule triggered'].iloc[0]
    top_violations = df['Number of violations'].iloc[0]

    display(Markdown(f"""The following table shows a summary of the executed profiler rules. 
The table is sorted by the rules the triggered most frequently. In your training job this was the case
for rule **{top_rule}**.
Per default rules will run on a time segment of 60 seconds. Within this time segment
a rule may produce multiple violations e.g. a CPUBottleneck triggers once per time segment, but 
within that time segment there could be hundreds of bottlenecks. 

In your training job rule **{top_rule}** triggered **{top_triggered}** times and recorded **{top_violations}** violations """))
    with pd.option_context('display.colheader_justify','left'):
        pretty_print(df)
else:
    # Without any rule report there is nothing to sort or to index into.
    display(Markdown("No profiler rule reports were found, so there is no rule summary to show."))

In [None]:
# Pick the phase to analyse in detail: initialization only when step timing is
# available and initialization lasted longer than the training loop.
initialization_dominates = (
    job_statistics
    and "initialization_in_seconds" in job_statistics
    and job_statistics["initialization_in_seconds"] > job_statistics["training_loop_duration_in_seconds"]
)
analyse_phase = "initialization" if initialization_dominates else "training"

if analyse_phase == "initialization":
    display(Markdown("Since initialization has taken the most time, we dive deep into the events occurring during this phase"))
    display(Markdown("""### Analysing initialization\n\n"""))
else:
    display(Markdown("Since training loop has taken the most time, we dive deep into the events occurring during this phase"))
    display(Markdown("""### Analysing the training loop\n\n"""))

In [None]:
def display_image(image_name):
    """Display every report image whose file name matches ``image_name``.

    Parameters
    ----------
    image_name : str
        File name or glob pattern, resolved inside
        ``/opt/ml/processing/outputs/profiler-reports/``.

    Nothing is displayed when no file matches.
    """
    matches = glob.glob('/opt/ml/processing/outputs/profiler-reports/' + image_name)
    # glob returns matches in arbitrary order; sort for a reproducible report.
    for filename in sorted(matches):
        display(Image(filename=filename))

In [None]:
if analyse_phase == "initialization":
    # Explanation of the rule, rendered as Markdown.
    max_init_intro = """### MaxInitializationTime\n\nThis rule helps to detect if the training intialization is taking too much time. \nThe rule waits until first step is available. The rule takes the parameter `threshold` that defines how many minutes to wait for the first step to become available. Default is 20 minutes.\nYou can run the rule locally in the following way:
    """
    # Sample code shown to the reader; it is displayed, never executed here.
    max_init_snippet = '''
    from smdebug.profiler.analysis.rules.max_intialization_time import MaxInitializationTime

    profiler_path = estimator.latest_job_profiler_artifacts_path()
    trial = create_trial(profiler_path, profiler=True)
    rule = MaxInitializationTime(trial, threshold=20)
    
    def run_rule(rule):
        try:
            invoke_rule(rule, raise_eval_cond=True)
        except NoMoreData:
            print(
                "The training has ended and there is no more data to be analyzed. This is expected behavior."
            )
        except RuleEvaluationConditionMet as e:
            print(e)
    
    run_rule(rule)

    '''
    display(Markdown(max_init_intro))
    display(Code(max_init_snippet, language="python"))

    # Called for the notice it prints when the report is missing; the result is unused.
    _ = load_report("MaxInitializationTime")

In [None]:
# Step duration analysis: explain the StepOutlier rule, then summarise its
# report per node and list the detected outliers.
if analyse_phase == "training":
    display(Markdown("""#### Step duration analysis"""))
    
    display(Markdown("""The StepOutlier rule measures step durations and checks for outliers.\nThe rule \
    returns `True` if duration is larger than `stddev` multiplied the standard deviation. The rule \
    also takes the parameter `mode`, that specifies whether steps from training or validation phase \
    should be checked. Typically the first step is taking signifciantly more time and to avoid the \
    rule triggering immedietaly, one can use `n_outliers` to specify the number of outliers to ignore.\nYou \
    can run the rule locally in the following way:
    """))
    
    display(Code('''
    from smdebug.profiler.analysis.rules.step_outlier import StepOutlier
    
    profiler_path = estimator.latest_job_profiler_artifacts_path()
    trial = create_trial(profiler_path, profiler=True)
    rule = StepOutlier(trial, stddev=5, mode=None, n_outliers=10, )

    run_rule(rule)
    ''', language="python"))

    report = load_report('StepOutlier')

    if report and len(report['Details']) > 0:
        # The threshold is a property of the rule, not of a node, so parse it
        # once; fall back to a placeholder when the marker is absent instead of
        # failing with IndexError.
        tmp = report['RuleParameters'].split('threshold:')
        threshold = tmp[1].split('\n')[0] if len(tmp) > 1 else 'n/a'

        for node_id in report['Details']:
            node_details = report['Details'][node_id]
            n_outliers = node_details['number_of_outliers']
            mean = node_details['step_stats']['mean']
            stddev = node_details['stddev']
            phase = node_details['phase']
            display(Markdown(f"""The following table is a summary of the statistics of step durations measured on node **{node_id}**.
            The rule has analyzed the step duration from **{phase}** phase.
            The average step duration on node **{node_id}** was **{round(mean, 3)}s**. 
            The rule detected **{n_outliers} outliers**, where step duration was larger than **{threshold} times** the standard deviation of **{stddev}s**
                             \n"""))
            step_stats_df = pd.DataFrame.from_dict(node_details['step_stats'], orient='index').T
            step_stats_df.index = ['Step Durations in [s]']
            pretty_print(step_stats_df)
            if report['RuleTriggered'] > 0:
                display(Markdown(f'\nThe rule found the following outliers on node **{node_id}**'))
                # Both lists are needed to pair a duration with its step number.
                if 'outliers' in node_details and 'step_numbers' in node_details:
                    for duration, step in zip(node_details['outliers'][0], node_details['step_numbers']):
                        display(Markdown(f"""Step {step}: {duration} s"""))
                        
                display(Markdown(f"""To get a better uderstanding of what may have caused those outliers,
                we correlate the timestamps of step outliers with other framework metrics that happened at the same time.
                The left chart shows how much time was spent in the different framework
                metrics aggregated by event phase. The chart on the right shows the histogram of normal step durations (without
                outliers). """))
                display_image("*step_duration_histogram.png")


                display(Markdown("""The following chart shows how much time was spent in the different 
                framework metrics when step outliers occured. In this chart framework metrics are not aggregated by event phase."""))
                display_image("histogram_step_outlier_framework.png")

In [None]:
# GPU utilization analysis: LowGPUUtilization first, then the BatchSize,
# CPUBottleneck and IOBottleneck rules that may explain low utilization, and
# finally LoadBalancing. Every report is optional: load_report returns None for
# a missing file, so each section is guarded before the report is read.
if analyse_phase == "training":
    display(Markdown("""#### GPU utilization analysis\n\n"""))

    display(Markdown("""The LowGPUUtilization rule checks for low and fluctuating GPU usage.\nIf usage is consistently low \
    it can be caused by bottlenecks or if batch size is too small.\nIf usage is heavily \
    fluctuating it can be caused by bottlenecks or blocking calls. User can specify a \
    threshold `threshold_p95` for 95th quantile and `threshold_p5` for 5th quantile.\
    Default values are 70% and 10%. If p95 is high and p5 is low it would indicate that the \
    usage is highly fluctuating. If both values are very low it would mean that the machine \
    is underutilized. Furthermore user can specify the window size on which the quantiles \
    will be computed. Default value is 500. During intilization GPU usage will be likely 0, \
    so the rule also takes a parameter `patience` that defines how many number of datapoints \
    to skip.\nYou can run the rule locally in the following way:
    """))
    # The sample previously passed `threshold_p95` twice, which is a syntax
    # error; the second keyword is the 5th-quantile threshold.
    display(Code('''
    from smdebug.profiler.analysis.rules.low_gpu_utilization import LowGPUUtilization

    profiler_path = estimator.latest_job_profiler_artifacts_path()
    trial = create_trial(profiler_path, profiler=True)
    rule = LowGPUUtilization(trial, threshold_p95=70, threshold_p5=10, patience=100, window=500)

    run_rule(rule)  
    ''', language="python"))
    report = load_report('LowGPUUtilization')
    if report:
        # RuleParameters is split on ':'; the text up to the first newline of
        # each piece is taken as the parameter value.
        tmp = report['RuleParameters'].split(':')
        threshold_p95 = tmp[1].split('\n')[0]
        threshold_p5 = tmp[2].split('\n')[0]
        window = tmp[3].split('\n')[0]
        display(Markdown(f""" The rule ran with parameters  **threshold_p95={threshold_p95}** and 
    **threshold_p5={threshold_p5}**. Those quantiles were computed on a window size of **{window}** 
    continous datapoints.
     """))
        for node_id in report['Details']:
            for gpu_id in report['Details'][node_id]:
                p_95 = report['Details'][node_id][gpu_id]['gpu_95']
                p_5 = report['Details'][node_id][gpu_id]['gpu_5']
                if p_95 < int(threshold_p95): 
                    display(Markdown(f"""The rule discovered that on node **{node_id}** the 95th percentile on **{gpu_id}** is only **{p_95}%**. 
                **{gpu_id}** on node **{node_id}** is under-utilized"""))
                if p_5 < int(threshold_p5): 
                    display(Markdown(f"""The rule discovered that the 5th percentile on **{gpu_id}** is only **{p_5}%**"""))
                if p_95 - p_5 > 50:
                    display(Markdown(f"""The difference between 5th quantile **{p_5}%** and 95th quantile **{p_95}%** is quite 
                signficant, which means that utilization on **{gpu_id}** is fluctuating quite a lot."""))
        if len(report['Details']) > 0:
            display(Markdown(f"""The following chart shows the boxplots of utilizations for the different GPUs."""))
    display_image("*box_plot_gpu_utilization.png")
    
    if report:
        display(Markdown("""Since the LowGPUUtilization rule pointed out potential GPU under-utilization, we will make use of \
        two other rules BatchSize and CPUBottleneck to identify the cause for the low
        utilization.
        """))
        
        display(Markdown("""The BatchSize rule helps to detect if GPU is underulitized because of the batch size being 
        too small. To detect this the rule analyzes the GPU memory footprint, CPU and GPU 
        utilization. If 95th quantile of CPU utilization is below `cpu_threshold_p95`, 
        95th quantile of GPU utilization is below `gpu_threshold_p95` and memory footprint 
        below `gpu_memory_threshold_p95` , it may indicate that user can either run on a 
        smaller instance type or that batch size could be increased. This analysis does 
        not work for frameworks that heavily over-allocate memory. Increasing batch size 
        could potentially lead to a processing/dataloading bottleneck, because more data 
        needs to be pre-processed in each iteration.\nThe rule takes in addition a parameter 
        `patience` to skip the first few datapoints and a parameter `window` that defines 
        the number of datapoints over which the quantiles are computed.\nYou can run the rule locally in the following way:
        """))
        
        display(Code('''
        from smdebug.profiler.analysis.rules.batch_size import BatchSize

        profiler_path = estimator.latest_job_profiler_artifacts_path()
        trial = create_trial(profiler_path, profiler=True)
        rule = BatchSize(trial, cpu_threshold_p95=70, gpu_threshold_p95=70, gpu_memory_threshold_p95=70, patience=100,  
                 window=500)

        run_rule(rule)
        ''', language="python"))
        report = load_report('BatchSize')
        if report:
            tmp = report['RuleParameters'].split(':')
            cpu_threshold_p95 = int(tmp[1].split('\n')[0])
            # Split on any whitespace so the value is isolated whether it is
            # followed by a space or by a newline.
            gpu_threshold_p95 = int(tmp[2].split()[0])
            gpu_memory_threshold_p95 = int(tmp[3].split('\n')[0])
            display(Markdown(f""" The rule ran with parameters  **cpu_threshold_p95={cpu_threshold_p95}**, **gpu_threshold_p95={gpu_threshold_p95}** and **gpu_memory_threshold_p95={gpu_memory_threshold_p95}**
         """))
            for node_id in report['Details']:
                # The CPU figure is stored with each GPU entry; show it once per
                # node, taken from the first entry. Previously it was displayed
                # before being read, so it showed a stale value from an earlier cell.
                cpu_line_shown = False
                for gpu_id in report['Details'][node_id]:
                    cpu_p95 = round(report['Details'][node_id][gpu_id]['cpu_p95'], 2)
                    gpu_p95 = round(report['Details'][node_id][gpu_id]['gpu_p95'], 2)
                    gpu_memory_p95 = round(report['Details'][node_id][gpu_id]['gpu_memory_p95'], 2)
                    if not cpu_line_shown:
                        display(Markdown(f"""Total CPU utilization p95 on node **{node_id}** is only **{cpu_p95}%**"""))
                        cpu_line_shown = True
                    display(Markdown(f"""The 95th quantile of GPU utilization and memory utilization on **{gpu_id}** 
                is only  **{gpu_p95}%** and **{gpu_memory_p95}%**. 
                """))
            if len(report['Details']) >0:   
                display(Markdown(f"""Your training job is under-utilizing the instance. You may want to consider
            to either switch to a smaller instance type or to increase batch size of your model training. The following boxplots show the 
            total CPU usage and the usage and memory per GPU."""))
            
        display_image('*box_plot_batch_size.png')
        
        
        display(Markdown("""The CPUBottleneck rule identifies when CPU utilization is high (above `cpu_threshold` default 
        is 90%) and GPU utilization is low (below `gpu_threshold` default is 10%).\
        GPU utilization is likely 0 during intiialization before the training loop has started,
        so the rule takes an additional argument `patience` that defines how many datapoints 
        to capture before to run the first evaluation. The parameter `threshold` defines when 
        the rule should return True and default is 50. If we see CPU bottlencks 50% of the 
        time, then the rule will trigger.\nYou can run the rule locally in the following way:
        """))

        display(Code('''
        from smdebug.profiler.analysis.rules.cpu_bottleneck import CPUBottleneck

        profiler_path = estimator.latest_job_profiler_artifacts_path()
        trial = create_trial(profiler_path, profiler=True)
        rule = CPUBottleneck(trial, gpu_threshold=10, cpu_threshold=90, threshold=50, patience=100)

        run_rule(rule)
        ''', language="python"))

        report = load_report('CPUBottleneck')
        if report:
            violations = report['Violations']
            datapoints = report['Datapoints']
            # A report without datapoints yields 0% instead of ZeroDivisionError.
            perc = int(violations/datapoints*100) if datapoints else 0
            tmp = report['RuleParameters'].split(':')
            cpu_threshold = tmp[2].split('\n')[0]
            gpu_threshold = tmp[3].split('\n')[0]
            low_gpu = report['Details']['low_gpu_utilization']
            # NOTE(review): len(report['Details']) counts the keys of the Details
            # dict; confirm that this is the intended number of bottlenecks.
            n_bottlenecks = round(len(report['Details'])/datapoints * 100, 2) if datapoints else 0
            display(Markdown(f"""The rule ran with the **cpu_threshold={cpu_threshold}%** and 
            **gpu_threshold={gpu_threshold}%**. With this configuration the rule found **{violations}** CPU bottlenecks
            which is **{perc}%** of the total time."""))
            if report['RuleTriggered'] > 0:

                display(Markdown(f"""The following chart (left) shows how many datapoints were below the gpu_threshold of **{gpu_threshold}%** 
                and how many of those datapoints were likely caused by a CPU bottleneck. 
                The rule found **{low_gpu}** out of **{datapoints}** datapoints which had a GPU utilization 
                below **{gpu_threshold}%**. 
                Out of those datapoints **{n_bottlenecks}%** were likely caused by CPU bottlenecks.
                The chart in the middle shows how much time was spent in the framework metrics (aggregated by event phase) when CPU bottleneck occured. The chart on the right shows whether CPU bottlenecks mainly occured during training or validation phase.
                """))
                display_image('pie_charts_cpu_bottleneck.png')

                display(Markdown("""The following chart shows how much time was spent in the different framework metrics while the CPU bottlenecks occured.
                """))
                display_image('histogram_cpu_bottleneck_framework.png')
        
        display(Markdown("""The IOBottleneck rule identifies when I/O wait time is above `io_threshold` (default 
        is 90%) and GPU utilization is low (below `gpu_threshold` default is 10%).
        GPU utilization is likely 0 during intiialization before the training loop has started,
        so the rule takes an additional argument `patience` that defines how many datapoints 
        to capture before to run the first evaluation. The parameter `threshold` defines when 
        the rule should return True and default is 50. If we see IO bottlencks 50% of the 
        time, then the rule will trigger.\nYou can run the rule locally in the following way:
        """))

        display(Code('''
        from smdebug.profiler.analysis.rules.cpu_bottleneck import IOBottleneck

        profiler_path = estimator.latest_job_profiler_artifacts_path()
        trial = create_trial(profiler_path, profiler=True)
        rule = IOBottleneck(trial, gpu_threshold=10, io_threshold=90, threshold=50, patience=100)

        run_rule(rule)
        ''', language="python"))

        report = load_report('IOBottleneck')
        # Guarded in the same way as the CPUBottleneck section above.
        if report:
            violations = report['Violations']
            datapoints = report['Datapoints']
            perc = int(violations/datapoints*100) if datapoints else 0
            tmp = report['RuleParameters'].split(':')
            io_threshold = tmp[2].split('\n')[0]
            gpu_threshold = tmp[3].split('\n')[0]
            low_gpu = report['Details']['low_gpu_utilization']
            # NOTE(review): same key-count caveat as for CPUBottleneck.
            n_bottlenecks = round(len(report['Details'])/datapoints *100, 2) if datapoints else 0
            display(Markdown(f"""The rule ran with the **io_threshold={io_threshold}%** and 
        **gpu_threshold={gpu_threshold}%**. With this configuration the rule found **{violations}** I/O bottlenecks
        which is **{perc}%** of the total time."""))
            if report['RuleTriggered'] > 0:

                display(Markdown(f"""The following chart (left) shows how many datapoints were below the gpu_threshold of **{gpu_threshold}%** 
            and how many of those datapoints were likely caused by an I/O bottleneck. 
            The rule found **{low_gpu}** out of **{datapoints}** datapoints which had a GPU utilization below **{gpu_threshold}%**. 
            Out of those datapoints **{n_bottlenecks}%** were likely caused by I/O bottlenecks.
            The chart in the middle shows how much time was spent in the framework metrics (aggregated by event phase) when I/O bottleneck occured. 
            The chart on the right shows whether I/O bottlenecks mainly occured during training or validation phase.
            """))
                display_image('pie_charts_io_bottleneck.png')

                display(Markdown("""The following chart shows how much time was spent in the different framework metrics when I/O bottlenecks occured.
            """))
                display_image('histogram_io_bottleneck_framework.png')
            
    display(Markdown("""The LoadBalancing rule helps to detect issues in workload balancing between multiple GPUs. 
    It computes a histogram of GPU utilization values for each GPU and compares then the 
    similarity between histograms. The rule takes a parameter `threshold` that defines 
    the maximum distance between histograms. In the beginning utilization is likely 0 when 
    intilization is happening. The parameter `patience` defines how many datapoints to skip 
    in the beginning.\nYou can run the rule locally in the following way:
    """))
    
    display(Code('''
    from smdebug.profiler.analysis.rules.load_balancing import LoadBalancing

    profiler_path = estimator.latest_job_profiler_artifacts_path()
    trial = create_trial(profiler_path, profiler=True)
    rule = LoadBalancing(trial, threshold=0.5, patience=10)

    run_rule(rule)
    ''', language="python"))
    
    report = load_report('LoadBalancing')
    display_image('*load_balancing_workload.png')

In [None]:
if analyse_phase == "training":
    display(Markdown("""#### GPU memory analysis\n\n"""))
    
    # Explanation of the rule, rendered as Markdown.
    gpu_memory_intro = """The GPUMemoryIncrease rule helps to detect large increase in memory usage on GPUs. 
    The rule takes the parameter `increase` which defines the threshold for absolute 
    memory increase and the default is 10%. So if the moving average increases from 10% to 21%, 
    the rule will trigger. The parameter `patience` specifies how many datapoints to 
    capture before Rule runs the first evluation. Default 100. The parameter `window` 
    defines the window size for moving average.\nYou can run the rule locally in the following way:
    """
    # Sample code shown to the reader; it is displayed, never executed here.
    gpu_memory_snippet = '''
    from smdebug.profiler.analysis.rules.gpu_usage import GPUMemoryIncrease

    profiler_path = estimator.latest_job_profiler_artifacts_path()
    trial = create_trial(profiler_path, profiler=True)
    rule = GPUMemoryIncrease(trial, increase=10, patience=100, window=10)

    run_rule(rule)
    '''
    display(Markdown(gpu_memory_intro))
    
    display(Code(gpu_memory_snippet, language="python"))
    # Called for the figures it displays; the returned report is unused.
    _ = load_report('GPUMemoryIncrease')
    display_image('*box_plot_gpu_memory.png')