In [None]:
import smdebug
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from smdebug.profiler.analysis.utils.profiler_data_to_pandas import PandasFrame
from smdebug.profiler.analysis.notebook_utils.timeline_charts import TimelineCharts
from smdebug.profiler.analysis.utils.pytorch_dataloader_analysis import PT_dataloader_analysis

### Obtain the path to profiler output

If the training job name and region are known, set the appropriate variables in the following cell and run it to obtain the path. Optionally, if the path to the profiler output is already known, the following cell can be skipped.

In [None]:
# Name and region of the training job whose profiler output is analyzed;
# both are passed to TrainingJob(...) in the next cell.
training_job_name = 'pt-multiworker-resnext101-2020-09-08-21-43-05-168'
region = 'us-east-1'

In [None]:
from smdebug.profiler.analysis.notebook_utils.training_job import TrainingJob


# Look up the training job configured in the previous cell.
tj = TrainingJob(training_job_name, region)
# Presumably these block until system / framework profiling data exists — confirm against smdebug docs.
tj.wait_for_sys_profiling_data_to_be_available()
tj.wait_for_framework_profiling_data_to_be_available()
# Readers for the framework and system metrics of the job.
framework_metrics_reader = tj.get_framework_metrics_reader()
system_metrics_reader = tj.get_systems_metrics_reader()

# Path of the profiler output; this is what PandasFrame is pointed at in a later cell.
profiler_output_path = tj.profiler_s3_output_path


In [None]:
# Optional if the profiler output path is already known.
# Set `known_profiler_output_path` to that path to use it instead of the one obtained
# from the training job. Leaving it as None keeps any `profiler_output_path` that an
# earlier cell defined, so running all cells top to bottom does not silently replace
# the job's path with a machine-specific one.
known_profiler_output_path = None  # e.g. "/path/to/<training-job-name>/profiler-output"
if known_profiler_output_path is not None:
    profiler_output_path = known_profiler_output_path

In [None]:
# Wrap the profiler output at `profiler_output_path` in a PandasFrame with the
# in-memory cache enabled; `pf` is queried by the analysis cells below.
pf = PandasFrame(
    path=profiler_output_path,
    use_in_memory_cache=True,
)

In [None]:
# Dataloader analysis helper built on top of `pf`; every analysis cell below goes through it.
pt_analysis = PT_dataloader_analysis(pf)

### Analysis of DataloaderIter initializations.

The following cell analyzes
1. Which type of Dataloader iterators were initialized.
2. The number of workers per iterator.
3. Log the status of pin_memory.
4. Number of times the iterators were initialized during training. In PyTorch, iterators are initialized every time iteration over the dataset is about to start (i.e., typically at the beginning of every epoch). During initialization, PyTorch spins off worker processes depending upon the configured number of workers, and establishes the data queue to fetch data and the pin_memory thread, if pin_memory is set.

The analysis outputs the median and maximum duration for these initializations. If there are outliers (i.e., the duration is greater than 2 * median), the function prints the start and end times for those durations. These can be used to inspect system metrics during those time intervals.

In [None]:
# Run the DataloaderIter initialization analysis described in the text above
# (iterator types, workers per iterator, pin_memory status, initialization durations and outliers).
pt_analysis.analyze_dataloaderIter_initialization()

### Analysis of Dataloader worker processes

In PyTorch, every time a DataLoaderIterator is initialized, it spins off the worker processes that feed data to the iterator through the attached data queue. These worker processes have a lifetime similar to that of the DataLoaderIterator.

The following analysis shows 
1. The number of worker processes that were spun off during the entire training.
2. Median and maximum duration for the worker processes.
3. Start and end time for the worker processes that are outliers.



In [None]:
# Run the dataloader worker-process analysis described in the text above.
# Note: `md` is reassigned by the GetNext analysis cell further down.
md = pt_analysis.analyze_dataloaderWorkers()

### Analysis of DataLoaderIter::GetNext 

In PyTorch, the GetNext method is responsible for fetching the data from worker processes through the data queue.
These calls are run in the main training thread.
The analysis of these events shows
1. Number of GetNext calls made during the training.
2. Median and maximum duration in microseconds for GetNext calls.
3. Start time, End time, duration and worker id for the outlier GetNext call duration.

In [None]:
# Run the DataLoaderIter::GetNext analysis described in the text above.
# `md` is what the outlier-plotting cell below reads (it may be None or empty; that cell checks for both).
md = pt_analysis.analyze_dataloader_getnext()

### Analyze a specific outlier in DataLoaderIter::GetNext

To analyze a specific outlier, select the row index corresponding to the outlier that we want to analyze and run the following cells.

In [None]:
# Timeline charts for one GetNext outlier from the dataframe returned above.
# `index` is the row label of the outlier to inspect; 0 selects the row labelled 0.
has_getnext_outliers = md is not None and md.size > 0
if has_getnext_outliers:
    index = 0
    getnext_outlier = md.loc[index]
    # Convert the outlier's start/end datetimes to timestamps and plot that window.
    start_timestamp = pf.convert_datetime_to_timestamp(getnext_outlier['start_time'])
    end_timestamp = pf.convert_datetime_to_timestamp(getnext_outlier['end_time'])
    pt_analysis.plot_the_window(start_timestamp, end_timestamp)

### Analysis of training activity for each batch of data.

Since we have the start and end times of all the GetNext calls, we can find the amount of time spent by the training script on one batch of data.

1. We will get the time spent on each data batch by finding the difference between start time of current GetNext call and subsequent GetNext call. Let's call it 'BatchTime_in_seconds'
2. We will find the outliers in 'BatchTime_in_seconds' and start and end time for those outliers.
3. Obtain the framework and system metrics during those timestamps. This will indicate where the time was spent.



### Analyze the BatchTime_in_seconds

Following cell 
1. plots the BatchTime_in_seconds
2. Prints the median 'BatchTime_in_seconds'
3. Creates a dataframe that contains outliers.

In [None]:
# Run the BatchTime_in_seconds analysis described in the text above.
# `md_batch` is read by the outlier-plotting cell below (which checks for None / empty).
md_batch = pt_analysis.analyze_batchtime()

### Analyze a specific outlier in BatchTime_in_seconds

To analyze a specific outlier, select the row index corresponding to the outlier that we want to analyze and run the following cells.

In [None]:
# Timeline charts for one BatchTime_in_seconds outlier from the dataframe returned above.
# `index` is the row label of the outlier to inspect (1 here).
# NOTE(review): if row labels start at 0, label 1 is the second reported outlier, not the first — confirm.
# The window runs from the previous batch's start to this batch's start; the
# `start_timestamp` / `end_timestamp` set here are reused by the cells that follow.
has_batch_outliers = md_batch is not None and md_batch.size > 0
if has_batch_outliers:
    index = 1
    batch_outlier = md_batch.loc[index]
    start_timestamp = pf.convert_datetime_to_timestamp(batch_outlier['previous_batch_start'])
    end_timestamp = pf.convert_datetime_to_timestamp(batch_outlier['start_time'])
    view_timeline_charts = pt_analysis.plot_the_window(start_timestamp, end_timestamp)

The following cells fetch the Python profiler stats that were captured between the start and end timestamps.

In [None]:
# Get the python profiler stats for the given time window.
# Shell step: create /tmp/python_stats if it does not exist
# (presumably the local directory the stats files are placed in — confirm).
!mkdir -p /tmp/python_stats
from smdebug.profiler.analysis.python_profile_analysis import PythonProfileAnalysis
# NOTE(review): `profiler_output_path` is passed as `s3_path`; if an earlier cell set it
# to a local directory, confirm that this argument accepts a local path.
pfa = PythonProfileAnalysis(s3_path=profiler_output_path)

In [None]:
# Convert the selected window's bounds from microseconds to seconds
# (the same timestamps are passed as `*_time_us` further below).
us_per_sec = 1_000_000
starttime_sec = start_timestamp / us_per_sec
endtime_sec = end_timestamp / us_per_sec

In [None]:
# Fetch the python profile stats between the two epoch-second bounds computed above.
result = pfa.fetch_profile_stats_by_time(
    start_time_since_epoch_in_secs=starttime_sec,
    end_time_since_epoch_in_secs=endtime_sec,
)

In [None]:
# Display the stats of the first profile entry fetched for the selected window.
from IPython.display import display, HTML

if result:
    # `stats_path` of the first entry is handed to IPython's HTML display object
    # (presumably it points at an HTML report — confirm against smdebug docs).
    html_file = result[0].stats_path
    display(HTML(html_file))
else:
    # Nothing was fetched for this window: say so instead of failing on result[0].
    print("No python profiler stats were found for the selected time window.")

### Plot the timeline for the batch

The following plot indicates the window of time during which there is no specific training activity getting invoked.
Eliminating these idle time windows can improve the overall training time.

In [None]:
# The framework metrics returned for a window can include events that started inside it
# but had not completed by its end. Order the events by end time and keep only the
# events that completed within the window.

sys_metrics, framework_metrics = pf.get_profiler_data_by_time(
    start_time_us=start_timestamp,
    end_time_us=end_timestamp,
)

# Parse both time columns into pandas datetimes using the same format string.
timestamp_format = '%Y-%m-%dT%H:%M:%S:%f'
for time_column in ('start_time', 'end_time'):
    framework_metrics[time_column] = pd.to_datetime(framework_metrics[time_column], format=timestamp_format)

framework_metrics = framework_metrics.sort_values(by=['end_time'])

# Keep the rows whose end time is not later than the window end (given in microseconds).
window_end = pd.to_datetime(end_timestamp, unit='us')
completed_in_window = framework_metrics['end_time'] <= window_end
framework_metrics_filtered = framework_metrics.loc[completed_in_window]

# Cell output: the filtered events ordered by start time (the result is displayed, not stored).
framework_metrics_filtered.sort_values(by='start_time')


In [None]:
# Bokeh pieces used by the timeline plot in the next cell.
from bokeh.plotting import figure, show, output_notebook, output_file
from bokeh.models import ColumnDataSource, Range1d, CustomJS
from bokeh.models.tools import HoverTool
from datetime import datetime
# Direct Bokeh output to the notebook so that show(...) renders inline.
output_notebook()


In [None]:

# Horizontal-bar timeline: one bar per framework event in `framework_metrics_filtered`,
# spanning the event's start_time..end_time and placed at the row's dataframe index.
time_range = Range1d(
    framework_metrics_filtered.start_time.min(),
    framework_metrics_filtered.end_time.max(),
    name='TimeStamp',
)
G = figure(
    title='TIme spent on a batch',
    x_axis_type='datetime',
    width=800,
    height=500,
    x_range=time_range,
    tools="crosshair,xbox_select,pan,reset,save,xwheel_zoom",
)

# Hover tooltip: event name plus its start and end times.
hover = HoverTool(
    tooltips=(
        "Task: @framework_metric<br>"
        "Start: @start_time<br>"
        "End: @end_time"
    )
)
G.add_tools(hover)

CDS = ColumnDataSource(framework_metrics_filtered)
G.hbar(
    y='index',
    left='start_time',
    right='end_time',
    height=5,
    color="#CAB2D6",
    fill_color="#CAB2D6",
    source=CDS,
)

# Browser-side handler for selections: logs the selected row indices and a line with
# their min/max to the browser console (the line is only logged, not added to the page).
selection_logger_js = """
                            console.log('Running CustomJS callback now.');
                            var inds = s1.selected.indices;
                            console.log(inds);
                            var line = "<span style=float:left;clear:left;font_size=13px><b> Selected index range: [" + Math.min.apply(Math,inds) + "," + Math.max.apply(Math,inds) + "]</b></span>\\n";
                            console.log(line);"""
callback = CustomJS(args=dict(s1=CDS), code=selection_logger_js)

G.js_on_event("selectiongeometry", callback)
show(G, notebook_handle=True)