In [5]:
import sagemaker
from sagemaker.tensorflow import TensorFlow
from sagemaker.debugger import (
    # profiler config
    ProfilerConfig,
    FrameworkProfile,
    DetailedProfilingConfig,
    DataloaderProfilingConfig,
    PythonProfilingConfig,
    PythonProfiler,
    cProfileTimer,
    # debugger config
    DebuggerHookConfig,
    CollectionConfig,
    # rules
    Rule,
    ProfilerRule,
    rule_configs
)


In [6]:
import boto3

role = sagemaker.get_execution_role()
print("RoleArn:", role)

session = boto3.session.Session()
region = session.region_name
print("Region:", region)

RoleArn: arn:aws:iam::658994994074:role/service-role/AmazonSageMaker-ExecutionRole-20211220T111268
Region: us-east-1


### 1. Specify the profiler configuration

In [7]:
profiler_config = ProfilerConfig(
    # Record system metrics every 500 milliseconds
    system_monitor_interval_millis=500,
    # Also collect data from Tensorflow's built-in Profiler
    # such as data from the initialization stage, data loaders,
    # detailed profiling between training steps etc.
    framework_profile_params=FrameworkProfile(
        # Collect data from Tensorflow's built-in profiler
        detailed_profiling_config=DetailedProfilingConfig(
            start_step=5, 
            num_steps=1
        ),
        # Profile data-loaders
        dataloader_profiling_config=DataloaderProfilingConfig(
            start_step=7, 
            num_steps=1
        ),
        # Activate the Python and cPython native profilers!
        python_profiling_config=PythonProfilingConfig(
            start_step=9, 
            num_steps=1, 
            python_profiler=PythonProfiler.CPROFILE, 
            cprofile_timer=cProfileTimer.TOTAL_TIME
        )
    ),
)

### 2. Specify the Debugger configuration

Configure the debugger to save tensors collected during training time, such as losses, weights, gradients, biases.

In [8]:
# and debugger configuration
debugger_hook_config = DebuggerHookConfig(
    # Specify what "kind" of tensors should be collected by the debugger
    # For more information on collections, see:
    # https://github.com/awslabs/sagemaker-debugger/blob/master/docs/api.md#collection
    collection_configs=[
        # Collections that debugger knows how to collect
        # like weights, gradients, losses and biases
        CollectionConfig(name="weights"),
        
        # NOT NEEDED: Note that gradients and losses are collected anyway
        # because  some rules require them (loss not decreasing, vanishing gradients)
        #CollectionConfig(name="gradients"),
        #CollectionConfig(name="losses"),
        
        CollectionConfig(name="biases"),
        # and a custom collection defined by a regex
        CollectionConfig(
            name="conv0_tensors",
            parameters={
                # save all tensors whose name starts with 'conv0.'
                "include_regex": "conv0.*",
                # save these tensors every 100 steps (batches)
                "save_interval": "100"
            }
        ),
    ],
    hook_parameters={
        'save_interval': '10'
    }
)

## 3. Specify monitoring rules

We also want debugger to monitor tensors emitted during training and alert us if certain conditions are breached.
In this example, we will enable the following rules:

Debugger rules enabled:
* loss is not decreasing
* overfit
* overtraining
* stalled training
* vanishing gradients
* class imbalance

Profiler rules enabled:
* CPU bottlenecks
* I/O Bottlenecks
* GPU utilization
* Multi-GPU LoadBalancing
* GPU memory Analysis
* etc.


Debugger already provides a set of built-in rules, but we can also add our own custom rules.
For a complete list of rules, see [Configure Debugger Built-in Rules](https://docs.aws.amazon.com/sagemaker/latest/dg/use-debugger-built-in-rules.html) and [Profiling report](https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-profiling-report.html).

In [9]:
rules = [
    # add all monitoring and profiling rules
    ProfilerRule.sagemaker(rule_configs.ProfilerReport()),
    # loss not decreasing
    Rule.sagemaker(rule_configs.loss_not_decreasing()),
    Rule.sagemaker(rule_configs.overfit()),
    Rule.sagemaker(rule_configs.overtraining()),
    Rule.sagemaker(rule_configs.stalled_training_rule()),
    Rule.sagemaker(rule_configs.vanishing_gradient()),
    Rule.sagemaker(rule_configs.class_imbalance()),
]

## 4. Prepare and launch a training job 

Specify the instance type and count, the image, the training script and the debugger/profiler configurations.

In [10]:
# Prepare a training job
estimator = TensorFlow(
    # Provide our notebook execution role to grant access to the Training Job API
    role=sagemaker.get_execution_role(),
    # Specify a docker image with Tensorflow 2.3.1
    image_uri=f"763104351884.dkr.ecr.{region}.amazonaws.com/tensorflow-training:2.3.1-gpu-py37-cu110-ubuntu18.04",
    # We don't need more than 1 instance. It's a small dataset.
    instance_count=1,
    # 32 vCPUs and 244 GiB of RAM
    instance_type="ml.p2.8xlarge",
    # Point to our training script
    entry_point="resnet50-cifar10.py",
    # Profiler and Debugger configuration
    profiler_config=profiler_config,
    debugger_hook_config=debugger_hook_config,
    rules=rules
)

### Launch it!

In [11]:
# TRAIN!
estimator.fit(wait=False)
# Get the training job name
estimator.latest_training_job.job_name

'tensorflow-training-2022-02-18-21-15-00-600'

## 5. Now take a look at the Debugger Insights Dashboard to analyze debugging/profiling data

.... Go

## 6. For a more thorough analysis, download the tensors collected by Debugger

In [12]:
def job_status():
    description = client.describe_training_job(TrainingJobName=job_name)
    return description["TrainingJobStatus"]

job_name = estimator.latest_training_job.name
client = estimator.sagemaker_session.sagemaker_client


while job_status() == "InProgress":
    print("Training job status: ", job_status())
    print("Waiting for training job to start training...")
    time.sleep(15)

Training job status:  InProgress
Waiting for training job to start training...


NameError: name 'time' is not defined

In [None]:
import os
from urllib.parse import urlparse
from smdebug.trials import create_trial

# this is where we create a Trial object that allows access to saved tensors
path = estimator.latest_job_debugger_artifacts_path()
print("Tensors are stored in: ", path)
trial = create_trial(estimator.latest_job_debugger_artifacts_path())


Tensors are stored in:  s3://sagemaker-us-east-1-658994994074/tensorflow-training-2022-02-18-19-59-30-787/debug-output
[2022-02-18 20:00:20.771 tensorflow-2-6-gpu--ml-g4dn-xlarge-0dac2104acc07d6f4758a14ad24a:284 INFO s3_trial.py:42] Loading trial debug-output at path s3://sagemaker-us-east-1-658994994074/tensorflow-training-2022-02-18-19-59-30-787/debug-output


In [None]:
trial1.steps()