In [None]:
!pip install smdebug matplotlib

In [83]:
import sys
import sagemaker
import smdebug

# Report the versions of the libraries this notebook is running against.
for label, module in (("sagemaker =", sagemaker), ("smdebug =", smdebug)):
    print(label, module.__version__)

sagemaker = 2.70.0
smdebug = 1.0.12


## Launch a training job


In [91]:
import boto3
from sagemaker.tensorflow import TensorFlow
from sagemaker.debugger import Rule, ProfilerRule, rule_configs

# The region of the default boto3 session selects the regional ECR registry
# that the training image is pulled from.
session = boto3.session.Session()
region = session.region_name

# IAM role the training job is launched with.
execution_role = sagemaker.get_execution_role()

# TensorFlow 2.3.1 GPU training image.
training_image_uri = f"763104351884.dkr.ecr.{region}.amazonaws.com/tensorflow-training:2.3.1-gpu-py37-cu110-ubuntu18.04"

estimator = TensorFlow(
    # Training script, resolved relative to the uploaded source directory.
    entry_point="./src/resnet50-cifar10.py",
    source_dir=".",
    role=execution_role,
    image_uri=training_image_uri,
    # P2 instances are general-purpose GPU compute instances; one is enough here.
    instance_type="ml.p2.xlarge",
    instance_count=1,
    # Upper bound on the training run time, in seconds.
    max_run=3600,
)

In [92]:
# Submit the training job. wait=False asks fit() not to block on the job, so the
# notebook stays usable and the job status is checked explicitly in later cells.
estimator.fit(wait=False)

INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: tensorflow-training-2022-02-18-20-43-49-830


## Monitor the job from "Trials and Experiments" or from the AWS Console for SageMaker.

## Check in on the training job status

In [93]:
# Name of the job that the last fit() call created.
latest_job = estimator.latest_training_job
job_name = latest_job.name
print("Training job name: {}".format(job_name))

# Low-level SageMaker client taken from the estimator's session; later cells
# reuse `client`, `job_name` and `description`.
sagemaker_session = estimator.sagemaker_session
client = sagemaker_session.sagemaker_client
description = client.describe_training_job(TrainingJobName=job_name)

current_status = description["TrainingJobStatus"]
print(current_status)

Training job name: tensorflow-training-2022-02-18-20-43-49-830
InProgress


In [64]:
# Show why the job failed, based on the most recently fetched description.
job_failed = description["TrainingJobStatus"] == "Failed"
if job_failed:
    print(description["FailureReason"])

In [69]:
import time

# Secondary statuses at which waiting ends: the job is training (so Debugger
# output can start to appear) or it has already finished.
READY_SECONDARY_STATUSES = {"Training", "Stopped", "Completed", "Failed"}
# Primary statuses after which the job will never reach "Training". Checking
# them as well keeps the loop from spinning forever on a job that ended with a
# secondary status that is not listed above.
TERMINAL_PRIMARY_STATUSES = {"Completed", "Failed", "Stopped"}
# Seconds to pause between two status polls.
POLL_INTERVAL_SECONDS = 30


def _done_waiting(job_description):
    """Return True once the job is training or has ended.

    Args:
        job_description: Response dict of ``describe_training_job``; only the
            "TrainingJobStatus" and "SecondaryStatus" keys are read.
    """
    return (
        job_description["TrainingJobStatus"] in TERMINAL_PRIMARY_STATUSES
        or job_description["SecondaryStatus"] in READY_SECONDARY_STATUSES
    )


# Poll the job until it starts training or ends, printing one status line per poll.
while not _done_waiting(description):
    description = client.describe_training_job(TrainingJobName=job_name)
    primary_status = description["TrainingJobStatus"]
    secondary_status = description["SecondaryStatus"]

    # Query the rule summary once per poll (each call hits the service) and
    # tolerate a job that has no rule attached instead of failing on [0].
    rule_summaries = estimator.latest_training_job.rule_job_summary()
    if rule_summaries:
        rule_name = rule_summaries[0]["RuleConfigurationName"]
        rule_status = rule_summaries[0]["RuleEvaluationStatus"]
    else:
        rule_name, rule_status = "<no rule>", "N/A"

    print(
        "Current job status: [PrimaryStatus: {}, SecondaryStatus: {}] | {} Rule Evaluation Status: {}".format(
            primary_status,
            secondary_status,
            rule_name,
            rule_status,
        )
    )

    # Only pause when another poll is actually going to happen.
    if not _done_waiting(description):
        time.sleep(POLL_INTERVAL_SECONDS)

In [None]:
from smdebug.trials import create_trial

# Location of the Debugger artifacts written by the most recent training job.
debugger_artifacts_path = estimator.latest_job_debugger_artifacts_path()

# Trial object through which the cells below inspect that output.
tutorial_trial = create_trial(debugger_artifacts_path)


In [None]:
# Display the location this trial reads from — presumably the Debugger
# artifacts path that was passed to create_trial; confirm against the output.
tutorial_trial.path


In [None]:
# List the names of the tensors available in the trial.
# NOTE(review): presumably this only shows tensors the training job has already
# written, so the list may be empty or partial while the job is still starting.
tutorial_trial.tensor_names()
