aws · shunjd · Feb 10, 2020 · Feb 10, 2020
diff --git a/src/stepfunctions/steps/sagemaker.py b/src/stepfunctions/steps/sagemaker.py
@@ -66,6 +66,12 @@ def __init__(self, state_id, estimator, job_name, data=None, hyperparameters=Non
         else:
             parameters = training_config(estimator=estimator, inputs=data, mini_batch_size=mini_batch_size)
 
+        if estimator.debugger_hook_config != None:
+            parameters['DebugHookConfig'] = estimator.debugger_hook_config._to_request_dict()
+
+        if estimator.rules != None:
+            parameters['DebugRuleConfigurations'] = [rule.to_debugger_rule_config_dict() for rule in estimator.rules]
+
         if isinstance(job_name, (ExecutionInput, StepInput)):
             parameters['TrainingJobName'] = job_name
 

diff --git a/src/stepfunctions/template/pipeline/inference.py b/src/stepfunctions/template/pipeline/inference.py
@@ -204,6 +204,10 @@ def execute(self, job_name=None, hyperparameters=None):
             s3_bucket=self.s3_bucket,
             pipeline_name=self.workflow.name
         )
+        inputs[StepId.TrainPreprocessor.value]['DebugHookConfig']['S3OutputPath'] = 's3://{s3_bucket}/{pipeline_name}/models/debug'.format(
+            s3_bucket=self.s3_bucket,
+            pipeline_name=self.workflow.name
+        )
         inputs[StepId.CreatePreprocessorModel.value]['PrimaryContainer']['ModelDataUrl'] = '{s3_uri}/{job}/output/model.tar.gz'.format(
             s3_uri=inputs[StepId.TrainPreprocessor.value]['OutputDataConfig']['S3OutputPath'],
             job=inputs[StepId.TrainPreprocessor.value]['TrainingJobName']
@@ -236,6 +240,10 @@ def execute(self, job_name=None, hyperparameters=None):
             s3_bucket=self.s3_bucket,
             pipeline_name=self.workflow.name
         )
+        inputs[StepId.Train.value]['DebugHookConfig']['S3OutputPath'] = 's3://{s3_bucket}/{pipeline_name}/models/debug'.format(
+            s3_bucket=self.s3_bucket,
+            pipeline_name=self.workflow.name
+        )
         inputs[StepId.CreatePipelineModel.value]['ModelName'] = job_name
         self.replace_sagemaker_job_name(inputs[StepId.Train.value], inputs[StepId.Train.value]['TrainingJobName'])
 

diff --git a/tests/unit/test_pipeline.py b/tests/unit/test_pipeline.py
@@ -18,6 +18,7 @@
 from sagemaker.sklearn.estimator import SKLearn
 from unittest.mock import MagicMock, patch
 from stepfunctions.template import TrainingPipeline, InferencePipeline
+from sagemaker.debugger import DebuggerHookConfig
 
 from tests.unit.utils import mock_boto_api_call
 
@@ -65,6 +66,10 @@ def sklearn_preprocessor():
         source_dir=source_dir,
         sagemaker_session=sagemaker_session
     )
+
+    sklearn_preprocessor.debugger_hook_config = DebuggerHookConfig(
+        s3_output_path='s3://sagemaker/source/debug'
+    )
 
     return sklearn_preprocessor
 
@@ -86,6 +91,10 @@ def linear_learner_estimator():
         sagemaker_session=sagemaker_session
     )
 
+    ll_estimator.debugger_hook_config = DebuggerHookConfig(
+        s3_output_path='s3://sagemaker/models/debug'
+    )
+
     ll_estimator.set_hyperparameters(feature_dim=10, predictor_type='regressor', mini_batch_size=32)
 
     return ll_estimator
@@ -238,6 +247,7 @@ def test_inference_pipeline(sklearn_preprocessor, linear_learner_estimator):
     assert result['States']['Train Preprocessor'] == {
         'Parameters': {
             'AlgorithmSpecification.$': "$$.Execution.Input['Train Preprocessor'].AlgorithmSpecification",
+            'DebugHookConfig.$': "$$.Execution.Input['Train Preprocessor'].DebugHookConfig",
             'HyperParameters.$': "$$.Execution.Input['Train Preprocessor'].HyperParameters",
             'InputDataConfig.$': "$$.Execution.Input['Train Preprocessor'].InputDataConfig",
             'OutputDataConfig.$': "$$.Execution.Input['Train Preprocessor'].OutputDataConfig",
@@ -342,6 +352,9 @@ def test_inference_pipeline(sklearn_preprocessor, linear_learner_estimator):
             'OutputDataConfig': {
                 'S3OutputPath': 's3://sagemaker-us-east-1/inference-pipeline/models'
             },
+            'DebugHookConfig': {
+                'S3OutputPath': 's3://sagemaker-us-east-1/inference-pipeline/models/debug'
+            },
             'ResourceConfig': {
                 'InstanceCount': 1,
                 'InstanceType': 'ml.c4.xlarge',
@@ -406,6 +419,7 @@ def test_inference_pipeline(sklearn_preprocessor, linear_learner_estimator):
                 }
             }],
             'OutputDataConfig': { 'S3OutputPath': 's3://sagemaker-us-east-1/inference-pipeline/models' },
+            'DebugHookConfig': { 'S3OutputPath': 's3://sagemaker-us-east-1/inference-pipeline/models/debug' },
             'ResourceConfig': {
                 'InstanceCount': 1,
                 'InstanceType': 'ml.c4.xlarge',

diff --git a/tests/unit/test_sagemaker_steps.py b/tests/unit/test_sagemaker_steps.py
@@ -21,6 +21,7 @@
 from sagemaker.tensorflow import TensorFlow
 from sagemaker.pipeline import PipelineModel
 from sagemaker.model_monitor import DataCaptureConfig
+from sagemaker.debugger import Rule, rule_configs, DebuggerHookConfig, CollectionConfig
 
 from unittest.mock import MagicMock, patch
 from stepfunctions.steps.sagemaker import TrainingStep, TransformStep, ModelStep, EndpointStep, EndpointConfigStep
@@ -58,6 +59,54 @@ def pca_estimator():
 
     return pca
 
+@pytest.fixture
+def pca_estimator_with_debug_hook():
+    s3_output_location = 's3://sagemaker/models'
+
+    hook_config = DebuggerHookConfig(
+        s3_output_path='s3://sagemaker/output/debug',
+        hook_parameters={
+            "save_interval": "1"
+        },
+        collection_configs=[
+            CollectionConfig("hyperparameters"),
+            CollectionConfig("metrics")
+        ]
+    )
+
+    rules = [Rule.sagemaker(rule_configs.confusion(),
+        rule_parameters={
+            "category_no": "15",
+            "min_diag": "0.7",
+            "max_off_diag": "0.3",
+            "start_step": "17",
+            "end_step": "19"}
+    )]
+
+    pca = sagemaker.estimator.Estimator(
+        PCA_IMAGE,
+        role=EXECUTION_ROLE,
+        train_instance_count=1,
+        train_instance_type='ml.c4.xlarge',
+        output_path=s3_output_location,
+        debugger_hook_config = hook_config,
+        rules=rules
+    )
+
+    pca.set_hyperparameters(
+        feature_dim=50000,
+        num_components=10,
+        subtract_mean=True,
+        algorithm_mode='randomized',
+        mini_batch_size=200
+    )
+
+    pca.sagemaker_session = MagicMock()
+    pca.sagemaker_session.boto_region_name = 'us-east-1'
+    pca.sagemaker_session._default_bucket = 'sagemaker'
+
+    return pca
+
 @pytest.fixture
 def pca_model():
     model_data = 's3://sagemaker/models/pca.tar.gz'
@@ -95,6 +144,10 @@ def tensorflow_estimator():
         checkpoint_path='s3://sagemaker/models/sagemaker-tensorflow/checkpoints'
     )
 
+    estimator.debugger_hook_config = DebuggerHookConfig(
+        s3_output_path='s3://sagemaker/models/debug'
+    )
+
     estimator.sagemaker_session = MagicMock()
     estimator.sagemaker_session.boto_region_name = 'us-east-1'
     estimator.sagemaker_session._default_bucket = 'sagemaker'
@@ -148,6 +201,65 @@ def test_training_step_creation(pca_estimator):
         'End': True
     }
 
+@patch('botocore.client.BaseClient._make_api_call', new=mock_boto_api_call)
+def test_training_step_creation_with_debug_hook(pca_estimator_with_debug_hook):
+    step = TrainingStep('Training',
+        estimator=pca_estimator_with_debug_hook,
+        job_name='TrainingJob')
+    assert step.to_dict() == {
+        'Type': 'Task',
+        'Parameters': {
+            'AlgorithmSpecification': {
+                'TrainingImage': PCA_IMAGE,
+                'TrainingInputMode': 'File'
+            },
+            'OutputDataConfig': {
+                'S3OutputPath': 's3://sagemaker/models'
+            },
+            'StoppingCondition': {
+                'MaxRuntimeInSeconds': 86400
+            },
+            'ResourceConfig': {
+                'InstanceCount': 1,
+                'InstanceType': 'ml.c4.xlarge',
+                'VolumeSizeInGB': 30
+            },
+            'RoleArn': EXECUTION_ROLE,
+            'HyperParameters': {
+                'feature_dim': '50000',
+                'num_components': '10',
+                'subtract_mean': 'True',
+                'algorithm_mode': 'randomized',
+                'mini_batch_size': '200'
+            },
+            'DebugHookConfig': {
+                'S3OutputPath': 's3://sagemaker/output/debug',
+                'HookParameters': {'save_interval': '1'},
+                'CollectionConfigurations': [
+                    {'CollectionName': 'hyperparameters'},
+                    {'CollectionName': 'metrics'}
+                ]
+            },
+            'DebugRuleConfigurations': [
+                {
+                    'RuleConfigurationName': 'Confusion',
+                    'RuleEvaluatorImage': '503895931360.dkr.ecr.us-east-1.amazonaws.com/sagemaker-debugger-rules:latest',
+                    'RuleParameters': {
+                        'rule_to_invoke': 'Confusion',
+                        'category_no': '15',
+                        'min_diag': '0.7',
+                        'max_off_diag': '0.3',
+                        'start_step': '17',
+                        'end_step': '19'
+                    }
+                }
+            ],
+            'TrainingJobName': 'TrainingJob'
+        },
+        'Resource': 'arn:aws:states:::sagemaker:createTrainingJob.sync',
+        'End': True
+    }
+
 @patch('botocore.client.BaseClient._make_api_call', new=mock_boto_api_call)
 def test_training_step_creation_with_model(pca_estimator):
     training_step = TrainingStep('Training', estimator=pca_estimator, job_name='TrainingJob')
@@ -231,6 +343,9 @@ def test_training_step_creation_with_framework(tensorflow_estimator):
             'OutputDataConfig': {
                 'S3OutputPath': 's3://sagemaker/models'
             },
+            'DebugHookConfig': {
+                'S3OutputPath': 's3://sagemaker/models/debug'
+            },
             'StoppingCondition': {
                 'MaxRuntimeInSeconds': 86400
             },