In [63]:
import sagemaker
from sagemaker import get_execution_role

# Resolve the execution role and create one SageMaker session; the bucket,
# account id and region below are all derived from that single session.
role = get_execution_role()
sess = sagemaker.Session()

default_bucket = sess.default_bucket()  # or use your own custom bucket name
account = sess.account_id()
base_job_prefix = 'BirdEnd2End'
# Reuse `sess` rather than constructing a second Session just to read the region.
region = sess.boto_region_name

In [64]:
from sagemaker.tensorflow import TensorFlow

from sagemaker.debugger import (ProfilerConfig,
                                FrameworkProfile,
                                CollectionConfig,
                                DebuggerHookConfig,
                                DetailedProfilingConfig, 
                                DataloaderProfilingConfig, 
                                PythonProfilingConfig,
                                Rule,
                                PythonProfiler,
                                cProfileTimer,
                                ProfilerRule,
                                rule_configs)

from sagemaker.inputs import TrainingInput


# TensorFlow version passed as `framework_version` to the estimators below.
TF_FRAMEWORK_VERSION = "2.1"

In [65]:
# The S3 location where the debugger output is stored was covered in the previous step.

# Profiler configuration covering both system metrics and framework metrics.
# The three framework profilers use staggered start steps (5, 7, 9) and each
# runs for a 10-step window.
profiler_config = ProfilerConfig(
    # System monitor interval, in milliseconds.
    system_monitor_interval_millis=500,
    framework_profile_params=FrameworkProfile(
        # Detailed framework profiling window.
        detailed_profiling_config=DetailedProfilingConfig(start_step=5, num_steps=10),
        # Data-loader profiling window.
        dataloader_profiling_config=DataloaderProfilingConfig(start_step=7, num_steps=10),
        # Python-level profiling: cProfile using the TOTAL_TIME timer.
        python_profiling_config=PythonProfilingConfig(
            start_step=9,
            num_steps=10,
            python_profiler=PythonProfiler.CPROFILE,
            cprofile_timer=cProfileTimer.TOTAL_TIME,
        ),
    ),
)

# Debugger hook: save the 'weights' and 'gradients' tensor collections.
debugger_hook_config = DebuggerHookConfig(
    collection_configs=[
        CollectionConfig(name=collection_name)
        for collection_name in ('weights', 'gradients')
    ]
)

# Rules evaluated against the training job. Only the profiler report is active.
# Two built-in tensor rules are currently disabled and can be appended if needed:
#     Rule.sagemaker(rule_configs.loss_not_decreasing())
#     Rule.sagemaker(rule_configs.overfit())
rules = [ProfilerRule.sagemaker(rule_configs.ProfilerReport())]

In [66]:
import uuid


# Base S3 prefix for this project; used as the estimators' `output_path`.
model_path = f"s3://{default_bucket}/{base_job_prefix}"

# Checkpoint prefix under the same base path, made unique per run with a UUID.
# NOTE(review): no estimator in the visible cells receives this URI — confirm
# whether it should be passed as `checkpoint_s3_uri=` or removed.
checkpoint_s3_uri = f"{model_path}/checkpoints/{uuid.uuid4()}"

# Training hardware.
instance_type = 'ml.p3.2xlarge'  # alternative: 'ml.p3.16xlarge'
instance_count = 1

# NOTE(review): not referenced by any visible cell — confirm it is still needed.
gpus_per_host = 1

# Hyperparameters handed to the training script. The trailing comments are
# presumably values used in earlier runs.
hyperparameters = dict(
    lr=0.00012367461028516715,    # 0.000019
    batch_size=8,
    epochs=38,                    # 36
    dropout=0.7459862089753134,   # 0.76
    data_dir='/opt/ml/input/data',
)
    
# Regexes applied to the training log to extract metrics.
#
# Two fixes over a plain 'loss: ([0-9\\.]+)' style pattern:
#   * A leading \b anchors each key at a word boundary. Without it, 'loss: '
#     also matches inside 'val_loss: ' (and 'accuracy: ' inside
#     'val_accuracy: '), so validation values leak into the training metrics.
#   * The optional exponent suffix captures values printed in scientific
#     notation (e.g. '9.1234e-04') instead of truncating them to '9.1234'.
metric_definitions = [
    {'Name': metric_name, 'Regex': f'\\b{log_key}: ([0-9\\.]+(?:[eE][-+]?[0-9]+)?)'}
    for metric_name, log_key in (
        ('loss', 'loss'),
        ('acc', 'accuracy'),
        ('val_loss', 'val_loss'),
        ('val_acc', 'val_accuracy'),
    )
]

# Parameter-server distribution is explicitly disabled.
distribution = {'parameter_server': {'enabled': False}}

# TensorFlow estimator for the instrumented training run: same script and
# hyperparameters as the rest of the notebook, plus profiler, debugger hook
# and rules.
estimator = TensorFlow(
    # Training script and its arguments
    entry_point='train.py',
    source_dir='pipeline/code',
    hyperparameters=hyperparameters,
    # Container / runtime
    framework_version=TF_FRAMEWORK_VERSION,
    py_version='py3',
    script_mode=True,
    role=role,
    # Infrastructure and data delivery
    instance_type=instance_type,
    instance_count=instance_count,
    distribution=distribution,
    input_mode='Pipe',
    # Outputs, naming and metrics
    output_path=model_path,
    base_job_name=f"{base_job_prefix}-debugger",
    metric_definitions=metric_definitions,
    # Debugger / profiler
    profiler_config=profiler_config,
    debugger_hook_config=debugger_hook_config,
    rules=rules,
)

In [67]:
# Input channels for training. `output_s3_uri` is the S3 prefix holding the
# output of the previous (preprocessing) step; the train/valid data sit in
# sub-prefixes beneath it.
output_s3_uri = f's3://{default_bucket}/{base_job_prefix}/scriptprocessor/output/preprocess'
s3_train = f'{output_s3_uri}/train'
s3_valid = f'{output_s3_uri}/valid'

# Distribution mode applied to both channels.
DISTRIBUTION_MODE = 'FullyReplicated'

train_in = TrainingInput(s3_data=s3_train, distribution=DISTRIBUTION_MODE)
val_in = TrainingInput(s3_data=s3_valid, distribution=DISTRIBUTION_MODE)

# Channel names map to the 'train' and 'valid' inputs of the training job.
inputs = {'train': train_in, 'valid': val_in}

# Launch the training job.
estimator.fit(inputs)

2022-03-17 20:19:29 Starting - Starting the training job...ProfilerReport: InProgress
...
2022-03-17 20:20:12 Starting - Preparing the instances for training......
2022-03-17 20:21:18 Downloading - Downloading input data...
2022-03-17 20:21:53 Training - Downloading the training image............
2022-03-17 20:23:54 Training - Training image download completed. Training in progress..[34m2022-03-17 20:24:00,682 sagemaker-training-toolkit INFO     Imported framework sagemaker_tensorflow_container.training[0m
[34m2022-03-17 20:24:01,246 sagemaker-training-toolkit INFO     Invoking user script[0m
[34mTraining Env:[0m
[34m{
    "additional_framework_parameters": {
        "sagemaker_parameter_server_enabled": false
    },
    "channel_input_dirs": {
        "train": "/opt/ml/input/data/train",
        "valid": "/opt/ml/input/data/valid"
    },
    "current_host": "algo-1",
    "framework_module": "sagemaker_tensorflow_container.training:main",
    "hosts": [
        "algo-1"
    ],
 

In [59]:
# Name of the most recent training job launched from `estimator`.
training_job_name = estimator.latest_training_job.name

# Report the S3 prefix built from the estimator's output path and the job name.
print(
    f"model artifacts file is uploaded here: {model_path}/{training_job_name}/output ========"
)



## Tuning Job

In [16]:
# Estimator used as the template for the tuning job. It mirrors the training
# estimator's script, hardware and metrics, but carries no profiler, debugger
# hook or rules, and uses the plain job prefix as its base name.
estimator2 = TensorFlow(
    entry_point='train.py',
    source_dir='pipeline/code',
    role=role,
    framework_version=TF_FRAMEWORK_VERSION,
    py_version='py3',
    script_mode=True,
    instance_type=instance_type,
    instance_count=instance_count,
    distribution=distribution,
    input_mode='Pipe',
    hyperparameters=hyperparameters,
    metric_definitions=metric_definitions,
    output_path=model_path,
    base_job_name=base_job_prefix,
)

In [17]:
from sagemaker.tuner import ContinuousParameter, IntegerParameter, HyperparameterTuner

# Search space for the tuning job; keys match entries in `hyperparameters`.
hyperparameter_ranges = dict(
    epochs=IntegerParameter(30, 40),
    dropout=ContinuousParameter(0.5, 0.8),
    lr=ContinuousParameter(0.00001, 0.001),
)

# Tune for the highest validation accuracy; 'val_acc' is one of the names
# declared in `metric_definitions`.
objective_metric_name = 'val_acc'
objective_type = 'Maximize'

In [18]:
# Hyperparameter tuning job built on `estimator2`: two training jobs in total,
# both allowed to run at the same time.
tuner = HyperparameterTuner(
    estimator=estimator2,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=metric_definitions,
    objective_type=objective_type,
    max_jobs=2,
    max_parallel_jobs=2,
    base_tuning_job_name=f"{base_job_prefix}-tuning",
)

# Start tuning on the same input channels used for the training job.
tuner.fit(inputs)

....................................................................................................................................................................................................................................................................................................!
