### Import and Install Necessary Packages 

In [1]:
!pip install sagemaker-experiments==0.1.24
!pip install smdebug==0.9.4

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
import boto3
import sagemaker
from sagemaker.amazon.amazon_estimator import get_image_uri
import os 

### Define S3 Bucket and Prefix of Training and Testing Data 

In [3]:
bucket=sagemaker.Session().default_bucket()
prefix = 'demand-prediction'

### Get xgboost 1.0-1 Container 

In [4]:
container = get_image_uri(boto3.Session().region_name, 'xgboost',repo_version="1.0-1")

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


### Produce s3_input object for SageMaker 

In [5]:
s3_input_train = sagemaker.s3_input(s3_data='s3://{}/{}/train'.format(bucket, prefix), content_type='csv')
s3_input_validation = sagemaker.s3_input(s3_data='s3://{}/{}/validation/'.format(bucket, prefix), content_type='csv')


's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


### Create SageMaker Experiment 

In [6]:
from smexperiments.experiment import Experiment
from sagemaker import get_execution_role
import time 
import boto3

role = get_execution_role()
account_id = role.split(':')[4]
region = boto3.Session().region_name

sm = boto3.Session().client(service_name='sagemaker',region_name=region)
m5predict_experiment = Experiment.create(
    experiment_name=f"m5-predict-{int(time.time())}", 
    description="Predict the sales units of items", 
    sagemaker_boto_client=sm)
print(m5predict_experiment)


Experiment(sagemaker_boto_client=<botocore.client.SageMaker object at 0x7f0c0eae78d0>,experiment_name='m5-predict-1603732816',description='Predict the sales units of items',tags=None,experiment_arn='arn:aws:sagemaker:us-east-1:230755935769:experiment/m5-predict-1603732816',response_metadata={'RequestId': 'ea914753-23f4-4e1e-bf3f-49853ab2f5fd', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'ea914753-23f4-4e1e-bf3f-49853ab2f5fd', 'content-type': 'application/x-amz-json-1.1', 'content-length': '93', 'date': 'Mon, 26 Oct 2020 17:20:16 GMT'}, 'RetryAttempts': 0})


### Define Debugger Hooks and Rules 

In [7]:
from sagemaker.debugger import DebuggerHookConfig, CollectionConfig

## define which kind of tensors to save  
def get_debugger_hook(bucket_path, save_interval):
    return DebuggerHookConfig(
        s3_output_path=bucket_path,  # Required
        collection_configs=[
            CollectionConfig(
                name="metrics",
                parameters={
                    "save_interval": str(save_interval)
                }
            ),
            CollectionConfig(
                name="feature_importance",
                parameters={
                    "save_interval": str(save_interval)
                }
            )  
        ],
    )

## define when to alert 
def get_debugger_rule(save_interval):
    return [
        Rule.sagemaker(
            rule_configs.loss_not_decreasing(),
            rule_parameters={
                "collection_names": "metrics",
                "num_steps": str(save_interval * 2),
            },
        ),
    ]

### Utility Functions to Observe Metrics Collected

In [8]:
import matplotlib.pyplot as plt
import seaborn as sns
import re


def get_data(trial, tname):
    """
    For the given tensor name, walks though all the iterations
    for which you have data and fetches the values.
    Returns the set of steps and the values.
    """
    tensor = trial.tensor(tname)
    steps = tensor.steps()
    vals = [tensor.value(s) for s in steps]
    return steps, vals

def plot_collection(trial, collection_name, regex='.*', figsize=(8, 6)):
    """
    Takes a `trial` and a collection name, and 
    plots all tensors that match the given regex.
    """
    fig, ax = plt.subplots(figsize=figsize)
    sns.despine()

    tensors = trial.collection(collection_name).tensor_names

    for tensor_name in sorted(tensors):
        if re.match(regex, tensor_name):
            steps, data = get_data(trial, tensor_name)
            ax.plot(steps, data, label=tensor_name)

    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    ax.set_xlabel('Iteration')

def plot_feature_importance(trial, importance_type="weight"):
    SUPPORTED_IMPORTANCE_TYPES = ["weight", "gain", "cover", "total_gain", "total_cover"]
    if importance_type not in SUPPORTED_IMPORTANCE_TYPES:
        raise ValueError(f"{importance_type} is not one of the supported importance types.")
    plot_collection(
        trial,
        "feature_importance",
        regex=f"feature_importance/{importance_type}/.*")
    

In [None]:

from smexperiments.trial import Trial
import sagemaker
from sagemaker.debugger import rule_configs, Rule
from sagemaker.debugger import DebuggerHookConfig, CollectionConfig
from smdebug.trials import create_trial


save_interval = 5
sess = sagemaker.Session()
trial_info = [] 

for i, tree_max_depth in enumerate([1,5,9]):
    # create trial
    trial_name = f"xgboost-training-job-{tree_max_depth}-treemaxdepth-{int(time.time())}"
    xgboost_trial = Trial.create(
        trial_name=trial_name, 
        experiment_name=m5predict_experiment.experiment_name,
        sagemaker_boto_client=sm,
    )
    
    bucket_path = 's3://{}/{}'.format(bucket,trial_name)
    debugger_hook_config_xgboost=get_debugger_hook(bucket_path, save_interval)
    debug_rules=get_debugger_rule(save_interval)

    

    xgb = sagemaker.estimator.Estimator(container,
                                    role, 
                                    train_instance_count=1, 
                                    train_instance_type='ml.m4.xlarge',
                                    output_path='s3://{}/{}/output'.format(bucket, prefix),
                                    sagemaker_session=sess,
                                    debugger_hook_config=debugger_hook_config_xgboost,
                                    rules=debug_rules)
    xgb.set_hyperparameters(max_depth=tree_max_depth,
                        eta=0.2,
                        gamma=4,
                        min_child_weight=6,
                        subsample=0.8,
                        silent=0,
                        objective='reg:linear',
                        num_round=20)
    
    xgboost_training_job_name = "m5predict-xgboost-training-job-{}".format(int(time.time()))
    
    
    xgb.fit(
        inputs={'train': s3_input_train, 'validation': s3_input_validation},
        job_name=xgboost_training_job_name,
        experiment_config={
            "TrialName": xgboost_trial.trial_name,
            "TrialComponentDisplayName": "Training",
        },
        wait=True,
    )
    s3_output_path = xgb.latest_job_debugger_artifacts_path()
    trial_info.append((trial_name, s3_output_path))

    # give it a while before dispatching the next training job
    time.sleep(2)




INFO:sagemaker:Creating training-job with name: m5predict-xgboost-training-job-1603751161


2020-10-26 22:26:02 Starting - Starting the training job...
2020-10-26 22:26:25 Starting - Launching requested ML instances
********* Debugger Rule Status *********
*
*  LossNotDecreasing: InProgress        
*
****************************************
......
2020-10-26 22:27:26 Starting - Preparing the instances for training......
2020-10-26 22:28:27 Downloading - Downloading input data...
2020-10-26 22:28:57 Training - Downloading the training image...
2020-10-26 22:29:28 Training - Training image download completed. Training in progress.[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value reg:linear to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input i

INFO:sagemaker:Creating training-job with name: m5predict-xgboost-training-job-1603751508


2020-10-26 22:31:48 Starting - Starting the training job...
2020-10-26 22:32:12 Starting - Launching requested ML instances
********* Debugger Rule Status *********
*
*  LossNotDecreasing: InProgress        
*
****************************************
.........
2020-10-26 22:33:33 Starting - Preparing the instances for training......
2020-10-26 22:34:34 Downloading - Downloading input data...
2020-10-26 22:35:19 Training - Downloading the training image...
2020-10-26 22:35:39 Training - Training image download completed. Training in progress.[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value reg:linear to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV inpu

In [None]:
for trial_name, s3_output_path in trial_info: 
    print("loading {} of trial:{}".format(s3_output_path, trial_name))
    trial = create_trial(s3_output_path)
    plot_collection(trial, "metrics")
    plot_feature_importance(trial)