## Use SageMaker experiments to track results of model training

In [57]:
import sys

In [2]:
#!{sys.executable} -m pip install sagemaker-experiments
#!{sys.executable} -m pip install torch
#!{sys.executable} -m pip install sagemaker

In [None]:
import time
import boto3
import numpy as np
import pandas as pd
%config InlineBackend.figure_format = "retina" # svg, eps, png, pdf
from matplotlib import pyplot as plt
from torchvision import datasets, transforms

import sagemaker
from sagemaker.session import Session
from sagemaker.analytics import ExperimentAnalytics

from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent
from smexperiments.tracker import Tracker

In [None]:
session = boto3.Session()
sm = session.client('sagemaker')
role = sagemaker.get_execution_role()

In [None]:
account_id = session.client('sts').get_caller_identity()["Account"]
bucket = f"sagemaker-experiments-{session.region_name}-{account_id}"
prefix = "mnist"

try:
    if session.region_name == "us-east-1":
        session.client('s3').create_bucket(Bucket=bucket)
    else:
        session.client('s3').create_bucket(Bucket=bucket,
                                           CreateBucketConfiguration={'LocationConstraint':
                                           session.region_name})
except Exception as e:
    print(e)

In [None]:
train_set = datasets.MNIST('mnist', train=True, transform=transforms.Compose([
    transforms.toTensor(),
    transforms.Normalize((0.1307,), (0.3081,))]),
download=True)

test_set = datasets.MNIST('mnist', train=False, transform=transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))]),
download=False)

In [54]:
train_set

Dataset MNIST
    Number of datapoints: 60000
    Root location: mnist
    Split: Train
    StandardTransform
Transform: Compose(
               ToTensor()
               Normalize(mean=(0.1307,), std=(0.3081,))
           )

In [None]:
plt.imshow(train_set.data[2].numpy())

# Upload Data to S3

In [55]:
inputs = sagemaker.Session().upload_data(path='mnist', bucket=bucket, key_prefix=prefix)
print(f"Input location: {inputs}")

Input location: s3://sagemaker-experiments-us-east-1-264639154954/mnist


Now lets track the parameters from the data pre-processing step

This is a way to log preprocessing steps

In [110]:
with Tracker.create(display_name="Preprocessing", sagemaker_boto_client=sm) as tracker:
    tracker.log_parameters({
        "normalization_mean": 0.1307,
        "normalization_std": 0.3081,
    })

tracker.log_input(name="mnist-dataset", media_type="s3/uri", value=inputs)

### Step 1 - Set up the Experiment

Create an experiment to track all the model training iterations. Experiments are a great way to organize your data science work. You can create experiments to organize all your model development work for: [1] A business use case you are addressing (e.g. create experiment named "customer churn prediction"), or [2] A data science team that owns the experiment (e.g. create experiment named "marketig analytics experiment"), or [3] A specific data science and ML project. Think of it as a "folder" for organizing your "files".

### Create an Experiment - remember to refresh these each time you want to run a new experiment

In [123]:
mnist_experiment = Experiment.create(
    experiment_name=f"mnist-hand-written-digits-classification-{int(time.time())}",
    description="Classification of mnist hand-written digits",
    sagemaker_boto_client=sm
)
print(mnist_experiment)

Experiment(sagemaker_boto_client=<botocore.client.SageMaker object at 0x7f623a5b0390>,experiment_name='mnist-hand-written-digits-classification-1674927402',description='Classification of mnist hand-written digits',tags=None,experiment_arn='arn:aws:sagemaker:us-east-1:264639154954:experiment/mnist-hand-written-digits-classification-1674927402',response_metadata={'RequestId': '56068477-cbde-456c-9e17-41b233ee13f3', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '56068477-cbde-456c-9e17-41b233ee13f3', 'content-type': 'application/x-amz-json-1.1', 'content-length': '123', 'date': 'Sat, 28 Jan 2023 17:36:41 GMT'}, 'RetryAttempts': 0})


### Step 2 - Track Experiment
### Now create a Trial for each training run to track the inputs, parameters and metrics.

While training the CNN model on Sagemaker, we will experiment with several values fo the number of hidden channels in the model. We will create a Trial to track each training job run. We will also create a TrialComponent from the tracker we created before, and add to the Trial. This will enrich the trial with the parameters we captured from the data pre-preocessing stage.

Note the execution of the following code takes a while

In [124]:
from sagemaker.pytorch import PyTorch

hidden_channel_trial_name_map = {}

If you want to run the following training jobs asynchronously, you may need to increase your resource limit. Otherwise, you can run them sequentially.

In [125]:
preprocessing_trial_component = tracker.trial_component

### Be careful to ensure that the metric definition regex matches what you are logging in your training script

In [126]:
for i, num_hidden_channels in enumerate([35, 50]):
    
    trial_name = f"cnn-training-job-{num_hidden_channels}-hidden-channels{int(time.time())}"
    cnn_trial = Trial.create(
        trial_name=trial_name,
        experiment_name=mnist_experiment.experiment_name,
        sagemaker_boto_client=sm,
    )
    hidden_channel_trial_name_map[num_hidden_channels] = trial_name

    # associate the preprocessing trial component with the current trial
    cnn_trial.add_trial_component(preprocessing_trial_component)

    # all input configurations, parameters, and metrics specified in estimator definition are automatically tracked
    estimator = PyTorch(
        entry_point='mnist.py',
        source_dir='./scripts/',
        role=role,
        sagemaker_session=sagemaker.Session(sagemaker_client=sm),
        framework_version='1.6.0',
        py_version='py3',
        instance_count=1,
        instance_type='ml.c4.xlarge',
        hyperparameters={
            'epochs': 2,
            'backend': 'gloo',
            'hidden_units': num_hidden_channels,
            'dropout': 0.2,
        },
        metric_definitions=[
            {'Name':'train:loss', 'Regex':'Train Loss: (.*?):'},
            {'Name':'test:loss', 'Regex':'Test Average loss: (.*?),'},
            {'Name':'test:accuracy', 'Regex':'Test Accuracy: (.*?)%;'}
        ],
        enable_sagemaker_metrics=True,
    )

    cnn_training_job_name = f"cnn-training-job-{int(time.time())}"

    estimator.fit(
    inputs={'training': inputs},
    job_name=cnn_training_job_name,
    experiment_config={
        "TrialName": cnn_trial.trial_name,
        "TrialComponentDisplayName": "Training",
    },
    wait=True,
    )

    # give a couple of secs before dispatching the next training job
    time.sleep(2)

INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: cnn-training-job-1674927404


2023-01-28 17:36:44 Starting - Starting the training job...
2023-01-28 17:37:11 Starting - Preparing the instances for trainingProfilerReport-1674927404: InProgress
.........
2023-01-28 17:38:29 Downloading - Downloading input data...
2023-01-28 17:39:09 Training - Downloading the training image...
2023-01-28 17:39:30 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2023-01-28 17:39:37,429 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2023-01-28 17:39:37,431 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-01-28 17:39:37,442 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2023-01-28 17:39:37,444 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2023-0

Retreive trial components by using search experssions ("Training" is the TrialComponentDisplayName / "Preprocessing" could also be used here)

In [144]:
search_expression = {
    "Filters":[
        {
            "Name": "DisplayName",
            "Operator": "Equals",
            "Value": "Training",
        }
    ],
}

In [141]:
trial_component_analytics = ExperimentAnalytics(
    sagemaker_session=Session(session, sm),
    experiment_name=mnist_experiment.experiment_name,
    search_expression=search_expression,
    sort_by="metrics.test:accuracy.max",
    sort_order="Descending",
    metric_names=["test:accuracy"],
    parameter_names=["hidden_channels", "epochs", "dropout", "optimizer"]
)

This is a more succint definition than above

In [145]:
trial_component_analytics = ExperimentAnalytics(
    sagemaker_session=Session(session, sm), 
    experiment_name=mnist_experiment.experiment_name,
    search_expression=search_expression,
    metric_names=["test:accuracy"],
    parameter_names=["hidden_channels"],
    sort_by="metrics.test:accuracy.max",
    sort_order="Descending",
)
analytic_table = trial_component_analytics.dataframe()

In [146]:
analytic_table = trial_component_analytics.dataframe()

In [147]:
analytic_table

Unnamed: 0,TrialComponentName,DisplayName,SourceArn,test:accuracy - Min,test:accuracy - Max,test:accuracy - Avg,test:accuracy - StdDev,test:accuracy - Last,test:accuracy - Count,training - MediaType,training - Value,SageMaker.DebugHookOutput - MediaType,SageMaker.DebugHookOutput - Value,SageMaker.ModelArtifact - MediaType,SageMaker.ModelArtifact - Value,Trials,Experiments
0,cnn-training-job-1674927404-aws-training-job,Training,arn:aws:sagemaker:us-east-1:264639154954:train...,94.0,96.0,95.0,1.414214,96.0,2,,s3://sagemaker-experiments-us-east-1-264639154...,,s3://sagemaker-us-east-1-264639154954/,,s3://sagemaker-us-east-1-264639154954/cnn-trai...,[cnn-training-job-50-hidden-channels1674927403],[mnist-hand-written-digits-classification-1674...


In [2]:
# for col in analytic_table.columns: 
#     print(col) 

Let's look at an example of tracing the lineage of a model by accessing the data tracked by Sagemaker Experiments for cnn-training-job-2-hidden-channels trial

In [74]:
lineage_table = ExperimentAnalytics(
    sagemaker_session=Session(session, sm),
    search_expression={
        "Filters":[{
            "Name": "Parents.TrialName",
            "Operator": "Equals",
            "Value": hidden_channel_trial_name_map[50]
        }]
    },
    sort_by="CreationTime",
    sort_order="Ascending",
)

In [75]:
lineage_table

<sagemaker.ExperimentAnalytics for None>