# Now let's monitor the training/deploying process

In [None]:
!pip install tqdm

In [None]:
import json
import time

import boto3
import ipywidgets as widgets
import pandas as pd
from IPython.display import display

## Helper functions

In [None]:
def get_actions():
    """Flatten the current pipeline state into one record per action.

    Queries CodePipeline for ``pipeline_name`` and walks every stage in order.
    The execution id of the first stage that has a ``latestExecution`` is taken
    as the reference; any later stage whose latest execution has a different id
    is labelled ``'Old'``.

    Returns:
        list[dict]: records with the keys ``stageName``, ``stageStatus``,
        ``actionName`` and ``actionStatus``. ``actionStatus`` is ``'Old'`` when
        the action has no latest execution or its stage is ``'Old'``.
    """
    pipeline_state = codepipeline.get_pipeline_state( name=pipeline_name )
    reference_execution_id = None
    records = []
    for stage in pipeline_state['stageStates']:
        stage_name = stage['stageName']
        stage_status = None
        stage_execution = stage.get('latestExecution')
        if stage_execution is not None:
            stage_status = stage_execution['status']
            if reference_execution_id is None:
                reference_execution_id = stage_execution['pipelineExecutionId']
            elif stage_execution['pipelineExecutionId'] != reference_execution_id:
                # This stage last ran as part of a different pipeline execution.
                stage_status = 'Old'
        stage_is_current = stage_status != 'Old'
        for action in stage['actionStates']:
            action_name = action['actionName']
            if action.get('latestExecution') is not None and stage_is_current:
                action_status = action['latestExecution']['status']
            else:
                action_status = 'Old'
            records.append({
                'stageName': stage_name,
                'stageStatus': stage_status,
                'actionName': action_name,
                'actionStatus': action_status,
            })
    return records

In [None]:
def get_approval_token():
    """Return the token of the pending ``ApproveDeploy`` action in ``DeployDev``.

    Returns:
        The ``token`` of the action's latest execution when that execution is
        ``'InProgress'``; ``None`` when the action has never executed or is not
        currently in progress.
    """
    pipeline_state = codepipeline.get_pipeline_state( name=pipeline_name )
    pending_token = None
    for stage in pipeline_state['stageStates']:
        if stage['stageName'] != 'DeployDev':
            continue
        for action in stage['actionStates']:
            if action['actionName'] != 'ApproveDeploy':
                continue
            execution = action.get('latestExecution')
            if execution is None:
                # The approval action has no execution yet, so nothing is pending.
                return None
            if execution['status'] == 'InProgress':
                pending_token = execution['token']
    return pending_token

In [None]:
def approval(token, result):
    """Submit ``result`` for the ``ApproveDeploy`` action of the ``DeployDev`` stage.

    Args:
        token: approval token identifying the pending action; when ``None`` the
            call is a no-op.
        result: dict passed through to ``put_approval_result`` as ``result``.
    """
    if token is not None:
        codepipeline.put_approval_result(
            pipelineName=pipeline_name,
            stageName='DeployDev',
            actionName='ApproveDeploy',
            result=result,
            token=token,
        )

In [None]:
def approve(b):
    """Click handler: approve the pending deployment, hide the buttons, resume monitoring.

    Args:
        b: the widget button that fired the event (unused).
    """
    verdict = dict(
        summary='This is a great model! Put into production.',
        status='Approved',
    )
    pending_token = get_approval_token()
    approval(pending_token, verdict)
    button_box.close()
    start_monitoring()

In [None]:
def reject(b):
    """Click handler: reject the pending deployment, hide the buttons, resume monitoring.

    Args:
        b: the widget button that fired the event (unused).
    """
    verdict = dict(
        summary='This is a rubbish model. Discard it',
        status='Rejected',
    )
    pending_token = get_approval_token()
    approval(pending_token, verdict)
    button_box.close()
    start_monitoring()

In [None]:
def start_monitoring():
    """Poll the pipeline every two seconds and mirror its progress in the widgets.

    Reads the module-level ``label``, ``bar``, ``max_actions`` and ``button_box``.
    The loop stops when:

    * an action has failed (the bar turns red and the label names the action),
    * a manual approval is pending (the Approve/Reject buttons are displayed;
      their click handlers call this function again), or
    * every action has succeeded.

    If the pipeline reports no actions at all, the label says so and the
    function returns instead of failing on unbound loop variables.
    """
    while True:
        actions = get_actions()
        if not actions:
            label.value = 'No actions found for pipeline [{}]'.format(pipeline_name)
            return

        steps_ok = 0
        awaiting_approval = False
        for k, action in enumerate(actions):
            status = action['actionStatus']
            if status == 'Failed':
                bar.bar_style = 'danger'
                label.value = 'Ops! Something went wrong Stage[{}] Action[{}]'.format(
                    action['stageName'], action['actionName'])
                return
            if status == 'InProgress':
                # Only a pending manual approval needs user input; any other
                # in-progress action is simply polled again.
                awaiting_approval = get_approval_token() is not None
                break
            if status == 'Old':
                # Actions from here on belong to a previous execution.
                break
            if status == 'Succeeded':
                steps_ok += 1

        # k/action refer to the action the scan stopped at (or the last one).
        label.value = "Actions {}/{} - Current: Stage[{}] Action[{}]".format(
            k + 1, max_actions, action['stageName'], action['actionName'])
        bar.value = steps_ok

        if awaiting_approval:
            display(button_box)
            return
        if steps_ok == max_actions:
            return
        time.sleep(2)

## Job monitoring

In [None]:
import os

# CodePipeline client shared by the helper functions defined above.
codepipeline = boto3.client('codepipeline')
# Pipeline and model names come from the environment; a missing variable raises KeyError.
pipeline_name = os.environ['PIPELINE_NAME']
model_name = os.environ['MODEL_NAME']

print('pipeline: {}'.format(pipeline_name))
print('model name: {}'.format(model_name))

In [None]:
# Approve/Reject buttons; start_monitoring() displays this box when a manual approval is pending.
approve_btn = widgets.Button(description="Approve", button_style='success', icon='check')
reject_btn = widgets.Button(description="Reject", button_style='danger', icon='close')
approve_btn.on_click(approve)
reject_btn.on_click(reject)
button_box = widgets.HBox([approve_btn, reject_btn])
                
# The progress bar has one step per action currently reported by the pipeline.
max_actions = len(get_actions())
label = widgets.Label(value="Loading...")
bar = widgets.IntProgress( value=0, min=0, max=max_actions, step=1, bar_style='info' )
info_box = widgets.VBox([label, bar])

# Show the status widgets, then poll until the pipeline finishes, fails or needs approval.
display(info_box)
start_monitoring()

## Now, if everything went fine, we can test our models

In [None]:
# Read the execution id from the LAST stage of the pipeline and derive the
# names of the production endpoint, baseline processing job and monitoring schedule from it.
# NOTE(review): raises KeyError if the last stage has no 'latestExecution' yet — confirm
# the pipeline has run to its final stage before executing this cell.
response = codepipeline.get_pipeline_state( name=pipeline_name )
executionId = response['stageStates'][-1]['latestExecution']['pipelineExecutionId']

endpoint_name='mlops-{}-prd-{}'.format(model_name, executionId)
processing_job_name='mlops-{}-pbl-{}'.format(model_name, executionId)
schedule_name='mlops-{}-pms-{}'.format(model_name, executionId)

print('execution id: {}'.format(executionId))
print('endpoint name: {}'.format(endpoint_name))

Call the endpoint with some expected and some unexpected data.

In [None]:
sm_runtime = boto3.client('sagemaker-runtime')

def test_endpoint(endpoint_name, payload, content_type='text/csv', custom_attributes=''):
    """Invoke a SageMaker endpoint and return the raw response body.

    Args:
        endpoint_name: name of the endpoint to invoke.
        payload: request body sent as ``Body``.
        content_type: value sent as ``ContentType`` (default ``'text/csv'``).
        custom_attributes: value sent as ``CustomAttributes`` (default empty).

    Returns:
        Whatever ``read()`` on the response ``Body`` returns.
    """
    request = {
        'EndpointName': endpoint_name,
        'Body': payload,
        'ContentType': content_type,
        'CustomAttributes': custom_attributes,
    }
    response = sm_runtime.invoke_endpoint(**request)
    body_stream = response['Body']
    return body_stream.read()

In [None]:
# Smoke test: send a single CSV payload (a 'text' header row plus one line) to the endpoint.
test_endpoint(endpoint_name, 'text\nthis is a test'.encode('utf-8'))

### Load Test Endpoint

Send a bunch of examples to the endpoint 

In [None]:
# Sample payloads sent repeatedly to the endpoint
monitor_sample = [
# Typical data (disabled)
#     'cool asian food pty melbourne au', # eating out
#     'woolworths 3188 brunswick au', # groceries
#     'airbnb * blaa surry hills au', # travel
#     'lido cinemas hawthorn au', # entertainment
#     'northcote indoor spo thornbury au', # health
# Data which is out of bounds of the usual sample
    'one',
    '1',
    'fdkslfjkdlsjfkdsfkldjsklfmdskfjkdlsjfkldsjfkldsjfkldsjklfjsdkjfklds',
    'this is a very long sentance that should skew the character and word count',
]

# Send 100 rounds of individual requests, one request per sample.
# The decoded response is not kept; each call is made only to generate traffic.
from tqdm import tqdm
for i in tqdm(range(100)):
    for sample in monitor_sample:
        payload = 'text\n{}'.format(sample).encode('utf-8')
        test_endpoint(endpoint_name, payload).decode('utf-8')

### Production deployment

List the progress of the production CloudFormation stack.

In [None]:
import boto3

# The region is used later to build console links; the CloudFormation client
# inspects the production deployment stack, which is named after the pipeline.
region = boto3.Session().region_name
cfn = boto3.client('cloudformation')

stack_name = '{}-deploy-prd'.format(pipeline_name)
print('stack name: {}'.format(stack_name))

In [None]:
# Print the status of the first stack returned for stack_name (nothing is printed if none is returned).
response = cfn.describe_stacks(StackName=stack_name)
if response['Stacks']:
    stack = response['Stacks'][0]
    print('stack status: {}'.format(stack['StackStatus']))

List the latest events and how long ago they occurred.

In [None]:
from datetime import datetime
from dateutil.tz import tzlocal

def get_event_dataframe(events):
    """Tabulate CloudFormation stack events with their age.

    Args:
        events: list of event dicts (the ``StackEvents`` list of a
            ``describe_stack_events`` response). Keys missing from every
            event, or an empty list, are tolerated.

    Returns:
        pandas.DataFrame with the columns ``LogicalResourceId``,
        ``ResourceStatus``, ``ResourceStatusReason`` (missing values shown as
        ``''``) and ``TimeAgo`` (time elapsed since the event's ``Timestamp``).
    """
    stack_cols = ['LogicalResourceId', 'ResourceStatus', 'ResourceStatusReason', 'Timestamp']
    # reindex (instead of [] selection) creates absent columns as NaN rather than
    # raising KeyError, e.g. when no event carries a ResourceStatusReason.
    stack_event_df = pd.DataFrame(events).reindex(columns=stack_cols)
    # Normalise to UTC so the subtraction is valid whatever tz the timestamps carry.
    event_times = pd.to_datetime(stack_event_df['Timestamp'], utc=True)
    stack_event_df = stack_event_df.drop('Timestamp', axis=1).fillna('')
    stack_event_df['TimeAgo'] = pd.Timestamp.now(tz='UTC') - event_times
    return stack_event_df

# Fetch the stack events and display the first ten rows of the resulting table
response = cfn.describe_stack_events(StackName=stack_name)
get_event_dataframe(response['StackEvents']).head(10)

In [None]:
def make_clickable(val):
    """Return an HTML anchor labelled ``link`` that opens ``val`` in a new tab."""
    anchor_template = '<a href="{}" rel="noopener noreferrer" target="_blank">link</a>'
    return anchor_template.format(val)

def get_resource_dataframe(resources):
    """Build a styled table of deployed stack resources with console links.

    Only resources whose type has a console URL template below and whose
    status is ``CREATE_COMPLETE`` or ``UPDATE_COMPLETE`` are listed. The URL
    is built from the module-level ``region`` and the last ``/``-separated
    segment of the resource's ``PhysicalResourceId``.

    Args:
        resources: list of resource dicts (the ``StackResources`` list of a
            ``describe_stack_resources`` response).

    Returns:
        A pandas ``Styler`` over the columns ``name``, ``type``, ``status``
        and ``url``, with ``url`` rendered as a clickable link. The table is
        empty (but still has all four columns) when nothing matches.
    """
    resource_map = {
        'AWS::Lambda::Function': 'https://{0}.console.aws.amazon.com/lambda/home?region={0}#functions/{1}',
        'AWS::CodeDeploy::Application': 'https://{0}.console.aws.amazon.com/codesuite/codedeploy/applications/{1}?region={0}',
        'AWS::ApiGateway::RestApi': 'https://{0}.console.aws.amazon.com/apigateway/home?region={0}#/apis/{1}/resources',
        'AWS::SageMaker::Endpoint': 'https://{0}.console.aws.amazon.com/sagemaker/home?region={0}#/endpoints/{1}'
    }
    ready_statuses = ('CREATE_COMPLETE', 'UPDATE_COMPLETE')
    cols = ['name', 'type', 'status', 'url']
    rows = [
        {
            'name': r['LogicalResourceId'],
            'url': resource_map[r['ResourceType']].format(region, r['PhysicalResourceId'].split('/')[-1]),
            'type': r['ResourceType'],
            'status': r['ResourceStatus']
        } for r in resources
        if r['ResourceType'] in resource_map and r['ResourceStatus'] in ready_statuses
    ]
    # Passing columns= (instead of selecting with [cols]) keeps the frame
    # well-formed when no resource matches, where [cols] would raise KeyError.
    df = pd.DataFrame(rows, columns=cols)
    return df.style.format({'url': make_clickable})

# Fetch the stack's resources and display the ones that have console links
response = cfn.describe_stack_resources(StackName=stack_name)
get_resource_dataframe(response['StackResources'])

## Test Lambda API

Send a message to the API endpoint, and check the endpoint name included in the response

In [None]:
# TODO: Add loop to call API endpoint

## Load baseline

Load baseline processing job

In [None]:
import boto3
import pandas as pd
import json

import sagemaker
from sagemaker.model_monitor import BaseliningJob, DefaultModelMonitor, MonitoringExecution
from sagemaker.s3 import S3Downloader

# Low-level clients for S3 and SageMaker
s3 = boto3.client('s3')
sm = boto3.client('sagemaker')

# SageMaker SDK session passed to the Model Monitor helpers below
sagemaker_session = sagemaker.Session()

In [None]:
# Attach to the baseline processing job created by the pipeline and stop early
# unless it has completed, since the cells below read its outputs.
baseline_job = BaseliningJob.from_processing_name(sagemaker_session, processing_job_name)
status = baseline_job.describe()['ProcessingJobStatus']
if status != 'Completed':
    raise Exception('Processing job not complete, status: {}'.format(status))

# Destination of the job's first output
baseline_results_uri = baseline_job.outputs[0].destination
print('baseline results uri: {}'.format(baseline_results_uri))

In [None]:
# List the objects under the baseline job's output location.
!aws s3 ls $baseline_results_uri/

### Explore the generated constraints and statistics

In [None]:
# Flatten the per-feature baseline statistics into a table.
# pandas exposes json_normalize at the top level; the pandas.io.json alias is deprecated.
schema_df = pd.json_normalize(baseline_job.baseline_statistics().body_dict["features"])
schema_df

In [None]:
# Flatten the per-feature suggested constraints into a table.
# pandas exposes json_normalize at the top level; the pandas.io.json alias is deprecated.
constraints_df = pd.json_normalize(baseline_job.suggested_constraints().body_dict["features"])
constraints_df

## View Data Capture

Get the list of data capture files from the endpoint.

In [None]:
# Data capture logs are looked up in the session's default bucket under
# <model_name>/datacapture/<endpoint_name>. Reuse the existing SageMaker
# session rather than constructing another one.
bucket = sagemaker_session.default_bucket()
data_capture_logs_uri = 's3://{}/{}/datacapture/{}'.format(bucket, model_name, endpoint_name)

print('Data Capture logs: {}'.format(data_capture_logs_uri))

In [None]:
# List every object under the data capture prefix
capture_files = S3Downloader.list(data_capture_logs_uri)
print('Found {} files'.format(len(capture_files)))

if capture_files:
    # Parse the first line of the last listed file as JSON and pretty-print it.
    # NOTE(review): assumes the listing order is chronological, so the last entry is the most recent — confirm.
    event = json.loads(S3Downloader.read_file(capture_files[-1]).split('\n')[0])
    print('\nLast file:\n{}'.format(json.dumps(event, indent=2)))

## View Monitoring Schedule

The functions for plotting and rendering distribution statistics or constraint violations are implemented in a `utils` file so let's grab that.

In [None]:
# Download the visualization helpers as utils.py and import them as `mu`.
# NOTE(review): the file is fetched from the repository's master branch at run time and then
# imported, so the helper code is not pinned to a known revision — consider pinning to a commit.
!wget -O utils.py --quiet https://raw.githubusercontent.com/awslabs/amazon-sagemaker-examples/master/sagemaker_model_monitor/visualization/utils.py
import utils as mu

Load the last successful execution of the monitoring schedule.

In [None]:
# Find the first listed execution of the monitoring schedule that completed,
# with or without constraint violations, and remember its processing job.
response = sm.list_monitoring_executions(MonitoringScheduleName=schedule_name)

expected_status = ['Completed', 'CompletedWithViolations']
status = None
processing_job_arn = None
for mon in response['MonitoringExecutionSummaries']:
    # Check the status first and read the ARN with .get(): ProcessingJobArn is an
    # optional field of the summary, so indexing it on every entry can raise KeyError.
    if mon['MonitoringExecutionStatus'] in expected_status and mon.get('ProcessingJobArn'):
        status = mon['MonitoringExecutionStatus']
        processing_job_arn = mon['ProcessingJobArn']
        break

if processing_job_arn is None:
    raise Exception('No completed schedules')

print('Schedule status: {}'.format(status))

Load the monitoring execution

In [None]:
# Attach to the processing job behind the selected monitoring execution,
# reusing the existing SageMaker session rather than constructing another one.
execution = MonitoringExecution.from_processing_arn(sagemaker_session=sagemaker_session,
                                                    processing_job_arn=processing_job_arn)
# Index the job's processing inputs by their InputName
exec_inputs = {inp['InputName']: inp for inp in execution.describe()['ProcessingInputs']}
exec_results_uri = execution.output.destination

print('Monitoring Execution results: {}'.format(exec_results_uri))

List the constraints, statistics and violations if they exist. 

In [None]:
# List the objects under the monitoring execution's output location.
!aws s3 ls $exec_results_uri/

In [None]:
# Load the baseline statistics, the statistics of the monitoring execution,
# and the list of constraint violations the execution reported.
# NOTE(review): the lookup above also accepts status 'Completed' (no violations); confirm that
# constraint_violations() still returns an object with body_dict in that case.
baseline_statistics = baseline_job.baseline_statistics().body_dict
execution_statistics = execution.statistics().body_dict
violations = execution.constraint_violations().body_dict['violations']

In [None]:
# Render the violations alongside the baseline and latest statistics using the downloaded helper.
mu.show_violation_df(baseline_statistics=baseline_statistics, 
                     latest_statistics=execution_statistics, 
                     violations=violations)

## Distributions

This section visualizes the distribution and renders the distribution statistics for all features

In [None]:
# Extract the features from the execution statistics and from the baseline statistics.
features = mu.get_features(execution_statistics)
feature_baselines = mu.get_features(baseline_statistics)

In [None]:
# Plot the distributions from the monitoring execution only.
mu.show_distributions(features)

### Execution Stats vs Baseline

In [None]:
# Plot the execution distributions together with the baseline ones for comparison.
mu.show_distributions(features, feature_baselines)