Installing required libraries for stepfunctions

In [1]:
 # Install/upgrade the packages this notebook needs. Every command is run through
 # {sys.executable}, i.e. the Python interpreter backing the current kernel.
 import sys
 # Upgrade pip itself first.
 !{sys.executable} -m pip install --upgrade pip
 # -q: quiet output, -U: upgrade to the newest version that satisfies the requirement.
 !{sys.executable} -m pip install -qU awscli boto3 "sagemaker>=2.0.0"
 !{sys.executable} -m pip install -qU "stepfunctions>=2.0.0"
 # Print the installed package metadata so the versions in use are visible in the output.
 !{sys.executable} -m pip show sagemaker stepfunctions

Collecting pip
  Downloading pip-21.2.2-py3-none-any.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 48.5 MB/s eta 0:00:01
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 21.1.2
    Uninstalling pip-21.1.2:
      Successfully uninstalled pip-21.1.2
Successfully installed pip-21.2.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
aiobotocore 1.3.0 requires botocore<1.20.50,>=1.20.49, but you have botocore 1.21.15 which is incompatible.[0m
Name: sagemaker
Version: 2.52.0
Summary: Open source library for training and deploying models on Amazon SageMaker.
Home-page: https://github.com/aws/sagemaker-python-sdk/
Author: Amazon Web Services
Author-email: 
License: Apache License 2.0
Location: /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages
Requires: importlib-metadata, bot

# 1. Setting up notebook with parameters and libraries

In [38]:
# Import the Python libraries used by the notebook.
import stepfunctions
import logging

# NOTE(review): wildcard import. `Chain`, used when the workflow path is built, is not
# imported by name anywhere in this notebook, so it presumably comes from here.
from stepfunctions.steps import *
from stepfunctions.workflow import Workflow
from stepfunctions import steps
from stepfunctions.inputs import ExecutionInput
from sagemaker.processing import Processor,ProcessingInput, ProcessingOutput
import calendar
import time
import sagemaker
from sagemaker.inputs import TrainingInput
import boto3
from sagemaker.network import NetworkConfig

# Enable the Step Functions SDK stream logger at INFO level.
stepfunctions.set_stream_logger(level=logging.INFO)

### Defining parameters

These values need to be changed when the notebook is moved to a different environment.

In [39]:
# --- IAM roles ---------------------------------------------------------------
# ARN of the IAM role used by the Step Functions workflow.
v_workflow_execution_role = "arn:aws:iam::525102048888:role/poc-sagemaker-step-functi-MachineLearningWorkflowE-1XFI2UPRXFTXE"
# ARN of the IAM role used by the preprocessing container.
v_preprocessing_iam_role = "arn:aws:iam::525102048888:role/service-role/AmazonSageMaker-ExecutionRole-20191105T125227"

# --- Compute -----------------------------------------------------------------
# Instance type for the preprocessing container; adjust it to the workload.
v_preprocessing_instance_type = "ml.m5.xlarge"

# --- Data locations ----------------------------------------------------------
# S3 bucket for input and output data.
v_s3_input_bucket = "wipcoe-ml-s3-data-bucket"
# Key (within the bucket above) of the stored dataset.
v_prefix_for_input_data = "data/output/lr/baselineinp/train_baseline.csv"
# Bucket that receives the baseline job output.
config_bucket = "wipcoe-datalake-init-s3-mlops-config"

# --- Region and networking ---------------------------------------------------
# AWS region.
v_region = "us-east-1"
# Security groups and subnets passed to the processing job's network configuration.
sec_groups = ["sg-044e0e7ce4f5721c0"]
subnets = [
    "subnet-0cf0e3f46326aa259",
    "subnet-0156b7f5500cf0b78",
    "subnet-032420199163cff9b",
]

## 2. Defining preprocessing jobs

In [73]:
# Environment variables passed to the baseline container.
environment = dict(
    # JSON text describing the dataset format (CSV with a header row).
    dataset_format='{"csv":{"header": true,"output_columns_position": "START"}}',
    # Container-local directory holding the baseline dataset.
    dataset_source="/opt/ml/processing/input/baseline_dataset_input",
    # Container-local directory for the job output.
    output_path="/opt/ml/processing/output",
    publish_cloudwatch_metrics="Disabled",
)

In [74]:
# Create the baseline processor, which runs the SageMaker Model Monitor analyzer image.
# The analyzer image lives in a region-specific ECR registry, so its URI is looked up
# for v_region instead of being hard-coded. For us-east-1 this resolves to the URI used
# previously: 156813124566.dkr.ecr.us-east-1.amazonaws.com/sagemaker-model-monitor-analyzer
baseline_image_uri = sagemaker.image_uris.retrieve(framework="model-monitor", region=v_region)

baseline_processor = Processor(
    image_uri=baseline_image_uri,
    role=v_preprocessing_iam_role,
    instance_count=1,
    instance_type=v_preprocessing_instance_type,
    # Attach the job to the configured security groups and subnets.
    network_config=NetworkConfig(security_group_ids=sec_groups, subnets=subnets),
    # Analyzer settings (dataset format, input/output paths, CloudWatch publishing).
    env=environment,
)

In [75]:
# S3 URI of the dataset used as the baseline input.
input_data = "s3://{}/{}".format(v_s3_input_bucket, v_prefix_for_input_data)

# Single processing input: the dataset is placed in the container directory that
# environment["dataset_source"] points at.
baseline_input = ProcessingInput(
    input_name="input_data",
    source=input_data,
    destination="/opt/ml/processing/input/baseline_dataset_input",
)
inputs = [baseline_input]

# Single processing output: the container's output directory is uploaded under the
# customonitor/ prefix of the config bucket.
baseline_output = ProcessingOutput(
    output_name="tr_data",
    source="/opt/ml/processing/output",
    destination="s3://{}/{}".format(config_bucket, "customonitor/"),
)
outputs = [baseline_output]

# Job name suffixed with the current Unix timestamp (whole seconds, derived from UTC).
gmt = time.gmtime()
ts = calendar.timegm(gmt)
baseline_name = "baseline-{}".format(ts)

In [76]:
# Show the output prefix and the input dataset URI, one per line.
print("s3://{}/{}".format(config_bucket, "customonitor"), input_data, sep="\n")

s3://wipcoe-datalake-init-s3-mlops-config/customonitor
s3://wipcoe-ml-s3-data-bucket/data/output/lr/baselineinp/train_baseline.csv


In [78]:
# Step Functions state named "Baseline" that runs the processing job with the
# processor, inputs and outputs prepared above.
# NOTE(review): baseline_name is computed once, when its cell runs - confirm whether
# repeated executions of the same workflow definition need a fresh job name.
baseline_preprocessing_step = steps.ProcessingStep(
    "Baseline",
    processor=baseline_processor,
    job_name=baseline_name,
    inputs=inputs,
    outputs=outputs,
    experiment_config=None,
    wait_for_completion=True,
)

## 3. Step Function

In [79]:
# Workflow path: a chain whose only state is the baseline preprocessing step.
basic_path = Chain(steps=[baseline_preprocessing_step])

In [80]:
# Build the workflow object from the chained path and the workflow execution role.
basic_workflow = Workflow(
    definition=basic_path,
    name="ds-mlops-dev-baseline-jobV2",
    role=v_workflow_execution_role,
)

# Display the workflow graph as the cell result.
basic_workflow.render_graph()

## 3.1 Create the workflow on AWS Step Functions

Create the workflow in AWS Step Functions with [create](https://aws-step-functions-data-science-sdk.readthedocs.io/en/latest/workflow.html#stepfunctions.workflow.Workflow.create).

In [81]:
# Create the state machine on AWS Step Functions and display the returned ARN.
# The recorded output shows that when a workflow with this name already exists, the SDK
# logs an error pointing to Workflow.update() and the ARN is still returned.
workflow_arn = basic_workflow.create()
workflow_arn

[31m[ERROR] A workflow with the same name already exists on AWS Step Functions. To update a workflow, use Workflow.update().[0m


'arn:aws:states:us-east-1:525102048888:stateMachine:ds-mlops-dev-baseline-jobV2'

In [82]:
# Push the current definition and role to the existing state machine and display its ARN.
updated_workflow_arn = basic_workflow.update(
    definition=basic_workflow.definition,
    role=basic_workflow.role,
)
updated_workflow_arn

[32m[INFO] Workflow updated successfully on AWS Step Functions. All execute() calls will use the updated definition and role within a few seconds. [0m


'arn:aws:states:us-east-1:525102048888:stateMachine:ds-mlops-dev-baseline-jobV2'

In [83]:
# Start an execution of the workflow; no runtime inputs are passed.
execution_inputs = {}
basic_workflow_execution = basic_workflow.execute(inputs=execution_inputs)

[32m[INFO] Workflow execution started successfully on AWS Step Functions.[0m


## 3.2 Review the execution progress

Render workflow progress with [render_progress](https://aws-step-functions-data-science-sdk.readthedocs.io/en/latest/workflow.html#stepfunctions.workflow.Execution.render_progress).

This generates a snapshot of the current state of your workflow as it executes. This is a static image. Run the cell again to check progress. 

In [84]:
# Display a snapshot of the execution's state at the time this cell runs.
progress_snapshot = basic_workflow_execution.render_progress()
progress_snapshot

## 4. Downloading the generated report to the notebook

In [53]:
# Download the constraints file for evaluation (copied into the current directory).
# NOTE(review): this reads from the monitoring/ prefix, whereas the baseline step in this
# notebook writes its output under customonitor/ - confirm which prefix is intended.
!aws s3 cp s3://$config_bucket/monitoring/constraints.json .

download: s3://wipro-datalake-init-s3-mlops-config/monitoring/constraints.json to ./constraints.json


In [54]:
# Download the statistics file for evaluation (copied into the current directory).
# NOTE(review): this reads from the monitoring/ prefix, whereas the baseline step in this
# notebook writes its output under customonitor/ - confirm which prefix is intended.
!aws s3 cp s3://$config_bucket/monitoring/statistics.json .

download: s3://wipro-datalake-init-s3-mlops-config/monitoring/statistics.json to ./statistics.json


In [37]:
from sagemaker.model_monitor import DefaultModelMonitor
from sagemaker.model_monitor.dataset_format import DatasetFormat

# Baseline dataset URI, built from the configuration cell so that retargeting the
# notebook only requires editing the parameters in one place. With the current
# configuration this is
# s3://wipcoe-ml-s3-data-bucket/data/output/lr/baselineinp/train_baseline.csv
input_s3_path = "s3://{}/{}".format(v_s3_input_bucket, v_prefix_for_input_data)
# S3 location that receives the baseline suggestion job output.
output_s3_path = 's3://wipcoe-ml-s3-config-bucket/monitoring/'

my_default_monitor = DefaultModelMonitor(
    role=v_preprocessing_iam_role,
    instance_count=1,
    # Same instance type as the preprocessing container (currently ml.m5.xlarge).
    instance_type=v_preprocessing_instance_type,
    volume_size_in_gb=20,
    max_runtime_in_seconds=3600,
)

# Run the baseline suggestion job on the CSV dataset (with header row).
# wait=True keeps the cell running until the job finishes; the job object is the cell result.
my_default_monitor.suggest_baseline(
    baseline_dataset=input_s3_path,
    dataset_format=DatasetFormat.csv(header=True),
    output_s3_uri=output_s3_path,
    wait=True,
)


Job Name:  baseline-suggestion-job-2021-08-06-11-30-27-278
Inputs:  [{'InputName': 'baseline_dataset_input', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://wipcoe-ml-s3-data-bucket/data/output/lr/baselineinp/train_baseline.csv', 'LocalPath': '/opt/ml/processing/input/baseline_dataset_input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'monitoring_output', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://wipcoe-ml-s3-config-bucket/monitoring/', 'LocalPath': '/opt/ml/processing/output', 'S3UploadMode': 'EndOfJob'}}]
.............................[34m2021-08-06 11:35:02,401 - __main__ - INFO - All params:{'ProcessingJobArn': 'arn:aws:sagemaker:us-east-1:525102048888:processing-job/baseline-suggestion-job-2021-08-06-11-30-27-278', 'ProcessingJobName': 'baseline-suggestion-job-2021-08-06-11-30-27-278', 'Environment': {'dataset_format': '{"csv": {"header": true, "output_columns_positi

[34m2021-08-06 11:35:24,106 - DefaultDataAnalyzer - INFO - Running command: bin/spark-submit --master yarn --deploy-mode client --conf spark.hadoop.fs.s3a.aws.credentials.provider=org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider --conf spark.serializer=org.apache.spark.serializer.KryoSerializer /opt/amazon/sagemaker-data-analyzer-1.0-jar-with-dependencies.jar --analytics_input /tmp/spark_job_config.json[0m
[34m2021-08-06 11:35:25 WARN  NativeCodeLoader:60 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable[0m
[34m2021-08-06 11:35:25 INFO  Main:28 - Start analyzing with args: --analytics_input /tmp/spark_job_config.json[0m
[34m2021-08-06 11:35:25 INFO  Main:31 - Analytics input path: DataAnalyzerParams(/tmp/spark_job_config.json,yarn)[0m
[34m2021-08-06 11:35:25 INFO  FileUtil:66 - Read file from path /tmp/spark_job_config.json.[0m
[34m2021-08-06 11:35:25 INFO  SparkContext:54 - Running Spark version 2.3.1[0m
[34m2021-

[34m2021-08-06 11:35:56 INFO  YarnClientSchedulerBackend:54 - SchedulerBackend is ready for scheduling beginning after waiting maxRegisteredResourcesWaitingTime: 30000(ms)[0m
[34m2021-08-06 11:35:56 WARN  SparkContext:66 - Spark is not running in local mode, therefore the checkpoint directory must not be on the local filesystem. Directory '/tmp' appears to be on the local filesystem.[0m
[34m2021-08-06 11:35:56 INFO  DatasetReader:91 - Files to process:List(file:///opt/ml/processing/input/baseline_dataset_input/train_baseline.csv)[0m
[34m2021-08-06 11:35:56 INFO  SharedState:54 - Setting hive.metastore.warehouse.dir ('null') to the value of spark.sql.warehouse.dir ('file:/usr/spark-2.3.1/spark-warehouse').[0m
[34m2021-08-06 11:35:56 INFO  SharedState:54 - Warehouse path is 'file:/usr/spark-2.3.1/spark-warehouse'.[0m
[34m2021-08-06 11:35:56 INFO  StateStoreCoordinatorRef:54 - Registered StateStoreCoordinator endpoint[0m
[34m2021-08-06 11:35:57 INFO  FileSourceStrategy:54 - P

[34m2021-08-06 11:36:05 INFO  BlockManagerInfo:54 - Added rdd_11_1 in memory on algo-1:34613 (size: 4.9 MB, free: 5.8 GB)[0m
[34m2021-08-06 11:36:05 INFO  TaskSetManager:54 - Finished task 0.0 in stage 2.0 (TID 2) in 2901 ms on algo-1 (executor 1) (2/3)[0m
[34m2021-08-06 11:36:06 INFO  TaskSetManager:54 - Finished task 1.0 in stage 2.0 (TID 3) in 3369 ms on algo-1 (executor 1) (3/3)[0m
[34m2021-08-06 11:36:06 INFO  YarnScheduler:54 - Removed TaskSet 2.0, whose tasks have all completed, from pool [0m
[34m2021-08-06 11:36:06 INFO  DAGScheduler:54 - ShuffleMapStage 2 (collect at AnalysisRunner.scala:313) finished in 3.390 s[0m
[34m2021-08-06 11:36:06 INFO  DAGScheduler:54 - looking for newly runnable stages[0m
[34m2021-08-06 11:36:06 INFO  DAGScheduler:54 - running: Set()[0m
[34m2021-08-06 11:36:06 INFO  DAGScheduler:54 - waiting: Set(ResultStage 3)[0m
[34m2021-08-06 11:36:06 INFO  DAGScheduler:54 - failed: Set()[0m
[34m2021-08-06 11:36:06 INFO  DAGScheduler:54 - Submit

[34m2021-08-06 11:36:16 INFO  TaskSetManager:54 - Finished task 2.0 in stage 11.0 (TID 25) in 1368 ms on algo-1 (executor 1) (1/3)[0m
[34m2021-08-06 11:36:17 INFO  TaskSetManager:54 - Finished task 1.0 in stage 11.0 (TID 24) in 2601 ms on algo-1 (executor 1) (2/3)[0m
[34m2021-08-06 11:36:17 INFO  ContextCleaner:54 - Cleaned accumulator 321[0m
[34m2021-08-06 11:36:17 INFO  ContextCleaner:54 - Cleaned accumulator 334[0m
[34m2021-08-06 11:36:17 INFO  ContextCleaner:54 - Cleaned accumulator 250[0m
[34m2021-08-06 11:36:17 INFO  ContextCleaner:54 - Cleaned accumulator 236[0m
[34m2021-08-06 11:36:17 INFO  ContextCleaner:54 - Cleaned accumulator 134[0m
[34m2021-08-06 11:36:17 INFO  ContextCleaner:54 - Cleaned accumulator 197[0m
[34m2021-08-06 11:36:17 INFO  ContextCleaner:54 - Cleaned accumulator 233[0m
[34m2021-08-06 11:36:17 INFO  ContextCleaner:54 - Cleaned accumulator 163[0m
[34m2021-08-06 11:36:17 INFO  ContextCleaner:54 - Cleaned accumulator 149[0m
[34m2021-08-06 1




<sagemaker.processing.ProcessingJob at 0x7fbc63b36d30>

We can perform analysis on this file and put it on GitHub.

**Note:**
For the monitoring-schedule Lambda function to refer to the modified location, the baseline statistics location must be changed from the `monitoring/` prefix to the `custom_monitoring/` prefix.
