#### Prerequisites 

In [2]:
%%capture

!pip install sagemaker==2.121.2
!pip install boto3==1.26.27

### Imports

In [3]:
from sagemaker.dataset_definition.inputs import AthenaDatasetDefinition, DatasetDefinition
from sagemaker.workflow.parameters import ParameterInteger, ParameterString
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.workflow.automl_step import AutoMLStep
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.functions import Join
from sagemaker.processing import Processor
from sagemaker import AutoML, AutoMLInput
import sagemaker
import logging
import boto3
import json
import time

In [4]:
# Surface the SageMaker SDK's log records in the notebook output at DEBUG verbosity.
logger = logging.getLogger('sagemaker')
logger.setLevel(logging.DEBUG)

# Attach the stream handler only once. Without this guard, every re-run of the
# cell stacks another handler and each log record is printed multiple times.
NOTEBOOK_LOG_HANDLER = 'notebook-stream'
if not any(handler.get_name() == NOTEBOOK_LOG_HANDLER for handler in logger.handlers):
    stream_handler = logging.StreamHandler()
    stream_handler.set_name(NOTEBOOK_LOG_HANDLER)
    logger.addHandler(stream_handler)

In [5]:
# Record the library versions in use so a run can be reproduced later.
for library_label, library_module in (('SageMaker', sagemaker), ('Boto3', boto3)):
    logger.info(f'Using {library_label}: {library_module.__version__}')

Using SageMaker: 2.121.2
Using Boto3: 1.26.27


### Essentials 

In [6]:
# Shared AWS handles: a low-level S3 client (used for the flow upload) and the
# SageMaker session (used for the default bucket, the processor and the pipeline).
s3_client = boto3.client('s3')
sagemaker_session = sagemaker.Session()

In [7]:
# --- Identity and storage location -------------------------------------------
ROLE = sagemaker.get_execution_role()
BUCKET = sagemaker_session.default_bucket()  # Can also be a custom S3 bucket
PREFIX = '01-dw-datasets'

# --- Processing input: the raw dataset under s3://BUCKET/PREFIX ----------------
PROCESSING_INPUT_NAME = 'loans.csv'
PROCESSING_INPUT_PATH = f's3://{BUCKET}/{PREFIX}'

# --- Processing output --------------------------------------------------------
# NODE_ID presumably identifies the output node inside the flow file -- confirm
# against loans.flow if the flow is re-exported.
NODE_ID = 'c80bb7e0-4bb6-404a-85b9-101887e4f8d1'
PROCESSING_OUTPUT_NAME = f'{NODE_ID}.default'
PROCESSING_OUTPUT_PATH = f's3://{BUCKET}/{PREFIX}'  # Bucket and prefix can also be different from input

# --- Run identifier -------------------------------------------------------------
# UTC stamp formatted as day-hour-minute-second; it suffixes S3 keys and job names.
# NOTE(review): the format has no year or month, so the same stamp can recur in a
# later month and reuse the same S3 keys.
CURRENT_TIMESTAMP = time.strftime('%d-%H-%M-%S', time.gmtime())
CURRENT_TIMESTAMP

'21-18-54-00'

### 1. Create Data Wrangler Processing Step

#### Processing Input and Output

In [8]:
# Dataset input for the processing job: the CSV is read from S3 and placed under
# /opt/ml/processing/ in the container, fully replicated to every instance.
processing_input = ProcessingInput(
    source=f'{PROCESSING_INPUT_PATH}/{PROCESSING_INPUT_NAME}',
    destination=f'/opt/ml/processing/{PROCESSING_INPUT_NAME}',
    input_name=PROCESSING_INPUT_NAME,
    s3_data_type='S3Prefix',
    s3_input_mode='File',
    s3_data_distribution_type='FullyReplicated',
)

# List of dataset inputs handed to the processing step (a single entry here).
data_sources = [processing_input]

In [9]:
# Job output: whatever the container writes to /opt/ml/processing/output is
# uploaded at the end of the job to a timestamped folder under the output path.
processing_job_output = ProcessingOutput(
    output_name=PROCESSING_OUTPUT_NAME,
    source='/opt/ml/processing/output',
    destination=f'{PROCESSING_OUTPUT_PATH}/{CURRENT_TIMESTAMP}',
    s3_upload_mode='EndOfJob',
)

#### Upload original data flow to S3

In [10]:
# Local Data Wrangler flow file, expected next to this notebook.
FLOW_FILE_NAME = 'loans.flow'

# Compute the object key once so the upload and the URI below cannot drift apart.
flow_object_key = f'{PREFIX}/{CURRENT_TIMESTAMP}-{FLOW_FILE_NAME}'
s3_client.upload_file(Filename=FLOW_FILE_NAME, Bucket=BUCKET, Key=flow_object_key)

# Full S3 URI of the uploaded flow; consumed by the flow ProcessingInput.
FLOW_S3_URI = f's3://{BUCKET}/{flow_object_key}'
FLOW_S3_URI

's3://sagemaker-us-east-1-119174016168/01-dw-datasets/21-18-54-00-loans.flow'

In [11]:
# Flow-file input: the uploaded .flow is placed under /opt/ml/processing/flow
# in the container, fully replicated to every instance.
flow_input = ProcessingInput(
    input_name='flow',
    source=FLOW_S3_URI,
    destination='/opt/ml/processing/flow',
    s3_data_type='S3Prefix',
    s3_input_mode='File',
    s3_data_distribution_type='FullyReplicated',
)

#### Data Wrangler config parameters

In [12]:
# Base name for the processing job, suffixed with this run's timestamp.
PROCESSING_JOB_NAME = f'Data-Wrangler-Processing-job-{CURRENT_TIMESTAMP}'

# Data Wrangler container image, pinned to tag 1.31.0.
# NOTE(review): the URI points at a us-east-1 ECR registry -- confirm it matches
# the region this notebook runs in.
DW_PROCESSING_CONTAINER_URI = '663277389841.dkr.ecr.us-east-1.amazonaws.com/sagemaker-data-wrangler-container:1.31.0'

# Cluster sizing for the processing job.
INSTANCE_TYPE = 'ml.m5.4xlarge'
INSTANCE_COUNT = 2
EBS_VOLUME_SIZE = 30  # in GB

# NOTE(review): not referenced by any later cell visible in this notebook.
OUTPUT_CONTENT_TYPE = 'CSV'

In [13]:
# Settings serialized to JSON and passed to the container via
# --refit-trained-params: enable refit and name the refitted flow file.
refit_trained_params = dict(
    refit=True,
    output_flow=f'{CURRENT_TIMESTAMP}-refitted-{FLOW_FILE_NAME}',
)

#### Create a Processor

In [14]:
# Generic processor that runs the Data Wrangler container on the cluster
# described by the INSTANCE_* / EBS_VOLUME_SIZE constants.
processor = Processor(
    image_uri=DW_PROCESSING_CONTAINER_URI,
    role=ROLE,
    base_job_name=PROCESSING_JOB_NAME,
    instance_type=INSTANCE_TYPE,
    instance_count=INSTANCE_COUNT,
    volume_size_in_gb=EBS_VOLUME_SIZE,
    sagemaker_session=sagemaker_session,
)

#### Create the Data Wrangler Processing Step

In [15]:
# Single container argument: the flag followed by the refit settings as a
# single-quoted JSON document.
refit_argument = f"--refit-trained-params '{json.dumps(refit_trained_params)}'"

# Pipeline step that runs the Data Wrangler flow: the flow file goes first in
# the input list, followed by the dataset inputs.
data_wrangler_step = ProcessingStep(
    name='DataWranglerProcessingStep',
    processor=processor,
    inputs=[flow_input, *data_sources],
    outputs=[processing_job_output],
    job_arguments=[refit_argument],
)

### 2. Create Autopilot Step

In [16]:
# Session object given to the AutoML estimator below; the result of its fit()
# call is later handed to AutoMLStep as step_args.
pipeline_session = PipelineSession()

In [17]:
# Column the AutoML job is asked to predict.
TARGET_ATTRIBUTE_NAME = 'loan_status'

# Content type declared for the AutoML training input (CSV with a header row).
TRAINING_INPUT_CONTENT_TYPE = 'text/csv;header=present'

In [18]:
# AutoML estimator in ENSEMBLING mode, bound to the pipeline session.
auto_ml = AutoML(
    role=ROLE,
    mode='ENSEMBLING',
    target_attribute_name=TARGET_ATTRIBUTE_NAME,
    sagemaker_session=pipeline_session,
)

In [21]:
# Pieces of the training-data location, resolved when the pipeline runs:
#   <processing output S3 URI>/<processing job name>/<node id>/default
step_properties = data_wrangler_step.properties
processing_output_uri = step_properties.ProcessingOutputConfig.Outputs[PROCESSING_OUTPUT_NAME].S3Output.S3Uri
processing_job_name = step_properties.ProcessingJobName
node_output_subpath = PROCESSING_OUTPUT_NAME.replace('.', '/')

s3_input = Join(on='/', values=[processing_output_uri, processing_job_name, node_output_subpath])
s3_input

Join(on='/', values=[<sagemaker.workflow.properties.Properties object at 0x7f46564504d0>, <sagemaker.workflow.properties.Properties object at 0x7f4657a5d350>, 'c80bb7e0-4bb6-404a-85b9-101887e4f8d1/default'])

In [22]:
# Describe the training data (joined S3 location, CSV with header, target column)
# and keep what fit() returns; it is passed to AutoMLStep as step_args.
automl_training_input = AutoMLInput(
    inputs=s3_input,
    content_type=TRAINING_INPUT_CONTENT_TYPE,
    target_attribute_name=TARGET_ATTRIBUTE_NAME,
)
train_args = auto_ml.fit(inputs=automl_training_input)



In [23]:
# Pipeline step wrapping the AutoML job described by train_args.
# NOTE(review): the step name is spelled 'DataWrangerAutoML' (no "l" in Wrangler);
# kept as-is because the name identifies the step in the pipeline definition.
training_step = AutoMLStep(step_args=train_args, name='DataWrangerAutoML')

### 3. Create SageMaker Pipeline

#### Define Pipeline Parameters

In [24]:
# Pipeline-level parameters. Their defaults are taken from the Data Wrangler
# config constants so the cluster sizing is declared in exactly one place.
# NOTE(review): the Processor above is built from INSTANCE_TYPE / INSTANCE_COUNT
# directly, so overriding these parameters at execution time does not change the
# processing cluster; none of the steps defined above references them.
instance_type = ParameterString(name='InstanceType', default_value=INSTANCE_TYPE)
instance_count = ParameterInteger(name='InstanceCount', default_value=INSTANCE_COUNT)

#### Create Pipeline

In [25]:
# Name under which the pipeline is created or updated.
pipeline_name = 'LowCodePipeline'

In [26]:
# Steps in the order they are listed in the pipeline definition:
# Data Wrangler processing first, then AutoML training on its output.
pipeline_steps = [
    data_wrangler_step,
    training_step,
]

pipeline = Pipeline(
    name=pipeline_name,
    steps=pipeline_steps,
    parameters=[instance_type, instance_count],
    sagemaker_session=sagemaker_session,
)

#### Examine Pipeline Definition

In [27]:
# Render the pipeline definition (a JSON string) and parse it into a dict
# so the notebook displays it as a structured object.
definition_json = pipeline.definition()
definition = json.loads(definition_json)
definition

{'Version': '2020-12-01',
 'Metadata': {},
 'Parameters': [{'Name': 'InstanceType',
   'Type': 'String',
   'DefaultValue': 'ml.m5.4xlarge'},
  {'Name': 'InstanceCount', 'Type': 'Integer', 'DefaultValue': 2}],
 'PipelineExperimentConfig': {'ExperimentName': {'Get': 'Execution.PipelineName'},
  'TrialName': {'Get': 'Execution.PipelineExecutionId'}},
 'Steps': [{'Name': 'DataWranglerProcessingStep',
   'Type': 'Processing',
   'Arguments': {'ProcessingResources': {'ClusterConfig': {'InstanceType': 'ml.m5.4xlarge',
      'InstanceCount': 2,
      'VolumeSizeInGB': 30}},
    'AppSpecification': {'ImageUri': '663277389841.dkr.ecr.us-east-1.amazonaws.com/sagemaker-data-wrangler-container:1.31.0',
     'ContainerArguments': ['--refit-trained-params \'{"refit": true, "output_flow": "21-18-54-00-refitted-loans.flow"}\'']},
    'RoleArn': 'arn:aws:iam::119174016168:role/service-role/AmazonSageMaker-ExecutionRole-20211014T093628',
    'ProcessingInputs': [{'InputName': 'flow',
      'AppManaged':

#### Start Pipeline Execution

In [28]:
# Create the pipeline, or update it if one with this name already exists,
# using the notebook's execution role; the service response is displayed.
pipeline.upsert(role_arn=ROLE)

{'PipelineArn': 'arn:aws:sagemaker:us-east-1:119174016168:pipeline/lowcodepipeline',
 'ResponseMetadata': {'RequestId': 'dd49e428-e93d-4352-a17b-c215aa8500ff',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'dd49e428-e93d-4352-a17b-c215aa8500ff',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '83',
   'date': 'Wed, 21 Dec 2022 18:55:23 GMT'},
  'RetryAttempts': 0}}

In [29]:
# Start an execution of the pipeline and keep the returned handle so the run
# can be referenced from later cells; the handle is displayed as the cell output.
execution = pipeline.start()
execution

_PipelineExecution(arn='arn:aws:sagemaker:us-east-1:119174016168:pipeline/lowcodepipeline/execution/r0e9roq33nuw', sagemaker_session=<sagemaker.session.Session object at 0x7f46599cff50>)