#### Prerequisites 

In [None]:
%%capture

!pip install sagemaker==2.121.2
!pip install boto3==1.26.27

### Imports

In [27]:
from sagemaker.dataset_definition.inputs import AthenaDatasetDefinition, DatasetDefinition
from sagemaker.workflow.parameters import ParameterInteger, ParameterString
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.workflow.automl_step import AutoMLStep
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.workflow.model_step import ModelStep
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.functions import Join
from sagemaker.processing import Processor
from sagemaker import AutoML, AutoMLInput
import sagemaker
import logging
import boto3
import json
import time

In [3]:
logger = logging.getLogger('sagemaker')
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler())

In [4]:
logger.info(f'Using SageMaker: {sagemaker.__version__}')
logger.info(f'Using Boto3: {boto3.__version__}')

Using SageMaker: 2.121.2
Using Boto3: 1.26.27


### Essentials 

In [5]:
sagemaker_session = sagemaker.Session()
s3_client = boto3.client('s3')

In [6]:
ROLE = sagemaker.get_execution_role()
BUCKET = sagemaker_session.default_bucket()  # Can also be a custom S3 bucket
PREFIX = '01-dw-datasets'
PROCESSING_INPUT_PATH = f's3://{BUCKET}/{PREFIX}'
PROCESSING_INPUT_NAME = 'loans.csv' #
PROCESSING_OUTPUT_PATH = f's3://{BUCKET}/{PREFIX}' # Bucket and prefix can also be different from input
NODE_ID = 'c80bb7e0-4bb6-404a-85b9-101887e4f8d1'
PROCESSING_OUTPUT_NAME = f'{NODE_ID}.default'
CURRENT_TIMESTAMP = time.strftime('%d-%H-%M-%S', time.gmtime())
CURRENT_TIMESTAMP

'21-22-43-17'

### Define Pipeline Parameters 

In [10]:
instance_count = ParameterInteger(name='InstanceCount', default_value=1)
instance_type = ParameterString(name='InstanceType', default_value='ml.m5.xlarge')
max_automl_runtime = ParameterInteger(name='MaxAutoMLRuntime', default_value=3600)  # 1 hour
model_approval_status = ParameterString(name='ModelApprovalStatus', default_value='Approved')
model_package_group_name = ParameterString(name='ModelPackageName', default_value='AutoMLModelPackageGroup')

### 1. Create Data Wrangler Processing Step

#### Processing Input and Output

In [11]:
data_sources = []
processing_input = ProcessingInput(source=f'{PROCESSING_INPUT_PATH}/{PROCESSING_INPUT_NAME}', 
                                   destination=f'/opt/ml/processing/{PROCESSING_INPUT_NAME}', 
                                   input_name=PROCESSING_INPUT_NAME, 
                                   s3_data_type='S3Prefix', 
                                   s3_input_mode='File', 
                                   s3_data_distribution_type='FullyReplicated')
data_sources.append(processing_input)

In [12]:
processing_job_output = ProcessingOutput(source='/opt/ml/processing/output', 
                                         destination=f'{PROCESSING_OUTPUT_PATH}/{CURRENT_TIMESTAMP}',
                                         output_name=PROCESSING_OUTPUT_NAME,
                                         s3_upload_mode='EndOfJob')

#### Upload original data flow to S3

In [13]:
FLOW_FILE_NAME = 'loans.flow'

s3_client.upload_file(FLOW_FILE_NAME, 
                      BUCKET, 
                      f'{PREFIX}/{CURRENT_TIMESTAMP}-{FLOW_FILE_NAME}')

FLOW_S3_URI = f's3://{BUCKET}/{PREFIX}/{CURRENT_TIMESTAMP}-{FLOW_FILE_NAME}'
FLOW_S3_URI

's3://sagemaker-us-east-1-119174016168/01-dw-datasets/21-22-43-17-loans.flow'

In [14]:
flow_input = ProcessingInput(source=FLOW_S3_URI, 
                             destination='/opt/ml/processing/flow', 
                             input_name='flow', 
                             s3_data_type='S3Prefix', 
                             s3_input_mode='File', 
                             s3_data_distribution_type='FullyReplicated')

#### Data Wrangler config parameters

In [15]:
PROCESSING_JOB_NAME = f'Data-Wrangler-Processing-job-{CURRENT_TIMESTAMP}'
DW_PROCESSING_CONTAINER_URI = '663277389841.dkr.ecr.us-east-1.amazonaws.com/sagemaker-data-wrangler-container:1.31.0'
INSTANCE_COUNT = 2
INSTANCE_TYPE = 'ml.m5.4xlarge'
EBS_VOLUME_SIZE = 30  # in GB
OUTPUT_CONTENT_TYPE = 'CSV'

In [16]:
refit_trained_params = {'refit': True, 
                        'output_flow': f'{CURRENT_TIMESTAMP}-refitted-{FLOW_FILE_NAME}'}

#### Create a Processor

In [17]:
processor = Processor(base_job_name=PROCESSING_JOB_NAME,
                      role=ROLE, 
                      image_uri=DW_PROCESSING_CONTAINER_URI, 
                      instance_count=INSTANCE_COUNT, 
                      instance_type=INSTANCE_TYPE, 
                      volume_size_in_gb=EBS_VOLUME_SIZE,  
                      sagemaker_session=sagemaker_session)

#### Create the Data Wrangler Processing Step

In [18]:
data_wrangler_step = ProcessingStep(name='DataWranglerProcessingStep', 
                                    processor=processor, 
                                    inputs=[flow_input] + data_sources, 
                                    outputs=[processing_job_output], 
                                    job_arguments=[f"--refit-trained-params '{json.dumps(refit_trained_params)}'"])

### 2. Create Autopilot Step

In [19]:
pipeline_session = PipelineSession()

In [20]:
TRAINING_INPUT_CONTENT_TYPE = 'text/csv;header=present'
TARGET_ATTRIBUTE_NAME = 'loan_status'

In [21]:
auto_ml = AutoML(role=ROLE, 
                 target_attribute_name=TARGET_ATTRIBUTE_NAME, 
                 sagemaker_session=pipeline_session, 
                 mode='ENSEMBLING')

In [22]:
s3_input = Join(on='/', 
                values=[data_wrangler_step.properties.ProcessingOutputConfig.Outputs[PROCESSING_OUTPUT_NAME].S3Output.S3Uri,
                        data_wrangler_step.properties.ProcessingJobName, 
                        f'{PROCESSING_OUTPUT_NAME.replace(".", "/")}'])
s3_input

Join(on='/', values=[<sagemaker.workflow.properties.Properties object at 0x7fb040e17c90>, <sagemaker.workflow.properties.Properties object at 0x7fb040d977d0>, 'c80bb7e0-4bb6-404a-85b9-101887e4f8d1/default'])

In [23]:
train_args = auto_ml.fit(inputs=AutoMLInput(inputs=s3_input, 
                                            content_type=TRAINING_INPUT_CONTENT_TYPE, 
                                            target_attribute_name=TARGET_ATTRIBUTE_NAME))



In [24]:
automl_step = AutoMLStep(name='DataWrangerAutoML', 
                         step_args=train_args)

### 3. Model Creation Step

In [25]:
INSTANCE_TYPE = 'ml.m5.xlarge'

In [28]:
best_automl_model = automl_step.get_best_auto_ml_model(ROLE, 
                                                       sagemaker_session=pipeline_session)
step_args_create_model = best_automl_model.create(instance_type=INSTANCE_TYPE)
step_create_model = ModelStep(name='AutoMLModelCreationStep', 
                              step_args=step_args_create_model)

### 4. Batch Scoring Step

In [29]:
INSTANCE_TYPE = 'ml.m5.xlarge'
INSTANCE_COUNT = 2

In [31]:
TRUE_LABELS_FILE_NAME = 'true_labels.csv'
s3_client.upload_file('', 
                      BUCKET, 
                      f'{PREFIX}/true_labels.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'true_labels.csv'

In [34]:
s3_client.upload_file()

TypeError: upload_file() missing 3 required positional arguments: 'Filename', 'Bucket', and 'Key'

In [None]:
s3_x_test = s3_path_join(s3_prefix, "x_test.csv")
s3_y_test = s3_path_join(s3_prefix, "y_test.csv")

In [None]:
transformer = Transformer(model_name=step_create_model.properties.ModelName, 
                          instance_count=INSTANCE_COUNT, 
                          instance_type=INSTANCE_TYPE, 
                          output_path=Join(on='/', values=['s3:/', BUCKET, PREFIX, 'transform']), 
                          sagemaker_session=pipeline_session)

In [None]:
step_batch_transform = TransformStep(name='BatchTransformStep', 
                                     step_args=transformer.transform(data=s3_x_test, content_type='text/csv'))

### 4. Create SageMaker Pipeline

#### Define Pipeline Parameters

In [None]:
instance_type = ParameterString(name='InstanceType', default_value='ml.m5.4xlarge')
instance_count = ParameterInteger(name='InstanceCount', default_value=2)

#### Create Pipeline

In [None]:
pipeline_name = f'LowCodePipeline'

In [None]:
pipeline_steps = [data_wrangler_step, training_step]
pipeline = Pipeline(name=pipeline_name, 
                    parameters=[instance_type, instance_count], 
                    steps=pipeline_steps, 
                    sagemaker_session=sagemaker_session)

#### Examine Pipeline Definition

In [None]:
definition = json.loads(pipeline.definition())
definition

#### Start Pipeline Execution

In [None]:
pipeline.upsert(role_arn=ROLE)

In [None]:
pipeline.start()