#### Setup

In [1]:
import json
import logging
import sys
from pathlib import Path
import ipytest
import os
import sagemaker
import boto3


# Folder that holds the project's Python sources.
# NOTE(review): presumably this is where the `config` module imported in this cell lives — confirm.
CODE_FOLDER = Path("code")

# Add the folder to the import path only when it is not already there, so
# re-running this cell does not pile up duplicate sys.path entries.
if f"./{CODE_FOLDER}" not in sys.path:
    sys.path.append(f"./{CODE_FOLDER}")

from config import Configuration


# Only surface errors from the "sagemaker.config" logger.
logging.getLogger("sagemaker.config").setLevel(logging.ERROR)

# Configure ipytest for this notebook; raise_on_error=True is passed so
# failures are raised rather than only reported.
ipytest.autoconfig(raise_on_error=True)

# Relative path of the penguins dataset CSV.
DATA_FILEPATH = "data/penguins.csv"


sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/carlodavid/Library/Application Support/sagemaker/config.yaml


In [13]:
# Toggle handed to Configuration below. The notebook is currently saved with
# it set to False.
# NOTE(review): presumably True selects SageMaker local mode — confirm in code/config.py.
LOCAL_MODE = False
# Project-specific settings object; bucket, role and config are read from it
# in the next cell.
config_instance = Configuration(LOCAL_MODE)

In [14]:
# Pull the values used throughout the notebook out of the configuration
# object in one go, then display the resulting settings dictionary.
bucket, role, config = (
    config_instance.bucket,
    config_instance.role,
    config_instance.config,
)
config

{'session': <sagemaker.workflow.pipeline_context.PipelineSession at 0x31398dbd0>,
 'instance_type': 'ml.m5.xlarge',
 'image': None,
 'framework_version': '2.12',
 'py_version': 'py310'}

In [15]:
# AWS handles: the region of the default boto3 session, low-level IAM and
# SageMaker clients, and a SageMaker SDK session.
region = boto3.Session().region_name
iam_client = boto3.client("iam")
sagemaker_client = boto3.client("sagemaker")
sagemaker_session = sagemaker.session.Session()

# Root S3 prefix under which this project's data and outputs are addressed.
S3_LOCATION = f"s3://{bucket}/penguins"

#### Preprocessing

In [16]:
# Make sure the folder for the processing code exists, then expose it on the
# import path. The membership check keeps re-runs of this cell from appending
# the same entry to sys.path again.
(CODE_FOLDER / "processing").mkdir(parents=True, exist_ok=True)
if f"./{CODE_FOLDER}/processing" not in sys.path:
    sys.path.append(f"./{CODE_FOLDER}/processing")

In [17]:
# Step caching settings: caching is switched on with an expiry of "15d".
# The object is passed to the processing step via its cache_config argument.
from sagemaker.workflow.steps import CacheConfig

cache_config = CacheConfig(
    expire_after="15d",
    enable_caching=True,
)

In [18]:
# Pipeline-level configuration: one string parameter plus the definition config.
from sagemaker.workflow.parameters import ParameterString
from sagemaker.workflow.pipeline_definition_config import PipelineDefinitionConfig

# String parameter for the dataset location; it defaults to the "data"
# prefix under S3_LOCATION.
dataset_location = ParameterString(name="dataset_location", default_value=f"{S3_LOCATION}/data")

# Definition config created with use_custom_job_prefix=True.
pipeline_definition_config = PipelineDefinitionConfig(use_custom_job_prefix=True)

In [19]:
# Processor used by the preprocessing step: a single instance of the type
# chosen in `config`, running the scikit-learn "1.2-1" framework version
# under the configured role and session.
from sagemaker.sklearn.processing import SKLearnProcessor

processor = SKLearnProcessor(
    framework_version="1.2-1",
    base_job_name="preprocess-data",
    role=role,
    instance_count=1,
    instance_type=config["instance_type"],
    sagemaker_session=config["session"],
)

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3


In [20]:
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep

# Pipeline step that runs code/processing/script.py with the processor
# defined above.
#
# Input:  the `dataset_location` pipeline parameter, made available to the
#         job at /opt/ml/processing/input.
# Outputs: one ProcessingOutput per name listed below. Every output follows
#         the same pattern: it is collected from /opt/ml/processing/<name>
#         and stored under <S3_LOCATION>/preprocessing/<name>. Generating
#         them from the list of names keeps the pattern in a single place.
# The step uses `cache_config`.
preprocessing_step = ProcessingStep(
    name="preprocess-data",
    step_args=processor.run(
        code=(CODE_FOLDER / "processing" / "script.py").as_posix(),
        inputs=[
            ProcessingInput(
                source=dataset_location,
                destination="/opt/ml/processing/input",
            ),
        ],
        outputs=[
            ProcessingOutput(
                output_name=output_name,
                source=f"/opt/ml/processing/{output_name}",
                destination=f"{S3_LOCATION}/preprocessing/{output_name}",
            )
            # Order is preserved so the outputs are declared exactly as before.
            for output_name in (
                "train",
                "validation",
                "test",
                "model",
                "train-baseline",
                "test-baseline",
            )
        ],
    ),
    cache_config=cache_config,
)



In [21]:
# Assemble the pipeline from the single preprocessing step and register it.
from sagemaker.workflow.pipeline import Pipeline

session3_pipeline = Pipeline(
    sagemaker_session=config["session"],
    pipeline_definition_config=pipeline_definition_config,
    name="session3-pipeline",
    parameters=[dataset_location],
    steps=[preprocessing_step],
)

# Upsert the definition under the configured role; the response is shown as
# the cell output.
session3_pipeline.upsert(role_arn=role)

{'PipelineArn': 'arn:aws:sagemaker:ap-southeast-1:931619889046:pipeline/session3-pipeline',
 'ResponseMetadata': {'RequestId': 'c5605e6f-e784-465a-8639-1827cb2e062e',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'c5605e6f-e784-465a-8639-1827cb2e062e',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '90',
   'date': 'Sun, 07 Apr 2024 00:01:57 GMT'},
  'RetryAttempts': 0}}

In [12]:
# local setup
# Left commented out: the saved output under this cell comes from an earlier
# local-mode run (sagemaker.local log lines), while LOCAL_MODE is set to False
# earlier in the notebook.
# NOTE(review): presumably meant to be uncommented only when LOCAL_MODE = True — confirm.
# session3_pipeline.start()

INFO:sagemaker.local.entities:Starting execution for pipeline session3-pipeline. Execution ID is 3601d7dd-2c0b-4c1a-a22a-1dd662c743fd
INFO:sagemaker.local.entities:Starting pipeline step: 'preprocess-data'
INFO:sagemaker.local.image:'Docker Compose' found using Docker CLI.
INFO:sagemaker.local.local_session:Starting processing job
INFO:sagemaker.local.image:Using the long-lived AWS credentials found in session
INFO:sagemaker.local.image:docker compose file: 
networks:
  sagemaker-local:
    name: sagemaker-local
services:
  algo-1-ti1z0:
    container_name: ynu9bhvzt6-algo-1-ti1z0
    entrypoint:
    - python3
    - /opt/ml/processing/input/code/script.py
    environment:
    - '[Masked]'
    - '[Masked]'
    image: 121021644041.dkr.ecr.ap-southeast-1.amazonaws.com/sagemaker-scikit-learn:1.2-1-cpu-py3
    networks:
      sagemaker-local:
        aliases:
        - algo-1-ti1z0
    stdin_open: true
    tty: true
    volumes:
    - /private/var/folders/dp/50k_xys57dgcryfz0dv88_vr0000gn/T

 Container ynu9bhvzt6-algo-1-ti1z0  Creating
 algo-1-ti1z0 The requested image's platform (linux/amd64) does not match the detected host platform (linux/arm64/v8) and no specific platform was requested 
 Container ynu9bhvzt6-algo-1-ti1z0  Created
Attaching to ynu9bhvzt6-algo-1-ti1z0
ynu9bhvzt6-algo-1-ti1z0 exited with code 0
Aborting on container exit...
 Container ynu9bhvzt6-algo-1-ti1z0  Stopping
 Container ynu9bhvzt6-algo-1-ti1z0  Stopped


INFO:sagemaker.local.image:===== Job Complete =====
INFO:sagemaker.local.entities:Pipeline step 'preprocess-data' SUCCEEDED.
INFO:sagemaker.local.entities:Pipeline execution 3601d7dd-2c0b-4c1a-a22a-1dd662c743fd SUCCEEDED


<sagemaker.local.entities._LocalPipelineExecution at 0x31167de70>

In [22]:
# remote setup
# Start an execution of the upserted pipeline. The returned execution object
# is the cell's result (the saved output shows its execution ARN).
session3_pipeline.start()

_PipelineExecution(arn='arn:aws:sagemaker:ap-southeast-1:931619889046:pipeline/session3-pipeline/execution/gt182o55nj0k', sagemaker_session=<sagemaker.workflow.pipeline_context.PipelineSession object at 0x31398dbd0>)