In [3]:
# https://github.com/aws/amazon-sagemaker-examples/blob/main/sagemaker_processing/scikit_learn_data_processing_and_model_evaluation/scikit_learn_data_processing_and_model_evaluation.ipynb

In [7]:
pip uninstall sagemaker -y

Found existing installation: sagemaker 2.187.0
Uninstalling sagemaker-2.187.0:
  Successfully uninstalled sagemaker-2.187.0
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
pip install sagemaker

Collecting sagemaker
  Using cached sagemaker-2.187.0-py2.py3-none-any.whl
Installing collected packages: sagemaker
Successfully installed sagemaker-2.187.0
[0mNote: you may need to restart the kernel to use updated packages.


In [4]:
pip install stepfunctions

Collecting stepfunctions
  Using cached stepfunctions-2.3.0-py2.py3-none-any.whl
Installing collected packages: stepfunctions
Successfully installed stepfunctions-2.3.0
[0mNote: you may need to restart the kernel to use updated packages.


In [6]:
# ! pip install -U sagemaker

In [2]:
# pip install --upgrade pip

In [7]:
!pip install stepfunctions  omegaconf  nb-black python-dotenv

Collecting omegaconf
  Using cached omegaconf-2.3.0-py3-none-any.whl (79 kB)
Collecting nb-black
  Using cached nb_black-1.0.7-py3-none-any.whl
Collecting python-dotenv
  Using cached python_dotenv-1.0.0-py3-none-any.whl (19 kB)
Collecting antlr4-python3-runtime==4.9.* (from omegaconf)
  Using cached antlr4_python3_runtime-4.9.3-py3-none-any.whl
[33mDEPRECATION: nb-black 1.0.7 has a non-standard dependency specifier black>='19.3'; python_version >= "3.6". pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of nb-black or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0mInstalling collected packages: antlr4-python3-runtime, python-dotenv, omegaconf, nb-black
Successfully installed antlr4-python3-runtime-4.9.3 nb-black-1.0.7 omegaconf-2.3.0 python-dotenv-1.0.0
[0m

In [5]:
!pip uninstall -y PyYAML

Found existing installation: PyYAML 6.0.1
Uninstalling PyYAML-6.0.1:
  Successfully uninstalled PyYAML-6.0.1
[0m

In [None]:
pip install PyYAML

[0mNote: you may need to restart the kernel to use updated packages.


In [7]:
%load_ext lab_black

In [8]:
%load_ext autoreload
%autoreload 2

# Import general libraries

In [8]:
import stepfunctions
from config import config

In [9]:
# Region as declared in the project config object (a separate `region` is later
# derived from the SageMaker session's boto session).
REGION = config.region

# Custom Retry Configuration

In [10]:
# Build a boto3 SageMaker client with explicit connect/read timeouts and a raised
# retry ceiling, then print the retry settings the client ended up with.
import boto3
from botocore.config import Config

boto_config = Config(
    connect_timeout=5,
    read_timeout=60,
    retries={"max_attempts": 60},
)
sm_boto = boto3.client("sagemaker", config=boto_config)
print(sm_boto.meta.config.retries)

{'total_max_attempts': 61, 'mode': 'legacy'}


# SageMaker Configuration

In [11]:
import boto3
import sagemaker
from sagemaker import get_execution_role

In [12]:
# SageMaker Session, built on top of the boto3 client created earlier (sm_boto).

sagemaker_session = sagemaker.Session(sagemaker_client=sm_boto)
# Region name as reported by the session's underlying boto session.
region = sagemaker_session.boto_session.region_name
# Print the retry settings of the SageMaker client this session holds.
print(sagemaker_session.sagemaker_client.meta.config.retries)
# SageMaker Execution Role
# You can use sagemaker.get_execution_role() if running inside sagemaker's notebook instance
role = get_execution_role()
# S3 bucket, key prefix and base URI all come from the project config object.
bucket = config.s3.bucket
prefix = config.s3.prefix
# bucket_path = config.s3.bucket_path
s3_bucket_base_uri = config.s3.s3_bucket_base_uri

# Display the resolved role ARN as the cell output.
# NOTE(review): the rendered ARN contains the AWS account id; consider clearing this output before sharing.
role

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
{'total_max_attempts': 61, 'mode': 'legacy'}
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


'arn:aws:iam::889859566884:role/service-role/AmazonSageMaker-ExecutionRole-20211215T171253'

# Processing Steps

In [13]:
import os
from sagemaker.sklearn.processing import SKLearnProcessor

import time
import stepfunctions
from stepfunctions import steps
from stepfunctions.inputs import ExecutionInput
from stepfunctions.workflow import Workflow
from stepfunctions.steps import (
    Chain,
    ChoiceRule,
    ModelStep,
    ProcessingStep,
    TrainingStep,
    TransformStep,
    Parallel,
)

from sagemaker.processing import ProcessingInput, ProcessingOutput

## Functions

In [14]:
def job_name(jobname):
    """
    Build a job name of the form ``MultiLabelClassification-<jobname>--<timestamp>``.

    The timestamp is the current UTC time formatted as ``YYYYmmddHHMMSS``, so two
    calls made within the same second with the same ``jobname`` return the same name.

    Args:
        jobname (str): Descriptive label inserted into the generated name.

    Returns:
        str: The label combined with the fixed prefix and the UTC timestamp.
    """
    timestamp = time.strftime("%Y%m%d%H%M%S", time.gmtime())
    return "MultiLabelClassification-{}--{}".format(jobname, timestamp)


def upload_code(bucket_name, prefix_name, script_location):
    """
    Upload a local file or directory to S3 through the module-level ``sagemaker_session``.

    The object key prefix is ``<prefix_name>/<script_location>``, i.e. the local
    path is mirrored underneath the given prefix.

    Args:
        bucket_name (str): Target S3 bucket.
        prefix_name (str): Key prefix inside the bucket under which the code is stored.
        script_location (str): Local path of the code to upload.

    Returns:
        Whatever ``sagemaker_session.upload_data`` returns (the S3 URI of the upload).
    """
    s3_key_prefix = "{}/{}".format(prefix_name, script_location)
    uploaded_uri = sagemaker_session.upload_data(
        script_location,
        bucket=bucket_name,
        key_prefix=s3_key_prefix,
    )
    return uploaded_uri

## Config

In [15]:
from dotenv import load_dotenv, find_dotenv

# Load the .env file located by find_dotenv(); the return value is discarded.
_ = load_dotenv(find_dotenv())
# Role later passed to Workflow(role=...). os.getenv returns None when
# SAGEMAKER_WORKFLOW_ROLE is not set, so a missing variable is not detected here.
workflow_execution_role = os.getenv("SAGEMAKER_WORKFLOW_ROLE")

In [16]:
# execution_input = ExecutionInput(
#     schema={
#         "PreprocessingJobName": str,
#     }
# )
# failed_state_sagemaker_processing_failure = stepfunctions.steps.states.Fail(
#     "ML Workflow failed", cause="SageMakerProcessingJobFailed"
# )
# catch_state_processing = stepfunctions.steps.states.Catch(
#     error_equals=["States.TaskFailed"],
#     next_step=failed_state_sagemaker_processing_failure,
# )

In [41]:
# Fail state used as the landing point when the processing step errors out.
failed_state_sagemaker_processing_failure = stepfunctions.steps.states.Fail(
    "ML Workflow failed",
    cause="SageMakerProcessingJobFailed",
)

# Catcher that routes "States.TaskFailed" errors to the Fail state above.
catch_state_processing = stepfunctions.steps.states.Catch(
    next_step=failed_state_sagemaker_processing_failure,
    error_equals=["States.TaskFailed"],
)

In [43]:
# Fail state used as the landing point when the training step errors out.
failed_state_sagemaker_training_failure = stepfunctions.steps.states.Fail(
    "ML Training failed",
    cause="SageMakerTrainingJobFailed",
)

# Catcher that routes "States.TaskFailed" errors to the Fail state above.
catch_state_training = stepfunctions.steps.states.Catch(
    next_step=failed_state_sagemaker_training_failure,
    error_equals=["States.TaskFailed"],
)


In [44]:
# Fail state used as the landing point when an inference step errors out.
failed_state_sagemaker_inference_failure = stepfunctions.steps.states.Fail(
    "ML Inference failed",
    cause="SageMakerInferenceFailed",
)

# Catcher that routes "States.TaskFailed" errors to the Fail state above.
catch_state_inference = stepfunctions.steps.states.Catch(
    next_step=failed_state_sagemaker_inference_failure,
    error_equals=["States.TaskFailed"],
)


## 1.1 Preprocess Job

In [17]:
# Local directory holding the preprocessing code; also reused as part of the S3 key prefix.
PREPROCESSING_SCRIPT_LOCATION = "preprocess/code"
# S3 destination shared by all preprocessing outputs: <base uri>/<prefix>.
output_preprocess = f"{s3_bucket_base_uri}/{config.s3.prefix}"

In [18]:
# Upload the local preprocessing code directory; upload_code() stores it under
# <prefix>/<PREPROCESSING_SCRIPT_LOCATION> in the bucket.
input_code_preprocess = upload_code(bucket, prefix, PREPROCESSING_SCRIPT_LOCATION)

# Container inputs: raw data, the uploaded code, and the code's config folder.
raw_data_input = ProcessingInput(
    source=f"{config.s3.s3_bucket_base_uri}/{config.s3.input}",
    destination="/opt/ml/processing/input",
    input_name="input-data",
)
code_input = ProcessingInput(
    source=input_code_preprocess,
    destination="/opt/ml/processing/input/code",
    input_name="code",
)
code_config_input = ProcessingInput(
    source=f"s3://{bucket}/{prefix}/{PREPROCESSING_SCRIPT_LOCATION}/config",
    destination="/opt/ml/processing/input/config",
    input_name="code-config",
)
inputs_preprocess = [raw_data_input, code_input, code_config_input]


# Container outputs: one per channel, all written to the same S3 destination.
outputs_preprocess = [
    ProcessingOutput(
        source=f"/opt/ml/processing/{channel}",
        destination=output_preprocess,
        output_name=f"{channel}_data",
    )
    for channel in ("train", "test", "labels")
]

In [19]:
# State name for the preprocessing step: fixed label followed by a UTC timestamp
# (YYYYmmddHHMMSS) taken when this cell runs.
preprocessing_step_name = (
    "Multilabel Classification - Preprocessing Step "
    + time.strftime("%Y%m%d%H%M%S", time.gmtime())
)


# Define an Amazon SageMaker SKLearnProcessor with custom settings.
def sklearn_processor(instance_type="ml.m5.xlarge"):
    """
    Create an SKLearnProcessor instance for Amazon SageMaker processing jobs.

    The processor is bound to the notebook's ``sagemaker_session`` (the one built
    on the retry-configured boto3 client), matching what ``generate_estimator``
    does, instead of letting the SDK create a separate default session.

    Args:
        instance_type (str): The Amazon SageMaker instance type for processing jobs.

    Returns:
        sagemaker.sklearn.processing.SKLearnProcessor: Single-instance processor
        using scikit-learn framework version "1.2-1" and the module-level ``role``.
    """
    return SKLearnProcessor(
        framework_version="1.2-1",
        role=role,  # Module-level execution role resolved earlier in the notebook.
        instance_type=instance_type,
        instance_count=1,
        # Reuse the shared session so the custom timeout/retry settings apply here too.
        sagemaker_session=sagemaker_session,
        # max_runtime_in_seconds=1200,  # Uncomment and customize if needed.
    )


# Step Functions state that runs the preprocessing script as a SageMaker processing job.
# The job name is computed once, when this cell runs (label + UTC timestamp).
processing_step = ProcessingStep(
    preprocessing_step_name,
    processor=sklearn_processor(),
    job_name="PreprocessingJobName" + time.strftime("%Y%m%d%H%M%S", time.gmtime()),
    inputs=inputs_preprocess,  # Raw data, code and code-config inputs defined above.
    outputs=outputs_preprocess,  # train / test / labels outputs defined above.
    # Run the uploaded script from the "code" input mount point.
    container_entrypoint=["python3", "/opt/ml/processing/input/code/run.py"],
    # Command-line arguments forwarded to run.py.
    container_arguments=["--train-test-split-ratio", "0.2"],
)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


# Training 

In [20]:
# Read the output configuration the processing step will submit and index each
# declared output's S3 destination by its output name.
output_config = processing_step.fields["parameters"]["ProcessingOutputConfig"]
s3_uri_by_output_name = {
    output["OutputName"]: output["S3Output"]["S3Uri"].rstrip("/")
    for output in output_config["Outputs"]
}

# S3 URIs always use "/" as separator, so they are assembled with plain string
# formatting rather than os.path.join (which follows the local OS path rules).
# A missing output name raises KeyError here instead of a NameError further down.
preprocessed_training_data = f"{s3_uri_by_output_name['train_data']}/train.csv"
preprocessed_test_data = f"{s3_uri_by_output_name['test_data']}/test.csv"
preprocessed_labels_data = f"{s3_uri_by_output_name['labels_data']}/labels.csv"

In [21]:
output_config

{'Outputs': [{'OutputName': 'train_data',
   'AppManaged': False,
   'S3Output': {'S3Uri': 's3://ck-qa-pldatascience/aws_mlOps/sagemaker-pipeline/stepfunctions',
    'LocalPath': '/opt/ml/processing/train',
    'S3UploadMode': 'EndOfJob'}},
  {'OutputName': 'test_data',
   'AppManaged': False,
   'S3Output': {'S3Uri': 's3://ck-qa-pldatascience/aws_mlOps/sagemaker-pipeline/stepfunctions',
    'LocalPath': '/opt/ml/processing/test',
    'S3UploadMode': 'EndOfJob'}},
  {'OutputName': 'labels_data',
   'AppManaged': False,
   'S3Output': {'S3Uri': 's3://ck-qa-pldatascience/aws_mlOps/sagemaker-pipeline/stepfunctions',
    'LocalPath': '/opt/ml/processing/labels',
    'S3UploadMode': 'EndOfJob'}}]}

In [22]:
from sagemaker.huggingface import HuggingFace

In [23]:
# Names of the metrics to scrape from the training logs, in reporting order.
_METRIC_NAMES = (
    "eval_loss",
    "eval_accuracy",
    "eval_f1",
    "eval_roc",
    "eval_roc_auc",
    "eval_precision",
    "eval_recall",
    "eval_runtime",
    "eval_samples_per_second",
    "epoch",
)

# One definition per metric: the regex looks for "'<name>': <number>" in the log
# output and captures the number. A raw string is used so that "\-" reaches the
# regex engine verbatim instead of being an invalid Python string escape.
# NOTE(review): the "." inside the group is unescaped and therefore matches any
# character, not only a decimal point -- left unchanged to keep the pattern identical.
metric_definitions = [
    {"Name": name, "Regex": rf"'{name}': ([0-9]+(.|e\-)[0-9]+),?"}
    for name in _METRIC_NAMES
]

# Volume size to request for each supported GPU instance type
# (passed as `volume_size` to the estimator).
instance_volume = {
    "ml.g4dn.16xlarge": 900,
    "ml.g4dn.8xlarge": 500,
    "ml.g4dn.4xlarge": 225,
    "ml.g4dn.2xlarge": 225,
    "ml.g4dn.xlarge": 125,
}


# Training parameters read by generate_estimator().
# NOTE(review): "volume_size" is not read by generate_estimator(), which looks the
# size up in instance_volume by "instance_type" instead -- confirm whether it is still needed.
params = {
    "epochs": 2,
    "train-batch-size": 8,
    "eval_steps": 1,
    "instance_type": "ml.g4dn.2xlarge",
    "volume_size": 125,
}

In [31]:
def generate_estimator():
    """
    Build the HuggingFace estimator used for the multi-label classification training.

    S3 locations are derived from ``config.s3.bucket`` / ``config.s3.prefix``;
    epochs, batch size, eval steps and instance type come from the module-level
    ``params`` dict, and the volume size is looked up in ``instance_volume`` by
    instance type.

    Returns:
        sagemaker.huggingface.HuggingFace: Estimator bound to the module-level
        ``role`` and ``sagemaker_session``.
    """
    s3_root = f"s3://{config.s3.bucket}/{config.s3.prefix}"
    checkpoints_uri = f"{s3_root}/checkpoints"
    training_jobs_uri = f"{s3_root}/training_jobs"
    instance_type = params["instance_type"]

    # Hyperparameters handed to train.py; the checkpoint URI doubles as "output_dir".
    training_hyperparameters = {
        "model_name": "distilbert-base-uncased",
        "epochs": params["epochs"],
        "train-batch-size": params["train-batch-size"],
        "output_dir": checkpoints_uri,
        "eval_steps": params["eval_steps"],
    }

    estimator = HuggingFace(
        entry_point="train.py",
        source_dir="train/code",
        output_path=training_jobs_uri + "/",
        code_location=training_jobs_uri,
        role=role,
        base_job_name="multi-label-classification",
        checkpoint_s3_uri=checkpoints_uri,
        instance_type=instance_type,
        instance_count=1,
        transformers_version="4.6",
        pytorch_version="1.7",
        py_version="py36",
        hyperparameters=training_hyperparameters,
        metric_definitions=metric_definitions,
        volume_size=instance_volume[instance_type],
        sagemaker_session=sagemaker_session,
    )
    return estimator

In [32]:
preprocessed_labels_data

's3://ck-qa-pldatascience/aws_mlOps/sagemaker-pipeline/stepfunctions/labels.csv'

In [33]:
def generate_data():
    """
    Return the training input channels as a dict of S3 URIs.

    Returns:
        dict: ``"train"``, ``"test"`` and ``"labels"`` mapped to the module-level
        ``preprocessed_training_data``, ``preprocessed_test_data`` and
        ``preprocessed_labels_data`` values.
    """
    return {
        "train": preprocessed_training_data,
        "test": preprocessed_test_data,
        "labels": preprocessed_labels_data,
    }


def generate_training_step(instance="ml.g4dn.2xlarge"):
    """
    Create the Step Functions TrainingStep for the multi-label classification model.

    Args:
        instance (str): Instance type shown in the state name. It only affects the
            label: the estimator comes from ``generate_estimator()``, which takes
            its instance type from ``params["instance_type"]``.

    Returns:
        stepfunctions.steps.TrainingStep: Step that trains on ``generate_data()``
        and waits for the training job to complete.
    """
    timestamp = time.strftime("%Y%m%d%H%M%S", time.gmtime())
    return steps.TrainingStep(
        f"Trainning -- instance {instance}",
        estimator=generate_estimator(),
        data=generate_data(),
        job_name=f"multi-label-classification--{timestamp}",
        wait_for_completion=True,
    )

In [34]:
# Start a training job directly from the notebook, independently of the
# Step Functions workflow built further down. wait=False is passed so the call
# is not asked to block until the job finishes.
model = generate_estimator()
model.fit(generate_data(), wait=False)

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.


Using provided s3_resource


INFO:sagemaker:Creating training-job with name: multi-label-classification-2023-09-20-13-44-46-526


# Create Workflow

In [38]:
# Route task failures of the processing step to its dedicated Fail state.
processing_step.add_catch(catch_state_processing)

# generate_training_step is a factory *function*: build the step first, then attach
# the catcher to that step instance and reuse the same instance in the chain, so the
# state that runs is the one carrying the catch.
training_step = generate_training_step()
training_step.add_catch(catch_state_training)

# Linear workflow: preprocessing followed by training.
workflow_graph = Chain([processing_step, training_step])


# Register the state machine under a timestamped name using the workflow execution role.
branching_workflow = Workflow(
    name=job_name("MlOpsWorkflow"),
    definition=workflow_graph,
    role=workflow_execution_role,
)

# Create the state machine on AWS Step Functions; the result is shown as the cell output.
branching_workflow.create()

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.


Using provided s3_resource


INFO:stepfunctions:Workflow created successfully on AWS Step Functions.


'arn:aws:states:us-east-1:889859566884:stateMachine:MultiLabelClassification-MlOpsWorkflow--20230920140656'

In [39]:
# Execute workflow
execution = branching_workflow.execute(
    inputs={
        "PreprocessingJobName": job_name("preprocess"),
        # "EvaluationProcessingJobName": evaluation_job_name,  # Each SageMaker processing job requires a unique name,
    }
)

INFO:stepfunctions:Workflow execution started successfully on AWS Step Functions.


In [40]:
# Fetch the execution output with wait=False, i.e. without asking the call to wait for completion.
execution.get_output(wait=False)