In [1]:
import json
import pathlib
import os
import io
import time
import boto3
import sagemaker
import pandas as pd
import numpy as np

from platformdirs import site_config_dir, user_config_dir
from datetime import datetime, timezone, date
from time import gmtime, strftime, sleep
from sklearn.preprocessing import MinMaxScaler

from sagemaker import Model
from sagemaker.inputs import CreateModelInput
from sagemaker.inputs import TrainingInput
from sagemaker.feature_store.feature_group import FeatureGroup
from sagemaker.feature_store.feature_store import FeatureStore
from sagemaker.feature_store.inputs import TableFormatEnum
# from sagemaker.feature_store.feature_processor import CSVDataSource, feature_processor, to_pipeline
from sagemaker.remote_function import remote
from sagemaker.deserializers import CSVDeserializer
from sagemaker.serializers import CSVSerializer
from sagemaker.lambda_helper import Lambda
from sagemaker.xgboost.estimator import XGBoost
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.drift_check_baselines import DriftCheckBaselines
from sagemaker.image_uris import retrieve

from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.check_job_config import CheckJobConfig
from sagemaker.workflow.model_step import ModelStep
from sagemaker.workflow.fail_step import FailStep
from sagemaker.workflow.function_step import step
from sagemaker.workflow.step_outputs import get_step
from sagemaker.workflow.pipeline_experiment_config import PipelineExperimentConfig
from sagemaker.workflow.pipeline_definition_config import PipelineDefinitionConfig
from sagemaker.workflow.properties import PropertyFile
from sagemaker.workflow.condition_step import ConditionStep
from sagemaker.workflow.steps import (
    ProcessingStep,
    TrainingStep,
    CreateModelStep,
    CacheConfig
)
from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterFloat,
    ParameterString,
    ParameterBoolean
)
from sagemaker.workflow.clarify_check_step import (
    ModelBiasCheckConfig,
    ClarifyCheckStep,
    ModelExplainabilityCheckConfig
)
from sagemaker.workflow.conditions import (
    ConditionGreaterThan,
    ConditionGreaterThanOrEqualTo
)

from sagemaker.workflow.functions import (
    Join,
    JsonGet
)
from sagemaker.workflow.lambda_step import (
    LambdaStep,
    LambdaOutput,
    LambdaOutputTypeEnum,
)
from sagemaker.model_metrics import (
    MetricsSource,
    ModelMetrics,
    FileSource
)
from sagemaker.processing import (
    ProcessingInput,
    ProcessingOutput,
    ScriptProcessor
)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [14]:
# One boto3 session is the source of the region and of the low-level clients.
boto_session = boto3.Session()
region_name = boto_session.region_name

# Low-level service clients (created in the order: SageMaker, then S3).
client_sagemaker, client_s3 = (
    boto_session.client(service_name) for service_name in ("sagemaker", "s3")
)

# High-level SageMaker SDK session and the execution role used for the jobs below.
sagemaker_session = sagemaker.Session()
sagemaker_role = sagemaker.get_execution_role()

In [2]:
%store -r 

%store

Stored variables and their in-db values:
bucket_name                         -> 'sagemaker-us-east-1-891377318910'
bucket_prefix                       -> 'from-idea-to-production/xgboost'
col_target                          -> 'y'
dataset_file_local_path             -> 'data/bank-additional/bank-additional-full.csv'
dataset_raw                         -> 'bank-additional-full.csv'
domain_id                           -> 'd-ehxji4qaadry'
experiment_name                     -> 'itau-experiment-2024-10-13-18-09-47'
initialized                         -> True
input_s3_url                        -> 's3://sagemaker-us-east-1-891377318910/workshop_v2
loca_transformed_path               -> './data/transformed'
local_prefix                        -> './data/raw'
output_s3_url                       -> 's3://sagemaker-us-east-1-891377318910/workshop_v2
region                              -> 'us-east-1'
region_name                         -> 'us-east-1'
s3_data_raw_prefix                  -> 'data/

## Set constants

In [4]:
project = "itau-project"

# UTC timestamp embedded in the pipeline and model package group names.
current_timestamp = strftime("%Y-%m-%d-%H-%M-%S", gmtime())

# Names that carry the timestamp.
pipeline_name = "-".join([project, "pipeline", current_timestamp])
model_package_group_name = "-".join([project, "model-group", current_timestamp])

# Names that depend on the project only.
pipeline_model_name = project + "-model-xgb"
endpoint_config_name = project + "-endpoint-config"
endpoint_name = project + "-endpoint"

# Default approval status for model versions registered by the pipeline.
model_approval_status = "PendingManualApproval"

In [6]:
# Compute for the processing jobs (preprocessing and evaluation).
process_instance_type, process_instance_count = "ml.c5.xlarge", 1
# Compute for the training job.
train_instance_type, train_instance_count = "ml.m5.xlarge", 1

In [7]:
# Root S3 location under which every pipeline artifact is written.
output_s3_prefix = f"s3://{bucket_name}/{bucket_prefix}"

# Datasets produced by the preprocessing job.
train_s3_url, validation_s3_url, test_s3_url, baseline_s3_url = (
    f"{output_s3_prefix}/{folder}"
    for folder in ("train", "validation", "test", "baseline")
)

# Artifacts produced by the evaluation job.
evaluation_s3_url, prediction_baseline_s3_url = (
    f"{output_s3_prefix}/{folder}"
    for folder in ("evaluation", "prediction_baseline")
)

# Training output location.
# NOTE(review): this rebinds `output_s3_url`, which `%store -r` also restores — confirm the override is intended.
output_s3_url = f"{output_s3_prefix}/output"

In [8]:
# Print every S3 destination used by the pipeline, one per line.
print("\n".join([
    f"Train S3 url:                     {train_s3_url}",
    f"Validation S3 url:                {validation_s3_url}",
    f"Test S3 url:                      {test_s3_url}",
    f"Data baseline S3 url:             {baseline_s3_url}",
    f"Evaluation metrics S3 url:        {evaluation_s3_url}",
    f"Model prediction baseline S3 url: {prediction_baseline_s3_url}",
]))

Train S3 url:                     s3://sagemaker-us-east-1-891377318910/from-idea-to-production/xgboost/train
Validation S3 url:                s3://sagemaker-us-east-1-891377318910/from-idea-to-production/xgboost/validation
Test S3 url:                      s3://sagemaker-us-east-1-891377318910/from-idea-to-production/xgboost/test
Data baseline S3 url:             s3://sagemaker-us-east-1-891377318910/from-idea-to-production/xgboost/baseline
Evaluation metrics S3 url:        s3://sagemaker-us-east-1-891377318910/from-idea-to-production/xgboost/evaluation
Model prediction baseline S3 url: s3://sagemaker-us-east-1-891377318910/from-idea-to-production/xgboost/prediction_baseline


In [None]:
# %store train_s3_url
# %store validation_s3_url
# %store test_s3_url
# %store baseline_s3_url
# %store pipeline_name
# %store model_package_group_name
# %store evaluation_s3_url
# %store prediction_baseline_s3_url
# %store output_s3_url

## Create pipeline

In [9]:
# Framework version passed to the SKLearnProcessor that runs the preprocessing script.
skprocessor_framework_version = "0.23-1"

In [10]:
# --- Compute parameters -------------------------------------------------------
# Instance type of the processing jobs
process_instance_type_param = ParameterString(name="ProcessingInstanceType", default_value=process_instance_type)
# Instance type of the training job
train_instance_type_param = ParameterString(name="TrainingInstanceType", default_value=train_instance_type)
# Number of training instances
train_instance_count_param = ParameterInteger(name="TrainingInstanceCount", default_value=train_instance_count)

# --- Model governance parameters ----------------------------------------------
# Approval status given to the model version in the model registry
model_approval_status_param = ParameterString(name="ModelApprovalStatus", default_value=model_approval_status)
# Model package group that receives the registered model version
model_package_group_name_param = ParameterString(name="ModelPackageGroupName", default_value=model_package_group_name)
# Minimum test-set score (compared with the AUC written by the evaluation step)
test_score_threshold_param = ParameterFloat(name="TestScoreThreshold", default_value=0.75)

# --- Data parameters ----------------------------------------------------------
# S3 url of the input dataset
input_s3_url_param = ParameterString(name="InputDataUrl", default_value=input_s3_url)

## Build the pipeline steps

The pipeline built below consists of the following steps:

| Step | Description |
|---|---|
| **Data processing** | runs a SageMaker processing job for feature engineering and dataset split |
| **Training** | runs a SageMaker training job using the XGBoost algorithm |
| **Evaluation** | evaluates the performance of the trained model on the test dataset |
| **Condition** | checks if the performance of the model meets the specified threshold |
| **Register model** | registers a version of the model in the SageMaker model registry if the threshold is met |
| **Fail** | stops the pipeline execution with an error message if the threshold is not met |


In [11]:
# Pipeline session passed as `sagemaker_session` to the processors, estimator, model and pipeline below.
session = PipelineSession()

## Processing step

In [12]:
%%writefile ./code/preprocessing_pipeline.py

from sklearn.preprocessing import MinMaxScaler, LabelEncoder
import pandas as pd
import numpy as np
import argparse
import os


def _parse_args():
    """Parse the command-line options of the preprocessing script.

    Options (all optional strings):
        --filepath    directory containing the input file
        --filename    name of the input CSV file
        --outputpath  directory under which the processed datasets are written

    Returns:
        The ``(namespace, remaining_args)`` tuple from
        ``ArgumentParser.parse_known_args``; options that are not declared
        here end up in ``remaining_args`` instead of raising an error.
    """
    cli = argparse.ArgumentParser()
    declared_options = (
        ('--filepath', '/opt/ml/processing/input/'),
        ('--filename', 'bank-additional-full.csv'),
        ('--outputpath', '/opt/ml/processing/output/'),
    )
    for flag, fallback in declared_options:
        cli.add_argument(flag, type=str, default=fallback)
    return cli.parse_known_args()


def process_data(df_data):
    """Turn the raw bank-marketing frame into a numeric model frame.

    Steps:
      * add the ``no_previous_contact`` and ``not_working`` indicator columns,
      * drop ``duration`` and the macro-economic columns,
      * replace ``age`` with one-hot age-bucket indicators,
      * min-max scale ``pdays``, ``previous`` and ``campaign``,
      * one-hot encode the remaining categorical columns,
      * collapse ``y_no``/``y_yes`` into a single ``y`` column placed first.

    Args:
        df_data: raw DataFrame; it is not modified (a copy is processed).

    Returns:
        A new DataFrame whose first column is the 0/1 target ``y`` followed
        by the engineered features.

    Raises:
        KeyError: if an expected input column is missing, or if the target
            column has no ``yes``/``no`` values to encode.
    """
    target_col = "y"

    # Work on a copy so the caller's frame does not gain extra columns.
    df_features = df_data.copy()

    # Indicator variable to capture when pdays takes a value of 999
    df_features["no_previous_contact"] = np.where(df_features["pdays"] == 999, 1, 0)

    # Indicator for individuals not actively employed
    df_features["not_working"] = np.where(
        df_features["job"].isin(["student", "retired", "unemployed"]), 1, 0
    )

    # remove unnecessary data
    df_model_data = df_features.drop(
        ["duration", "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed"],
        axis=1,
    )

    # Left-closed buckets so that each label matches its range exactly
    # ([18, 30) -> '18-29', ..., [70, inf) -> '70-plus'). Ages below 18 fall
    # outside every bucket and get all-zero age indicators.
    bins = [18, 30, 40, 50, 60, 70, np.inf]
    labels = ['18-29', '30-39', '40-49', '50-59', '60-69', '70-plus']
    age_range = pd.cut(df_model_data["age"], bins, labels=labels, right=False)

    # Swap the numeric age for its bucket indicators (appended after the other columns).
    df_model_data = pd.concat(
        [df_model_data.drop('age', axis=1), pd.get_dummies(age_range, prefix='age', dtype=int)],
        axis=1,
    )

    # NOTE(review): the scaler is fitted on the frame passed in; the script passes
    # the full dataset before splitting, so test rows influence the scaling — confirm this is acceptable.
    scaled_features = ['pdays', 'previous', 'campaign']
    df_model_data[scaled_features] = MinMaxScaler().fit_transform(df_model_data[scaled_features])

    df_model_data = pd.get_dummies(df_model_data, dtype=int)  # Convert categorical variables to sets of indicators

    # Replace "y_no" and "y_yes" with a single label column, and bring it to the front:
    df_model_data = pd.concat([
            df_model_data["y_yes"].rename(target_col),
            df_model_data.drop(["y_no", "y_yes"], axis=1),
        ],
        axis=1,
    )
    return df_model_data


if __name__=="__main__":
    # Process arguments
    args, _ = _parse_args()
    target_col = "y"

    # process data
    df_model_data = process_data(pd.read_csv(os.path.join(args.filepath, args.filename), sep=";"))

    # Shuffle once with a fixed seed, then cut by position: 70% train, 20% validation, 10% test.
    # Positional slicing replaces np.split on a DataFrame, which recent pandas versions deprecate.
    shuffled = df_model_data.sample(frac=1, random_state=1729)
    train_end = int(0.7 * len(shuffled))
    validation_end = int(0.9 * len(shuffled))
    train_data = shuffled.iloc[:train_end]
    validation_data = shuffled.iloc[train_end:validation_end]
    test_data = shuffled.iloc[validation_end:]

    print(f"Data split -> train:{train_data.shape} | validation:{validation_data.shape} | test:{test_data.shape}")

    # Make sure every output sub-directory exists before writing into it
    # (needed when the script runs outside a job that pre-creates them).
    for subdir in ("train", "validation", "test", "baseline"):
        os.makedirs(os.path.join(args.outputpath, subdir), exist_ok=True)

    # Save datasets locally (no header, no index; the target is the first column of train/validation)
    train_data.to_csv(os.path.join(args.outputpath, 'train/train.csv'), index=False, header=False)
    validation_data.to_csv(os.path.join(args.outputpath, 'validation/validation.csv'), index=False, header=False)
    test_data[target_col].to_csv(os.path.join(args.outputpath, 'test/test_y.csv'), index=False, header=False)
    test_data.drop([target_col], axis=1).to_csv(os.path.join(args.outputpath, 'test/test_x.csv'), index=False, header=False)

    # Save the baseline dataset for model monitoring (all rows, features only)
    df_model_data.drop([target_col], axis=1).to_csv(os.path.join(args.outputpath, 'baseline/baseline.csv'), index=False, header=False)

    print("## Processing complete. Exiting.")

Writing ./code/preprocessing_pipeline.py


In [15]:
# Processor that runs the preprocessing script in the scikit-learn container.
sklearn_processor = SKLearnProcessor(
    framework_version=skprocessor_framework_version,
    role=sagemaker_role,
    instance_type=process_instance_type_param,
    instance_count=process_instance_count,
    base_job_name=f"{project}-preprocess",
    sagemaker_session=session,
)

# The raw dataset is made available to the script under /opt/ml/processing/input.
processing_inputs = [
    ProcessingInput(source=input_s3_url_param, destination="/opt/ml/processing/input"),
]

# One output per dataset: local folder /opt/ml/processing/output/<split> -> its S3 url.
processing_outputs = [
    ProcessingOutput(
        output_name=f"{split}_data",
        source=f"/opt/ml/processing/output/{split}",
        destination=s3_destination,
    )
    for split, s3_destination in (
        ("train", train_s3_url),
        ("validation", validation_s3_url),
        ("test", test_s3_url),
        ("baseline", baseline_s3_url),
    )
]

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3


In [16]:
# The value returned by run() is handed to the pipeline step as its arguments.
processor_args = sklearn_processor.run(
    code='./code/preprocessing_pipeline.py',
    inputs=processing_inputs,
    outputs=processing_outputs,
)

# Processing step: feature engineering and dataset split
step_process = ProcessingStep(step_args=processor_args, name=f"{project}-preprocess")



## Training step

In [19]:
# Container image of the built-in XGBoost algorithm for the current region;
# reused below for training, evaluation and the registered model.
xgboost_image_uri = sagemaker.image_uris.retrieve(framework="xgboost", version="1.7-1", region=region_name)

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


In [20]:
# Estimator running the XGBoost container on the parametrized training instances
estimator = sagemaker.estimator.Estimator(
    image_uri=xgboost_image_uri,
    role=sagemaker_role,
    instance_type=train_instance_type_param,
    instance_count=train_instance_count_param,
    output_path=output_s3_url,
    base_job_name=f"{project}-train",
    sagemaker_session=session,
)

# Algorithm hyperparameters
xgb_hyperparameters = {
    "num_round": 100,              # number of boosting rounds
    "max_depth": 3,                # maximum depth of a tree
    "eta": 0.5,                    # step size shrinkage used in updates to prevent overfitting
    "alpha": 2.5,                  # L1 regularization term on weights
    "objective": "binary:logistic",
    "eval_metric": "auc",          # evaluation metric for validation data
    "subsample": 0.8,              # subsample ratio of the training instances
    "colsample_bytree": 0.8,       # subsample ratio of columns when constructing each tree
    "min_child_weight": 3,         # minimum sum of instance weight (hessian) needed in a child
    "early_stopping_rounds": 10,   # stop when the validation score stops improving
    "verbosity": 1,                # verbosity of printing messages
}
estimator.set_hyperparameters(**xgb_hyperparameters)

# Each training channel reads the matching "<channel>_data" output of the preprocessing step.
training_inputs = {
    channel: TrainingInput(
        s3_data=step_process.properties.ProcessingOutputConfig.Outputs[f"{channel}_data"].S3Output.S3Uri,
        content_type="text/csv",
    )
    for channel in ("train", "validation")
}

training_args = estimator.fit(training_inputs)

# Training step
step_train = TrainingStep(step_args=training_args, name=f"{project}-train")



## Evaluation step

In [21]:
%%writefile ./code/evaluation.py

import json
import os
import pathlib
import pickle as pkl
import tarfile
import joblib
import numpy as np
import pandas as pd
import xgboost as xgb
import datetime as dt
from sklearn.metrics import roc_curve, auc


if __name__ == "__main__":
    # All paths are local for the processing container
    model_path = "/opt/ml/processing/model/model.tar.gz"
    test_x_path = "/opt/ml/processing/test/test_x.csv"
    test_y_path = "/opt/ml/processing/test/test_y.csv"
    output_dir = "/opt/ml/processing/evaluation"
    output_prediction_path = "/opt/ml/processing/output/"
    prediction_baseline_dir = os.path.join(output_prediction_path, "prediction_baseline")

    # Unpack the model archive into the working directory.
    # NOTE(review): extractall() trusts the member paths stored in the archive;
    # only feed it archives produced by this pipeline's own training step.
    with tarfile.open(model_path, "r:gz") as t:
        t.extractall(path=".")

    # Load model
    model = xgb.Booster()
    model.load_model("xgboost-model")

    # Read test data: features as a DMatrix, labels as a flat 1-D vector
    X_test = xgb.DMatrix(pd.read_csv(test_x_path, header=None).values)
    y_test = pd.read_csv(test_y_path, header=None).iloc[:, 0].to_numpy()

    # Run predictions
    probability = model.predict(X_test)

    # Evaluate predictions: area under the ROC curve
    fpr, tpr, thresholds = roc_curve(y_test, probability)
    auc_score = auc(fpr, tpr)
    report_dict = {
        "classification_metrics": {
            "auc_score": {
                "value": float(auc_score),
            },
        },
    }

    # Save evaluation report
    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
    with open(f"{output_dir}/evaluation.json", "w") as f:
        json.dump(report_dict, f)

    # Save prediction baseline file - you need it later for the model quality monitoring.
    # The target directory is created explicitly, like the evaluation directory above.
    pathlib.Path(prediction_baseline_dir).mkdir(parents=True, exist_ok=True)
    pd.DataFrame({
        "prediction": np.array(np.round(probability), dtype=int),
        "probability": probability,
        "label": y_test}
    ).to_csv(
        os.path.join(prediction_baseline_dir, 'prediction_baseline.csv'),
        index=False,
        header=True
    )


Writing ./code/evaluation.py


In [29]:
# Processor that runs the evaluation script inside the XGBoost container.
script_processor = ScriptProcessor(
    image_uri=xgboost_image_uri,
    command=["python3"],
    role=sagemaker_role,
    instance_type=process_instance_type_param,
    instance_count=process_instance_count,
    base_job_name=f"{project}-evaluate",
    sagemaker_session=session,
)

# Inputs: the trained model artifact and the test split written by the preprocessing step.
trained_model_artifact = step_train.properties.ModelArtifacts.S3ModelArtifacts
test_split_uri = step_process.properties.ProcessingOutputConfig.Outputs["test_data"].S3Output.S3Uri
eval_inputs = [
    ProcessingInput(source=trained_model_artifact, destination="/opt/ml/processing/model"),
    ProcessingInput(source=test_split_uri, destination="/opt/ml/processing/test"),
]

# Outputs: the JSON evaluation report and the prediction baseline file.
eval_outputs = [
    ProcessingOutput(
        output_name="evaluation",
        source="/opt/ml/processing/evaluation",
        destination=evaluation_s3_url,
    ),
    ProcessingOutput(
        output_name="prediction_baseline_data",
        source="/opt/ml/processing/output/prediction_baseline",
        destination=prediction_baseline_s3_url,
    ),
]

eval_args = script_processor.run(
    code="./code/evaluation.py",
    inputs=eval_inputs,
    outputs=eval_outputs,
)

# Property file pointing at evaluation.json inside the "evaluation" output,
# so that later steps can read values from the report.
evaluation_report = PropertyFile(
    name="ModelEvaluationReport",
    output_name="evaluation",
    path="evaluation.json",
)

step_eval = ProcessingStep(
    name=f"{project}-evaluate",
    step_args=eval_args,
    property_files=[evaluation_report],
)



## Register step

In [32]:
# Model built from the artifact of the training step. The name comes from the
# project-wide `pipeline_model_name` constant instead of an unrelated hard-coded string.
model = Model(
    image_uri=xgboost_image_uri,
    model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
    name=pipeline_model_name,
    sagemaker_session=session,
    role=sagemaker_role,
)

# Evaluation report attached to the registered model version as model statistics.
# The evaluation step uploads evaluation.json to `evaluation_s3_url` (the explicit
# destination of its "evaluation" output), so the URI is built from that constant
# rather than from a positional index into the step's request arguments.
model_metrics = ModelMetrics(
    model_statistics=MetricsSource(
        s3_uri=f"{evaluation_s3_url}/evaluation.json",
        content_type="application/json",
    )
)

# Registration request: CSV in/out, allowed instance types, target group and approval status.
register_args = model.register(
    content_types=["text/csv"],
    response_types=["text/csv"],
    inference_instances=["ml.m5.xlarge", "ml.m5.large"],
    transform_instances=["ml.m5.xlarge", "ml.m5.large"],
    model_package_group_name=model_package_group_name_param,
    approval_status=model_approval_status_param,
    model_metrics=model_metrics,
)

# Step that registers the model version in the model registry
step_register = ModelStep(
    name=f"{project}-register",
    step_args=register_args
)



## Fail step

In [33]:
# Used as the else-branch of the condition step, i.e. it runs when the test AUC
# is NOT greater than the threshold; the message states that relation.
step_fail = FailStep(
    name=f"{project}-fail",
    error_message=Join(
        on=" ",
        values=["Execution failed due to AUC Score <=", test_score_threshold_param],
    ),
)

## Condition step

In [34]:
# AUC value read from the evaluation report written by the evaluation step.
test_auc_score = JsonGet(
    step_name=step_eval.name,
    property_file=evaluation_report,
    json_path="classification_metrics.auc_score.value",
)

# NOTE: despite its name, `cond_lte` is a strict "greater than" check (AUC > threshold).
cond_lte = ConditionGreaterThan(left=test_auc_score, right=test_score_threshold_param)

# Register the model when the condition holds, otherwise fail the execution.
step_cond = ConditionStep(
    name=f"{project}-check-test-score",
    conditions=[cond_lte],
    if_steps=[step_register],
    else_steps=[step_fail],
)

## Construct pipeline

In [35]:
# NOTE(review): use_custom_job_prefix=True presumably makes the jobs use the
# base_job_name prefixes set on the processors/estimator — confirm against the SDK docs.
pipeline_def_config = PipelineDefinitionConfig(use_custom_job_prefix=True)

# Every parameter that can be overridden when starting an execution.
pipeline_parameters = [
    process_instance_type_param,
    train_instance_type_param,
    train_instance_count_param,
    model_approval_status_param,
    test_score_threshold_param,
    input_s3_url_param,
    model_package_group_name_param,
]

# Top-level steps; the register and fail steps are reached through the condition step.
pipeline_steps = [step_process, step_train, step_eval, step_cond]

pipeline = Pipeline(
    name=pipeline_name,
    parameters=pipeline_parameters,
    steps=pipeline_steps,
    pipeline_definition_config=pipeline_def_config,
    sagemaker_session=session,
)

In [36]:
# Show the generated pipeline name (project prefix plus timestamp).
print(pipeline_name)

itau-project-pipeline-2024-10-16-23-49-13


In [37]:
# Upsert the pipeline under the notebook's execution role; the response contains the pipeline ARN.
pipeline.upsert(role_arn=sagemaker_role)



{'PipelineArn': 'arn:aws:sagemaker:us-east-1:891377318910:pipeline/itau-project-pipeline-2024-10-16-23-49-13',
 'ResponseMetadata': {'RequestId': '11b12cfb-a9a6-4656-a9f7-5e2e56aaa5a0',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '11b12cfb-a9a6-4656-a9f7-5e2e56aaa5a0',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '109',
   'date': 'Thu, 17 Oct 2024 00:03:26 GMT'},
  'RetryAttempts': 0}}

In [39]:
# Fetch the pipeline description and decode its JSON definition into a Python object.
pipeline_description = pipeline.describe()
pipeline_definition = json.loads(pipeline_description['PipelineDefinition'])

## Execute the pipeline

In [45]:
# Start a pipeline execution with explicit parameter values.
# ModelPackageGroupName is not passed, so the pipeline's default value applies.
execution = pipeline.start(
    parameters={
        "ProcessingInstanceType": process_instance_type,
        "TrainingInstanceType": train_instance_type,
        "TrainingInstanceCount": train_instance_count,
        "ModelApprovalStatus": model_approval_status,  # same constant as the parameter default
        "TestScoreThreshold": 0.75,
        "InputDataUrl": input_s3_url,
    }
)

In [42]:
# Short fixed pause before querying the execution.
# NOTE(review): 5 seconds does not wait for the execution to finish, so the list
# may be incomplete on a fresh run — confirm whether waiting for completion is wanted.
sleep(5)
# Display the execution's steps with their status and metadata.
execution.list_steps()

[{'StepName': 'itau-project-register-RegisterModel',
  'StartTime': datetime.datetime(2024, 10, 17, 0, 11, 45, 135000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2024, 10, 17, 0, 11, 47, 90000, tzinfo=tzlocal()),
  'StepStatus': 'Succeeded',
  'Metadata': {'RegisterModel': {'Arn': 'arn:aws:sagemaker:us-east-1:891377318910:model-package/itau-project-model-group-2024-10-16-23-49-13/1'}},
  'AttemptCount': 1},
 {'StepName': 'itau-project-check-test-score',
  'StartTime': datetime.datetime(2024, 10, 17, 0, 11, 44, 125000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2024, 10, 17, 0, 11, 44, 507000, tzinfo=tzlocal()),
  'StepStatus': 'Succeeded',
  'Metadata': {'Condition': {'Outcome': 'True'}},
  'AttemptCount': 1},
 {'StepName': 'itau-project-evaluate',
  'StartTime': datetime.datetime(2024, 10, 17, 0, 9, 8, 852000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2024, 10, 17, 0, 11, 43, 540000, tzinfo=tzlocal()),
  'StepStatus': 'Succeeded',
  'Metadata': {'ProcessingJob': 