# imports 


In [2]:

import pandas as pd
import json
import boto3
import pathlib
import io
import sagemaker
from time import gmtime, strftime, sleep
from sagemaker.deserializers import CSVDeserializer
from sagemaker.serializers import CSVSerializer

from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.xgboost.estimator import XGBoost
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import (
    ProcessingInput, 
    ProcessingOutput, 
    ScriptProcessor
)
from sagemaker.inputs import TrainingInput

from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.steps import (
    ProcessingStep, 
    TrainingStep, 
    CreateModelStep
)
from sagemaker.workflow.check_job_config import CheckJobConfig
from sagemaker.workflow.parameters import (
    ParameterInteger, 
    ParameterFloat, 
    ParameterString, 
    ParameterBoolean
)
from sagemaker.workflow.clarify_check_step import (
    ModelBiasCheckConfig, 
    ClarifyCheckStep, 
    ModelExplainabilityCheckConfig
)
from sagemaker import Model
from sagemaker.inputs import CreateModelInput
from sagemaker.workflow.model_step import ModelStep
from sagemaker.workflow.fail_step import FailStep
from sagemaker.workflow.conditions import (
    ConditionGreaterThan,
    ConditionGreaterThanOrEqualTo,
    ConditionLessThanOrEqualTo,
)
from sagemaker.workflow.pipeline_experiment_config import PipelineExperimentConfig
from sagemaker.workflow.properties import PropertyFile
from sagemaker.workflow.condition_step import ConditionStep
from sagemaker.workflow.functions import (
    Join,
    JsonGet
)
from sagemaker.workflow.lambda_step import (
    LambdaStep,
    LambdaOutput,
    LambdaOutputTypeEnum,
)
from sagemaker.lambda_helper import Lambda

from sagemaker.model_metrics import (
    MetricsSource, 
    ModelMetrics, 
    FileSource
)
from sagemaker.drift_check_baselines import DriftCheckBaselines

from sagemaker.image_uris import retrieve

sagemaker.__version__

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


'2.199.0'

In [3]:
# import config
import os
import yaml

# config.yml is expected one directory above the notebook's working directory.
config_path = os.path.abspath(os.path.join(os.pardir, "config.yml"))

# safe_load is sufficient here: the config only holds plain mappings, lists,
# strings and numbers, so there is no reason to use a loader that can build
# arbitrary Python-tagged objects.
with open(config_path, "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# NOTE(review): the printed config includes the AWS account id and role ARN;
# consider clearing this output before sharing the notebook.
print(config)

{'features': ['dteday', 'season', 'hr', 'holiday', 'weekday', 'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed', 'casual', 'registered', 'yr', 'mnth'], 'holiday_mappings': {'No': 1, 'Yes': 0}, 'hr_mappings': {'10am': 9, '10pm': 8, '11am': 11, '11pm': 7, '12am': 5, '12pm': 17, '1am': 4, '1pm': 16, '2am': 3, '2pm': 15, '3am': 1, '3pm': 18, '4am': 0, '4pm': 19, '5am': 2, '5pm': 23, '6am': 6, '6pm': 22, '7am': 12, '7pm': 20, '8am': 21, '8pm': 14, '9am': 13, '9pm': 10}, 'mnth_mappings': {'April': 5, 'August': 11, 'December': 2, 'February': 1, 'January': 0, 'July': 10, 'June': 9, 'March': 3, 'May': 7, 'November': 4, 'October': 6, 'September': 8}, 's3-bucket': {'bucket_name': 'sagemaker-us-east-1-644383320443', 'bucket_prefix': 'siemens-poc/', 'training_file_key': 'siemens-poc/bike-sharing-dataset.csv'}, 'sagemaker': {'domain_id': 'd-zjfao8azi0ng', 'region': 'us-east-1', 'role': 'arn:aws:iam::644383320443:role/service-role/AmazonSageMaker-ExecutionRole-20240109T220483'}, 'season

# set pipeline constants

In [4]:
# Every pipeline resource name is derived from this single project slug
project = "siemens-poc-xgboost"

# Pipeline and model-registry names
pipeline_name = project + "-pipeline"
pipeline_model_name = project + "-model-xgb"
model_package_group_name = project + "-model-group"

# Endpoint names
endpoint_config_name = project + "-endpoint-config"
endpoint_name = project + "-endpoint"

In [5]:
# Default compute for the pipeline jobs; these values seed the pipeline parameters.

# Processing jobs
process_instance_type = "ml.t3.medium"

# Training job
train_instance_type = "ml.m5.xlarge"
train_instance_count = 1

In [6]:
# Read the S3 bucket name from the loaded config and echo it as the cell output
bucket_name = config["s3-bucket"]["bucket_name"]
bucket_name

'sagemaker-us-east-1-644383320443'

In [7]:
# Derive every S3 location used by the pipeline from the "s3-bucket" config section.
s3_config = config["s3-bucket"]
bucket_name = s3_config["bucket_name"]
bucket_prefix = f'{s3_config["bucket_prefix"]}pipeline-jobs'
print(f"bucket_prefix for pipelines--> {bucket_prefix}")

# Raw input dataset
input_s3_url = f's3://{bucket_name}/{s3_config["training_file_key"]}'
print(f"input_s3_url--> {input_s3_url}")

# All job artefacts live under one common root: s3://<bucket>/<prefix>/<kind>
pipeline_jobs_root = f"s3://{bucket_name}/{bucket_prefix}"

train_s3_url = f"{pipeline_jobs_root}/train"
validation_s3_url = f"{pipeline_jobs_root}/validation"
test_s3_url = f"{pipeline_jobs_root}/test"
baseline_s3_url = f"{pipeline_jobs_root}/baseline"
evaluation_s3_url = f"{pipeline_jobs_root}/evaluation"
prediction_baseline_s3_url = f"{pipeline_jobs_root}/prediction_baseline"
output_s3_url = f"{pipeline_jobs_root}/output"

# Echo each location so the resolved paths are visible in the cell output
for label, url in (
    ("train_s3_url", train_s3_url),
    ("validation_s3_url", validation_s3_url),
    ("test_s3_url", test_s3_url),
    ("baseline_s3_url", baseline_s3_url),
    ("evaluation_s3_url", evaluation_s3_url),
    ("prediction_baseline_s3_url", prediction_baseline_s3_url),
    ("output_s3_url", output_s3_url),
):
    print(f"{label}--> {url}")

bucket_prefix for pipelines--> siemens-poc/pipeline-jobs
input_s3_url--> s3://sagemaker-us-east-1-644383320443/siemens-poc/bike-sharing-dataset.csv
train_s3_url--> s3://sagemaker-us-east-1-644383320443/siemens-poc/pipeline-jobs/train
validation_s3_url--> s3://sagemaker-us-east-1-644383320443/siemens-poc/pipeline-jobs/validation
test_s3_url--> s3://sagemaker-us-east-1-644383320443/siemens-poc/pipeline-jobs/test
baseline_s3_url--> s3://sagemaker-us-east-1-644383320443/siemens-poc/pipeline-jobs/baseline
evaluation_s3_url--> s3://sagemaker-us-east-1-644383320443/siemens-poc/pipeline-jobs/evaluation
prediction_baseline_s3_url--> s3://sagemaker-us-east-1-644383320443/siemens-poc/pipeline-jobs/prediction_baseline
output_s3_url--> s3://sagemaker-us-east-1-644383320443/siemens-poc/pipeline-jobs/output


# create pipeline 
    > set up pipeline parameters

In [8]:
# Runtime-overridable pipeline parameters; each default comes from the constants above.

# Instance type for the processing jobs
process_instance_type_param = ParameterString(
    name="ProcessingInstanceType", default_value=process_instance_type
)

# Instance type and count for the training job
train_instance_type_param = ParameterString(
    name="TrainingInstanceType", default_value=train_instance_type
)
train_instance_count_param = ParameterInteger(
    name="TrainingInstanceCount", default_value=train_instance_count
)

# Approval status assigned to the model package at registration
model_approval_status_param = ParameterString(
    name="ModelApprovalStatus", default_value="PendingManualApproval"
)

# Threshold that the test-set RMSE from the evaluation report is compared against
test_score_threshold_param = ParameterFloat(
    name="TestScoreThreshold", default_value=100.00
)

# S3 url of the input dataset
input_s3_url_param = ParameterString(
    name="InputDataUrl", default_value=input_s3_url
)

# build the pipeline

In [9]:
# Shared session passed as sagemaker_session= to every processor, estimator and
# model below. NOTE(review): with a PipelineSession the .run()/.fit()/.register()
# calls presumably return step arguments instead of launching jobs — confirm
# against the SDK docs.
session = PipelineSession()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [10]:
# Execution role and region come from the "sagemaker" section of the config
sagemaker_config = config["sagemaker"]
sm_role, region = sagemaker_config["role"], sagemaker_config["region"]

# Echo both values as the cell output
(sm_role, region)

('arn:aws:iam::644383320443:role/service-role/AmazonSageMaker-ExecutionRole-20240109T220483',
 'us-east-1')

In [11]:
# preprocessing step
# Scikit-learn processor that runs ../scripts/preprocessing.py on the input dataset
sklearn_processor = SKLearnProcessor(
    framework_version="0.23-1",
    role=sm_role,
    instance_count=1,
    instance_type=process_instance_type_param,
    base_job_name=f"{pipeline_name}/preprocess",
    sagemaker_session=session,
)

# The raw dataset is mounted into the container's input directory
processing_inputs = [
    ProcessingInput(
        source=input_s3_url_param,
        destination="/opt/ml/processing/input",
    )
]

# One output per dataset split; each container directory is uploaded to its S3 url
processing_outputs = [
    ProcessingOutput(
        output_name=f"{split}_data",
        source=f"/opt/ml/processing/output/{split}",
        destination=split_s3_url,
    )
    for split, split_s3_url in (
        ("train", train_s3_url),
        ("validation", validation_s3_url),
        ("test", test_s3_url),
        ("baseline", baseline_s3_url),
    )
]

processor_args = sklearn_processor.run(
    code='../scripts/preprocessing.py',
    inputs=processing_inputs,
    outputs=processing_outputs,
)

# Wrap the processor arguments in a pipeline step
step_process = ProcessingStep(
    name=f"{pipeline_name}-preprocess-data",
    step_args=processor_args,
)

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3


In [12]:
# training step
# Look up the XGBoost container image (version 1.5-1) for the configured region
xgboost_image_uri = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=region,
    version="1.5-1",
)

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


In [13]:
# Generic Estimator wired to the XGBoost container image
estimator = sagemaker.estimator.Estimator(
    image_uri=xgboost_image_uri,
    role=sm_role,
    instance_type=train_instance_type_param,
    instance_count=train_instance_count_param,
    max_run=20 * 60,  # maximum allowed active runtime
    output_path=output_s3_url,  # S3 location for the training output
    base_job_name=f"{pipeline_name}/train",  # prefix for the training job name
    sagemaker_session=session,
)

# Hyperparameters for the XGBoost regression job
xgb_hyperparameters = {
    "num_round": 150,  # number of boosting rounds
    "max_depth": 3,  # maximum depth of a tree
    "eta": 0.5,  # step size shrinkage used in updates
    "alpha": 2.5,  # L1 regularization term on weights
    "objective": "reg:squarederror",
    "eval_metric": "rmse",  # metric evaluated on the validation data
    "subsample": 0.8,  # subsample ratio of the training instances
    "colsample_bytree": 0.8,  # subsample ratio of columns per tree
    "min_child_weight": 3,  # minimum sum of instance weight (hessian) in a child
    "early_stopping_rounds": 10,  # stop once the validation score stops improving
    "verbosity": 1,  # verbosity of printed messages
}
estimator.set_hyperparameters(**xgb_hyperparameters)

In [14]:
# Feed the "train" and "validation" channels from the matching "<channel>_data"
# outputs of the preprocessing step; both are CSV.
training_inputs = {
    channel: TrainingInput(
        s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
            f"{channel}_data"
        ].S3Output.S3Uri,
        content_type="text/csv",
    )
    for channel in ("train", "validation")
}

In [15]:
# Collect the training arguments for the configured channels
training_args = estimator.fit(inputs=training_inputs)

# Wrap them in a pipeline training step
step_train = TrainingStep(name=f"{pipeline_name}-train", step_args=training_args)

In [16]:
# NOTE(review): the %%writefile magic and the whole script below are commented out,
# so running this cell does not (re)write ../scripts/evaluation.py. The evaluation
# step uses whatever file already exists at that path. The draft below has several
# apparent problems, flagged inline — confirm against the real script.
# %%writefile ../scripts/evaluation.py

# import json
# import os
# import pathlib
# import pickle as pkl
# import tarfile
# import joblib
# import numpy as np
# import pandas as pd
# import xgboost as xgb
# import datetime as dt
# from sklearn.metrics import mean_squared_error

# if __name__ == "__main__":   
    
#     # All paths are local for the processing container
#     model_path = "/opt/ml/processing/model/model.tar.gz"
#     test_x_path = "/opt/ml/processing/test/test_x.csv"
#     test_y_path = "/opt/ml/processing/test/test_y.csv"
#     output_dir = "/opt/ml/processing/evaluation"
#     output_prediction_path = "/opt/ml/processing/output/"
        
#     # Read model tar file
#     with tarfile.open(model_path, "r:gz") as t:
#         t.extractall(path=".")
    
#     # Load model
#     model = xgb.Booster()
#     model.load_model("xgboost-model")
    
#     # Read test data
#     X_test = xgb.DMatrix(pd.read_csv(test_x_path).values)
    
#     y_test = pd.read_csv(test_y_path).to_numpy()

#     # Run predictions
#     # NOTE(review): `predictor` is never defined in this draft (the loaded object is
#     # `model`), and X_test is an xgb.DMatrix here — presumably this should be
#     # model.predict(X_test); confirm.
#     predictions = np.array(predictor.predict(X_test.values), dtype=float).squeeze()

#     # Evaluate predictions
#     # NOTE(review): X_test is a DMatrix at this point, so X_test.index and
#     # concatenating it with a Series look wrong — presumably the DataFrame read
#     # from test_x_path was intended; confirm.
#     test_results = pd.concat([pd.Series(predictions, name="y_pred", index=X_test.index),X_test,],axis=1,)
#     test_results.head()
    
#     # NOTE(review): no square root is taken here, so the value reported as
#     # "test_rmse" looks like MSE rather than RMSE — verify before relying on the
#     # pipeline's RMSE threshold.
#     test_rmse = mean_squared_error(y_test, test_results["y_pred"])
#     report_dict = {"regression_metric":{"test_rmse":{"value":test_rmse}}}
#     print(f"Test-rmse: {test_rmse:.2f}")


#     # Save evaluation report
#     pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
#     with open(f"{output_dir}/evaluation.json", "w") as f:
#         f.write(json.dumps(report_dict))
    
#     # Save prediction baseline file - we need it later for the model quality monitoring
#     test_results.to_csv(os.path.join(output_prediction_path, 'prediction_baseline/prediction_baseline.csv'), index=False, header=True)


In [17]:
# Evaluation runs ../scripts/evaluation.py as a processing job on the XGBoost image
script_processor = ScriptProcessor(
    image_uri=xgboost_image_uri,
    command=["python3"],
    role=sm_role,
    instance_count=1,
    instance_type=process_instance_type_param,
    base_job_name=f"{pipeline_name}/evaluate",
    sagemaker_session=session,
)

# Inputs: the trained model artifact and the test split from the preprocessing step
trained_model_s3_uri = step_train.properties.ModelArtifacts.S3ModelArtifacts
test_data_s3_uri = step_process.properties.ProcessingOutputConfig.Outputs[
    "test_data"
].S3Output.S3Uri

eval_inputs = [
    ProcessingInput(
        source=trained_model_s3_uri,
        destination="/opt/ml/processing/model",
    ),
    ProcessingInput(
        source=test_data_s3_uri,
        destination="/opt/ml/processing/test",
    ),
]

# Outputs: the evaluation report and the prediction baseline file
eval_outputs = [
    ProcessingOutput(
        output_name="evaluation",
        source="/opt/ml/processing/evaluation",
        destination=evaluation_s3_url,
    ),
    ProcessingOutput(
        output_name="prediction_baseline_data",
        source="/opt/ml/processing/output/prediction_baseline",
        destination=prediction_baseline_s3_url,
    ),
]

eval_args = script_processor.run(
    code="../scripts/evaluation.py",
    inputs=eval_inputs,
    outputs=eval_outputs,
)

# Expose evaluation.json from the "evaluation" output as a property file
evaluation_report = PropertyFile(
    name="ModelEvaluationReport",
    output_name="evaluation",
    path="evaluation.json",
)

step_eval = ProcessingStep(
    name=f"{pipeline_name}-evaluate-model",
    step_args=eval_args,
    property_files=[evaluation_report],
)

In [18]:
# register step
# Model object built from the trained artifact and the XGBoost image
model = Model(
    name="siemens-poc-xgboost-model",
    image_uri=xgboost_image_uri,
    model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
    role=sm_role,
    sagemaker_session=session,
)

# Attach evaluation.json from the first output of the evaluation step as model statistics
evaluation_output_s3_uri = step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0][
    "S3Output"
]["S3Uri"]
model_metrics = ModelMetrics(
    model_statistics=MetricsSource(
        s3_uri=f"{evaluation_output_s3_uri}/evaluation.json",
        content_type="application/json",
    )
)

# Register the model in the model package group with the parameterised approval status
register_args = model.register(
    model_package_group_name=model_package_group_name,
    approval_status=model_approval_status_param,
    model_metrics=model_metrics,
    content_types=["text/csv"],
    response_types=["text/csv"],
    inference_instances=["ml.t2.medium", "ml.m5.xlarge", "ml.m5.large"],
    transform_instances=["ml.m5.xlarge", "ml.m5.large"],
)

step_register = ModelStep(name=f"{pipeline_name}-register", step_args=register_args)



In [19]:
# fail step
# The error message is assembled at run time from the fixed text and the threshold parameter
fail_message = Join(
    on=" ",
    values=["Execution failed due to RMSE Score >", test_score_threshold_param],
)

step_fail = FailStep(name=f"{pipeline_name}-fail", error_message=fail_message)

In [20]:
# condition step
# Gate registration on model quality. RMSE is an error metric (lower is better), so
# the model is registered only when the test RMSE read from the evaluation report is
# less than or equal to the threshold. Otherwise the pipeline goes to the fail step,
# whose message reads "RMSE Score > <threshold>".
# The previous ConditionGreaterThan registered models whose RMSE exceeded the
# threshold and failed the run for models that met it.
cond_lte = ConditionLessThanOrEqualTo(
    left=JsonGet(
        step_name=step_eval.name,
        property_file=evaluation_report,
        json_path="regression_metric.test_rmse.value",
    ),
    right=test_score_threshold_param,
)

step_cond = ConditionStep(
    name=f"{pipeline_name}-check-test-score",
    conditions=[cond_lte],
    if_steps=[step_register],  # RMSE <= threshold: register the model
    else_steps=[step_fail],  # RMSE > threshold: fail the execution
)

In [21]:
# construct the pipeline
# Parameters that can be overridden per execution
pipeline_parameters = [
    process_instance_type_param,
    train_instance_type_param,
    train_instance_count_param,
    model_approval_status_param,
    test_score_threshold_param,
    input_s3_url_param,
]

# Steps passed to the pipeline; register/fail are reached through the condition step
pipeline_step_list = [step_process, step_train, step_eval, step_cond]

pipeline = Pipeline(
    name=pipeline_name,
    parameters=pipeline_parameters,
    steps=pipeline_step_list,
    sagemaker_session=session,
)

In [22]:
# Create the pipeline, or update it if one with this name already exists,
# then show the service response as the cell output
upsert_response = pipeline.upsert(role_arn=sm_role)
upsert_response



{'PipelineArn': 'arn:aws:sagemaker:us-east-1:644383320443:pipeline/siemens-poc-xgboost-pipeline',
 'ResponseMetadata': {'RequestId': 'f9c6349b-767e-48cd-a4dc-bd54d656e3ae',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'f9c6349b-767e-48cd-a4dc-bd54d656e3ae',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '96',
   'date': 'Tue, 16 Jan 2024 07:22:15 GMT'},
  'RetryAttempts': 0}}

In [23]:
# Fetch the pipeline description and parse its JSON definition for inspection
pipeline_description = pipeline.describe()
pipeline_definition = json.loads(pipeline_description["PipelineDefinition"])
pipeline_definition

{'Version': '2020-12-01',
 'Metadata': {},
 'Parameters': [{'Name': 'ProcessingInstanceType',
   'Type': 'String',
   'DefaultValue': 'ml.t3.medium'},
  {'Name': 'TrainingInstanceType',
   'Type': 'String',
   'DefaultValue': 'ml.m5.xlarge'},
  {'Name': 'TrainingInstanceCount', 'Type': 'Integer', 'DefaultValue': 1},
  {'Name': 'ModelApprovalStatus',
   'Type': 'String',
   'DefaultValue': 'PendingManualApproval'},
  {'Name': 'TestScoreThreshold', 'Type': 'Float', 'DefaultValue': 100.0},
  {'Name': 'InputDataUrl',
   'Type': 'String',
   'DefaultValue': 's3://sagemaker-us-east-1-644383320443/siemens-poc/bike-sharing-dataset.csv'}],
 'PipelineExperimentConfig': {'ExperimentName': {'Get': 'Execution.PipelineName'},
  'TrialName': {'Get': 'Execution.PipelineExecutionId'}},
 'Steps': [{'Name': 'siemens-poc-xgboost-pipeline-preprocess-data',
   'Type': 'Processing',
   'Arguments': {'ProcessingResources': {'ClusterConfig': {'InstanceType': {'Get': 'Parameters.ProcessingInstanceType'},
      

# execution of the pipeline 

In [24]:
# Parameter values for this run, keyed by the pipeline parameter names
execution_parameters = {
    "ProcessingInstanceType": process_instance_type,
    "TrainingInstanceType": train_instance_type,
    "TrainingInstanceCount": train_instance_count,
    "ModelApprovalStatus": "PendingManualApproval",
    "TestScoreThreshold": 100.0,
    "InputDataUrl": input_s3_url,
}

# Start a pipeline execution with those values
execution = pipeline.start(parameters=execution_parameters)

In [25]:
# List the steps of the execution started above
execution_steps = execution.list_steps()
execution_steps

[]