In [2]:
# Smoke-test cell: prints a fixed string and touches no other notebook state.
print("Hello...")

Hello...


### Imports and Session Setup

In [13]:
import boto3 
import pandas as pd 
import sagemaker
from sagemaker.workflow.pipeline_context import PipelineSession 

# --- Fixed names used by the rest of the notebook ---------------------------
pipeline_name = "emp-bonus-training-pipeline"
model_package_group_name = "EmpBonusPackageGroup"

# --- AWS / SageMaker handles -------------------------------------------------
# NOTE: despite its name, `s3_client` is a boto3 *resource*, not a client.
s3_client = boto3.resource('s3')

# Regular session: used for region and default-bucket lookups.
sagemaker_session = sagemaker.Session()
region = sagemaker_session.boto_region_name
role = sagemaker.get_execution_role()

# Pipeline session: handed to processors/estimators that become pipeline steps.
pipeline_session = PipelineSession()
default_bucket = sagemaker_session.default_bucket()

print(f"Default S3 Bucket Name: {default_bucket}")
print(f"Model Package Group Name: {model_package_group_name}")

Default S3 Bucket Name: sagemaker-us-east-1-637423223719
Model Package Group Name: EmpBonusPackageGroup


In [14]:
from sagemaker.workflow.parameters import ( 
 ParameterInteger, 
 ParameterString, 
 ParameterFloat) 

# Shared name prefix (not referenced by the other visible cells).
base_job_prefix = "emp-bonus"

# Local CSV used as the ProcessingInput source of the preprocessing step.
input_data = "data/mock_data.csv"

# Pipeline parameters: each can be overridden per execution, otherwise the
# default_value below applies.
processing_instance_count = ParameterInteger(
    name="ProcessingInstanceCount",
    default_value=1,
)
processing_instance_type = ParameterString(
    name="ProcessingInstanceType",
    default_value="ml.m5.xlarge",
)
training_instance_type = ParameterString(
    name="TrainingInstanceType",
    default_value="ml.m5.xlarge",
)
model_approval_status = ParameterString(
    name="ModelApprovalStatus",
    default_value="PendingManualApproval",
)

### Preprocessing

In [34]:
# Define Processing Step for Feature Engineering
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep

# scikit-learn container version shared by the SKLearnProcessor instances.
framework_version = "1.0-1"

# Instance type/count come from pipeline parameters, so they can be changed
# per execution without editing this cell.
sklearn_processor = SKLearnProcessor(
    role=role,
    base_job_name="emp-pre-processing",
    framework_version=framework_version,
    instance_count=processing_instance_count,
    instance_type=processing_instance_type,
    sagemaker_session=pipeline_session,
)

# Raw data is mounted into the container's input directory ...
raw_data_input = ProcessingInput(
    source=input_data,
    destination="/opt/ml/processing/input",
)
# ... and whatever the script writes to the output directory is uploaded to S3.
processed_data_output = ProcessingOutput(
    output_name="processed-data",
    source='/opt/ml/processing/output',
    destination=f"s3://{default_bucket}/output/processed",
)

# The processor is bound to a PipelineSession, so the result of run() is
# passed to ProcessingStep as its step arguments.
processor_args = sklearn_processor.run(
    code="preprocessing_script.py",
    inputs=[raw_data_input],
    outputs=[processed_data_output],
)
step_preprocess = ProcessingStep(
    name="EmpBonusPreProcessing",
    step_args=processor_args,
)

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3


### Model Training Script (Processing Step)

In [35]:
# Model training script step: runs model_training_script.py as a processing
# job on the preprocessed data and uploads its train/validation/test outputs.

# Keep the processed-data URI in its own variable. Rebinding the global
# `input_data` here would make the preprocessing cell read this S3 URI
# instead of the raw CSV if that cell were re-run afterwards.
processed_data_uri = f"s3://{default_bucket}/output/processed/transformed_data.csv"

sklearn_processor = SKLearnProcessor(
    framework_version=framework_version,
    instance_type=processing_instance_type,
    instance_count=processing_instance_count,
    base_job_name="emp-model-trainig",
    role=role,
    sagemaker_session=pipeline_session,
)

processor_args = sklearn_processor.run(
    inputs=[
        ProcessingInput(source=processed_data_uri, destination="/opt/ml/processing/input"),
    ],
    outputs=[
        ProcessingOutput(output_name="train", source="/opt/ml/processing/train",
                         destination=f"s3://{default_bucket}/output/train"),
        ProcessingOutput(output_name="validation", source="/opt/ml/processing/validation",
                         destination=f"s3://{default_bucket}/output/validation"),
        ProcessingOutput(output_name="test", source="/opt/ml/processing/test",
                         destination=f"s3://{default_bucket}/output/test"),
    ],
    code="model_training_script.py",
)

# The input is a hard-coded S3 URI rather than a reference to the previous
# step's properties, so the ordering must be declared explicitly.
step_model_train = ProcessingStep(
    name="EmpBonusModelTraining",
    step_args=processor_args,
    depends_on=[step_preprocess],
)

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3


### Model Training (Linear Learner)

In [44]:
print("Model Traing Job")
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.steps import TrainingStep

# Container image for the built-in Linear Learner algorithm in this region.
linear_learner_container = sagemaker.image_uris.retrieve('linear-learner', sagemaker_session.boto_region_name)

# Linear Learner estimator. It is bound to the PipelineSession, so fit() below
# is used to produce step arguments for the pipeline.
linear_estimator = Estimator(
    image_uri=linear_learner_container,
    role=role,
    instance_count=1,
    instance_type=training_instance_type,
    output_path=f's3://{default_bucket}/model-output',
    sagemaker_session=pipeline_session,
    base_job_name="emp-bonus-linear-learner"
)

# Hyperparameters for Linear Learner
linear_estimator.set_hyperparameters(
    predictor_type='regressor',  # We're solving a regression problem
    mini_batch_size=32,
    epochs=10
)

# CSV files written to S3 by the EmpBonusModelTraining processing step.
train_path = f"s3://{default_bucket}/output/train/train.csv"
val_path = f"s3://{default_bucket}/output/validation/validation.csv"

# Input data channels
train_input = TrainingInput(
    s3_data=train_path,
    content_type='text/csv'
)
val_input = TrainingInput(
    s3_data=val_path,
    content_type='text/csv'
)

# Build the step from fit() via `step_args`, matching how the processing steps
# in this notebook are built; passing `estimator=`/`inputs=` directly to
# TrainingStep is the deprecated construction path.
training_args = linear_estimator.fit(
    inputs={
        'train': train_input,
        'validation': val_input
    }
)

# Training Step. The channels are hard-coded S3 URIs, so the dependency on the
# processing step that writes them is declared explicitly.
step_training = TrainingStep(
    name="EmpBonusTrainingStep",
    step_args=training_args,
    depends_on=[step_model_train]
)


INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


Model Traing Job


In [45]:
import json
from sagemaker.workflow.pipeline import Pipeline

# Assemble the pipeline from the three steps defined above.
# `parameters` must contain Parameter objects only. `input_data` is a plain
# string whose value is already baked into the step arguments, so it is not a
# pipeline parameter and is not listed here.
pipeline = Pipeline(
    name=pipeline_name,
    parameters=[
        processing_instance_count,
        processing_instance_type,
        training_instance_type,
        model_approval_status,
    ],
    steps=[step_preprocess, step_model_train, step_training],
)

# Render the definition and parse it as JSON once, so a malformed definition
# fails here rather than at upsert time. Inspect `definition` to debug.
definition = json.loads(pipeline.definition())



In [46]:
# Create the pipeline if it does not exist yet, otherwise update the existing
# one, using the notebook's execution role (`role`) as the pipeline role.
pipeline.upsert(role_arn=role)



{'PipelineArn': 'arn:aws:sagemaker:us-east-1:637423223719:pipeline/emp-bonus-training-pipeline',
 'ResponseMetadata': {'RequestId': 'a759a439-4462-48a4-b30d-632812defe72',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'a759a439-4462-48a4-b30d-632812defe72',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '117',
   'date': 'Sat, 02 Aug 2025 23:02:47 GMT'},
  'RetryAttempts': 0}}

In [47]:
# Start a pipeline execution. No parameter overrides are passed here; the
# returned execution handle is what this cell displays as its output.
pipeline.start()

_PipelineExecution(arn='arn:aws:sagemaker:us-east-1:637423223719:pipeline/emp-bonus-training-pipeline/execution/ji7hp43nnzxy', sagemaker_session=<sagemaker.session.Session object at 0x7f3fe811dbe0>)