# CI/CD

In [46]:
import boto3
import json
import pandas as pd
import numpy as np
from datetime import datetime
import sagemaker
from sagemaker import get_execution_role, Session
from sagemaker.inputs import TrainingInput
from sagemaker.model import Model
from sagemaker.processing import ProcessingInput, ProcessingOutput, ScriptProcessor
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.steps import ProcessingStep, TrainingStep
from sagemaker.workflow.properties import PropertyFile
from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString,
    ParameterFloat
)
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.workflow.condition_step import ConditionStep
from sagemaker.workflow.conditions import ConditionLessThanOrEqualTo
from sagemaker.workflow.functions import JsonGet, Join
from sagemaker.workflow.step_collections import RegisterModel
from sagemaker.model_metrics import MetricsSource, ModelMetrics
from sagemaker.workflow.execution_variables import ExecutionVariables
from sagemaker.workflow.pipeline_definition_config import PipelineDefinitionConfig
from sagemaker.workflow.pipeline import PipelineExperimentConfig
from sagemaker.workflow.conditions import ConditionGreaterThan

### Setup Files and Bucket

In [47]:
# Upload Required Data Files
import boto3
import os
from botocore.exceptions import ClientError

def upload_required_data():
    """Upload required data files to the appropriate S3 locations"""
    # Setup AWS clients
    session = boto3.Session()
    s3 = session.client('s3')
    account_id = session.client('sts').get_caller_identity()['Account']
    region = session.region_name
    bucket = f"sagemaker-{region}-{account_id}"
    
    print(f"Setting up data files for account {account_id} in region {region}")
    print(f"Target bucket: {bucket}")
    
    # Define required files and their S3 paths
    required_files = {
        'train_data.csv': 'predictive-maintenance-feature-store/train_data.csv',
        'test_data.csv': 'predictive-maintenance-feature-store/test_data.csv',
        'model.tar.gz': 'predictive-maintenance-feature-store/output/xgb-2024-10-22-13-55-23/output/model.tar.gz'
    }
    
    # Check and upload each file
    for local_file, s3_path in required_files.items():
        if os.path.exists(local_file):
            try:
                print(f"\nUploading {local_file} to s3://{bucket}/{s3_path}")
                s3.upload_file(local_file, bucket, s3_path)
                print(f"✓ Successfully uploaded {local_file}")
                
                # Verify upload
                try:
                    s3.head_object(Bucket=bucket, Key=s3_path)
                    print(f"✓ Verified file at s3://{bucket}/{s3_path}")
                except ClientError:
                    print(f"✗ Could not verify file at s3://{bucket}/{s3_path}")
            except Exception as e:
                print(f"✗ Error uploading {local_file}: {str(e)}")
        else:
            print(f"\n✗ Local file {local_file} not found")
            print(f"Please ensure {local_file} is in the current directory")
    
    print("\nChecking all required S3 paths...")
    all_files_present = True
    for _, s3_path in required_files.items():
        try:
            s3.head_object(Bucket=bucket, Key=s3_path)
            print(f"✓ Found s3://{bucket}/{s3_path}")
        except ClientError:
            print(f"✗ Missing s3://{bucket}/{s3_path}")
            all_files_present = False
    
    if all_files_present:
        print("\nAll required files are in place. You can proceed with the pipeline.")
    else:
        print("\nSome files are missing. Please ensure all required files are present before proceeding.")
        print("\nRequired files:")
        for _, s3_path in required_files.items():
            print(f"- s3://{bucket}/{s3_path}")

# Run upload
upload_required_data()

Setting up data files for account 691334595165 in region us-east-1
Target bucket: sagemaker-us-east-1-691334595165

Uploading train_data.csv to s3://sagemaker-us-east-1-691334595165/predictive-maintenance-feature-store/train_data.csv
✓ Successfully uploaded train_data.csv
✓ Verified file at s3://sagemaker-us-east-1-691334595165/predictive-maintenance-feature-store/train_data.csv

Uploading test_data.csv to s3://sagemaker-us-east-1-691334595165/predictive-maintenance-feature-store/test_data.csv
✓ Successfully uploaded test_data.csv
✓ Verified file at s3://sagemaker-us-east-1-691334595165/predictive-maintenance-feature-store/test_data.csv

Uploading model.tar.gz to s3://sagemaker-us-east-1-691334595165/predictive-maintenance-feature-store/output/xgb-2024-10-22-13-55-23/output/model.tar.gz
✓ Successfully uploaded model.tar.gz
✓ Verified file at s3://sagemaker-us-east-1-691334595165/predictive-maintenance-feature-store/output/xgb-2024-10-22-13-55-23/output/model.tar.gz

Checking all requir

In [48]:
# Configure Account-Specific Settings
def get_account_specific_settings():
    """Get account-specific settings that need to be configured"""
    session = Session()
    account_id = boto3.client('sts').get_caller_identity()['Account']
    region = session.boto_region_name
    
    # Construct the default bucket name
    default_bucket = f"sagemaker-{region}-{account_id}"
    
    print("Account Settings:")
    print(f"Account ID: {account_id}")
    print(f"Region: {region}")
    print(f"Default bucket: {default_bucket}")
    
    return default_bucket, region

# Get account-specific settings
default_bucket, region = get_account_specific_settings()

Account Settings:
Account ID: 691334595165
Region: us-east-1
Default bucket: sagemaker-us-east-1-691334595165


In [49]:
def verify_required_files():
    """Verify that all required files are present in S3 with proper data examination"""
    s3_client = boto3.client('s3')
    
    required_files = {
        "train_data": f"predictive-maintenance-feature-store/train_data.csv",
        "test_data": f"predictive-maintenance-feature-store/test_data.csv",
        "preprocess_script": "code/preprocess.py",
        "evaluate_script": "code/evaluate.py"
    }
    
    print("\nChecking required files...")
    missing_files = []
    
    # Check S3 files
    for file_type, file_path in required_files.items():
        if file_type in ["train_data", "test_data"]:
            try:
                s3_client.head_object(Bucket=default_bucket, Key=file_path)
                print(f"✓ Found {file_type} at s3://{default_bucket}/{file_path}")
            except Exception as e:
                print(f"✗ Missing {file_type} at s3://{default_bucket}/{file_path}")
                missing_files.append(file_path)
    
    # Check local code files
    for file_type, file_path in required_files.items():
        if file_type.endswith('_script'):
            if not os.path.exists(file_path):
                print(f"✗ Missing {file_type} at {file_path}")
                missing_files.append(file_path)
            else:
                print(f"✓ Found {file_type} at {file_path}")
    
    # Print data shape info if files exist
    try:
        train_data = pd.read_csv(f"s3://{default_bucket}/predictive-maintenance-feature-store/train_data.csv")
        test_data = pd.read_csv(f"s3://{default_bucket}/predictive-maintenance-feature-store/test_data.csv")
        
        print("\nData Statistics:")
        print(f"Train data shape: {train_data.shape}")
        print("\nTrain data columns:")
        print(train_data.columns.tolist())
        
        print(f"\nTest data shape: {test_data.shape}")
        print("\nTest data columns:")
        print(test_data.columns.tolist())
        
        # Sample of data
        print("\nFirst few rows of training data:")
        print(train_data.head())
        
        # If we find the actual target column name, show its distribution
        potential_target_columns = ['target', 'label', 'class', 'failure', 'Status']
        target_col = None
        for col in potential_target_columns:
            if col in train_data.columns:
                target_col = col
                break
            
    except Exception as e:
        print(f"\nError examining data files: {str(e)}")
    
    return len(missing_files) == 0, missing_files

# Verify files
files_ok, missing_files = verify_required_files()
if not files_ok:
    print("\nMissing required files. Please ensure the following files are in place:")
    for file_path in missing_files:
        print(f"- {file_path}")
    print("\nPlease copy these files to the correct locations before proceeding.")
else:
    print("\nAll required files are in place. Proceeding with pipeline setup.")


Checking required files...
✓ Found train_data at s3://sagemaker-us-east-1-691334595165/predictive-maintenance-feature-store/train_data.csv
✓ Found test_data at s3://sagemaker-us-east-1-691334595165/predictive-maintenance-feature-store/test_data.csv
✓ Found preprocess_script at code/preprocess.py
✓ Found evaluate_script at code/evaluate.py

Data Statistics:
Train data shape: (152657, 10)

Train data columns:
['0', '215630672', '55', '0.1', '52', '6', '407438', '0.2', '0.3', '7']

Test data shape: (37746, 11)

Test data columns:
['2015-07-01', '1', '203011379', '16', '56', '5', '10', '277481', '10.1', '10.2', '946']

First few rows of training data:
   0  215630672    55  0.1  52   6  407438  0.2  0.3  7
0  1   11487214     0    0   0  16  412767    4    4  0
1  1  149265550  2024    0   1  50  251894    0    0  0
2  1  236294969     0    0   1  10  262375    0    0  0
3  1  133207556     0    0  14  10  247231   55   55  0
4  1  180004303  2767    0  18   7  197868   29   29  0

Could 

### Modify for your bucket

In [50]:
# Setup SageMaker session and role
session = Session()
pipeline_session = PipelineSession()
role = get_execution_role()
region = session.boto_region_name
#default_bucket = "sagemaker-us-east-1-807494057176"
default_bucket = "sagemaker-us-east-1-691334595165"
model_package_group_name = "PMPredictiveMaintenanceModels"
#model_s3_path = "s3://sagemaker-us-east-1-807494057176/predictive-maintenance-feature-store/output/xgb-2024-10-22-13-55-23/output/model.tar.gz"
model_s3_path = f"s3://{default_bucket}/predictive-maintenance-feature-store/output/xgb-2024-10-22-13-55-23/output/model.tar.gz"

In [51]:
# Define pipeline parameters
processing_instance_count = ParameterInteger(
    name="ProcessingInstanceCount",
    default_value=1
)

training_instance_type = ParameterString(
    name="TrainingInstanceType",
    default_value="ml.m5.xlarge"
)

model_approval_status = ParameterString(
    name="ModelApprovalStatus",
    default_value="PendingManualApproval"
)

In [52]:
#input_data = ParameterString(
    #name="InputData",
    #default_value=f"s3://{default_bucket}/predictive-maintenance-model-monitor/datacapture/sagemaker-xgboost-endpoint-2024-10-22-23-47-32/"
#)

#input_data = ParameterString(
#    name="InputData",
#    default_value=f"s3://{default_bucket}/predictive-maintenance-model-monitor/datacapture/validation_with_predictions.csv"
#)
input_data = ParameterString(
    name="InputData",
    default_value=f"s3://{default_bucket}/predictive-maintenance-feature-store/"  # Directory with your CSV files
)

quality_threshold = ParameterFloat(
    name="QualityThreshold",
    default_value=0.996
)
base_output_path = f"s3://{default_bucket}/PMPredictiveMaintenancePipeline"

In [53]:
# Get container images
xgb_image_uri = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=region,
    version="1.7-1",
)

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


In [54]:
# Create preprocessing step
processor = ScriptProcessor(
    image_uri=xgb_image_uri,
    command=["python3"],
    instance_type="ml.m5.xlarge",
    instance_count=1,
    base_job_name="pm-preprocess",
    role=role,
)

# Cell 6 - Create Preprocessing Step
step_process = ProcessingStep(
    name="PreprocessPMData",
    processor=processor,
    inputs=[
        ProcessingInput(
            source=input_data,
            destination="/opt/ml/processing/input"
        )
    ],
    outputs=[
        ProcessingOutput(
            output_name="train",
            source="/opt/ml/processing/train",
            destination=Join(
                on="/",
                values=[base_output_path, "preprocessing_output", "train"]
            )
        ),
        ProcessingOutput(
            output_name="test",
            source="/opt/ml/processing/test",
            destination=Join(
                on="/",
                values=[base_output_path, "preprocessing_output", "test"]
            )
        )
    ],
    code="code/preprocess.py",
    job_arguments=[
        "--input-path", "/opt/ml/processing/input",
        "--output-path", "/opt/ml/processing"
    ]
)

In [55]:
# Create XGBoost estimator
xgb_estimator = sagemaker.estimator.Estimator(
    image_uri=xgb_image_uri,
    role=role,
    instance_count=1,
    instance_type=training_instance_type,
    output_path=f"{base_output_path}/models",
    sagemaker_session=pipeline_session,
)

xgb_estimator.set_hyperparameters(
    objective="binary:logistic",
    num_round=50,
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.7
)

In [56]:
step_train = TrainingStep(
    name="TrainPMModel",
    estimator=xgb_estimator,
    inputs={
        "train": TrainingInput(
            s3_data=step_process.properties.ProcessingOutputConfig.Outputs[0].S3Output.S3Uri,
            content_type="text/csv"
        )
    }
)

evaluation_report = PropertyFile(
    name="EvaluationReport",
    output_name="evaluation",
    path="evaluation.json"
)

step_eval = ProcessingStep(
    name="EvaluatePMModel",
    processor=processor,
    inputs=[
        ProcessingInput(
            source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
            destination="/opt/ml/processing/model"
        ),
        ProcessingInput(
            source=step_process.properties.ProcessingOutputConfig.Outputs[1].S3Output.S3Uri,
            destination="/opt/ml/processing/test"
        )
    ],
    outputs=[
        ProcessingOutput(
            output_name="evaluation",
            source="/opt/ml/processing/evaluation",
            destination=Join(
                on="/",
                values=[base_output_path, ExecutionVariables.PIPELINE_EXECUTION_ID, "EvaluatePMModel", "output"]
            )
        )
    ],
    code="code/evaluate.py",
    job_arguments=[
        "--model-path", "/opt/ml/processing/model",
        "--test-path", "/opt/ml/processing/test",
        "--output-path", "/opt/ml/processing/evaluation"
    ],
    property_files=[evaluation_report]
)


In [57]:
# Create model metrics matching your monitoring metrics
try:
    evaluation_s3_uri = Join(
        on="/",
        values=[
            step_eval.properties.ProcessingOutputConfig.Outputs[0].S3Output.S3Uri,
            "evaluation.json"
        ]
    )
    print(f"\nEvaluation S3 URI: {evaluation_s3_uri.expr}")
    
    model_metrics = ModelMetrics(
        model_statistics=MetricsSource(
            s3_uri=evaluation_s3_uri,
            content_type="application/json"
        )
    )
except Exception as e:
    print(f"Error creating model metrics: {str(e)}")
    raise

# Create register model step
step_register = RegisterModel(
    name="RegisterPMModel",
    estimator=xgb_estimator,
    model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
    content_types=["text/csv"],
    response_types=["text/csv"],
    inference_instances=["ml.m5.xlarge"],
    transform_instances=["ml.m5.xlarge"],
    model_package_group_name=model_package_group_name,
    approval_status=model_approval_status,
    model_metrics=model_metrics,
)

# Create condition step
cond_lte = ConditionLessThanOrEqualTo(
    left=JsonGet(
        step_name=step_eval.name,
        property_file=evaluation_report,
        json_path="binary_classification_metrics.f2.value"
    ),
    right=quality_threshold
)

step_cond = ConditionStep(
    name="CheckModelQuality",
    conditions=[cond_lte],
    if_steps=[step_register],
    else_steps=[]
)


Evaluation S3 URI: {'Std:Join': {'On': '/', 'Values': [{'Get': 'Steps.EvaluatePMModel.ProcessingOutputConfig.Outputs[0].S3Output.S3Uri'}, 'evaluation.json']}}


In [58]:
# pipeline definition config
pipeline_definition_config = PipelineDefinitionConfig(
    use_custom_job_prefix=True
)

# make the pipeline
pipeline = Pipeline(
    name="PMPredictiveMaintenancePipeline",
    parameters=[
        processing_instance_count,
        training_instance_type,
        model_approval_status,
        input_data,
        quality_threshold
    ],
    steps=[step_process, step_train, step_eval],
    sagemaker_session=pipeline_session #
)

pipeline.upsert(
    role_arn=role,
    description="CI/CD pipeline for predictive maintenance model with quality monitoring"
)



{'PipelineArn': 'arn:aws:sagemaker:us-east-1:691334595165:pipeline/PMPredictiveMaintenancePipeline',
 'ResponseMetadata': {'RequestId': '0882072b-84a3-4831-bb4d-5041d13be907',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '0882072b-84a3-4831-bb4d-5041d13be907',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '99',
   'date': 'Sat, 26 Oct 2024 19:59:36 GMT'},
  'RetryAttempts': 0}}

## Start the pipeline execution with initial model
Code should run if you have access to the bucket

In [59]:
# Cell 11 - Start Pipeline Execution
execution = pipeline.start(
    parameters={
        "ProcessingInstanceCount": 1,
        "TrainingInstanceType": "ml.m5.xlarge",
        "ModelApprovalStatus": "PendingManualApproval",
        "QualityThreshold": 0.996
    }
)

print("Pipeline started. Waiting for completion...")
execution.wait()

Pipeline started. Waiting for completion...


### Analyze execution

In [60]:
# Cell 12 - Check Execution Status
execution_steps = execution.list_steps()
print("Pipeline execution steps and their status:")
for step in execution_steps:
    print(f"\nStep: {step['StepName']}")
    print(f"Status: {step['StepStatus']}")
    if step['StepStatus'] == 'Failed':
        print(f"Failure Reason: {step.get('FailureReason', 'No failure reason provided')}")

execution_details = execution.describe()
print("\nFull execution details:")
print(f"Status: {execution_details['PipelineExecutionStatus']}")
if execution_details['PipelineExecutionStatus'] == 'Failed':
    print(f"Failure Reason: {execution_details.get('FailureReason', 'No failure reason provided')}")

Pipeline execution steps and their status:

Step: EvaluatePMModel
Status: Succeeded

Step: TrainPMModel
Status: Succeeded

Step: PreprocessPMData
Status: Succeeded

Full execution details:
Status: Succeeded


## Model Improvement

In [61]:
# New Hyperparameters
improved_hyperparameters = {
    "objective": "binary:logistic",
    "num_round": 100,  # Increased from 50
    "max_depth": 6,    # Increased from 5
    "eta": 0.1,        # Decreased from 0.2 for more careful learning
    "gamma": 3,        # Decreased from 4
    "min_child_weight": 4,  # Decreased from 6
    "subsample": 0.8,      # Increased from 0.7
    "colsample_bytree": 0.8,  # Added feature sampling
    "scale_pos_weight": 2,    # Added to handle class imbalance
}

In [63]:
# improved estimator
improved_xgb_estimator = sagemaker.estimator.Estimator(
    image_uri=xgb_image_uri,  # Reuse the image_uri from your existing pipeline
    role=role,
    instance_count=1,
    instance_type=training_instance_type,
    output_path=f"{base_output_path}/improved_models",
    sagemaker_session=pipeline_session,
)

improved_xgb_estimator.set_hyperparameters(**improved_hyperparameters)

In [64]:
# improved training step
step_train_improved = TrainingStep(
    name="TrainImprovedPMModel",
    estimator=improved_xgb_estimator,
    inputs={
        "train": TrainingInput(
            s3_data=step_process.properties.ProcessingOutputConfig.Outputs[0].S3Output.S3Uri,
            content_type="text/csv"
        )
    }
)

In [65]:
evaluation_report = PropertyFile(
    name="EvaluationReport",
    output_name="evaluation",
    path="evaluation.json"
)

step_eval = ProcessingStep(
    name="EvaluatePMModel",
    processor=processor,
    inputs=[
        ProcessingInput(
            source=step_train_improved.properties.ModelArtifacts.S3ModelArtifacts,
            destination="/opt/ml/processing/model"
        ),
        ProcessingInput(
            source=step_process.properties.ProcessingOutputConfig.Outputs[1].S3Output.S3Uri,
            destination="/opt/ml/processing/test"
        )
    ],
    outputs=[
        ProcessingOutput(
            output_name="evaluation",
            source="/opt/ml/processing/evaluation",
            destination=Join(
                on="/",
                values=[
                    base_output_path,
                    ExecutionVariables.PIPELINE_EXECUTION_ID,
                    "evaluation"
                ]
            )
        )
    ],
    code="code/evaluate.py",
    job_arguments=[
        "--model-path", "/opt/ml/processing/model",
        "--test-path", "/opt/ml/processing/test",
        "--output-path", "/opt/ml/processing/evaluation"
    ],
    property_files=[evaluation_report]
)

In [66]:
# create model metrics
model_metrics = ModelMetrics(
    model_statistics=MetricsSource(
        s3_uri=Join(
            on="/",
            values=[
                step_eval.properties.ProcessingOutputConfig.Outputs[0].S3Output.S3Uri,
                "evaluation.json"
            ]
        ),
        content_type="application/json"
    )
)

# Cell [Next] - Update register model step
step_register = RegisterModel(
    name="RegisterImprovedPMModel",
    estimator=improved_xgb_estimator,
    model_data=step_train_improved.properties.ModelArtifacts.S3ModelArtifacts,
    content_types=["text/csv"],
    response_types=["text/csv"],
    inference_instances=["ml.t2.medium", "ml.m5.large"],
    transform_instances=["ml.m5.large"],
    model_package_group_name=model_package_group_name,
    approval_status=model_approval_status,
    model_metrics=model_metrics,
)

In [67]:
# Update condition step with correct path
condition_better = ConditionGreaterThan(
    left=JsonGet(
        step_name=step_eval.name,
        property_file=evaluation_report,
        json_path="binary_classification_metrics.f1.value"
    ),
    right=0.85
)

step_cond_improved = ConditionStep(
    name="CheckImprovedModelQuality",
    conditions=[condition_better],
    if_steps=[step_register],
    else_steps=[]
)

## Improved Pipeline Run

In [68]:
improved_pipeline = Pipeline(
    name="ImprovedPMPredictiveMaintenancePipeline",
    parameters=[
        processing_instance_count,
        training_instance_type,
        model_approval_status,
        input_data,
        quality_threshold
    ],
    steps=[step_process, step_train_improved, step_eval, step_cond_improved],
    sagemaker_session=pipeline_session
)

improved_pipeline.upsert(
    role_arn=role,
    description="Improved CI/CD pipeline for predictive maintenance model"
)



{'PipelineArn': 'arn:aws:sagemaker:us-east-1:691334595165:pipeline/ImprovedPMPredictiveMaintenancePipeline',
 'ResponseMetadata': {'RequestId': '5a791873-b15f-4a7b-bca8-b3e6d5b318c9',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '5a791873-b15f-4a7b-bca8-b3e6d5b318c9',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '107',
   'date': 'Sat, 26 Oct 2024 20:10:27 GMT'},
  'RetryAttempts': 0}}

In [69]:
# execute
print("Starting improved pipeline execution...")
improved_execution = improved_pipeline.start(
    parameters={
        "ProcessingInstanceCount": 1,
        "TrainingInstanceType": "ml.m5.xlarge",
        "ModelApprovalStatus": "PendingManualApproval",
        "QualityThreshold": 0.85
    }
)

print("Waiting for improved pipeline execution to complete...")
improved_execution.wait()

Starting improved pipeline execution...
Waiting for improved pipeline execution to complete...


## Analyze Improved Model Attempt

In [70]:
def check_pipeline_execution(execution):
    steps = execution.list_steps()
    print("\nPipeline execution steps and their status:")
    
    for step in steps:
        print(f"\nStep: {step['StepName']}")
        print(f"Status: {step['StepStatus']}")
        
        # Print more details for failed steps
        if step['StepStatus'] == 'Failed':
            print(f"Failure Reason: {step.get('FailureReason', 'No failure reason provided')}")
            
            # Get the CloudWatch logs for the failed step
            if 'Metadata' in step and 'ProcessingJob' in step['Metadata']:
                job_name = step['Metadata']['ProcessingJob']['Arn'].split('/')[-1]
                print(f"\nCloudWatch Logs for failed job {job_name}:")
                logs_client = boto3.client('logs')
                log_group = f"/aws/sagemaker/ProcessingJobs"
                
                try:
                    log_streams = logs_client.describe_log_streams(
                        logGroupName=log_group,
                        logStreamNamePrefix=job_name
                    )
                    
                    for stream in log_streams['logStreams']:
                        logs = logs_client.get_log_events(
                            logGroupName=log_group,
                            logStreamName=stream['logStreamName']
                        )
                        
                        print("\nLog events:")
                        for event in logs['events']:
                            print(event['message'])
                except Exception as e:
                    print(f"Could not retrieve logs: {str(e)}")
    
    execution_details = execution.describe()
    print("\nFull execution details:")
    print(f"Status: {execution_details['PipelineExecutionStatus']}")
    if execution_details['PipelineExecutionStatus'] == 'Failed':
        print(f"Failure Reason: {execution_details.get('FailureReason', 'No failure reason provided')}")
check_pipeline_execution(improved_execution)


Pipeline execution steps and their status:

Step: RegisterImprovedPMModel-RegisterModel
Status: Succeeded

Step: CheckImprovedModelQuality
Status: Succeeded

Step: EvaluatePMModel
Status: Succeeded

Step: TrainImprovedPMModel
Status: Succeeded

Step: PreprocessPMData
Status: Succeeded

Full execution details:
Status: Succeeded


## Compare Models

In [71]:
def get_evaluation_logs(execution):
    """Retrieve CloudWatch logs for the evaluation step"""
    try:
        # Get the evaluation step details
        steps = execution.list_steps()
        eval_step = next((step for step in steps if step['StepName'] == 'EvaluatePMModel'), None)
        
        if eval_step and 'Metadata' in eval_step:
            job_name = eval_step['Metadata'].get('ProcessingJob', {}).get('Arn', '').split('/')[-1]
            
            if job_name:
                print(f"\nRetrieving logs for evaluation job: {job_name}")
                logs_client = boto3.client('logs')
                
                # Get log streams
                response = logs_client.describe_log_streams(
                    logGroupName="/aws/sagemaker/ProcessingJobs",
                    logStreamNamePrefix=job_name
                )
                
                # Get logs from each stream
                for stream in response.get('logStreams', []):
                    print(f"\nLogs from stream: {stream['logStreamName']}")
                    logs = logs_client.get_log_events(
                        logGroupName="/aws/sagemaker/ProcessingJobs",
                        logStreamName=stream['logStreamName']
                    )
                    
                    for event in logs['events']:
                        if 'metrics' in event['message'].lower() or 'evaluation' in event['message'].lower():
                            print(event['message'])
            
    except Exception as e:
        print(f"Error retrieving logs: {str(e)}")

print("\nGetting logs for initial pipeline:")
get_evaluation_logs(execution)

print("\nGetting logs for improved pipeline:")
get_evaluation_logs(improved_execution)


Getting logs for initial pipeline:

Retrieving logs for evaluation job: pipelines-prno9jsypm5c-EvaluatePMModel-PpOc5KWVLn

Logs from stream: pipelines-prno9jsypm5c-EvaluatePMModel-PpOc5KWVLn/algo-1-1729973268
=== Starting Enhanced Evaluation ===
Output path: /opt/ml/processing/evaluation
Calculating metrics...
Saving metrics...
Writing metrics to: /opt/ml/processing/evaluation/evaluation.json
['evaluation.json']
Evaluation completed successfully
Metrics summary:
{
  "binary_classification_metrics": {
    "accuracy": {
      "value": 0.9681034254377832,
      "standard_deviation": 0.1757247367145952
    },
    "precision": {
      "value": 0.9546529562982005,
      "standard_deviation": 0.0
    },
    "recall": {
      "value": 0.9829539438856538,
      "standard_deviation": 0.0
    },
    "f1": {
      "value": 0.9685967657798643,
      "standard_deviation": 0.0
    },
    "auc_roc": {
      "value": 0.9958300909694157,
      "standard_deviation": 0.0
    }
  }

Getting logs for impro

## Metrics & Dashboard

In [76]:
def track_model_comparison(execution_initial, execution_improved):
    """Track and store model comparisons in SageMaker and CloudWatch"""
    
    def get_eval_metrics(execution):
        """Get evaluation metrics with improved error handling and debugging"""
        try:
            # Get execution ID and print it for verification
            execution_details = execution.describe()
            execution_id = execution_details['PipelineExecutionArn'].split('/')[-1]
            print(f"\nProcessing execution ID: {execution_id}")
            
            # Get evaluation step details
            eval_step = next(step for step in execution.list_steps() 
                           if step['StepName'] == 'EvaluatePMModel')
            job_name = eval_step['Metadata']['ProcessingJob']['Arn'].split('/')[-1]
            print(f"Processing job name: {job_name}")
            
            # Get the evaluation output
            s3_client = boto3.client('s3')
            
            # Check multiple possible S3 paths for evaluation results
            possible_paths = [
                f"{base_output_path}/{execution_id}/EvaluatePMModel/output",
                f"{base_output_path}/{execution_id}/evaluation",
                f"{base_output_path}/{execution_id}/EvaluatePMModel",
                f"{base_output_path}/{execution_id}"
            ]
            
            print("\nSearching for evaluation.json in the following paths:")
            for s3_base_path in possible_paths:
                print(f"Checking path: {s3_base_path}")
                
                # Remove the s3:// prefix and bucket name from the path
                prefix = s3_base_path.replace(f"s3://{default_bucket}/", "")
                
                # List objects in this path
                response = s3_client.list_objects_v2(
                    Bucket=default_bucket,
                    Prefix=prefix
                )
                
                # Print all objects found in this path
                if 'Contents' in response:
                    print("\nFiles found in this path:")
                    for obj in response['Contents']:
                        print(f"- {obj['Key']}")
                        if obj['Key'].endswith('evaluation.json'):
                            eval_key = obj['Key']
                            print(f"\nFound evaluation.json at: {eval_key}")
                            
                            # Get and parse the evaluation file
                            response = s3_client.get_object(Bucket=default_bucket, Key=eval_key)
                            metrics = json.loads(response['Body'].read().decode('utf-8'))
                            return metrics['binary_classification_metrics'], job_name
                else:
                    print("No files found in this path")
            
            # If we get here, we didn't find the file
            raise FileNotFoundError(
                f"Could not find evaluation.json in any of the expected paths. "
                f"Please check the S3 bucket {default_bucket} manually."
            )
            
        except Exception as e:
            print(f"\nError processing execution {execution_id}:")
            print(f"Error type: {type(e).__name__}")
            print(f"Error message: {str(e)}")
            
            # Print step details for debugging
            print("\nStep details:")
            for step in execution.list_steps():
                print(f"\nStep Name: {step['StepName']}")
                print(f"Step Status: {step['StepStatus']}")
                if 'Metadata' in step:
                    print("Metadata:", json.dumps(step['Metadata'], indent=2))
            raise

    try:
        # Get metrics for both models
        print("\nProcessing initial model metrics:")
        initial_metrics, initial_job = get_eval_metrics(execution_initial)
        
        print("\nProcessing improved model metrics:")
        improved_metrics, improved_job = get_eval_metrics(execution_improved)
        
        # Create CloudWatch metrics
        cloudwatch = boto3.client('cloudwatch')
        
        # Put metrics in CloudWatch
        def put_comparison_metrics(metrics, model_version):
            for metric_name, metric_data in metrics.items():
                if isinstance(metric_data, dict) and 'value' in metric_data:
                    cloudwatch.put_metric_data(
                        Namespace='CustomModelMetrics',
                        MetricData=[{
                            'MetricName': metric_name,
                            'Value': metric_data['value'],
                            'Unit': 'None',
                            'Dimensions': [
                                {'Name': 'ModelVersion', 'Value': model_version}
                            ]
                        }]
                    )

        put_comparison_metrics(initial_metrics, 'Initial')
        put_comparison_metrics(improved_metrics, 'Improved')
        
        # Create comparison dashboard
        dashboard_name = f"ModelComparison-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
        dashboard_body = {
            "widgets": [
                {
                    "type": "metric",
                    "properties": {
                        "metrics": [
                            ["CustomModelMetrics", "accuracy", "ModelVersion", "Initial"],
                            [".", ".", ".", "Improved"],
                            [".", "f1", ".", "Initial"],
                            [".", ".", ".", "Improved"],
                            [".", "precision", ".", "Initial"],
                            [".", ".", ".", "Improved"],
                            [".", "recall", ".", "Initial"],
                            [".", ".", ".", "Improved"]
                        ],
                        "view": "bar",
                        "region": sagemaker.Session().boto_region_name,
                        "title": "Model Performance Comparison",
                        "period": 300
                    }
                }
            ]
        }
        
        cloudwatch.put_dashboard(
            DashboardName=dashboard_name,
            DashboardBody=json.dumps(dashboard_body)
        )
        
        # Print URLs and comparison
        region = sagemaker.Session().boto_region_name
        print("\nModel Comparison Resources:")
        print(f"\n1. CloudWatch Dashboard:")
        print(f"https://{region}.console.aws.amazon.com/cloudwatch/home?region={region}#dashboards:name={dashboard_name}")
        
        print(f"\n2. Processing Jobs (Evaluation Results):")
        print(f"Initial model: https://{region}.console.aws.amazon.com/sagemaker/home?region={region}#/processing-jobs/{initial_job}")
        print(f"Improved model: https://{region}.console.aws.amazon.com/sagemaker/home?region={region}#/processing-jobs/{improved_job}")
        
        # Print metric comparison table
        print("\nMetrics Comparison:")
        print("\nMetric      Initial    Improved   Difference")
        print("-" * 45)
        for metric in ['accuracy', 'precision', 'recall', 'f1', 'auc_roc']:
            if metric in initial_metrics and metric in improved_metrics:
                initial = initial_metrics[metric]['value']
                improved = improved_metrics[metric]['value']
                diff = improved - initial
                print(f"{metric:<10} {initial:.4f}    {improved:.4f}    {diff:+.4f}")
        
        return {
            'dashboard_name': dashboard_name,
            'initial_metrics': initial_metrics,
            'improved_metrics': improved_metrics,
            'initial_job': initial_job,
            'improved_job': improved_job
        }
        
    except Exception as e:
        print(f"\nError in model comparison:")
        print(f"Error type: {type(e).__name__}")
        print(f"Error message: {str(e)}")
        raise

In [77]:
try:
    comparison = track_model_comparison(execution, improved_execution)
except Exception as e:
    print(f"\nFailed to create comparison: {str(e)}")
    print("\nPlease check the logs above for details about where the evaluation files are located.")


Processing initial model metrics:

Processing execution ID: prno9jsypm5c
Processing job name: pipelines-prno9jsypm5c-EvaluatePMModel-PpOc5KWVLn

Searching for evaluation.json in the following paths:
Checking path: s3://sagemaker-us-east-1-691334595165/PMPredictiveMaintenancePipeline/prno9jsypm5c/EvaluatePMModel/output

Files found in this path:
- PMPredictiveMaintenancePipeline/prno9jsypm5c/EvaluatePMModel/output/evaluation.json

Found evaluation.json at: PMPredictiveMaintenancePipeline/prno9jsypm5c/EvaluatePMModel/output/evaluation.json

Processing improved model metrics:

Processing execution ID: 0alha7bwdcpc
Processing job name: pipelines-0alha7bwdcpc-EvaluatePMModel-QlLs0KLJZB

Searching for evaluation.json in the following paths:
Checking path: s3://sagemaker-us-east-1-691334595165/PMPredictiveMaintenancePipeline/0alha7bwdcpc/EvaluatePMModel/output
No files found in this path
Checking path: s3://sagemaker-us-east-1-691334595165/PMPredictiveMaintenancePipeline/0alha7bwdcpc/evaluat

#### Delete all pipelines on account (don't run)

In [None]:
# Initialize the SageMaker client
sagemaker_client = boto3.client('sagemaker')

# List all pipelines in your account
pipelines = sagemaker_client.list_pipelines()

# Loop through all pipelines and delete each one
for pipeline in pipelines['PipelineSummaries']:
    pipeline_name = pipeline['PipelineName']
    print(f"Deleting pipeline: {pipeline_name}")
    
    # Delete the pipeline
    sagemaker_client.delete_pipeline(PipelineName=pipeline_name)

print("All pipelines deleted successfully.")