# Model Monitoring

In [1]:
# Import libraries
from datetime import datetime, timedelta, timezone
import json
import os
import re
import boto3
from threading import Thread
import pandas as pd
from sagemaker import get_execution_role, session, Session, image_uris
from sagemaker.s3 import S3Downloader, S3Uploader
from sagemaker.processing import ProcessingJob
from sagemaker.serializers import CSVSerializer
from sagemaker.model import Model
from sagemaker.model_monitor import DataCaptureConfig
from sagemaker.predictor import Predictor
from time import gmtime, strftime, sleep
from sagemaker.model_monitor import ModelQualityMonitor
from sagemaker.model_monitor import EndpointInput
from sagemaker.model_monitor.dataset_format import DatasetFormat
from sagemaker.model_monitor import CronExpressionGenerator
from tqdm.notebook import tqdm
from sagemaker.model_monitor import DefaultModelMonitor

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [2]:
# Create CloudWatch client
cw_client = boto3.Session().client("cloudwatch")
namespace = "aws/sagemaker/Endpoints/model-metrics"

In [3]:
# Setup boto and sagemaker session
sagemaker_session = Session()
role = get_execution_role()
region = sagemaker_session.boto_region_name

# Setup S3 bucket
bucket = sagemaker_session.default_bucket()
print("Bucket:", bucket)
prefix = f"sagemaker/FoodLens-ModelQualityMonitor-{datetime.now():%Y-%m-%d-%H-%M-%S}"

# S3 prefixes
data_capture_prefix = f"{prefix}/datacapture"
s3_capture_upload_path = f"s3://{bucket}/{data_capture_prefix}"

ground_truth_upload_path = (
    f"s3://{bucket}/{prefix}/ground_truth_data/{datetime.now():%Y-%m-%d-%H-%M-%S}"
)

reports_prefix = f"{prefix}/reports"
s3_report_path = f"s3://{bucket}/{reports_prefix}"

print(f"Capture path: {s3_capture_upload_path}")
print(f"Ground truth path: {ground_truth_upload_path}")
print(f"Report path: {s3_report_path}")

Bucket: sagemaker-us-east-1-654654380268
Capture path: s3://sagemaker-us-east-1-654654380268/sagemaker/FoodLens-ModelQualityMonitor-2025-10-16-03-30-25/datacapture
Ground truth path: s3://sagemaker-us-east-1-654654380268/sagemaker/FoodLens-ModelQualityMonitor-2025-10-16-03-30-25/ground_truth_data/2025-10-16-03-30-25
Report path: s3://sagemaker-us-east-1-654654380268/sagemaker/FoodLens-ModelQualityMonitor-2025-10-16-03-30-25/reports


## Deploy Pre-Trained Model to Live Endpoint

In [4]:
# Initialize the sagemaker client
sagemaker_client = boto3.client("sagemaker")

# Specify model
image_uri = image_uris.retrieve(framework="xgboost", region=region, version="1.7-1")
instance_type = 'ml.m5.xlarge'
model_name = 'nutrition-score-xgb-2025-10-16-03-44-07' # get from notebook 04
response = sagemaker_client.describe_model(ModelName=model_name)
model_url = response['PrimaryContainer']['ModelDataUrl']
model = Model(image_uri=image_uri, model_data=model_url, role=role, sagemaker_session=sagemaker_session)

In [5]:
endpoint_name = f"xgb-nutriscore-model-quality-monitor-{datetime.now():%Y-%m-%d-%H-%M-%S}"
print("EndpointName: ", endpoint_name)

# Enable data capture
data_capture_config = DataCaptureConfig(
    enable_capture=True, sampling_percentage=100, destination_s3_uri=s3_capture_upload_path
)

# Deploy the model and wait for it to be in service
print("Deploying endpoint....")
model.deploy(
    initial_instance_count=1,
    instance_type=instance_type,
    endpoint_name=endpoint_name,
    data_capture_config=data_capture_config,
)

print(f"\nEndpoint '{endpoint_name}' in Service.")

EndpointName:  xgb-nutriscore-model-quality-monitor-2025-10-16-03-30-33
Deploying endpoint....
------!
Endpoint 'xgb-nutriscore-model-quality-monitor-2025-10-16-03-30-33' in Service.


In [6]:
# Create predictor object
predictor = Predictor(
    endpoint_name=endpoint_name, 
    sagemaker_session=sagemaker_session, 
    serializer=CSVSerializer()
)

## Setup Infrastructure Monitoring

In [None]:
# Create a CloudWatch alarm for model latency
alarm_name = "NUTRISCORE_MODEL_LATENCY_HIGH"
alarm_desc = "Trigger an alarm when the average model latency exceeds 300ms."
infrastructure_metric_name = 'ModelLatency'
cw_latency_dimensions = [
    {"Name": "Endpoint", "Value": endpoint_name},
    {"Name": "MonitoringSchedule", "Value": 'AllTraffic'},
]

# Create the alarm
cw_client.put_metric_alarm(
    AlarmName=alarm_name,
    AlarmDescription=alarm_desc,
    ActionsEnabled=False,  # Change to True if you want notifications
    MetricName=infrastructure_metric_name,
    Namespace="AWS/SageMaker",
    Statistic="Average",
    Dimensions=cw_latency_dimensions,
    Period=300,  # check every 5 minutes
    EvaluationPeriods=1,
    Threshold=300.0,  # 300 milliseconds threshold
    ComparisonOperator="GreaterThanThreshold",
    TreatMissingData="missing",
)

print(f"CloudWatch alarm '{alarm_name}' for infrastructure latency has been created.")

## Setup Data Quality Monitor

In [8]:
# Build baseline from scaled training data
# From Notebook 04, train data with headers
train_s3_path = f's3://{bucket}/nutriscore-prediction-xgboost/train/train_scaled_headers.csv'

# The S3 path where the data quality reports will be stored
data_quality_report_path = f"s3://{bucket}/nutriscore-prediction-xgboost/data-quality-reports"

# Create a Data Quality Monitor object
data_quality_monitor = DefaultModelMonitor(
    role=role,
    instance_count=1,
    instance_type=instance_type,
    volume_size_in_gb=20,
    max_runtime_in_seconds=3600,
    sagemaker_session=sagemaker_session,
)

data_quality_baseline_job_name = f"nutriscore-data-quality-baseline-job-{datetime.now():%Y-%m-%d-%H-%M-%S}"

print("Starting Data Quality baseline suggestion job...")
# The baseline job runs on the training data
data_quality_monitor.suggest_baseline(
    baseline_dataset=train_s3_path,
    dataset_format=DatasetFormat.csv(header=True),
    job_name=data_quality_baseline_job_name,
    output_s3_uri=data_quality_report_path,
    wait=True,
    logs=False,
)
print("\nData Quality baseline job complete.")

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating processing-job with name nutriscore-data-quality-baseline-job-2025-10-16-03-57-17


Starting Data Quality baseline suggestion job...
...........................................................!
Data Quality baseline job complete.


In [9]:
data_quality_monitor.latest_baselining_job.describe()

{'ProcessingInputs': [{'InputName': 'baseline_dataset_input',
   'AppManaged': False,
   'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-654654380268/nutriscore-prediction-xgboost/train/train_scaled_headers.csv',
    'LocalPath': '/opt/ml/processing/input/baseline_dataset_input',
    'S3DataType': 'S3Prefix',
    'S3InputMode': 'File',
    'S3DataDistributionType': 'FullyReplicated',
    'S3CompressionType': 'None'}}],
 'ProcessingOutputConfig': {'Outputs': [{'OutputName': 'monitoring_output',
    'S3Output': {'S3Uri': 's3://sagemaker-us-east-1-654654380268/nutriscore-prediction-xgboost/data-quality-reports',
     'LocalPath': '/opt/ml/processing/output',
     'S3UploadMode': 'EndOfJob'},
    'AppManaged': False}]},
 'ProcessingJobName': 'nutriscore-data-quality-baseline-job-2025-10-16-03-57-17',
 'ProcessingResources': {'ClusterConfig': {'InstanceCount': 1,
   'InstanceType': 'ml.m5.xlarge',
   'VolumeSizeInGB': 20}},
 'StoppingCondition': {'MaxRuntimeInSeconds': 3600},
 'AppSpecificat

In [12]:
# Create data quality monitoring schedule
data_quality_schedule_name = f"nutriscore-data-quality-schedule-{datetime.now():%Y-%m-%d-%H-%M-%S}"

print(f"Creating Data Quality monitoring schedule: {data_quality_schedule_name}")
data_quality_monitor.create_monitoring_schedule(
    monitor_schedule_name=data_quality_schedule_name,
    endpoint_input=predictor.endpoint_name,
    output_s3_uri=data_quality_report_path,
    statistics=data_quality_monitor.latest_baselining_job.baseline_statistics(),
    constraints=data_quality_monitor.latest_baselining_job.suggested_constraints(),
    schedule_cron_expression=CronExpressionGenerator.hourly(),
    enable_cloudwatch_metrics=True,
)
print("Data Quality monitoring schedule created.")

Creating Data Quality monitoring schedule: nutriscore-data-quality-schedule-2025-10-16-04-03-47


INFO:sagemaker.model_monitor.model_monitoring:Creating Monitoring Schedule with name: nutriscore-data-quality-schedule-2025-10-16-04-03-47


Data Quality monitoring schedule created.


## Generate Baseline for Model Quality Performance

In [15]:
# Get validation dataset
# get from previous notebook
val_s3_path = f's3://{bucket}/nutriscore-prediction-xgboost/validation/val_scaled.csv' 
!aws s3 cp {val_s3_path} ./
val_local_path = './val_scaled.csv'

download: s3://sagemaker-us-east-1-654654380268/nutriscore-prediction-xgboost/validation/val_scaled.csv to ./val_scaled.csv


In [16]:
# Your validation dataset should h first column
limit = 500 # number of samples for baseline
baseline_file_name = 'val_pred_baseline.csv'
i = 0

# Create a new file for your baseline data
with open(f"{baseline_file_name}", "w") as baseline_file:
    # Header for a regression baseline
    baseline_file.write("prediction,label\n")
    
    # Open validation data file
    with open(val_local_path, "r") as f:
        for row in f:
            # With true score in first column
            (label, input_cols) = row.split(",", 1)
            
            # Get the predicted score from the endpoint
            predicted_score = float(predictor.predict(input_cols))
            
            # Write the predicted score and the true label to the baseline file
            baseline_file.write(f"{predicted_score},{label.strip()}\n")
            
            i += 1
            if i >= limit:
                break
            print(".", end="", flush=True)
            sleep(0.5)
print()
print("Done!")

...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
Done!


In [17]:
# Examine predictions from model
!head {baseline_file_name}

prediction,label
1.3050779104232788,4
-6.6029558181762695,-7
17.21617317199707,17
20.121444702148438,21
11.199066162109375,13
8.83629035949707,11
0.2918614447116852,0
11.292061805725098,16
1.38621187210083,0


In [18]:
# Upload predictions as baseline dataset
baseline_prefix = prefix + "/baselining"
baseline_data_prefix = baseline_prefix + "/data"
baseline_results_prefix = baseline_prefix + "/results"

baseline_data_uri = f"s3://{bucket}/{baseline_data_prefix}"
baseline_results_uri = f"s3://{bucket}/{baseline_results_prefix}"
print(f"Baseline data uri: {baseline_data_uri}")
print(f"Baseline results uri: {baseline_results_uri}")

Baseline data uri: s3://sagemaker-us-east-1-654654380268/sagemaker/FoodLens-ModelQualityMonitor-2025-10-14-23-41-04/baselining/data
Baseline results uri: s3://sagemaker-us-east-1-654654380268/sagemaker/FoodLens-ModelQualityMonitor-2025-10-14-23-41-04/baselining/results


In [19]:
# Upload baseline dataset
baseline_dataset_uri = S3Uploader.upload(f"{baseline_file_name}", baseline_data_uri)
baseline_dataset_uri

's3://sagemaker-us-east-1-654654380268/sagemaker/FoodLens-ModelQualityMonitor-2025-10-14-23-41-04/baselining/data/val_pred_baseline.csv'

In [20]:
# Create the model quality monitoring object
nutriscore_model_quality_monitor = ModelQualityMonitor(
    role=role,
    instance_count=1,
    instance_type=instance_type,
    volume_size_in_gb=20,
    max_runtime_in_seconds=1800,
    sagemaker_session=sagemaker_session,
)

In [21]:
# Name of the model quality baseline job
baseline_job_name = f"xgb-nutriscore-model-baseline-job-{datetime.now():%Y-%m-%d-%H%M}"

In [22]:
# Execute the baseline suggestion job
job = nutriscore_model_quality_monitor.suggest_baseline(
    job_name=baseline_job_name,
    baseline_dataset=baseline_dataset_uri,
    dataset_format=DatasetFormat.csv(header=False),
    output_s3_uri=baseline_results_uri,
    problem_type="Regression",
    inference_attribute="prediction", # model output
    ground_truth_attribute="label", # true score
)
job.wait(logs=False)

INFO:sagemaker:Creating processing-job with name xgb-nutriscore-model-baseline-job-2025-10-14-2351


...........................................................!

## Explore Results of Baseline Job

In [23]:
baseline_job = nutriscore_model_quality_monitor.latest_baselining_job

In [24]:
# View metrics
binary_metrics = baseline_job.baseline_statistics().body_dict["regression_metrics"]
pd.json_normalize(binary_metrics).T

Unnamed: 0,0
mae.value,1.040421
mae.standard_deviation,0.043879
mse.value,3.109507
mse.standard_deviation,0.299934
rmse.value,1.76338
rmse.standard_deviation,0.085247
r2.value,0.972284
r2.standard_deviation,0.002905


In [46]:
# View constraints
regression_constraints = pd.DataFrame(baseline_job.suggested_constraints().body_dict["regression_constraints"]).T
regression_constraints

Unnamed: 0,threshold,comparison_operator
mae,1.040421,GreaterThanThreshold
mse,3.109507,GreaterThanThreshold
rmse,1.76338,GreaterThanThreshold
r2,0.972284,LessThanThreshold


## Setup Continuous Model Monitoring

In [63]:
# Get some samples from scaled production data split
# From notebook 04
prod_scaled_path = f's3://{bucket}/nutriscore-prediction-xgboost/prod/prod_scaled.csv'
prod_scaled_df = pd.read_csv(prod_scaled_path)

In [86]:
# Take a sample for the test run (500 random rows)
sample_traffic_df = prod_scaled_df.sample(n=500)
print(f"Using a sample of {len(sample_traffic_df)} rows from the production dataset.")

Using a sample of 500 rows from the production dataset.


In [87]:
# Generate prediction data on sample data

# Store ground truth with inference ids
ground_truth_labels_with_ids = []

# First column is the label (true score) and the rest are features
label_column = sample_traffic_df.columns[0]
feature_columns = sample_traffic_df.columns[1:]

print("Sending pre-scaled production samples for inference...")

# Send samples for inference
for index, row in tqdm(sample_traffic_df.iterrows(), total=sample_traffic_df.shape[0]):
    features_payload = ",".join(map(str, row[feature_columns].values))
    
    # Invoke the endpoint
    sagemaker_session.sagemaker_runtime_client.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType="text/csv",
        Body=features_payload,
        InferenceId=str(index), # use index as unique id
    )
    
    # Store the true label and its corresponding ID for the ground truth upload
    ground_truth_labels_with_ids.append({
        "inference_id": index,
        "label": row[label_column]
    })
    sleep(0.2)

Sending pre-scaled production samples for inference...


  0%|          | 0/500 [00:00<?, ?it/s]

In [88]:
# View captured data
print("Waiting for captures to show up", end="")
for _ in range(120):
    capture_files = sorted(S3Downloader.list(f"{s3_capture_upload_path}/{endpoint_name}"))
    if capture_files:
        capture_file = S3Downloader.read_file(capture_files[-1]).split("\n")
        capture_record = json.loads(capture_file[0])
        if "inferenceId" in capture_record["eventMetadata"]:
            break
    print(".", end="", flush=True)
    sleep(1)
print()
print("Found Capture Files:")
print("\n ".join(capture_files[-3:]))

Waiting for captures to show up
Found Capture Files:
s3://sagemaker-us-east-1-654654380268/sagemaker/FoodLens-ModelQualityMonitor-2025-10-14-23-41-04/datacapture/xgb-nutriscore-model-quality-monitor-2025-10-14-23-41-12/AllTraffic/2025/10/15/01/57-34-194-6af6c720-b704-47ca-9c72-2cbd3a9eed1c.jsonl
 s3://sagemaker-us-east-1-654654380268/sagemaker/FoodLens-ModelQualityMonitor-2025-10-14-23-41-04/datacapture/xgb-nutriscore-model-quality-monitor-2025-10-14-23-41-12/AllTraffic/2025/10/15/02/55-00-163-e49b9fb7-97e1-460c-ad1d-0c4d2efb3abb.jsonl
 s3://sagemaker-us-east-1-654654380268/sagemaker/FoodLens-ModelQualityMonitor-2025-10-14-23-41-04/datacapture/xgb-nutriscore-model-quality-monitor-2025-10-14-23-41-12/AllTraffic/2025/10/15/02/56-00-270-ea4578d6-cde7-4d2a-aa96-9177637d8948.jsonl


In [32]:
# View single capture
print(json.dumps(capture_record, indent=2))

{
  "captureData": {
    "endpointInput": {
      "observedContentType": "text/csv",
      "mode": "INPUT",
      "data": "0.580941853233744,0.4207843872072756,-0.3922347530307877,0.0193026562008916,-0.042146023062945,0.6378029344725,-0.0327685279590934,-0.0421369278836778,-0.1696497331717474,-0.5521170054710245,0.9250592690077158,-0.3974839476083481,-0.0376331319199143,-0.4221333468067548,-0.2956419687371429,-0.6585641731100123,0.0192272853368198,0.580941853233744,-0.0059382188048578,-0.0205829843921859,-0.1255183315312085,-0.0785495194589269",
      "encoding": "CSV"
    },
    "endpointOutput": {
      "observedContentType": "text/csv; charset=utf-8",
      "mode": "OUTPUT",
      "data": "19.34724998474121\n",
      "encoding": "CSV"
    }
  },
  "eventMetadata": {
    "eventId": "a3f6cf03-8cb7-490b-a726-ab25f06ec92b",
    "inferenceId": "20940",
    "inferenceTime": "2025-10-15T00:00:20Z"
  },
  "eventVersion": "0"
}


In [89]:
# Format and upload the true labels
ground_truth_records = []
for item in ground_truth_labels_with_ids:
    record = {
        "groundTruthData": {
            "data": str(item['label']), # The true nutrition score
            "encoding": "CSV",
        },
        "eventMetadata": {
            "eventId": item['inference_id'],
        },
        "eventVersion": "0",
    }
    ground_truth_records.append(json.dumps(record))

# Convert the list of JSON strings into a single string with newlines
ground_truth_data_to_upload = "\n".join(ground_truth_records)

# Upload to the S3 path the monitor is watching
target_s3_uri = f"{ground_truth_upload_path}/{datetime.utcnow():%Y/%m/%d/%H}/ground_truth.jsonl"

print(f"\nUploading {len(ground_truth_records)} ground truth records to {target_s3_uri}")
S3Uploader.upload_string_as_file_body(ground_truth_data_to_upload, target_s3_uri)


Uploading 500 ground truth records to s3://sagemaker-us-east-1-654654380268/sagemaker/FoodLens-ModelQualityMonitor-2025-10-14-23-41-04/ground_truth_data/2025-10-14-23-41-04/2025/10/15/02/ground_truth.jsonl


  target_s3_uri = f"{ground_truth_upload_path}/{datetime.utcnow():%Y/%m/%d/%H}/ground_truth.jsonl"


's3://sagemaker-us-east-1-654654380268/sagemaker/FoodLens-ModelQualityMonitor-2025-10-14-23-41-04/ground_truth_data/2025-10-14-23-41-04/2025/10/15/02/ground_truth.jsonl'

## Create Monitoring Schedule

In [71]:
# Set monitor schedule name
nutriscore_monitor_schedule_name = f"nutriscore-monitoring-schedule-{datetime.now():%Y-%m-%d-%H-%M-%S}"

In [72]:
# EndpointInput for regression
endpointInput = EndpointInput(
    endpoint_name=predictor.endpoint_name,
    inference_attribute="0", # first column contains inference
    destination="/opt/ml/processing/input_data",
)

In [75]:
# Create the monitoring schedule to execute every hour
response = nutriscore_model_quality_monitor.create_monitoring_schedule(
    monitor_schedule_name=nutriscore_monitor_schedule_name,
    endpoint_input=endpointInput,
    output_s3_uri=baseline_results_uri,
    problem_type="Regression",
    ground_truth_input=ground_truth_upload_path,
    constraints=baseline_job.suggested_constraints(),
    schedule_cron_expression=CronExpressionGenerator.hourly(),
    enable_cloudwatch_metrics=True,
)

INFO:sagemaker.model_monitor.model_monitoring:Creating Monitoring Schedule with name: nutriscore-monitoring-schedule-2025-10-15-01-59-01


In [76]:
# Examine schedule on monitor
nutriscore_model_quality_monitor.describe_schedule()

{'MonitoringScheduleArn': 'arn:aws:sagemaker:us-east-1:654654380268:monitoring-schedule/nutriscore-monitoring-schedule-2025-10-15-01-59-01',
 'MonitoringScheduleName': 'nutriscore-monitoring-schedule-2025-10-15-01-59-01',
 'MonitoringScheduleStatus': 'Pending',
 'MonitoringType': 'ModelQuality',
 'CreationTime': datetime.datetime(2025, 10, 15, 2, 2, 26, 447000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2025, 10, 15, 2, 2, 26, 508000, tzinfo=tzlocal()),
 'MonitoringScheduleConfig': {'ScheduleConfig': {'ScheduleExpression': 'cron(0 * ? * * *)'},
  'MonitoringJobDefinitionName': 'model-quality-job-definition-2025-10-15-02-02-25-657',
  'MonitoringType': 'ModelQuality'},
 'EndpointName': 'xgb-nutriscore-model-quality-monitor-2025-10-14-23-41-12',
 'ResponseMetadata': {'RequestId': 'f3196e5c-97f9-4dd6-b244-49cf88ecb064',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'f3196e5c-97f9-4dd6-b244-49cf88ecb064',
   'strict-transport-security': 'max-age=47304000; in

In [77]:
# Initially there will be no executions since the first execution happens at the top of the hour
executions = nutriscore_model_quality_monitor.list_executions()
executions



[]

In [90]:
# Wait for the first execution of the monitoring_schedule
print("Waiting for first execution", end="")
while True:
    execution = nutriscore_model_quality_monitor.describe_schedule().get(
        "LastMonitoringExecutionSummary"
    )
    if execution:
        break
    print(".", end="", flush=True)
    sleep(10)
print()
print("Execution found!")

Waiting for first execution.................................................
Execution found!


In [91]:
# View execution details
while not executions:
    executions = nutriscore_model_quality_monitor.list_executions()
    print(".", end="", flush=True)
    sleep(10)
latest_execution = executions[-1]
latest_execution.describe()

.................

{'ProcessingInputs': [{'InputName': 'groundtruth_input_1',
   'AppManaged': False,
   'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-654654380268/sagemaker/FoodLens-ModelQualityMonitor-2025-10-14-23-41-04/ground_truth_data/2025-10-14-23-41-04/2025/10/15/02',
    'LocalPath': '/opt/ml/processing/groundtruth/2025/10/15/02',
    'S3DataType': 'S3Prefix',
    'S3InputMode': 'File',
    'S3DataDistributionType': 'FullyReplicated',
    'S3CompressionType': 'None'}},
  {'InputName': 'endpoint_input_1',
   'AppManaged': False,
   'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-654654380268/sagemaker/FoodLens-ModelQualityMonitor-2025-10-14-23-41-04/datacapture/xgb-nutriscore-model-quality-monitor-2025-10-14-23-41-12/AllTraffic/2025/10/15/02',
    'LocalPath': '/opt/ml/processing/input_data/xgb-nutriscore-model-quality-monitor-2025-10-14-23-41-12/AllTraffic/2025/10/15/02',
    'S3DataType': 'S3Prefix',
    'S3InputMode': 'File',
    'S3DataDistributionType': 'FullyReplicated',
    'S3CompressionT

In [None]:
# View execution status
status = execution["MonitoringExecutionStatus"]

while status in ["Pending", "InProgress"]:
    print("Waiting for execution to finish", end="")
    latest_execution.wait(logs=False)
    latest_job = latest_execution.describe()
    print()
    print(f"{latest_job['ProcessingJobName']} job status:", latest_job["ProcessingJobStatus"])
    print(
        f"{latest_job['ProcessingJobName']} job exit message, if any:",
        latest_job.get("ExitMessage"),
    )
    print(
        f"{latest_job['ProcessingJobName']} job failure reason, if any:",
        latest_job.get("FailureReason"),
    )
    sleep(
        30
    )  # model quality executions consist of two Processing jobs, wait for second job to start
    latest_execution = nutriscore_model_quality_monitor.list_executions()[-1]
    execution = nutriscore_model_quality_monitor.describe_schedule()["LastMonitoringExecutionSummary"]
    status = execution["MonitoringExecutionStatus"]

print("Execution status is:", status)

if status != "Completed":
    print(execution)
    print(
        "====STOP==== \n No completed executions to inspect further. Please wait till an execution completes or investigate previously reported failures."
    )

Waiting for execution to finish........................................................!
groundtruth-merge-202510150300-6c3c6906f323caa235d647e6 job status: Completed
groundtruth-merge-202510150300-6c3c6906f323caa235d647e6 job exit message, if any: None
groundtruth-merge-202510150300-6c3c6906f323caa235d647e6 job failure reason, if any: None
Waiting for execution to finish....................................................!
model-quality-monitoring-202510150300-6c3c6906f323caa235d647e6 job status: Completed
model-quality-monitoring-202510150300-6c3c6906f323caa235d647e6 job exit message, if any: Completed: Job completed successfully with no violations.
model-quality-monitoring-202510150300-6c3c6906f323caa235d647e6 job failure reason, if any: None
Execution status is: Completed


In [95]:
# View generated report uri
latest_execution = nutriscore_model_quality_monitor.list_executions()[-1]
report_uri = latest_execution.describe()["ProcessingOutputConfig"]["Outputs"][0]["S3Output"][
    "S3Uri"
]
print("Report Uri:", report_uri)

Report Uri: s3://sagemaker-us-east-1-654654380268/sagemaker/FoodLens-ModelQualityMonitor-2025-10-14-23-41-04/baselining/results/xgb-nutriscore-model-quality-monitor-2025-10-14-23-41-12/nutriscore-monitoring-schedule-2025-10-15-01-59-01/2025/10/15/03


In [96]:
# View violations generated by monitoring schedule
pd.options.display.max_colwidth = None
violations = latest_execution.constraint_violations().body_dict["violations"]
violations_df = pd.json_normalize(violations)
violations_df.head(10)

## Create Quality CloudWatch Alarm

In [61]:
# Get thresholds from baseline constraints
rmse_threshold = regression_constraints.loc['rmse', 'threshold']
rmse_operator = regression_constraints.loc['rmse', 'comparison_operator']
print(f"RMSE Threshold from baseline constraints: {rmse_threshold}")
print(f"RMSE Comparison Operator from baseline constraints: {rmse_operator}")

RMSE Threshold from baseline constraints: 1.7633795545163906
RMSE Comparison Operator from baseline constraints: GreaterThanThreshold


In [62]:
# Create a CloudWatch Alarm for a regression metrics
print("Creating CloudWatch alarm for RMSE...")
alarm_name = "NUTRISCORE_MODEL_RMSE_DRIFT"
alarm_desc = "Trigger an alarm when the model's RMSE exceeds the baseline threshold."
cw_quality_dimensions = [
    {"Name": "Endpoint", "Value": endpoint_name},
    {"Name": "MonitoringSchedule", "Value": nutriscore_monitor_schedule_name},
]

cw_client.put_metric_alarm(
    AlarmName=alarm_name,
    AlarmDescription=alarm_desc,
    ActionsEnabled=True,
    MetricName="rmse",
    Namespace=namespace,
    Statistic="Average",
    Dimensions=cw_quality_dimensions,
    Period=3600, # Checks every hour
    EvaluationPeriods=1,
    DatapointsToAlarm=1,
    Threshold=rmse_threshold,
    ComparisonOperator=rmse_operator,
    TreatMissingData="breaching",
)

Creating CloudWatch alarm for RMSE...


{'ResponseMetadata': {'RequestId': '47dc7741-39fe-426b-a9c1-45e989dd4df2',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '47dc7741-39fe-426b-a9c1-45e989dd4df2',
   'content-type': 'text/xml',
   'content-length': '214',
   'date': 'Wed, 15 Oct 2025 01:26:55 GMT'},
  'RetryAttempts': 0}}

## Cleanup

In [14]:
# Delete monitoring schedules
nutriscore_model_quality_monitor.delete_monitoring_schedule()
data_quality_monitor.delete_monitoring_schedule()
predictor.delete_endpoint()

INFO:sagemaker:Deleting Monitoring Schedule with name: nutriscore-data-quality-schedule-2025-10-16-04-03-47
INFO:sagemaker.model_monitor.model_monitoring:Deleting Data Quality Job Definition with name: data-quality-job-definition-2025-10-16-04-03-48-027
INFO:sagemaker:Deleting endpoint configuration with name: xgb-nutriscore-model-quality-monitor-2025-10-16-03-30-33
INFO:sagemaker:Deleting endpoint with name: xgb-nutriscore-model-quality-monitor-2025-10-16-03-30-33
