# Amazon SageMaker Model - Bias Monitor


## Section 1 - Setup <a id='setup'></a>

#### 1.1 Import necessary libraries

In [7]:
%%time

from datetime import datetime, timedelta, timezone
import json
import os
import re
import boto3
from time import sleep
from threading import Thread

import pandas as pd

from sagemaker import get_execution_role, session, Session, image_uris
from sagemaker.s3 import S3Downloader, S3Uploader
from sagemaker.processing import ProcessingJob
from sagemaker.serializers import CSVSerializer

from sagemaker.model import Model
from sagemaker.model_monitor import (
    BiasAnalysisConfig,
    CronExpressionGenerator,
    DataCaptureConfig,
    EndpointInput,
    ExplainabilityAnalysisConfig,
    ModelBiasMonitor,
    ModelExplainabilityMonitor,
)

from sagemaker.clarify import (
    BiasConfig,
    DataConfig,
    ModelConfig,
    ModelPredictedLabelConfig,
    SHAPConfig,
)

CPU times: user 50 μs, sys: 5 μs, total: 55 μs
Wall time: 58.9 μs


#### 1.2 AWS region and IAM Role & Sagemaker Clients and Bucket Details

In [8]:
sagemaker_session = Session()
sagemaker_client = sagemaker_session.sagemaker_client
sagemaker_runtime_client = sagemaker_session.sagemaker_runtime_client

# Retrieve the default Amazon S3 bucket associated with the SageMaker session.
bucket = sagemaker_session.default_bucket()
print("Bucket:", bucket)

# Get the IAM role associated with the current SageMaker notebook or environment.
role = get_execution_role()
print("RoleArn:", role)

# Get the AWS region name for the current session.
region = boto3.Session().region_name
print("Region", region)

# Retrieve the AWS account ID of the caller using the Security Token Service (STS) client.
account_id = boto3.client("sts").get_caller_identity().get("Account")

# Create a Boto3 client for the SageMaker service, specifying the AWS region.
sm = boto3.Session().client(service_name="sagemaker", region_name=region)

file_key = "aai-540-group-3-final-project/data/eda/data.csv"

# Create an S3 client
s3 = boto3.client('s3')

Bucket: sagemaker-us-east-1-796598873577
RoleArn: arn:aws:iam::796598873577:role/LabRole
Region us-east-1


In [31]:
# Initializing variables for reproducibility
FILE_NAME="data.csv"
DATA_SOURCE="db_source"
DATA_FOLDER =f"s3://{bucket}/aai-540-group-3-final-project/data/"
FILE_LOCATION=f"{DATA_FOLDER}{FILE_NAME}"
DATA_PATH = f"{DATA_FOLDER}{DATA_SOURCE}/"
DATABASE = "retainAI"
PROD_DIR = f"s3://{bucket}/athena/prod"
STAGE_DIR = f"s3://{bucket}/athena/staging"
EMPLOYEE_TABLE = "employee_table"

# Making sure all variables are correct
print(f"File location with all the data: {FILE_LOCATION}")
print(f"Data Path for database creation: {DATA_PATH}")
print(f"Production and Staging Database Directories: {PROD_DIR},{STAGE_DIR}")
print(f"Database Name, Training Table and Testing Table: {DATABASE}, {EMPLOYEE_TABLE}")

File location with all the data: s3://sagemaker-us-east-1-796598873577/aai-540-group-3-final-project/data/data.csv
Data Path for database creation: s3://sagemaker-us-east-1-796598873577/aai-540-group-3-final-project/data/db_source/
Production and Staging Database Directories: s3://sagemaker-us-east-1-796598873577/athena/prod,s3://sagemaker-us-east-1-796598873577/athena/staging
Database Name, Training Table and Testing Table: retainAI, employee_table


In [32]:
# check what is in DATA_FOLDER
!aws s3 ls $DATA_FOLDER --recursive

2025-02-16 21:03:39    2616747 aai-540-group-3-final-project/data/db_source/remaining_data.csv
2025-02-16 21:03:43    9550295 aai-540-group-3-final-project/data/eda/data.csv
2025-02-16 21:03:41    1744315 aai-540-group-3-final-project/data/holdout/holdout.csv


In [38]:
# Download the file from S3 to a local file object
houldout_file_key = "aai-540-group-3-final-project/data/holdout/holdout.csv"
response = s3.get_object(Bucket=bucket, Key=houldout_file_key)

# Read the content of the file into a pandas DataFrame
data_df = pd.read_csv(response['Body'])

# Display the DataFrame
display(data_df)

all_headers = data_df.columns.to_list()
print(all_headers)

label_header = all_headers[len(all_headers) - 1]
print(label_header)

Unnamed: 0,Employee ID,Age,Gender,Years at Company,Job Role,Monthly Income,Work-Life Balance,Job Satisfaction,Performance Rating,Number of Promotions,...,Number of Dependents,Job Level,Company Size,Company Tenure,Remote Work,Leadership Opportunities,Innovation Opportunities,Company Reputation,Employee Recognition,Attrition
0,44132,38,1,23,1,10351,0,2,1,2,...,4,0,2,54,0,0,0,3,2,1
1,67355,22,1,13,2,8012,1,0,2,0,...,1,2,1,74,0,0,0,2,0,1
2,67290,40,0,32,4,6157,1,1,1,0,...,4,1,2,62,0,0,0,3,1,0
3,25581,33,0,8,2,9281,2,0,0,2,...,0,0,1,25,0,0,0,2,1,0
4,39986,57,1,22,3,6116,3,0,0,2,...,2,1,1,38,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29794,37472,32,1,13,2,8809,1,2,0,2,...,4,1,1,26,0,0,0,2,2,1
29795,16814,45,0,16,4,9907,2,0,0,0,...,1,0,2,79,1,0,0,3,2,0
29796,15541,28,1,3,3,5238,0,0,3,0,...,1,1,1,5,0,0,0,2,0,0
29797,56431,39,1,2,1,4814,2,2,0,2,...,2,1,1,74,0,1,0,2,1,0


['Employee ID', 'Age', 'Gender', 'Years at Company', 'Job Role', 'Monthly Income', 'Work-Life Balance', 'Job Satisfaction', 'Performance Rating', 'Number of Promotions', 'Overtime', 'Distance from Home', 'Education Level', 'Marital Status', 'Number of Dependents', 'Job Level', 'Company Size', 'Company Tenure', 'Remote Work', 'Leadership Opportunities', 'Innovation Opportunities', 'Company Reputation', 'Employee Recognition', 'Attrition']
Attrition


#### 1.3 S3 bucket and prefixes

In [9]:
# Bucket prefix to store details for the monitor
prefix = "sagemaker/clarify-bias-monitor"

# Other prefixes
data_capture_prefix = f"{prefix}/datacapture"
s3_capture_upload_path = f"s3://{bucket}/{data_capture_prefix}"

ground_truth_upload_path = (
    f"s3://{bucket}/{prefix}/ground_truth_data/{datetime.now():%Y-%m-%d-%H-%M-%S}"
)

reports_prefix = f"{prefix}/reports"
s3_report_path = f"s3://{bucket}/{reports_prefix}"

##Get the model monitor image
monitor_image_uri = image_uris.retrieve(framework="model-monitor", region=region)

print("Image URI:", monitor_image_uri)
print(f"Capture path: {s3_capture_upload_path}")
print(f"Ground truth path: {ground_truth_upload_path}")
print(f"Report path: {s3_report_path}")

Image URI: 156813124566.dkr.ecr.us-east-1.amazonaws.com/sagemaker-model-monitor-analyzer
Capture path: s3://sagemaker-us-east-1-796598873577/sagemaker/clarify-bias-monitor/datacapture
Ground truth path: s3://sagemaker-us-east-1-796598873577/sagemaker/clarify-bias-monitor/ground_truth_data/2025-02-16-22-01-43
Report path: s3://sagemaker-us-east-1-796598873577/sagemaker/clarify-bias-monitor/reports


#### 1.4 Test access to the S3 bucket
Let's quickly verify that the notebook has the right permissions to access the S3 bucket specified above.
Upload a simple test object into the S3 bucket.  If this command fails, the data capture and model monitoring capabilities will not work from this notebook.  You can fix this by updating the role associated with this notebook instance to have "s3:PutObject" permissions and try this validation again

In [13]:
! ls -la 

total 100
drwxr-xr-x 3 nobody users    6144 Feb 16 22:03 .
drwxr-xr-x 9 nobody users    6144 Feb 16 20:53 ..
drwxr-xr-x 2 nobody nogroup  6144 Feb 16 22:03 .ipynb_checkpoints
-rw-r--r-- 1 nobody nogroup     0 Feb 16 22:03 README.md
-rw-r--r-- 1 nobody users       0 Jan 26 21:38 __init__.py
-rw-r--r-- 1 nobody nogroup 81346 Feb 16 22:02 bias_monitor.ipynb


In [14]:
# Upload some test files
S3Uploader.upload("README.md", f"s3://{bucket}/test_upload")
print("Success! You are all set to proceed.")

Success! You are all set to proceed.


In [17]:
# Setup variables 

# TODO MODIFY MODEL ENTRY 
model_file = "xgboost_model.joblib"
# test_dataset = "test_data/test-dataset-input-cols.csv"
# validation_dataset = "test_data/validation-dataset-with-header.csv"
dataset_type = "text/csv"

## Section 2 - Deploy pre-trained model with data capture enabled <a id='deploy'></a>

In this section, you will upload the pretrained model to the S3 bucket, create an Amazon SageMaker Model, create an Amazon SageMaker real time endpoint, and enable data capture on the endpoint to capture endpoint invocations, predictions, and metadata.

#### 2.1 Upload the pre-trained model to S3

This code uploads a pre-trained XGBoost model that is ready for you to deploy. This model was trained using the XGB Churn Prediction Notebook in SageMaker. You can also use your own pre-trained model in this step. If you already have a pretrained model in Amazon S3, you can add it instead by specifying the s3_key.


In [24]:
import tarfile
import os

# Save model as TAR.GZ
# Path to the saved model
model_file = 'xgboost_model.joblib'

# Create a directory to store the .tar.gz file if it doesn't already exist
output_dir = 'model_archive'
os.makedirs(output_dir, exist_ok=True)

# Define the path for the .tar.gz file
tar_gz_file = f'{output_dir}/xgboost_model.tar.gz'

# Create the .tar.gz file
with tarfile.open(tar_gz_file, 'w:gz') as tar:
    tar.add(model_file, arcname=os.path.basename(model_file))

print(f'Model archived as: {tar_gz_file}')

Model archived as: model_archive/xgboost_model.tar.gz


In [25]:
## Upload the pretrained model to S3

model_file_archive = "model_archive/xgboost_model.tar.gz"
s3_key = f"s3://{bucket}/{prefix}"
model_url = S3Uploader.upload(model_file_archive, s3_key)
model_url

's3://sagemaker-us-east-1-796598873577/sagemaker/clarify-bias-monitor/xgboost_model.tar.gz'

#### 2.2 Create SageMaker Model entity

This step creates an Amazon SageMaker model from the  model file uploaded to S3.

In [27]:
# Define the pre-built XGBoost container URI for the region you are working in
image_uri = image_uris.retrieve("xgboost", region=region, version="1.0-1")

# Create a Model object
model = Model(
    image_uri=image_uri,  # Use the prebuilt XGBoost container image
    model_data=model_url,
    role=role
)

#### 2.3 Deploy the model with data capture enabled.
Next, deploy the SageMaker model on a specific instance with data capture enabled.

In [28]:
endpoint_name = f"xgb-attrition-model-bias-monitor-{datetime.utcnow():%Y-%m-%d-%H%M}"
print("EndpointName =", endpoint_name)

data_capture_config = DataCaptureConfig(
    enable_capture=True, sampling_percentage=100, destination_s3_uri=s3_capture_upload_path
)

model.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.large",
    endpoint_name=endpoint_name,
    data_capture_config=data_capture_config,
)

EndpointName = xgb-attrition-model-bias-monitor-2025-02-16-2231
------!

## 3 Setup Bias Monitor

In [39]:
model_bias_monitor = ModelBiasMonitor(
    role=role,
    sagemaker_session=sagemaker_session,
    max_runtime_in_seconds=1800,
)

model_bias_config = BiasConfig(
    label_values_or_threshold=[1],
    facet_name="Job Satisfaction", # sensitive feature to check for bias
    facet_values_or_threshold=[100],
)

model_bias_analysis_config = BiasAnalysisConfig(
    model_bias_config,
    headers=all_headers,
    label=label_header,
)

## 3 Setup Monitor Schedule

In [None]:
# every hour
schedule_expression = CronExpressionGenerator.hourly()


model_bias_monitor.create_monitoring_schedule(
    analysis_config=model_bias_analysis_config,
    output_s3_uri=s3_report_path,
    endpoint_input=EndpointInput(
        endpoint_name=endpoint_name,
        destination="/opt/ml/processing/input/endpoint",
        start_time_offset="-PT1H",
        end_time_offset="-PT0H",
        probability_threshold_attribute=0.8,
    ),
    ground_truth_input=ground_truth_upload_path,
    schedule_cron_expression=schedule_expression,
)
print(f"Model bias monitoring schedule: {model_bias_monitor.monitoring_schedule_name}")

In [None]:
# restart schedule if needed 

# model_bias_monitor.stop_monitoring_schedule()

# model_bias_monitor.start_monitoring_schedule()

## Check Results

## Clean up <a id='cleanup'></a>  

You can keep your endpoint running to continue capturing data. If you do not plan to collect more data or use this endpoint further, you should delete the endpoint to avoid incurring additional charges. Note that deleting your endpoint does not delete the data that was captured during the model invocations. That data persists in Amazon S3 until you delete it yourself.

But before that, you need to delete the schedule first.

In [None]:
from sagemaker.predictor import Predictor

predictor = Predictor(endpoint_name, sagemaker_session=sagemaker_session)
model_monitors = predictor.list_monitors()
for model_monitor in model_monitors:
    model_monitor.stop_monitoring_schedule()
    wait_for_execution_to_finish(model_monitor)
    model_monitor.delete_monitoring_schedule()

In [None]:
predictor.delete_endpoint()
predictor.delete_model()

## References

https://sagemaker-examples.readthedocs.io/en/latest/sagemaker_model_monitor/fairness_and_explainability/SageMaker-Model-Monitor-Fairness-and-Explainability.html