In [1]:
# Importing the necessary library
import boto3
import sagemaker
import pandas as pd
from joblib import dump, load
import s3fs


In [2]:
# Initialising new sagemaker session as "sess".
sess = sagemaker.Session()
# Bucket variable is used for storing the location of the bucket
bucket = 'sagemaker-studio-009676737623-l4vs7j0o0ib'
# Assigning the prefix variable 
prefix = 'mlops-level1-data' 
# Check for necessary permission needed for training and deploying models. 
role = sagemaker.get_execution_role()
# To understand where this session is configured to operate.
region = boto3.Session().region_name


In [4]:
v2_df = pd.read_parquet('s3://sagemaker-studio-009676737623-l4vs7j0o0ib/mlops-level1-data/data_drift_check_data/v2.gzip')

In [5]:
## Get Features
fs = s3fs.S3FileSystem() # Updated method name
filename = f's3://{bucket}/{prefix}/feature/feature.joblib'
with fs.open(filename, encoding='utf8') as fh:
    cols = load(fh)

## Get Encoder object
filename = f's3://{bucket}/{prefix}/feature/encoder.joblib'
with fs.open(filename, encoding='utf8') as fh:
    encoder = load(fh)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [7]:
feature_v2_data = v2_df[cols].sample(10000)

In [21]:
feature_v2_data.shape

(10000, 13)

In [8]:
feature_v2_data.isna().sum()

tGravityAcc-energy()-X       0
tGravityAcc-min()-X          0
tGravityAcc-max()-X          0
tGravityAcc-max()-Y          0
tGravityAcc-min()-Y          0
angle(X,gravityMean)         0
tGravityAcc-mean()-X         0
tGravityAcc-mean()-Y         0
tBodyAccMag-max()            0
tBodyGyroJerk-entropy()-X    0
tGravityAcc-energy()-Y       0
angle(Y,gravityMean)         0
Activity                     0
dtype: int64

In [9]:
v2_test_x = feature_v2_data.drop('Activity', axis = 1)
v2_test_y= feature_v2_data[['Activity']]
v2_test_x.shape, v2_test_y.shape

((10000, 12), (10000, 1))

In [10]:
test_x_path = "s3://sagemaker-studio-009676737623-l4vs7j0o0ib/mlops-level1-data/data_drift_check_data/v2/v2_test.csv"
v2_test_x.to_csv(test_x_path,index = False, header=False)

### Model Prediction

In [12]:
data_capture_prefix = "{}/datacapture".format(prefix)
s3_capture_upload_path = "s3://{}/{}".format(bucket, data_capture_prefix)
reports_prefix = "{}/reports".format(prefix)
s3_report_path = "s3://{}/{}".format(bucket, reports_prefix)

print("Capture path: {}".format(s3_capture_upload_path))
print("Report path: {}".format(s3_report_path))

Capture path: s3://sagemaker-studio-009676737623-l4vs7j0o0ib/mlops-level1-data/datacapture
Report path: s3://sagemaker-studio-009676737623-l4vs7j0o0ib/mlops-level1-data/reports


In [14]:
from sagemaker.inputs import BatchDataCaptureConfig
from sagemaker.transformer import Transformer
final_model = 'rf-scikit-2023-09-19-06-34-16-121' ## Update Inference Endpoint

sklearn_transformer = Transformer(
    model_name=final_model,
    instance_count=1,
    instance_type='ml.m5.large'
)

sklearn_transformer.transform(
    data=test_x_path,
    data_type='S3Prefix',
    content_type='text/csv',
    batch_data_capture_config=BatchDataCaptureConfig(
        destination_s3_uri=s3_capture_upload_path,
    ),
)

INFO:sagemaker:Creating transform job with name: sagemaker-scikit-learn-2023-09-23-07-24-08-146


..........................[34m2023-09-23 07:28:25,713 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2023-09-23 07:28:25,716 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2023-09-23 07:28:25,717 INFO - sagemaker-containers - nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;[0m
[34mworker_rlimit_nofile 4096;[0m
[34mevents {
  worker_connections 2048;[0m
[34m}[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;
  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }
  server {
    listen 8080 deferred;
    client_max_body_size 0;
    keepalive_timeout 3;
    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
      proxy_set_header Host $http_host;
      proxy_redirect off;
    

In [15]:
!aws s3 ls {s3_capture_upload_path}/input/ --recursive


2023-09-21 17:27:17        103 mlops-level1-data/datacapture/input/2023/09/21/17/4cd90f84-650d-4233-99c5-83ffb84d8364.json
2023-09-23 07:28:39        119 mlops-level1-data/datacapture/input/2023/09/23/07/1431e7d4-96b3-4767-a45d-5c6204bf5280.json


In [16]:
s3 = boto3.client("s3")

captured_input_s3_key = [
    k["Key"]
    for k in s3.list_objects_v2(Bucket=bucket, Prefix=f"{data_capture_prefix}/input/")["Contents"]
]
assert len(captured_input_s3_key) > 0


INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


In [17]:
import json
sample_input_body = s3.get_object(Bucket=bucket, Key=captured_input_s3_key[0])["Body"]
sample_input_content = json.loads(sample_input_body.read())


In [18]:
sample_input_content

[{'prefix': 's3://sagemaker-studio-009676737623-l4vs7j0o0ib/mlops-level1-data/baseline/data/v1.csv'},
 '']

In [19]:
!aws s3 ls {s3_capture_upload_path}/output/ --recursive


2023-09-21 17:27:17        114 mlops-level1-data/datacapture/output/2023/09/21/17/b66aed65-49f5-4a4d-a6ee-243ccc0860a9.json
2023-09-23 07:28:39        119 mlops-level1-data/datacapture/output/2023/09/23/07/571f3b71-0d2a-46d5-922f-8fdfa070f008.json


In [20]:
captured_input_s3_key = [
    k["Key"]
    for k in s3.list_objects_v2(Bucket=bucket, Prefix=f"{data_capture_prefix}/output/")["Contents"]
]
assert len(captured_input_s3_key) > 0
sample_output_body = s3.get_object(Bucket=bucket, Key=captured_input_s3_key[0])["Body"]
sample_output_content = json.loads(sample_output_body.read())


In [21]:
sample_output_content


[{'prefix': 's3://sagemaker-ap-south-1-009676737623/sagemaker-scikit-learn-2023-09-21-17-23-05-878/'},
 'v1.csv.out']

## Monitor Schedule

In [24]:
# copy over the training dataset to Amazon S3 (if you already have it in Amazon S3, you could reuse it)
baseline_prefix = prefix + "/baseline"
baseline_data_prefix = baseline_prefix + "/data"
baseline_results_prefix = baseline_prefix + "/results"

baseline_data_uri = "s3://{}/{}".format(bucket, baseline_data_prefix)
baseline_results_uri = "s3://{}/{}".format(bucket, baseline_results_prefix)
print("Baseline data uri: {}".format(baseline_data_uri))
print("Baseline results uri: {}".format(baseline_results_uri))

Baseline data uri: s3://sagemaker-studio-009676737623-l4vs7j0o0ib/mlops-level1-data/baseline/data
Baseline results uri: s3://sagemaker-studio-009676737623-l4vs7j0o0ib/mlops-level1-data/baseline/results


In [25]:
from sagemaker.model_monitor import CronExpressionGenerator
from sagemaker.model_monitor import BatchTransformInput
from sagemaker.model_monitor import MonitoringDatasetFormat
from time import gmtime, strftime

statistics_path = "{}/statistics.json".format(baseline_results_uri)
constraints_path = "{}/constraints.json".format(baseline_results_uri)

mon_schedule_name = "DEMO-mlops1-model-monitor-schedule-" + strftime(
    "%Y-%m-%d-%H-%M-%S", gmtime()
)
my_default_monitor.create_monitoring_schedule(
    monitor_schedule_name=mon_schedule_name,
    batch_transform_input=BatchTransformInput(
        data_captured_destination_s3_uri=s3_capture_upload_path,
        destination="/opt/ml/processing/input",
        dataset_format=MonitoringDatasetFormat.csv(header=False),
    ),
    output_s3_uri=s3_report_path,
    statistics=statistics_path,
    constraints=constraints_path,
    schedule_cron_expression=CronExpressionGenerator.hourly(),
    enable_cloudwatch_metrics=True,
)


NameError: name 'my_default_monitor' is not defined

In [22]:
desc_schedule_result = my_default_monitor.describe_schedule()
print("Schedule status: {}".format(desc_schedule_result["MonitoringScheduleStatus"]))



NameError: name 'my_default_monitor' is not defined

In [63]:
import time

mon_executions = my_default_monitor.list_executions()
print(
    "We created a hourly schedule above and it will kick off executions ON the hour (plus 0 - 20 min buffer.\nWe will have to wait till we hit the hour..."
)

while len(mon_executions) == 0:
    print("Waiting for the 1st execution to happen...")
    time.sleep(60)
    mon_executions = my_default_monitor.list_executions()

We created a hourly schedule above and it will kick off executions ON the hour (plus 0 - 20 min buffer.
We will have to wait till we hit the hour...


In [64]:
latest_execution = mon_executions[
    -1
]  # latest execution's index is -1, second to last is -2 and so on..
# time.sleep(60)
latest_execution.wait(logs=False)

print("Latest execution status: {}".format(latest_execution.describe()["ProcessingJobStatus"]))
print("Latest execution result: {}".format(latest_execution.describe()["ExitMessage"]))

latest_job = latest_execution.describe()
if latest_job["ProcessingJobStatus"] != "Completed":
    print(
        "====STOP==== \n No completed executions to inspect further. Please wait till an execution completes or investigate previously reported failures."
    )

!Latest execution status: Completed
Latest execution result: CompletedWithViolations: Job completed successfully with 1 violations.


In [66]:
report_uri = latest_execution.output.destination
print("Report Uri: {}".format(report_uri))

Report Uri: s3://sagemaker-studio-009676737623-l4vs7j0o0ib/mlops-level1-data/reports/DEMO-xgb-churn-pred-model-monitor-schedule-2023-09-21-17-54-46/2023/09/21/18


In [67]:
from urllib.parse import urlparse

s3uri = urlparse(report_uri)
report_bucket = s3uri.netloc
report_key = s3uri.path.lstrip("/")
print("Report bucket: {}".format(report_bucket))
print("Report key: {}".format(report_key))

s3_client = boto3.Session().client("s3")
result = s3_client.list_objects(Bucket=report_bucket, Prefix=report_key)
report_files = [report_file.get("Key") for report_file in result.get("Contents")]
print("Found Report Files:")
print("\n ".join(report_files))

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


Report bucket: sagemaker-studio-009676737623-l4vs7j0o0ib
Report key: mlops-level1-data/reports/DEMO-xgb-churn-pred-model-monitor-schedule-2023-09-21-17-54-46/2023/09/21/18
Found Report Files:
mlops-level1-data/reports/DEMO-xgb-churn-pred-model-monitor-schedule-2023-09-21-17-54-46/2023/09/21/18/constraint_violations.json


In [70]:
violations = my_default_monitor.latest_monitoring_constraint_violations()
#pd.set_option("display.max_colwidth", -1)
constraints_df = json_normalize(violations.body_dict["violations"])
constraints_df.head(10)

Unnamed: 0,feature_name,constraint_check_type,description
0,Missing columns,missing_column_check,There are missing columns in current dataset. ...
