# Batch Pipeline Notebook

This notebook exercises the drift detection MLOps batch pipeline.

In [None]:
%%capture
# Upgrade pandas; the %%capture magic above keeps the installer output out of the notebook.
# NOTE(review): consider `%pip install` instead of `!pip install` so the package
# is installed into the running kernel's environment — confirm for this image.
!pip install -U pandas

## Setup

👇 Set the project name for your drift pipeline

In [None]:
# Project name used below to look up the project id and to derive pipeline and stack names.
project_name = "<<project_name>>"  # << Update this drift detection project

Get back the project id and region

In [None]:
import sagemaker
import json

# Create a SageMaker session; its region and API client are reused by later cells.
sess = sagemaker.session.Session()
# Read the region through the public `boto_region_name` property instead of the
# private `_region_name` attribute, which is an implementation detail of Session.
region_name = sess.boto_region_name
sm_client = sess.sagemaker_client

# Resolve the project id from the project name.
project_id = sm_client.describe_project(ProjectName=project_name)["ProjectId"]
# The artifact bucket name is composed from the project id and the region.
artifact_bucket = f"sagemaker-project-{project_id}-{region_name}"

print(f"Project: {project_name} ({project_id})")

## Data Prep

Download the test dataset output from the pre-processing job in our build pipeline.

In [None]:
import boto3
import pandas as pd
import random
from sagemaker.s3 import S3Downloader, S3Uploader


def get_latest_processed_data(pipeline_name, step_name, output_name, client=None):
    """Return the S3 URI of a named output of a processing step in the latest pipeline run.

    Args:
        pipeline_name: Name of the SageMaker pipeline to inspect.
        step_name: Name of the pipeline step whose processing job is looked up.
        output_name: ``OutputName`` of the processing output to return.
        client: Optional SageMaker API client. Defaults to the notebook-level
            ``sm_client`` when omitted.

    Returns:
        The ``S3Uri`` string of the matching processing output.

    Raises:
        ValueError: If the pipeline has no executions, the step is not found (or
            carries no processing job metadata), or the output name is not
            present on the processing job.
    """
    if client is None:
        client = sm_client

    # Request newest-first ordering explicitly so index 0 is the latest execution
    # rather than relying on the service's default sort order.
    executions = client.list_pipeline_executions(
        PipelineName=pipeline_name,
        SortBy="CreationTime",
        SortOrder="Descending",
    )["PipelineExecutionSummaries"]
    if not executions:
        raise ValueError(f"Pipeline {pipeline_name!r} has no executions")
    execution_arn = executions[0]["PipelineExecutionArn"]

    steps = client.list_pipeline_execution_steps(
        PipelineExecutionArn=execution_arn, SortOrder="Ascending"
    )["PipelineExecutionSteps"]

    # Take the first step with the requested name, as the original lookup did.
    job_arn = None
    for item in steps:
        if item["StepName"] == step_name:
            job_arn = item.get("Metadata", {}).get("ProcessingJob", {}).get("Arn")
            break
    if job_arn is None:
        raise ValueError(
            f"No processing job found for step {step_name!r} "
            f"in the latest execution of pipeline {pipeline_name!r}"
        )

    # The job name is the segment after the "/" in the processing job ARN.
    job_outputs = client.describe_processing_job(
        ProcessingJobName=job_arn.rsplit("/", 1)[-1]
    )["ProcessingOutputConfig"]["Outputs"]
    for item in job_outputs:
        if item["OutputName"] == output_name:
            return item["S3Output"]["S3Uri"]
    raise ValueError(
        f"Output {output_name!r} not found on the processing job for step {step_name!r}"
    )


# Find the "test" output of the PreprocessData step in the latest build pipeline
# run and download it into the local "preprocessed" folder.
pipeline_name = f"{project_name}-build"
test_uri = get_latest_processed_data(
    pipeline_name=pipeline_name, step_name="PreprocessData", output_name="test"
)
S3Downloader().download(test_uri, "preprocessed")

# Read the downloaded test CSV into a dataframe and preview it.
test_df = pd.read_csv("preprocessed/test.csv")
print(test_df.shape)
test_df.head()

Upload the test dataset to the batch staging input location.

In [None]:
# Upload the local test CSV under the project's batch/staging prefix in the artifact bucket.
batch_staging_uri = f"s3://{artifact_bucket}/{project_id}/batch/staging"
S3Uploader().upload("preprocessed/test.csv", batch_staging_uri);

## Test Staging

Now let's start the batch staging pipeline

In [None]:
from sagemaker.workflow.pipeline import Pipeline

# Bind to the staging batch pipeline by name.
pipeline_name = f"{project_name}-batch-staging"
pipeline = Pipeline(pipeline_name)

# Launch a run; the execution name is the last path segment of its ARN.
execution = pipeline.start()
execution_name = execution.arn.rsplit("/", 1)[-1]

print(f"Waiting for execution: {execution_name} for pipeline {pipeline_name}...")
execution.wait()

# Read the status only after wait() has returned.
execution_details = execution.describe()
execution_status = execution_details["PipelineExecutionStatus"]
print(f"Status: {execution_status}")

Once this has completed, download the batch scoring results

In [None]:
# Locate the "scores" output of the ScoreModel step in the latest staging run
# and download it into the local "staging" folder.
staging_scores_uri = get_latest_processed_data(
    pipeline_name=pipeline_name, step_name="ScoreModel", output_name="scores"
)
S3Downloader().download(staging_scores_uri, "staging")

# Read the downloaded scores CSV into a dataframe and preview it.
pred_df = pd.read_csv("staging/scores.csv")
pred_df.head()

### Evaluate

Calculate the root mean square error (RMSE) to evaluate the performance of this model.

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# Compare the fares in the test set against the predicted fares in the scores file.
# NOTE(review): assumes test_df and pred_df rows are in the same order — confirm.
mse = mean_squared_error(test_df["fare_amount"], pred_df["fare_amount_prediction"])
# RMSE is the square root of the mean squared error.
rmse = sqrt(mse)
print(f"RMSE: {rmse}")

Plot the residuals to see where the errors are relative to the fare amount.

In [None]:
import seaborn as sns

# Residual plot of the prediction column against the fare column, with a LOWESS curve.
# NOTE(review): this reads "fare_amount" from pred_df, whereas the RMSE cell reads
# it from test_df — confirm the scores file carries that column.
sns.residplot(
    data=pred_df,
    x="fare_amount",
    y="fare_amount_prediction",
    lowess=True,
);

### Approve Staging

🛑 Click the link below to head over to the AWS Code Pipeline and approve the staging batch scoring to kick off the production batch scoring

In [None]:
from IPython.core.display import HTML

# Build the console URL separately so the anchor markup stays readable.
codepipeline_url = (
    f"https://{region_name}.console.aws.amazon.com/codesuite/codepipeline/pipelines/"
    f"sagemaker-{project_name}-batch/view?region={region_name}"
)
HTML(
    f'Open <a target="_blank" href="{codepipeline_url}">Code Pipeline</a> in a new window'
)

### Test Production

Before we test production, let's tweak some of the columns to change the distribution of the data.

In [None]:
# Use a dedicated, seeded generator so that re-running this cell (or the whole
# notebook) produces the same tweaked dataset, without touching the global
# `random` module state.
rng = random.Random(42)
row_count = test_df.shape[0]

# Overwrite passenger_count with a weighted draw from 1..6, one value per row.
test_df["passenger_count"] = rng.choices(
    [1, 2, 3, 4, 5, 6], weights=[2, 1, 2, 5, 2, 1], k=row_count
)
# Overwrite geo_distance with 70 * Beta(2.5, 2) samples, one per row. The draw
# does not depend on any existing column, so no per-row apply is needed.
test_df["geo_distance"] = [70 * rng.betavariate(2.5, 2) for _ in range(row_count)]

test_df.to_csv("preprocessed/tweaked.csv", header=True, index=False)

Upload the tweaked dataset to the production input location

In [None]:
# Upload the tweaked CSV under the project's batch/prod prefix in the artifact bucket.
batch_prod_uri = f"s3://{artifact_bucket}/{project_id}/batch/prod"
S3Uploader().upload("preprocessed/tweaked.csv", batch_prod_uri);

After a few minutes our production batch pipeline will be ready for scoring.   

Start the production batch pipeline and wait for it to finish.

In [None]:
# Bind to the production batch pipeline by name.
pipeline_name = f"{project_name}-batch-prod"
pipeline = Pipeline(pipeline_name)

# Launch a run; the execution name is the last path segment of its ARN.
execution = pipeline.start()
execution_name = execution.arn.rsplit("/", 1)[-1]

print(f"Waiting for execution: {execution_name} for pipeline {pipeline_name}...")
execution.wait()

# Read the status only after wait() has returned.
execution_details = execution.describe()
execution_status = execution_details["PipelineExecutionStatus"]
print(f"Status: {execution_status}")

Let's list the steps, and we will see that the last step was the `EvaluateDrift` Lambda function

In [None]:
# Print the name and status of every step in the execution started above.
for step in execution.list_steps():
    step_label, step_state = step["StepName"], step["StepStatus"]
    print("Step: {}, Status: {}".format(step_label, step_state))

## Monitor

Let's list the files produced by the Model Monitor job

In [None]:
# Locate the "monitoring_output" output of the ModelMonitor step in the latest
# run of the current pipeline.
monitor_uri = get_latest_processed_data(
    pipeline_name=pipeline_name,
    step_name="ModelMonitor",
    output_name="monitoring_output",
)

print("Downloading monitor files:")
monitor_file_uris = S3Downloader.list(monitor_uri)
for s3_uri in monitor_file_uris:
    # Show only the file name, i.e. the last path segment of the URI.
    file_name = s3_uri.split("/")[-1]
    print(file_name)

# Copy everything under the monitor URI into the local "monitor" folder.
S3Downloader().download(monitor_uri, "monitor")

If the job has produced a `constraint_violations.json` file, let's output its contents

In [None]:
import json
import os

# `violations` stays None unless the downloaded monitor output contains a violations report.
violations = None
monitor_files = os.listdir("monitor")
if "constraint_violations.json" not in monitor_files:
    print("No violations")
else:
    with open("monitor/constraint_violations.json", "r") as f:
        violations_report = json.load(f)
    violations = violations_report["violations"]

violations

## Retrain

The `EvaluateDrift` Lambda will read the contents of `constraint_violations.json` and will publish Amazon [CloudWatch Metrics](https://docs.aws.amazon.com/sagemaker/latest/dg/model-monitor-interpreting-cloudwatch.html).  

If drift is detected above the threshold for the target metric, then an Amazon CloudWatch alarm will fire, resulting in the SageMaker pipeline being re-trained.

To see the CloudWatch metric Alarm click on the link below.

In [None]:
# The alarm name is derived from the current pipeline name.
alarm_name = f"sagemaker-{pipeline_name}-threshold"

# Build the console URL separately so the anchor markup stays readable.
alarm_url = (
    f"https://{region_name}.console.aws.amazon.com/cloudwatch/home"
    f"?region={region_name}#alarmsV2:alarm/{alarm_name}"
)
HTML(f'Open <a target="_blank" href="{alarm_url}">CloudWatch Alarm</a> in new window')

This will result in a new SageMaker pipeline execution starting.

In [None]:
from datetime import datetime, timedelta
from dateutil.tz import tzlocal

pipeline_name = f"{project_name}-build"

# Request newest-first ordering explicitly so that index 0 really is the latest
# execution, instead of depending on the service's default sort order.
latest_pipeline_execution = sm_client.list_pipeline_executions(
    PipelineName=pipeline_name,
    SortBy="CreationTime",
    SortOrder="Descending",
)["PipelineExecutionSummaries"][0]
latest_execution_status = latest_pipeline_execution["PipelineExecutionStatus"]
# Elapsed time since the execution started, computed with a timezone-aware "now".
time_ago = datetime.now(tzlocal()) - latest_pipeline_execution["StartTime"]

print(
    f"Latest pipeline: {pipeline_name} execution: {latest_execution_status} started {time_ago.total_seconds()/60:0.2f} mins ago"
)

We can verify that this was triggered by Drift by inspecting the InputSource:

In [None]:
# Fetch the parameters the latest execution was started with.
latest_execution_arn = latest_pipeline_execution["PipelineExecutionArn"]
params = sm_client.list_pipeline_parameters_for_execution(
    PipelineExecutionArn=latest_execution_arn,
)

# Keep the values of every parameter named "InputSource" and take the first one.
input_source_values = [
    param["Value"]
    for param in params["PipelineParameters"]
    if param["Name"] == "InputSource"
]
input_source = input_source_values[0]
print(f"Pipeline execution started with InputSource: {input_source}")

And let's list the steps of that execution.  

In [None]:
# List the steps of the latest execution and print each step's name and status.
steps_response = sm_client.list_pipeline_execution_steps(
    PipelineExecutionArn=latest_pipeline_execution["PipelineExecutionArn"],
)
execution_steps = steps_response["PipelineExecutionSteps"]
for step in execution_steps:
    print("Step: {}, Status: {}".format(step["StepName"], step["StepStatus"]))

✅ Great now you have completed all the steps.

## Clean up

Execute the following cell to delete the CloudFormation stacks

1. SageMaker batch prod pipeline
2. SageMaker batch staging pipeline

In [None]:
import boto3

cfn = boto3.client("cloudformation")

# Stacks to remove, in deletion order: prod first, then staging.
stack_names = (
    f"sagemaker-{project_name}-batch-prod",
    f"sagemaker-{project_name}-batch-staging",
)
delete_waiter = cfn.get_waiter("stack_delete_complete")

for stack_name in stack_names:
    print("Deleting stack: {}".format(stack_name))
    cfn.delete_stack(StackName=stack_name)
    # Wait on this stack's deletion before requesting the next one.
    delete_waiter.wait(StackName=stack_name)