# Deploy the latest approved model as a real time endpoint


***This notebook works best with the `Data Science 3.0` kernel on an `ml.t3.medium` instance type***.

Run the [2_realtime_inference](./2_realtime_inference.ipynb) notebook prior to running this notebook. This notebook extracts the latest approved model from the model registry and deploys it as a real-time endpoint. It does so by running the following steps:

1. Extract the latest approved model from the SageMaker model registry.

1. Run inference against the deployed real-time endpoint to test it.

In [None]:
# Install the packages listed in requirements.txt using the pip module of the
# Python interpreter that is running this kernel (sys.executable).
import sys
!{sys.executable} -m pip install -r requirements.txt  --upgrade-strategy only-if-needed

In [None]:
## Import the boto3 and sagemaker libraries (plus standard-library helpers) needed to initialize the session
import os
import json
import time
import boto3
import logging
import tarfile
import tempfile
import sagemaker
import sagemaker.session
from datetime import datetime
from utils import load_config
from typing import Dict, List
from sagemaker.workflow.pipeline_context import PipelineSession

In [None]:
## configure logging once so that every message emitted while this notebook runs
## carries a timestamp, process id, source location and level
logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s] p%(process)s {%(filename)s:%(lineno)d} %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

### Load the `config.yml` file, which contains the settings used across this pipeline

In [None]:
## read the shared settings file, then log it pretty-printed as JSON for reference
config = load_config('config.yml')
config_as_json = json.dumps(config, indent=2)
logger.info(config_as_json)

In [None]:
## set up the sagemaker session, its region, and a pipeline session
session = sagemaker.session.Session()
region = session.boto_region_name
pipeline_session = PipelineSession()

## boto3 clients: "sagemaker" for model/endpoint management calls,
## "sagemaker-runtime" for invoking the deployed endpoint
sm = boto3.client("sagemaker")
smr = boto3.client('sagemaker-runtime')

## the caller identity supplies the account id that is substituted into the
## role ARN and bucket name templates from the config
ci = boto3.client('sts').get_caller_identity()
account_id = ci['Account']

## resolve the execution role ARN template and write the resolved value back into the config
role_name = config['aws']['sagemaker_execution_role_name']
role_arn_template = config['aws']['sagemaker_execution_role_arn']
config['aws']['sagemaker_execution_role_arn'] = role_arn_template.format(account_id=account_id, role=role_name)
role = config['aws']['sagemaker_execution_role_arn']

## bucket name (templated on account id and region) and key prefix for S3 artifacts
bucket = config['aws']['s3_bucket'].format(account_id=account_id, region=region)
prefix = config['aws']['s3_prefix']

logger.info(f"bucket={bucket}, prefix={prefix}, role={role}")

In [None]:
## path (taken from the config) of the inference code to package
inference_dir_path = config['scripts']['batch_inference']

## the archive is named after the last path component and is written to the system temp directory
archive_member_name = os.path.basename(inference_dir_path)
tmp_dir = tempfile.gettempdir()
output_filename = f"{archive_member_name}.tar.gz"
output_filepath = os.path.join(tmp_dir, output_filename)

## gzip-compress the file or directory; inside the archive it is stored under its base name
with tarfile.open(output_filepath, mode="w:gz") as tar:
    tar.add(inference_dir_path, arcname=archive_member_name)

print(f"Archive created at {output_filepath}")

## upload the archive to S3; the returned URI is handed to the model container when the model is created
## NOTE(review): no `bucket` argument is passed, so the session's default bucket is
## presumably used instead of the `bucket` value built from the config -- confirm this is intended
upload_key_prefix = prefix + "/inference/mlops"
compressed_inference_script_uri = session.upload_data(
    path=output_filepath,
    key_prefix=upload_key_prefix,
)

logger.info(f"Compressed inference script uploaded to: {compressed_inference_script_uri}")

### Next, get the latest approved model from the registry and deploy it as a real-time endpoint
---
Make sure the model has been approved in the model registry; the cells below look up the latest approved model and deploy it.

In [None]:
# Look up the scikit-learn container image; this image is what the model created
# later in this notebook is configured to run in
image_lookup_args = dict(
    framework="sklearn",
    py_version="py3",
    region=config['aws']['region'],
    version=config['training_step']['sklearn_framework_version'],
    instance_type=config['realtime_endpoint']['instance_type'],
)
image_uri = sagemaker.image_uris.retrieve(**image_lookup_args)
logger.info(f"image_uri={image_uri}")

In [None]:
# Find the most recently created model package in the configured model package
# group whose approval status is 'Approved'.

# Resolve the group name once so the query and the error message always refer
# to the same config entry.
model_group = config['register_model_step']['model_group']

# Pages are requested newest-first; the paginator fetches them lazily.
model_package_pages = sm.get_paginator('list_model_packages').paginate(
    ModelPackageGroupName=model_group,
    SortBy="CreationTime",
    SortOrder="Descending",
)

# next() stops at the first approved package, so no further pages are fetched
# once a match is found; None is returned when nothing is approved.
latest_approved_model_package_arn = next(
    (
        package["ModelPackageArn"]
        for page in model_package_pages
        for package in page["ModelPackageSummaryList"]
        if package['ModelApprovalStatus'] == 'Approved'
    ),
    None,
)

# Fail early with a clear message: the cells that follow cannot deploy anything without an approved package
if latest_approved_model_package_arn is None:
    raise Exception(f"No approved model package is found for {model_group} model package group")

# Log the latest approved model package ARN
logger.info(f"Latest approved model package ARN: {latest_approved_model_package_arn}")


### Get the latest approved model package data

In [None]:
## fetch the full description of the selected model package
latest_approved_model_package = sm.describe_model_package(
    ModelPackageName=latest_approved_model_package_arn,
)

## the model artifact location is read from the first container of the inference specification
inference_containers = latest_approved_model_package['InferenceSpecification']['Containers']
model_data_url = inference_containers[0]['ModelDataUrl']

logger.info(f"the model data url for the given approved model is -> {model_data_url}")

In [None]:
## UTC timestamp suffix appended to the model name; the endpoint config and
## endpoint names created further down reuse the same suffix
dttm_suffix = datetime.utcnow().strftime("%Y-%m-%d-%H-%M-%S")
model_name = config['register_model_step']['model_name'] + dttm_suffix
print("Model name : {}".format(model_name))

## environment variables passed to the container: the entry-point script name
## and the S3 URI of the compressed inference code uploaded earlier
container_environment = {
    'SAGEMAKER_PROGRAM': 'inference.py',
    'SAGEMAKER_SUBMIT_DIRECTORY': compressed_inference_script_uri,
}

## single container definition: image, model artifact and environment
primary_container = {
    'Image': image_uri,
    'ModelDataUrl': model_data_url,
    'Environment': container_environment,
}
container_list = [primary_container]

## register the model with SageMaker under the timestamped name
create_model_response = sm.create_model(
    ModelName=model_name,
    ExecutionRoleArn=role,
    Containers=container_list,
)

## log the identifiers of what was just created
for template, value in (
    ("Model arn : {}", create_model_response["ModelArn"]),
    ("Model data url : {}", model_data_url),
    ("Model image uri : {}", image_uri),
):
    logger.info(template.format(value))

#### Creating the endpoint config

In [None]:
## read the endpoint settings from the config; the config name gets the same
## timestamp suffix as the model
realtime_settings = config['realtime_endpoint']
endpoint_config_name = realtime_settings['endpoint_config_name'] + dttm_suffix
instance_type = realtime_settings['instance_type']
min_instances = realtime_settings['min_instance_count']
max_instances = realtime_settings['max_instance_count']

print(endpoint_config_name)

## managed instance scaling bounds -- change min/max in the config to adjust them
managed_instance_scaling = {
    "Status": "ENABLED",
    "MinInstanceCount": min_instances,
    "MaxInstanceCount": max_instances,
}

## a single variant that receives all traffic; it starts with the configured
## minimum number of instances
production_variant = {
    'VariantName': 'AllTraffic',
    'ModelName': model_name,
    'InstanceType': instance_type,
    'InitialInstanceCount': min_instances,
    'InitialVariantWeight': 1,
    "ManagedInstanceScaling": managed_instance_scaling,
}

create_endpoint_config_response = sm.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[production_variant],
)

logger.info(create_endpoint_config_response["EndpointConfigArn"])


### Run the cell below if you want to update your endpoint config

In [None]:
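## Uncomment the call below to update an existing endpoint so that it uses the endpoint config created above.
## `endpoint_name` must already be set to the name of the endpoint you want to update.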

# response = sm.update_endpoint(
#     EndpointName=endpoint_name,
#     EndpointConfigName=endpoint_config_name
# )

# print(response)

## Now finally, deploying this as a real time endpoint
---

Now, we finally deploy the latest approved model as a real time endpoint, for running inference on it

In [None]:
## the endpoint name gets the same timestamp suffix as the model and endpoint config
endpoint_name = config['realtime_endpoint']['endpoint_name'] + dttm_suffix
logger.info("EndpointName={}".format(endpoint_name))

## start the (asynchronous) endpoint creation from the endpoint config created earlier
create_endpoint_response = sm.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name)
logger.info(f"Going to deploy the real time endpoint -> {create_endpoint_response['EndpointArn']}")

# Poll describe_endpoint until the endpoint leaves the "Creating" state.
describe_endpoint_response = sm.describe_endpoint(EndpointName=endpoint_name)

while describe_endpoint_response["EndpointStatus"] == "Creating":
    # Sleep BEFORE re-checking so no extra wait happens once creation has finished.
    time.sleep(15)
    describe_endpoint_response = sm.describe_endpoint(EndpointName=endpoint_name)
    print(describe_endpoint_response["EndpointStatus"])

logger.info(describe_endpoint_response)

# Anything other than InService means the endpoint cannot serve requests; stop
# here with the reported reason instead of failing later when invoking it.
if describe_endpoint_response["EndpointStatus"] != "InService":
    raise RuntimeError(
        f"Endpoint {endpoint_name} ended in status "
        f"{describe_endpoint_response['EndpointStatus']}: "
        f"{describe_endpoint_response.get('FailureReason', 'no failure reason reported')}"
    )

In [None]:
## Smoke-test the newly deployed endpoint with a small hard-coded CSV payload
## (a header line followed by two rows)

## TODO: create this from the config param.
body_str = "total_extended_price,avg_discount,total_quantity\n1,2,3\n66.77,12,2"
request_body = body_str.encode('utf-8')

response = smr.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType='text/csv',
    Body=request_body,
)

## read the response body and decode it to text; the bare name on the last line
## makes the notebook display the result
response_body_bytes = response["Body"].read()
response_str = response_body_bytes.decode()
response_str