## Setup

### Import libraries

In [None]:
import boto3
from botocore.exceptions import ClientError
import datarobot as dr
from datetime import datetime
import json
import logging
import os.path
import pandas as pd
import requests
import sys
import tarfile
import time

### Set some logging parameters

In [None]:
# Configure formatting for logging.
# The root logger is used so that messages from every library in the notebook
# share one format. The stdout handler is tagged with a name so that re-running
# this cell reuses it instead of stacking a second handler (which would print
# every log line twice).
log = logging.getLogger()
log.setLevel(logging.INFO)

formatter = logging.Formatter('[%(asctime)s][%(name)s][%(levelname)s]: %(message)s')

handler = next(
    (h for h in log.handlers if h.get_name() == "notebook-stdout"),
    None,
)
if handler is None:
    handler = logging.StreamHandler(sys.stdout)
    handler.set_name("notebook-stdout")
    log.addHandler(handler)

# The handler itself lets everything through; the logger level above is the filter.
handler.setLevel(logging.DEBUG)
handler.setFormatter(formatter)

### Variable configuration
In this cell, define all of the variables and access tokens.
  
#### DataRobot
`dr_host`: The DataRobot cluster you are connecting to. Please ensure that the `/api/v2` part of the URL is still in the string.  
`api_key`: The API key of the DataRobot user used to interact with the platform.
  
#### AWS
`aws_region`: The AWS region that everything will be deployed to.  
`aws_access_key_id`: AWS Access Key for authentication.  
`aws_secret_access_key`: AWS Secret Access Key for authentication.  
`aws_session_token`: AWS Session Token for authentication.  
  
`s3_bucket`: The name of the S3 bucket that will be created for uploading your Scoring Code JAR file into.  
`aws_ecr_repo`: The name of the ECR repo into which you upload your runtime docker image.  
`sagemaker_execution_role_name`: The name of the IAM Role that will be created to allow SageMaker to interact with S3 and other SageMaker services.  

In [None]:
# --- DataRobot settings ---

# API endpoint of the DataRobot cluster (keep the "/api/v2" suffix)
dr_host = "https://app.datarobot.com/api/v2"
# API key of the DataRobot user that drives this notebook
api_key = "<API_TOKEN>"


# --- AWS settings ---

# Region in which the AWS resources are created
aws_region = "us-east-1"

# Credentials passed to the boto3 session and to the docker push cell
aws_access_key_id = ""
aws_secret_access_key = ""
aws_session_token = ""

# Names of the S3 bucket, ECR repository, and IAM role used by the notebook
s3_bucket = "<YOUR_S3_BUCKET_NAME>"
aws_ecr_repo = "<YOUR_ECR_REPO_NAME>"
sagemaker_execution_role_name = "AmazonSageMaker-ExecutionRole-Demo"

### Connect to DataRobot

In [None]:
# Build the DataRobot API client from the endpoint and key configured above.
client = dr.Client(
    endpoint=dr_host,
    token=api_key,
    # Optional but helps DataRobot improve this workflow
    user_agent_suffix='AIA-E2E-AWS-7',
)

# Store the client on the datarobot module's private `_global_client` slot
# (presumably what later `dr.*` calls pick up -- confirm against the installed client version).
dr.client._global_client = client

# The `config_path` should only be specified if the config file is not in the default location described in the API Quickstart guide
# dr.Client(config_path = 'path-to-drconfig.yaml')

Read more about different options for [connecting to DataRobot from the client](https://docs.datarobot.com/en/docs/api/api-quickstart/api-qs.html).

## Modeling

This section of the notebook focuses on the steps for creating and exporting an ML model developed within DataRobot.

### Create a project and initiate Autopilot

In the following snippet you will upload your training data to DataRobot. This example uses a dataset of Lending Club loans to predict if a loan will default or not.

This example sets an advanced option for the project to include only models that are compatible with Scoring Code Export. Java Scoring Code can be downloaded as a binary file or compiled, and contains all of the data transformations, feature engineering, and final model parameters from the DataRobot model. Since the data and feature engineering pipeline are completely contained in the portable JAR file, predictions can be made outside of DataRobot, as long as the scoring data is in the same format as the training data. More information can be [found in the DataRobot documentation](https://docs.datarobot.com/en/docs/predictions/port-pred/scoring-code/index.html#model-support).

Next, you will initiate Autopilot to build models.

If you already have a model that you want to deploy, then this part can be skipped, but you must manually define the project and model ID below to continue using the notebook.

In [None]:
# Load the training data, create a DataRobot project from it, start Autopilot,
# and block until Autopilot has finished.
df = pd.read_csv('training_data/10K_Lending_Club_Loans.csv')

# Restrict the project to Scoring Code-compatible models, skip blenders, and
# have the recommended model prepared for deployment.
advanced_options = dr.AdvancedOptions(
    prepare_model_for_deployment=True,
    scoring_code_only=True,
    blend_best_models=False,
)

# The timestamp keeps project names from repeated runs distinguishable.
project_name = "DR_Demo_Sagemaker_{}".format(datetime.now().strftime('%Y-%m-%d %H:%M'))
project = dr.Project.create(sourcedata=df, project_name=project_name)

project.analyze_and_model(
    target='is_bad',
    worker_count=-1,
    advanced_options=advanced_options,
)
project.wait_for_autopilot(verbosity=1)

### Get our Project ID and the ID of the top model in the leaderboard for export.

In [None]:
# Capture the identifiers needed for the Scoring Code export: the project's ID
# and the ID of the model returned by `project.get_top_model()`.
project_id, top_model = project.id, project.get_top_model()
model_id = top_model.id

log.info("Project ID: {} | Model ID: {}".format(project_id, model_id))

## Export a DataRobot model

Use the following cells to download the model as a Scoring Code JAR file (in a local directory called model) and then compress that file into a .tar.gz archive to upload to S3.

In [None]:
def get_scoring_code(session, host, project_id, model_id):
    """Request a model's Scoring Code JAR from the DataRobot API.

    Args:
        session: Object exposing ``get(url)``; in this notebook a
            ``requests.Session`` that already carries the Authorization header.
        host: Base API URL, e.g. ``https://app.datarobot.com/api/v2``.
            A trailing slash is tolerated.
        project_id: ID of the project that owns the model.
        model_id: ID of the model whose Scoring Code is requested.

    Returns:
        The HTTP response object on success, or ``None`` when the request
        failed (the error is logged). Nothing is written to disk here; the
        caller saves ``response.content``.
    """
    api_endpoint = "{}/projects/{}/models/{}/scoringCode/".format(
        host.rstrip("/"), project_id, model_id
    )

    try:
        response = session.get(api_endpoint)
        response.raise_for_status()
    except requests.exceptions.RequestException as err:
        # RequestException is the base class of HTTPError, so HTTP error
        # statuses are still handled; connection errors and timeouts now also
        # produce the None result the caller checks for.
        log.error(err)
        return None
    return response

In [None]:
# Authenticated HTTP session for the DataRobot REST API. It is named
# `dr_session` so that it is not confused with the boto3 `session` that the
# AWS cells create further down.
headers = {'Authorization': 'Bearer {}'.format(api_key)}

dr_session = requests.Session()
dr_session.headers.update(headers)

log.info("Getting scoring code jar file from DataRobot location: "+ dr_host)
# Get scoring code jar
output = get_scoring_code(dr_session, dr_host, project_id, model_id)
if output is None:
    log.error("download failed")
else:
    modeldir = "model/"

    # Create local model directory if it doesn't exist already
    os.makedirs(modeldir, exist_ok=True)

    # The model name is taken from the Content-Disposition header, which
    # provides a suggested file name (usually model_id.jar). Surrounding quotes
    # are stripped, any directory part is dropped so the file always lands in
    # `modeldir`, and "<model_id>.jar" is used when no name is provided.
    content_disposition = output.headers.get('Content-Disposition') or ''
    modelname = ''
    for param in content_disposition.split(';')[1:]:
        key, _, value = param.strip().partition('=')
        if key.strip().lower() == 'filename':
            modelname = os.path.basename(value.strip().strip('"'))
            break
    if not modelname:
        modelname = "{}.jar".format(model_id)
    modelpath = modeldir + modelname

    with open(modelpath, 'wb') as f:
        f.write(output.content)

    log.info("Scoring Code jar downloaded to {}".format(modelpath))

    # Compress the jar file into a tar.gz as required by SageMaker
    log.info("Compressing jar file into tar.gz")
    tgz_name = modelname+".tar.gz"
    tgz_path = modeldir+tgz_name

    # arcname keeps only the file name, so the archive has no "model/" prefix.
    with tarfile.open(tgz_path, "w:gz") as tar:
        tar.add(modelpath, arcname=os.path.basename(modelpath))

    log.info("COMPLETE!")


## Import to AWS

This section of the notebook focuses on the steps required to prepare AWS for hosting a DataRobot model within SageMaker. It includes examples for how to make predictions against the model for both real-time and batch use cases.

### Download docker runtime image

This step will pull down the scoring-inference-code-sagemaker docker image that will be used to run the Scoring Code JAR file in SageMaker.

In [None]:
%%bash
# Pull down the scoring-inference-code-sagemaker image that will be pushed to AWS ECR for hosting our Scoring Code models in Sagemaker
# NOTE: the `latest` tag is not pinned, so the pulled image can differ between runs.
docker pull datarobot/scoring-inference-code-sagemaker:latest

### Create an AWS session connection

In [None]:
# Create an AWS Boto3 Session
# Open the boto3 session from which the AWS clients and resources in the
# following cells are created, using the region and credentials configured above.
session = boto3.Session(
    region_name=aws_region,
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    aws_session_token=aws_session_token,
)

Next, create an AWS ECR repo to hold the `scoring-inference-code-sagemaker` docker image.

In [None]:
# Create the AWS ECR repo, or reuse it when it already exists so that this
# cell can be re-run without failing.
log.info("Creating ECR Repo to hold our base image for running scoring code jar file.")
ecr_client = session.client('ecr')

try:
    ecr_response = ecr_client.create_repository(
        repositoryName=aws_ecr_repo
    )
    ecr_created = True
except ecr_client.exceptions.RepositoryAlreadyExistsException:
    # Look the existing repository up and present it in the same shape as a
    # create_repository response, so the code below handles both cases alike.
    ecr_response = {
        'repository': ecr_client.describe_repositories(
            repositoryNames=[aws_ecr_repo]
        )['repositories'][0]
    }
    ecr_created = False

ecr_repository = ecr_response.get('repository')
log.info("ECR Name: {}".format(ecr_repository.get('repositoryName')))
log.info("ECR ARN: {}".format(ecr_repository.get('repositoryArn')))
log.info("ECR URI: {}".format(ecr_repository.get('repositoryUri')))

# Values consumed by the docker push cell and by the SageMaker model definition.
ecr_repo_uri=ecr_repository.get('repositoryUri')
ecr_registry_id=ecr_repository.get('registryId')
registry_url = ecr_registry_id+".dkr.ecr."+aws_region+".amazonaws.com"

if ecr_created:
    log.info("ECR Repo created!")
else:
    log.info("ECR Repo {} already exists, reusing it.".format(aws_ecr_repo))

Now you can push `scoring-inference-code-sagemaker` to the ECR repo.

In [None]:
%%bash -s "$ecr_repo_uri" "$registry_url" "$aws_access_key_id" "$aws_secret_access_key" "$aws_session_token" "$aws_region"
# Push datarobot/scoring-inference-code-sagemaker:latest to ECR Repo
#
# Positional arguments (filled in by the cell magic line above):
#   $1 ECR repository URI     $2 ECR registry host
#   $3 AWS access key id      $4 AWS secret access key
#   $5 AWS session token      $6 AWS region

# Stop at the first failing step (including a failure inside the login
# pipeline) instead of tagging/pushing after a failed login.
set -eo pipefail

export AWS_ACCESS_KEY_ID="$3"
export AWS_SECRET_ACCESS_KEY="$4"
export AWS_SESSION_TOKEN="$5"

# Feed the registry password through stdin so it does not appear on the
# docker command line (and therefore not in the process list).
aws ecr get-login-password --region "$6" | docker login --username AWS --password-stdin "$2"
docker tag datarobot/scoring-inference-code-sagemaker:latest "$1:latest"
docker push "$1:latest"

### Create an S3 Bucket

This S3 bucket stores your DataRobot model.

In [None]:
# Create S3 Bucket
log.info("Creating S3 Bucket {}".format(s3_bucket))

s3 = session.resource('s3')

# S3 only accepts a request without a location constraint in us-east-1;
# every other region must be named explicitly or the call is rejected.
bucket_kwargs = {'Bucket': s3_bucket}
if aws_region != 'us-east-1':
    bucket_kwargs['CreateBucketConfiguration'] = {'LocationConstraint': aws_region}

try:
    s3.create_bucket(**bucket_kwargs)
    log.info("S3 Bucket Creation Complete!")
except ClientError as e:
    log.error(e)

Next, upload the Scoring Code JAR file to S3.

In [None]:
# Upload scoring code jar tarball to AWS S3
log.info("Uploading {} to S3 Bucket: {}".format(tgz_name, s3_bucket))
s3 = session.resource('s3')

# Object key of the model archive; reused when the SageMaker model is created.
s3_obj_name_model="sagemaker/models/"+tgz_name
try:
    s3.meta.client.upload_file(tgz_path,s3_bucket,s3_obj_name_model)
    log.info("S3 Upload Complete!")
except (ClientError, boto3.exceptions.S3UploadFailedError) as e:
    # upload_file reports S3 client errors as S3UploadFailedError, which is
    # not a ClientError subclass, so both are listed to make the handler fire.
    log.error(e)

### Upload sample data to S3

In this cell, you upload a sample dataset to make batch predictions in SageMaker. This dataset is specifically designed for the model that was created earlier in this notebook.

In [None]:
# Upload Batch Scoring Data to S3
batch_path = 'scoring_data/10K_Lending_Club_Loans_scoring.csv'

log.info("Uploading {} to S3 Bucket: {}".format(batch_path, s3_bucket))
s3 = session.resource('s3')

s3_obj_name_csv="sagemaker/"+batch_path
# S3 URI of the scoring data; the batch transform cell reads this name, so it
# is defined regardless of how the upload below turns out.
batch_input_file = "s3://"+s3_bucket+"/"+s3_obj_name_csv
try:
    s3.meta.client.upload_file(batch_path,s3_bucket,s3_obj_name_csv)
    log.info("S3 Upload Complete!")
except (ClientError, boto3.exceptions.S3UploadFailedError) as e:
    # upload_file reports S3 client errors as S3UploadFailedError, which is
    # not a ClientError subclass, so both are listed to make the handler fire.
    log.error(e)

This cell creates an IAM role for SageMaker that grants permission to run SageMaker operations and to access the S3 bucket containing the uploaded Scoring Code model file.

In [None]:
# Create IAM Role for Sagemaker to use.
# The policy and the role are reused when they already exist, so this cell can
# be re-run without failing on EntityAlreadyExists.
log.info("Creating Execution IAM Role for Sagemaker to use")
iam = session.client('iam')
iamr = session.resource('iam')

# Permissions on the model bucket: list it, and read/write/delete its objects.
role_policy = json.dumps({
    "Version": "2012-10-17",
    "Statement": [
        {
            "Action": [
                "s3:ListBucket"
            ],
            "Effect": "Allow",
            "Resource": [
                "arn:aws:s3:::"+s3_bucket
            ]
        },
        {
            "Action": [
                "s3:GetObject",
                "s3:PutObject",
                "s3:DeleteObject"
            ],
            "Effect": "Allow",
            "Resource": [
                "arn:aws:s3:::"+s3_bucket+"/*"
            ]
        }
    ]
})

log.info("Creating Exeuction S3 Access Policy")
policy_name = sagemaker_execution_role_name+"-policy"
try:
    policy = iam.create_policy(
        PolicyName = policy_name,
        PolicyDocument = role_policy
    )
    policy_arn = policy.get('Policy').get('Arn')
except iam.exceptions.EntityAlreadyExistsException:
    # Find the ARN of the customer-managed policy that already has this name.
    policy_arn = next(
        p['Arn']
        for page in iam.get_paginator('list_policies').paginate(Scope='Local')
        for p in page['Policies']
        if p['PolicyName'] == policy_name
    )
    # The existing document is NOT updated; it may reference a different bucket.
    log.warning("Policy {} already exists, reusing it as-is: {}".format(policy_name, policy_arn))

# Trust policy: lets the SageMaker service assume the role.
assume_role_policy_document = json.dumps({
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Principal": {
                "Service": "sagemaker.amazonaws.com"
            },
            "Action": "sts:AssumeRole"
        }
    ]
})

log.info("Creating actual role")
try:
    role = iam.create_role(
        RoleName = sagemaker_execution_role_name,
        AssumeRolePolicyDocument = assume_role_policy_document
    )
except iam.exceptions.EntityAlreadyExistsException:
    log.warning("Role {} already exists, reusing it.".format(sagemaker_execution_role_name))

# Attach our execution Policy
log.info("Attaching Execution Policy to Role")
response = iam.attach_role_policy(
    RoleName = sagemaker_execution_role_name,
    PolicyArn= policy_arn
)

# Attach the AmazonSageMakerFullAccess Policy
log.info("Attaching AmazonSageMakerFullAccess Policy to Role")
response = iam.attach_role_policy(
    RoleName = sagemaker_execution_role_name,
    PolicyArn= 'arn:aws:iam::aws:policy/AmazonSageMakerFullAccess'
)

# Resource handle for the role; its ARN is used when the SageMaker model is created.
role_resource = iamr.Role(sagemaker_execution_role_name)
log.info("IAM Role Info:")
log.info("IAM Role Name: {}".format(role_resource.name))
log.info("IAM Role ARN: {}".format(role_resource.arn))
log.info("IAM Role Policies:")
for p in role_resource.attached_policies.all():
    log.info(p)

log.info("COMPLETE!")

### Create a SageMaker model

In [None]:
# Register the Scoring Code archive in S3 plus the runtime image in ECR as a
# SageMaker model.
log.info("Creating Sagemaker Model")
sm_client = session.client('sagemaker')

# The model name is the jar file name up to its first dot.
aws_model_name = modelname.split('.')[0]

primary_container = {
    'Image': ecr_repo_uri+":latest",
    'ImageConfig': {
        'RepositoryAccessMode': 'Platform'
    },
    'Mode': 'SingleModel',
    'ModelDataUrl': "s3://"+s3_bucket+"/"+s3_obj_name_model,
}

response = sm_client.create_model(
    ModelName=aws_model_name,
    PrimaryContainer=primary_container,
    ExecutionRoleArn=role_resource.arn,
)

http_status = response.get('ResponseMetadata').get('HTTPStatusCode')
if http_status == 200:
    log.info("Sagemaker Model Created!")
    log.info("model name: {}".format(aws_model_name))
    log.info("model arn: {}".format(response.get('ModelArn')))
    log.info("COMPLETE!")
else:
    log.error("Error when creating model in Sagemaker")

### SageMaker endpoint configuration

This is used as part of the assembly of a SageMaker Endpoint that is required for real time API prediction requests.

In [None]:
# Create Sagemaker Endpoint Configuration: one variant serving the model
# created above on a single ml.m4.xlarge instance.
log.info("Creating Sagemaker Model Endpoint Configuration")
aws_endpoint_config_name = aws_model_name+"-ec"

ec_response = sm_client.create_endpoint_config(
    EndpointConfigName = aws_endpoint_config_name,
    ProductionVariants = [
        {
            'VariantName':'variant-1',
            'ModelName':aws_model_name,
            'InitialInstanceCount':1,
            'InstanceType': 'ml.m4.xlarge',
        }
    ]
)

if ec_response.get('ResponseMetadata').get('HTTPStatusCode') != 200:
    # Message now names the resource this cell actually creates.
    log.error("Error when creating endpoint configuration in Sagemaker")
else:
    log.info("Sagemaker Model Endpoint Configuration Created!")
    log.info("endpoint configuration name: {}".format(aws_endpoint_config_name))
    log.info("endpoint configuration arn: {}".format(ec_response.get('EndpointConfigArn')))
    log.info("COMPLETE!")

Use the cell below to create a Sagemaker endpoint.

In [None]:
# Create Sagemaker Endpoint and poll until it is in service, has failed, or
# the polling budget (30 checks, 20 seconds apart) is used up.
log.info("Creating Sagemaker Model Endpoint... This process can take a few minutes")
aws_endpoint_name = aws_model_name+"-ep"

ep_response = sm_client.create_endpoint(
    EndpointName = aws_endpoint_name,
    EndpointConfigName = aws_endpoint_config_name,
)

if ep_response.get('ResponseMetadata').get('HTTPStatusCode') != 200:
    log.error("Error when sending endpoint creation request to Sagemaker")
    log.error(ep_response)
else:
    status = None
    status_r = None
    for _ in range(30):
        status_r = sm_client.describe_endpoint(EndpointName=aws_endpoint_name)
        status = status_r.get('EndpointStatus')
        log.info("Endpoint Creation Status: {}".format(status))

        # "Failed" is terminal as well; waiting longer cannot change it.
        if status in ("InService", "Failed"):
            break
        time.sleep(20)

    if status == 'InService':
        log.info("Sagemaker Model Endpoint Created!")
        log.info("Endpoint Name: {}".format(status_r.get('EndpointName')))
        log.info("Endpoint ARN: {}".format(status_r.get('EndpointArn')))
        invocation_url = "https://runtime.sagemaker.{}.amazonaws.com/endpoints/{}/invocations".format(aws_region,status_r.get('EndpointName'))
        log.info("Endpoint API URL: {}".format(invocation_url))
        log.info("COMPLETE!")
    elif status == 'Failed':
        log.error("Sagemaker Model Endpoint creation failed: {}".format(status_r.get('FailureReason')))
        log.error(status_r)
    else:
        log.error("Sagemaker did not return an 'InService' status in time!")
        log.error("Last status received: {}".format(status))
        log.error(status_r)

## Prediction examples
The following cells will show how to make predictions against the deployed model using both batch and real-time methods.
  
### Create SageMaker Batch Transform Job

Use this cell to programmatically create a batch transform job in SageMaker that can be used for batch predictions. This job reads in a CSV that you previously uploaded to an S3 bucket. The output of the job will then be written to another folder (`scoring_output`) that will exist in the S3 bucket that you previously created.

In [None]:
# Create a batch transform job for batch predictions, poll it (30 checks,
# 20 seconds apart), and report how it ended.
log.info("Creating Sagemaker Batch Transform Job")
btj_client = session.client('sagemaker')

# The timestamp gives each run its own job name.
job_name = aws_model_name + "-batch-transform-job-" + datetime.now().strftime('%Y-%m-%d-%H-%M')

batch_output_folder = "s3://"+ s3_bucket + "/scoring_output/"
response = btj_client.create_transform_job(
    TransformJobName = job_name,
    ModelName = aws_model_name,
    TransformInput={
        'DataSource': {
            'S3DataSource': {
                'S3DataType': 'S3Prefix',
                'S3Uri': batch_input_file
            }
        },
        # NOTE(review): 'string' looks like a documentation placeholder; the
        # real-time cell sends 'text/csv'. Confirm what the container expects.
        'ContentType': 'string',
        'CompressionType': 'None',
        'SplitType': 'None'
    },
    TransformOutput={
        'S3OutputPath': batch_output_folder,
        # NOTE(review): same placeholder-looking value as ContentType above.
        'Accept': 'string',
        'AssembleWith': 'None',
    },
    TransformResources={
        'InstanceType': 'ml.m4.xlarge',
        'InstanceCount': 1
    }
)

# Response
log.info("Running Sagemaker Batch Transform Job {}".format(job_name))
status = None
status_r = None
for _ in range(30):
    status_r = btj_client.describe_transform_job(TransformJobName=job_name)
    status = status_r.get('TransformJobStatus')
    log.info("Batch Job Status: {}".format(status))

    if status not in ["InProgress", "Stopping"]:
        break
    time.sleep(20)

# Report the outcome so that a failed or unfinished job is visible here
# rather than surfacing as missing output in the next cell.
if status == "Completed":
    log.info("Batch Transform Job {} completed. Output folder: {}".format(job_name, batch_output_folder))
elif status in ["InProgress", "Stopping"]:
    log.error("Batch Transform Job {} did not finish in time!".format(job_name))
    log.error("Last status received: {}".format(status))
else:
    log.error("Batch Transform Job {} ended with status {}: {}".format(job_name, status, status_r.get('FailureReason')))

### View batch transform Job results

In this cell you download the results file from the batch transform job that you just ran in SageMaker and output the contents of the dataframe to show what was scored.  

In this case, you are scoring a binary classification model, so your output will be two columns that contain the scores of our positive and negative classes, which will translate into whether a potential loan will default or not.

In [None]:
# Download the batch transform output from S3 and display it as a DataFrame.
s3 = session.client('s3')
output_dir = "scoring_output"
output_path = os.path.join(output_dir, "output.csv")
# Key prefix of the folder the batch transform job writes to (the
# "scoring_output/" part of its S3OutputPath).
s3_output_prefix = "scoring_output/"

os.makedirs(output_dir, exist_ok=True)

# Only look inside the output folder, and only at keys ending in ".out".
# 'Contents' is absent from the listing when nothing matches the prefix.
listing = s3.list_objects(Bucket=s3_bucket, Prefix=s3_output_prefix)
results = [obj for obj in listing.get('Contents', []) if obj['Key'].endswith(".out")]
if not results:
    raise FileNotFoundError(
        "No batch transform output (*.out) found under s3://{}/{}".format(s3_bucket, s3_output_prefix)
    )

# When several result files exist, show the most recently written one.
latest = max(results, key=lambda obj: obj['LastModified'])
s3.download_file(s3_bucket, latest['Key'], output_path)

df = pd.read_csv(output_path)
df

### Real-time predictions with SageMaker

This cell shows how to interact with the SageMaker endpoint that you previously created for your model to use with real time prediction workloads.  

You will be using the AWS boto3 client and making a call to the SageMaker endpoint to score a row of data from a CSV file and then print out the result.

In [None]:
# Score one row of CSV data against the SageMaker endpoint and log the result.
s_client = session.client('sagemaker-runtime')

# Read the request body inside a context manager so the file handle is closed.
with open('scoring_data/1_row_Lending_Club_Loans_scoring.csv') as buffer:
    payload = buffer.read()

response = s_client.invoke_endpoint(
  EndpointName = aws_endpoint_name,
  ContentType = 'text/csv',
  Body = payload
)

# The response body is a stream; read it once and decode it for logging.
data = response.get('Body').read()
log.info("Scoring output:\n{}".format(data.decode("utf-8")))