In [1]:
import datetime
import time
import tarfile

import boto3
import pandas as pd
import numpy as np
from sagemaker import get_execution_role
import sagemaker
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing


# SageMaker SDK session; the region and the default bucket are both read from it.
sess = sagemaker.Session()
region = sess.boto_session.region_name
bucket = sess.default_bucket()  # this could also be a hard-coded bucket name

# Low-level SageMaker client, used further down to describe the training job
# and to delete the endpoint.
sm_boto3 = boto3.client("sagemaker")

print(f"Using bucket {bucket}")


Using bucket sagemaker-us-east-1-293058073847


In [2]:
# we use the California housing dataset
# The returned object is read by later cells through its `.data`, `.target`
# and `.feature_names` attributes.
data = fetch_california_housing()

In [3]:
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    test_size=0.25,
    random_state=42,
)

# Despite their names, both frames hold the feature columns plus a trailing
# "target" column, so each CSV written later is self-contained.
trainX = pd.DataFrame(X_train, columns=data.feature_names).assign(target=y_train)
testX = pd.DataFrame(X_test, columns=data.feature_names).assign(target=y_test)

trainX.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,4.2143,37.0,5.288235,0.973529,860.0,2.529412,33.81,-118.12,2.285
1,5.3468,42.0,6.364322,1.08794,957.0,2.404523,37.16,-121.98,2.799
2,3.9191,36.0,6.110063,1.059748,711.0,2.235849,38.45,-122.69,1.83
3,6.3703,32.0,6.0,0.990196,1159.0,2.272549,34.16,-118.41,4.658
4,2.3684,17.0,4.795858,1.035503,706.0,2.088757,38.57,-121.33,1.5


In [4]:
# Write the splits without the pandas row index. With the default (index=True)
# the index is stored as a nameless first column, which shows up as a spurious
# "Unnamed: 0" column when the file is read back. The training script selects
# columns by name, so dropping it does not affect training.
trainX.to_csv("california_housing_train.csv", index=False)
testX.to_csv("california_housing_test.csv", index=False)


In [5]:
# send data to S3. SageMaker will take training data from s3
# Both files go under the same key prefix of the session bucket; upload_data
# returns a value per file that is later passed to fit() as an input channel.
s3_key_prefix = "sagemaker/sklearncontainer"
trainpath, testpath = [
    sess.upload_data(path=local_csv, bucket=bucket, key_prefix=s3_key_prefix)
    for local_csv in ("california_housing_train.csv", "california_housing_test.csv")
]

In [6]:
%%writefile requirements.txt
# Extra packages for the SageMaker container (referenced via SAGEMAKER_REQUIREMENTS).
# NOTE(review): `datarobot` is unpinned - consider pinning a version for reproducible builds.
datarobot
# DataRobot MLOps wheel, fetched directly from a public S3 URL.
https://asgerlp-public.s3.eu-central-1.amazonaws.com/datarobot_mlops-7.3.4-py2.py3-none-any.whl

Overwriting requirements.txt


In [7]:
%%writefile script.py


import argparse
import joblib
import os
import datarobot as dr
from datarobot.mlops.mlops import MLOps
import time

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

# inference functions ---------------
def model_fn(model_dir):
    """Load the persisted estimator for inference.

    Args:
        model_dir: Directory that contains the ``model.joblib`` artifact.

    Returns:
        The object deserialized by ``joblib.load`` from that artifact.
    """
    artifact_path = os.path.join(model_dir, "model.joblib")
    return joblib.load(artifact_path)

# Process-wide MLOps client: created on the first prediction and reused afterwards,
# so the monitoring library is configured and initialized once per worker process
# instead of once per request.
_MLOPS_CLIENT = None


def _get_mlops():
    """Return the shared, initialized DataRobot MLOps client, creating it on first use."""
    global _MLOPS_CLIENT
    if _MLOPS_CLIENT is not None:
        return _MLOPS_CLIENT

    # AWS credentials are deliberately not written into os.environ here: secrets
    # must not live in source code. Supply them through the endpoint's IAM role
    # or through environment variables configured on the model.
    # Only fall back to us-east-1 when no region has been configured.
    os.environ.setdefault("AWS_DEFAULT_REGION", "us-east-1")

    # Monitoring identifiers. They default to the empty placeholders this script
    # shipped with and can be supplied through the environment.
    # NOTE(review): the environment variable names are this script's convention -
    # confirm they match how the endpoint is configured.
    deployment_id = os.environ.get("MLOPS_DEPLOYMENT_ID", "")
    model_id = os.environ.get("MLOPS_MODEL_ID", "")
    sqs_queue = os.environ.get("MLOPS_SQS_QUEUE_URL", "")

    mlops = MLOps()
    try:
        mlops.set_deployment_id(deployment_id)
        mlops.set_model_id(model_id)
        mlops.set_sqs_spooler(sqs_queue)
    except Exception as exc:
        # Still best-effort (init() is attempted regardless), but the real cause
        # is now printed and KeyboardInterrupt/SystemExit are no longer swallowed.
        print("Already set (" + repr(exc) + ")")

    mlops.init()
    _MLOPS_CLIENT = mlops
    return mlops


def predict_fn(input_data, model):
    """Run the model on a request payload and report the call to DataRobot MLOps.

    Args:
        input_data: Feature rows passed unchanged to ``model.predict`` and also
            converted to a DataFrame for reporting.
        model: Object returned by ``model_fn``; must expose ``predict``.

    Returns:
        numpy.ndarray: the predictions wrapped in one extra leading dimension
        (``np.array([prediction])``), as before.
    """
    mlops = _get_mlops()

    # Pass values to the model
    started = time.perf_counter()
    prediction = model.predict(input_data)
    elapsed_ms = (time.perf_counter() - started) * 1000.0

    # Report the real number of scored rows (previously hard-coded to 1) and the
    # latency in milliseconds (previously passed in seconds).
    mlops.report_deployment_stats(len(prediction), elapsed_ms)

    features_df = pd.DataFrame.from_dict(input_data)
    mlops.report_predictions_data(features_df=features_df, predictions=prediction.tolist())

    return np.array([prediction])

if __name__ == "__main__":
    # Training entry point: parse hyperparameters and data locations, fit a
    # RandomForestRegressor, print absolute-error percentiles, persist the model.

    print("extracting arguments")
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    # to simplify the demo we don't use all sklearn RandomForest hyperparameters
    parser.add_argument("--n-estimators", type=int, default=10)
    parser.add_argument("--min-samples-leaf", type=int, default=3)
    # Optional seed for reproducible forests; None keeps the previous unseeded behaviour.
    parser.add_argument("--random-state", type=int, default=None)

    # Data, model, and output directories
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="california_housing_train.csv")
    parser.add_argument("--test-file", type=str, default="california_housing_test.csv")
    # in this script we ask user to explicitly name features and the target;
    # marking them required yields a clear usage error instead of a later
    # AttributeError on None.
    parser.add_argument(
        "--features", type=str, required=True, help="whitespace-separated feature column names"
    )
    parser.add_argument("--target", type=str, required=True, help="name of the target column")

    # Unknown arguments are tolerated (parse_known_args), as before.
    args, _ = parser.parse_known_args()

    # Fail before any work is done when a directory is missing, rather than with
    # a TypeError from os.path.join (or only after training, for --model-dir).
    for option, value in (
        ("--model-dir", args.model_dir),
        ("--train", args.train),
        ("--test", args.test),
    ):
        if not value:
            parser.error(option + " was not provided and its SM_* environment variable is not set")

    print("reading data")
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    print("building training and testing datasets")
    feature_names = args.features.split()
    X_train = train_df[feature_names]
    X_test = test_df[feature_names]
    y_train = train_df[args.target]
    y_test = test_df[args.target]

    # train
    print("training model")
    model = RandomForestRegressor(
        n_estimators=args.n_estimators,
        min_samples_leaf=args.min_samples_leaf,
        n_jobs=-1,
        random_state=args.random_state,
    )

    model.fit(X_train, y_train)

    # print abs error
    print("validating model")
    abs_err = np.abs(model.predict(X_test) - y_test)

    # print couple perf metrics
    # The exact "AE-at-<q>th-percentile: " text is matched by the estimator's
    # metric regex, so the format must not change.
    for q in [10, 50, 90]:
        print("AE-at-" + str(q) + "th-percentile: " + str(np.percentile(a=abs_err, q=q)))

    # persist model
    path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, path)
    print("model persisted at " + path)


Overwriting script.py


In [8]:
# We use the Estimator from the SageMaker Python SDK
from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = "0.23-1"

# Sent to script.py, which receives hyperparameters as command-line arguments.
training_hyperparameters = {
    "n-estimators": 100,
    "min-samples-leaf": 3,
    "features": "MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude Longitude",
    "target": "target",
}

# Picks the median absolute error out of the lines script.py prints.
training_metrics = [
    {"Name": "median-AE", "Regex": "AE-at-50th-percentile: ([0-9.]+).*$"},
]

sklearn_estimator = SKLearn(
    entry_point="script.py",
    source_dir=".",
    role=get_execution_role(),
    instance_count=1,
    instance_type="ml.c5.xlarge",  # "local" was left as a commented-out alternative
    framework_version=FRAMEWORK_VERSION,
    base_job_name="rf-scikit",
    metric_definitions=training_metrics,
    hyperparameters=training_hyperparameters,
    env={"SAGEMAKER_REQUIREMENTS": "requirements.txt"},  # path relative to `source_dir`
)

In [9]:
# launch training job, with asynchronous call
# Each key names an input channel and maps it to the matching uploaded CSV.
training_channels = {"train": trainpath, "test": testpath}
sklearn_estimator.fit(training_channels, wait=False)

In [10]:
# fit() was called with wait=False, so wait here for the job to finish
# (with logs="None").
training_job = sklearn_estimator.latest_training_job
training_job.wait(logs="None")

# Look up the S3 location of the model archive in the job description.
job_description = sm_boto3.describe_training_job(TrainingJobName=training_job.name)
artifact = job_description["ModelArtifacts"]["S3ModelArtifacts"]

print("Model artifact persisted at " + artifact)


2021-11-21 23:03:16 Starting - Launching requested ML instances............
2021-11-21 23:04:23 Starting - Preparing the instances for training...............
2021-11-21 23:05:41 Downloading - Downloading input data...
2021-11-21 23:06:03 Training - Downloading the training image...........
2021-11-21 23:07:03 Uploading - Uploading generated training model.
2021-11-21 23:07:12 Completed - Training job completed
Model artifact persisted at s3://sagemaker-us-east-1-293058073847/rf-scikit-2021-11-21-23-03-13-559/output/model.tar.gz


In [11]:
from sagemaker.sklearn.model import SKLearnModel

# Same entry point and source directory as for training, so the endpoint uses
# model_fn / predict_fn from script.py together with requirements.txt.
inference_env = {"SAGEMAKER_REQUIREMENTS": "requirements.txt"}  # path relative to `source_dir`

model = SKLearnModel(
    entry_point="script.py",
    source_dir=".",
    model_data=artifact,
    framework_version=FRAMEWORK_VERSION,
    role=get_execution_role(),
    env=inference_env,
)

In [12]:
# Deploy the model to an endpoint backed by a single ml.c5.large instance.
predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.c5.large",
)

-----!

In [13]:
# the SKLearnPredictor does the serialization from pandas for us
# Send only the feature columns; testX also carries the "target" column.
test_features = testX[data.feature_names]
print(predictor.predict(test_features))


[[0.49810276 0.7551042  4.79558254 ... 1.24873284 3.02558118 4.02814099]]


In [None]:
# Clean up the endpoint created by model.deploy().
# `predictor.endpoint` is the deprecated v1 attribute; the SageMaker SDK v2 name
# is `endpoint_name`.
sm_boto3.delete_endpoint(EndpointName=predictor.endpoint_name)


In [None]:
# The endpoint was deleted in the previous cell, so this call is expected to
# fail. Catch the client error and report it, so a full "Run All" finishes
# instead of stopping on a traceback.
# NOTE(review): presumably this cell exists to confirm the deletion - verify intent.
try:
    print(predictor.predict(testX[data.feature_names]))
except sm_boto3.exceptions.ClientError as err:
    print("Endpoint is no longer available: " + str(err))
