In [1]:
import datetime
import time
import tarfile

import boto3
import pandas as pd
import numpy as np
from sagemaker import get_execution_role
import sagemaker
from sklearn.model_selection import train_test_split
#from sklearn.datasets import load_boston

In [3]:
# Import the boston house pricing dataset
class Object(object):
    pass
data = Object()

data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
data.data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
data.target = raw_df.values[1::2, 2]

data_dictionary = {
    "CRIM": "per capita crime rate by town",
    "ZN": "proportion of residential land zoned for lots over 25,000 sq.ft.",
    "INDUS": "proportion of non-retail business acres per town",
    "CHAS": "Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)",
    "NOX": "nitric oxides concentration (parts per 10 million)",
    "RM": "average number of rooms per dwelling",
    "AGE": "proportion of owner-occupied units built prior to 1940",
    "DIS": "weighted distances to five Boston employment centres",
    "RAD": "index of accessibility to radial highways",
    "TAX": "full-value property-tax rate per $10,000",
    "PTRATIO": "pupil-teacher ratio by town",
    "B": "1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town",
    "LSTAT": "% lower status of the population",
    "MEDV": "Median value of owner-occupied homes in $1000's",
}

data.feature_names = list(data_dictionary.keys())[:-1]

In [4]:
sm_boto3 = boto3.client("sagemaker")

sess = sagemaker.Session()

region = sess.boto_session.region_name

bucket = sess.default_bucket()  # this could also be a hard-coded bucket name

print("Using bucket " + bucket)

Using bucket sagemaker-us-east-1-248170457344


In [32]:
# we use the Boston housing dataset
#data = load_boston()

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    data.data, data.target, test_size=0.25, random_state=42
)

trainX = pd.DataFrame(X_train, columns=data.feature_names)
trainX["target"] = y_train

testX = pd.DataFrame(X_test, columns=data.feature_names)
testX["target"] = y_test



In [6]:
trainX.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.09103,0.0,2.46,0.0,0.488,7.155,92.2,2.7006,3.0,193.0,17.8,394.12,4.82,37.9
1,3.53501,0.0,19.58,1.0,0.871,6.152,82.6,1.7455,5.0,403.0,14.7,88.01,15.02,15.6
2,0.03578,20.0,3.33,0.0,0.4429,7.82,64.5,4.6947,5.0,216.0,14.9,387.31,3.76,45.4
3,0.38735,0.0,25.65,0.0,0.581,5.613,95.6,1.7572,2.0,188.0,19.1,359.29,27.26,15.7
4,0.06724,0.0,3.24,0.0,0.46,6.333,17.2,5.2146,4.0,430.0,16.9,375.21,7.34,22.6


In [7]:
trainX.to_csv("boston_train.csv")
testX.to_csv("boston_test.csv")

# send data to S3. SageMaker will take training data from s3
trainpath = sess.upload_data(
    path="boston_train.csv", bucket=bucket, key_prefix="sagemaker/sklearncontainer"
)

testpath = sess.upload_data(
    path="boston_test.csv", bucket=bucket, key_prefix="sagemaker/sklearncontainer"
)

In [8]:
%%writefile script.py

import argparse
import joblib
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor


# inference functions ---------------
def model_fn(model_dir):
    clf = joblib.load(os.path.join(model_dir, "model.joblib"))
    return clf


if __name__ == "__main__":

    print("extracting arguments")
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    # to simplify the demo we don't use all sklearn RandomForest hyperparameters
    parser.add_argument("--n-estimators", type=int, default=10)
    parser.add_argument("--min-samples-leaf", type=int, default=3)

    # Data, model, and output directories
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="boston_train.csv")
    parser.add_argument("--test-file", type=str, default="boston_test.csv")
    parser.add_argument(
        "--features", type=str
    )  # in this script we ask user to explicitly name features
    parser.add_argument(
        "--target", type=str
    )  # in this script we ask user to explicitly name the target

    args, _ = parser.parse_known_args()

    print("reading data")
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    print("building training and testing datasets")
    X_train = train_df[args.features.split()]
    X_test = test_df[args.features.split()]
    y_train = train_df[args.target]
    y_test = test_df[args.target]

    # train
    print("training model")
    model = RandomForestRegressor(
        n_estimators=args.n_estimators, min_samples_leaf=args.min_samples_leaf, n_jobs=-1
    )

    model.fit(X_train, y_train)

    # print abs error
    print("validating model")
    abs_err = np.abs(model.predict(X_test) - y_test)

    # print couple perf metrics
    for q in [10, 50, 90]:
        print("AE-at-" + str(q) + "th-percentile: " + str(np.percentile(a=abs_err, q=q)))

    # persist model
    path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, path)
    print("model persisted at " + path)
    print(args.min_samples_leaf)



Writing script.py


In [9]:
! python script.py --n-estimators 100 \
                   --min-samples-leaf 2 \
                   --model-dir ./ \
                   --train ./ \
                   --test ./ \
                   --features 'CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT' \
                   --target target

extracting arguments
reading data
building training and testing datasets
training model
validating model
AE-at-10th-percentile: 0.1250395238095166
AE-at-50th-percentile: 1.5822916666666629
AE-at-90th-percentile: 4.404651190476203
model persisted at ./model.joblib
2


In [10]:
# We use the Estimator from the SageMaker Python SDK
from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = "0.23-1"

sklearn_estimator = SKLearn(
    entry_point="script.py",
    role=get_execution_role(),
    instance_count=1,
    instance_type="ml.m4.xlarge",#"ml.c5.xlarge",#"ml.m4.xlarge",
    framework_version=FRAMEWORK_VERSION,
    base_job_name="rf-scikit",
    metric_definitions=[{"Name": "median-AE", "Regex": "AE-at-50th-percentile: ([0-9.]+).*$"}],
    hyperparameters={
        "n-estimators": 100,
        "min-samples-leaf": 3,
        "features": "CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT",
        "target": "target",
    },
)

import time
tic = time.time()

# launch training job, with asynchronous call
sklearn_estimator.fit({"train": trainpath, "test": testpath}, wait=False)

toc = time.time()
print(toc - tic)

INFO:sagemaker:Creating training-job with name: rf-scikit-2023-03-07-14-39-43-200


0.8359620571136475


In [11]:
sklearn_estimator.latest_training_job.wait(logs="None")
artifact = sm_boto3.describe_training_job(
    TrainingJobName=sklearn_estimator.latest_training_job.name
)["ModelArtifacts"]["S3ModelArtifacts"]

print("Model artifact persisted at " + artifact)


2023-03-07 14:39:45 Starting - Starting the training job...
2023-03-07 14:40:09 Starting - Preparing the instances for training...............
2023-03-07 14:41:29 Downloading - Downloading input data.....
2023-03-07 14:41:59 Training - Downloading the training image......
2023-03-07 14:42:35 Training - Training image download completed. Training in progress....
2023-03-07 14:42:55 Uploading - Uploading generated training model.
2023-03-07 14:43:06 Completed - Training job completed
Model artifact persisted at s3://sagemaker-us-east-1-248170457344/rf-scikit-2023-03-07-14-39-43-200/output/model.tar.gz


In [12]:
from sagemaker.sklearn.model import SKLearnModel

model = SKLearnModel(
    model_data=artifact,
    role=get_execution_role(),
    entry_point="script.py",
    framework_version=FRAMEWORK_VERSION,
)

predictor = model.deploy(instance_type="ml.c5.large", initial_instance_count=1)

INFO:sagemaker:Creating model with name: sagemaker-scikit-learn-2023-03-07-14-44-01-375
INFO:sagemaker:Creating endpoint-config with name sagemaker-scikit-learn-2023-03-07-14-44-02-129
INFO:sagemaker:Creating endpoint with name sagemaker-scikit-learn-2023-03-07-14-44-02-129


-----!

In [13]:
from IPython.display import clear_output

In [45]:
# Stress test on invoked endpoint
for n in range(1, 1000):
    predictions = predictor.predict(testX[data.feature_names])
    print(n, ": ", predictions)
    #print(n, end=' ')
    if not n % 100: clear_output()

901 :  [22.72690986 31.51855877 17.19612929 23.5303482  16.74127749 21.38016346
 19.50542071 16.0494272  21.13125321 21.11900451 20.15849495 19.80720079
  8.24701977 21.62835285 19.7736849  25.24792374 18.45952633  9.01260372
 45.09649174 15.37724095 24.02292486 23.91988492 14.87671194 23.56199408
 14.67640653 15.31581436 21.86221851 13.89437781 19.596561   20.99567988
 20.16708841 23.43529993 28.34573972 20.42705779 14.53495974 15.89873864
 34.43710327 19.26817174 21.39846955 23.87187511 19.58893838 29.65830085
 45.04221674 19.63906148 22.39660714 13.70105162 15.34975162 24.14148146
 18.32155379 28.18155714 21.18762208 33.36025465 16.60154722 26.27036556
 45.85073651 21.50286234 15.40084993 32.52265653 22.0406669  20.81736746
 25.40935784 34.34903301 29.96228485 18.27979013 27.49482698 17.27445234
 13.60094289 23.19870476 28.46088398 15.11754084 20.9504496  27.53643331
  9.92595595 22.11064318 22.14067049  7.53452969 20.22263215 45.49498618
 11.52571371 13.82184123 21.47828968 11.1513

In [76]:
#sm_boto3.delete_endpoint(EndpointName=predictor.endpoint_name)

{'ResponseMetadata': {'RequestId': '01eb5ff9-12e3-4d35-8f43-75e8c9ac38c7',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '01eb5ff9-12e3-4d35-8f43-75e8c9ac38c7',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Mon, 06 Mar 2023 01:13:06 GMT'},
  'RetryAttempts': 0}}