In [2]:
import datetime
import time
import tarfile

import boto3
import pandas as pd
import numpy as np
from sagemaker import get_execution_role
import sagemaker
from sklearn.model_selection import train_test_split


sm_boto3 = boto3.client("sagemaker")

sess = sagemaker.Session()

region = sess.boto_session.region_name

bucket = sess.default_bucket()  # this could also be a hard-coded bucket name

print("Using bucket " + bucket)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
Using bucket sagemaker-us-east-1-004608622582


In [3]:
# we use the our clean baseline dataset
df = pd.read_csv("Data/baseline_clean_data_head.csv")
df.head()

Unnamed: 0,AdjSquareFeet,DistancetoCoast,DistancetoSinkhole,DistancetoFireDepartment,LocationWindSpeed,ValueofHome,NumberOfBuildings,NumberOfUnits,Age,Terrain_B,Terrain_C,Terrain_HVHZ
0,1384.0,2956.8,4.59,1.2,129.0,200840.0,1.0,1.0,36.0,1.0,0.0,0.0
1,1534.0,-99.0,9.56,0.98,146.0,164994.0,1.0,1.0,20.0,0.0,0.0,1.0
2,2612.0,-99.0,0.82,0.05,120.0,200346.0,1.0,1.0,19.0,1.0,0.0,0.0
3,2848.0,-99.0,0.39,1.69,120.0,364161.0,1.0,1.0,40.0,1.0,0.0,0.0
4,1936.0,-99.0,1.09,1.37,120.0,138691.0,1.0,1.0,38.0,1.0,0.0,0.0


In [4]:
X = df.drop('ValueofHome', axis=1)
y = df['ValueofHome']

In [5]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
trainX = pd.DataFrame(X_train, columns=df.columns)
trainX["ValueofHome"] = y_train

testX = pd.DataFrame(X_test, columns=df.columns)
testX["ValueofHome"] = y_test

In [7]:
trainX, testX

(       AdjSquareFeet  DistancetoCoast  DistancetoSinkhole  \
 28168         1351.0            -99.0                1.16   
 79444         1744.0            -99.0                5.20   
 68067         2067.0            -99.0               10.44   
 37942         2054.0            -99.0                2.27   
 536           1392.0            -99.0               30.77   
 ...              ...              ...                 ...   
 6265          3168.0            -99.0                3.71   
 54886         1554.0           1161.6               18.93   
 76820         5188.0           2059.2               13.72   
 860           2279.0            -99.0                0.32   
 15795         3509.0            -99.0                2.00   
 
        DistancetoFireDepartment  LocationWindSpeed  ValueofHome  \
 28168                      0.69              110.0     108100.0   
 79444                      1.23              140.0     170580.0   
 68067                      0.40              146.

In [8]:
# Create the CSV values
trainX.to_csv("Model_Monitor_Data/final_project_train.csv")
testX.to_csv("Model_Monitor_Data/final_project_test.csv")

In [9]:
# send data to S3. SageMaker will take training data from s3
trainpath = sess.upload_data(
    path="Model_Monitor_Data/final_project_train.csv", bucket=bucket, key_prefix="final_project/model_monitor"
)

testpath = sess.upload_data(
    path="Model_Monitor_Data/final_project_test.csv", bucket=bucket, key_prefix="final_project/model_monitor"
)

In [10]:
%%writefile Code/script.py

import argparse
import joblib
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor


# inference functions ---------------
def model_fn(model_dir):
    clf = joblib.load(os.path.join(model_dir, "model.joblib"))
    return clf


if __name__ == "__main__":
    print("extracting arguments")
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    # to simplify the demo we don't use all sklearn RandomForest hyperparameters
    parser.add_argument("--n-estimators", type=int, default=10)
    parser.add_argument("--min-samples-leaf", type=int, default=3)

    # Data, model, and output directories
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="final_project_train.csv")
    parser.add_argument("--test-file", type=str, default="final_project_test.csv")
    parser.add_argument(
        "--features", type=str
    )  # in this script we ask user to explicitly name features
    parser.add_argument(
        "--target", type=str
    )  # in this script we ask user to explicitly name the target

    args, _ = parser.parse_known_args()

    print("reading data")
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    print("building training and testing datasets")
    X_train = train_df[args.features.split()]
    X_test = test_df[args.features.split()]
    y_train = train_df[args.target]
    y_test = test_df[args.target]

    # train
    print("training model")
    model = RandomForestRegressor(
        n_estimators=args.n_estimators, min_samples_leaf=args.min_samples_leaf, n_jobs=-1
    )

    model.fit(X_train, y_train)

    # print abs error
    print("validating model")
    abs_err = np.abs(model.predict(X_test) - y_test)

    # print couple perf metrics
    for q in [10, 50, 90]:
        print("AE-at-" + str(q) + "th-percentile: " + str(np.percentile(a=abs_err, q=q)))

    # persist model
    path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, path)
    print("model persisted at " + path)
    print(args.min_samples_leaf)

Overwriting Code/script.py


In [11]:
! python Code/script.py --n-estimators 100 \
                   --min-samples-leaf 2 \
                   --model-dir ./ \
                   --train ./Model_Monitor_Data \
                   --test ./Model_Monitor_Data \
                   --features 'AdjSquareFeet DistancetoCoast DistancetoSinkhole DistancetoFireDepartment LocationWindSpeed ValueofHome NumberOfBuildings NumberOfUnits Age Terrain_B Terrain_C Terrain_HVHZ' \
                   --target 'ValueofHome'

extracting arguments
reading data
building training and testing datasets
training model
validating model
AE-at-10th-percentile: 0.26372738094651144
AE-at-50th-percentile: 2.477779761866259
AE-at-90th-percentile: 19.87827142852475
model persisted at ./model.joblib
2


In [15]:
# We use the Estimator from the SageMaker Python SDK
from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = "0.23-1"

sklearn_estimator = SKLearn(
    entry_point="Code/script.py",
    role=get_execution_role(),
    instance_count=1,
    instance_type="ml.c5.xlarge",
    framework_version=FRAMEWORK_VERSION,
    base_job_name="rf-scikit",
    metric_definitions=[{"Name": "median-AE", "Regex": "AE-at-50th-percentile: ([0-9.]+).*$"}],
    hyperparameters={
        "n-estimators": 100,
        "min-samples-leaf": 3,
        "features": "AdjSquareFeet DistancetoCoast DistancetoSinkhole DistancetoFireDepartment LocationWindSpeed ValueofHome NumberOfBuildings NumberOfUnits Age Terrain_B Terrain_C Terrain_HVHZ",
        "target": "ValueofHome",
    },
)

In [16]:
# launch training job, with asynchronous call
sklearn_estimator.fit({"train": trainpath, "test": testpath}, wait=True)

INFO:sagemaker:Creating training-job with name: rf-scikit-2024-06-21-22-55-18-451


2024-06-21 22:55:19 Starting - Starting the training job...
2024-06-21 22:55:34 Starting - Preparing the instances for training...
2024-06-21 22:56:06 Downloading - Downloading input data...
2024-06-21 22:56:31 Downloading - Downloading the training image...
2024-06-21 22:57:17 Training - Training image download completed. Training in progress.
2024-06-21 22:57:17 Uploading - Uploading generated training model[34m2024-06-21 22:57:00,823 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2024-06-21 22:57:00,826 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-06-21 22:57:00,867 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2024-06-21 22:57:01,044 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-06-21 22:57:01,055 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-

In [17]:
# Launching a tuning Job
# we use the Hyperparameter Tuner
from sagemaker.tuner import IntegerParameter

# Define exploration boundaries
hyperparameter_ranges = {
    "n-estimators": IntegerParameter(20, 100),
    "min-samples-leaf": IntegerParameter(2, 6),
}

# create Optimizer
Optimizer = sagemaker.tuner.HyperparameterTuner(
    estimator=sklearn_estimator,
    hyperparameter_ranges=hyperparameter_ranges,
    base_tuning_job_name="RF-tuner",
    objective_type="Minimize",
    objective_metric_name="median-AE",
    metric_definitions=[
        {"Name": "median-AE", "Regex": "AE-at-50th-percentile: ([0-9.]+).*$"}
    ],  # extract tracked metric from logs with regexp
    max_jobs=10,
    max_parallel_jobs=2,
)

In [18]:
Optimizer.fit({"train": trainpath, "test": testpath})

INFO:sagemaker:Creating hyperparameter tuning job with name: RF-tuner-240621-2259


................................................................!


In [19]:
# get tuner results in a df
results = Optimizer.analytics().dataframe()
while results.empty:
    time.sleep(1)
    results = Optimizer.analytics().dataframe()
results.head()

Unnamed: 0,min-samples-leaf,n-estimators,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
0,2.0,97.0,RF-tuner-240621-2259-010-ad827589,Completed,2.500413,2024-06-21 23:04:10+00:00,2024-06-21 23:04:49+00:00,39.0
1,2.0,98.0,RF-tuner-240621-2259-009-82b0bb31,Completed,2.55294,2024-06-21 23:04:03+00:00,2024-06-21 23:04:42+00:00,39.0
2,2.0,99.0,RF-tuner-240621-2259-008-b350b022,Completed,2.478114,2024-06-21 23:03:20+00:00,2024-06-21 23:03:58+00:00,38.0
3,2.0,100.0,RF-tuner-240621-2259-007-3b11ea33,Completed,2.499127,2024-06-21 23:03:11+00:00,2024-06-21 23:03:50+00:00,39.0
4,5.0,79.0,RF-tuner-240621-2259-006-f0020d8a,Completed,3.876701,2024-06-21 23:02:29+00:00,2024-06-21 23:03:02+00:00,33.0


In [None]:
# Deploy to Endpoint

In [20]:
sklearn_estimator.latest_training_job.wait(logs="None")
artifact = sm_boto3.describe_training_job(
    TrainingJobName=sklearn_estimator.latest_training_job.name
)["ModelArtifacts"]["S3ModelArtifacts"]

print("Model artifact persisted at " + artifact)


2024-06-21 22:57:29 Starting - Preparing the instances for training
2024-06-21 22:57:29 Downloading - Downloading the training image
2024-06-21 22:57:29 Training - Training image download completed. Training in progress.
2024-06-21 22:57:29 Uploading - Uploading generated training model
2024-06-21 22:57:29 Completed - Training job completed
Model artifact persisted at s3://sagemaker-us-east-1-004608622582/rf-scikit-2024-06-21-22-55-18-451/output/model.tar.gz


In [21]:
from sagemaker.sklearn.model import SKLearnModel

model = SKLearnModel(
    model_data=artifact,
    role=get_execution_role(),
    entry_point="Code/script.py",
    framework_version=FRAMEWORK_VERSION,
)

In [22]:
predictor = model.deploy(instance_type="ml.c5.large", initial_instance_count=1)

INFO:sagemaker:Creating model with name: sagemaker-scikit-learn-2024-06-21-23-06-38-426
INFO:sagemaker:Creating endpoint-config with name sagemaker-scikit-learn-2024-06-21-23-06-39-150
INFO:sagemaker:Creating endpoint with name sagemaker-scikit-learn-2024-06-21-23-06-39-150


------!

In [24]:
# the SKLearnPredictor does the serialization from pandas for us
print(predictor.predict(testX[df.columns]))

[716994.45123138 269508.27408369  68402.84226371 ... 188246.29183261
  99525.45803968 115999.21222367]


In [26]:
runtime = boto3.client("sagemaker-runtime")

In [27]:
# csv serialization
response = runtime.invoke_endpoint(
    EndpointName=predictor.endpoint,
    Body=testX[df.columns].to_csv(header=False, index=False).encode("utf-8"),
    ContentType="text/csv",
)

print(response["Body"].read())

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


b'[716994.45123138, 269508.274083694, 68402.8422637085, 147099.7999166667, 150850.0810525308, 604812.1605714287, 142689.68211327566, 705234.0642002164, 155009.2934549062, 86100.76695238095, 158373.78468506492, 144245.24737620712, 115381.15656152181, 110846.94919011544, 147652.73316666667, 68232.47994516595, 965713.4293315848, 287652.71460245305, 205917.21922619047, 263258.7128235931, 137697.8924468864, 167500.64716630592, 140200.07533730156, 231245.38171031748, 137431.9188031413, 178404.92343290045, 524221.71531990246, 276105.8237590188, 516496.3154469697, 163111.639283189, 308181.1102712844, 340372.95514285716, 346893.542031746, 812146.5082514429, 262980.63966208795, 335912.58664105344, 204329.76985714288, 887156.4272741707, 208699.4788218726, 84186.4074512987, 144400.09259126987, 190542.56176190486, 120960.08981746029, 218178.1913203463, 178938.00429226336, 154261.6366872294, 339349.9590300811, 223497.39542857136, 226375.01305627706, 171910.51479581534, 119150.49576984128, 102058.970

In [28]:
sm_boto3.delete_endpoint(EndpointName=predictor.endpoint)

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


{'ResponseMetadata': {'RequestId': '350e4baf-ffd3-47c8-9311-8c3f933d0d12',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '350e4baf-ffd3-47c8-9311-8c3f933d0d12',
   'content-type': 'application/x-amz-json-1.1',
   'date': 'Sat, 22 Jun 2024 00:22:50 GMT',
   'content-length': '0'},
  'RetryAttempts': 0}}

In [29]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>