## SETUP SAGEMAKER

In [2]:
# NOTE(review): `bucket` is assigned by sagemaker_session.default_bucket() in the setup
# cell below, so on a fresh kernel that cell has to run before this one.
bucket

'sagemaker-us-east-1-428881646170'

In [1]:
import sagemaker
from sagemaker import get_execution_role

# Key prefix under which this example keeps its objects in S3.
prefix = "Scikit-LinearLearner-pipeline-abalone-example"

# SageMaker-compatible IAM role used by this notebook instance.
role = get_execution_role()

# Session shared by every upload, training, transform and deploy call in the notebook,
# and the session's default bucket that all S3 paths are built on.
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

## PREPROCESSING

#### Get Dataset

In [4]:
!wget --directory-prefix=./abalone_data https://s3-us-west-2.amazonaws.com/sparkml-mleap/data/abalone/abalone.csv

--2021-08-05 17:25:04--  https://s3-us-west-2.amazonaws.com/sparkml-mleap/data/abalone/abalone.csv
Resolving s3-us-west-2.amazonaws.com (s3-us-west-2.amazonaws.com)... 52.218.218.0
Connecting to s3-us-west-2.amazonaws.com (s3-us-west-2.amazonaws.com)|52.218.218.0|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 191873 (187K) [binary/octet-stream]
Saving to: ‘./abalone_data/abalone.csv’


2021-08-05 17:25:05 (998 KB/s) - ‘./abalone_data/abalone.csv’ saved [191873/191873]



In [9]:
# List the working directory to check what is present after the download.
!ls

abalone_data  Abalone_example.ipynb  lost+found


#### Upload the data for training

In [10]:
# Local folder that holds the downloaded dataset.
WORK_DIRECTORY = "abalone_data"

# Upload abalone.csv under <prefix>/train in the session bucket; the return value is
# what the later training and batch-transform cells take as their input.
train_input = sagemaker_session.upload_data(
    path=f"{WORK_DIRECTORY}/abalone.csv",
    key_prefix=f"{prefix}/train",
    bucket=bucket,
)


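#### Create script: sklearn_abalone_featurizer.py

The next cell passes `sklearn_abalone_featurizer.py` to the estimator as its `entry_point`, so the script must exist in the notebook's working directory before that cell is run.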

## Create SageMaker Scikit Preprocessor

In [11]:

from sagemaker.sklearn.estimator import SKLearn

# Scikit-learn container version and the featurizer script used as the job's entry point.
FRAMEWORK_VERSION = "0.23-1"
script_path = "sklearn_abalone_featurizer.py"

# Estimator that runs the featurizer script as a SageMaker training job.
sklearn_preprocessor = SKLearn(
    entry_point=script_path,
    framework_version=FRAMEWORK_VERSION,
    instance_type="ml.c4.xlarge",
    sagemaker_session=sagemaker_session,
    role=role,
)

In [12]:
# Fit the preprocessor on the uploaded CSV, supplied through the "train" channel.
sklearn_preprocessor.fit(inputs={"train": train_input})

2021-08-05 17:45:23 Starting - Starting the training job...
2021-08-05 17:45:48 Starting - Launching requested ML instancesProfilerReport-1628185522: InProgress
......
2021-08-05 17:46:48 Starting - Preparing the instances for training......
2021-08-05 17:47:50 Downloading - Downloading input data...
2021-08-05 17:48:08 Training - Downloading the training image..[34m2021-08-05 17:48:34,530 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2021-08-05 17:48:34,533 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-08-05 17:48:34,542 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2021-08-05 17:48:34,931 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-08-05 17:48:34,943 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-08-05 17:48:34,955 sagemaker-training-toolkit IN

### Batch transform Job for our training data

In [16]:
# Batch transformer backed by the fitted preprocessor: a single ml.c4.xlarge instance,
# CSV accepted as the output type, results assembled with the "Line" strategy.
transformer = sklearn_preprocessor.transformer(
    instance_type="ml.c4.xlarge",
    instance_count=1,
    accept="text/csv",
    assemble_with="Line",
)

In [17]:
# Run the preprocessor over the raw training CSV as a batch transform job.
transformer.transform(data=train_input, content_type="text/csv")
print(f"Waiting for transform job: {transformer.latest_transform_job.job_name}")
transformer.wait()

# Output location of the transform job; the Linear Learner training cell reads from here.
preprocessed_train = transformer.output_path

..................................[34m2021-08-05 18:06:01,093 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[35m2021-08-05 18:06:01,093 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2021-08-05 18:06:01,096 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2021-08-05 18:06:01,096 INFO - sagemaker-containers - nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;
[0m
[34mworker_rlimit_nofile 4096;
[0m
[34mevents {
  worker_connections 2048;[0m
[34m}
[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;

  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }

  server {
    listen 8080 deferred;
    client_max_body_size 0;

    keepalive_timeout 3;

    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_

In [18]:
# Show the transformer output path captured after the batch transform job.
preprocessed_train

's3://sagemaker-us-east-1-428881646170/sagemaker-scikit-learn-2021-08-05-18-00-36-214'

## Fit a LinearLearner Model with the preprocessed data 

In [19]:
import boto3
from sagemaker.image_uris import retrieve

# Look up the Linear Learner container image for the region of the default boto3 session.
ll_image = retrieve(framework="linear-learner", region=boto3.Session().region_name)

In [21]:
# Show the image URI that retrieve() returned for Linear Learner.
ll_image

'382416733822.dkr.ecr.us-east-1.amazonaws.com/linear-learner:1'

In [20]:
# Output location handed to the Linear Learner estimator: <bucket>/<prefix>/ll_training_output/ll_model.
s3_ll_output_key_prefix = "ll_training_output"
s3_ll_output_location = f"s3://{bucket}/{prefix}/{s3_ll_output_key_prefix}/ll_model"

# Generic estimator wrapped around the built-in Linear Learner image.
ll_estimator = sagemaker.estimator.Estimator(
    image_uri=ll_image,
    role=role,
    sagemaker_session=sagemaker_session,
    output_path=s3_ll_output_location,
    instance_type="ml.m4.2xlarge",
    instance_count=1,
    volume_size=20,
    max_run=3600,
    input_mode="File",
)

# Train a regressor; feature_dim is fixed at 10 here.
# NOTE(review): presumably 10 is the column count the featurizer emits - confirm against the script.
ll_estimator.set_hyperparameters(
    predictor_type="regressor",
    feature_dim=10,
    mini_batch_size=32,
)

# Feed the batch-transform output to the "train" channel as CSV.
ll_train_data = sagemaker.inputs.TrainingInput(
    s3_data=preprocessed_train,
    content_type="text/csv",
    s3_data_type="S3Prefix",
    distribution="FullyReplicated",
)

data_channels = {"train": ll_train_data}
ll_estimator.fit(inputs=data_channels, logs=True)

2021-08-05 18:19:29 Starting - Starting the training job...
2021-08-05 18:19:52 Starting - Launching requested ML instancesProfilerReport-1628187568: InProgress
...
2021-08-05 18:20:25 Starting - Preparing the instances for training.........
2021-08-05 18:21:57 Downloading - Downloading input data.........
2021-08-05 18:23:17 Training - Downloading the training image..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[08/05/2021 18:23:42 INFO 140223054878528] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-input.json: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '10000000', 'init_method': 'uniform', 'init_scale': '0.07', 'init_sigma': '0.01', 'init_

## CREATE PIPELINE

### Setup pipeline

In [22]:
from sagemaker.model import Model
from sagemaker.pipeline import PipelineModel
import boto3
from time import gmtime, strftime

# A UTC timestamp suffix distinguishes this run's model and endpoint names.
timestamp_prefix = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
model_name = f"inference-pipeline-{timestamp_prefix}"
endpoint_name = f"inference-pipeline-ep-{timestamp_prefix}"

# Models built from the two fitted estimators, listed in pipeline order:
# scikit-learn preprocessor first, Linear Learner second.
scikit_learn_inferencee_model = sklearn_preprocessor.create_model()
linear_learner_model = ll_estimator.create_model()

sm_model = PipelineModel(
    models=[scikit_learn_inferencee_model, linear_learner_model],
    role=role,
    name=model_name,
)

# Deploy the pipeline model to one ml.c4.xlarge instance under the timestamped endpoint name.
sm_model.deploy(
    endpoint_name=endpoint_name,
    instance_type="ml.c4.xlarge",
    initial_instance_count=1,
)

-----------------!

In [23]:
# Show the timestamped name given to the pipeline model.
model_name

'inference-pipeline-2021-08-05-19-04-05'

In [24]:
# Show the timestamped name of the endpoint the pipeline was deployed to.
endpoint_name

'inference-pipeline-ep-2021-08-05-19-04-05'

## MAKE REQUEST

In [25]:
from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer

# A single CSV record sent to the pipeline endpoint.
payload = "M, 0.44, 0.365, 0.125, 0.516, 0.2155, 0.114, 0.155"
# NOTE(review): actual_rings is not used below; presumably the true label to compare
# against the predicted score - confirm.
actual_rings = 10

# Client for the deployed endpoint; requests are serialized as CSV.
predictor = Predictor(
    endpoint_name=endpoint_name,
    serializer=CSVSerializer(),
    sagemaker_session=sagemaker_session,
)

print(predictor.predict(payload))

b'{"predictions": [{"score": 9.528051376342773}]}'
