In [1]:
import os
import boto3
import re
import sagemaker
import pandas as pd 


# Region of the current boto3 session and the IAM role SageMaker will run under.
region = boto3.Session().region_name
role = sagemaker.get_execution_role()

# Where the training data is read from.
# Point these at a different bucket/prefix to use your own data.
data_prefix = "1p-notebooks-datasets/abalone/text-csv"
data_bucket = f"jumpstart-cache-prod-{region}"

# Where code and model artifacts are written.
# Point these at a different bucket/prefix if the session default is not wanted.
sm_session = sagemaker.Session()
output_bucket = sm_session.default_bucket()
output_prefix = "workshop/linear-learner-abalone-regression"

In [2]:
# Show which bucket will receive the code and model artifacts.
print(f"Output bucket: {output_bucket}")

Output bucket: sagemaker-us-east-1-639556434474


In [3]:
FILE_TRAIN = "abalone_dataset1_train.csv"

# Column labels applied to the CSV when it is read; the target ("age") comes first.
ABALONE_COLUMNS = [
    "age",
    "sex",
    "Length",
    "Diameter",
    "Height",
    "Whole.weight",
    "Shucked.weight",
    "Viscera.weight",
    "Shell.weight",
]

# Fetch a local copy of the training file so it can be inspected with pandas.
s3 = boto3.client("s3")
train_key = f"{data_prefix}/train/{FILE_TRAIN}"
s3.download_file(data_bucket, train_key, FILE_TRAIN)

df = pd.read_csv(FILE_TRAIN, sep=",", encoding="latin1", names=ABALONE_COLUMNS)

# Peek at the first row only.
print(df.head(1))

   age  sex  Length  Diameter  Height  Whole.weight  Shucked.weight  \
0    8    2   0.615      0.48    0.16        1.2525           0.585   

   Viscera.weight  Shell.weight  
0          0.2595          0.33  


In [4]:
# S3 URIs consumed by fit(): the train and validation prefixes in the data
# bucket, plus the prefix in the output bucket where artifacts are stored.
s3_train_data = f"s3://{data_bucket}/{data_prefix}/train"
s3_validation_data = f"s3://{data_bucket}/{data_prefix}/validation"
output_location = f"s3://{output_bucket}/{output_prefix}/output"

# Echo the three locations so they can be checked before training starts.
print(f"training files will be taken from: {s3_train_data}")
print(f"validtion files will be taken from: {s3_validation_data}")
print(f"training artifacts output location: {output_location}")

training files will be taken from: s3://jumpstart-cache-prod-us-east-1/1p-notebooks-datasets/abalone/text-csv/train
validtion files will be taken from: s3://jumpstart-cache-prod-us-east-1/1p-notebooks-datasets/abalone/text-csv/validation
training artifacts output location: s3://sagemaker-us-east-1-639556434474/workshop/linear-learner-abalone-regression/output


In [5]:
# Wrap the train and validation prefixes as TrainingInput channels for fit().
# Both channels share the same settings: uncompressed CSV read from an S3 prefix.
csv_channel_kwargs = dict(
    content_type="text/csv",
    s3_data_type="S3Prefix",
    compression=None,
)

train_data = sagemaker.inputs.TrainingInput(s3_train_data, **csv_channel_kwargs)
validation_data = sagemaker.inputs.TrainingInput(s3_validation_data, **csv_channel_kwargs)

In [6]:
# Resolve the linear-learner container image URI for the session's region.
from sagemaker.image_uris import retrieve

current_region = boto3.Session().region_name
container = retrieve(framework="linear-learner", region=current_region, version="1")
print(container)

382416733822.dkr.ecr.us-east-1.amazonaws.com/linear-learner:1


In [7]:
%%time
import boto3
import sagemaker
from time import gmtime, strftime

sess = sagemaker.Session()

job_name = "workshop-linear-regression-" + strftime("%H-%M-%S", gmtime())
print("Training job", job_name)

linear = sagemaker.estimator.Estimator(
    container,
    role,
    input_mode="File",
    instance_count=1,
    instance_type="ml.m5.xlarge",
    output_path=output_location,
    sagemaker_session=sess,
    tags=[
        {
            "Key":"Workshop",
            "Value":"Sagemaker"
        },
        {
            "Key": "Grupo",
            "Value": "Workshop"
        }
    ]
)

linear.set_hyperparameters(
    feature_dim=8,
    epochs=16,
    wd=0.01,
    loss="absolute_loss",
    predictor_type="regressor",
    normalize_data=True,
    optimizer="adam",
    mini_batch_size=100,
    lr_scheduler_step=100,
    lr_scheduler_factor=0.99,
    lr_scheduler_minimum_lr=0.0001,
    learning_rate=0.1,
)

Training job workshop-linear-regression-03-50-02
CPU times: user 27.7 ms, sys: 3.37 ms, total: 31.1 ms
Wall time: 32.3 ms


In [8]:
%%time
linear.fit(inputs={"train": train_data, "validation": validation_data}, job_name=job_name)

2021-03-18 03:50:02 Starting - Starting the training job...
2021-03-18 03:50:26 Starting - Launching requested ML instancesProfilerReport-1616039402: InProgress
.........
2021-03-18 03:51:47 Starting - Preparing the instances for training...
2021-03-18 03:52:27 Downloading - Downloading input data
2021-03-18 03:52:27 Training - Downloading the training image...
2021-03-18 03:52:58 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[03/18/2021 03:53:02 INFO 139636952454976] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'loss_insensitivity': u'0.01', u'epochs': u'15', u'feature_dim': u'auto', u'init_bias': u'0.0', u'lr_scheduler_factor': u'auto', u'num_calibration_samples': u'10000000', u'accuracy_top_k': u'3', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'num_point_for_scale

In [9]:
%%time
# creating the endpoint out of the trained model
linear_predictor = linear.deploy(initial_instance_count=1, instance_type="ml.c4.xlarge")
print(f"\ncreated endpoint: {linear_predictor.endpoint_name}")

---------------!
created endpoint: linear-learner-2021-03-18-03-53-45-411
CPU times: user 240 ms, sys: 22.1 ms, total: 262 ms
Wall time: 7min 32s


In [10]:
# Configure the predictor to serialize requests as CSV and to parse the response as JSON.
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

linear_predictor.serializer = CSVSerializer()
linear_predictor.deserializer = JSONDeserializer()

In [11]:
%%time
import json
from itertools import islice
import math
import struct
import boto3
import random

# downloading the test file from data_bucket
FILE_TEST = "abalone_dataset1_test.csv"
s3 = boto3.client("s3")
s3.download_file(data_bucket, f"{data_prefix}/test/{FILE_TEST}", FILE_TEST)

# getting testing sample from our test file
test_data = [l for l in open(FILE_TEST, "r")]
sample = random.choice(test_data).split(",")
actual_age = sample[0]
payload = sample[1:]  # removing actual age from the sample
payload = ",".join(map(str, payload))

CPU times: user 24.5 ms, sys: 10.2 ms, total: 34.6 ms
Wall time: 182 ms


In [12]:
# Print the payload through json.dumps so it appears quoted, with any control
# characters escaped and therefore visible.
print(f"payload: {json.dumps(payload)}")

payload: "3,0.52,0.41,0.14,0.5995,0.242,0.1375,0.182\n"


In [13]:
# Show the full parsed row: the first field is the actual age, the rest are the features.
sample

['11', '3', '0.52', '0.41', '0.14', '0.5995', '0.242', '0.1375', '0.182\n']

In [14]:
# Invoke the predictor with the CSV payload; the response is parsed by the
# JSON deserializer configured on the predictor.
result = linear_predictor.predict(payload)

In [15]:
# Replace the endpoint response with the first prediction's score, rounded to
# two decimals. This cell rebinds `result`, so guard on its type: on a re-run
# `result` is already the float score and subscripting it would raise TypeError.
if not isinstance(result, float):
    result = round(float(result["predictions"][0]["score"]), 2)

In [16]:
# Display the predicted score (already rounded to two decimals).
result

9.22

In [17]:
# Score this single prediction as 100 minus the absolute percentage error
# relative to the true age, rounded to two decimals.
true_age = float(actual_age)
pct_error = (abs(float(result) - true_age) / true_age) * 100
accuracy = str(round(100 - pct_error, 2))
print(f"Actual age: {actual_age}\nPrediction: {result}\nAccuracy: {accuracy}")

Actual age: 11
Prediction: 9.22
Accuracy: 83.82
