In [1]:
import sagemaker
from sagemaker import get_execution_role

sagemaker_session = sagemaker.Session()

# Get a SageMaker-compatible role used by this Notebook Instance.
role = get_execution_role()

# S3 prefix
bucket = "abalone-data-3344"
prefix = "Scikit-LinearLearner-pipeline-abalone-example"

In [7]:
# Get the path of the train file
filename_train  = 'abalone.csv'
train_input = 's3://{}/{}/train/{}'.format(bucket, prefix, filename_train) 
train_input

's3://abalone-data-3344/Scikit-LinearLearner-pipeline-abalone-example/train/abalone.csv'

In [10]:
import pandas as pd
df = pd.read_csv(train_input)
df.shape

(4176, 9)

In [3]:
from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = "1.0-1"
script_path = "sklearn_abalone_featurizer.py"

sklearn_preprocessor = SKLearn(
    entry_point = script_path,
    role = role,
    framework_version = FRAMEWORK_VERSION,
    instance_type = 'ml.c4.xlarge',
    sagemaker_session = sagemaker_session,
)

In [4]:
sklearn_preprocessor.fit({"train":train_input})

2022-12-13 20:27:18 Starting - Starting the training job...
2022-12-13 20:27:44 Starting - Preparing the instances for trainingProfilerReport-1670963238: InProgress
............
2022-12-13 20:29:40 Downloading - Downloading input data...
2022-12-13 20:30:20 Training - Training image download completed. Training in progress..[34m2022-12-13 20:30:15,677 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2022-12-13 20:30:15,680 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-12-13 20:30:15,689 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2022-12-13 20:30:15,846 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-12-13 20:30:15,857 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-12-13 20:30:15,869 sagemaker-training-toolkit INFO     No GPUs detected (normal if no g

In [5]:
# Define a SKLearn Transformer from the trained SKLearn Estimator
transformer = sklearn_preprocessor.transformer(
    instance_count=1, instance_type="ml.m5.xlarge", assemble_with="Line", accept="text/csv"
)

In [6]:
# Preprocess training input
transformer.transform(train_input, content_type="text/csv")
print("Waiting for transform job: " + transformer.latest_transform_job.job_name)
transformer.wait()
preprocessed_train = transformer.output_path

ResourceLimitExceeded: An error occurred (ResourceLimitExceeded) when calling the CreateTransformJob operation: The account-level service limit 'ml.m5.xlarge for transform job usage' is 0 Instances, with current utilization of 0 Instances and a request delta of 1 Instances. Please contact AWS support to request an increase for this limit.

In [None]:
import boto3
from sagemaker.image_uris import retrieve

ll_image = retrieve("linear-learner", boto3.Session().region_name)

In [None]:
s3_ll_output_key_prefix = "ll_training_output"
s3_ll_output_location = "s3://{}/{}/{}/{}".format(
    bucket, prefix, s3_ll_output_key_prefix, "ll_model"
)

ll_estimator = sagemaker.estimator.Estimator(
    ll_image,
    role,
    instance_count=1,
    instance_type="ml.m4.2xlarge",
    volume_size=20,
    max_run=3600,
    input_mode="File",
    output_path=s3_ll_output_location,
    sagemaker_session=sagemaker_session,
)

ll_estimator.set_hyperparameters(feature_dim=10, predictor_type="regressor", mini_batch_size=32)

ll_train_data = sagemaker.inputs.TrainingInput(
    preprocessed_train,
    distribution="FullyReplicated",
    content_type="text/csv",
    s3_data_type="S3Prefix",
)

data_channels = {"train": ll_train_data}
# ll_estimator.fit(inputs=data_channels, logs=True)

In [None]:
from sagemaker.predictor import Predictor

In [None]:
Predictor.predict()