### Imports

In [75]:
import json
import os
import pickle
from io import StringIO
from time import gmtime, strftime

import boto3
import numpy as np
import pandas as pd
import sagemaker
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.model import Model
from sagemaker.pipeline import PipelineModel
from sagemaker.sklearn.estimator import SKLearn

### Environment Setup

In [76]:
# SageMaker session and execution role shared by the cells below.
sagemaker_session = sagemaker.Session()
role = get_execution_role()

# Bucket name used to build the s3:// URI of the training data.
BUCKET = "bf-titanic-data"
# Path to the featurizer script.
# NOTE(review): not referenced by any cell visible here — confirm it is still needed.
SCRIPT_PATH = "/titanic-sagemaker-inference/titanic_featurizer.py"

### Endpoint Functions

In [77]:
# Format prediction output
def output_fn(prediction, accept):
    """Serialize the featurizer output for the requested response type.

    Args:
        prediction: Transformed data. The JSON branch calls ``tolist()`` on
            it; the CSV branch hands it to ``encoders.encode``.
        accept: Requested response MIME type.

    Returns:
        A ``worker.Response`` whose body is either a JSON document of the
        form ``{"instances": [{"features": row}, ...]}`` or the CSV encoding
        of ``prediction``.

    Raises:
        RuntimeError: If ``accept`` is neither ``"application/json"`` nor
            ``"text/csv"``.
    """
    # NOTE(review): `worker` and `encoders` are not imported anywhere in the
    # visible notebook; presumably they come from the SageMaker container
    # framework used by the featurizer script — confirm.
    if accept == "application/json":
        # One {"features": [...]} record per row of the prediction.
        instances = [{"features": row} for row in prediction.tolist()]
        json_output = {"instances": instances}

        return worker.Response(json.dumps(json_output), accept, mimetype=accept)
    elif accept == 'text/csv':
        return worker.Response(encoders.encode(prediction, accept), accept, mimetype=accept)
    else:
        # `RuntimeException` does not exist in Python; using it raised a
        # NameError that hid the real message from the caller.
        raise RuntimeError("{} accept type is not supported by this script.".format(accept))

In [78]:
# Parse input data payload
def input_fn(input_data, content_type):
    """Parse a CSV request payload into a DataFrame.

    Args:
        input_data: CSV text without a header row.
        content_type: MIME type of the payload; only ``'text/csv'`` is handled.

    Returns:
        A DataFrame. Its columns are named from ``feature_columns_names``
        (plus ``label_column`` when one extra column is present). If the
        column count matches neither layout, the columns are left as read.

    Raises:
        ValueError: If ``content_type`` is not ``'text/csv'``.
    """
    if content_type != 'text/csv':
        raise ValueError("{} not supported by script!".format(content_type))

    # The payload carries no header row; names are assigned below.
    frame = pd.read_csv(StringIO(input_data), header=None)

    column_count = len(frame.columns)
    feature_count = len(feature_columns_names)

    if column_count == feature_count + 1:
        # Labelled example: the extra trailing column is the label.
        frame.columns = feature_columns_names + [label_column]
    elif column_count == feature_count:
        # Unlabelled example: features only.
        frame.columns = feature_columns_names

    return frame

In [79]:
# Preprocess input data
def predict_fn(input_data, model):
    """Run the fitted preprocessor over the parsed input.

    Args:
        input_data: Parsed input; may or may not contain ``label_column``.
        model: Object exposing ``transform(input_data)``.

    Returns:
        The transformed features. When ``label_column`` is present in
        ``input_data``, its values are inserted as the first column.
    """
    transformed = model.transform(input_data)

    if label_column not in input_data:
        # Unlabelled input: the features are the whole result.
        return transformed

    # Labelled input: put the label in front of the features.
    return np.insert(transformed, 0, input_data[label_column], axis=1)

In [80]:
# Deserialize fitted model
def model_fn(model_dir):
    """Load the fitted preprocessor stored as ``model.joblib`` in ``model_dir``.

    Args:
        model_dir: Directory containing the ``model.joblib`` artifact.

    Returns:
        The object returned by ``joblib.load`` for that artifact.
    """
    # NOTE(review): `joblib` is not imported in the visible notebook — confirm
    # where it is expected to come from.
    artifact_path = os.path.join(model_dir, "model.joblib")
    return joblib.load(artifact_path)

### Create Inference Pipeline

In [81]:
# Local path of the featurizer script passed to the estimator as its entry point.
# NOTE(review): hard-coded absolute path, and different from SCRIPT_PATH in the
# environment cell — confirm which location is correct.
script_path = '/home/ec2-user/sample-notebooks/sagemaker-python-sdk/scikit_learn_inference_pipeline/titanic_featurizer.py'

train_key = 'train.csv'
train_path = 's3://{}/{}'.format(BUCKET, train_key)
# In-notebook copy of the training data; it is not what the training job consumes.
train_data = pd.read_csv(train_path)

sklearn_preprocessor = SKLearn(
    entry_point=script_path,
    role=role,
    train_instance_type="ml.c4.xlarge",
    sagemaker_session=sagemaker_session)

# The training job reads its channel from S3, so `fit` needs the S3 URI.
# The previous code passed the pandas DataFrame, which is not a valid input.
sklearn_preprocessor.fit({'train': train_path})

This is not the latest supported version. If you would like to use version 0.23-1, please add framework_version=0.23-1 to your constructor.


FileNotFoundError: [Errno 2] No such file or directory: '/titanic-sagemaker-inference/titanic_featurizer.py'

In [None]:
# SageMaker model and endpoint names may contain only alphanumerics and hyphens.
# The previous "%m/%d/%Y, %H:%M:%S" format produced '/', ',', ' ' and ':'.
# Use a hyphen-only UTC timestamp instead.
timestamp_prefix = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
model_name = 'inference-pipeline-' + timestamp_prefix
endpoint_name = 'inference-pipeline-ep-' + timestamp_prefix

# The estimator must be fitted before a model can be created from it.
# `train_input` was never defined, so the S3 URI `train_path` is used directly.
sklearn_preprocessor.fit({'train': train_path})

# `inference_model` was referenced but never defined in this notebook.
# NOTE(review): assumed to be the model built from the fitted preprocessor.
# Append any downstream predictor model to the list below — confirm intent.
inference_model = sklearn_preprocessor.create_model()

# `models` takes the ordered list of pipeline stages.
prod_model = PipelineModel(
    name=model_name,
    role=role,
    models=[inference_model])

### Batch Transform Training Data

In [None]:
# Define a SKLearn Transformer from the trained SKLearn Estimator
transformer = sklearn_preprocessor.transformer(
    instance_count=1,
    instance_type='ml.m4.xlarge',
    assemble_with='Line',
    accept='text/csv')

# Preprocess the training input. `train_input` was never defined in this
# notebook, so the job is pointed at the S3 URI of the training CSV instead.
transformer.transform(train_path, content_type='text/csv')
print('Waiting for transform job: ' + transformer.latest_transform_job.job_name)
transformer.wait()

# Output location reported by the transformer for the preprocessed data.
preprocessed_train = transformer.output_path