### Imports

In [5]:
import boto3
import sagemaker
import pickle
import pandas as pd

from time import gmtime, strftime

from sagemaker import get_execution_role
from sagemaker.model import Model
from sagemaker.pipeline import PipelineModel
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.amazon.amazon_estimator import get_image_uri
from sklearn.externals import joblib

### Environment Setup

In [6]:
sagemaker_session = sagemaker.Session()
role = get_execution_role()

# Get headerless CSV
KEY = 'train_no_header.csv'
BUCKET = 'bf-titanic-data'
s3 = boto3.resource('s3')
s3.Bucket(BUCKET).download_file(KEY, './titanic_data/train_no_header.csv')

### Endpoint Functions

In [7]:
# Format prediction output
def output_fn(prediction, accept):
    if accept == "application/json":
        instances = []
        for row in prediction.tolist():
            instances.append({"features": row})

        json_output = {"instances": instances}

        return worker.Response(json.dumps(json_output), accept, mimetype=accept)
    elif accept == 'text/csv':
        return worker.Response(encoders.encode(prediction, accept), accept, mimetype=accept)
    else:
        raise RuntimeException("{} accept type is not supported by this script.".format(accept))

In [8]:
#Parse input data payload
def input_fn(input_data, content_type):
    if content_type == 'text/csv':
        # Read the raw input data as CSV.
        df = pd.read_csv(StringIO(input_data), 
                         header=None)

        if len(df.columns) == len(feature_columns_names) + 1:
            # This is a labelled example, includes the ring label
            df.columns = feature_columns_names + [label_column]
        elif len(df.columns) == len(feature_columns_names):
            # This is an unlabelled example.
            df.columns = feature_columns_names

        return df
    else:
        raise ValueError("{} not supported by script!".format(content_type))

In [9]:
# Preprocess input data
def predict_fn(input_data, model):
    
    features = model.transform(input_data)

    if label_column in input_data:
        # Return the label (as the first column) and the set of features.
        return np.insert(features, 0, input_data[label_column], axis=1)
    else:
        # Return only the set of features
        return features

In [10]:
# Deserialize fitted model
def model_fn(model_dir):
    preprocessor = joblib.load(os.path.join(model_dir, "model.joblib"))
    return preprocessor

### Create Inference Pipeline

In [11]:
SCRIPT_PATH = 'feature_eng/titanic_featurizer.py'
WORK_DIRECTORY = 'titanic_data'
PREFIX = 'titanic-inference-pipeline'

train_data = sagemaker_session.upload_data(
    path='{}/{}'.format(WORK_DIRECTORY, 'train_no_header.csv'), 
    bucket=BUCKET,
    key_prefix='{}/{}'.format(PREFIX, 'train'))

sklearn_preprocessor = SKLearn(
    entry_point=SCRIPT_PATH,
    role=role,
    train_instance_type="ml.c4.xlarge",
    sagemaker_session=sagemaker_session)

sklearn_preprocessor.fit({'train': train_data})

This is not the latest supported version. If you would like to use version 0.23-1, please add framework_version=0.23-1 to your constructor.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


2020-07-25 00:03:06 Starting - Starting the training job...
2020-07-25 00:03:09 Starting - Launching requested ML instances.........
2020-07-25 00:04:53 Starting - Preparing the instances for training......
2020-07-25 00:05:46 Downloading - Downloading input data...
2020-07-25 00:06:22 Training - Downloading the training image..[34m2020-07-25 00:06:42,751 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2020-07-25 00:06:42,753 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-07-25 00:06:42,763 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2020-07-25 00:06:43,121 sagemaker-containers INFO     Module titanic_featurizer does not provide a setup.py. [0m
[34mGenerating setup.py[0m
[34m2020-07-25 00:06:43,121 sagemaker-containers INFO     Generating setup.cfg[0m
[34m2020-07-25 00:06:43,121 sagemaker-containers INFO     Generating MANIFEST.in[0m
[34m2020-07-

In [12]:
timestamp_prefix = strftime("%m/%d/%Y, %H:%M:%S")
model_name = 'production-model-v1-' + timestamp_prefix
endpoint_name = 'production-model-v1-ep-' + timestamp_prefix
prod_model = PipelineModel(
    name=model_name, 
    role=role,
    models=inference_model)

sklearn_preprocessor.fit({'train': train_input})

NameError: name 'inference_model' is not defined

### Batch Transform Training Data

In [None]:
# Define a SKLearn Transformer from the trained SKLearn Estimator
transformer = sklearn_preprocessor.transformer(
    instance_count=1, 
    instance_type='ml.m4.xlarge',
    assemble_with = 'Line',
    accept = 'text/csv')

# Preprocess training input
transformer.transform(train_input, content_type='text/csv')
print('Waiting for transform job: ' + transformer.latest_transform_job.job_name)
transformer.wait()
preprocessed_train = transformer.output_path