In [None]:
import sagemaker
from sagemaker import get_execution_role

# Key prefix under which this notebook's uploads are placed in S3.
prefix = 'Scikit-iris'

# Session object reused by the upload and estimator cells further down.
sagemaker_session = sagemaker.Session()

# SageMaker-compatible role used by this Notebook Instance; printed so the
# reader can see which role the rest of the notebook will run with.
role = get_execution_role()
print(role)

## Upload the data for training <a class="anchor" id="upload_data"></a>

When training large models with huge amounts of data, you'll typically use big data tools, like Amazon Athena, AWS Glue, or Amazon EMR, to create your data in S3. For the purposes of this example, we're using a sample of the classic [Iris dataset](https://en.wikipedia.org/wiki/Iris_flower_data_set), which is included with Scikit-learn. We will load the dataset, write it to local CSV files, and then upload those files to S3 so the training job can use them.

In [None]:
import numpy as np
import os
from sklearn import datasets
from sklearn.model_selection import train_test_split

# Local directories holding the train / test CSVs; the same names are reused
# as the final component of the S3 key prefix.
WORK_DIRECTORY = 'data'
TEST_DIRECTORY = 'test'

# Fixed seed so that Restart & Run All always produces the same train/test
# split (and therefore the same uploaded files and a comparable model).
SPLIT_SEED = 42

# One row per sample: label first, then the four features. Because this is a
# single multi-format string, np.savetxt uses it verbatim for every row and
# ignores any `delimiter` argument, so none is passed.
CSV_ROW_FORMAT = '%1.1f, %1.3f, %1.3f, %1.3f, %1.3f'

# Load Iris dataset, then join labels and features (label becomes column 0)
iris = datasets.load_iris()
joined_iris = np.insert(iris.data, 0, iris.target, axis=1)

train, test = train_test_split(joined_iris, test_size=0.25, random_state=SPLIT_SEED)

# Sanity check: the split must neither drop nor duplicate rows.
assert len(train) + len(test) == len(joined_iris)

# Create directories and write one CSV per split
os.makedirs(WORK_DIRECTORY, exist_ok=True)
os.makedirs(TEST_DIRECTORY, exist_ok=True)
np.savetxt(os.path.join(WORK_DIRECTORY, 'iris.csv'), train, fmt=CSV_ROW_FORMAT)
np.savetxt(os.path.join(TEST_DIRECTORY, 'iris.csv'), test, fmt=CSV_ROW_FORMAT)

# Upload each directory to S3 under "<prefix>/<directory>"; the returned
# values are printed below and later handed to the estimator's fit() call.
train_input = sagemaker_session.upload_data(WORK_DIRECTORY, key_prefix="{}/{}".format(prefix, WORK_DIRECTORY))
test_input = sagemaker_session.upload_data(TEST_DIRECTORY, key_prefix="{}/{}".format(prefix, TEST_DIRECTORY))
print('train: ' + train_input)
print('test: ' + test_input)

## Create SageMaker Scikit Estimator <a class="anchor" id="create_sklearn_estimator"></a>

To run our Scikit-learn training script on SageMaker, we construct a `sagemaker.sklearn.estimator.SKLearn` estimator, which accepts several constructor arguments:

* __entry_point__: The path to the Python script SageMaker runs for training and prediction.
* __role__: Role ARN
* __train_instance_type__ *(optional)*: The type of SageMaker instances for training. __Note__: Because Scikit-learn does not natively support GPU training, SageMaker Scikit-learn does not currently support training on GPU instance types.
* __sagemaker_session__ *(optional)*: The session used to train on SageMaker.
* __hyperparameters__ *(optional)*: A dictionary passed to the train function as hyperparameters.

To see the code for the SKLearn Estimator, see here: https://github.com/aws/sagemaker-python-sdk/tree/master/src/sagemaker/sklearn

In [None]:
from sagemaker.sklearn.estimator import SKLearn

# Training script handed to the estimator as its entry point.
script_path = 'scikit_learn_your_mission.py'

# NOTE: the estimator variable is named `sklearn`, the same as the
# scikit-learn package. Later `from sklearn... import ...` statements are
# unaffected, because imports do not consult notebook variables.
sklearn = SKLearn(
    role=role,
    sagemaker_session=sagemaker_session,
    entry_point=script_path,
    train_instance_type="ml.c4.xlarge",
    hyperparameters=dict(max_leaf_nodes=30, criterion='gini', splitter='random'),
)

## Train SKLearn Estimator on Iris data <a class="anchor" id="train_sklearn"></a>
Training is very simple: just call `fit` on the Estimator! This will start a SageMaker Training job that will download the data for us, invoke our scikit-learn code (in the provided script file), and save any model artifacts that the script creates.

In [None]:
# Start training on the data uploaded earlier. The dict keys ('train', 'test')
# name the two inputs; presumably the training script looks them up by these
# names, so confirm against the script before renaming them.
sklearn.fit({'train': train_input, 'test': test_input})

In [None]:
# Report the location of the trained model artifact (copied from S3 in the
# next cell) and the name of the most recent training job.
print('Model Location: ' + sklearn.model_data)
print('Job Name: ' + sklearn.latest_training_job.name)

In [None]:
!aws s3 cp $sklearn.model_data /tmp/model.tar.gz
!tar -xvzf /tmp/model.tar.gz -C /tmp

from sklearn.externals import joblib
cfl = joblib.load('/tmp/model.joblib')

In [None]:
# Predict a class for every row of the full Iris feature matrix with the
# locally loaded model. This includes the rows the model was trained on, so
# treat the output as a smoke test rather than an evaluation.
cfl.predict(iris.data)