In [1]:
import numpy as np
import os
from sklearn import datasets

# Load the iris dataset that ships with scikit-learn.
iris = datasets.load_iris()

# Put the class label in column 0 and the feature columns after it, so a
# single CSV file carries both the targets and the inputs.
joined_iris = np.column_stack((iris.target, iris.data))

# Make sure the output directory exists before writing the file.
os.makedirs('./data', exist_ok=True)

# One format specifier per column: the label first, then the four features.
row_format = '%1.1f, %1.3f, %1.3f, %1.3f, %1.3f'
np.savetxt('./data/iris.csv', joined_iris, delimiter=',', fmt=row_format)

In [2]:
# Set up the SageMaker session and look up the execution role that the
# later upload, training and deployment cells pass to the SDK.
import sagemaker
from sagemaker import get_execution_role

role = get_execution_role()
sagemaker_session = sagemaker.Session()

In [3]:
# Upload the local data directory through the SageMaker session.
# The returned value is kept in train_input and handed to fit() later.
prefix = 'scikit-iris'
work_directory = 'data'
s3_key_prefix = '/'.join([prefix, work_directory])  # -> 'scikit-iris/data'
train_input = sagemaker_session.upload_data(work_directory, key_prefix=s3_key_prefix)

In [4]:
# Configure the SageMaker scikit-learn estimator.
from sagemaker.sklearn.estimator import SKLearn

# Settings are collected first so they can be read (and tuned) in one place.
estimator_settings = {
    'entry_point': 'scikit_learn_iris.py',  # training script run by the estimator
    'framework_version': "0.20.0",
    'hyperparameters': {'max_leaf_nodes': 30},
    'instance_type': 'ml.c4.xlarge',
    'role': role,
    'sagemaker_session': sagemaker_session,
}

# NOTE: this binds the name `sklearn` to the estimator object in the notebook
# namespace; later cells call sklearn.fit() / sklearn.deploy() on it.
sklearn = SKLearn(**estimator_settings)

In [5]:
# Train the estimator, passing the uploaded data as the 'train' channel.
training_channels = {'train': train_input}
sklearn.fit(training_channels)

2021-12-21 22:23:58 Starting - Starting the training job...
2021-12-21 22:24:00 Starting - Launching requested ML instancesProfilerReport-1640125438: InProgress
...
2021-12-21 22:24:50 Starting - Preparing the instances for training.........
2021-12-21 22:26:24 Downloading - Downloading input data...
2021-12-21 22:26:44 Training - Downloading the training image..[34m2021-12-21 22:27:03,545 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2021-12-21 22:27:03,549 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-12-21 22:27:03,567 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2021-12-21 22:27:04,043 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-12-21 22:27:04,057 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-12-21 22:27:04,075 sagemaker-training-toolkit IN

In [6]:
# Deploy the trained estimator; `predictor` is the object returned by
# deploy() and is what the later cells call predict()/delete_endpoint() on.
endpoint_settings = {
    'instance_type': 'ml.m4.xlarge',
    'initial_instance_count': 1,
}
predictor = sklearn.deploy(**endpoint_settings)

------!

In [7]:
# Model validation: build a small test set by sampling rows from the same
# CSV that was used for training.
import itertools
import pandas as pd

iris_frame = pd.read_csv('data/iris.csv', header=None)

# Row positions 40-49, 90-99 and 140-149: ten offsets applied to each of the
# three starting positions.
# (assumes the CSV keeps the rows grouped in three consecutive blocks of 50 -- TODO confirm)
class_starts = [0, 50, 100]
tail_offsets = list(range(40, 50))
indices = [
    start + offset
    for start, offset in itertools.product(class_starts, tail_offsets)
]

# The final position (149) is sliced off, so 29 rows are selected.
# NOTE(review): unclear whether dropping the last row is intentional.
test_data = iris_frame.iloc[indices[:-1]]
test_y = test_data.iloc[:, 0]   # column 0 holds the labels
test_x = test_data.iloc[:, 1:]  # remaining columns hold the features

In [8]:
# Labels returned by the deployed predictor for the sampled rows
predicted_labels = predictor.predict(test_x.values)
print(predicted_labels)

# Labels stored in the CSV for the same rows, for side-by-side comparison
expected_labels = test_y.values
print(expected_labels)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 2. 2. 2. 2.
 2. 2. 2. 2. 2.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 2. 2. 2. 2.
 2. 2. 2. 2. 2.]


In [9]:
# Delete the endpoint behind `predictor` (created by sklearn.deploy() above)
# now that the validation predictions have been collected.
predictor.delete_endpoint()

In [10]:
# Download the trained model artifact (model.tar.gz) from S3 into the
# current working directory.
import boto3
from urllib.parse import urlparse

# Derive the location from the estimator instead of hardcoding it: the bucket
# name embeds the account id/region and the key embeds the timestamped
# training-job name, so literal values go stale on every re-run and break in
# any other account. `sklearn` here is the fitted SKLearn estimator, whose
# model_data attribute is the s3://bucket/key URI of its output artifact.
model_location = urlparse(sklearn.model_data)
bucket = model_location.netloc
file_path = model_location.path.lstrip('/')

s3 = boto3.resource('s3')
s3.Bucket(bucket).download_file(file_path, 'model.tar.gz')

In [3]:
# Load the trained classifier from the downloaded model artifact.
import os
import tarfile

try:
    # joblib is a standalone package; sklearn.externals.joblib was deprecated
    # in scikit-learn 0.21 and removed in 0.23.
    import joblib
except ImportError:
    # Older scikit-learn installs that only ship the vendored copy.
    from sklearn.externals import joblib

# The download step only fetches model.tar.gz; unpack it here so that
# model.joblib actually exists. Skipped when the file was already extracted.
if not os.path.exists('model.joblib'):
    with tarfile.open('model.tar.gz') as model_archive:
        model_archive.extractall()

# NOTE: joblib.load unpickles the file, which can execute arbitrary code --
# only load artifacts that come from a trusted training job.
clf = joblib.load('model.joblib')
print(clf)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=30,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


In [10]:
from sklearn import tree

# Human-readable names used to label the exported tree.
iris_feature_names = ['sepal length', 'sepal width', 'petal length', 'petal width']
iris_class_names = ['setosa', 'versicolor', 'virginica']

# Write the loaded classifier to tree.dot in Graphviz format, with filled nodes.
tree.export_graphviz(
    clf,
    out_file="tree.dot",
    feature_names=iris_feature_names,
    class_names=iris_class_names,
    filled=True,
)
