# Training/Optimizing a basic model with a Built-in Algorithm

This exercise is about manually executing all the steps of the Machine Learning development pipeline. Here we'll use a public dataset called iris. The dataset and the model aren't the focus of this exercise. The idea is to see how SageMaker can accelerate your work and help you avoid wasting time on tasks that aren't related to your business. So, we'll do the following:

 - Train/deploy/test a multiclass model using XGBoost
 - Optimize the model
 - Run batch predictions

# PART 1 - Train, deploy and test

## Let's start by importing the dataset and visualizing it

In [None]:
%matplotlib inline

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import datasets
# Apply seaborn's styling to the matplotlib plots drawn in this notebook.
sns.set(color_codes=True)

# Load the iris dataset that ships with scikit-learn.
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Numeric class id in the first column, followed by the feature columns.
dataset = np.column_stack((iris.target, iris.data))


def _species_name(class_id):
    """Translate the numeric class id into the species name."""
    if class_id == 0:
        return 'setosa'
    if class_id == 1:
        return 'versicolor'
    return 'virginica'


column_names = ['iris_id'] + iris.feature_names
df = pd.DataFrame(data=dataset, columns=column_names)
df['species'] = df['iris_id'].map(_species_name)

df.head()

In [None]:
# Summary statistics of the DataFrame built above.
df.describe()

## Checking the class distribution

In [None]:
# Count the samples of each species and draw the counts as a bar chart.
species_counts = df.groupby(df['species'])['species'].count()
ax = species_counts.plot(kind='bar')

# Write the count above each bar, nudged slightly left of the bar's centre.
x_offset = -0.05
y_offset = 0
for bar in ax.patches:
    box = bar.get_bbox()
    label = "{}".format(int(box.y1 + box.y0))
    anchor = ((box.x0 + box.x1) / 2 + x_offset, box.y1 + y_offset)
    ax.annotate(label, anchor)

### Correlation Matrix

In [None]:
# Correlate the numeric columns only. The 'species' column holds strings, and
# recent pandas versions raise on non-numeric columns instead of skipping them.
corr = df.select_dtypes(include='number').corr()

# Annotated heatmap of the correlation matrix, labelled with the column names.
f, ax = plt.subplots(figsize=(15, 8))
sns.heatmap(corr, annot=True, fmt="f",
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values,
            ax=ax)

### Pairplots & histograms

In [None]:
# Pairwise plots of the features, coloured by species, with KDE curves on the diagonal.
# `height` replaces the `size` argument, which seaborn renamed in 0.9 and later removed.
sns.pairplot(df.drop(['iris_id'], axis=1), hue='species', height=2.5, diag_kind="kde")

### Now with linear regression

In [None]:
# Same pairwise plots, with a linear regression fit drawn for every species.
# `height` replaces the `size` argument, which seaborn renamed in 0.9 and later removed.
sns.pairplot(df.drop(['iris_id'], axis=1), kind="reg", hue='species', height=2.5, diag_kind="kde")

### Fit and plot a univariate or bivariate kernel density estimate.

In [None]:
# One bivariate KDE (petal width vs. petal length) per class on shared axes.
# Draw order is kept as setosa (0), virginica (2), versicolor (1).
# NOTE(review): the kdeplot arguments below follow the older seaborn API used by
# this notebook; newer seaborn releases may need different keywords -- confirm.
for class_id, cmap in ((0.0, "Reds"), (2.0, "Blues"), (1.0, "Greens")):
    tmp_df = df[(df.iris_id == class_id)]
    sns.kdeplot(tmp_df['petal width (cm)'], tmp_df['petal length (cm)'], bw='silverman', cmap=cmap, shade=False, shade_lowest=False)

# Label the axes with the plotted features: the x axis is petal width, not the species.
plt.xlabel('petal width (cm)')
plt.ylabel('petal length (cm)')

Ok. Petal length and petal width have the highest linear correlation with our label. Also, sepal width seems to be useless, considering the linear correlation with our label.

Since versicolor and virginica cannot be separated linearly, we need a more versatile algorithm to create a better classifier. In this case, we'll use XGBoost, a tree ensemble that can give us a good model for predicting the flower species.

## Ok, now let's split the dataset into training and test

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the samples for testing. `stratify=y` asks for the same class
# proportions in both splits, and the fixed `random_state` makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)

In [None]:
def _write_labeled_csv(path, features, labels):
    """Write one headerless CSV row per sample: the label first, then the features.

    Args:
        path: Destination file; it is overwritten if it already exists.
        features: Iterable of per-sample feature sequences.
        labels: Iterable of labels, aligned with ``features``.
    """
    # The `with` block flushes and closes the file itself, so no explicit
    # flush()/close() is needed. The handle is not called `csv`, to avoid
    # shadowing the standard-library module of that name.
    with open(path, 'w') as out_file:
        for row, label in zip(features, labels):
            out_file.write("%s,%s\n" % (label, ",".join(map(str, row))))


# Both splits share the layout: class id in the first column, no header row.
_write_labeled_csv('iris_train.csv', X_train, y_train)
_write_labeled_csv('iris_test.csv', X_test, y_test)

## Now it's time to train our model with the builtin algorithm XGBoost

In [None]:
import sagemaker
import boto3

from sagemaker import get_execution_role
from sklearn.model_selection import train_test_split

# Session object used for the SageMaker and S3 calls in this notebook,
# together with the bucket it uses by default.
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

# Key prefix that groups every object of this exercise inside the bucket.
prefix = 'mlops/iris'

# Execution role of this notebook; it is handed to the estimator later on.
role = get_execution_role()

In [None]:
# Upload both CSV files to S3 under "<prefix>/data" and keep the returned locations.
data_prefix = '%s/data' % prefix
input_train = sagemaker_session.upload_data(path='iris_train.csv', key_prefix=data_prefix)
input_test = sagemaker_session.upload_data(path='iris_test.csv', key_prefix=data_prefix)

In [None]:
# Wrap each uploaded S3 location as a CSV input for the training job.
train_data, test_data = [
    sagemaker.session.s3_input(s3_data=s3_uri, content_type="csv")
    for s3_uri in (input_train, input_test)
]

In [None]:
# Registry path of the built-in XGBoost image for each supported AWS region.
containers = {'us-west-2': '433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:latest',
              'us-east-1': '811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest',
              'us-east-2': '825641698319.dkr.ecr.us-east-2.amazonaws.com/xgboost:latest',
              'eu-west-1': '685385470294.dkr.ecr.eu-west-1.amazonaws.com/xgboost:latest'}

# Fail early with a readable message, instead of a bare KeyError, when the
# notebook runs in a region missing from the table above or with no region set.
region = boto3.Session().region_name
if region not in containers:
    raise ValueError(
        "No XGBoost container is listed for region %r; supported regions: %s"
        % (region, ", ".join(sorted(containers)))
    )

# Create the estimator
xgb = sagemaker.estimator.Estimator(containers[region],
                                    role,
                                    train_instance_count=1,
                                    train_instance_type='ml.m4.xlarge',
                                    output_path='s3://{}/{}/output'.format(bucket, prefix),
                                    sagemaker_session=sagemaker_session)
# Set the hyperparameters; num_class is derived from the distinct labels in y.
xgb.set_hyperparameters(eta=0.1,
                        max_depth=10,
                        gamma=4,
                        reg_lambda=10,
                        num_class=len(np.unique(y)),
                        alpha=10,
                        min_child_weight=6,
                        silent=0,
                        objective='multi:softmax',
                        num_round=30)

### Train the model

In [None]:
%%time
# takes around 3min 11s
# The training split feeds the 'train' channel; the test split is used for validation.
training_channels = {'train': train_data, 'validation': test_data}
xgb.fit(training_channels)

### Deploy the model and create an endpoint for it
The following action will:
 * get the assets from the job we just ran and then create an entry in the Models Catalog
 * create an endpoint configuration (metadata for our final endpoint)
 * create an endpoint, which is our model wrapped as a web service
 
After that we'll be able to call our deployed endpoint for doing predictions

In [None]:
%%time
# Deploy the trained model on a single instance and keep the returned predictor.
xgb_predictor = xgb.deploy(
    instance_type='ml.m4.xlarge',
    initial_instance_count=1,
)

In [None]:
sagemaker_client = boto3.client('sagemaker')
endpoint_name = xgb_predictor.endpoint

# Ask the endpoint which configuration it uses, rather than assuming the
# configuration has the same name as the endpoint.
endpoint_config_name = sagemaker_client.describe_endpoint(
    EndpointName=endpoint_name
)['EndpointConfigName']

# The model name is read from the first production variant of that configuration.
model_name = sagemaker_client.describe_endpoint_config(
    EndpointConfigName=endpoint_config_name
)['ProductionVariants'][0]['ModelName']

## Now, let's do a basic test with the deployed endpoint
In this test, we'll use a helper object called predictor. This object is always returned from a **Deploy** call. The predictor is just for testing purposes and we won't use it inside our real application.

In [None]:
from sagemaker.predictor import csv_serializer
from sklearn.metrics import f1_score

# Send samples as CSV; with no deserializer, predict() results are decoded by hand later.
xgb_predictor.serializer = csv_serializer
xgb_predictor.content_type = 'text/csv'
xgb_predictor.deserializer = None

In [None]:
# Query the endpoint one test sample at a time and parse each reply as a float.
predictions_test = []
for sample in X_test:
    raw_response = xgb_predictor.predict(sample)
    predictions_test.append(float(raw_response.decode('utf-8')))

# Micro-averaged F1 over the three class ids, printed as a percentage.
score = f1_score(y_test, predictions_test, labels=[0.0, 1.0, 2.0], average='micro')

print('F1 Score(micro): %.1f' % (score * 100.0))

## Then, let's test the API for our trained model
This is how your application will call the endpoint: it uses boto3 to get a SageMaker runtime client and then calls invoke_endpoint.

In [None]:
# SageMaker runtime client, used below to invoke the deployed endpoint directly.
sm = boto3.client('sagemaker-runtime')

In [None]:
from sagemaker.predictor import csv_serializer

# Serialize the first test sample once; the same payload is sent and printed.
payload = csv_serializer(X_test[0])

resp = sm.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType='text/csv',
    Body=payload,
)

# Read the response body and parse the UTF-8 text as the predicted class.
prediction = float(resp['Body'].read().decode('utf-8'))
print('Predicted class: %.1f for [%s]' % (prediction, payload))

# PART 2 - Model optimization with Hyperparameter Tuning

## Hyperparameter Tuning Jobs
#### A.K.A. Hyperparameter Optimization

## Let's tune our model before using it for our batch prediction
We know that the iris dataset is an easy challenge. We can achieve a better score with XGBoost. However, we don't want to waste time testing all the possible variations of the hyperparameters in order to optimize the training process.

Instead, we'll use SageMaker's tuning feature. For that, we'll use the same estimator, but let's create a Tuner and ask it to optimize the model for us.

In [None]:
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner

# Search space explored by the tuning job, one range per hyperparameter.
hyperparameter_ranges = dict(
    eta=ContinuousParameter(0, 1),
    min_child_weight=ContinuousParameter(1, 10),
    alpha=ContinuousParameter(0, 2),
    gamma=ContinuousParameter(0, 10),
    max_depth=IntegerParameter(1, 10),
)

# Metric the tuner tries to minimize.
objective_metric_name = 'validation:merror'

# Up to 20 training jobs in total, at most 4 of them running at the same time.
tuner = HyperparameterTuner(
    estimator=xgb,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    objective_type='Minimize',
    max_jobs=20,
    max_parallel_jobs=4,
)

tuner.fit({'train': train_data, 'validation': test_data})

In [None]:
# Wait for the tuning job started above before moving on.
tuner.wait()

In [None]:
# Re-attach to the tuning job by its name and deploy from the attached tuner.
job_name = tuner.latest_tuning_job.name
attached_tuner = HyperparameterTuner.attach(job_name)

xgb_predictor2 = attached_tuner.deploy(
    instance_type='ml.m4.xlarge',
    initial_instance_count=1,
)

In [None]:
# Remember the first endpoint's name before switching to the tuned model's endpoint.
first_endpoint_name = endpoint_name
endpoint_name = xgb_predictor2.endpoint

sagemaker_client = boto3.client('sagemaker')

# Ask the endpoint which configuration it uses, rather than assuming the
# configuration has the same name as the endpoint.
endpoint_config_name = sagemaker_client.describe_endpoint(
    EndpointName=endpoint_name
)['EndpointConfigName']

# The model name is read from the first production variant of that configuration.
model_name = sagemaker_client.describe_endpoint_config(
    EndpointConfigName=endpoint_config_name
)['ProductionVariants'][0]['ModelName']

## A simple test before we move on

In [None]:
from sagemaker.predictor import csv_serializer
from sklearn.metrics import f1_score

# Send samples as CSV; with no deserializer, predict() results are decoded by hand later.
xgb_predictor2.serializer = csv_serializer
xgb_predictor2.content_type = 'text/csv'
xgb_predictor2.deserializer = None

In [None]:
# Query the tuned model's endpoint one test sample at a time and parse each reply as a float.
predictions_test = []
for sample in X_test:
    raw_response = xgb_predictor2.predict(sample)
    predictions_test.append(float(raw_response.decode('utf-8')))

# Micro-averaged F1 over the three class ids, printed as a percentage.
score = f1_score(y_test, predictions_test, labels=[0.0, 1.0, 2.0], average='micro')

print('F1 Score(micro): %.1f' % (score * 100.0))

# PART 3 - Batch Prediction

## Batch transform job
If you have a file with the samples you want to predict, just upload that file to an S3 bucket and start a Batch Transform job. For this task, you don't need to deploy an endpoint. SageMaker will create all the resources needed to do this batch prediction, save the results into an S3 bucket and then destroy the resources automatically for you.

In [None]:
batch_dataset_filename = 'batch_dataset.csv'

# One headerless CSV row per sample of X, features only (no label column).
# The `with` block flushes and closes the file itself, so no explicit
# flush()/close() is needed. The handle is not called `csv`, to avoid
# shadowing the standard-library module of that name.
with open(batch_dataset_filename, 'w') as batch_file:
    batch_file.writelines(",".join(map(str, row)) + "\n" for row in X)

In [None]:
# Upload the batch file under "<prefix>/data" and keep the returned S3 location.
input_batch = sagemaker_session.upload_data(path=batch_dataset_filename, key_prefix='%s/data' % prefix)

In [None]:
import sagemaker

# S3 location that receives the output of the batch job.
batch_output_path = 's3://{}/{}/batch_output'.format(bucket, prefix)

# Transformer bound to the model selected earlier (model_name).
transformer = sagemaker.transformer.Transformer(
    model_name=model_name,
    instance_type='ml.c4.xlarge',
    instance_count=1,
    output_path=batch_output_path,
    base_transform_job_name='mlops-iris',
)

# Start the job on the uploaded CSV, splitting the input by line ...
transformer.transform(input_batch, split_type='Line', content_type='text/csv')
# ... and wait until it is completed.
transformer.wait()

In [None]:
import boto3

predictions_filename = 'iris_predictions.csv'

# Key of the result object: the batch output folder plus the input file name and '.out'.
predictions_key = '{}/batch_output/{}.out'.format(prefix, batch_dataset_filename)
s3 = boto3.client('s3')
s3.download_file(bucket, predictions_key, predictions_filename)

# Read the headerless predictions file into a single named column.
df2 = pd.read_csv(predictions_filename, sep=',', encoding='utf-8', header=None, names=['predicted_iris_id'])

# Copy of the original frame with the predictions attached as a new column.
df3 = df.assign(predicted_iris_id=df2['predicted_iris_id'])
df3.head()

In [None]:
from sklearn.metrics import f1_score
# Micro-averaged F1 of the batch predictions against the true class ids, as a percentage.
actual_ids = df3['iris_id']
predicted_ids = df3['predicted_iris_id']
score = f1_score(actual_ids, predicted_ids, labels=[0.0, 1.0, 2.0], average='micro')

print('F1 Score(micro): %.1f' % (score * 100.0))

In [None]:
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix
# Confusion matrix of true class ids against batch predictions.
cnf_matrix = confusion_matrix(df3['iris_id'], df3['predicted_iris_id'])

f, ax = plt.subplots(figsize=(15, 8))
# The builtin `bool` replaces the `np.bool` alias, which NumPy removed in 1.24.
# The all-False mask hides no cell. Counts are integers, so they are printed
# without decimals.
sns.heatmap(cnf_matrix, annot=True, fmt="d", mask=np.zeros_like(cnf_matrix, dtype=bool),
            cmap=sns.diverging_palette(220, 10, as_cmap=True),
            square=True, ax=ax)

## Cleaning up

In [None]:
# Delete both endpoints created in this notebook: first the original model's, then the tuned one's.
for predictor in (xgb_predictor, xgb_predictor2):
    predictor.delete_endpoint()

# The end