## Train a model with Iris data using the XGBoost algorithm
###  The model is trained with XGBoost installed in the notebook instance
###  In later examples, we will train using SageMaker's XGBoost algorithm

In [None]:
# Install xgboost in notebook instance.
#### Command to install xgboost
!pip install xgboost

In [None]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools
import xgboost as xgb

from sklearn import preprocessing
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Input artifacts, resolved relative to the notebook's working directory:
#   - comma-separated list of column names (the CSV files have no header row)
#   - training split
#   - validation split
column_list_file, train_file, validation_file = (
    'iris_train_column_list.txt',
    'iris_train.csv',
    'iris_validation.csv',
)

In [None]:
# Read the comma-separated column names used as headers for the CSV files.
# Each name is stripped so that a trailing newline (or padding around the
# commas) in the file does not end up inside a column name.
with open(column_list_file, 'r') as f:
    columns = [name.strip() for name in f.read().split(',')]

In [None]:
columns

In [None]:
# Class names and the integer labels used for them throughout the notebook.
# NOTE(review): `labels` is expected to line up with the encoder's integer
# codes for `classes` in the listed order - confirm against le.transform(classes).
classes = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
labels = list(range(len(classes)))

# Encoder used to convert between class names and integer labels
le = preprocessing.LabelEncoder()
le.fit(classes)

In [None]:
# The CSV files carry no header row, so the column names read from the
# column list file are supplied explicitly for both splits.
df_train, df_validation = (
    pd.read_csv(csv_path, names=columns)
    for csv_path in (train_file, validation_file)
)

In [None]:
df_train.head()

In [None]:
df_validation.head()

In [None]:
# Column 0 is the target; every remaining column is a feature.
# Series.to_numpy() replaces Series.ravel(), which is deprecated in recent
# pandas releases; both yield a 1-D NumPy array of the target values.
X_train = df_train.iloc[:, 1:]              # Features: 1st column onwards
y_train = df_train.iloc[:, 0].to_numpy()    # Target: 0th column

X_validation = df_validation.iloc[:, 1:]
y_validation = df_validation.iloc[:, 0].to_numpy()

In [None]:
# Configure the classifier.
# XGBoost training parameter reference:
#   https://xgboost.readthedocs.io/en/latest/parameter.html
classifier_params = {
    "objective": "multi:softmax",
    "num_class": 3,        # one per entry in `classes`
    "n_estimators": 100,
}
classifier = xgb.XGBClassifier(**classifier_params)

In [None]:
classifier

In [None]:
# Train while tracking multi-class log loss on both the training and the
# validation set after every boosting round.
# The metric is configured on the estimator instead of being passed to fit():
# eval_metric as a fit() argument is deprecated since XGBoost 1.6 and is
# rejected by current releases.
classifier.set_params(eval_metric=['mlogloss'])
classifier.fit(X_train,
               y_train,
               eval_set=[(X_train, y_train), (X_validation, y_validation)])

In [None]:
# Per-round metrics recorded during the fit above. 'validation_0' is the
# first entry of eval_set (the training data).
eval_result_before_early_stop = classifier.evals_result()
train_loss_before_early_stop = eval_result_before_early_stop['validation_0']['mlogloss']

# One x-axis position per recorded boosting round
training_rounds = range(len(train_loss_before_early_stop))
print(f"training_rounds: {training_rounds}")

In [None]:
# Training vs. validation log loss per boosting round (no early stopping).
# eval_set order was (train, validation) -> 'validation_0', 'validation_1'.
loss_series = (
    ('validation_0', 'Training Error'),
    ('validation_1', 'Validation Error'),
)
for eval_name, legend_label in loss_series:
    plt.scatter(x=training_rounds,
                y=eval_result_before_early_stop[eval_name]['mlogloss'],
                label=legend_label)

plt.title('Training Vs Validation Error (Before early stop)')
plt.xlabel('Iteration')
plt.ylabel('LogLoss')
plt.grid(True)
plt.legend()
plt.show()

In [None]:
# Retrain with early stopping.
# eval_metric and early_stopping_rounds are configured on the estimator
# instead of being passed to fit(): as fit() arguments they are deprecated
# since XGBoost 1.6 and are rejected by current releases.
classifier.set_params(eval_metric=['mlogloss'],
                      early_stopping_rounds=10)
classifier.fit(X_train,
               y_train,
               eval_set=[(X_train, y_train), (X_validation, y_validation)])

# early_stopping_rounds - needs to be passed in as a hyperparameter in SageMaker XGBoost implementation
# "The model trains until the validation score stops improving.
# Validation error needs to decrease at least every early_stopping_rounds to continue training.
# Amazon SageMaker hosting uses the best model for inference."

In [None]:
eval_result = classifier.evals_result()

In [None]:
training_rounds = range(len(eval_result['validation_0']['mlogloss']))

In [None]:
print(training_rounds)

In [None]:
# Training vs. validation log loss per boosting round for the early-stopped fit.
# eval_set order was (train, validation) -> 'validation_0', 'validation_1'.
loss_series = (
    ('validation_0', 'Training Error'),
    ('validation_1', 'Validation Error'),
)
for eval_name, legend_label in loss_series:
    plt.scatter(x=training_rounds,
                y=eval_result[eval_name]['mlogloss'],
                label=legend_label)

plt.title('Training Vs Validation Error')
plt.xlabel('Iteration')
plt.ylabel('LogLoss')
plt.grid(True)
plt.legend()
plt.show()

In [None]:
xgb.plot_importance(classifier)
plt.show()

In [None]:
df = pd.read_csv(validation_file,names=columns)

In [None]:
df.head()

In [None]:
# Feature columns only (everything after the target in column 0)
X_test = df.iloc[:, 1:]

# Preview the first five feature rows
print(X_test.head(5))

In [None]:
result = classifier.predict(X_test)

In [None]:
result[:5]

In [None]:
# Store the model's predictions (as returned by classifier.predict) next to
# the validation rows. To store class names instead of the encoded values,
# assign le.inverse_transform(result).
df['predicted_class'] = result

In [None]:
df.head()

In [None]:
# Actual vs. predicted class for every validation sample
plt.figure()

series_to_plot = (
    ('encoded_class', 'Actual', {}),
    ('predicted_class', 'Predicted', {'marker': '^'}),
)
for column_name, legend_label, scatter_kwargs in series_to_plot:
    plt.scatter(df.index, df[column_name], label=legend_label, **scatter_kwargs)

plt.xlabel('Sample')
plt.ylabel('Class')
plt.yticks([0,1,2])
plt.legend(loc=4)
plt.show()

In [None]:
# Keep an independent snapshot of the frame before adding helper columns
# (DataFrame.copy() is a deep copy by default).
df_orig = df.copy()

# Decode the integer labels back to class names for both the predictions
# and the actual values.
encoded_sources = {
    'pred_cls_decoded': result,
    'decoded_cls': df['encoded_class'],
}
for column_name, encoded_values in encoded_sources.items():
    df[column_name] = le.inverse_transform(encoded_values)

In [None]:
#DWB# Seeing result of doing what was commented
df.head()

In [None]:
print(df.head())

In [None]:
# Back to integer labels, as needed for the confusion matrix:
#   pred_cls_enc - the model's predictions (same values as `result`)
#   enc_cls_chk  - the class names in 'decoded_cls' transformed back to
#                  integers; a round-trip check of the label encoder.
# Requires the 'decoded_cls' column added in the previous decoding cell.
df['pred_cls_enc'] = result
df['enc_cls_chk'] = le.transform(df['decoded_cls'])

In [None]:
#DWB#  Checking it worked. 
df.head()

In [None]:
print(df.head())

In [None]:
# Remove the temporary decode / re-encode columns again so the frame
# matches its earlier layout before the next section.
helper_columns = ['pred_cls_decoded', 'decoded_cls',
                  'pred_cls_enc', 'enc_cls_chk']
df = df.drop(columns=helper_columns)

In [None]:
# Checking result
df.head()

In [None]:
# Quick, non-thorough check that we're back
df_orig.head()

<h2>Confusion Matrix</h2>
A confusion matrix is a table that summarizes the performance of a classification model.<br><br>

In [None]:
# Reference: 
# https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : 2-D array-like
        Confusion matrix. Rows are drawn along the 'True label' axis and
        columns along the 'Predicted label' axis. When `normalize` is False
        the cells are formatted as integers ('d'), so integer counts are
        expected.
    classes : sequence
        Tick labels used for both axes; one entry per row/column of `cm`.
    normalize : bool, default False
        When True, every row is divided by its row sum and cells are shown
        with two decimals. A row whose sum is zero (a class with no true
        samples) is shown as zeros instead of NaN.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the heat map.

    Returns
    -------
    None. The plot is drawn with pyplot on the current figure; creating the
    figure and calling plt.show() is left to the caller.
    """
    cm = np.asarray(cm)

    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Divide only where the row total is non-zero; the remaining cells
        # keep the 0.0 they were initialised with. This avoids NaN cells and
        # a NaN colour threshold below.
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells above half of the maximum value get white text, the rest black
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

In [None]:
# Confusion matrix of actual vs. predicted encoded classes on the validation
# data, with rows/columns ordered as given by `labels`.
actual_classes = df['encoded_class']
predicted_classes = df['predicted_class']
cnf_matrix = confusion_matrix(actual_classes, predicted_classes, labels=labels)

In [None]:
cnf_matrix

In [None]:
# Confusion matrix with raw counts per (true, predicted) class pair
plt.figure()
plot_confusion_matrix(cnf_matrix,
                      classes,
                      title='Confusion matrix - Count')

In [None]:
# Plot the row-normalized confusion matrix: each row is divided by its
# row sum, so the title must not say "Count".
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=classes,
                      title='Confusion matrix - Normalized', normalize=True)

In [None]:
# Text report of the classification metrics for the validation predictions,
# with one row per entry in `classes`.
report_text = classification_report(
    df['encoded_class'],
    df['predicted_class'],
    labels=labels,
    target_names=classes,
)
print(report_text)