In [None]:
# Install xgboost in the notebook instance from the conda-forge channel.
# `-y` auto-confirms the install prompt so the cell can run unattended.
# NOTE(review): no version is pinned, so the cells below run against whatever
# xgboost release conda resolves at install time.
!conda install -y -c conda-forge xgboost

In [None]:
# Adding imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import xgboost as xgb
import boto3

In [None]:
# Defining utility methods
# Reference: http://boto3.readthedocs.io/en/latest/guide/s3.html
# bucket: Name of bucket
# key: File name stored in S3


def write_to_s3(filename, bucket, key):
    """Upload a local file to S3.

    filename: path of the local file to upload
    bucket:   name of the destination S3 bucket
    key:      object key (file name) to store the data under in the bucket

    Returns whatever `Object.upload_fileobj` returns.
    """
    # Binary mode: the bytes are streamed to S3 unchanged.
    with open(filename, 'rb') as source:
        s3 = boto3.Session().resource('s3')
        target = s3.Bucket(bucket).Object(key)
        return target.upload_fileobj(source)


def download_from_s3(filename, bucket, key):
    """Download an S3 object into a local file.

    filename: path of the local file to write (created or overwritten)
    bucket:   name of the source S3 bucket
    key:      object key (file name) of the data stored in the bucket

    Returns whatever `Object.download_fileobj` returns.
    """
    # Binary mode: the object's bytes are written to disk unchanged.
    with open(filename, 'wb') as destination:
        s3 = boto3.Session().resource('s3')
        source = s3.Bucket(bucket).Object(key)
        return source.download_fileobj(destination)

In [None]:
# Fetch the Iris dataset from S3 into the notebook's working directory
download_from_s3(
    filename='iris_all.csv',
    bucket='bornshrewd-aws-sagemaker-demo',
    key='iris_all.csv',
)

# Load the downloaded CSV into a DataFrame
df = pd.read_csv('iris_all.csv')

# Preview the first two rows to see what the data looks like
df.head(n=2)


In [None]:
# Number of rows per distinct value in the `class` column
# (a quick check of how balanced the classes are).
df['class'].value_counts()

In [None]:
# Goal: predict `class` (the dependent variable) from sepal_length, sepal_width,
# petal_length and petal_width (the independent variables).
# `class` holds text labels, so it is converted to integer codes with
# sklearn's LabelEncoder before training.

# fit() returns the encoder itself, so construction and fitting can be chained.
le = preprocessing.LabelEncoder().fit(
    ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
)

# Keep the original text column and add the integer codes next to it.
df['encoded_class'] = le.transform(df['class'])
df.head(n=2)


In [None]:
# Split the data into training, validation and test sets with train_test_split.

# Features: the first four columns of the frame.
X = df.iloc[:, :4]

# Target: selected by name instead of by position (`df.iloc[:, 5]`).
# The whole-dataset prediction cell near the end of this notebook rebinds `df`
# to a frame whose column 5 is `predicted_class`; a positional lookup would then
# silently train on predictions if this cell were re-run out of order, whereas
# the name lookup fails loudly with a KeyError.
y = df['encoded_class']

# Hold out 20% of all rows as the test set ...
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# ... then hold out 20% of the remaining rows as the validation set
# (roughly 64% train / 16% validation / 20% test overall).
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

In [None]:
# Build the XGBoost classifier.
# Parameter reference: https://xgboost.readthedocs.io/en/stable/parameter.html
classifier = xgb.XGBClassifier(
    max_depth=5,
    objective="multi:softmax",
    num_class=3,  # one per label passed to the LabelEncoder
)
classifier

In [None]:
# Train the classifier and score it on the training and validation splits
# after every boosting round (results are read back via evals_result()).
#
# The metrics are configured on the estimator with set_params() rather than
# passed as fit(eval_metric=...): that fit() argument was deprecated in
# XGBoost 1.6 and later removed, and the install cell does not pin a version,
# so the old call raises TypeError on current releases.
classifier.set_params(eval_metric=['merror', 'mlogloss'])
classifier.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)])






In [None]:
# Per-round metric history recorded by fit(): 'validation_0' is the first
# eval_set entry (the training split), 'validation_1' the second (the
# validation split).
eval_result = classifier.evals_result()
training_rounds = range(len(eval_result['validation_0']['mlogloss']))

# Plot log loss against boosting round for both splits on one set of axes
fig, ax = plt.subplots()
ax.scatter(x=training_rounds, y=eval_result['validation_0']['mlogloss'], label='Training Error')
ax.scatter(x=training_rounds, y=eval_result['validation_1']['mlogloss'], label='Validation Error')
ax.grid(True)
ax.set_xlabel('Iteration')
ax.set_ylabel('LogLoss')
ax.set_title('Training Vs Validation Error')
ax.legend()

In [None]:
# Plot the feature importances of the fitted classifier.
xgb.plot_importance(classifier)

In [None]:
# Predict the encoded class for every row of the held-out test set
# (X_test was not passed to fit(), neither as training nor as evaluation data).
y_pred = classifier.predict(X_test)

In [None]:
# Map the integer codes back to the original text labels for readable reporting:
# `pred` holds the predicted labels, `test` the actual labels of the test set.
pred = le.inverse_transform(y_pred)
test = le.inverse_transform(y_test)

In [None]:
# Confusion matrix for the test set, computed on the integer-encoded labels.
# y_test is passed as the true labels and y_pred as the predictions, so rows
# correspond to actual classes and columns to predicted classes.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

In [None]:
# Same comparison with readable labels: actual test labels (rows) against
# predicted labels (columns).
pd.crosstab(test, pred)

In [None]:
# Run prediction for the entire dataset.
# NOTE: this includes the rows the model was trained and validated on, so the
# result is not an estimate of performance on unseen data.
# The CSV is re-read, which rebinds `df` to a fresh frame without `encoded_class`.
df = pd.read_csv('iris_all.csv')
X = df.iloc[:,:-1] # All columns except the last one; assumes `class` is the last column of the CSV
prediction = classifier.predict(X)
df['predicted_class'] = le.inverse_transform(prediction)

# Printing the confusion matrix: actual class (rows) vs predicted class (columns)
pd.crosstab(df['class'], df['predicted_class'])

In [None]:
# Interpretation of the whole-dataset confusion matrix, kept as a bare string
# so the cell displays it as output.
# NOTE(review): the counts below are presumably from the author's run - confirm
# they still match the crosstab output after re-training.
"""
We can see 
- All Iris-setosa are predicted correctly
- 49 Iris-versicolor are predicted correctly and one is misclassified as Iris-virginica
- All Iris-virginica are predicted correctly
"""

In [None]:
# Printing the classification report: compares the actual labels in df['class']
# with the model's labels in df['predicted_class'].
# print() is used because classification_report returns a pre-formatted,
# multi-line string.
import sklearn.metrics as metrics
print(metrics.classification_report(df['class'], df['predicted_class']))