# Training an Iris classifier using XGBoost

In this notebook, we will show how to build a simple classifier trained on the famous Iris data set using XGBoost.

## Install dependencies

In [None]:
!pip install scikit-learn==0.20.*
!pip install xgboost==0.90
!pip install pandas
!pip install nyoka
!pip install pypmml

## Load the data
The Iris dataset ships with scikit-learn in its `datasets` module.

In [None]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
import pandas as pd

# Fetch the bundled Iris data and decide how the label column is called.
iris = datasets.load_iris()
features = iris.feature_names
target = 'species'

# Single frame: one column per measurement plus the class label column.
iris_df = pd.DataFrame(iris.data, columns=features).assign(**{target: iris.target})

# Separate inputs from the label, then hold out 33% of the rows for testing.
# The fixed random_state keeps the split identical between runs.
X = iris_df[features]
y = iris_df[target]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=123456,
)

## Train the model
Build an XGBoost `XGBClassifier` model:

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

# Two-step pipeline: standardize the features, then fit a small (5 trees)
# gradient-boosted classifier.
# The seed is passed through `random_state`, the scikit-learn-style keyword
# of XGBClassifier; the older `seed` keyword is deprecated in the
# scikit-learn wrapper and only triggers a DeprecationWarning.
pipeline = Pipeline([
    ('scaling', StandardScaler()),
    ('xgb', XGBClassifier(n_estimators=5, random_state=123456)),
])

# Fit on the training split and report accuracy on the held-out split.
pipeline.fit(X_train, y_train)
print("Test data accuracy of the xgb classifier is {:.2f}".format(pipeline.score(X_test, y_test)))

## Convert the model to PMML
Now we can convert the model to PMML using nyoka:

In [None]:
import os

from nyoka import xgboost_to_pmml

# Destination of the exported PMML document.
pmml_path = "./models/xgb-iris.pmml"

# Make sure the target directory exists before exporting; on a fresh checkout
# ./models may be missing, and the exporter presumably does not create it
# (NOTE(review): confirm against nyoka's behaviour).
os.makedirs(os.path.dirname(pmml_path), exist_ok=True)

xgboost_to_pmml(pipeline, features, target, pmml_path)

## Validate the PMML
Validate that the predictions of the PMML model match those produced by the Python model.

In [None]:
from pypmml import Model
import numpy as np

# Read the exported PMML document back in through pypmml.
model = Model.fromFile("./models/xgb-iris.pmml")

# Score the held-out features with the PMML model.
result = model.predict(X_test)

# Preview the first five scored rows.
result.head()

In [None]:
# Class labels and per-class probabilities from the Python pipeline for the
# same held-out rows; the labels are displayed as the cell output.
predictions, probabilities = pipeline.predict(X_test), pipeline.predict_proba(X_test)
predictions

In [None]:
# Make predictions using the Python model
predictions = pipeline.predict(X_test)
probabilities = pipeline.predict_proba(X_test)

# Compare results
# Predicted class labels must agree between PMML and Python.
np.testing.assert_almost_equal(result['predicted_species'], predictions)

# Per-class probabilities must agree to 3 decimal places.
# The third argument of assert_array_almost_equal is `decimal` (a number of
# decimal places), not an absolute tolerance: passing 0.001 there yields a
# threshold of about 1.5, which can never fail for probabilities in [0, 1].
np.testing.assert_array_almost_equal(
    result[['species_probability_0', 'species_probability_1', 'species_probability_2']],
    probabilities,
    decimal=3,
)
print("The results of PMML are right.")