In [None]:
# @title ###### Licensed to the Apache Software Foundation (ASF), Version 2.0 (the "License")

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License

# Apache Beam RunInference for XGBoost

<table align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/apache/beam/blob/master/examples/notebooks/beam-ml/run_inference_xgboost.ipynb"><img src="https://raw.githubusercontent.com/google/or-tools/main/tools/colab_32px.png" />Run in Google Colab</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/apache/beam/blob/master/examples/notebooks/beam-ml/run_inference_xgboost.ipynb"><img src="https://raw.githubusercontent.com/google/or-tools/main/tools/github_32px.png" />View source on GitHub</a>
  </td>
</table>


This notebook shows how to use the Apache Beam [RunInference](https://beam.apache.org/releases/pydoc/current/apache_beam.ml.inference.base.html#apache_beam.ml.inference.base.RunInference) transform with [XGBoost](https://xgboost.readthedocs.io/en/stable/).
The Apache Beam RunInference transform has implementations of the `ModelHandler` class prebuilt for XGBoost. For more information about using RunInference, see [Get started with AI/ML pipelines](https://beam.apache.org/documentation/ml/overview/) in the Apache Beam documentation.

You can choose the appropriate model handler based on your input data type:

- NumPy model handler
- Pandas DataFrame model handler
- DataTable model handler
- SciPy model handler

With RunInference, these model handlers manage batching, vectorization, and prediction optimization for your XGBoost pipeline or model.

This notebook demonstrates the following common RunInference patterns:

- Generate predictions.
- Postprocess results after running inference.
- Classify iris flowers.
- Make home price predictions with a regression model.

## Before you begin
Install dependencies for Apache Beam.

In [None]:
!pip install apache-beam[gcp]>=2.47.0

In [None]:
from typing import Any
from typing import Tuple
from typing import Union

import numpy as np
import xgboost
from sklearn.datasets import fetch_california_housing
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

import apache_beam as beam
from apache_beam.ml.inference import RunInference
from apache_beam.ml.inference.base import KeyedModelHandler
from apache_beam.ml.inference.base import PredictionResult
from apache_beam.ml.inference.xgboost_inference import XGBoostModelHandlerNumpy
from apache_beam.options.pipeline_options import PipelineOptions

In [None]:
# Passed as random_state to train_test_split so the train/test splits are
# the same on every run.
SEED = 999
# Paths of the JSON files that the trained models are saved to; the same paths
# are later given to the model handlers as model_state.
CLASSIFICATION_MODEL_STATE = '/tmp/classification_model.json'
REGRESSION_MODEL_STATE = '/tmp/regression_model.json'

## Load data from scikit-learn and train XGBoost models
In this example, you create two models, one to classify iris flowers and one to predict housing prices in California.

This section demonstrates the following steps:
1. Load the iris flower and California housing datasets from scikit-learn, and then create classification and regression models.
2. Train the classification and regression models.
3. Save each model to a JSON file using `model.save_model`. For more information, see [Introduction to Model IO](https://xgboost.readthedocs.io/en/stable/tutorials/saving_model.html) in the XGBoost documentation.

In [None]:
# Train the classification model.
iris_dataset = load_iris()
x_train_classification, x_test_classification, y_train_classification, y_test_classification = train_test_split(
    iris_dataset['data'], iris_dataset['target'], test_size=.2, random_state=SEED)
# The iris target has three classes (0, 1 and 2), so request a multiclass
# objective rather than a binary one.
booster = xgboost.XGBClassifier(
    n_estimators=2, max_depth=2, learning_rate=1, objective='multi:softprob')
booster.fit(x_train_classification, y_train_classification)
booster.save_model(CLASSIFICATION_MODEL_STATE)

# Train the regression model.
california_dataset = fetch_california_housing()
x_train_regression, x_test_regression, y_train_regression, y_test_regression = train_test_split(
    california_dataset['data'], california_dataset['target'], test_size=.2, random_state=SEED)
model = xgboost.XGBRegressor(
    n_estimators=1000,
    max_depth=8,
    eta=0.1,
    subsample=0.75,
    colsample_bytree=0.8)
model.fit(x_train_regression, y_train_regression)
model.save_model(REGRESSION_MODEL_STATE)


# Reshape the test data, because XGBoost expects a batch instead of a single element.
# For more information, see https://xgboost.readthedocs.io/en/stable/prediction.html
# Resulting layout is (number of batches, rows per batch, features per row):
# 5 batches of 6 iris rows, and 258 batches of 16 housing rows.
x_test_classification = x_test_classification.reshape(5, 6, 4)
x_test_regression = x_test_regression.reshape(258, 16, 8)

## Implement postprocessing helper functions
The following code demonstrates how to implement postprocessing helper functions for your models.

In [None]:
def translate_labels(
    inference_results: Union[PredictionResult, Tuple[Any, PredictionResult]]):
  """
    Maps output values (0, 1 or 2) of the XGBoost iris classification
    model to the names of the different iris flowers.

    Both output forms of RunInference are accepted: a bare PredictionResult,
    and the (key, PredictionResult) tuple produced when the model handler is
    wrapped in a KeyedModelHandler. For keyed input the key is passed through
    unchanged.

    Args:
        inference_results: A PredictionResult, or a (key, PredictionResult)
          tuple, whose inference field contains the outputs of the XGBoost
          Iris classification model.
    Returns:
        A value of the same form as the input in which the inference field
        holds flower names instead of class indices. The example and model_id
        fields are carried over as they were.
  """
  # Test for PredictionResult rather than for tuple: PredictionResult is a
  # named tuple in Beam, so an isinstance(..., tuple) check would also match
  # the unkeyed case.
  if not isinstance(inference_results, PredictionResult):
    key, prediction = inference_results
    return key, translate_labels(prediction)

  index_to_name = np.vectorize(
      ['Setosa', 'Versicolour', 'Virginica'].__getitem__)
  return PredictionResult(
      inference_results.example,
      index_to_name(inference_results.inference),
      inference_results.model_id)


class FlattenBatchPredictionResults(beam.DoFn):
  """
  Splits a PredictionResult containing a batch (list) of examples and
  predictions into one PredictionResult per example/prediction pair.

  Every flattened result is printed and also emitted, so the transform can be
  used either as the last step of a pipeline or in front of further steps.

  The input may be a bare PredictionResult or the (key, PredictionResult)
  tuple produced when the model handler is wrapped in a KeyedModelHandler.
  For keyed input, each flattened result is paired with the key of the batch
  it came from.
  """
  def process(
      self,
      batch_prediction_result: Union[PredictionResult,
                                     Tuple[Any, PredictionResult]]):
    # Test for PredictionResult rather than for tuple: PredictionResult is a
    # named tuple in Beam, so an isinstance(..., tuple) check would also match
    # the unkeyed case.
    is_keyed = not isinstance(batch_prediction_result, PredictionResult)
    if is_keyed:
      key, batch = batch_prediction_result
    else:
      key, batch = None, batch_prediction_result

    for example, inference in zip(batch.example, batch.inference):
      single_result = PredictionResult(example, inference, batch.model_id)
      output = (key, single_result) if is_keyed else single_result
      print(output)
      yield output


## Create an XGBoost RunInference pipeline
This section demonstrates how to do the following tasks:
1. Define an XGBoost model handler that accepts a `numpy.ndarray` object as input.
2. Load the data from the datasets.
3. Use the XGBoost trained models and the XGBoost RunInference transform on unkeyed data.
4. Use postprocessing to translate the XGBoost numeric outputs into flower names, and then flatten the batched outputs.

In [None]:
# Model handler that loads the saved iris classifier and takes NumPy arrays
# as input.
xgboost_classification_model_handler = XGBoostModelHandlerNumpy(
    model_state=CLASSIFICATION_MODEL_STATE,
    model_class=xgboost.XGBClassifier)

pipeline_options = PipelineOptions().from_dictionary({})

with beam.Pipeline(options=pipeline_options) as p:
  # Each element of x_test_classification is one batch of iris samples.
  iris_batches = p | "Load Data" >> beam.Create(x_test_classification)
  iris_predictions = iris_batches | "RunInferenceXGBoost" >> RunInference(
      model_handler=xgboost_classification_model_handler)
  # Replace the numeric class indices with flower names.
  named_predictions = (
      iris_predictions | "TranslateLabels" >> beam.Map(translate_labels))
  (
      named_predictions
      | "FlattenBatchPredictionResults" >> beam.ParDo(
          FlattenBatchPredictionResults()))

In [None]:
# Model handler that loads the saved housing-price regressor and takes NumPy
# arrays as input.
xgboost_regression_model_handler = XGBoostModelHandlerNumpy(
    model_state=REGRESSION_MODEL_STATE,
    model_class=xgboost.XGBRegressor)

pipeline_options = PipelineOptions().from_dictionary({})

with beam.Pipeline(options=pipeline_options) as p:
  # Each element of x_test_regression is one batch of housing samples.
  housing_batches = p | "Load Data" >> beam.Create(x_test_regression)
  housing_predictions = housing_batches | "RunInferenceXGBoost" >> RunInference(
      model_handler=xgboost_regression_model_handler)
  (
      housing_predictions
      | "FlattenBatchPredictionResults" >> beam.ParDo(
          FlattenBatchPredictionResults()))

## Use XGBoost with RunInference on keyed inputs
To retain metadata about the example, associate examples with a key before doing inference.
For example, you might want to retain the original URL of a preprocessed image or a non-preprocessed input.
To use RunInference to retain metadata, use a `KeyedModelHandler`.
This section demonstrates how to do the following tasks with a `KeyedModelHandler`:


1. To handle keyed data, wrap the `XGBoostModelHandlerNumpy` with a `KeyedModelHandler`.
2. Load the data from the datasets.
3. Use the XGBoost trained models and the XGBoost RunInference transform on the keyed data.
4. Postprocess the results to flatten the batched outputs.

In [None]:
# Pair every batch with a string key so that the key can be carried through
# inference. Note that this rebinds both names to lists of (key, batch)
# tuples, so run this cell only once after the training cell: running it again
# would wrap the already keyed elements in a second key.
x_test_classification = [
    (f'batch {i}', sample) for i, sample in enumerate(x_test_classification)]
x_test_regression = [
    (f'batch {i}', sample) for i, sample in enumerate(x_test_regression)]

In [None]:
# Wrap the classification handler so that RunInference accepts (key, example)
# tuples and emits (key, PredictionResult) tuples.
keyed_xgboost_classification_model_handler = KeyedModelHandler(
    xgboost_classification_model_handler)

pipeline_options = PipelineOptions().from_dictionary({})

with beam.Pipeline(options=pipeline_options) as p:
  (
      p
      | "Load Data" >> beam.Create(x_test_classification)
      | "RunInferenceXGBoost" >>
      RunInference(model_handler=keyed_xgboost_classification_model_handler)
      | "TranslateLabels" >> beam.Map(translate_labels)
      | "FlattenBatchPredictionResults" >> beam.ParDo(
          FlattenBatchPredictionResults()))

In [None]:
# Wrap the regression handler in a KeyedModelHandler so that it can be used
# on keyed input.
keyed_xgboost_regression_model_handler = KeyedModelHandler(
    xgboost_regression_model_handler)

pipeline_options = PipelineOptions().from_dictionary({})

with beam.Pipeline(options=pipeline_options) as p:
  keyed_housing_batches = p | "Load Data" >> beam.Create(x_test_regression)
  keyed_housing_predictions = (
      keyed_housing_batches
      | "RunInferenceXGBoost" >> RunInference(
          model_handler=keyed_xgboost_regression_model_handler))
  (
      keyed_housing_predictions
      | "FlattenBatchPredictionResults" >> beam.ParDo(
          FlattenBatchPredictionResults()))