Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
SPDX-License-Identifier: Apache-2.0

( Run <code>jupyter notebook</code> under the project directory )

# XGBoost for Iris Dataset

We use this example to demonstrate how to use ppxgboost to encrypt an XGBoost model for multi-class
 prediction. We use the iris data shipped with scikit-learn directly, but one
 can also download the original dataset from https://archive.ics.uci.edu/ml/datasets/iris.


In [1]:
import sys
sys.path.append('../third-party')

import pandas as pd
import numpy as np
import xgboost as xgb
from secrets import token_bytes

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

from ppxgboost import BoosterParser as boostparser
from ppxgboost import PPBooster as ppbooster
from ppxgboost.PPBooster import MetaData
from ppxgboost.PPKey import PPBoostKey
from ope.pyope.ope import OPE
from ppxgboost import PaillierAPI as paillier

In [2]:
# Load the bundled iris dataset and keep the class labels as-is.
iris = load_iris()
y = iris.target

# Name the columns up front using xgboost's default feature names (f0, f1, ...);
# the iris data has 4 feature columns.
feature_names = [f"f{col_idx}" for col_idx in range(4)]
X = pd.DataFrame(iris.data, columns=feature_names)



In [3]:
# Preview the first rows of the feature frame.
X.head()

Unnamed: 0,f0,f1,f2,f3
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [4]:
# Display the label vector.
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [5]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split
# (and every cell that depends on it) reproducible under Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Build the query frame as an explicit copy of the test features: this frame is
# handed to the encryption step later, while X_test has to stay in plaintext
# for the reference prediction.
test_input_vector = pd.DataFrame(X_test, columns=feature_names, copy=True)

In [6]:
# Train a small multi-class model on the training split only, so that the
# held-out rows in X_test are not seen during fitting.
# Total number of trees = total_estimaters * number of classes
# (e.g. the iris dataset has 3 classes).

# A small estimator count is enough for testing purposes.
total_estimaters = 6
model = xgb.XGBClassifier(n_estimators=total_estimaters, objective='multi:softmax')
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=6, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [7]:
# Number of target classes (3 for the iris dataset).
# The class labels themselves are available as an array via model.classes_.
num_classes = model.n_classes_

# Parse the trained booster with the ppxgboost booster parser.
# The min/max metadata below is a placeholder: this example only checks that
# the encrypted model reproduces the plaintext model's predictions.
min_max = {'min': 0, 'max': 100}
meta_min_max = MetaData(min_max)
# NOTE(review): min_max is rebound here to the value returned by model_to_trees,
# whereas meta_min_max was built from the dict before this call -- confirm that
# MetaData is meant to be constructed from the placeholder bounds.
p_trees, features, min_max = boostparser.model_to_trees(model.get_booster(), min_max)


Encryption Preparation for XGBoost Model
1). Set up some metadata information for the dataset.
2). Set up the encryption materials
3). Encrypt the model
4). Encrypt the query
5). Perform the prediction 
6). Decrypt the prediction

In [8]:
# ----------------------------------------------------------------------------
# The following computes the scores based on the OPE-processed decision trees.
# ----------------------------------------------------------------------------
# Set up the encryption materials.
# token_bytes draws its randomness from os.urandom().

# Two independent 16-byte random keys: one PRF key and one key for the OPE encrypter.
prf_key = token_bytes(16)
OPE_key = token_bytes(16)
encrypter = OPE(OPE_key)
# Paillier key pair; the private key is needed again for the client-side decryption.
public_key, private_key = paillier.he_key_gen()
# Bundle the public key, the PRF key and the OPE encrypter for the model-encryption call.
pp_boostKey = PPBoostKey(public_key, prf_key, encrypter)

# 1. Process the parsed trees into their encrypted form (enc_trees).
enc_trees = ppbooster.enc_xgboost_model(pp_boostKey, p_trees, meta_min_max)


In [9]:
# 2. Encrypt the input vector for prediction (using the PRF key and the OPE encrypter),
#    based on the feature set returned by the booster parser.
# NOTE(review): the return value is not captured and test_input_vector is used as the
# encrypted query afterwards, so this call presumably modifies the frame in place -- confirm.
ppbooster.enc_input_vector(prf_key, encrypter, features, test_input_vector, meta_min_max)

Note: the server-side prediction is done differently from the binary classification (`binary:logistic`) case, because
the server needs to perform the softmax aggregation.

In [10]:
# 3. OPE evaluation: run the multi-class prediction over the encrypted trees,
#    using the OPE-encrypted values in the tree nodes and the encrypted test input.
enc_predictions = ppbooster.predict_multiclass(enc_trees, num_classes, test_input_vector)

In [11]:
# 4. Client-side decryption of the encrypted predictions with the private key.
result = ppbooster.client_decrypt_prediction_multiclass(private_key, enc_predictions)

In [12]:
# Reference predictions from the plaintext model on the plaintext test features.
real_y = model.predict(X_test)

# The decrypted predictions must match the plaintext ones exactly.
# np.testing reports the mismatching entries on failure instead of raising a
# bare AssertionError with no context.
np.testing.assert_array_equal(
    result,
    real_y,
    err_msg="decrypted predictions differ from the plaintext model's predictions",
)
print("success!")


success!
