### Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
### SPDX-License-Identifier: Apache-2.0

( Run <code>jupyter notebook</code> under the project directory )

In [None]:
from ppxgboost import BoosterParser as boostparser
from ppxgboost import PPBooster as ppbooster
from ppxgboost import PaillierAPI as paillier
from ppxgboost.PPBooster import MetaData
from ppxgboost.PPKey import PPBoostKey
import random
import time
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost import XGBClassifier
from secrets import token_bytes
from pyope.ope import OPE
from sklearn.model_selection import train_test_split

# XGBoost for Dataset

This example demonstrates how to use ppxgboost to encrypt an xgboost model and query it.

The sample dataset used in this lab is a sampled version of the "Diabetes 130-US hospitals for years 1999-2008 Data Set" (Beata Strack, Jonathan P. DeShazo, Chris Gennings, Juan L. Olmo, Sebastian Ventura, Krzysztof J. Cios, and John N. Clore, “Impact of HbA1c Measurement on Hospital Readmission Rates: Analysis of 70,000 Clinical Database Patient Records,” BioMed Research International, vol. 2014, Article ID 781670, 11 pages, 2014).


### Data Preparation and Train an XGBoost ML model

In [None]:
# In the following example, the dataset has been modified to match the input
# requirements by SageMaker Data Wrangler.
data = pd.read_csv('../data/readmitted.csv')

# Hold out a very small test set (0.2% of the rows). A fixed random_state makes
# the shuffle -- and therefore the rows that end up in the test set -- the same
# on every "Restart Kernel -> Run All".
SPLIT_SEED = 42
train, test = train_test_split(
    data, train_size=0.998, test_size=0.002, random_state=SPLIT_SEED
)

# Feature columns used for both training and prediction.
AttributeLabels = [
    'race', 'gender', 'age', 'time_in_hospital', 'num_lab_procedures',
    'num_procedures', 'num_medications', 'number_outpatient',
    'number_emergency', 'number_inpatient', 'number_diagnoses',
    'max_glu_serum', 'a1c_result', 'change', 'diabetes_med',
]

# Training dataset: feature columns and the 'readmitted' label column.
X_train = train[AttributeLabels]
Y_train = train[['readmitted']]

# Testing dataset: same columns, taken from the held-out rows.
X_test = test[AttributeLabels]
Y_test = test[['readmitted']]

# Preview the training features (last expression is rendered by the notebook).
X_train.head()

In [None]:
# Preview the first five rows of the training labels.
Y_train.head(n=5)

In [None]:
# Train an xgboost model on the training split.
dtrain = xgb.DMatrix(X_train, label=Y_train)
num_class = 3  # passed to xgboost as 'num_class' and reused for the encrypted prediction
params = {'eta': 0.1, 'objective': 'multi:softmax', 'num_class': num_class}
model = xgb.train(params=params, dtrain=dtrain)

# Predict on the plaintext test set and time it.
# time.perf_counter() is used instead of time.time(): it is a monotonic,
# high-resolution clock meant for measuring intervals, whereas the wall clock
# can jump if the system time is adjusted mid-measurement.
start = time.perf_counter()
plaintext_predict = model.predict(xgb.DMatrix(X_test))
end = time.perf_counter()
print("XGBoost Prediction : Elapsed Time: ", end - start)

### Encryption Preparation for XGBoost Model

1. Set up the encryption materials
2. Process the tree into ope_enc_tree
3. Encrypt the input vector for prediction
4. Perform the prediction
5. Decrypt the prediction

In [None]:
# Booster Parser will parse the tree
#  (add fake metadata here as this testing only tests the model correctness)

# Work on an explicit copy of the test features. enc_input_vector below is
# called only for its side effect (its return value is discarded and
# test_input_vector is what gets passed to predict_multiclass), so it
# presumably encrypts the frame in place. pd.DataFrame(X_test) does not
# guarantee an independent copy, which could let the encryption leak back into
# X_test and make this cell behave differently when re-run.
test_input_vector = X_test.copy()
min_max = boostparser.training_dataset_parser(X_test)
# MetaData is built from the parsed min/max *before* model_to_trees rebinds
# the min_max name on the next line.
meta_min_max = MetaData(min_max)
p_trees, features, min_max = boostparser.model_to_trees(model, min_max)

# 1. Set up encryption materials.
prf_key = token_bytes(16)
public_key, private_key = paillier.he_key_gen()
encrypter = OPE(token_bytes(16))
ppBoostKey = PPBoostKey(public_key, prf_key, encrypter)

# 2. Process the tree into ope_enc_tree.
enc_trees = ppbooster.enc_xgboost_model(ppBoostKey, p_trees, meta_min_max)

# 3. Encrypt the input vector for prediction (using prf_key_hash and ope-encrypter) based on the feature set.
ppbooster.enc_input_vector(prf_key, encrypter, features, test_input_vector, meta_min_max)

# 4. OPE evaluation based on OPE encrypted values in the tree nodes.
# perf_counter is a monotonic clock intended for measuring elapsed intervals.
start = time.perf_counter()
enc_predictions = ppbooster.predict_multiclass(enc_trees, num_class, test_input_vector)
end = time.perf_counter()
print("PPXGBoost Prediction : Elapsed Time: ", end - start)

# 5. Client decryption.
result = ppbooster.client_decrypt_prediction_multiclass(private_key, enc_predictions)

In [None]:

# Round every decrypted prediction to 7 decimal places and collect them in an
# array so they can be compared with the plaintext predictions.
result = np.array([round(value, 7) for value in result])
assert len(result) == len(plaintext_predict)

# Compare the two prediction vectors element by element. The ppxgboost output
# may differ slightly from the plaintext output because of precision, so a
# small tolerance is used rather than exact equality.
max_allowed_gap = 0.000001
for expected, actual in zip(plaintext_predict, result):
    assert abs(expected - actual) < max_allowed_gap
