In [1]:
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0

( Run <code>jupyter notebook</code> under the project directory )

In [2]:

import sys
sys.path.append('../third-party')
# sys.path.append('../')

from ppxgboost import BoosterParser as boostparser
from ppxgboost import PPBooster as ppbooster
from ppxgboost import PaillierAPI as paillier
from ppxgboost.PPBooster import MetaData
from ppxgboost.PPKey import PPBoostKey
import sys
import random
import time
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost import XGBClassifier
from secrets import token_bytes
from ope.pyope.ope import OPE

# XGBoost for Titanic Dataset

(We use this example to demonstrate how to use ppxgboost to encrypt an XGBoost model and query it.)

Please go to https://www.kaggle.com/c/titanic/data and download the dataset.
The following example assumes the dataset files (`train.csv` and `test.csv`) have been downloaded into the example directory.


### Data Preparation and Training an XGBoost ML Model

In [3]:
# Load the Kaggle Titanic CSV files; both are expected in the current
# (example) directory.
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

# Data exploration is skipped here. Only this subset of columns is used
# as model features, for both the training and the testing frames.
feature_columns = ['Pclass', 'Age', 'Fare', 'SibSp', 'Parch']

# Training features and label.
X_train = train[feature_columns]
y_train = train[['Survived']]

# Testing features (these rows are queried against the model later on).
X_test = test[feature_columns]

X_train.head()

Unnamed: 0,Pclass,Age,Fare,SibSp,Parch
0,3,22.0,7.25,1,0
1,1,38.0,71.2833,1,0
2,3,26.0,7.925,0,0
3,1,35.0,53.1,1,0
4,3,35.0,8.05,0,0


In [4]:
# Preview the first rows of the training label column ('Survived').
y_train.head()

Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0


In [5]:
# Preview the first rows of the test features while they are still in plaintext.
X_test.head()

Unnamed: 0,Pclass,Age,Fare,SibSp,Parch
0,3,34.5,7.8292,0,0
1,3,47.0,7.0,1,0
2,2,62.0,9.6875,0,0
3,3,27.0,8.6625,0,0
4,3,22.0,12.2875,1,1


In [6]:
# Train an xgboost booster on the training features and labels.
# Only the learning rate (eta) is set explicitly; everything else is left
# at the library defaults.
params = {'eta': 0.1}
dtrain = xgb.DMatrix(X_train, label=y_train)
model = xgb.train(params=params, dtrain=dtrain)

# Plaintext predictions on the test features; these serve as the reference
# that the decrypted privacy-preserving predictions are compared against.
dtest = xgb.DMatrix(X_test)
plaintext_predict = model.predict(dtest)

Dump the model

In [7]:
# Write a dump of the trained booster to 'tree.txt' in the working directory.
model.dump_model('tree.txt')

### Encryption Preparation for XGBoost Model
1. Set up some metadata information for the dataset.
2. Set up the encryption materials
3. Encrypt the model
4. Encrypt the query
5. Perform the prediction
6. Decrypt the prediction

In [8]:
# 1. Parse the trained model into ppxgboost's internal tree structure.
#    `training_dataset_parser` builds `min_max` from the query data (X_test);
#    `model_to_trees` then returns the parsed trees, the feature set, and a
#    `min_max` that replaces the one passed in.
#    NOTE(review): presumably `min_max` holds per-feature value ranges needed by
#    the order-preserving encryption -- confirm against ppxgboost.BoosterParser.
min_max = boostparser.training_dataset_parser(X_test)
enc_tree, feature_set, min_max = boostparser.model_to_trees(model, min_max)

# 2. Set up encryption materials: a random 16-byte PRF key, a key pair from the
#    Paillier API, and an OPE encrypter keyed with a separate random 16 bytes.
#    PPBoostKey bundles the public key, PRF key and OPE encrypter; the private
#    key is kept out of the bundle and is only used in the decryption step.
prf_key = token_bytes(16)
public_key, private_key = paillier.he_key_gen()
encrypter = OPE(token_bytes(16))
ppBoostKey = PPBoostKey(public_key, prf_key, encrypter)

# 3. process the tree into enc_tree
#    The return value is discarded and `enc_tree` is later passed to
#    predict_binary, so the trees are presumably encrypted in place -- TODO confirm.
ppbooster.enc_xgboost_model(ppBoostKey, enc_tree, MetaData(min_max))


# 4. Encrypts the input vector for prediction (using prf_key_hash and ope-encrypter) based on the feature set.
#    NOTE(review): the return value is discarded and the same `X_test` is passed to
#    predict_binary afterwards, so this appears to overwrite X_test in place. If so,
#    re-running this cell would operate on already-encrypted data -- restart the
#    kernel and run all cells instead; confirm against ppxgboost.PPBooster.
ppbooster.enc_input_vector(prf_key, encrypter, feature_set, X_test, MetaData(min_max))


In [9]:
# 5. privacy-preserving evaluation.
# Runs the processed trees (enc_tree) over the query vectors (X_test) and
# reports how long the evaluation took.
# The interval is measured with time.perf_counter(), a monotonic,
# high-resolution clock meant for timing; time.time() follows the system
# wall clock and can jump (or go backwards) if the clock is adjusted.
start = time.perf_counter()
values = ppbooster.predict_binary(enc_tree, X_test)
end = time.perf_counter()
print("Elapsed Time: ", end - start)


Elapsed Time:  0.7571470737457275


In [10]:
# 6. decryption
# Decrypt every returned value with the private key and round it to
# 7 decimal places before collecting the results into an array.
decryptions = np.array(
    [round(paillier.decrypt(private_key, ciphertext), 7) for ciphertext in values]
)
assert len(plaintext_predict) == len(decryptions)

# The privacy-preserving predictions may differ slightly from the plaintext
# ones because of precision, so each pair is compared with a small tolerance
# rather than for exact equality.
assert all(
    abs(expected - actual) < 0.000001
    for expected, actual in zip(plaintext_predict, decryptions)
)
