In [1]:
from numpy import mean
from numpy import std
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score

from concrete.ml.sklearn import LogisticRegression
from sklearn.linear_model import LogisticRegression as skLR

from concrete.ml.sklearn.rf import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier as skRF

from concrete.ml.sklearn.svm import LinearSVC
from sklearn.svm import LinearSVC as skSVC

import time
import pandas as pd

In [2]:
# Reading the dataset
dataset = pd.read_csv("Brain_GSE50161.csv")
dataset.head()

Unnamed: 0,samples,type,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,1320_at,...,AFFX-r2-Ec-bioD-3_at,AFFX-r2-Ec-bioD-5_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at
0,834,ependymoma,12.49815,7.604868,6.880934,9.027128,4.176175,7.22492,6.085942,6.835999,...,9.979005,9.92647,12.719785,12.777792,5.403657,4.870548,4.04738,3.721936,4.516434,4.74994
1,835,ependymoma,13.067436,7.99809,7.209076,9.723322,4.826126,7.539381,6.250962,8.012549,...,11.924749,11.21593,13.605662,13.401342,5.224555,4.895315,3.786437,3.564481,4.430891,4.491416
2,836,ependymoma,13.068179,8.573674,8.647684,9.613002,4.396581,7.813101,6.007746,7.178156,...,12.154405,11.53246,13.764593,13.4778,5.303565,5.052184,4.005343,3.595382,4.563494,4.668827
3,837,ependymoma,12.45604,9.098977,6.628784,8.517677,4.154847,8.361843,6.596064,6.347285,...,11.969072,11.288801,13.600828,13.379029,4.953429,4.708371,3.892318,3.759429,4.748381,4.521275
4,838,ependymoma,12.699958,8.800721,11.556188,9.166309,4.165891,7.923826,6.212754,6.866387,...,11.411701,11.169317,13.751442,13.803646,4.892677,4.773806,3.796856,3.577544,4.504385,4.54145


In [3]:
dataset.describe()
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130 entries, 0 to 129
Columns: 54677 entries, samples to AFFX-TrpnX-M_at
dtypes: float64(54675), int64(1), object(1)
memory usage: 54.2+ MB


In [4]:
feature_cols = [c for c in dataset.columns[2:]]
#print(feature_cols)

In [5]:
x = dataset.loc[:,feature_cols].values #must be floats
y = dataset.loc[:,'type'].values #must be integers

print(x)
print(y)

[[12.49814981  7.60486786  6.88093353 ...  3.72193569  4.51643352
   4.74994043]
 [13.06743563  7.99808988  7.20907586 ...  3.56448096  4.43089086
   4.49141606]
 [13.06817894  8.57367396  8.64768433 ...  3.59538202  4.56349448
   4.66882701]
 ...
 [12.70699132  8.79572065  7.77235946 ...  3.70096421  4.76469317
   4.83495239]
 [12.68459256  8.29393768  7.22818594 ...  3.92036291  4.66558394
   4.61332632]
 [12.39772163  8.84352405  8.82509988 ...  3.67811185  5.01850083
   4.70477883]]
['ependymoma' 'ependymoma' 'ependymoma' 'ependymoma' 'ependymoma'
 'ependymoma' 'ependymoma' 'ependymoma' 'ependymoma' 'ependymoma'
 'ependymoma' 'ependymoma' 'ependymoma' 'ependymoma' 'ependymoma'
 'ependymoma' 'ependymoma' 'ependymoma' 'ependymoma' 'ependymoma'
 'ependymoma' 'ependymoma' 'ependymoma' 'ependymoma' 'ependymoma'
 'ependymoma' 'ependymoma' 'ependymoma' 'ependymoma' 'ependymoma'
 'ependymoma' 'ependymoma' 'ependymoma' 'ependymoma' 'ependymoma'
 'ependymoma' 'ependymoma' 'ependymoma' 'epend

In [6]:
# Preprocessing with labels for the lineage
le = preprocessing.LabelEncoder()
y = le.fit_transform(y)
print(y)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3
 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4]


In [7]:
x = x.astype(float)
print(x)

[[12.49814981  7.60486786  6.88093353 ...  3.72193569  4.51643352
   4.74994043]
 [13.06743563  7.99808988  7.20907586 ...  3.56448096  4.43089086
   4.49141606]
 [13.06817894  8.57367396  8.64768433 ...  3.59538202  4.56349448
   4.66882701]
 ...
 [12.70699132  8.79572065  7.77235946 ...  3.70096421  4.76469317
   4.83495239]
 [12.68459256  8.29393768  7.22818594 ...  3.92036291  4.66558394
   4.61332632]
 [12.39772163  8.84352405  8.82509988 ...  3.67811185  5.01850083
   4.70477883]]


In [8]:
print("done!", x.shape)

done! (130, 54675)


In [9]:
# Feature Selection

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

opt = 2

if opt == 1:
    # Remove all features that have low variance (on or off) in more than (percent)% of the samples.
    percent = .65
    sel = VarianceThreshold(threshold=(percent * (1 - percent)))
    print("Shape of X: ", x.shape)
    #print(x.iloc[0])
    x = sel.fit_transform(x)
    print("After feature selection: ", len(x[0]), " features")
    #print(x[0])
elif opt == 2:
    print("Shape of x before selection: ", x.shape)
    x = SelectKBest(chi2, k=10).fit_transform(x, y)
    print("Shape of x after selection: ", x.shape)
elif opt == 3:
    x_scaled = StandardScaler().fit_transform(x)
    pca = PCA(n_components=30)
    pca_features = pca.fit_transform(x_scaled)
    print('Shape before PCA: ', x_scaled.shape)
    print('Shape after PCA: ', pca_features.shape)
    x = pca_features
else:
    print("")

Shape of x before selection:  (130, 54675)
Shape of x after selection:  (130, 10)


In [10]:
# Retrieve train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.10, random_state = 0)

st_x = StandardScaler()
X_train = st_x.fit_transform(X_train)
X_test = st_x.transform(X_test)

In [11]:
# Logistic Regression

skmodel_LR = skLR(class_weight='balanced')
skmodel_LR.fit(X_train,y_train)
y_pred_clear_LR = skmodel_LR.predict(X_test)

quant_LR = LogisticRegression(class_weight='balanced')
quant_LR.fit(X_train, y_train)
y_pred_q_LR = quant_LR.predict(X_test, execute_in_fhe = False)

print("Logistic Regression Results \n")
print("SKLEARN PREDICTION:\n", y_pred_clear_LR)
print("QUANTIZED CLEAR PREDICTION:\n", y_pred_q_LR)
print("ACTUAL:\n", y_test) 
print("\n")

# Accuracy
skLR_accuracy = accuracy_score(y_test, y_pred_clear_LR) * 100
quantLR_accuracy = accuracy_score(y_test, y_pred_q_LR) * 100
print(f"Sklearn accuracy: {skLR_accuracy:.4f}")
print(f"Quantized Clear Accuracy: {quantLR_accuracy:.4f}")
# print("\n")
# fheLR_accuracy = accuracy_score(y_test, y_pred_fhe_LR) * 100
# print(f"FHE Accuracy: {fheLR_accuracy:.4f}")
print("\n")

# Balanced Accuracy
skLR_bal_accuracy = balanced_accuracy_score(y_test, y_pred_clear_LR) * 100
quantLR_bal_accuracy = balanced_accuracy_score(y_test, y_pred_q_LR) * 100
print(f"Sklearn Balanced accuracy: {skLR_bal_accuracy:.4f}")
print(f"Quantized Clear Balanced Accuracy: {quantLR_bal_accuracy:.4f}")
print("\n")

# F1 Score
skLR_f1 = f1_score(y_test, y_pred_clear_LR, average='weighted') * 100
quantLR_f1 = f1_score(y_test, y_pred_q_LR, average='weighted') * 100
print(f"Sklearn F1 Score: {skLR_f1:.4f}")
print(f"Quantized Clear F1 Score: {quantLR_f1:.4f}")
print("\n")

# ROC AUC Score
skLR_roc_auc_ovr = roc_auc_score(y_train, skmodel_LR.predict_proba(X_train), multi_class='ovr')
skLR_roc_auc_ovo = roc_auc_score(y_train, skmodel_LR.predict_proba(X_train), multi_class='ovo')
quantLR_roc_auc_ovr = roc_auc_score(y_train, quant_LR.predict_proba(X_train), multi_class='ovr')
quantLR_roc_auc_ovo = roc_auc_score(y_train, quant_LR.predict_proba(X_train), multi_class='ovo')
print(f"Sklearn ROC AUC Score (One-vs-Rest): {skLR_roc_auc_ovr:.4f}")
print(f"Sklearn ROC AUC Score (One-vs-One): {skLR_roc_auc_ovo:.4f}")
print(f"Quantized Clear ROC AUC Score (One-vs-Rest): {quantLR_roc_auc_ovr:.4f}")
print(f"Quantized Clear ROC AUC Score (One-vs-One): {quantLR_roc_auc_ovo:.4f}")

Logistic Regression Results 

SKLEARN PREDICTION:
 [0 4 2 3 2 0 0 0 1 0 4 0 0]
QUANTIZED CLEAR PREDICTION:
 [4 1 3 3 2 4 4 4 4 1 4 4 4]
ACTUAL:
 [0 1 2 3 2 0 0 0 1 0 1 0 0]


Sklearn accuracy: 84.6154
Quantized Clear Accuracy: 23.0769


Sklearn Balanced accuracy: 83.3333
Quantized Clear Balanced Accuracy: 45.8333


Sklearn F1 Score: 88.4615
Quantized Clear F1 Score: 24.6154


Sklearn ROC AUC Score (One-vs-Rest): 0.9700
Sklearn ROC AUC Score (One-vs-One): 0.9696
Quantized Clear ROC AUC Score (One-vs-Rest): 0.7828
Quantized Clear ROC AUC Score (One-vs-One): 0.8227


In [12]:
# Linear SVC

skmodel_SVC = skSVC(class_weight='balanced')
skmodel_SVC.fit(X_train,y_train)
y_pred_clear_SVC = skmodel_SVC.predict(X_test)

quant_SVC = LinearSVC(class_weight='balanced')
quant_SVC.fit(X_train, y_train)
y_pred_q_SVC = quant_SVC.predict(X_test, execute_in_fhe = False)

print("Linear Support Vector Classifier Results \n")
print("SKLEARN PREDICTION:\n", y_pred_clear_SVC)
print("QUANTIZED CLEAR PREDICTION:\n", y_pred_q_SVC)
print("ACTUAL:\n", y_test) 
print("\n")

# Accuracy
skSVC_accuracy = accuracy_score(y_test, y_pred_clear_SVC) * 100
quantSVC_accuracy = accuracy_score(y_test, y_pred_q_SVC) * 100
print(f"Sklearn accuracy: {skSVC_accuracy:.4f}")
print(f"Quantized Clear Accuracy: {quantSVC_accuracy:.4f}")
# print("\n")
# fheLR_accuracy = accuracy_score(y_test, y_pred_fhe_LR) * 100
# print(f"FHE Accuracy: {fheLR_accuracy:.4f}")
print("\n")

# Balanced Accuracy
skSVC_bal_accuracy = balanced_accuracy_score(y_test, y_pred_clear_SVC) * 100
quantSVC_bal_accuracy = balanced_accuracy_score(y_test, y_pred_q_SVC) * 100
print(f"Sklearn Balanced accuracy: {skSVC_bal_accuracy:.4f}")
print(f"Quantized Clear Balanced Accuracy: {quantSVC_bal_accuracy:.4f}")
print("\n")

# F1 Score
skSVC_f1 = f1_score(y_test, y_pred_clear_SVC, average='weighted') * 100
quantSVC_f1 = f1_score(y_test, y_pred_q_SVC, average='weighted') * 100
print(f"Sklearn F1 Score: {skSVC_f1:.4f}")
print(f"Quantized Clear F1 Score: {quantSVC_f1:.4f}")
print("\n")

# ROC AUC Score
skSVC_roc_auc_ovr = roc_auc_score(y_train, skmodel_SVC.predict_proba(X_train), multi_class='ovr')
skSVC_roc_auc_ovo = roc_auc_score(y_train, skmodel_SVC.predict_proba(X_train), multi_class='ovo')
quantSVC_roc_auc_ovr = roc_auc_score(y_train, quant_SVC.predict_proba(X_train), multi_class='ovr')
quantSVC_roc_auc_ovo = roc_auc_score(y_train, quant_SVC.predict_proba(X_train), multi_class='ovo')
print(f"Sklearn ROC AUC Score (One-vs-Rest): {skSVC_roc_auc_ovr:.4f}")
print(f"Sklearn ROC AUC Score (One-vs-One): {skSVC_roc_auc_ovo:.4f}")
print(f"Quantized Clear ROC AUC Score (One-vs-Rest): {quantSVC_roc_auc_ovr:.4f}")
print(f"Quantized Clear ROC AUC Score (One-vs-One): {quantSVC_roc_auc_ovo:.4f}")

Linear Support Vector Classifier Results 

SKLEARN PREDICTION:
 [0 4 2 3 2 0 0 0 1 0 4 0 0]
QUANTIZED CLEAR PREDICTION:
 [0 1 2 3 2 0 0 0 0 0 0 0 0]
ACTUAL:
 [0 1 2 3 2 0 0 0 1 0 1 0 0]


Sklearn accuracy: 84.6154
Quantized Clear Accuracy: 84.6154


Sklearn Balanced accuracy: 83.3333
Quantized Clear Balanced Accuracy: 83.3333


Sklearn F1 Score: 88.4615
Quantized Clear F1 Score: 81.7308




AttributeError: 'LinearSVC' object has no attribute 'predict_proba'

In [13]:
# Random Forest

skmodel_RF = skRF(class_weight='balanced')
skmodel_RF.fit(X_train,y_train)
y_pred_clear_RF = skmodel_RF.predict(X_test)

quant_RF = RandomForestClassifier(class_weight='balanced')
quant_RF.fit(X_train, y_train)
y_pred_q_RF = quant_RF.predict(X_test, execute_in_fhe = False)

print("Random Forest Results \n")
print("SKLEARN PREDICTION:\n", y_pred_clear_RF)
print("QUANTIZED CLEAR PREDICTION:\n", y_pred_q_RF)
print("ACTUAL:\n", y_test) 
print("\n")

# Accuracy
skRF_accuracy = accuracy_score(y_test, y_pred_clear_RF) * 100
quantRF_accuracy = accuracy_score(y_test, y_pred_q_RF) * 100
print(f"Sklearn accuracy: {skRF_accuracy:.4f}")
print(f"Quantized Clear Accuracy: {quantRF_accuracy:.4f}")
# print("\n")
# fheLR_accuracy = accuracy_score(y_test, y_pred_fhe_LR) * 100
# print(f"FHE Accuracy: {fheLR_accuracy:.4f}")
print("\n")

# Balanced Accuracy
skRF_bal_accuracy = balanced_accuracy_score(y_test, y_pred_clear_RF) * 100
quantRF_bal_accuracy = balanced_accuracy_score(y_test, y_pred_q_RF) * 100
print(f"Sklearn Balanced accuracy: {skRF_bal_accuracy:.4f}")
print(f"Quantized Clear Balanced Accuracy: {quantRF_bal_accuracy:.4f}")
print("\n")

# F1 Score
skRF_f1 = f1_score(y_test, y_pred_clear_RF, average='weighted') * 100
quantRF_f1 = f1_score(y_test, y_pred_q_RF, average='weighted') * 100
print(f"Sklearn F1 Score: {skRF_f1:.4f}")
print(f"Quantized Clear F1 Score: {quantRF_f1:.4f}")
print("\n")

# ROC AUC Score
skRF_roc_auc_ovr = roc_auc_score(y_train, skmodel_RF.predict_proba(X_train), multi_class='ovr')
skRF_roc_auc_ovo = roc_auc_score(y_train, skmodel_RF.predict_proba(X_train), multi_class='ovo')
quantRF_roc_auc_ovr = roc_auc_score(y_train, quant_RF.predict_proba(X_train), multi_class='ovr')
quantRF_roc_auc_ovo = roc_auc_score(y_train, quant_RF.predict_proba(X_train), multi_class='ovo')
print(f"Sklearn ROC AUC Score (One-vs-Rest): {skLR_roc_auc_ovr:.4f}")
print(f"Sklearn ROC AUC Score (One-vs-One): {skLR_roc_auc_ovo:.4f}")
print(f"Quantized Clear ROC AUC Score (One-vs-Rest): {quantLR_roc_auc_ovr:.4f}")
print(f"Quantized Clear ROC AUC Score (One-vs-One): {quantLR_roc_auc_ovo:.4f}")

Random Forest Results 

SKLEARN PREDICTION:
 [0 1 2 3 2 0 0 0 1 0 4 0 0]
QUANTIZED CLEAR PREDICTION:
 [0 1 2 3 2 0 0 0 4 0 4 0 0]
ACTUAL:
 [0 1 2 3 2 0 0 0 1 0 1 0 0]


Sklearn accuracy: 92.3077
Quantized Clear Accuracy: 84.6154


Sklearn Balanced accuracy: 91.6667
Quantized Clear Balanced Accuracy: 83.3333


Sklearn F1 Score: 95.3846
Quantized Clear F1 Score: 88.4615




ValueError: Target scores need to be probabilities for multiclass roc_auc, i.e. they should sum up to 1.0 over classes

In [None]:
# Logistic Regression FHE Model
fhe_LR = quant_LR.compile(x)
y_pred_fhe_LR = quant_LR.predict(X_test, execute_in_fhe=True)

# Linear SVC FHE Model
fhe_SVC = quant_SVC.compile(x)
y_pred_fhe_SVC = quant_SVC.predict(X_test, execute_in_fhe=True)

# Random Forest FHE Model
fhe_RF = quant_RF.compile(x)
y_pred_fhe_RF = quant_RF.predict(X_test, execute_in_fhe=True)

In [None]:
# # Initialize concrete model and fix the number of bits to used for quantization 
# model = LogisticRegression(class_weight='balanced')

# # Fit the model
# model.fit(X_train, y_train)

# # Run the predictions on non-encrypted data as a reference
# y_pred_q = model.predict(X_test, execute_in_fhe = False)

In [None]:
# # Output (plaintext vs FHE):
# print("In clear:  ", le.inverse_transform(y_pred_q))
# print("Accuracy rate:  ", accuracy_score(y_test, y_pred_q * 100, "%")
# # print(model.score(X_test, y_test)*100,"%")

In [None]:
# #low/higly volatile accuracy may be attributed to small dataset and too many features 
# #(feature selection needed and more samples are required)
# print("SKLEARN PREDICTION:\n", y_pred_clear)
# print("QUANTIZED CLEAR PREDICTION:\n", y_pred_q)
# print("ACTUAL:\n", y_test) 

In [None]:
# # Compile into a FHE model
# model.compile(x)

# # Run the inference in FHE
# y_pred_fhe = model.predict(X_test, execute_in_fhe=True)

# # print("In clear  :", y_pred_clear)
# # print("In FHE    :", y_pred_fhe)
# print("SKLEARN PREDICTION:\n", y_pred_clear)
# print("QUANTIZED CLEAR PREDICTION:\n", y_pred_q)
# print("FHE PREDICTION: \n", y_pred_fhe)
# print("ACTUAL:\n", y_test) 
# print(f"Comparison: {int((y_pred_fhe == y_pred_clear).sum()/len(y_pred_fhe)*100)}% similar")


In [None]:
# # Accuracy of Plaintext Model in Sklearn and Concrete
# sklearn_accuracy = accuracy_score(y_test, y_pred_clear) * 100
# quantized_accuracy = accuracy_score(y_test, y_pred_q) * 100
# print(f"Sklearn accuracy: {sklearn_accuracy:.4f}")
# print(f"Quantized Clear Accuracy: {quantized_accuracy:.4f}")


In [None]:
# fhe_accuracy = accuracy_score(y_test, y_pred_fhe) * 100
# print(f"FHE Accuracy: {fhe_accuracy:.4f}")

In [None]:
# # prepare the cross-validation procedure
# cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# # evaluate model
# scores = cross_val_score(model, x, y, scoring='accuracy', cv=cv, n_jobs=-1)

# # report performance
# print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))