In [45]:
from SafeTransformer import SafeTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
import random
from sklearn.metrics import auc
import matplotlib.pyplot as plt
from copy import deepcopy
from math import floor, fmod, log
from scipy.special import logit
from sklearn import metrics
from sklearn.ensemble.partial_dependence import plot_partial_dependence

In [46]:
# Load the diabetes dataset from the working directory and preview it.
# NOTE(review): the preview shows an "Unnamed: 0" column holding 0, 1, 2, ...;
# it looks like a row index that was saved into the CSV rather than a real
# measurement — confirm before using it as a feature.
diabetes = pd.read_csv(filepath_or_buffer="diabetes.csv")
diabetes.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [47]:
# Target: the binary "Outcome" column.
y = diabetes["Outcome"]

# Features: every column except the target and any auto-named "Unnamed: ..."
# column. The preview of the frame shows "Unnamed: 0" holding 0, 1, 2, ..., i.e.
# the row index that was written into the CSV; feeding a row id to the models
# adds a meaningless (and potentially leaky) feature.
non_feature_columns = ["Outcome"] + [
    column for column in diabetes.columns if str(column).startswith("Unnamed:")
]
X = diabetes.drop(columns=non_feature_columns)

In [48]:
# Hold out a quarter of the rows for evaluation; the seed is fixed so the split
# is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,  # the library default when neither size is given
    random_state=1234,
)

# SAFE with logistic regression

In [49]:
# Baseline: logistic regression fitted directly on the training features.
logistic_model = LogisticRegression(solver="lbfgs").fit(X_train, y_train)

# Accuracy of the hard class predictions on the held-out split.
standard_predictions = logistic_model.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=standard_predictions))

# ROC AUC from the predicted probability of the positive class (label 1).
pred = logistic_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, pred, pos_label=1)
metrics.auc(fpr, tpr)

0.7447916666666666




0.8138749854498893

In [50]:
# Reference model: gradient boosting with a fixed seed, fitted on the same
# training split.
surrogate_model = GradientBoostingClassifier(random_state=123).fit(X_train, y_train)

# Accuracy of the hard class predictions on the held-out split.
surrogate_model_predictions = surrogate_model.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=surrogate_model_predictions))

# ROC AUC from the predicted probability of the positive class (label 1).
pred = surrogate_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, pred, pos_label=1)
metrics.auc(fpr, tpr)

0.734375


0.8217902456058667

In [51]:
# Sweep the SafeTransformer penalty over 25 evenly spaced values and remember
# the transformer / logistic model pair that scores best on accuracy and the
# pair that scores best on ROC AUC.
#
# NOTE(review): every candidate is scored on X_test, so the penalty is being
# selected on the test split and best_acc / best_auc are optimistically
# biased. Consider choosing the penalty on a validation split or with
# cross-validation and reporting the test score only once.
pens = np.linspace(0.01, 10, 25)
best_auc = float('-Inf')
best_acc = float('-Inf')
best_auc_pen = 0
best_acc_pen = 0
aucs = []
accs = []

for pen in pens:
    # Fresh estimators for every penalty so no state carries over between runs.
    surrogate_model = GradientBoostingClassifier(random_state=123)
    logistic_model_simple = LogisticRegression(solver='lbfgs')
    safe_transformer = SafeTransformer(surrogate_model, penalty=pen)
    pipe = Pipeline(steps=[('safe', safe_transformer), ('linear', logistic_model_simple)])
    pipe = pipe.fit(X_train, y_train)

    # Accuracy of the hard class predictions.
    predictions = pipe.predict(X_test)
    acc = accuracy_score(y_test, predictions)
    accs.append(acc)

    # ROC AUC from the predicted probability of the positive class (label 1).
    # The score is stored under `roc_auc` rather than `auc` so this cell does
    # not rebind the `auc` function imported from sklearn.metrics to a float.
    pred = pipe.predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = metrics.roc_curve(y_test, pred, pos_label=1)
    roc_auc = metrics.auc(fpr, tpr)
    aucs.append(roc_auc)

    print([pen, acc, roc_auc])

    # Strict ">" keeps the first (smallest) penalty when several tie.
    if acc > best_acc:
        best_acc_transformer = deepcopy(safe_transformer)
        best_acc_model = deepcopy(logistic_model_simple)
        best_acc = acc
        best_acc_pen = pen

    if roc_auc > best_auc:
        best_auc_transformer = deepcopy(safe_transformer)
        best_auc_model = deepcopy(logistic_model_simple)
        best_auc = roc_auc
        best_auc_pen = pen

[0.01, 0.7291666666666666, 0.7902456058666046]
[0.42625, 0.7708333333333334, 0.8384937725526714]
[0.8425, 0.7708333333333334, 0.8175416133162613]
[1.25875, 0.7604166666666666, 0.8125363752764521]
[1.675, 0.7708333333333334, 0.8198114305668723]
[2.0912499999999996, 0.7552083333333334, 0.7804097311139564]
[2.5075, 0.7708333333333334, 0.7802351297869865]
[2.92375, 0.7708333333333334, 0.7802351297869865]
[3.34, 0.7708333333333334, 0.7802351297869863]
[3.7562499999999996, 0.7708333333333334, 0.7802351297869863]
[4.172499999999999, 0.7708333333333334, 0.7802351297869863]
[4.58875, 0.7708333333333334, 0.7802351297869863]
[5.005, 0.7708333333333334, 0.7802351297869863]
[5.42125, 0.7708333333333334, 0.7802351297869863]
[5.8375, 0.7708333333333334, 0.7802351297869863]
[6.25375, 0.7708333333333334, 0.7802351297869863]
[6.67, 0.7447916666666666, 0.7708066581306019]
[7.08625, 0.7447916666666666, 0.7712722616691887]
[7.5024999999999995, 0.7447916666666666, 0.7712722616691887]
[7.91875, 0.74479166666

In [52]:
# Highest ROC AUC (measured on X_test) found across the penalty sweep above.
best_auc

0.8384937725526714

In [53]:
# Highest accuracy (measured on X_test) found across the penalty sweep above.
best_acc

0.7708333333333334

# SAFE with decision trees

In [54]:
# Baseline: a single decision tree (fixed seed) fitted directly on the
# training features.
tree_model = DecisionTreeClassifier(random_state=123).fit(X_train, y_train)

# Accuracy of the hard class predictions on the held-out split.
standard_predictions = tree_model.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=standard_predictions))

# ROC AUC from the predicted probability of the positive class (label 1).
pred = tree_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, pred, pos_label=1)
metrics.auc(fpr, tpr)

0.6875


0.6647654522174369

In [55]:
# Reference model: gradient boosting with a fixed seed, fitted on the same
# training split.
surrogate_model = GradientBoostingClassifier(random_state=123).fit(X_train, y_train)

# Accuracy of the hard class predictions on the held-out split.
surrogate_model_predictions = surrogate_model.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=surrogate_model_predictions))

# ROC AUC from the predicted probability of the positive class (label 1).
pred = surrogate_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, pred, pos_label=1)
metrics.auc(fpr, tpr)

0.734375


0.8217902456058667

In [56]:
# Sweep the SafeTransformer penalty over 25 evenly spaced values and remember
# the transformer / decision tree pair that scores best on accuracy and the
# pair that scores best on ROC AUC.
#
# NOTE(review): every candidate is scored on X_test, so the penalty is being
# selected on the test split and best_acc / best_auc are optimistically
# biased. Consider choosing the penalty on a validation split or with
# cross-validation and reporting the test score only once.
pens = np.linspace(0.01, 10, 25)
best_auc = float('-Inf')
best_acc = float('-Inf')
best_auc_pen = 0
best_acc_pen = 0
aucs = []
accs = []

for pen in pens:
    # Fresh estimators for every penalty so no state carries over between runs.
    surrogate_model = GradientBoostingClassifier(random_state=123)
    tree_model_simple = DecisionTreeClassifier(random_state=123)
    safe_transformer = SafeTransformer(surrogate_model, penalty=pen)
    pipe = Pipeline(steps=[('safe', safe_transformer), ('tree', tree_model_simple)])
    pipe = pipe.fit(X_train, y_train)

    # Accuracy of the hard class predictions.
    predictions = pipe.predict(X_test)
    acc = accuracy_score(y_test, predictions)
    accs.append(acc)

    # ROC AUC from the predicted probability of the positive class (label 1).
    # The score is stored under `roc_auc` rather than `auc` so this cell does
    # not rebind the `auc` function imported from sklearn.metrics to a float.
    pred = pipe.predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = metrics.roc_curve(y_test, pred, pos_label=1)
    roc_auc = metrics.auc(fpr, tpr)
    aucs.append(roc_auc)

    print([pen, acc, roc_auc])

    # Strict ">" keeps the first (smallest) penalty when several tie.
    if acc > best_acc:
        best_acc_transformer = deepcopy(safe_transformer)
        best_acc_model = deepcopy(tree_model_simple)
        best_acc = acc
        best_acc_pen = pen

    if roc_auc > best_auc:
        best_auc_transformer = deepcopy(safe_transformer)
        best_auc_model = deepcopy(tree_model_simple)
        best_auc = roc_auc
        best_auc_pen = pen

[0.01, 0.640625, 0.6103480386450937]
[0.42625, 0.6927083333333334, 0.7156326388080549]
[0.8425, 0.6770833333333334, 0.6520195553486205]
[1.25875, 0.6822916666666666, 0.6698288906995693]
[1.675, 0.7552083333333334, 0.7626003957630076]
[2.0912499999999996, 0.7552083333333334, 0.7426376440460948]
[2.5075, 0.7604166666666666, 0.7453148643929693]
[2.92375, 0.7604166666666666, 0.7453148643929693]
[3.34, 0.7552083333333334, 0.7564893493190548]
[3.7562499999999996, 0.7552083333333334, 0.7564893493190548]
[4.172499999999999, 0.7552083333333334, 0.7564893493190548]
[4.58875, 0.7552083333333334, 0.7564893493190548]
[5.005, 0.7552083333333334, 0.7564893493190548]
[5.42125, 0.7552083333333334, 0.7564893493190548]
[5.8375, 0.7552083333333334, 0.7564893493190548]
[6.25375, 0.7552083333333334, 0.7564893493190548]
[6.67, 0.7291666666666666, 0.747177278547317]
[7.08625, 0.71875, 0.7553253404725876]
[7.5024999999999995, 0.71875, 0.7553253404725876]
[7.91875, 0.71875, 0.7553253404725876]
[8.33499999999999

In [57]:
# Best accuracy and best ROC AUC (both measured on X_test) from the
# decision-tree penalty sweep above.
[best_acc, best_auc]

[0.7604166666666666, 0.7674310324758469]