In [1]:
from SafeTransformer import SafeTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
import random
from sklearn.metrics import auc
import matplotlib.pyplot as plt
from copy import deepcopy
from math import floor, fmod, log
from scipy.special import logit
from sklearn import metrics
from sklearn.ensemble.partial_dependence import plot_partial_dependence

In [3]:
# Load the blood-donation data set and split it into features and target.
# The target column name is kept in one place so the two uses cannot drift apart.
target_column = "whether he/she donated blood in March 2007"

blood = pd.read_csv("blood.csv")
y = blood[target_column]
X = blood.drop(columns=[target_column])

# Preview the raw frame. This has to be the last expression of the cell:
# a bare expression in the middle of a cell is evaluated and thrown away.
blood.head()

In [4]:
# Hold out part of the data for evaluation; random_state pins the shuffle so
# the same rows land in the same split on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=123,
)

# SAFE - logistic regression

In [5]:
# Baseline: logistic regression fitted directly on the untransformed features.
# fit() returns the estimator itself, so construction and fitting are chained.
logistic_model = LogisticRegression(solver='lbfgs').fit(X_train, y_train)

# Accuracy of the hard class predictions on the hold-out split.
standard_predictions = logistic_model.predict(X_test)
print(accuracy_score(y_test, standard_predictions))

# ROC AUC from the second predict_proba column, with label 1 as the positive class.
pred = logistic_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, pred, pos_label=1)
metrics.auc(fpr, tpr)

0.7486631016042781


0.6703647416413374

In [44]:
# Gradient boosting with 500 trees and learning rate 0.11, scored on the
# hold-out split with the same two metrics as the baseline.
surrogate_model = GradientBoostingClassifier(
    n_estimators=500,
    learning_rate=0.11,
    random_state=123,
).fit(X_train, y_train)

# Accuracy of the hard class predictions.
surrogate_model_predictions = surrogate_model.predict(X_test)
print(accuracy_score(y_test, surrogate_model_predictions))

# ROC AUC from the second predict_proba column, with label 1 as the positive class.
pred = surrogate_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, pred, pos_label=1)
metrics.auc(fpr, tpr)

0.7540106951871658


0.6787234042553192

In [45]:
# Sweep the SAFE penalty over 25 evenly spaced values in [0.01, 10]. For each
# value, fit a SafeTransformer + logistic regression pipeline on the training
# split and record its accuracy and ROC AUC on the test split.
# NOTE(review): the "best" penalty is chosen on the same test split that the
# scores are reported on, so best_acc / best_auc are optimistic estimates.
# Consider selecting the penalty on a validation split or by cross-validation.
pens = np.linspace(0.01, 10, 25)
best_auc = float('-Inf')
best_acc = float('-Inf')
best_auc_pen = 0
best_acc_pen = 0
aucs = []
accs = []

for pen in pens:
    # Fresh, unfitted estimators for every penalty value.
    surrogate_model = GradientBoostingClassifier(random_state = 123)
    logistic_model_simple = LogisticRegression(solver = 'lbfgs')
    safe_transformer = SafeTransformer(surrogate_model, penalty = pen)
    pipe = Pipeline(steps=[('safe', safe_transformer), ('linear', logistic_model_simple)])
    pipe = pipe.fit(X_train, y_train)

    # Accuracy of the hard class predictions.
    predictions = pipe.predict(X_test)
    acc = accuracy_score(y_test, predictions)
    accs.append(acc)

    # ROC AUC from the second predict_proba column, with label 1 as positive.
    pred = pipe.predict_proba(X_test)[:,1]
    fpr, tpr, thresholds = metrics.roc_curve(y_test, pred, pos_label=1)
    # Stored as auc_score rather than `auc`: the notebook imports the `auc`
    # function from sklearn.metrics, and rebinding that name to a float would
    # make any later `auc(...)` call fail.
    auc_score = metrics.auc(fpr, tpr)
    aucs.append(auc_score)

    print([pen, acc, auc_score])

    # Strict '>' keeps the first (smallest) penalty when several values tie.
    # The deep copies snapshot the step objects handed to the pipeline
    # (presumably fitted in place by pipe.fit -- confirm for the installed
    # scikit-learn version).
    if acc > best_acc:
        best_acc_transformer = deepcopy(safe_transformer)
        best_acc_model = deepcopy(logistic_model_simple)
        best_acc = acc
        best_acc_pen = pen

    if auc_score > best_auc:
        best_auc_transformer = deepcopy(safe_transformer)
        best_auc_model = deepcopy(logistic_model_simple)
        best_auc = auc_score
        best_auc_pen = pen

[0.01, 0.7486631016042781, 0.6581306990881459]
[0.42625, 0.7540106951871658, 0.6388297872340425]
[0.8425, 0.7540106951871658, 0.6472644376899696]
[1.25875, 0.7593582887700535, 0.6677811550151977]
[1.675, 0.7486631016042781, 0.6186170212765958]
[2.0912499999999996, 0.7486631016042781, 0.5526595744680851]
[2.5075, 0.7540106951871658, 0.5531155015197569]
[2.92375, 0.7540106951871658, 0.5477203647416413]
[3.34, 0.7540106951871658, 0.5477203647416413]
[3.7562499999999996, 0.7540106951871658, 0.5477203647416413]
[4.172499999999999, 0.7540106951871658, 0.5433890577507599]
[4.58875, 0.7540106951871658, 0.5433890577507599]
[5.005, 0.7540106951871658, 0.5433890577507599]
[5.42125, 0.7540106951871658, 0.5433890577507599]
[5.8375, 0.7540106951871658, 0.5433890577507599]
[6.25375, 0.7540106951871658, 0.5433890577507599]
[6.67, 0.7540106951871658, 0.5433890577507599]
[7.08625, 0.7540106951871658, 0.5433890577507599]
[7.5024999999999995, 0.7540106951871658, 0.5433890577507599]
[7.91875, 0.75401069518

In [46]:
# Highest test-split ROC AUC recorded by the penalty sweep loop.
best_auc

0.6677811550151977

In [47]:
# Highest test-split accuracy recorded by the penalty sweep loop.
best_acc

0.7593582887700535

# SAFE trees

In [48]:
# Baseline: a single decision tree fitted directly on the untransformed features.
# fit() returns the estimator itself, so construction and fitting are chained.
tree_model = DecisionTreeClassifier(random_state=123).fit(X_train, y_train)

# Accuracy of the hard class predictions on the hold-out split.
standard_predictions = tree_model.predict(X_test)
print(accuracy_score(y_test, standard_predictions))

# ROC AUC from the second predict_proba column, with label 1 as the positive class.
pred = tree_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, pred, pos_label=1)
metrics.auc(fpr, tpr)

0.7379679144385026


0.5981762917933131

In [51]:
# Gradient boosting with default hyper-parameters (only the seed is fixed),
# scored on the hold-out split with the same two metrics as the tree baseline.
surrogate_model = GradientBoostingClassifier(random_state=123).fit(X_train, y_train)

# Accuracy of the hard class predictions.
surrogate_model_predictions = surrogate_model.predict(X_test)
print(accuracy_score(y_test, surrogate_model_predictions))

# ROC AUC from the second predict_proba column, with label 1 as the positive class.
pred = surrogate_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, pred, pos_label=1)
metrics.auc(fpr, tpr)

0.7754010695187166


0.6669452887537993

In [53]:
# Sweep the SAFE penalty over 25 evenly spaced values in [0.01, 10]. For each
# value, fit a SafeTransformer + decision tree pipeline on the training split
# and record its accuracy and ROC AUC on the test split.
# NOTE(review): the "best" penalty is chosen on the same test split that the
# scores are reported on, so best_acc / best_auc are optimistic estimates.
# Consider selecting the penalty on a validation split or by cross-validation.
pens = np.linspace(0.01, 10, 25)
best_auc = float('-Inf')
best_acc = float('-Inf')
best_auc_pen = 0
best_acc_pen = 0
aucs = []
accs = []

for pen in pens:
    # Fresh, unfitted estimators for every penalty value.
    surrogate_model = GradientBoostingClassifier(random_state = 123)
    tree_model_simple = DecisionTreeClassifier(random_state = 123)
    safe_transformer = SafeTransformer(surrogate_model, penalty = pen)
    pipe = Pipeline(steps=[('safe', safe_transformer), ('tree', tree_model_simple)])
    pipe = pipe.fit(X_train, y_train)

    # Accuracy of the hard class predictions.
    predictions = pipe.predict(X_test)
    acc = accuracy_score(y_test, predictions)
    accs.append(acc)

    # ROC AUC from the second predict_proba column, with label 1 as positive.
    pred = pipe.predict_proba(X_test)[:,1]
    fpr, tpr, thresholds = metrics.roc_curve(y_test, pred, pos_label=1)
    # Stored as auc_score rather than `auc`: the notebook imports the `auc`
    # function from sklearn.metrics, and rebinding that name to a float would
    # make any later `auc(...)` call fail.
    auc_score = metrics.auc(fpr, tpr)
    aucs.append(auc_score)

    print([pen, acc, auc_score])

    # Strict '>' keeps the first (smallest) penalty when several values tie.
    # The deep copies snapshot the step objects handed to the pipeline
    # (presumably fitted in place by pipe.fit -- confirm for the installed
    # scikit-learn version).
    if acc > best_acc:
        best_acc_transformer = deepcopy(safe_transformer)
        best_acc_model = deepcopy(tree_model_simple)
        best_acc = acc
        best_acc_pen = pen

    if auc_score > best_auc:
        best_auc_transformer = deepcopy(safe_transformer)
        best_auc_model = deepcopy(tree_model_simple)
        best_auc = auc_score
        best_auc_pen = pen

[0.01, 0.7433155080213903, 0.6070668693009118]
[0.42625, 0.732620320855615, 0.6164893617021276]
[0.8425, 0.7540106951871658, 0.6464285714285715]
[1.25875, 0.7593582887700535, 0.6831306990881458]
[1.675, 0.7486631016042781, 0.5686170212765957]
[2.0912499999999996, 0.7486631016042781, 0.5455167173252279]
[2.5075, 0.7540106951871658, 0.5456686930091185]
[2.92375, 0.7540106951871658, 0.5255319148936171]
[3.34, 0.7540106951871658, 0.5255319148936171]
[3.7562499999999996, 0.7540106951871658, 0.5255319148936171]
[4.172499999999999, 0.7540106951871658, 0.5291033434650455]
[4.58875, 0.7540106951871658, 0.5291033434650455]
[5.005, 0.7540106951871658, 0.5291033434650455]
[5.42125, 0.7540106951871658, 0.5291033434650455]
[5.8375, 0.7540106951871658, 0.5291033434650455]
[6.25375, 0.7540106951871658, 0.5291033434650455]
[6.67, 0.7540106951871658, 0.5291033434650455]
[7.08625, 0.7540106951871658, 0.5291033434650455]
[7.5024999999999995, 0.7540106951871658, 0.5291033434650455]
[7.91875, 0.754010695187

In [54]:
# Best test-split accuracy and ROC AUC recorded by the penalty sweep loop.
[best_acc, best_auc]

[0.7593582887700535, 0.6831306990881458]