In [1]:
from SafeTransformer import SafeTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
import random
from sklearn.metrics import auc
import matplotlib.pyplot as plt
from copy import deepcopy
from math import floor, fmod, log
from scipy.special import logit
from sklearn import metrics
from sklearn.ensemble.partial_dependence import plot_partial_dependence

In [24]:
# Load the TAE data. The file has no header row, so pandas assigns integer
# column labels; add_prefix turns them into the strings "X0", "X1", ...
# (A preview such as tae.head() only renders when it is the last expression
# of a cell, so it is not called in the middle of this one.)
tae = pd.read_csv("tae.csv", header=None).add_prefix("X")

# "X0" is used as the target. Subtracting 1 shifts its coding down by one
# (presumably 1-based labels -> 0-based -- TODO confirm against the data).
y = tae["X0"] - 1

# Every remaining column is used as a feature.
X = tae.drop(columns=["X0"])

In [25]:
# Hold out a test set. No test_size is passed, so scikit-learn's default
# proportion applies; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=123,
)

# SAFE with logistic regression

In [26]:
# Baseline: logistic regression fitted directly on the untransformed features.
logistic_model = LogisticRegression(solver='lbfgs').fit(X_train, y_train)

# Accuracy of the hard-label predictions on the held-out set.
standard_predictions = logistic_model.predict(X_test)
print(accuracy_score(y_test, standard_predictions))

# ROC AUC computed from column 1 of predict_proba, with label 1 as positive.
pred = logistic_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, pred, pos_label=1)
metrics.auc(fpr, tpr)

0.8421052631578947


0.725

In [27]:
# Reference score for a gradient-boosting model with hand-set hyperparameters.
# NOTE(review): the penalty sweep further down builds its surrogate with
# GradientBoostingClassifier(random_state = 123) only, i.e. without these
# n_estimators / learning_rate values -- confirm which configuration is meant.
surrogate_model = GradientBoostingClassifier(
    random_state=123,
    n_estimators=500,
    learning_rate=0.11,
).fit(X_train, y_train)

# Accuracy of the hard-label predictions on the held-out set.
surrogate_model_predictions = surrogate_model.predict(X_test)
print(accuracy_score(y_test, surrogate_model_predictions))

# ROC AUC computed from column 1 of predict_proba, with label 1 as positive.
pred = surrogate_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, pred, pos_label=1)
metrics.auc(fpr, tpr)

0.8421052631578947


0.8375

In [28]:
# Sweep the SAFE penalty. For every value a fresh pipeline is fitted
# (SafeTransformer driven by a new gradient-boosting surrogate, followed by
# logistic regression) and scored on the held-out set; the best accuracy and
# the best AUC are tracked separately, together with copies of the steps.
#
# NOTE(review): the winning penalty is picked on X_test / y_test, the same
# data the scores are reported on, so best_acc / best_auc are optimistic.
# A validation split or cross-validation would give an unbiased estimate.
pens = np.linspace(0.01, 10, 25)
best_acc = best_auc = float('-Inf')
best_acc_pen = best_auc_pen = 0
accs = []
aucs = []

for pen in pens:
    surrogate_model = GradientBoostingClassifier(random_state=123)
    logistic_model_simple = LogisticRegression(solver='lbfgs')
    safe_transformer = SafeTransformer(surrogate_model, penalty=pen)
    pipe = Pipeline(steps=[('safe', safe_transformer), ('linear', logistic_model_simple)])
    pipe = pipe.fit(X_train, y_train)

    # Accuracy of the hard-label predictions.
    predictions = pipe.predict(X_test)
    acc = accuracy_score(y_test, predictions)
    accs.append(acc)

    # ROC AUC from column 1 of predict_proba. The result is stored as
    # `auc_score` so that the `auc` function imported from sklearn.metrics
    # is not rebound to a float.
    pred = pipe.predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = metrics.roc_curve(y_test, pred, pos_label=1)
    auc_score = metrics.auc(fpr, tpr)
    aucs.append(auc_score)

    print([pen, acc, auc_score])

    # Strict '>' means ties keep the earlier (smaller) penalty.
    # deepcopy snapshots the step objects as they are after pipe.fit.
    if acc > best_acc:
        best_acc_transformer = deepcopy(safe_transformer)
        best_acc_model = deepcopy(logistic_model_simple)
        best_acc = acc
        best_acc_pen = pen

    if auc_score > best_auc:
        best_auc_transformer = deepcopy(safe_transformer)
        best_auc_model = deepcopy(logistic_model_simple)
        best_auc = auc_score
        best_auc_pen = pen

[0.01, 0.8421052631578947, 0.8208333333333333]
[0.42625, 0.8421052631578947, 0.7583333333333333]
[0.8425, 0.868421052631579, 0.7125]
[1.25875, 0.868421052631579, 0.7458333333333333]
[1.675, 0.8421052631578947, 0.6979166666666667]
[2.0912499999999996, 0.8421052631578947, 0.6979166666666667]
[2.5075, 0.8421052631578947, 0.6458333333333334]
[2.92375, 0.8421052631578947, 0.6458333333333334]
[3.34, 0.8421052631578947, 0.6895833333333334]
[3.7562499999999996, 0.8421052631578947, 0.6895833333333334]
[4.172499999999999, 0.8421052631578947, 0.6895833333333334]
[4.58875, 0.8421052631578947, 0.6895833333333334]
[5.005, 0.8421052631578947, 0.6895833333333334]
[5.42125, 0.8421052631578947, 0.6895833333333334]
[5.8375, 0.8421052631578947, 0.5833333333333334]
[6.25375, 0.8421052631578947, 0.6666666666666666]
[6.67, 0.8421052631578947, 0.6666666666666666]
[7.08625, 0.8421052631578947, 0.6666666666666666]
[7.5024999999999995, 0.8421052631578947, 0.6666666666666666]
[7.91875, 0.8421052631578947, 0.59166

In [29]:
# Display the highest AUC recorded by the penalty sweep in the previous cell.
best_auc

0.8208333333333333

In [30]:
# Display the highest accuracy recorded by the same penalty sweep.
best_acc

0.868421052631579

# SAFE with decision trees

In [31]:
# Baseline: a single decision tree fitted directly on the untransformed features.
tree_model = DecisionTreeClassifier(random_state=123).fit(X_train, y_train)

# Accuracy of the hard-label predictions on the held-out set.
standard_predictions = tree_model.predict(X_test)
print(accuracy_score(y_test, standard_predictions))

# ROC AUC computed from column 1 of predict_proba, with label 1 as positive.
pred = tree_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, pred, pos_label=1)
metrics.auc(fpr, tpr)

0.8421052631578947


0.7625

In [32]:
# Reference score for the gradient-boosting surrogate, configured here the
# same way as inside the penalty sweep below (only random_state is set).
surrogate_model = GradientBoostingClassifier(random_state=123).fit(X_train, y_train)

# Accuracy of the hard-label predictions on the held-out set.
surrogate_model_predictions = surrogate_model.predict(X_test)
print(accuracy_score(y_test, surrogate_model_predictions))

# ROC AUC computed from column 1 of predict_proba, with label 1 as positive.
pred = surrogate_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, pred, pos_label=1)
metrics.auc(fpr, tpr)

0.8421052631578947


0.8166666666666667

In [33]:
# Sweep the SAFE penalty. For every value a fresh pipeline is fitted
# (SafeTransformer driven by a new gradient-boosting surrogate, followed by
# a decision tree) and scored on the held-out set; the best accuracy and
# the best AUC are tracked separately, together with copies of the steps.
#
# NOTE(review): the winning penalty is picked on X_test / y_test, the same
# data the scores are reported on, so best_acc / best_auc are optimistic.
# A validation split or cross-validation would give an unbiased estimate.
pens = np.linspace(0.01, 10, 25)
best_acc = best_auc = float('-Inf')
best_acc_pen = best_auc_pen = 0
accs = []
aucs = []

for pen in pens:
    surrogate_model = GradientBoostingClassifier(random_state=123)
    tree_model_simple = DecisionTreeClassifier(random_state=123)
    safe_transformer = SafeTransformer(surrogate_model, penalty=pen)
    pipe = Pipeline(steps=[('safe', safe_transformer), ('tree', tree_model_simple)])
    pipe = pipe.fit(X_train, y_train)

    # Accuracy of the hard-label predictions.
    predictions = pipe.predict(X_test)
    acc = accuracy_score(y_test, predictions)
    accs.append(acc)

    # ROC AUC from column 1 of predict_proba. The result is stored as
    # `auc_score` so that the `auc` function imported from sklearn.metrics
    # is not rebound to a float.
    pred = pipe.predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = metrics.roc_curve(y_test, pred, pos_label=1)
    auc_score = metrics.auc(fpr, tpr)
    aucs.append(auc_score)

    print([pen, acc, auc_score])

    # Strict '>' means ties keep the earlier (smaller) penalty.
    # deepcopy snapshots the step objects as they are after pipe.fit.
    if acc > best_acc:
        best_acc_transformer = deepcopy(safe_transformer)
        best_acc_model = deepcopy(tree_model_simple)
        best_acc = acc
        best_acc_pen = pen

    if auc_score > best_auc:
        best_auc_transformer = deepcopy(safe_transformer)
        best_auc_model = deepcopy(tree_model_simple)
        best_auc = auc_score
        best_auc_pen = pen

[0.01, 0.8947368421052632, 0.8416666666666667]
[0.42625, 0.868421052631579, 0.8374999999999999]
[0.8425, 0.868421052631579, 0.8395833333333333]
[1.25875, 0.8947368421052632, 0.8083333333333333]
[1.675, 0.8947368421052632, 0.825]
[2.0912499999999996, 0.8947368421052632, 0.825]
[2.5075, 0.868421052631579, 0.7979166666666666]
[2.92375, 0.868421052631579, 0.7979166666666666]
[3.34, 0.868421052631579, 0.7958333333333334]
[3.7562499999999996, 0.868421052631579, 0.7958333333333334]
[4.172499999999999, 0.868421052631579, 0.7958333333333334]
[4.58875, 0.868421052631579, 0.7958333333333334]
[5.005, 0.868421052631579, 0.7958333333333334]
[5.42125, 0.868421052631579, 0.7958333333333334]
[5.8375, 0.8421052631578947, 0.6145833333333334]
[6.25375, 0.8421052631578947, 0.6875]
[6.67, 0.8421052631578947, 0.6875]
[7.08625, 0.8421052631578947, 0.6875]
[7.5024999999999995, 0.8421052631578947, 0.6875]
[7.91875, 0.7105263157894737, 0.6125]
[8.334999999999999, 0.7105263157894737, 0.6125]
[8.75125, 0.710526315

In [34]:
# Display the best accuracy and best AUC recorded by the decision-tree penalty sweep above.
[best_acc, best_auc]

[0.8947368421052632, 0.8416666666666667]