In [31]:
from SafeTransformer import SafeTransformer
from sklearn.base import BaseEstimator, TransformerMixin

import numpy as np
import pandas as pd

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline, FeatureUnion, Pipeline
from sklearn.impute import SimpleImputer

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import metrics

import matplotlib.pyplot as plt

from copy import deepcopy

In [2]:
titanic = pd.read_csv("titanic_train.csv")
y = titanic["Survived"]
X = titanic.drop(["PassengerId", "Survived", "Name", "Ticket"], axis=1)

X["Embarked"] = X["Embarked"].astype('category')
X["Cabin"] = X["Cabin"].fillna("unknown")
X["Cabin"] = X["Cabin"].astype('category')
X["Sex"] = X["Sex"].astype('category')

# Filling NaNs

In [3]:
class TypeSelector(BaseEstimator, TransformerMixin):
    def __init__(self, dtype):
        self.dtype = dtype
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        assert isinstance(X, pd.DataFrame)
        return X.select_dtypes(include=[self.dtype])

In [4]:
fill_NaN_pipeline = make_pipeline(
    FeatureUnion(transformer_list=[
        ("numeric_features", make_pipeline(
            TypeSelector(np.number),
            SimpleImputer(strategy="median"),
        )),
        ("categorical_features", make_pipeline(
            TypeSelector("category"),
            SimpleImputer(strategy="most_frequent"),
        ))
    ])
)

In [5]:
X_filled = fill_NaN_pipeline.fit_transform(X)
ts_category = TypeSelector("category")
colnames_category = ts_category.fit_transform(X).columns
ts_numeric = TypeSelector(np.number)
colnames_numeric = ts_numeric.fit_transform(X).columns
colnames = colnames_numeric.append(colnames_category)
X_filled = pd.DataFrame(X_filled, columns=colnames)
X_filled.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex,Cabin,Embarked
0,3,22,1,0,7.25,male,unknown,S
1,1,38,1,0,71.2833,female,C85,C
2,3,26,0,0,7.925,female,unknown,S
3,1,35,1,0,53.1,female,C123,S
4,3,35,0,0,8.05,male,unknown,S


In [6]:
for col in X.columns:
    X_filled[col] = X_filled[col].astype(X[col].dtype)
X_filled.dtypes

Pclass         int64
Age          float64
SibSp          int64
Parch          int64
Fare         float64
Sex         category
Cabin       category
Embarked    category
dtype: object

# Data set with dummy variables and splits

In [7]:
X_dummy = X_filled.copy()
colnames = list(X_dummy)
for idx, name in enumerate(colnames):
    if str(X_dummy.loc[:, name].dtype) in ['category', 'object']:
        dummies = pd.get_dummies(X_dummy.loc[:, name], prefix=name, drop_first=True)
        dummy_index  = X_dummy.columns.get_loc(name)
        X_dummy = pd.concat([X_dummy.iloc[:,range(dummy_index)], dummies, X_dummy.iloc[:, range(dummy_index+1, len(X_dummy.columns))]], axis=1)

In [8]:
X_train, X_test, X_lin_train, X_lin_test, y_train, y_test = train_test_split(X_filled, X_dummy, y, random_state = 123)

# SAFE - logistic regression

logistic regression

In [14]:
logistic_model = LogisticRegression(solver='liblinear')
logistic_model = logistic_model.fit(X_lin_train, y_train)
standard_predictions = logistic_model.predict(X_lin_test)
print(accuracy_score(y_test, standard_predictions))

pred = logistic_model.predict_proba(X_lin_test)[:,1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, pred, pos_label=1)
metrics.auc(fpr, tpr)

0.7982062780269058


0.861125385405961

surrogate model

In [33]:
surrogate_model = GradientBoostingClassifier(random_state = 123)
surrogate_model = surrogate_model.fit(X_lin_train, y_train)
surrogate_model_predictions = surrogate_model.predict(X_lin_test)
print(accuracy_score(y_test, surrogate_model_predictions))

pred = surrogate_model.predict_proba(X_lin_test)[:,1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, pred, pos_label=1)
metrics.auc(fpr, tpr)

0.8340807174887892


0.896454265159301

white-box model (logistic regression)

In [34]:
pens = np.linspace(0.01, 10, 25)
best_auc = float('-Inf')
best_acc = float('-Inf')
best_auc_pen = 0
best_acc_pen = 0
aucs = []
accs = []

for pen in pens:
    surrogate_model =  GradientBoostingClassifier(random_state = 123)
    logistic_model_simple = LogisticRegression(solver = 'lbfgs')
    safe_transformer = SafeTransformer(surrogate_model, penalty = pen)
    pipe = Pipeline(steps=[('safe', safe_transformer), ('linear', logistic_model_simple)])
    pipe = pipe.fit(X_train, y_train)
    predictions = pipe.predict(X_test)
    acc = accuracy_score(y_test, predictions)
    accs.append(acc)
    
    pred = pipe.predict_proba(X_test)[:,1]
    fpr, tpr, thresholds = metrics.roc_curve(y_test, pred, pos_label=1)
    auc = metrics.auc(fpr, tpr)
    aucs.append(auc)
    
    print([pen, acc, auc])
    if acc > best_acc:
        best_acc_transformer = deepcopy(safe_transformer)
        best_acc_model = deepcopy(logistic_model_simple)
        best_acc = acc
        best_acc_pen = pen
        
    if auc > best_auc:
        best_auc_transformer = deepcopy(safe_transformer)
        best_auc_model = deepcopy(logistic_model_simple)
        best_auc = auc
        best_auc_pen = pen

[0.01, 0.820627802690583, 0.8775693730729701]
[0.42625, 0.8251121076233184, 0.8755566974991436]
[0.8425, 0.8295964125560538, 0.8738009592326138]
[1.25875, 0.8340807174887892, 0.8761990407673861]
[1.675, 0.8385650224215246, 0.8753425830763959]
[2.0912499999999996, 0.8385650224215246, 0.8753425830763959]
[2.5075, 0.8385650224215246, 0.8753425830763959]
[2.92375, 0.8251121076233184, 0.8734583761562179]
[3.34, 0.8251121076233184, 0.8734583761562179]
[3.7562499999999996, 0.8251121076233184, 0.8734583761562179]
[4.172499999999999, 0.8251121076233184, 0.8734583761562179]
[4.58875, 0.8116591928251121, 0.8696899623158616]
[5.005, 0.8116591928251121, 0.8696899623158616]
[5.42125, 0.8116591928251121, 0.8696899623158616]
[5.8375, 0.8116591928251121, 0.8696899623158616]
[6.25375, 0.8116591928251121, 0.8696899623158616]
[6.67, 0.8116591928251121, 0.8696899623158616]
[7.08625, 0.8116591928251121, 0.8696899623158616]
[7.5024999999999995, 0.8116591928251121, 0.8696899623158616]
[7.91875, 0.811659192825

In [21]:
[best_acc, best_auc]

[0.8340807174887892, 0.8705464200068516]

In [22]:
[best_acc_pen, best_auc_pen]

[1.25875, 1.25875]

## SAFE - tree

In [25]:
tree_model = DecisionTreeClassifier(random_state=123)
tree_model = tree_model.fit(X_lin_train, y_train)
standard_predictions = tree_model.predict(X_lin_test)
print(accuracy_score(y_test, standard_predictions))

pred = tree_model.predict_proba(X_lin_test)[:,1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, pred, pos_label=1)
metrics.auc(fpr, tpr)

0.7937219730941704


0.7767214799588901

In [35]:
surrogate_model = GradientBoostingClassifier(random_state = 123)
surrogate_model = surrogate_model.fit(X_lin_train, y_train)
surrogate_model_predictions = surrogate_model.predict(X_lin_test)
print(accuracy_score(y_test, surrogate_model_predictions))

pred = surrogate_model.predict_proba(X_lin_test)[:,1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, pred, pos_label=1)
metrics.auc(fpr, tpr)

0.8340807174887892


0.896454265159301

In [36]:
pens = np.linspace(0.01, 10, 25)
best_auc = float('-Inf')
best_acc = float('-Inf')
best_auc_pen = 0
best_acc_pen = 0
aucs = []
accs = []

for pen in pens:
    surrogate_model = GradientBoostingClassifier(random_state = 123)
    logistic_model_simple = LogisticRegression(solver = 'lbfgs')
    safe_transformer = SafeTransformer(surrogate_model, penalty = pen)
    pipe = Pipeline(steps=[('safe', safe_transformer), ('linear', logistic_model_simple)])
    pipe = pipe.fit(X_train, y_train)
    predictions = pipe.predict(X_test)
    acc = accuracy_score(y_test, predictions)
    accs.append(acc)
    
    pred = pipe.predict_proba(X_test)[:,1]
    fpr, tpr, thresholds = metrics.roc_curve(y_test, pred, pos_label=1)
    auc = metrics.auc(fpr, tpr)
    aucs.append(auc)
    
    print([pen, acc, auc])
    if acc > best_acc:
        best_acc_transformer = deepcopy(safe_transformer)
        best_acc_model = deepcopy(logistic_model_simple)
        best_acc = acc
        best_acc_pen = pen
        
    if auc > best_auc:
        best_auc_transformer = deepcopy(safe_transformer)
        best_auc_model = deepcopy(logistic_model_simple)
        best_auc = auc
        best_auc_pen = pen

[0.01, 0.820627802690583, 0.8775693730729701]
[0.42625, 0.8251121076233184, 0.8755566974991436]
[0.8425, 0.8295964125560538, 0.8738009592326138]
[1.25875, 0.8340807174887892, 0.8761990407673861]
[1.675, 0.8385650224215246, 0.8753425830763959]
[2.0912499999999996, 0.8385650224215246, 0.8753425830763959]
[2.5075, 0.8385650224215246, 0.8753425830763959]
[2.92375, 0.8251121076233184, 0.8734583761562179]
[3.34, 0.8251121076233184, 0.8734583761562179]
[3.7562499999999996, 0.8251121076233184, 0.8734583761562179]
[4.172499999999999, 0.8251121076233184, 0.8734583761562179]
[4.58875, 0.8116591928251121, 0.8696899623158616]
[5.005, 0.8116591928251121, 0.8696899623158616]
[5.42125, 0.8116591928251121, 0.8696899623158616]
[5.8375, 0.8116591928251121, 0.8696899623158616]
[6.25375, 0.8116591928251121, 0.8696899623158616]
[6.67, 0.8116591928251121, 0.8696899623158616]
[7.08625, 0.8116591928251121, 0.8696899623158616]
[7.5024999999999995, 0.8116591928251121, 0.8696899623158616]
[7.91875, 0.811659192825

In [37]:
[best_acc, best_auc]

[0.8385650224215246, 0.8775693730729701]