In [219]:
from SafeTransformer import SafeTransformer
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
import random
import matplotlib.pyplot as plt
from copy import deepcopy
from math import floor, fmod

In [220]:
def y_function(value1, value2):
    if value1 < -3:
        if value2  < 3:
            return 0
        else:
            return 1
    if value1 > 3:
        if value2 < 3:
            return 1
        else:
            return 0
    if value2 < 3:
        return 0
    else:
        return 1   

In [221]:
n = 500

In [222]:
X2_orig = np.random.uniform(low=0, high=40, size=n)

X2_factors_40 = []
for i in range(n):
    X2_factors_40.append(floor(X2_orig[i]))

    X2_factors_5 = []
for i in range(n):
    X2_factors_5.append(fmod(X2_factors_40[i], 5))

In [223]:
np.random.seed(123)
X_Artif = pd.DataFrame(data={
    'X1': np.linspace(-5, 5, n),
    'X2': X2_factors_40})
y = pd.Series(map(lambda value: y_function(value[0], value[1]), zip(X_Artif['X1'], X2_factors_5
                                                                   )))


In [224]:
X_Artif['X1'] = X_Artif['X1'] + np.random.normal(size=n, scale=0.2)
X_Artif['X2'] = 'A' + X_Artif['X2'].astype(str)
X_Artif['X2'] = X_Artif['X2'].astype('category')
X_Artif.head()

Unnamed: 0,X1,X2
0,-5.217126,A20
1,-4.780491,A23
2,-4.903324,A0
3,-5.241139,A20
4,-5.03556,A9


In [225]:
X_Artif.dtypes

X1     float64
X2    category
dtype: object

In [226]:
X_train, X_test, X_lin_train, X_lin_test, y_train, y_test = train_test_split(X_Artif, X, y)

In [227]:
logistic_model = LogisticRegression(solver = 'lbfgs')
logistic_model = logistic_model.fit(X_lin_train, y_train)
standard_predictions = logistic_model.predict(X_lin_test)
print(accuracy_score(y_test, standard_predictions))

0.776


In [228]:
surrogate_model = XGBClassifier(random_state = 123)
surrogate_model = surrogate_model.fit(X_lin_train, y_train)
surrogate_model_predictions = surrogate_model.predict(X_lin_test)
print(accuracy_score(y_test, surrogate_model_predictions))

0.728


In [229]:
pens = np.linspace(0.01, 10, 25)
best_score = float('-Inf')
best_pen = 0
aucs = []

for pen in pens:
    surrogate_model = XGBClassifier(random_state = 123)
    logistic_model_simple = LogisticRegression(solver = 'lbfgs')
    safe_transformer = SafeTransformer(surrogate_model, penalty = pen)
    pipe = Pipeline(steps=[('safe', safe_transformer), ('linear', logistic_model_simple)])
    pipe = pipe.fit(X_train, y_train)
    predictions = pipe.predict(X_test)
    auc = accuracy_score(y_test, predictions)
    aucs.append(auc)
    print([pen,auc])
    if auc > best_score:
        best_transformer = deepcopy(safe_transformer)
        best_model = deepcopy(logistic_model_simple)
        best_score = auc
        best_pen = pen

[0.01, 0.776]
[0.42625, 0.856]
[0.8425, 0.856]
[1.25875, 0.856]
[1.675, 0.856]
[2.0912499999999996, 0.856]
[2.5075, 0.856]
[2.92375, 0.856]
[3.34, 0.856]
[3.7562499999999996, 0.856]
[4.172499999999999, 0.856]
[4.58875, 0.856]
[5.005, 0.856]
[5.42125, 0.856]
[5.8375, 0.856]
[6.25375, 0.856]
[6.67, 0.856]
[7.08625, 0.856]
[7.5024999999999995, 0.856]
[7.91875, 0.856]
[8.334999999999999, 0.856]
[8.75125, 0.856]
[9.1675, 0.856]
[9.58375, 0.856]
[10.0, 0.856]


In [230]:
best_transformer.summary()

Numerical Variable X1
Selected intervals:
	[-Inf, -1.81)
	[-1.81, 0.73)
	[0.73, 2.64)
	[2.64, 3.70)
	[3.70, Inf)
Categorical Variable X2
Created variable levels:
	A0, A1, A10, A12, A15, A16, A17, A20, A21, A22, A24, A25, A26, A27, A30, A31, A32, A35, A36, A37, A5, A6, A7 -> A0_A1_A10_A12_A15_A16_A17_A20_A21_A22_A24_A25_A26_A27_A30_A31_A32_A35_A36_A37_A5_A6_A7
	A11, A2 -> A11_A2
	A13, A18, A29, A3, A8 -> A13_A18_A29_A3_A8
	A14, A23, A28, A33, A34, A38 -> A14_A23_A28_A33_A34_A38
	A19, A39, A4, A9 -> A19_A39_A4_A9

