In [1]:
from SafeTransformer import SafeTransformer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
import sklearn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
import random
import matplotlib.pyplot as plt
from sklearn.ensemble.partial_dependence import plot_partial_dependence

In [2]:
# Read the ENB2012 table from a CSV in the current working directory and
# preview its first five rows as the cell output.
ENB2012 = pd.read_csv('ENB2012.csv')
ENB2012.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,Y1,Y2
0,0.98,514.5,294.0,110.25,7.0,2,0.0,0,15.55,21.33
1,0.98,514.5,294.0,110.25,7.0,3,0.0,0,15.55,21.33
2,0.98,514.5,294.0,110.25,7.0,4,0.0,0,15.55,21.33
3,0.98,514.5,294.0,110.25,7.0,5,0.0,0,15.55,21.33
4,0.9,563.5,318.5,122.5,7.0,2,0.0,0,20.84,28.28


In [3]:
# Y1 is the regression target. Both response columns (Y1 and Y2) are removed
# from the feature table so that neither can be used as a predictor.
y = ENB2012['Y1']
X_ap = ENB2012.drop(columns=['Y1', 'Y2'])

In [4]:
# Build X, the design matrix for the plain linear model: a copy of X_ap in
# which every 'category'/'object' column is replaced, at its original
# position, by its dummy columns (drop_first=True). Columns of any other dtype
# are carried over unchanged.
X = X_ap.copy()
colnames = list(X)
for name in colnames:
    if str(X[name].dtype) not in ('category', 'object'):
        continue
    dummies = pd.get_dummies(X[name], prefix=name, drop_first=True)
    # Look the position up on every pass: an earlier expansion may have
    # shifted the remaining columns.
    dummy_index = X.columns.get_loc(name)
    X = pd.concat(
        [X.iloc[:, :dummy_index], dummies, X.iloc[:, dummy_index + 1:]],
        axis=1,
    )

In [5]:
# A single call splits the raw features (X_ap), the encoded features (X) and
# the target, so all three are partitioned in the same call with one seed.
#   X_train / X_test         -> raw features, fed to the SAFE pipeline
#   X_lin_train / X_lin_test -> encoded features, fed to the baseline models
(X_train, X_test,
 X_lin_train, X_lin_test,
 y_train, y_test) = train_test_split(X_ap, X, y, random_state=123)

# SAFE (Surrogate Assisted Feature Extraction)

In [6]:
# Baseline: linear regression fitted on the encoded training features and
# scored by mean squared error on the held-out test rows.
linear_model = LinearRegression().fit(X_lin_train, y_train)
standard_predictions = linear_model.predict(X_lin_test)
standard_predictions_error = mean_squared_error(y_test, standard_predictions)
standard_predictions_error

8.103275722221106

In [7]:
# Reference model: gradient boosting fitted on the same encoded features as
# the linear baseline and scored by test-set mean squared error.
surrogate_model = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    subsample=0.6,
    criterion="mse",
    random_state=123,
).fit(X_lin_train, y_train)
surrogate_model_predictions = surrogate_model.predict(X_lin_test)
surrogate_model_predictions_error = mean_squared_error(y_test, surrogate_model_predictions)
surrogate_model_predictions_error

0.18804792454081945

In [8]:
# Grid search over the SAFE penalty. For each of 25 evenly spaced candidates a
# SafeTransformer -> LinearRegression pipeline is fitted on the raw training
# features, and the penalty with the lowest mean squared error is kept.
#
# NOTE(review): the winning penalty is selected on X_test / y_test, the same
# rows used to report best_score, so best_score is an optimistic estimate.
# Consider selecting the penalty on a separate validation split.
# NOTE(review): this rebinds surrogate_model with hyperparameters that differ
# from the boosting model scored earlier (max_depth=4, loss='huber', no
# subsample), so surrogate_model_predictions_error does not describe the
# surrogate used here.
pens = np.linspace(0.01, 10, 25)
errors = []
best_pen = 0
best_score = float('Inf')

for pen in pens:
    # Estimators are re-created on every pass so each penalty starts from
    # unfitted models.
    surrogate_model = GradientBoostingRegressor(
        n_estimators=100,
        max_depth=4,
        learning_rate=0.1,
        loss='huber',
        random_state=123,
    )
    linear_model_simple = LinearRegression()
    safe_transformer = SafeTransformer(surrogate_model, penalty=pen)
    pipe = Pipeline(
        steps=[('safe', safe_transformer), ('linear', linear_model_simple)]
    ).fit(X_train, y_train)

    predictions = pipe.predict(X_test)
    error = mean_squared_error(y_test, predictions)
    errors.append(error)
    print([pen, error])

    # Strict '<' keeps the first (smallest) penalty when several tie.
    if error < best_score:
        best_score, best_pen = error, pen

[0.01, 0.9651762797037762]
[0.42625, 0.9120850524902345]
[0.8425, 0.9469428405761718]
[1.25875, 0.9466636555989584]
[1.675, 0.9408212051391603]
[2.0912499999999996, 0.9257517112890881]
[2.5075, 0.9382854868570965]
[2.92375, 0.9382854868570965]
[3.34, 0.9382854868570965]
[3.7562499999999996, 0.9251500981648763]
[4.172499999999999, 0.9251500981648763]
[4.58875, 0.9437309155223582]
[5.005, 0.9438034323950609]
[5.42125, 0.9520630289713542]
[5.8375, 0.9435905061264833]
[6.25375, 0.9435905061264833]
[6.67, 0.9622119841078917]
[7.08625, 0.9622119841078917]
[7.5024999999999995, 0.9622119841078917]
[7.91875, 0.9622119841078917]
[8.334999999999999, 0.9622119841078917]
[8.75125, 0.9622119841078917]
[9.1675, 0.9622119841078917]
[9.58375, 0.9622119841078917]
[10.0, 0.9622119841078917]


In [9]:
# Lowest test-set MSE reached by the SAFE pipeline across the penalty grid.
best_score

0.9120850524902345

In [10]:
# How many times larger the linear baseline's test MSE is than that of the
# gradient boosting model fitted on the encoded features.
standard_predictions_error/ surrogate_model_predictions_error

43.09154563661314

In [11]:
# How many times larger the linear baseline's test MSE is than the best
# SAFE pipeline MSE found in the penalty search.
standard_predictions_error / best_score

8.88434220043077