In [1]:
from SafeTransformer import SafeTransformer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
import sklearn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
import random
import matplotlib.pyplot as plt
from sklearn.ensemble.partial_dependence import plot_partial_dependence

In [49]:
# Load the yacht hydrodynamics table. The file has no header row, so pandas
# assigns integer column labels.
yacht_raw = pd.read_csv('yacht_hydrodynamics.csv', header=None)

# Drop the column at position 7 and prefix the remaining labels with "X".
# NOTE(review): presumably position 7 is an empty trailing column - confirm
# against the file.
yacht = (
    yacht_raw
    .drop(columns=yacht_raw.columns[7])
    .rename(columns=lambda label: "X" + str(label))
)

# Replace missing values with the mean of their own column.
yacht = yacht.fillna(yacht.mean())

# Show a preview only; the full frame is still available as `yacht`.
yacht.head(10)

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6
0,-2.3,0.568,4.78,3.99,3.17,0.125,0.11
1,-2.3,0.568,4.78,3.99,3.17,0.150,0.27
2,-2.3,0.568,4.78,3.99,3.17,0.175,0.47
3,-2.3,0.568,4.78,3.99,3.17,0.200,0.78
4,-2.3,0.568,4.78,3.99,3.17,0.225,1.18
5,-2.3,0.568,4.78,3.99,3.17,0.250,1.82
6,-2.3,0.568,4.78,3.99,3.17,0.275,2.61
7,-2.3,0.568,4.78,3.99,3.17,0.300,3.76
8,-2.3,0.568,4.78,3.99,3.17,0.325,4.99
9,-2.3,0.568,4.78,3.99,3.17,0.350,7.16


In [50]:
# Target is column X6; the features are every remaining column of `yacht`.
y = yacht['X6']
X = yacht.drop(columns=y.name)

In [51]:
# No test_size is passed, so scikit-learn's default proportions apply;
# the fixed random_state makes the partition repeatable.
split = train_test_split(X, y, random_state=123)
X_train, X_test, y_train, y_test = split

# SAFE

In [52]:
# Baseline: a plain linear regression on the untransformed features,
# scored by mean squared error on the held-out split.
linear_model = LinearRegression().fit(X_train, y_train)
standard_predictions = linear_model.predict(X_test)
standard_predictions_error = mean_squared_error(
    y_true=y_test, y_pred=standard_predictions
)
standard_predictions_error

120.39236002566622

In [55]:
# Surrogate: gradient-boosted trees on the untransformed features
# (subsample=0.6, fixed seed), scored by mean squared error on the held-out split.
surrogate_model = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    subsample=0.6,
    criterion="mse",
    random_state=123,
).fit(X_train, y_train)
surrogate_model_predictions = surrogate_model.predict(X_test)
surrogate_model_predictions_error = mean_squared_error(
    y_true=y_test, y_pred=surrogate_model_predictions
)
surrogate_model_predictions_error

0.44962850607390303

In [56]:
# Sweep the SafeTransformer penalty over 25 evenly spaced values in [0.01, 10].
# For each value, fit a SAFE -> linear-regression pipeline on the training split
# and record its mean squared error on the test split.
# NOTE(review): the penalty is selected on X_test / y_test, the same data used
# to report best_score - consider a separate validation split.
pens = np.linspace(0.01, 10, 25)
errors = []
best_pen, best_score = 0, float('Inf')

for pen in pens:
    # A fresh surrogate and a fresh linear model are built for every penalty.
    surrogate_model = GradientBoostingRegressor(
        loss='huber',
        learning_rate=0.1,
        n_estimators=100,
        max_depth=4,
        random_state=123,
    )
    linear_model_simple = LinearRegression()
    safe_transformer = SafeTransformer(surrogate_model, penalty=pen)
    pipe = Pipeline(steps=[
        ('safe', safe_transformer),
        ('linear', linear_model_simple),
    ]).fit(X_train, y_train)

    predictions = pipe.predict(X_test)
    error = mean_squared_error(y_test, predictions)
    errors.append(error)
    print([pen, error])

    # Strict "<" keeps the earliest penalty when several reach the same error.
    if error < best_score:
        best_pen, best_score = pen, error

[0.01, 1.400996719894471]
[0.42625, 1.4025612748071743]
[0.8425, 1.3041171367694804]
[1.25875, 1.3413649425506173]
[1.675, 1.3192842112317493]
[2.0912499999999996, 1.3192842112317493]
[2.5075, 1.319284211231745]
[2.92375, 1.3077444936184195]
[3.34, 1.2613627329450523]
[3.7562499999999996, 1.1934202849784472]
[4.172499999999999, 1.14474023390482]
[4.58875, 1.14474023390482]
[5.005, 1.14474023390482]
[5.42125, 1.14474023390482]
[5.8375, 1.1447402339048198]
[6.25375, 1.1447402339048198]
[6.67, 1.1447402339048198]
[7.08625, 1.188371376365159]
[7.5024999999999995, 1.188371376365159]
[7.91875, 1.188371376365159]
[8.334999999999999, 1.188371376365159]
[8.75125, 1.1856694406287385]
[9.1675, 1.1856694406287385]
[9.58375, 1.1856694406287385]
[10.0, 1.1856694406287385]


In [57]:
# Lowest test-split MSE reached by the SAFE + linear pipeline across the penalty sweep.
best_score

1.1447402339048198

In [58]:
# Ratio of the plain linear model's test MSE to the gradient-boosting surrogate's test MSE.
standard_predictions_error/ surrogate_model_predictions_error

267.75962466640846

In [59]:
# Ratio of the plain linear model's test MSE to the best SAFE + linear pipeline's test MSE.
standard_predictions_error / best_score

105.17002588001665