In [1]:
from SafeTransformer import SafeTransformer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
import sklearn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
import random
import matplotlib.pyplot as plt
from sklearn.ensemble.partial_dependence import plot_partial_dependence

In [5]:
# Read real_estate.csv into a DataFrame and preview its first five rows.
real_estate = pd.read_csv(filepath_or_buffer='real_estate.csv')
real_estate.head(n=5)

Unnamed: 0,No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
0,1,2012.917,32.0,84.87882,10,24.98298,121.54024,37.9
1,2,2012.917,19.5,306.5947,9,24.98034,121.53951,42.2
2,3,2013.583,13.3,561.9845,5,24.98746,121.54391,47.3
3,4,2013.5,13.3,561.9845,5,24.98746,121.54391,54.8
4,5,2012.833,5.0,390.5684,5,24.97937,121.54245,43.1


In [20]:
# Target: the unit-area house price column.
y = real_estate.loc[:, 'Y house price of unit area']
# Predictors: every remaining column except 'No' and the transaction date.
X_ap = real_estate.drop(
    ['No', 'X1 transaction date', 'Y house price of unit area'],
    axis=1,
)

In [21]:
# Build X: a copy of X_ap in which every 'category'/'object' column is replaced,
# at its original position, by its dummy (one-hot) columns. Columns of any other
# dtype are passed through unchanged.
X = X_ap.copy()
colnames = list(X)
for name in colnames:
    if str(X.loc[:, name].dtype) not in ('category', 'object'):
        continue
    # drop_first=True keeps k-1 dummies for k levels.
    dummies = pd.get_dummies(X.loc[:, name], prefix=name, drop_first=True)
    # Look the position up on every pass: an earlier replacement may have
    # shifted the columns that follow it.
    dummy_index = X.columns.get_loc(name)
    X = pd.concat(
        [X.iloc[:, :dummy_index], dummies, X.iloc[:, dummy_index + 1:]],
        axis=1,
    )

In [22]:
# Split the raw frame (X_ap), the encoded frame (X) and the target in a single
# call so all three share the same train/test rows; the seed is fixed at 123.
(
    X_train, X_test,
    X_lin_train, X_lin_test,
    y_train, y_test,
) = train_test_split(X_ap, X, y, random_state=123)

In [23]:
# Baseline: plain linear regression on the encoded features, scored by the
# mean squared error on the held-out test rows.
linear_model = LinearRegression().fit(X_lin_train, y_train)

standard_predictions = linear_model.predict(X_lin_test)
standard_predictions_error = mean_squared_error(
    y_true=y_test, y_pred=standard_predictions
)
standard_predictions_error

56.40621125283712

In [24]:
# Gradient boosting fitted on the same encoded features as the linear baseline,
# scored by test-set mean squared error for comparison.
# NOTE(review): the accepted spelling of criterion="mse" depends on the installed
# scikit-learn version — confirm against the pinned environment before upgrading.
surrogate_model = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    subsample=0.6,
    criterion="mse",
    random_state=123,
).fit(X_lin_train, y_train)

surrogate_model_predictions = surrogate_model.predict(X_lin_test)
surrogate_model_predictions_error = mean_squared_error(
    y_true=y_test, y_pred=surrogate_model_predictions
)
surrogate_model_predictions_error

55.09369621880406

In [25]:
# Sweep 25 evenly spaced SafeTransformer penalties in [0.01, 10]. For each one,
# fit a SafeTransformer -> LinearRegression pipeline on the raw training frame
# and record its mean squared error on the test rows.
#
# NOTE(review): best_pen / best_score are selected using the same test split
# that the error is reported on, so best_score is an optimistic estimate;
# consider choosing the penalty on a separate validation split.
# Note also that `surrogate_model` is rebound inside the loop, replacing the
# model assigned to that name before this cell.
pens = np.linspace(0.01, 10, 25)
errors = []
best_pen = 0
best_score = float('Inf')

for pen in pens:
    # Fresh, unfitted estimators for every penalty value.
    surrogate_model = GradientBoostingRegressor(
        loss='huber',
        learning_rate=0.1,
        max_depth=4,
        n_estimators=100,
        random_state=123,
    )
    linear_model_simple = LinearRegression()
    safe_transformer = SafeTransformer(surrogate_model, penalty=pen)
    pipe = Pipeline(steps=[
        ('safe', safe_transformer),
        ('linear', linear_model_simple),
    ])
    pipe = pipe.fit(X_train, y_train)

    predictions = pipe.predict(X_test)
    error = mean_squared_error(y_test, predictions)
    errors.append(error)
    print([pen, error])

    # Strict comparison: on ties the earliest penalty is kept.
    if best_score > error:
        best_pen, best_score = pen, error

[0.01, 1.5258812658482551e+25]
[0.42625, 3.874386731351161e+23]
[0.8425, 1.3047063534736687e+23]
[1.25875, 3.4909697909333007e+21]
[1.675, 9.526533881595106e+22]
[2.0912499999999996, 5.29259363613251e+20]
[2.5075, 3.313053771650209e+23]
[2.92375, 5.454774788739338e+23]
[3.34, 2.1572146495979217e+24]
[3.7562499999999996, 5.1381739322423304e+23]
[4.172499999999999, 2.40370934658499e+21]
[4.58875, 45.85352879304152]
[5.005, 45.89408146784855]
[5.42125, 45.35096441415639]
[5.8375, 45.35096441415639]
[6.25375, 41.67649864783653]
[6.67, 41.5238539475661]
[7.08625, 41.82410373170411]
[7.5024999999999995, 41.82410373170411]
[7.91875, 41.82410373170411]
[8.334999999999999, 40.901257153526906]
[8.75125, 40.901257153526906]
[9.1675, 41.24007599176454]
[9.58375, 41.24007599176454]
[10.0, 41.24007599176454]


In [26]:
# Lowest test-set mean squared error reached across the penalty sweep.
best_score

40.901257153526906

In [27]:
# Linear baseline MSE divided by the gradient-boosting MSE on the test rows;
# a value above 1 means the gradient-boosting model has the lower error.
standard_predictions_error/ surrogate_model_predictions_error

1.0238233250646391

In [28]:
# Linear baseline MSE divided by the best SafeTransformer-pipeline MSE from the
# penalty sweep; a value above 1 means the pipeline has the lower error.
standard_predictions_error / best_score

1.3790825803009144