## Regression

In this script, we perform regression on the cooling load and the result is evaluated statistically. We compare a baseline, a linear regression model and an ANN model.

In [4]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [5]:
import warnings
warnings.filterwarnings('ignore')

#### Read/prepare data

In [6]:
datapath = "data/"
filename = 'ENB2012_data.csv'
df = pd.read_csv(datapath+filename)
col_names = ['relative_compactness', 'surface_area', 'wall_area', 'roof_area', 'overall_height', 'orientation', 'glazing_area', 'glazing_area_distribution', 'heating_load', 'cooling_load']
df.columns = col_names
display(df)
display(df.describe())

Unnamed: 0,relative_compactness,surface_area,wall_area,roof_area,overall_height,orientation,glazing_area,glazing_area_distribution,heating_load,cooling_load
0,0.98,514.5,294.0,110.25,7.0,2,0.0,0,15.55,21.33
1,0.98,514.5,294.0,110.25,7.0,3,0.0,0,15.55,21.33
2,0.98,514.5,294.0,110.25,7.0,4,0.0,0,15.55,21.33
3,0.98,514.5,294.0,110.25,7.0,5,0.0,0,15.55,21.33
4,0.90,563.5,318.5,122.50,7.0,2,0.0,0,20.84,28.28
...,...,...,...,...,...,...,...,...,...,...
763,0.64,784.0,343.0,220.50,3.5,5,0.4,5,17.88,21.40
764,0.62,808.5,367.5,220.50,3.5,2,0.4,5,16.54,16.88
765,0.62,808.5,367.5,220.50,3.5,3,0.4,5,16.44,17.11
766,0.62,808.5,367.5,220.50,3.5,4,0.4,5,16.48,16.61


Unnamed: 0,relative_compactness,surface_area,wall_area,roof_area,overall_height,orientation,glazing_area,glazing_area_distribution,heating_load,cooling_load
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,0.764167,671.708333,318.5,176.604167,5.25,3.5,0.234375,2.8125,22.307201,24.58776
std,0.105777,88.086116,43.626481,45.16595,1.75114,1.118763,0.133221,1.55096,10.090196,9.513306
min,0.62,514.5,245.0,110.25,3.5,2.0,0.0,0.0,6.01,10.9
25%,0.6825,606.375,294.0,140.875,3.5,2.75,0.1,1.75,12.9925,15.62
50%,0.75,673.75,318.5,183.75,5.25,3.5,0.25,3.0,18.95,22.08
75%,0.83,741.125,343.0,220.5,7.0,4.25,0.4,4.0,31.6675,33.1325
max,0.98,808.5,416.5,220.5,7.0,5.0,0.4,5.0,43.1,48.03


In [7]:
# Set target and data
y = df['cooling_load']

X = df.iloc[: , :8]
X = pd.DataFrame(StandardScaler().fit_transform(X))
X.columns = col_names[:8]

### Compare models: Two-level (nested) cross-validation

For baseline: Compute the largest class on the training data, and predict everything in the test data as belonging to that class. 
<br/>-> corresponding to logistic regression with bias term and no features.

For logistic regression: Inner fold is estimating lambda, the complexity controlling parameter (called C in sklearn)

For KNN: Inner fold is estimating K, the number of neighbours in the algorithm


In [8]:
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error
from sklearn import model_selection
import torch
from toolbox_02450 import train_neural_net
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV

In [12]:
# Number of folds in inner and outer cross validation
K1, K2 = 2, 2

# Parameter grids. I.e. what vals to check for optimality in respective model (for inner folds)
p_grid_reg = np.power(10., range(-7, 1))
p_grid_ann = [32, 64, 128, 256, 512, 1024, 2048]

In [23]:
res = {"outer_fold": [], "reg_lambda_i": [], "reg_test_error_i": [], "ANN_nb_i": [],
       "ANN_test_error_i": [], "baseline_test_error_i": []}
res["outer_fold"] = range(K1)

# Define variables for the ANN model
loss_fn = torch.nn.MSELoss()
max_iter = 10000
N, M = X.shape

# K-fold CrossValidation with two layers
CV = model_selection.KFold(K1, shuffle=True)
for k, (train_index, test_index) in enumerate(CV.split(X, y)):
    X_train = X.iloc[train_index, :]
    y_train = y.iloc[train_index]
    X_test = X.iloc[test_index, :]
    y_test = y.iloc[test_index]

    ### Baseline estimation error ###
    dummy = DummyRegressor(strategy='mean')
    dummy.fit(X_train, y_train)
    res['baseline_test_error_i'].append(
        mean_squared_error(dummy.predict(X_test), y_test))

    cv_inner = model_selection.KFold(K2, shuffle=True)

    ### Ridge regression ###
    ridge_model = RidgeCV(alphas=p_grid_reg, cv=cv_inner,
                          scoring='neg_mean_absolute_error').fit(X_train, y_train)
    best_param_reg = ridge_model.alpha_
    y_pred_reg = ridge_model.predict(X_test)
    err_reg = mean_squared_error(y_test, y_pred_reg)
    res["reg_lambda_i"].append(best_param_reg)
    res["reg_test_error_i"].append(err_reg)

    ### ANN inner cross validation ###
    # Initializing test error matrix
    S = len(p_grid_ann)
    Error_test = np.zeros((S, K2))
    # K-fold cross-validation for model selection
    for k, (train_index_inner, test_index_inner) in enumerate(cv_inner.split(X_train, y_train)):
        print('\nCrossvalidation fold: {0}/{1}'.format(k+1, K2))

        # Extract training and test set for current CV fold,
        # and convert them to PyTorch tensors
        X_train_inner = torch.Tensor(X_train.iloc[train_index_inner, :].values)
        y_train_inner = torch.Tensor(y_train.iloc[train_index_inner].values)
        X_test_inner = torch.Tensor(X_train.iloc[test_index_inner, :].values)
        y_test_inner = torch.Tensor(y_train.iloc[test_index_inner].values)

        # Compute the error for each number of hidden unit
        for i, n_hidden_units in enumerate(p_grid_ann):

            def model(): return torch.nn.Sequential(
                # M features to H hiden units
                torch.nn.Linear(M, n_hidden_units),
                # 1st transfer function, either Tanh or ReLU:
                torch.nn.Tanh(),  # torch.nn.ReLU(),
                # H hidden units to 1 output neuron
                torch.nn.Linear(n_hidden_units, 1)
            )

            net, final_loss, learning_curve = train_neural_net(model,
                                                               loss_fn,
                                                               X=X_train_inner,
                                                               y=y_train_inner,
                                                               n_replicates=1,
                                                               max_iter=max_iter)

            y_test_pred_ann = net(torch.Tensor(X_test_inner))
            err_ann_inner = loss_fn(y_test_inner, y_test_pred_ann)
            Error_test[i, k] = err_ann_inner

    # Find the best number of hidden unit
    generalization_error = Error_test.mean(1)
    best_n_hidden_units = p_grid_ann[np.argmin(generalization_error)]

    print(
        f'\n\tBest loss error: {err_ann_inner} for {best_n_hidden_units} number of hidden units\n')

    # Compute the final error
    def model(): return torch.nn.Sequential(
        # M features to H hiden units
        torch.nn.Linear(M, best_n_hidden_units),
        # 1st transfer function, either Tanh or ReLU:
        torch.nn.Tanh(),  # torch.nn.ReLU(),
        # H hidden units to 1 output neuron
        torch.nn.Linear(best_n_hidden_units, 1)
    )
    net, final_loss, learning_curve = train_neural_net(model,
                                                       loss_fn,
                                                       X=torch.Tensor(
                                                           X_train.values),
                                                       y=torch.Tensor(
                                                           y_train.values),
                                                       n_replicates=1,
                                                       max_iter=max_iter)
    y_pred_ann = net(torch.Tensor(X_test.values))
    err_ann = loss_fn(torch.Tensor(y_test.values), y_test_pred_ann)
    print('\n\tBest loss final_loss: {}\n'.format(err_ann))

    res["ANN_nb_i"].append(best_n_hidden_units)
    res["ANN_test_error_i"].append(float(err_ann))



Crossvalidation fold: 1/2

	Replicate: 1/1
		Iter	Loss			Rel. loss
		1000	88.59754	0.000390886
		Final loss:
		1562	85.44173	9.82228e-07

	Replicate: 1/1
		Iter	Loss			Rel. loss
		1000	85.46414	3.3029837e-06
		Final loss:
		1149	85.43806	9.822702e-07

	Replicate: 1/1
		Iter	Loss			Rel. loss
		Final loss:
		792	85.4321	9.823387e-07

	Replicate: 1/1
		Iter	Loss			Rel. loss
		Final loss:
		553	85.42983	9.823648e-07

	Replicate: 1/1
		Iter	Loss			Rel. loss
		Final loss:
		413	85.42896	9.823748e-07

	Replicate: 1/1
		Iter	Loss			Rel. loss
		Final loss:
		289	85.42954	9.823681e-07

	Replicate: 1/1
		Iter	Loss			Rel. loss
		Final loss:
		127	85.4346	8.930091e-07

Crossvalidation fold: 2/2

	Replicate: 1/1
		Iter	Loss			Rel. loss
		1000	96.403175	0.0002740675
		Final loss:
		1463	93.955215	9.744285e-07

	Replicate: 1/1
		Iter	Loss			Rel. loss
		Final loss:
		952	93.94876	9.744955e-07

	Replicate: 1/1
		Iter	Loss			Rel. loss
		Final loss:
		676	93.94851	9.744981e-07

	Replicate: 1/1
		Iter	Los

In [24]:
display(pd.DataFrame.from_dict(data=res))

Unnamed: 0,outer_fold,reg_lambda_i,reg_test_error_i,ANN_nb_i,ANN_test_error_i,baseline_test_error_i
0,0,0.1,9.753393,1024,91.079758,91.224118
1,1,0.1,10.79524,1024,90.310883,90.035563
