## Classification

In this script, the heating load is classified and the result is evaluated statistically.
We compare a baseline, logistic regression model and an ANN. The following solves a multiclass classification problem where a discretized version of the heating load is our target variable.

In [69]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#### Read/prepare data

In [70]:
datapath = "data/"
filename = 'ENB2012_data.csv'
df = pd.read_csv(datapath+filename)
df.columns = ['relative_compactness', 'surface_area', 'wall_area', 'roof_area', 'overall_height', 'orientation', 'glazing_area', 'glazing_area_distribution', 'heating_load', 'cooling_load']
display(df)
display(df.describe())

Unnamed: 0,relative_compactness,surface_area,wall_area,roof_area,overall_height,orientation,glazing_area,glazing_area_distribution,heating_load,cooling_load
0,0.98,514.5,294.0,110.25,7.0,2,0.0,0,15.55,21.33
1,0.98,514.5,294.0,110.25,7.0,3,0.0,0,15.55,21.33
2,0.98,514.5,294.0,110.25,7.0,4,0.0,0,15.55,21.33
3,0.98,514.5,294.0,110.25,7.0,5,0.0,0,15.55,21.33
4,0.90,563.5,318.5,122.50,7.0,2,0.0,0,20.84,28.28
...,...,...,...,...,...,...,...,...,...,...
763,0.64,784.0,343.0,220.50,3.5,5,0.4,5,17.88,21.40
764,0.62,808.5,367.5,220.50,3.5,2,0.4,5,16.54,16.88
765,0.62,808.5,367.5,220.50,3.5,3,0.4,5,16.44,17.11
766,0.62,808.5,367.5,220.50,3.5,4,0.4,5,16.48,16.61


Unnamed: 0,relative_compactness,surface_area,wall_area,roof_area,overall_height,orientation,glazing_area,glazing_area_distribution,heating_load,cooling_load
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,0.764167,671.708333,318.5,176.604167,5.25,3.5,0.234375,2.8125,22.307201,24.58776
std,0.105777,88.086116,43.626481,45.16595,1.75114,1.118763,0.133221,1.55096,10.090196,9.513306
min,0.62,514.5,245.0,110.25,3.5,2.0,0.0,0.0,6.01,10.9
25%,0.6825,606.375,294.0,140.875,3.5,2.75,0.1,1.75,12.9925,15.62
50%,0.75,673.75,318.5,183.75,5.25,3.5,0.25,3.0,18.95,22.08
75%,0.83,741.125,343.0,220.5,7.0,4.25,0.4,4.0,31.6675,33.1325
max,0.98,808.5,416.5,220.5,7.0,5.0,0.4,5.0,43.1,48.03


In [71]:
# Discretize the Heating Load (HL)
class_names = range(4)
num_classes = 4
discrete_HL = []
for i, row in df.iterrows():
    hl = row['heating_load']
    if hl <= 10:
        discrete_HL.append(1)
    elif hl <= 20:
        discrete_HL.append(2)
    elif hl <= 30:
        discrete_HL.append(3)
    else:
        discrete_HL.append(4)
        
hl_df = df.iloc[: , :8]
hl_df['discrete_heating_load'] = discrete_HL

Unnamed: 0,relative_compactness,surface_area,wall_area,roof_area,overall_height,orientation,glazing_area,glazing_area_distribution,discrete_heating_load
0,0.98,514.5,294.0,110.25,7.0,2,0.0,0,2
1,0.98,514.5,294.0,110.25,7.0,3,0.0,0,2
2,0.98,514.5,294.0,110.25,7.0,4,0.0,0,2
3,0.98,514.5,294.0,110.25,7.0,5,0.0,0,2
4,0.90,563.5,318.5,122.50,7.0,2,0.0,0,3
...,...,...,...,...,...,...,...,...,...
763,0.64,784.0,343.0,220.50,3.5,5,0.4,5,2
764,0.62,808.5,367.5,220.50,3.5,2,0.4,5,2
765,0.62,808.5,367.5,220.50,3.5,3,0.4,5,2
766,0.62,808.5,367.5,220.50,3.5,4,0.4,5,2


In [72]:
# Set target and data
y = hl_df['discrete_heating_load']
X = hl_df.drop(["discrete_heating_load"], axis=1)

Unnamed: 0,relative_compactness,surface_area,wall_area,roof_area,overall_height,orientation,glazing_area,glazing_area_distribution
0,0.98,514.5,294.0,110.25,7.0,2,0.0,0
1,0.98,514.5,294.0,110.25,7.0,3,0.0,0
2,0.98,514.5,294.0,110.25,7.0,4,0.0,0
3,0.98,514.5,294.0,110.25,7.0,5,0.0,0
4,0.90,563.5,318.5,122.50,7.0,2,0.0,0
...,...,...,...,...,...,...,...,...
763,0.64,784.0,343.0,220.50,3.5,5,0.4,5
764,0.62,808.5,367.5,220.50,3.5,2,0.4,5
765,0.62,808.5,367.5,220.50,3.5,3,0.4,5
766,0.62,808.5,367.5,220.50,3.5,4,0.4,5


### Compare models: Two-level (nested) cross-validation

For baseline: Compute the largest class on the training data, and predict everything in the test data as belonging to that class. 
<br/>-> corresponding to logistic regression with bias term and no features.

For logistic regression: Inner fold is estimating lambda, the complexity controlling parameter (called C in sklearn)

For KNN: Inner fold is estimating K, the number of neighbours in the algorithm


In [73]:
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score

In [74]:
def two_level_cross_validation(X, y, k1, k2, p_grid_knn, p_grid_reg):
    # Define table for output
    res = {"outer_fold": [], "KNN_k_i": [], "KNN_test_error_i" : [], "reg_lambda_i" : [], "reg_test_error_i" : [], "baseline_test_error_i": []}
    res["outer_fold"] = range(k1)

    dummy_errors = []
    cv_outer = KFold(n_splits=k1) #outer cross validation obj
    for train_ix, test_ix in cv_outer.split(X):
        # Split data
        x_train, x_test = X.iloc[train_ix,:], X.iloc[test_ix,:]
        y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
        
        # Do baseline estimation error and append to table
        dummy = DummyClassifier(strategy='most_frequent')
        dummy.fit(x_train, y_train)
        dummy_errors.append(1 - dummy.score(x_test, y_test))

        
        cv_inner = KFold(n_splits=k2) #inner cross validation obj

        # Model objects
        knn = KNeighborsClassifier()
        reg = LogisticRegression()

        # Exhaustive search over specified parameter values (p_grid_...) (with the inner cross validation) for the estimators
        search_knn = GridSearchCV(knn, p_grid_knn, scoring='accuracy', cv=cv_inner, refit=True)
        search_reg = GridSearchCV(reg, p_grid_reg, scoring='accuracy', cv=cv_inner, refit=True)

        # Do actual search on the current training split
        res_knn = search_knn.fit(x_train, y_train)
        res_reg = search_reg.fit(x_train, y_train)

        # Get best performing model on training set from inner fold
        best_knn = res_knn.best_estimator_
        best_reg = res_reg.best_estimator_

        # Get the parameter for the best performing model within the fold
        best_param_knn = res_knn.best_params_['n_neighbors'] #i.e. K
        best_param_reg = res_reg.best_params_['C'] #i.e. lambda

        # Evaluate best scoring estimators on training set
        y_pred_knn = best_knn.predict(x_test)
        y_pred_reg = best_reg.predict(x_test)
        
        # Check error for best estimators on current test split
        err_knn = 1 - accuracy_score(y_test, y_pred_knn)
        err_reg = 1 - accuracy_score(y_test, y_pred_reg)
        
        # Append values to respective lists in outside dict
        res["KNN_k_i"].append(best_param_knn)
        res["KNN_test_error_i"].append(err_knn)
        res["reg_lambda_i"].append(best_param_reg)
        res["reg_test_error_i"].append(err_reg)
    
    # insert baseline errors for all i into dict
    res['baseline_test_error_i'] = dummy_errors

    # returned transformed dict
    return pd.DataFrame.from_dict(data=res)

In [75]:
p_grid_knn = {"n_neighbors": [3, 5, 8, 16, 32]}
p_grid_reg = {"C": [1, 5, 10, 50, 100]}
cross_val_table = two_level_cross_validation(X, y, 10, 10, p_grid_knn, p_grid_reg)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [76]:
display(cross_val_table)

Unnamed: 0,outer_fold,KNN_k_i,KNN_test_error_i,reg_lambda_i,reg_test_error_i,baseline_test_error_i
0,0,32,0.623377,100,0.636364,0.7143
1,1,32,0.103896,10,0.103896,0.4416
2,2,32,0.051948,10,0.077922,0.4935
3,3,32,0.064935,100,0.116883,0.5714
4,4,3,0.012987,5,0.025974,0.3766
5,5,32,0.0,1,0.090909,0.6104
6,6,3,0.077922,10,0.103896,0.4545
7,7,3,0.116883,5,0.116883,0.4805
8,8,3,0.210526,10,0.236842,0.5789
9,9,32,0.105263,100,0.157895,0.3684
