In [1]:
import numpy as np
import pandas as pd
from copy import copy, deepcopy

from DecisionTree import RegressionTree
from Metrics import PerformanceMetrics as pm

# Widen NumPy's printed line width so long arrays wrap less often in cell output.
np.set_printoptions(linewidth = 200)

# Path of the CSV file loaded by the data-loading cell.
input_file = "WEC.csv"
# Seed handed to every shuffling / model call so reruns reproduce the same numbers.
RANDOM_STATE = 42
# Fraction used for the train/test split; presumably the held-out test share —
# NOTE(review): confirm how the data_split call consumes it.
TEST_FRACTION = 0.3

In [2]:
def data_split(input_data, output_data, train_fraction = 0.8, shuffle = True, random_state = None):
    """Split paired input/output arrays into a leading train part and a trailing test part.

    Parameters
    ----------
    input_data, output_data
        Arrays with the same number of rows; row i of one belongs to row i of the other.
    train_fraction
        Share of the rows placed in the train part; the cut index is
        ``int(len(input_data) * train_fraction)``.
    shuffle
        When true, both arrays are reordered with one shared random permutation
        before the cut.
    random_state
        Seed passed to ``numpy.random.RandomState``.

    Returns
    -------
    tuple
        ``(train_in, test_in, train_out, test_out)``.

    Raises
    ------
    AssertionError
        If the two arrays differ in length.
    """
    assert len(input_data) == len(output_data), "input and output arrays should have the same number of rows"

    n_rows = len(input_data)
    # The generator is built even when shuffle is off; it is simply left unused then.
    rng = np.random.RandomState(random_state)

    if shuffle:
        # A single permutation keeps inputs and outputs aligned row by row.
        order = rng.permutation(n_rows)
        inputs, outputs = input_data[order], output_data[order]
    else:
        inputs, outputs = input_data, output_data

    cut = int(n_rows * train_fraction)
    train_in, test_in = np.split(inputs, [cut])
    train_out, test_out = np.split(outputs, [cut])

    return train_in, test_in, train_out, test_out

In [3]:
def concat_except(arr, exclude):
    """Concatenate every array in ``arr`` except the one at position ``exclude``.

    Parameters
    ----------
    arr
        Sequence of arrays (list, tuple, ...). It is not modified.
    exclude
        Index of the element to leave out; negative indices count from the end.

    Returns
    -------
    numpy.ndarray
        The remaining arrays joined along axis 0.

    Raises
    ------
    IndexError
        If ``exclude`` is out of range for ``arr``.
    """
    # list(...) gives a fresh shallow list for any sequence type, so tuples work
    # too and the caller's container is never mutated by the pop below.
    remaining = list(arr)
    remaining.pop(exclude)
    return np.concatenate(remaining, axis = 0)

def k_fold_cross_validation(model, input_data, output_data, scoring_funcs, folds = 10, shuffle = True, random_state = None):
    """Evaluate ``model`` with k-fold cross-validation.

    The rows are (optionally) shuffled, cut into ``folds`` nearly equal parts with
    ``np.array_split``, and each part is used once as the test set while a fresh
    deep copy of ``model`` is fitted on the remaining parts.

    Parameters
    ----------
    model
        Object exposing ``fit(inputs, outputs)`` and ``predict(inputs)``. It is
        deep-copied for every fold, so the passed instance is never fitted.
    input_data, output_data
        Arrays with the same number of rows.
    scoring_funcs
        Sequence of callables invoked as ``func(true_outputs, prediction)``.
    folds
        Number of folds; must be at least 2 and at most the number of rows.
    shuffle
        When true, rows are permuted (inputs and outputs together) before folding.
    random_state
        Seed passed to ``numpy.random.RandomState``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(scoring_funcs), folds)``; entry ``[i, j]`` is the
        value of ``scoring_funcs[i]`` on fold ``j``.

    Raises
    ------
    AssertionError
        If the two arrays differ in length.
    ValueError
        If ``folds`` is smaller than 2 or larger than the number of rows.
    """
    from numpy.random import RandomState

    assert len(input_data) == len(output_data), "input and output arrays should have the same number of rows"

    n_rows = len(input_data)
    # Fewer than 2 folds leaves nothing to train on; more folds than rows would
    # produce empty test folds and meaningless (NaN) scores.
    if not 2 <= folds <= n_rows:
        raise ValueError(f"folds must be between 2 and the number of rows ({n_rows}), got {folds}")

    scores = np.zeros((len(scoring_funcs), folds))

    rs = RandomState(random_state)
    if shuffle:
        # One shared permutation keeps inputs and outputs aligned.
        permutation = rs.permutation(n_rows)
        inputs = input_data[permutation]
        outputs = output_data[permutation]
    else:
        inputs = input_data
        outputs = output_data

    input_folds = np.array_split(inputs, folds)
    output_folds = np.array_split(outputs, folds)

    for fold_index in range(folds):
        test_input_fold = input_folds[fold_index]
        test_output_fold = output_folds[fold_index]

        # Training data is every fold except the current one.
        train_input_fold = np.concatenate(input_folds[:fold_index] + input_folds[fold_index + 1:], axis = 0)
        train_output_fold = np.concatenate(output_folds[:fold_index] + output_folds[fold_index + 1:], axis = 0)

        # Fit a pristine copy so state never leaks from one fold into the next.
        fold_model = deepcopy(model)
        fold_model.fit(train_input_fold, train_output_fold)
        prediction = fold_model.predict(test_input_fold)

        for score_func_index, score_func in enumerate(scoring_funcs):
            scores[score_func_index, fold_index] = score_func(test_output_fold, prediction)

    return scores
        


In [4]:
def print_k_fold_cross_validation_scores(scores: np.ndarray, scores_names):
    """Print a small summary of cross-validation scores, one section per metric.

    For every metric the name is printed, followed by three lines: the mean of
    its per-fold scores, their standard deviation (``.std()`` of the row), and
    1.96 times that standard deviation, labelled ``95%``.

    Parameters
    ----------
    scores
        One row of per-fold scores for each metric.
    scores_names
        Display name for each row of ``scores``, in the same order.

    Raises
    ------
    AssertionError
        If ``scores`` and ``scores_names`` differ in length.
    """
    assert len(scores) == len(scores_names), "scores and scores_names should have the same number of elements"

    for row_index, metric_name in enumerate(scores_names):
        print(metric_name)
        fold_scores = scores[row_index]
        center = fold_scores.mean()
        spread = fold_scores.std()
        # The labels are padded to equal width so the numbers line up in a column.
        summary = (("mean", center), ("std ", spread), ("95% ", 1.96 * spread))
        for label, value in summary:
            print(f"{label}: {value:15.3f}")

In [5]:
# Load the dataset; the first CSV row holds the column names.
df = pd.read_csv(input_file, header = 0)
original_headers = list(df.columns.values)

# The first 98 columns are the model inputs and the last column is the target.
# Guard against a narrower file, where the input slice would swallow the target.
assert len(original_headers) > 98, "expected at least 99 columns (98 inputs plus the target)"
input_headers = original_headers[:98]
output_header = original_headers[-1]

input_data = df[input_headers].to_numpy()
output_data = df[output_header].to_numpy()

# data_split takes the TRAIN share, so the configured test share is converted;
# passing TEST_FRACTION directly would train on 30% and test on 70% of the rows.
train_input, test_input, train_output, test_output = data_split(
    input_data,
    output_data,
    train_fraction = 1 - TEST_FRACTION,
    random_state = RANDOM_STATE,
)

In [6]:
# 10-fold cross-validation of the project's RegressionTree with post-pruning enabled.
rt = RegressionTree(
    loss_func = pm.MSE,
    max_depth = 20,
    min_samples_split = 30,
    min_samples_leaf = 10,
    postpruning = True,
)

# Each metric function is looked up on `pm` by its display name, so the two
# tuples always stay in the same order.
scoring_funcs_names = ('MAE', 'MSE', 'RMSE', 'MAPE', 'R2', 'NRMSE')
scoring_funcs = tuple(getattr(pm, metric_name) for metric_name in scoring_funcs_names)

cv_score = k_fold_cross_validation(
    rt,
    input_data,
    output_data,
    scoring_funcs = scoring_funcs,
    folds = 10,
    shuffle = True,
    random_state = RANDOM_STATE,
)
print_k_fold_cross_validation_scores(cv_score, scoring_funcs_names)

MAE
mean:       17935.435
std :         695.077
95% :        1362.352
MSE
mean:  1502821619.039
std :   104683168.791
95% :   205179010.831
RMSE
mean:       38743.082
std :        1339.857
95% :        2626.120
MAPE
mean:           0.005
std :           0.000
95% :           0.000
R2
mean:           0.900
std :           0.007
95% :           0.013
NRMSE
mean:           0.010
std :           0.000
95% :           0.001


In [7]:
# 10-fold cross-validation of the project's RegressionTree with post-pruning disabled.
rt = RegressionTree(
    loss_func = pm.MSE,
    max_depth = 20,
    min_samples_split = 30,
    min_samples_leaf = 10,
    postpruning = False,
)

# Each metric function is looked up on `pm` by its display name, so the two
# tuples always stay in the same order.
scoring_funcs_names = ('MAE', 'MSE', 'RMSE', 'MAPE', 'R2', 'NRMSE')
scoring_funcs = tuple(getattr(pm, metric_name) for metric_name in scoring_funcs_names)

cv_score = k_fold_cross_validation(
    rt,
    input_data,
    output_data,
    scoring_funcs = scoring_funcs,
    folds = 10,
    shuffle = True,
    random_state = RANDOM_STATE,
)
print_k_fold_cross_validation_scores(cv_score, scoring_funcs_names)

MAE
mean:       16441.438
std :         493.372
95% :         967.010
MSE
mean:  1386964329.468
std :    74112357.573
95% :   145260220.844
RMSE
mean:       37228.775
std :         991.286
95% :        1942.920
MAPE
mean:           0.004
std :           0.000
95% :           0.000
R2
mean:           0.908
std :           0.004
95% :           0.008
NRMSE
mean:           0.009
std :           0.000
95% :           0.000


In [8]:
from sklearn.tree import DecisionTreeRegressor

# 10-fold cross-validation of scikit-learn's DecisionTreeRegressor, built with
# max_depth = 20, min_samples_split = 30 and min_samples_leaf = 10.
dt = DecisionTreeRegressor(
    max_depth = 20,
    min_samples_split = 30,
    min_samples_leaf = 10,
    random_state = RANDOM_STATE,
)

# Each metric function is looked up on `pm` by its display name, so the two
# tuples always stay in the same order.
scoring_funcs_names = ('MAE', 'MSE', 'RMSE', 'MAPE', 'R2', 'NRMSE')
scoring_funcs = tuple(getattr(pm, metric_name) for metric_name in scoring_funcs_names)

cv_score = k_fold_cross_validation(
    dt,
    input_data,
    output_data,
    scoring_funcs = scoring_funcs,
    folds = 10,
    shuffle = True,
    random_state = RANDOM_STATE,
)
print_k_fold_cross_validation_scores(cv_score, scoring_funcs_names)

MAE
mean:       16404.377
std :         469.985
95% :         921.170
MSE
mean:  1391058504.364
std :    69951464.539
95% :   137104870.496
RMSE
mean:       37285.191
std :         934.372
95% :        1831.368
MAPE
mean:           0.004
std :           0.000
95% :           0.000
R2
mean:           0.907
std :           0.004
95% :           0.009
NRMSE
mean:           0.009
std :           0.000
95% :           0.000
