In [1]:
using DecisionTree, ScikitLearn, DataFrames, CSV, MLDataUtils



In [2]:
# define error functions

"""
    mean_abs_err(y_output, y_true)

Return the mean absolute error between the predictions `y_output` and the
targets `y_true`.

The first `size(y_output, 1)` elements of both arguments are compared pairwise
(linear indexing), so both must have the same length along dimension 1.

# Throws
- `DimensionMismatch`: if `y_output` and `y_true` differ in length along
  dimension 1.

Empty inputs yield `NaN` (0.0 / 0), since there is nothing to average.
"""
function mean_abs_err(y_output, y_true)
    n = size(y_output, 1)
    # Fail loudly instead of silently ignoring surplus targets.
    size(y_true, 1) == n || throw(DimensionMismatch(
        "y_output has $n observations but y_true has $(size(y_true, 1))"))
    # Float accumulator keeps the loop type-stable and avoids shadowing Base.sum.
    total = 0.0
    for i in 1:n
        total += abs(y_output[i] - y_true[i])
    end
    return total / n
end

"""
    mean_abs_percent(y_output, y_true)

Return the mean absolute percentage deviation (in percent) of the predictions
`y_output` from the targets `y_true`, i.e. `100 * mean(|(ŷ - y) / y|)`.

The first `size(y_output, 1)` elements of both arguments are compared pairwise
(linear indexing), so both must have the same length along dimension 1.

# Throws
- `DimensionMismatch`: if `y_output` and `y_true` differ in length along
  dimension 1.

A target equal to zero makes the corresponding term `Inf` or `NaN`, which
propagates to the result. Empty inputs yield `NaN` (0.0 / 0).
"""
function mean_abs_percent(y_output, y_true)
    n = size(y_output, 1)
    # Fail loudly instead of silently ignoring surplus targets.
    size(y_true, 1) == n || throw(DimensionMismatch(
        "y_output has $n observations but y_true has $(size(y_true, 1))"))
    # Float accumulator keeps the loop type-stable and avoids shadowing Base.sum.
    total = 0.0
    for i in 1:n
        total += abs((y_output[i] - y_true[i]) / y_true[i])
    end
    return 100 * total / n
end

mean_abs_percent (generic function with 1 method)

In [4]:
# load the 2008 TAP training data from a CSV file in the working directory
# NOTE(review): readtable is presumably DataFrames' reader, which later DataFrames
# releases dropped in favour of CSV (already imported above) — confirm package
# versions before porting this notebook
tap_train_2008 = readtable("TAP_train_2008.csv");
# the test-set load is disabled; none of the cells shown here read tap_test_2008
#tap_test_2008 = readtable("TAP_test_2008.csv");

In [6]:
# convert training data into array and shuffle observations by row
# (obsdim = 1 tells shuffleobs that each row is one observation)
# NOTE(review): no RNG seed is set in the visible cells, so the shuffle — and hence
# the split and the reported errors below — presumably differs between runs
TAP_2008 = convert(Array, tap_train_2008);
TAP_2008 = shuffleobs(TAP_2008, obsdim = 1);

In [7]:
# split into X and y: columns 2-42 (41 columns) are the features, column 43 is the target
# NOTE(review): column 1 is not used — presumably an ID/index column; confirm
X_data = TAP_2008[:,2:42];
y_data = TAP_2008[:,43];

In [8]:
# split the training data 75% / 25% into training and validation sets
# (the training file itself presumably holds 80% of the full data set — TODO confirm)
# both calls use the same `at` and `obsdim`, so X and y rows stay aligned
X_train_2008, X_val_2008 = splitobs(X_data, at=0.75, obsdim=1);
y_train_2008, y_val_2008 = splitobs(y_data, at=0.75, obsdim=1);

In [9]:
# convert arrays from real to Float64
# (presumably needed so the tree-building functions below receive Float64 arrays,
# and to materialise whatever splitobs returned into plain arrays — confirm)
X_train_2008 = convert(Array{Float64}, X_train_2008);
X_val_2008 = convert(Array{Float64}, X_val_2008);
y_train_2008 = convert(Array{Float64}, y_train_2008);
y_val_2008 = convert(Array{Float64}, y_val_2008);

In [17]:
# fit regr1: single regression tree using build_tree (labels first, then features)
# NOTE(review): the third argument (10) is presumably the number of samples averaged
# per leaf in the DecisionTree.jl version this notebook ran on; newer releases give
# this positional slot a different meaning — confirm against the installed version
regr1 = build_tree(y_train_2008, X_train_2008, 10);

# test fit of model on validation set
output_regr1 = apply_tree(regr1, X_val_2008);

# compute errors on validation set (mean absolute error and mean absolute percent deviation)
MAE_regr1 = mean_abs_err(output_regr1, y_val_2008);
MAPD_regr1 = mean_abs_percent(output_regr1, y_val_2008);

In [29]:
# report the validation-set errors of model 1
@printf "For model 1 on the validation set, the MAE is %f and the MAPD is %f \n" MAE_regr1 MAPD_regr1

For model 1 on the validation set, the MAE is 54.357946 and the MAPD is 5.988706 


In [21]:
# fit three more models using ScikitLearn package
# construct the three estimators first, then fit each on the same training split
regr_2 = DecisionTreeRegressor()
regr_3 = DecisionTreeRegressor(pruning_purity_threshold=0.05)
regr_4 = RandomForestRegressor(ntrees=30)
ScikitLearn.fit!(regr_2, X_train_2008, y_train_2008)
ScikitLearn.fit!(regr_3, X_train_2008, y_train_2008)
# no trailing semicolon on the last call, so the notebook displays its result
ScikitLearn.fit!(regr_4, X_train_2008, y_train_2008)

DecisionTree.RandomForestRegressor(0, 5, 30, 0.7, -1, MersenneTwister(UInt32[0xa901eadf, 0x32164e50, 0x65be517a, 0xe002e4cd], Base.dSFMT.DSFMT_state(Int32[1983848526, 1072904960, 1694229780, 1072856405, -1126120619, 1073075975, -154366880, 1073327574, -1898732663, 1073067629  …  -540267461, 1073181204, -801532078, 1072770697, -177991529, 639106035, -1058054999, 991139942, 382, 0]), [1.2019, 1.1556, 1.365, 1.60494, 1.35704, 1.79847, 1.09744, 1.46881, 1.13451, 1.79144  …  1.35891, 1.97986, 1.93977, 1.89717, 1.12271, 1.29836, 1.16754, 1.60741, 1.46535, 1.07386], 167), Ensemble of Decision Trees
Trees:      30
Avg Leaves: 1465.4333333333334
Avg Depth:  26.533333333333335)

In [22]:
# predict on the validation set with each fitted model, then compute errors
output_regr2 = ScikitLearn.predict(regr_2, X_val_2008);
output_regr3 = ScikitLearn.predict(regr_3, X_val_2008);
output_regr4 = ScikitLearn.predict(regr_4, X_val_2008);

# errors for regr2: DecisionTreeRegressor
MAE_regr2 = mean_abs_err(output_regr2, y_val_2008);
MAPD_regr2 = mean_abs_percent(output_regr2, y_val_2008);

# errors for regr3: DecisionTreeRegressor with pruning purity threshold of 0.05
MAE_regr3 = mean_abs_err(output_regr3, y_val_2008);
MAPD_regr3 = mean_abs_percent(output_regr3, y_val_2008);

# errors for regr4: Random Forest with n=30 trees
MAE_regr4 = mean_abs_err(output_regr4, y_val_2008);
MAPD_regr4 = mean_abs_percent(output_regr4, y_val_2008);

Overview of models for 2008 data:
* model 1: regression tree trained on the training set using the DecisionTree.jl package (`build_tree`), averaging over 10 samples per leaf
* model 2: full regression tree trained on the training set using DecisionTree.jl's `DecisionTreeRegressor` through the ScikitLearn.jl interface
* model 3: regression tree trained on the training set through the ScikitLearn.jl interface with a pruning purity threshold of 0.05
* model 4: random forest trained on the training set through the ScikitLearn.jl interface with n=30 trees

In [28]:
# report validation-set MAE and MAPD for all four models, one line per model
@printf "For model 1 on the validation set, the MAE is %f and the MAPD is %f \n" MAE_regr1 MAPD_regr1
@printf "For model 2 on the validation set, the MAE is %f and the MAPD is %f \n" MAE_regr2 MAPD_regr2
@printf "For model 3 on the validation set, the MAE is %f and the MAPD is %f \n" MAE_regr3 MAPD_regr3
@printf "For model 4 on the validation set, the MAE is %f and the MAPD is %f \n" MAE_regr4 MAPD_regr4

For model 2 on the validation set, the MAE is 57.615052 and the MAPD is 6.158969 
For model 3 on the validation set, the MAE is 89.264739 and the MAPD is 7.996839 
For model 4 on the validation set, the MAE is 49.743548 and the MAPD is 6.069324 
