Merge pull request JuliaAI#68 from bensadeghi/testset
Testset
bensadeghi committed Jun 24, 2018
2 parents adf7457 + 92f9aef commit b5e6d66
Showing 18 changed files with 155 additions and 25 deletions.
45 changes: 41 additions & 4 deletions README.md
@@ -7,7 +7,7 @@
[![DecisionTree](http://pkg.julialang.org/badges/DecisionTree_0.6.svg)](http://pkg.julialang.org/?pkg=DecisionTree&ver=0.6)
[![DecisionTree](http://pkg.julialang.org/badges/DecisionTree_0.7.svg)](http://pkg.julialang.org/?pkg=DecisionTree&ver=0.7)

Julia implementation of Decision Trees & Random Forests
Julia implementation of Decision Tree and Random Forest algorithms

## Classification
* pre-pruning (max depth, min leaf size)
@@ -72,7 +72,7 @@ Also have a look at these [classification](https://github.com/cstjean/ScikitLear

## Native API
### Classification Example
Pruned Tree Classifier
Decision Tree Classifier
```julia
# train full-tree classifier
model = build_tree(labels, features)
@@ -87,11 +87,21 @@ apply_tree_proba(model, [5.9,3.0,5.1,1.9], ["setosa", "versicolor", "virginica"]
# run n-fold cross validation for pruned tree,
# using 90% purity threshold pruning, and 3 CV folds
accuracy = nfoldCV_tree(labels, features, 0.9, 3)

# set of classification build_tree() parameters and respective default values
# max_depth: maximum depth of the decision tree (default: -1, no maximum)
# min_samples_leaf: the minimum number of samples each leaf needs to have (default: 1)
# min_samples_split: the minimum number of samples needed for a split (default: 2)
# min_purity_increase: minimum purity needed for a split (default: 0.0)
# nsubfeatures: number of features to select at random (default: 0, keep all)
nsubfeatures=0; maxdepth=-1; min_samples_leaf=1; min_samples_split=2; min_purity_increase=0.0;
model = build_tree(labels, features, nsubfeatures, maxdepth, min_samples_leaf, min_samples_split, min_purity_increase)

```
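The pruning step referenced above can also be applied directly and sanity-checked on the training data. A minimal sketch, assuming the same labels and features and the prune_tree and confusion_matrix helpers used elsewhere in this README and in the tests below:
```julia
# merge leaves whose combined purity is >= 90%, then check training accuracy
pruned = prune_tree(model, 0.9)
preds  = apply_tree(pruned, features)
cm     = confusion_matrix(labels, preds)
cm.accuracy
```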
Random Forest Classifier
```julia
# train random forest classifier
# using 2 random features, 10 trees, 0.5 portion of samples per tree (optional), and a maximum tree depth of 6 (optional)
# using 2 random features, 10 trees, 0.5 portion of samples per tree, and a maximum tree depth of 6
model = build_forest(labels, features, 2, 10, 0.5, 6)
# apply learned model
apply_forest(model, [5.9,3.0,5.1,1.9])
@@ -100,6 +110,14 @@ apply_forest_proba(model, [5.9,3.0,5.1,1.9], ["setosa", "versicolor", "virginica
# run n-fold cross validation for forests
# using 2 random features, 10 trees, 3 folds, and 0.5 portion of samples per tree (optional)
accuracy = nfoldCV_forest(labels, features, 2, 10, 3, 0.5)

# set of classification build_forest() parameters and respective default values
# nsubfeatures: number of features to consider at random per split (default: 0, keep all)
# ntrees: number of trees to train (default: 10)
# partialsampling: fraction of samples to train each tree on (default: 0.7)
# max_depth: maximum depth of the decision trees (default: no maximum)
nsubfeatures=0; ntrees=10; partialsampling=0.7; maxdepth=-1;
model = build_forest(labels, features, nsubfeatures, ntrees, partialsampling, maxdepth)
```
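build_forest also accepts an rng keyword (see the updated signature in src/classification/main.jl further down), so runs can be made reproducible. A sketch, assuming an integer seed is acceptable, as the ScikitLearn-API docstrings in this commit describe for rng:
```julia
# same call as above, but with a fixed seed so the random feature/sample
# selection is repeatable across runs
model = build_forest(labels, features, 2, 10, 0.5, 6; rng=42)
apply_forest(model, [5.9,3.0,5.1,1.9])
```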
Adaptive-Boosted Decision Stumps Classifier
```julia
@@ -129,11 +147,21 @@ apply_tree(model, [-0.9,3.0,5.1,1.9,0.0])
# run n-fold cross validation, using 3 folds and averaging of 5 samples per leaf (optional)
# returns array of coefficients of determination (R^2)
r2 = nfoldCV_tree(labels, features, 3, 5)

# set of regression build_tree() parameters and respective default values
# max_depth: maximum depth of the decision tree (default: -1, no maximum)
# min_samples_leaf: the minimum number of samples each leaf needs to have (default: 5)
# min_samples_split: the minimum number of samples needed for a split (default: 2)
# min_purity_increase: minimum purity needed for a split (default: 0.0)
# nsubfeatures: number of features to select at random (default: 0, keep all)
min_samples_leaf = 5; nsubfeatures = 0; max_depth = -1; min_samples_split = 2; min_purity_increase = 0.0;
model = build_tree(labels, features, min_samples_leaf, nsubfeatures, max_depth, min_samples_split, min_purity_increase)

```
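As the regression tests in this commit do, the fitted tree can also be applied to the full feature matrix and scored with the coefficient of determination. A sketch using the R2 helper that appears in test/regression/random.jl and test/miscellaneous/parallel.jl:
```julia
# batch prediction over all training rows, followed by an R^2 sanity check
preds = apply_tree(model, features)
r2    = R2(labels, preds)
```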
Regression Random Forest
```julia
# train regression forest, using 2 random features, 10 trees,
# averaging of 5 samples per leaf (optional), and 0.7 portion of samples per tree (optional)
# averaging of 5 samples per leaf, and 0.7 portion of samples per tree
model = build_forest(labels, features, 2, 10, 5, 0.7)
# apply learned model
apply_forest(model, [-0.9,3.0,5.1,1.9,0.0])
@@ -142,4 +170,13 @@ apply_forest(model, [-0.9,3.0,5.1,1.9,0.0])
# and 0.7 portion of samples per tree (optional)
# returns array of coefficients of determination (R^2)
r2 = nfoldCV_forest(labels, features, 2, 10, 3, 5, 0.7)

# set of regression build_forest() parameters and respective default values
# nsubfeatures: number of features to consider at random per split (default: 0, keep all)
# ntrees: number of trees to train (default: 10)
# partialsampling: fraction of samples to train each tree on (default: 0.7)
# max_depth: maximum depth of the decision trees (default: no maximum)
# min_samples_leaf: the minimum number of samples each leaf needs to have (default: 5)
nsubfeatures=0; ntrees=10; min_samples_leaf=5; partialsampling=0.7; max_depth=-1;
model = build_forest(labels, features, nsubfeatures, ntrees, min_samples_leaf, partialsampling, max_depth)
```
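Given the defaults introduced in this commit (see src/regression/main.jl below), a regression forest can also be built with no arguments beyond the data, which is exactly what test/regression/random.jl now exercises. A sketch:
```julia
# all defaults: nsubfeatures=0 (keep all), ntrees=10, min_samples_leaf=5,
# partialsampling=0.7, max_depth=-1 (no maximum)
model = build_forest(labels, features)
preds = apply_forest(model, features)
R2(labels, preds)
```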
5 changes: 4 additions & 1 deletion src/classification/main.jl
@@ -54,6 +54,9 @@ end

function build_stump(labels::Vector, features::Matrix, weights=[0];
rng=Base.GLOBAL_RNG)
if weights == [0]
return build_tree(labels, features, 0, 1)
end
S = _split_neg_z1_loss(labels, features, weights)
if S == NO_BEST
return Leaf(majority_vote(labels), labels)
@@ -188,7 +191,7 @@ end
apply_tree_proba(tree::LeafOrNode, features::Matrix, labels) =
stack_function_results(row->apply_tree_proba(tree, row, labels), features)

function build_forest(labels::Vector, features::Matrix, nsubfeatures::Integer, ntrees::Integer, partialsampling=0.7, maxdepth=-1; rng=Base.GLOBAL_RNG)
function build_forest(labels::Vector, features::Matrix, nsubfeatures=0, ntrees=10, partialsampling=0.7, maxdepth=-1; rng=Base.GLOBAL_RNG)
rng = mk_rng(rng)::AbstractRNG
partialsampling = partialsampling > 1.0 ? 1.0 : partialsampling
Nlabels = length(labels)
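The build_stump guard added above means that, without sample weights, a stump is simply a depth-1 tree built via build_tree(labels, features, 0, 1); the iris and random classification tests below rely on this. A sketch:
```julia
# unweighted call falls back to build_tree(labels, features, 0, 1)
stump = build_stump(labels, features)
depth(stump) == 1            # a stump makes a single split
apply_tree(stump, features)  # and is applied like any other tree
```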
2 changes: 1 addition & 1 deletion src/regression/main.jl
@@ -42,7 +42,7 @@ function build_tree{T<:Float64}(
return _convert(t)
end

function build_forest{T<:Float64}(labels::Vector{T}, features::Matrix, nsubfeatures::Integer, ntrees::Integer, min_samples_leaf=5, partialsampling=0.7, max_depth=-1; rng=Base.GLOBAL_RNG)
function build_forest{T<:Float64}(labels::Vector{T}, features::Matrix, nsubfeatures=0, ntrees=10, min_samples_leaf=5, partialsampling=0.7, max_depth=-1; rng=Base.GLOBAL_RNG)
rng = mk_rng(rng)::AbstractRNG
partialsampling = partialsampling > 1.0 ? 1.0 : partialsampling
Nlabels = length(labels)
13 changes: 6 additions & 7 deletions src/scikitlearnAPI.jl
@@ -157,9 +157,9 @@ Random forest classification. See [DecisionTree.jl's documentation](https://gith
Hyperparameters:
- `nsubfeatures`: number of features to select in each tree at random (default: keep all)
- `ntrees`: number of trees to train
- `partialsampling`: fraction of samples to train each tree on
- `nsubfeatures`: number of features to consider at random per split (default: keep all)
- `ntrees`: number of trees to train (default: 10)
- `partialsampling`: fraction of samples to train each tree on (default: 0.7)
- `max_depth`: maximum depth of the decision trees (default: no maximum)
- `rng`: the random number generator to use. Can be an `Int`, which will be used
to seed and create a new random number generator.
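These hyperparameters are passed as keyword arguments to the constructor, and the fitted model is used through the ScikitLearn verbs. A minimal sketch mirroring test/classification/scikitlearn.jl in this commit; only the rng keyword appears verbatim in the tests, so any other keyword names would be assumed to match the docstring above:
```julia
using DecisionTree
# construct, fit, then predict class labels and probabilities, as the tests do
clf   = RandomForestClassifier(; rng=10)  # e.g. ntrees/partialsampling would be passed the same way (assumed names)
probs = predict_proba(fit!(clf, features, labels), features)
preds = predict(clf, features)
```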
@@ -220,10 +220,9 @@ Random forest regression. See [DecisionTree.jl's documentation](https://github.c
Hyperparameters:
- `nsubfeatures`: number of features to select in each tree at random (default:
keep all)
- `ntrees`: number of trees to train
- `partialsampling`: fraction of samples to train each tree on
- `nsubfeatures`: number of features to consider at random per split (default: keep all)
- `ntrees`: number of trees to train (default: 10)
- `partialsampling`: fraction of samples to train each tree on (default: 0.7)
- `max_depth`: maximum depth of the decision trees (default: no maximum)
- `min_samples_leaf`: the minimum number of samples each leaf needs to have (default: 5)
- `rng`: the random number generator to use. Can be an `Int`, which will be used
Expand Down
4 changes: 4 additions & 0 deletions test/classification/adult.jl
@@ -1,6 +1,8 @@
# Classification Test - Adult Data Set
# https://archive.ics.uci.edu/ml/datasets/adult

@testset "adult.jl" begin

download("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", "adult.csv");
adult = readcsv("adult.csv");

@@ -19,3 +21,5 @@ accuracy = nfoldCV_tree(labels, features, 0.9, 3);
println("\n##### 3 foldCV Classification Forest #####")
accuracy = nfoldCV_forest(labels, features, 2, 10, 3, 0.5);
@test mean(accuracy) > 0.8

end # @testset
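Each test file is now wrapped in a named @testset, so failures are grouped and counted per file by the test runner. The runtests.jl driver itself is not part of this excerpt; the following is a purely hypothetical sketch of how such files are typically aggregated on Julia 0.6 (file names illustrative):
```julia
# hypothetical runtests.jl-style aggregation; not taken from this commit
using Base.Test        # Julia 0.6-era test framework providing @testset
using DecisionTree

@testset "DecisionTree.jl" begin
    include("classification/adult.jl")   # each include runs its own named @testset
    include("regression/random.jl")
end
```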
4 changes: 4 additions & 0 deletions test/classification/digits.jl
@@ -1,3 +1,5 @@
@testset "digits.jl" begin

function loaddata()
f = open("data/digits.csv")
data = readlines(f)[2:end]
@@ -29,3 +31,5 @@ t = DecisionTree.build_tree(Y, X, 0, 6, 3, 5)

t = DecisionTree.build_tree(Y, X, 0, 6, 3, 5, 0.05)
@test num_leaves(t) == 54

end # @testset
10 changes: 7 additions & 3 deletions test/classification/heterogeneous.jl
@@ -1,5 +1,7 @@
### Classification - Heterogeneously typed features (ints, floats, bools, strings)

@testset "heterogeneous.jl" begin

m, n = 10^2, 5

tf = [trues(Int(m/2)) falses(Int(m/2))]
@@ -15,14 +17,16 @@ features[:,4] = tf[inds]
model = build_tree(labels, features)
preds = apply_tree(model, features)
cm = confusion_matrix(labels, preds)
@test cm.accuracy > 0.99
@test cm.accuracy > 0.95

model = build_forest(labels, features, 2, 3)
preds = apply_forest(model, features)
cm = confusion_matrix(labels, preds)
@test cm.accuracy > 0.99
@test cm.accuracy > 0.95

model, coeffs = build_adaboost_stumps(labels, features, 7)
preds = apply_adaboost_stumps(model, coeffs, features)
cm = confusion_matrix(labels, preds)
@test cm.accuracy > 0.99
@test cm.accuracy > 0.95

end # @testset
11 changes: 11 additions & 0 deletions test/classification/iris.jl
@@ -1,12 +1,21 @@
# Classification Test - Iris Data Set
# https://archive.ics.uci.edu/ml/datasets/iris

@testset "iris.jl" begin

download("https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data", "iris.csv")
iris = readcsv("iris.csv");

features = iris[:, 1:4];
labels = iris[:, 5];

# train a decision stump (depth=1)
model = build_stump(labels, features)
preds = apply_tree(model, features);
cm = confusion_matrix(labels, preds);
@test cm.accuracy > 0.6
@test depth(model) == 1

# train full-tree classifier (over-fit)
model = build_tree(labels, features);
preds = apply_tree(model, features);
@@ -44,3 +53,5 @@ preds = apply_adaboost_stumps(model, coeffs, features);
println("\n##### nfoldCV Classification Adaboosted Stumps #####")
accuracy = nfoldCV_stumps(labels, features, 7, 3);
@test mean(accuracy) > 0.7

end # @testset
17 changes: 17 additions & 0 deletions test/classification/random.jl
@@ -1,15 +1,30 @@
@testset "random.jl" begin

srand(16)

n,m = 10^3, 5;
features = rand(n,m);
weights = rand(-1:1,m);
labels = _int(features * weights);

model = build_stump(labels, features)
@test depth(model) == 1

maxdepth = 3
model = build_tree(labels, features, 0, maxdepth)
@test depth(model) == maxdepth
print_tree(model, 3)

model = build_tree(labels, features)
preds = apply_tree(model, features)
cm = confusion_matrix(labels, preds)
@test cm.accuracy > 0.95

model = build_forest(labels, features)
preds = apply_forest(model, features)
cm = confusion_matrix(labels, preds)
@test cm.accuracy > 0.95

println("\n##### nfoldCV Classification Tree #####")
accuracy = nfoldCV_tree(labels, features, 0.9, 3)
@test mean(accuracy) > 0.7
@@ -21,3 +36,5 @@ accuracy = nfoldCV_forest(labels, features, 2, 10, 3)
println("\n##### nfoldCV Adaboosted Stumps #####")
accuracy = nfoldCV_stumps(labels, features, 7, 3)
@test mean(accuracy) > 0.5

end # @testset
4 changes: 4 additions & 0 deletions test/classification/scikitlearn.jl
@@ -1,3 +1,5 @@
@testset "scikitlearn.jl" begin

srand(2)
n,m = 10^3, 5 ;
features = rand(n,m);
@@ -35,3 +37,5 @@ y = rand(Bool, 100);
predict_proba(fit!(RandomForestClassifier(; rng=10), X, y), X)
@test predict_proba(fit!(RandomForestClassifier(; rng=10), X, y), X) !=
predict_proba(fit!(RandomForestClassifier(; rng=12), X, y), X)

end # @testset
8 changes: 6 additions & 2 deletions test/miscellaneous/parallel.jl
@@ -1,5 +1,7 @@
# Test parallelization of random forests

@testset "parallel.jl" begin

addprocs(1)
@test nprocs() > 1

@@ -16,7 +18,7 @@ labels = _int(features * weights);
model = build_forest(labels, features, 2, 10);
preds = apply_forest(model, features);
cm = confusion_matrix(labels, preds);
@test cm.accuracy > 0.9
@test cm.accuracy > 0.8


# Regression
@@ -27,4 +29,6 @@ labels = features * weights;

model = build_forest(labels, features, 2, 10);
preds = apply_forest(model, features);
@test R2(labels, preds) > 0.9
@test R2(labels, preds) > 0.8

end # @testset
10 changes: 8 additions & 2 deletions test/miscellaneous/promote.jl
@@ -1,8 +1,14 @@
### Promote Leaf to Node

@testset "promote.jl" begin

leaf = Leaf(0, [0])
node = Node(1, 1, leaf, leaf)

[leaf, node]
[node, leaf]
ln = [leaf, node]
@test length(ln) == 2

nl = [node, leaf]
@test length(nl) == 2

end # @testset
4 changes: 4 additions & 0 deletions test/regression/digits.jl
@@ -1,3 +1,5 @@
@testset "digits.jl" begin

function loaddata()
f = open("data/digits.csv")
data = readlines(f)[2:end]
@@ -30,3 +32,5 @@ t = DecisionTree.build_tree(Y, X, 1, 0, -1, 20)

t = DecisionTree.build_tree(Y, X, 1, 0, -1, 2, 0.25)
@test length(t) == 103

end # @testset
6 changes: 5 additions & 1 deletion test/regression/energy.jl
@@ -1,6 +1,8 @@
# Regression Test - Appliances Energy Prediction Data Set
# https://archive.ics.uci.edu/ml/datasets/Appliances+energy+prediction

@testset "energy.jl" begin

download("https://archive.ics.uci.edu/ml/machine-learning-databases/00374/energydata_complete.csv", "energy.csv");
energy = readcsv("energy.csv");

@@ -19,4 +21,6 @@ r2 = nfoldCV_tree(labels, features, 3);

println("\n##### nfoldCV Regression Forest #####")
r2 = nfoldCV_forest(labels, features, 2, 10, 3);
@test mean(r2) > 0.4
@test mean(r2) > 0.35

end # @testset
11 changes: 11 additions & 0 deletions test/regression/random.jl
@@ -1,3 +1,5 @@
@testset "random.jl" begin

srand(5)

n, m = 10^3, 5 ;
@@ -7,6 +9,9 @@ features[:,1] = round.(Integer, features[:,1]); # convert a column of integers
weights = rand(-2:2,m);
labels = float.(features * weights); # cast to Array{Float64,1}

model = build_stump(labels, features)
@test depth(model) == 1

# over-fitting
min_samples_leaf = 1
model = build_tree(labels, features, min_samples_leaf)
@@ -34,10 +39,16 @@ model = build_tree(labels, features, min_samples_leaf, nsubfeatures, max_depth,
preds = apply_tree(model, features);
@test R2(labels, preds) < 0.95

model = build_forest(labels, features)
preds = apply_forest(model, features)
@test R2(labels, preds) > 0.9

println("\n##### nfoldCV Regression Tree #####")
r2 = nfoldCV_tree(labels, features, 3)
@test mean(r2) > 0.6

println("\n##### nfoldCV Regression Forest #####")
r2 = nfoldCV_forest(labels, features, 2, 10, 3)
@test mean(r2) > 0.8

end # @testset
2 changes: 2 additions & 0 deletions test/regression/scikitlearn.jl
@@ -1,3 +1,4 @@
@testset "scikitlearn.jl" begin

srand(2)
n,m = 10^3, 5 ;
@@ -32,3 +33,4 @@ y = randn(100)
@test fit_predict!(RandomForestRegressor(; rng=10), X, y) !=
fit_predict!(RandomForestRegressor(; rng=22), X, y)

end # @testset
