Merge pull request JuliaAI#68 from bensadeghi/testset
Testset
bensadeghi committed Jun 24, 2018
2 parents adf7457 + 92f9aef commit b5e6d66
Showing 18 changed files with 155 additions and 25 deletions.
45 changes: 41 additions & 4 deletions README.md
@@ -7,7 +7,7 @@
[![DecisionTree](http://pkg.julialang.org/badges/DecisionTree_0.6.svg)](http://pkg.julialang.org/?pkg=DecisionTree&ver=0.6)
[![DecisionTree](http://pkg.julialang.org/badges/DecisionTree_0.7.svg)](http://pkg.julialang.org/?pkg=DecisionTree&ver=0.7)

Julia implementation of Decision Trees & Random Forests
Julia implementation of Decision Tree and Random Forest algorithms

## Classification
* pre-pruning (max depth, min leaf size)
@@ -72,7 +72,7 @@ Also have a look at these [classification](https://github.com/cstjean/ScikitLear

## Native API
### Classification Example
Pruned Tree Classifier
Decision Tree Classifier
```julia
# train full-tree classifier
model = build_tree(labels, features)
@@ -87,11 +87,21 @@ apply_tree_proba(model, [5.9,3.0,5.1,1.9], ["setosa", "versicolor", "virginica"]
# run n-fold cross validation for pruned tree,
# using 90% purity threshold pruning, and 3 CV folds
accuracy = nfoldCV_tree(labels, features, 0.9, 3)

# set of classification build_tree() parameters and respective default values
# max_depth: maximum depth of the decision tree (default: -1, no maximum)
# min_samples_leaf: the minimum number of samples each leaf needs to have (default: 1)
# min_samples_split: the minimum number of samples needed for a split (default: 2)
# min_purity_increase: minimum purity needed for a split (default: 0.0)
# nsubfeatures: number of features to select at random (default: 0, keep all)
nsubfeatures=0; maxdepth=-1; min_samples_leaf=1; min_samples_split=2; min_purity_increase=0.0;
model = build_tree(labels, features, nsubfeatures, maxdepth, min_samples_leaf, min_samples_split, min_purity_increase)

```
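The pruning step referenced above can also be applied directly and sanity-checked on the training data. A minimal sketch, assuming the same labels and features and the prune_tree and confusion_matrix helpers used elsewhere in this README and in the tests below:
```julia
# merge leaves whose combined purity is >= 90%, then check training accuracy
pruned = prune_tree(model, 0.9)
preds  = apply_tree(pruned, features)
cm     = confusion_matrix(labels, preds)
cm.accuracy
```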
Random Forest Classifier
```julia
# train random forest classifier
# using 2 random features, 10 trees, 0.5 portion of samples per tree (optional), and a maximum tree depth of 6 (optional)
# using 2 random features, 10 trees, 0.5 portion of samples per tree, and a maximum tree depth of 6
model = build_forest(labels, features, 2, 10, 0.5, 6)
# apply learned model
apply_forest(model, [5.9,3.0,5.1,1.9])
@@ -100,6 +110,14 @@ apply_forest_proba(model, [5.9,3.0,5.1,1.9], ["setosa", "versicolor", "virginica
# run n-fold cross validation for forests
# using 2 random features, 10 trees, 3 folds, and 0.5 portion of samples per tree (optional)
accuracy = nfoldCV_forest(labels, features, 2, 10, 3, 0.5)

# set of classification build_forest() parameters and respective default values
# nsubfeatures: number of features to consider at random per split (default: 0, keep all)
# ntrees: number of trees to train (default: 10)
# partialsampling: fraction of samples to train each tree on (default: 0.7)
# max_depth: maximum depth of the decision trees (default: no maximum)
nsubfeatures=0; ntrees=10; partialsampling=0.7; maxdepth=-1;
model = build_forest(labels, features, nsubfeatures, ntrees, partialsampling, maxdepth)
```
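build_forest also accepts an rng keyword (see the updated signature in src/classification/main.jl further down), so runs can be made reproducible. A sketch, assuming an integer seed is acceptable, as the ScikitLearn-API docstrings in this commit describe for rng:
```julia
# same call as above, but with a fixed seed so the random feature/sample
# selection is repeatable across runs
model = build_forest(labels, features, 2, 10, 0.5, 6; rng=42)
apply_forest(model, [5.9,3.0,5.1,1.9])
```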
Adaptive-Boosted Decision Stumps Classifier
```julia
@@ -129,11 +147,21 @@ apply_tree(model, [-0.9,3.0,5.1,1.9,0.0])
# run n-fold cross validation, using 3 folds and averaging of 5 samples per leaf (optional)
# returns array of coefficients of determination (R^2)
r2 = nfoldCV_tree(labels, features, 3, 5)

# set of regression build_tree() parameters and respective default values
# max_depth: maximum depth of the decision tree (default: -1, no maximum)
# min_samples_leaf: the minimum number of samples each leaf needs to have (default: 5)
# min_samples_split: the minimum number of samples needed for a split (default: 2)
# min_purity_increase: minimum purity needed for a split (default: 0.0)
# nsubfeatures: number of features to select at random (default: 0, keep all)
min_samples_leaf = 5; nsubfeatures = 0; max_depth = -1; min_samples_split = 2; min_purity_increase = 0.0;
model = build_tree(labels, features, min_samples_leaf, nsubfeatures, max_depth, min_samples_split, min_purity_increase)

```
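As the regression tests in this commit do, the fitted tree can also be applied to the full feature matrix and scored with the coefficient of determination. A sketch using the R2 helper that appears in test/regression/random.jl and test/miscellaneous/parallel.jl:
```julia
# batch prediction over all training rows, followed by an R^2 sanity check
preds = apply_tree(model, features)
r2    = R2(labels, preds)
```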
Regression Random Forest
```julia
# train regression forest, using 2 random features, 10 trees,
# averaging of 5 samples per leaf (optional), and 0.7 portion of samples per tree (optional)
# averaging of 5 samples per leaf, and 0.7 portion of samples per tree
model = build_forest(labels, features, 2, 10, 5, 0.7)
# apply learned model
apply_forest(model, [-0.9,3.0,5.1,1.9,0.0])
@@ -142,4 +170,13 @@ apply_forest(model, [-0.9,3.0,5.1,1.9,0.0])
# and 0.7 portion of samples per tree (optional)
# returns array of coefficients of determination (R^2)
r2 = nfoldCV_forest(labels, features, 2, 10, 3, 5, 0.7)

# set of regression build_forest() parameters and respective default values
# nsubfeatures: number of features to consider at random per split (default: 0, keep all)
# ntrees: number of trees to train (default: 10)
# partialsampling: fraction of samples to train each tree on (default: 0.7)
# max_depth: maximum depth of the decision trees (default: no maximum)
# min_samples_leaf: the minimum number of samples each leaf needs to have (default: 5)
nsubfeatures=0; ntrees=10; min_samples_leaf=5; partialsampling=0.7; max_depth=-1;
model = build_forest(labels, features, nsubfeatures, ntrees, min_samples_leaf, partialsampling, max_depth)
```
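Given the defaults introduced in this commit (see src/regression/main.jl below), a regression forest can also be built with no arguments beyond the data, which is exactly what test/regression/random.jl now exercises. A sketch:
```julia
# all defaults: nsubfeatures=0 (keep all), ntrees=10, min_samples_leaf=5,
# partialsampling=0.7, max_depth=-1 (no maximum)
model = build_forest(labels, features)
preds = apply_forest(model, features)
R2(labels, preds)
```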
5 changes: 4 additions & 1 deletion src/classification/main.jl
@@ -54,6 +54,9 @@ end

function build_stump(labels::Vector, features::Matrix, weights=[0];
rng=Base.GLOBAL_RNG)
if weights == [0]
return build_tree(labels, features, 0, 1)
end
S = _split_neg_z1_loss(labels, features, weights)
if S == NO_BEST
return Leaf(majority_vote(labels), labels)
@@ -188,7 +191,7 @@ end
apply_tree_proba(tree::LeafOrNode, features::Matrix, labels) =
stack_function_results(row->apply_tree_proba(tree, row, labels), features)

function build_forest(labels::Vector, features::Matrix, nsubfeatures::Integer, ntrees::Integer, partialsampling=0.7, maxdepth=-1; rng=Base.GLOBAL_RNG)
function build_forest(labels::Vector, features::Matrix, nsubfeatures=0, ntrees=10, partialsampling=0.7, maxdepth=-1; rng=Base.GLOBAL_RNG)
rng = mk_rng(rng)::AbstractRNG
partialsampling = partialsampling > 1.0 ? 1.0 : partialsampling
Nlabels = length(labels)
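The build_stump guard added above means that, without sample weights, a stump is simply a depth-1 tree built via build_tree(labels, features, 0, 1); the iris and random classification tests below rely on this. A sketch:
```julia
# unweighted call falls back to build_tree(labels, features, 0, 1)
stump = build_stump(labels, features)
depth(stump) == 1            # a stump makes a single split
apply_tree(stump, features)  # and is applied like any other tree
```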
2 changes: 1 addition & 1 deletion src/regression/main.jl
@@ -42,7 +42,7 @@ function build_tree{T<:Float64}(
return _convert(t)
end

function build_forest{T<:Float64}(labels::Vector{T}, features::Matrix, nsubfeatures::Integer, ntrees::Integer, min_samples_leaf=5, partialsampling=0.7, max_depth=-1; rng=Base.GLOBAL_RNG)
function build_forest{T<:Float64}(labels::Vector{T}, features::Matrix, nsubfeatures=0, ntrees=10, min_samples_leaf=5, partialsampling=0.7, max_depth=-1; rng=Base.GLOBAL_RNG)
rng = mk_rng(rng)::AbstractRNG
partialsampling = partialsampling > 1.0 ? 1.0 : partialsampling
Nlabels = length(labels)
13 changes: 6 additions & 7 deletions src/scikitlearnAPI.jl
@@ -157,9 +157,9 @@ Random forest classification. See [DecisionTree.jl's documentation](https://gith
Hyperparameters:
- `nsubfeatures`: number of features to select in each tree at random (default: keep all)
- `ntrees`: number of trees to train
- `partialsampling`: fraction of samples to train each tree on
- `nsubfeatures`: number of features to consider at random per split (default: keep all)
- `ntrees`: number of trees to train (default: 10)
- `partialsampling`: fraction of samples to train each tree on (default: 0.7)
- `max_depth`: maximum depth of the decision trees (default: no maximum)
- `rng`: the random number generator to use. Can be an `Int`, which will be used
to seed and create a new random number generator.
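These hyperparameters are passed as keyword arguments to the constructor, and the fitted model is used through the ScikitLearn verbs. A minimal sketch mirroring test/classification/scikitlearn.jl in this commit; only the rng keyword appears verbatim in the tests, so any other keyword names would be assumed to match the docstring above:
```julia
using DecisionTree
# construct, fit, then predict class labels and probabilities, as the tests do
clf   = RandomForestClassifier(; rng=10)  # e.g. ntrees/partialsampling would be passed the same way (assumed names)
probs = predict_proba(fit!(clf, features, labels), features)
preds = predict(clf, features)
```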
@@ -220,10 +220,9 @@ Random forest regression. See [DecisionTree.jl's documentation](https://github.c
Hyperparameters:
- `nsubfeatures`: number of features to select in each tree at random (default:
keep all)
- `ntrees`: number of trees to train
- `partialsampling`: fraction of samples to train each tree on
- `nsubfeatures`: number of features to consider at random per split (default: keep all)
- `ntrees`: number of trees to train (default: 10)
- `partialsampling`: fraction of samples to train each tree on (default: 0.7)
- `max_depth`: maximum depth of the decision trees (default: no maximum)
- `min_samples_leaf`: the minimum number of samples each leaf needs to have (default: 5)
- `rng`: the random number generator to use. Can be an `Int`, which will be used
Expand Down
4 changes: 4 additions & 0 deletions test/classification/adult.jl
@@ -1,6 +1,8 @@
# Classification Test - Adult Data Set
# https://archive.ics.uci.edu/ml/datasets/adult

@testset "adult.jl" begin

download("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", "adult.csv");
adult = readcsv("adult.csv");

@@ -19,3 +21,5 @@ accuracy = nfoldCV_tree(labels, features, 0.9, 3);
println("\n##### 3 foldCV Classification Forest #####")
accuracy = nfoldCV_forest(labels, features, 2, 10, 3, 0.5);
@test mean(accuracy) > 0.8

end # @testset
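Each test file is now wrapped in a named @testset, so failures are grouped and counted per file by the test runner. The runtests.jl driver itself is not part of this excerpt; the following is a purely hypothetical sketch of how such files are typically aggregated on Julia 0.6 (file names illustrative):
```julia
# hypothetical runtests.jl-style aggregation; not taken from this commit
using Base.Test        # Julia 0.6-era test framework providing @testset
using DecisionTree

@testset "DecisionTree.jl" begin
    include("classification/adult.jl")   # each include runs its own named @testset
    include("regression/random.jl")
end
```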
4 changes: 4 additions & 0 deletions test/classification/digits.jl
@@ -1,3 +1,5 @@
@testset "digits.jl" begin

function loaddata()
f = open("data/digits.csv")
data = readlines(f)[2:end]
@@ -29,3 +31,5 @@ t = DecisionTree.build_tree(Y, X, 0, 6, 3, 5)

t = DecisionTree.build_tree(Y, X, 0, 6, 3, 5, 0.05)
@test num_leaves(t) == 54

end # @testset
10 changes: 7 additions & 3 deletions test/classification/heterogeneous.jl
@@ -1,5 +1,7 @@
### Classification - Heterogeneously typed features (ints, floats, bools, strings)

@testset "heterogeneous.jl" begin

m, n = 10^2, 5

tf = [trues(Int(m/2)) falses(Int(m/2))]
@@ -15,14 +17,16 @@ features[:,4] = tf[inds]
model = build_tree(labels, features)
preds = apply_tree(model, features)
cm = confusion_matrix(labels, preds)
@test cm.accuracy > 0.99
@test cm.accuracy > 0.95

model = build_forest(labels, features, 2, 3)
preds = apply_forest(model, features)
cm = confusion_matrix(labels, preds)
@test cm.accuracy > 0.99
@test cm.accuracy > 0.95

model, coeffs = build_adaboost_stumps(labels, features, 7)
preds = apply_adaboost_stumps(model, coeffs, features)
cm = confusion_matrix(labels, preds)
@test cm.accuracy > 0.99
@test cm.accuracy > 0.95

end # @testset
11 changes: 11 additions & 0 deletions test/classification/iris.jl
@@ -1,12 +1,21 @@
# Classification Test - Iris Data Set
# https://archive.ics.uci.edu/ml/datasets/iris

@testset "iris.jl" begin

download("https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data", "iris.csv")
iris = readcsv("iris.csv");

features = iris[:, 1:4];
labels = iris[:, 5];

# train a decision stump (depth=1)
model = build_stump(labels, features)
preds = apply_tree(model, features);
cm = confusion_matrix(labels, preds);
@test cm.accuracy > 0.6
@test depth(model) == 1

# train full-tree classifier (over-fit)
model = build_tree(labels, features);
preds = apply_tree(model, features);
@@ -44,3 +53,5 @@ preds = apply_adaboost_stumps(model, coeffs, features);
println("\n##### nfoldCV Classification Adaboosted Stumps #####")
accuracy = nfoldCV_stumps(labels, features, 7, 3);
@test mean(accuracy) > 0.7

end # @testset
17 changes: 17 additions & 0 deletions test/classification/random.jl
@@ -1,15 +1,30 @@
@testset "random.jl" begin

srand(16)

n,m = 10^3, 5;
features = rand(n,m);
weights = rand(-1:1,m);
labels = _int(features * weights);

model = build_stump(labels, features)
@test depth(model) == 1

maxdepth = 3
model = build_tree(labels, features, 0, maxdepth)
@test depth(model) == maxdepth
print_tree(model, 3)

model = build_tree(labels, features)
preds = apply_tree(model, features)
cm = confusion_matrix(labels, preds)
@test cm.accuracy > 0.95

model = build_forest(labels, features)
preds = apply_forest(model, features)
cm = confusion_matrix(labels, preds)
@test cm.accuracy > 0.95

println("\n##### nfoldCV Classification Tree #####")
accuracy = nfoldCV_tree(labels, features, 0.9, 3)
@test mean(accuracy) > 0.7
@@ -21,3 +36,5 @@ accuracy = nfoldCV_forest(labels, features, 2, 10, 3)
println("\n##### nfoldCV Adaboosted Stumps #####")
accuracy = nfoldCV_stumps(labels, features, 7, 3)
@test mean(accuracy) > 0.5

end # @testset
4 changes: 4 additions & 0 deletions test/classification/scikitlearn.jl
@@ -1,3 +1,5 @@
@testset "scikitlearn.jl" begin

srand(2)
n,m = 10^3, 5 ;
features = rand(n,m);
@@ -35,3 +37,5 @@ y = rand(Bool, 100);
predict_proba(fit!(RandomForestClassifier(; rng=10), X, y), X)
@test predict_proba(fit!(RandomForestClassifier(; rng=10), X, y), X) !=
predict_proba(fit!(RandomForestClassifier(; rng=12), X, y), X)

end # @testset
8 changes: 6 additions & 2 deletions test/miscellaneous/parallel.jl
@@ -1,5 +1,7 @@
# Test parallelization of random forests

@testset "parallel.jl" begin

addprocs(1)
@test nprocs() > 1

@@ -16,7 +18,7 @@ labels = _int(features * weights);
model = build_forest(labels, features, 2, 10);
preds = apply_forest(model, features);
cm = confusion_matrix(labels, preds);
@test cm.accuracy > 0.9
@test cm.accuracy > 0.8


# Regression
@@ -27,4 +29,6 @@ labels = features * weights;

model = build_forest(labels, features, 2, 10);
preds = apply_forest(model, features);
@test R2(labels, preds) > 0.9
@test R2(labels, preds) > 0.8

end # @testset
10 changes: 8 additions & 2 deletions test/miscellaneous/promote.jl
@@ -1,8 +1,14 @@
### Promote Leaf to Node

@testset "promote.jl" begin

leaf = Leaf(0, [0])
node = Node(1, 1, leaf, leaf)

[leaf, node]
[node, leaf]
ln = [leaf, node]
@test length(ln) == 2

nl = [node, leaf]
@test length(nl) == 2

end # @testset
4 changes: 4 additions & 0 deletions test/regression/digits.jl
@@ -1,3 +1,5 @@
@testset "digits.jl" begin

function loaddata()
f = open("data/digits.csv")
data = readlines(f)[2:end]
@@ -30,3 +32,5 @@ t = DecisionTree.build_tree(Y, X, 1, 0, -1, 20)

t = DecisionTree.build_tree(Y, X, 1, 0, -1, 2, 0.25)
@test length(t) == 103

end # @testset
6 changes: 5 additions & 1 deletion test/regression/energy.jl
@@ -1,6 +1,8 @@
# Regression Test - Appliances Energy Prediction Data Set
# https://archive.ics.uci.edu/ml/datasets/Appliances+energy+prediction

@testset "energy.jl" begin

download("https://archive.ics.uci.edu/ml/machine-learning-databases/00374/energydata_complete.csv", "energy.csv");
energy = readcsv("energy.csv");

@@ -19,4 +21,6 @@ r2 = nfoldCV_tree(labels, features, 3);

println("\n##### nfoldCV Regression Forest #####")
r2 = nfoldCV_forest(labels, features, 2, 10, 3);
@test mean(r2) > 0.4
@test mean(r2) > 0.35

end # @testset
11 changes: 11 additions & 0 deletions test/regression/random.jl
@@ -1,3 +1,5 @@
@testset "random.jl" begin

srand(5)

n, m = 10^3, 5 ;
@@ -7,6 +9,9 @@ features[:,1] = round.(Integer, features[:,1]); # convert a column of integers
weights = rand(-2:2,m);
labels = float.(features * weights); # cast to Array{Float64,1}

model = build_stump(labels, features)
@test depth(model) == 1

# over-fitting
min_samples_leaf = 1
model = build_tree(labels, features, min_samples_leaf)
@@ -34,10 +39,16 @@ model = build_tree(labels, features, min_samples_leaf, nsubfeatures, max_depth,
preds = apply_tree(model, features);
@test R2(labels, preds) < 0.95

model = build_forest(labels, features)
preds = apply_forest(model, features)
@test R2(labels, preds) > 0.9

println("\n##### nfoldCV Regression Tree #####")
r2 = nfoldCV_tree(labels, features, 3)
@test mean(r2) > 0.6

println("\n##### nfoldCV Regression Forest #####")
r2 = nfoldCV_forest(labels, features, 2, 10, 3)
@test mean(r2) > 0.8

end # @testset
2 changes: 2 additions & 0 deletions test/regression/scikitlearn.jl
@@ -1,3 +1,4 @@
@testset "scikitlearn.jl" begin

srand(2)
n,m = 10^3, 5 ;
@@ -32,3 +33,4 @@ y = randn(100)
@test fit_predict!(RandomForestRegressor(; rng=10), X, y) !=
fit_predict!(RandomForestRegressor(; rng=22), X, y)

end # @testset
