diff --git a/README.md b/README.md
index 9707cc47..dcf26360 100644
--- a/README.md
+++ b/README.md
@@ -7,7 +7,7 @@
 [![DecisionTree](http://pkg.julialang.org/badges/DecisionTree_0.6.svg)](http://pkg.julialang.org/?pkg=DecisionTree&ver=0.6)
 [![DecisionTree](http://pkg.julialang.org/badges/DecisionTree_0.7.svg)](http://pkg.julialang.org/?pkg=DecisionTree&ver=0.7)
 
-Julia implementation of Decision Trees & Random Forests
+Julia implementation of Decision Tree and Random Forest algorithms
 
 ## Classification
 * pre-pruning (max depth, min leaf size)
@@ -72,7 +72,7 @@ Also have a look at these [classification](https://github.com/cstjean/ScikitLear
 
 ## Native API
 ### Classification Example
-Pruned Tree Classifier
+Decision Tree Classifier
 ```julia
 # train full-tree classifier
 model = build_tree(labels, features)
@@ -87,11 +87,21 @@ apply_tree_proba(model, [5.9,3.0,5.1,1.9], ["setosa", "versicolor", "virginica"]
 # run n-fold cross validation for pruned tree,
 # using 90% purity threshold pruning, and 3 CV folds
 accuracy = nfoldCV_tree(labels, features, 0.9, 3)
+
+# set of classification build_tree() parameters and respective default values
+# max_depth: maximum depth of the decision tree (default: -1, no maximum)
+# min_samples_leaf: the minimum number of samples each leaf needs to have (default: 1)
+# min_samples_split: the minimum number of samples needed for a split (default: 2)
+# min_purity_increase: minimum purity needed for a split (default: 0.0)
+# nsubfeatures: number of features to select at random (default: 0, keep all)
+nsubfeatures=0; maxdepth=-1; min_samples_leaf=1; min_samples_split=2; min_purity_increase=0.0;
+model = build_tree(labels, features, nsubfeatures, maxdepth, min_samples_leaf, min_samples_split, min_purity_increase)
+
 ```
 Random Forest Classifier
 ```julia
 # train random forest classifier
-# using 2 random features, 10 trees, 0.5 portion of samples per tree (optional), and a maximum tree depth of 6 (optional)
+# using 2 random features, 10 trees, 0.5 portion of samples per tree, and a maximum tree depth of 6
 model = build_forest(labels, features, 2, 10, 0.5, 6)
 # apply learned model
 apply_forest(model, [5.9,3.0,5.1,1.9])
@@ -100,6 +110,14 @@ apply_forest_proba(model, [5.9,3.0,5.1,1.9], ["setosa", "versicolor", "virginica
 # run n-fold cross validation for forests
 # using 2 random features, 10 trees, 3 folds, and 0.5 portion of samples per tree (optional)
 accuracy = nfoldCV_forest(labels, features, 2, 10, 3, 0.5)
+
+# set of classification build_forest() parameters and respective default values
+# nsubfeatures: number of features to consider at random per split (default: 0, keep all)
+# ntrees: number of trees to train (default: 10)
+# partialsampling: fraction of samples to train each tree on (default: 0.7)
+# max_depth: maximum depth of the decision trees (default: no maximum)
+nsubfeatures=0; ntrees=10; partialsampling=0.7; maxdepth=-1;
+model = build_forest(labels, features, nsubfeatures, ntrees, partialsampling, maxdepth)
 ```
 Adaptive-Boosted Decision Stumps Classifier
 ```julia
@@ -129,11 +147,21 @@ apply_tree(model, [-0.9,3.0,5.1,1.9,0.0])
 # run n-fold cross validation, using 3 folds and averaging of 5 samples per leaf (optional)
 # returns array of coefficients of determination (R^2)
 r2 = nfoldCV_tree(labels, features, 3, 5)
+
+# set of regression build_tree() parameters and respective default values
+# max_depth: maximum depth of the decision tree (default: -1, no maximum)
+# min_samples_leaf: the minimum number of samples each leaf needs to have (default: 5)
+# min_samples_split: the minimum number of samples needed for a split (default: 2)
+# min_purity_increase: minimum purity needed for a split (default: 0.0)
+# nsubfeatures: number of features to select at random (default: 0, keep all)
+min_samples_leaf = 5; nsubfeatures = 0; max_depth = -1; min_samples_split = 2; min_purity_increase = 0.0;
+model = build_tree(labels, features, min_samples_leaf, nsubfeatures, max_depth, min_samples_split, min_purity_increase)
+
 ```
 Regression Random Forest
 ```julia
 # train regression forest, using 2 random features, 10 trees,
-# averaging of 5 samples per leaf (optional), and 0.7 portion of samples per tree (optional)
+# averaging of 5 samples per leaf, and 0.7 portion of samples per tree
 model = build_forest(labels, features, 2, 10, 5, 0.7)
 # apply learned model
 apply_forest(model, [-0.9,3.0,5.1,1.9,0.0])
@@ -142,4 +170,13 @@ apply_forest(model, [-0.9,3.0,5.1,1.9,0.0])
 # and 0.7 porition of samples per tree (optional)
 # returns array of coefficients of determination (R^2)
 r2 = nfoldCV_forest(labels, features, 2, 10, 3, 5, 0.7)
+
+# set of regression build_forest() parameters and respective default values
+# nsubfeatures: number of features to consider at random per split (default: 0, keep all)
+# ntrees: number of trees to train (default: 10)
+# partialsampling: fraction of samples to train each tree on (default: 0.7)
+# max_depth: maximum depth of the decision trees (default: no maximum)
+# min_samples_leaf: the minimum number of samples each leaf needs to have (default: 5)
+nsubfeatures=0; ntrees=10; min_samples_leaf=5; partialsampling=0.7; max_depth=-1;
+model = build_forest(labels, features, nsubfeatures, ntrees, min_samples_leaf, partialsampling, max_depth)
 ```
diff --git a/src/classification/main.jl b/src/classification/main.jl
index 60a776ea..19834fc9 100644
--- a/src/classification/main.jl
+++ b/src/classification/main.jl
@@ -54,6 +54,9 @@ end
 
 function build_stump(labels::Vector, features::Matrix, weights=[0];
                      rng=Base.GLOBAL_RNG)
+    if weights == [0]
+        return build_tree(labels, features, 0, 1)
+    end
     S = _split_neg_z1_loss(labels, features, weights)
     if S == NO_BEST
         return Leaf(majority_vote(labels), labels)
@@ -188,7 +191,7 @@ end
 apply_tree_proba(tree::LeafOrNode, features::Matrix, labels) =
     stack_function_results(row->apply_tree_proba(tree, row, labels), features)
 
-function build_forest(labels::Vector, features::Matrix, nsubfeatures::Integer, ntrees::Integer, partialsampling=0.7, maxdepth=-1; rng=Base.GLOBAL_RNG)
+function build_forest(labels::Vector, features::Matrix, nsubfeatures=0, ntrees=10, partialsampling=0.7, maxdepth=-1; rng=Base.GLOBAL_RNG)
     rng = mk_rng(rng)::AbstractRNG
     partialsampling = partialsampling > 1.0 ? 1.0 : partialsampling
     Nlabels = length(labels)
diff --git a/src/regression/main.jl b/src/regression/main.jl
index c88a7d77..9fe503cf 100644
--- a/src/regression/main.jl
+++ b/src/regression/main.jl
@@ -42,7 +42,7 @@ function build_tree{T<:Float64}(
     return _convert(t)
 end
 
-function build_forest{T<:Float64}(labels::Vector{T}, features::Matrix, nsubfeatures::Integer, ntrees::Integer, min_samples_leaf=5, partialsampling=0.7, max_depth=-1; rng=Base.GLOBAL_RNG)
+function build_forest{T<:Float64}(labels::Vector{T}, features::Matrix, nsubfeatures=0, ntrees=10, min_samples_leaf=5, partialsampling=0.7, max_depth=-1; rng=Base.GLOBAL_RNG)
     rng = mk_rng(rng)::AbstractRNG
     partialsampling = partialsampling > 1.0 ? 1.0 : partialsampling
     Nlabels = length(labels)
diff --git a/src/scikitlearnAPI.jl b/src/scikitlearnAPI.jl
index 5d1bdada..2606cf39 100644
--- a/src/scikitlearnAPI.jl
+++ b/src/scikitlearnAPI.jl
@@ -157,9 +157,9 @@ Random forest classification. See [DecisionTree.jl's documentation](https://gith
 
 Hyperparameters:
 
-- `nsubfeatures`: number of features to select in each tree at random (default: keep all)
-- `ntrees`: number of trees to train
-- `partialsampling`: fraction of samples to train each tree on
+- `nsubfeatures`: number of features to consider at random per split (default: keep all)
+- `ntrees`: number of trees to train (default: 10)
+- `partialsampling`: fraction of samples to train each tree on (default: 0.7)
 - `max_depth`: maximum depth of the decision trees (default: no maximum)
 - `rng`: the random number generator to use. Can be an `Int`, which will be used
   to seed and create a new random number generator.
@@ -220,10 +220,9 @@ Random forest regression. See [DecisionTree.jl's documentation](https://github.c
 
 Hyperparameters:
 
-- `nsubfeatures`: number of features to select in each tree at random (default:
-  keep all)
-- `ntrees`: number of trees to train
-- `partialsampling`: fraction of samples to train each tree on
+- `nsubfeatures`: number of features to consider at random per split (default: keep all)
+- `ntrees`: number of trees to train (default: 10)
+- `partialsampling`: fraction of samples to train each tree on (default: 0.7)
 - `max_depth`: maximum depth of the decision trees (default: no maximum)
 - `min_samples_leaf`: the minimum number of samples each leaf needs to have (default: 5)
 - `rng`: the random number generator to use. Can be an `Int`, which will be used
diff --git a/test/classification/adult.jl b/test/classification/adult.jl
index 75e072da..a41ced9b 100644
--- a/test/classification/adult.jl
+++ b/test/classification/adult.jl
@@ -1,6 +1,8 @@
 # Classification Test - Adult Data Set
 # https://archive.ics.uci.edu/ml/datasets/adult
 
+@testset "adult.jl" begin
+
 download("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", "adult.csv");
 adult = readcsv("adult.csv");
 
@@ -19,3 +21,5 @@ accuracy = nfoldCV_tree(labels, features, 0.9, 3);
 println("\n##### 3 foldCV Classification Forest #####")
 accuracy = nfoldCV_forest(labels, features, 2, 10, 3, 0.5);
 @test mean(accuracy) > 0.8
+
+end # @testset
diff --git a/test/classification/digits.jl b/test/classification/digits.jl
index 626a455b..8ad89630 100644
--- a/test/classification/digits.jl
+++ b/test/classification/digits.jl
@@ -1,3 +1,5 @@
+@testset "digits.jl" begin
+
 function loaddata()
     f = open("data/digits.csv")
     data = readlines(f)[2:end]
@@ -29,3 +31,5 @@ t = DecisionTree.build_tree(Y, X, 0, 6, 3, 5)
 
 t = DecisionTree.build_tree(Y, X, 0, 6, 3, 5, 0.05)
 @test num_leaves(t) == 54
+
+end # @testset
diff --git a/test/classification/heterogeneous.jl b/test/classification/heterogeneous.jl
index b50b2d25..66cca631 100644
--- a/test/classification/heterogeneous.jl
+++ b/test/classification/heterogeneous.jl
@@ -1,5 +1,7 @@
 ### Classification - Heterogeneously typed features (ints, floats, bools, strings)
 
+@testset "heterogeneous.jl" begin
+
 m, n = 10^2, 5
 
 tf = [trues(Int(m/2)) falses(Int(m/2))]
@@ -15,14 +17,16 @@ features[:,4] = tf[inds]
 model = build_tree(labels, features)
 preds = apply_tree(model, features)
 cm = confusion_matrix(labels, preds)
-@test cm.accuracy > 0.99
+@test cm.accuracy > 0.95
 
 model = build_forest(labels, features, 2, 3)
 preds = apply_forest(model, features)
 cm = confusion_matrix(labels, preds)
-@test cm.accuracy > 0.99
+@test cm.accuracy > 0.95
 
 model, coeffs = build_adaboost_stumps(labels, features, 7)
 preds = apply_adaboost_stumps(model, coeffs, features)
 cm = confusion_matrix(labels, preds)
-@test cm.accuracy > 0.99
+@test cm.accuracy > 0.95
+
+end # @testset
diff --git a/test/classification/iris.jl b/test/classification/iris.jl
index 599912be..b5805e7d 100644
--- a/test/classification/iris.jl
+++ b/test/classification/iris.jl
@@ -1,12 +1,21 @@
 # Classification Test - Iris Data Set
 # https://archive.ics.uci.edu/ml/datasets/iris
 
+@testset "iris.jl" begin
+
 download("https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data", "iris.csv")
 iris = readcsv("iris.csv");
 
 features = iris[:, 1:4];
 labels = iris[:, 5];
 
+# train a decision stump (depth=1)
+model = build_stump(labels, features)
+preds = apply_tree(model, features);
+cm = confusion_matrix(labels, preds);
+@test cm.accuracy > 0.6
+@test depth(model) == 1
+
 # train full-tree classifier (over-fit)
 model = build_tree(labels, features);
 preds = apply_tree(model, features);
@@ -44,3 +53,5 @@ preds = apply_adaboost_stumps(model, coeffs, features);
 println("\n##### nfoldCV Classification Adaboosted Stumps #####")
 accuracy = nfoldCV_stumps(labels, features, 7, 3);
 @test mean(accuracy) > 0.7
+
+end # @testset
diff --git a/test/classification/random.jl b/test/classification/random.jl
index feae6037..09c4c577 100644
--- a/test/classification/random.jl
+++ b/test/classification/random.jl
@@ -1,3 +1,5 @@
+@testset "random.jl" begin
+
 srand(16)
 
 n,m = 10^3, 5;
@@ -5,11 +7,24 @@ features = rand(n,m);
 weights = rand(-1:1,m);
 labels = _int(features * weights);
 
+model = build_stump(labels, features)
+@test depth(model) == 1
+
 maxdepth = 3
 model = build_tree(labels, features, 0, maxdepth)
 @test depth(model) == maxdepth
 print_tree(model, 3)
 
+model = build_tree(labels, features)
+preds = apply_tree(model, features)
+cm = confusion_matrix(labels, preds)
+@test cm.accuracy > 0.95
+
+model = build_forest(labels, features)
+preds = apply_forest(model, features)
+cm = confusion_matrix(labels, preds)
+@test cm.accuracy > 0.95
+
 println("\n##### nfoldCV Classification Tree #####")
 accuracy = nfoldCV_tree(labels, features, 0.9, 3)
 @test mean(accuracy) > 0.7
@@ -21,3 +36,5 @@ accuracy = nfoldCV_forest(labels, features, 2, 10, 3)
 println("\n##### nfoldCV Adaboosted Stumps #####")
 accuracy = nfoldCV_stumps(labels, features, 7, 3)
 @test mean(accuracy) > 0.5
+
+end # @testset
diff --git a/test/classification/scikitlearn.jl b/test/classification/scikitlearn.jl
index f3dd3371..576d7d9a 100644
--- a/test/classification/scikitlearn.jl
+++ b/test/classification/scikitlearn.jl
@@ -1,3 +1,5 @@
+@testset "scikitlearn.jl" begin
+
 srand(2)
 n,m = 10^3, 5 ;
 features = rand(n,m);
@@ -35,3 +37,5 @@ y = rand(Bool, 100);
 predict_proba(fit!(RandomForestClassifier(; rng=10), X, y), X)
 @test predict_proba(fit!(RandomForestClassifier(; rng=10), X, y), X) !=
     predict_proba(fit!(RandomForestClassifier(; rng=12), X, y), X)
+
+end # @testset
diff --git a/test/miscellaneous/parallel.jl b/test/miscellaneous/parallel.jl
index a46411af..3a4086b3 100644
--- a/test/miscellaneous/parallel.jl
+++ b/test/miscellaneous/parallel.jl
@@ -1,5 +1,7 @@
 # Test parallelization of random forests
 
+@testset "parallel.jl" begin
+
 addprocs(1)
 @test nprocs() > 1
 
@@ -16,7 +18,7 @@ labels = _int(features * weights);
 model = build_forest(labels, features, 2, 10);
 preds = apply_forest(model, features);
 cm = confusion_matrix(labels, preds);
-@test cm.accuracy > 0.9
+@test cm.accuracy > 0.8
 
 # Regression
@@ -27,4 +29,6 @@ labels = features * weights;
 
 model = build_forest(labels, features, 2, 10);
 preds = apply_forest(model, features);
-@test R2(labels, preds) > 0.9
+@test R2(labels, preds) > 0.8
+
+end # @testset
diff --git a/test/miscellaneous/promote.jl b/test/miscellaneous/promote.jl
index fe7c9af8..eecc1577 100644
--- a/test/miscellaneous/promote.jl
+++ b/test/miscellaneous/promote.jl
@@ -1,8 +1,14 @@
 ### Promote Leaf to Node
 
+@testset "promote.jl" begin
+
 leaf = Leaf(0, [0])
 node = Node(1, 1, leaf, leaf)
 
-[leaf, node]
-[node, leaf]
+ln = [leaf, node]
+@test length(ln) == 2
+
+nl = [node, leaf]
+@test length(nl) == 2
 
+end # @testset
diff --git a/test/regression/digits.jl b/test/regression/digits.jl
index d170c819..bcb86d22 100644
--- a/test/regression/digits.jl
+++ b/test/regression/digits.jl
@@ -1,3 +1,5 @@
+@testset "digits.jl" begin
+
 function loaddata()
     f = open("data/digits.csv")
     data = readlines(f)[2:end]
@@ -30,3 +32,5 @@ t = DecisionTree.build_tree(Y, X, 1, 0, -1, 20)
 
 t = DecisionTree.build_tree(Y, X, 1, 0, -1, 2, 0.25)
 @test length(t) == 103
+
+end # @testset
diff --git a/test/regression/energy.jl b/test/regression/energy.jl
index 2a7d148d..eb7be6de 100644
--- a/test/regression/energy.jl
+++ b/test/regression/energy.jl
@@ -1,6 +1,8 @@
 # Regression Test - Appliances Energy Prediction Data Set
 # https://archive.ics.uci.edu/ml/datasets/Appliances+energy+prediction
 
+@testset "energy.jl" begin
+
 download("https://archive.ics.uci.edu/ml/machine-learning-databases/00374/energydata_complete.csv", "energy.csv");
 energy = readcsv("energy.csv");
 
@@ -19,4 +21,6 @@ r2 = nfoldCV_tree(labels, features, 3);
 
 println("\n##### nfoldCV Regression Forest #####")
 r2 = nfoldCV_forest(labels, features, 2, 10, 3);
-@test mean(r2) > 0.4
+@test mean(r2) > 0.35
+
+end # @testset
diff --git a/test/regression/random.jl b/test/regression/random.jl
index aa9372d2..9f5e2a46 100644
--- a/test/regression/random.jl
+++ b/test/regression/random.jl
@@ -1,3 +1,5 @@
+@testset "random.jl" begin
+
 srand(5)
 
 n, m = 10^3, 5 ;
@@ -7,6 +9,9 @@ features[:,1] = round.(Integer, features[:,1]); # convert a column of integers
 weights = rand(-2:2,m);
 labels = float.(features * weights); # cast to Array{Float64,1}
 
+model = build_stump(labels, features)
+@test depth(model) == 1
+
 # over-fitting
 min_samples_leaf = 1
 model = build_tree(labels, features, min_samples_leaf)
@@ -34,6 +39,10 @@ model = build_tree(labels, features, min_samples_leaf, nsubfeatures, max_depth,
 preds = apply_tree(model, features);
 @test R2(labels, preds) < 0.95
 
+model = build_forest(labels, features)
+preds = apply_forest(model, features)
+@test R2(labels, preds) > 0.9
+
 println("\n##### nfoldCV Regression Tree #####")
 r2 = nfoldCV_tree(labels, features, 3)
 @test mean(r2) > 0.6
@@ -41,3 +50,5 @@ r2 = nfoldCV_tree(labels, features, 3)
 println("\n##### nfoldCV Regression Forest #####")
 r2 = nfoldCV_forest(labels, features, 2, 10, 3)
 @test mean(r2) > 0.8
+
+end # @testset
diff --git a/test/regression/scikitlearn.jl b/test/regression/scikitlearn.jl
index 7902b011..8710821e 100644
--- a/test/regression/scikitlearn.jl
+++ b/test/regression/scikitlearn.jl
@@ -1,3 +1,4 @@
+@testset "scikitlearn.jl" begin
 srand(2)
 
 n,m = 10^3, 5 ;
@@ -32,3 +33,4 @@ y = randn(100)
 
 @test fit_predict!(RandomForestRegressor(; rng=10), X, y) !=
     fit_predict!(RandomForestRegressor(; rng=22), X, y)
+end # @testset
diff --git a/test/run_all_tests.jl b/test/run_all_tests.jl
index bdd4d70d..eccbf011 100644
--- a/test/run_all_tests.jl
+++ b/test/run_all_tests.jl
@@ -19,8 +19,16 @@
 classification = "classification/" .* readdir("classification/")
 regression = "regression/" .* readdir("regression/")
 miscellaneous = "miscellaneous/" .* readdir("miscellaneous/")
-for list in [classification, regression, miscellaneous]
-    run_tests(list)
+test_suites = [("Classification", classification), ("Regression", regression), ("Miscellaneous", miscellaneous)]
+
+@testset "Test Suites" begin
+    for ts in 1:length(test_suites)
+        name = test_suites[ts][1]
+        list = test_suites[ts][2]
+        @testset "$name" begin
+            run_tests(list)
+        end
+    end
 end
 
 # remove downloaded .csv files
diff --git a/test/runtests.jl b/test/runtests.jl
index b2662897..eda04a49 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -27,6 +27,14 @@ regression = ["regression/random.jl",
 miscellaneous = ["miscellaneous/promote.jl",
                  "miscellaneous/parallel.jl"]
 
-for list in [classification, regression, miscellaneous]
-    run_tests(list)
+test_suites = [("Classification", classification), ("Regression", regression), ("Miscellaneous", miscellaneous)]
+
+@testset "Test Suites" begin
+    for ts in 1:length(test_suites)
+        name = test_suites[ts][1]
+        list = test_suites[ts][2]
+        @testset "$name" begin
+            run_tests(list)
+        end
+    end
 end