diff --git a/LICENSE.md b/LICENSE.md
index 4695972c..098eca5a 100644
--- a/LICENSE.md
+++ b/LICENSE.md
@@ -1,10 +1,10 @@
 DecisionTree.jl is licensed under the MIT License:
 
 > Code is originally adapted from MILK: Machine Learning Toolkit
-> Copyright (C) 2008-2011, Luis Pedro Coelho
+> Copyright (c) 2008-2011, Luis Pedro Coelho
 > License: MIT. See COPYING.MIT file in the milk distribution
 >
-> Copyright (c) 2012: Ben Sadeghi
+> Copyright (c) 2012-2013, Ben Sadeghi
 >
 > Permission is hereby granted, free of charge, to any person obtaining
 > a copy of this software and associated documentation files (the
diff --git a/README.md b/README.md
index cc51eb68..8e79bf6b 100644
--- a/README.md
+++ b/README.md
@@ -17,12 +17,12 @@ using RDatasets
 using DecisionTree
 
 iris = data("datasets", "iris")
-features = convert(Array{Float64,2}, matrix(iris[:, 2:5]));
-labels = convert(Array{UTF8String,1}, iris[:, "Species"]);
+features = matrix(iris[:, 2:5]);
+labels = vector(iris[:, "Species"]);
 
 # train full-tree classifier
 model = build_tree(labels, features);
-# prune tree: merge leaves having > 90% combined purity
+# prune tree: merge leaves having > 90% combined purity (default 100%)
 model = prune_tree(model, 0.9);
 # apply learned model
 apply_tree(model, [5.9,3.0,5.1,1.9])
@@ -44,4 +44,4 @@ nfoldCV_stumps(labels, features, 7, 3)
 ```
 
 # Coming Soon
-* Support for DataFrames
+* Support for missing values, DataFrames
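The src/DecisionTree.jl changes that follow relax every `Matrix{Float64}` and `Vector{Float64}` argument to arbitrary `Real` element types. A minimal sketch of what that enables, using hypothetical toy data that is not part of this patch:

    labels   = ["a", "a", "b", "b"]     # any RealStr label vector
    features = [1 2; 1 3; 4 5; 4 6]     # Matrix{Int} now dispatches directly,
                                        # with no convert(Array{Float64,2}, ...) step
    model = build_tree(labels, features)
    apply_tree(model, [4, 5])           # integer query vectors also dispatch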
diff --git a/src/DecisionTree.jl b/src/DecisionTree.jl
index b5f57a86..7932656b 100644
--- a/src/DecisionTree.jl
+++ b/src/DecisionTree.jl
@@ -34,7 +34,7 @@ function length(tree::Union(Leaf,Node))
     return length(s) - 1
 end
 
-function _split{T<:RealStr}(labels::Vector{T}, features::Matrix{Float64}, nsubfeatures::Integer, weights::Vector{Float64})
+function _split{T<:RealStr, U<:Real, V<:Real}(labels::Vector{T}, features::Matrix{U}, nsubfeatures::Integer, weights::Vector{V})
     nf = size(features,2)
     best = None
     best_val = -Inf
@@ -62,7 +62,7 @@ function _split{T<:RealStr}(labels::Vector{T}, features::Matrix{Float64}, nsubfe
     return best
 end
 
-function build_stump{T<:RealStr}(labels::Vector{T}, features::Matrix{Float64}, weights::Vector{Float64})
+function build_stump{T<:RealStr, U<:Real, V<:Real}(labels::Vector{T}, features::Matrix{U}, weights::Vector{V})
     S = _split(labels, features, 0, weights)
     if S == None
         return Leaf(majority_vote(labels), labels)
@@ -73,10 +73,10 @@ function build_stump{T<:RealStr}(labels::Vector{T}, features::Matrix{Float64}, w
                 Leaf(majority_vote(labels[split]), labels[split]),
                 Leaf(majority_vote(labels[!split]), labels[!split]))
 end
-build_stump{T<:RealStr}(labels::Vector{T}, features::Matrix{Float64}) = build_stump(labels, features, [0.])
+build_stump{T<:RealStr, U<:Real}(labels::Vector{T}, features::Matrix{U}) = build_stump(labels, features, [0])
 
-function build_tree{T<:RealStr}(labels::Vector{T}, features::Matrix{Float64}, nsubfeatures::Integer)
-    S = _split(labels, features, nsubfeatures, [0.])
+function build_tree{T<:RealStr, U<:Real}(labels::Vector{T}, features::Matrix{U}, nsubfeatures::Integer)
+    S = _split(labels, features, nsubfeatures, [0])
     if S == None
         return Leaf(majority_vote(labels), labels)
     end
@@ -86,22 +86,19 @@
                 build_tree(labels[split],features[split,:], nsubfeatures),
                 build_tree(labels[!split],features[!split,:], nsubfeatures))
 end
-build_tree{T<:RealStr}(labels::Vector{T}, features::Matrix{Float64}) = build_tree(labels, features, 0)
+build_tree{S<:RealStr, T<:Real}(labels::Vector{S}, features::Matrix{T}) = build_tree(labels, features, 0)
 
-function prune_tree{T<:Union(Leaf,Node)}(tree::T, purity_thresh::Float64)
-    function _prune_run{T<:Union(Leaf,Node)}(tree::T, purity_thresh::Float64)
+function prune_tree{T<:Union(Leaf,Node)}(tree::T, purity_thresh::Real)
+    purity_thresh -= eps()
+    function _prune_run{T<:Union(Leaf,Node)}(tree::T, purity_thresh::Real)
         N = length(tree)
         if N == 1        ## a Leaf
             return tree
         elseif N == 2    ## a stump
             all_labels = [tree.left.values, tree.right.values]
             majority = majority_vote(all_labels)
-            mismatches = length(find(all_labels .!= majority))
-            if mismatches == 0
-                purity = 1.0
-            else
-                purity = mismatches / length(all_labels)
-            end
+            mismatches = find(all_labels .!= majority)
+            purity = 1.0 - length(mismatches) / length(all_labels)
             if purity > purity_thresh
                 return Leaf(majority, all_labels)
             else
@@ -109,20 +106,20 @@ function prune_tree{T<:Union(Leaf,Node)}(tree::T, purity_thresh::Float64)
             end
         else
             return Node(tree.featid, tree.featval,
-                        prune_tree(tree.left, purity_thresh),
-                        prune_tree(tree.right, purity_thresh))
+                        _prune_run(tree.left, purity_thresh),
+                        _prune_run(tree.right, purity_thresh))
         end
     end
     pruned = _prune_run(tree, purity_thresh)
     while length(pruned) < length(tree)
         tree = pruned
-        return _prune_run(tree, purity_thresh)
+        pruned = _prune_run(tree, purity_thresh)
     end
     return pruned
 end
-prune_tree{T<:Union(Leaf,Node)}(tree::T) = prune_tree(tree, 0.9) ## defaults to 90% purity pruning
+prune_tree{T<:Union(Leaf,Node)}(tree::T) = prune_tree(tree, 1.0) ## defaults to 100% purity pruning
 
-function apply_tree{T<:Union(Leaf,Node)}(tree::T, features::Vector{Float64})
+function apply_tree{T<:Union(Leaf,Node), U<:Real}(tree::T, features::Vector{U})
     if typeof(tree) == Leaf
         return tree.majority
     elseif features[tree.featid] < tree.featval
@@ -132,7 +129,7 @@
     end
 end
 
-function apply_tree{T<:Union(Leaf,Node)}(tree::T, features::Matrix{Float64})
+function apply_tree{T<:Union(Leaf,Node), U<:Real}(tree::T, features::Matrix{U})
     N = size(features,1)
     predictions = zeros(Any,N)
     for i in 1:N
@@ -141,7 +138,7 @@
     return convert(Array{UTF8String,1}, predictions)
 end
 
-function build_forest{T<:RealStr}(labels::Vector{T}, features::Matrix{Float64}, nsubfeatures::Integer, ntrees::Integer)
+function build_forest{T<:RealStr, U<:Real}(labels::Vector{T}, features::Matrix{U}, nsubfeatures::Integer, ntrees::Integer)
     N = int(0.7 * length(labels))
     forest = @parallel (vcat) for i in 1:ntrees
         _labels, _features = sample(labels, features, N)
@@ -151,7 +148,7 @@ function build_forest{T<:RealStr}(labels::Vector{T}, features::Matrix{Float64},
     return forest
 end
 
-function apply_forest{T<:Union(Leaf,Node)}(forest::Vector{T}, features::Vector{Float64})
+function apply_forest{T<:Union(Leaf,Node), U<:Real}(forest::Vector{T}, features::Vector{U})
     ntrees = length(forest)
     votes = zeros(Any,ntrees)
     for i in 1:ntrees
@@ -160,7 +157,7 @@ function apply_forest{T<:Union(Leaf,Node)}(forest::Vector{T}, features::Vector{F
     return majority_vote(convert(Array{UTF8String,1}, votes))
 end
 
-function apply_forest{T<:Union(Leaf,Node)}(forest::Vector{T}, features::Matrix{Float64})
+function apply_forest{T<:Union(Leaf,Node), U<:Real}(forest::Vector{T}, features::Matrix{U})
     N = size(features,1)
     predictions = zeros(Any,N)
     for i in 1:N
@@ -169,7 +166,7 @@ function apply_forest{T<:Union(Leaf,Node)}(forest::Vector{T}, features::Matrix{F
     return convert(Array{UTF8String,1}, predictions)
 end
 
-function build_adaboost_stumps{T<:RealStr}(labels::Vector{T}, features::Matrix{Float64}, niterations::Integer)
+function build_adaboost_stumps{T<:RealStr, U<:Real}(labels::Vector{T}, features::Matrix{U}, niterations::Integer)
     N = length(labels)
     weights = ones(N) / N
     stumps = []
@@ -191,7 +188,7 @@ function build_adaboost_stumps{T<:RealStr}(labels::Vector{T}, features::Matrix{F
     return (stumps, coeffs)
 end
 
-function apply_adaboost_stumps{T<:Union(Leaf,Node)}(stumps::Vector{T}, coeffs::Vector{Float64}, features::Vector{Float64})
+function apply_adaboost_stumps{T<:Union(Leaf,Node), U<:Real, V<:Real}(stumps::Vector{T}, coeffs::Vector{U}, features::Vector{V})
     nstumps = length(stumps)
     counts = Dict()
     for i in 1:nstumps
@@ -213,7 +210,7 @@ function apply_adaboost_stumps{T<:Union(Leaf,Node)}(stumps::Vector{T}, coeffs::V
     return top_prediction
 end
 
-function apply_adaboost_stumps{T<:Union(Leaf,Node)}(stumps::Vector{T}, coeffs::Vector{Float64}, features::Matrix{Float64})
+function apply_adaboost_stumps{T<:Union(Leaf,Node), U<:Real, V<:Real}(stumps::Vector{T}, coeffs::Vector{U}, features::Matrix{V})
     N = size(features,1)
     predictions = zeros(Any,N)
     for i in 1:N
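Two behavioral notes on the prune_tree changes above. First, `purity_thresh -= eps()` lets the strict `purity > purity_thresh` test succeed when purity is exactly 1.0, which the new 100%-purity default relies on. Second, the `while` loop previously ended in a `return`, so pruning stopped after a single extra pass; it now iterates until the tree stops shrinking. A sketch of the threshold arithmetic, with illustrative counts:

    # A stump whose two leaves hold 6 labels, 5 matching the majority:
    #   purity = 1.0 - 1/6 ≈ 0.833
    # so it is merged at a 0.8 threshold but kept at 0.9.
    pruned = prune_tree(model)         # default 1.0: merge only fully pure leaf pairs
    pruned = prune_tree(model, 0.9)    # merge leaf pairs with > 90% combined purity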
diff --git a/src/measures.jl b/src/measures.jl
index 9f385518..de85c8bc 100644
--- a/src/measures.jl
+++ b/src/measures.jl
@@ -27,13 +27,13 @@ function _info_gain{T<:RealStr}(labels0::Vector{T}, labels1::Vector{T})
     return H
 end
 
-function _neg_z1_loss{T<:RealStr}(labels::Vector{T}, weights::Vector{Float64})
+function _neg_z1_loss{T<:RealStr, U<:Real}(labels::Vector{T}, weights::Vector{U})
     missmatches = labels .!= majority_vote(labels)
     loss = sum(weights[missmatches])
     return -loss
 end
 
-function _weighted_error{T<:RealStr}(actual::Vector{T}, predicted::Vector{T}, weights::Vector{Float64})
+function _weighted_error{T<:RealStr, U<:Real}(actual::Vector{T}, predicted::Vector{T}, weights::Vector{U})
     mismatches = actual .!= predicted
     err = sum(weights[mismatches]) / sum(weights)
     return err
@@ -59,12 +59,12 @@ function majority_vote{T<:RealStr}(labels::Vector{T})
     return top_vote
 end
 
-function sample{T<:RealStr}(labels::Vector{T}, features::Matrix{Float64}, nsamples::Integer)
+function sample{T<:RealStr, U<:Real}(labels::Vector{T}, features::Matrix{U}, nsamples::Integer)
     inds = iceil(length(labels) * rand(nsamples)) ## with replacement
     return (labels[inds], features[inds,:])
 end
 
-function confusion_matrix{T<:RealStr}(actual::Vector{T}, predicted::Vector{T})
+function confusion_matrix{T<:RealStr, U<:RealStr}(actual::Vector{T}, predicted::Vector{U})
     @assert length(actual) == length(predicted)
     N = length(actual)
     _actual = zeros(Int,N)
@@ -89,7 +89,7 @@ function confusion_matrix{T<:RealStr}(actual::Vector{T}, predicted::Vector{T})
     println("Kappa ", kappa)
 end
 
-function nfoldCV_forest{T<:RealStr}(labels::Vector{T}, features::Matrix{Float64}, nsubfeatures::Integer, ntrees::Integer, nfolds::Integer)
+function nfoldCV_forest{T<:RealStr, U<:Real}(labels::Vector{T}, features::Matrix{U}, nsubfeatures::Integer, ntrees::Integer, nfolds::Integer)
     if nfolds < 2 || ntrees < 1
         return
     end
@@ -117,7 +117,7 @@ function nfoldCV_forest{T<:RealStr}(labels::Vector{T}, features::Matrix{Float64}
     end
 end
 
-function nfoldCV_stumps{T<:RealStr}(labels::Vector{T}, features::Matrix{Float64}, niterations::Integer, nfolds::Integer)
+function nfoldCV_stumps{T<:RealStr, U<:Real}(labels::Vector{T}, features::Matrix{U}, niterations::Integer, nfolds::Integer)
     if nfolds < 2 || niterations < 1
         return
     end
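The measures above get the same treatment; for instance, integer weight vectors now reach the internal `_weighted_error` helper unchanged. A toy evaluation with illustrative values:

    actual    = ["a", "b", "a", "b"]
    predicted = ["a", "b", "b", "b"]
    weights   = [1, 1, 2, 1]     # Vector{Int} is accepted once weights::Vector{U<:Real}
    _weighted_error(actual, predicted, weights)
    # sum(weights[mismatches]) / sum(weights) = 2 / 5 = 0.4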
diff --git a/test/iris.jl b/test/iris.jl
index 12069b82..0ba85d5e 100644
--- a/test/iris.jl
+++ b/test/iris.jl
@@ -2,12 +2,12 @@ using RDatasets
 using DecisionTree
 
 iris = data("datasets", "iris")
-features = convert(Array{Float64,2}, matrix(iris[:, 2:5]));
-labels = convert(Array{UTF8String,1}, iris[:, "Species"]);
+features = matrix(iris[:, 2:5]);
+labels = vector(iris[:, "Species"]);
 
 # train full-tree classifier
 model = build_tree(labels, features);
-# prune tree: merge leaves having > 90% combined purity
+# prune tree: merge leaves having > 90% combined purity (default: 100%)
 model = prune_tree(model, 0.9);
 # apply learned model
 apply_tree(model, [5.9,3.0,5.1,1.9])
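The same iris arrays also drive the cross-validation routines from src/measures.jl; a usage sketch, with fold, tree, and iteration counts chosen purely for illustration:

    nfoldCV_forest(labels, features, 2, 10, 3)   # 2 subfeatures per split, 10 trees, 3 folds
    nfoldCV_stumps(labels, features, 7, 3)       # 7 boosting iterations, 3 folds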