Skip to content

Commit

Permalink
Bug fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
Ben Sadeghi committed Jan 20, 2013
1 parent ffeed63 commit e4e8687
Show file tree
Hide file tree
Showing 5 changed files with 38 additions and 41 deletions.
4 changes: 2 additions & 2 deletions LICENSE.md
@@ -1,10 +1,10 @@
DecisionTree.jl is licensed under the MIT License:

> Code is originally adapted from MILK: Machine Learning Toolkit
> Copyright (C) 2008-2011, Luis Pedro Coelho <luis@luispedro.org>
> Copyright (c) 2008-2011, Luis Pedro Coelho <luis@luispedro.org>
> License: MIT. See COPYING.MIT file in the milk distribution
>
> Copyright (c) 2012: Ben Sadeghi
> Copyright (c) 2012-2013, Ben Sadeghi
>
> Permission is hereby granted, free of charge, to any person obtaining
> a copy of this software and associated documentation files (the
Expand Down
8 changes: 4 additions & 4 deletions README.md
Expand Up @@ -17,12 +17,12 @@ using RDatasets
using DecisionTree

iris = data("datasets", "iris")
features = convert(Array{Float64,2}, matrix(iris[:, 2:5]));
labels = convert(Array{UTF8String,1}, iris[:, "Species"]);
features = matrix(iris[:, 2:5]);
labels = vector(iris[:, "Species"]);

# train full-tree classifier
model = build_tree(labels, features);
# prune tree: merge leaves having > 90% combined purity
# prune tree: merge leaves having > 90% combined purity (default 100%)
model = prune_tree(model, 0.9);
# apply learned model
apply_tree(model, [5.9,3.0,5.1,1.9])
Expand All @@ -44,4 +44,4 @@ nfoldCV_stumps(labels, features, 7, 3)

# Coming Soon

* Support for DataFrames
* Support for missing values, DataFrames
49 changes: 23 additions & 26 deletions src/DecisionTree.jl
Expand Up @@ -34,7 +34,7 @@ function length(tree::Union(Leaf,Node))
return length(s) - 1
end

function _split{T<:RealStr}(labels::Vector{T}, features::Matrix{Float64}, nsubfeatures::Integer, weights::Vector{Float64})
function _split{T<:RealStr, U<:Real, V<:Real}(labels::Vector{T}, features::Matrix{U}, nsubfeatures::Integer, weights::Vector{V})
nf = size(features,2)
best = None
best_val = -Inf
Expand Down Expand Up @@ -62,7 +62,7 @@ function _split{T<:RealStr}(labels::Vector{T}, features::Matrix{Float64}, nsubfe
return best
end

function build_stump{T<:RealStr}(labels::Vector{T}, features::Matrix{Float64}, weights::Vector{Float64})
function build_stump{T<:RealStr, U<:Real, V<:Real}(labels::Vector{T}, features::Matrix{U}, weights::Vector{V})
S = _split(labels, features, 0, weights)
if S == None
return Leaf(majority_vote(labels), labels)
Expand All @@ -73,10 +73,10 @@ function build_stump{T<:RealStr}(labels::Vector{T}, features::Matrix{Float64}, w
Leaf(majority_vote(labels[split]), labels[split]),
Leaf(majority_vote(labels[!split]), labels[!split]))
end
## Convenience method: build an unweighted decision stump.
## NOTE(review): the single-element weight vector [0] appears to be a sentinel
## telling _split to ignore sample weighting — confirm against _split's body.
build_stump{T<:RealStr, U<:Real}(labels::Vector{T}, features::Matrix{U}) = build_stump(labels, features, [0])

function build_tree{T<:RealStr}(labels::Vector{T}, features::Matrix{Float64}, nsubfeatures::Integer)
S = _split(labels, features, nsubfeatures, [0.])
function build_tree{T<:RealStr, U<:Real}(labels::Vector{T}, features::Matrix{U}, nsubfeatures::Integer)
S = _split(labels, features, nsubfeatures, [0])
if S == None
return Leaf(majority_vote(labels), labels)
end
Expand All @@ -86,43 +86,40 @@ function build_tree{T<:RealStr}(labels::Vector{T}, features::Matrix{Float64}, ns
build_tree(labels[split],features[split,:], nsubfeatures),
build_tree(labels[!split],features[!split,:], nsubfeatures))
end
## Convenience method: build a full tree.
## NOTE(review): nsubfeatures == 0 presumably means "consider every feature at
## each split" (no random feature subsampling) — confirm against _split.
build_tree{S<:RealStr, T<:Real}(labels::Vector{S}, features::Matrix{T}) = build_tree(labels, features, 0)

## Post-prune a decision tree: repeatedly merge stumps (a Node whose children
## are both Leaves) whose combined label purity exceeds `purity_thresh`,
## until no further merge shrinks the tree. Returns the pruned tree.
function prune_tree{T<:Union(Leaf,Node)}(tree::T, purity_thresh::Real)
    ## Back the threshold off by machine epsilon so a stump whose purity
    ## exactly equals the threshold (e.g. purity 1.0 vs. threshold 1.0)
    ## still passes the strict `purity > purity_thresh` test below.
    purity_thresh -= eps()
    ## One bottom-up pass: collapse every qualifying stump into a Leaf.
    function _prune_run{T<:Union(Leaf,Node)}(tree::T, purity_thresh::Real)
        N = length(tree)
        if N == 1        ## a Leaf
            return tree
        elseif N == 2    ## a stump
            all_labels = [tree.left.values, tree.right.values]
            majority = majority_vote(all_labels)
            mismatches = find(all_labels .!= majority)
            purity = 1.0 - length(mismatches) / length(all_labels)
            if purity > purity_thresh
                return Leaf(majority, all_labels)
            else
                return tree
            end
        else
            return Node(tree.featid, tree.featval,
                        _prune_run(tree.left, purity_thresh),
                        _prune_run(tree.right, purity_thresh))
        end
    end
    ## Collapsing a stump can expose a new prunable stump above it, so
    ## iterate to a fixed point (stop when the tree stops shrinking).
    pruned = _prune_run(tree, purity_thresh)
    while length(pruned) < length(tree)
        tree = pruned
        pruned = _prune_run(tree, purity_thresh)
    end
    return pruned
end
prune_tree{T<:Union(Leaf,Node)}(tree::T) = prune_tree(tree, 1.0) ## defaults to 100% purity pruning

function apply_tree{T<:Union(Leaf,Node)}(tree::T, features::Vector{Float64})
function apply_tree{T<:Union(Leaf,Node), U<:Real}(tree::T, features::Vector{U})
if typeof(tree) == Leaf
return tree.majority
elseif features[tree.featid] < tree.featval
Expand All @@ -132,7 +129,7 @@ function apply_tree{T<:Union(Leaf,Node)}(tree::T, features::Vector{Float64})
end
end

function apply_tree{T<:Union(Leaf,Node)}(tree::T, features::Matrix{Float64})
function apply_tree{T<:Union(Leaf,Node), U<:Real}(tree::T, features::Matrix{U})
N = size(features,1)
predictions = zeros(Any,N)
for i in 1:N
Expand All @@ -141,7 +138,7 @@ function apply_tree{T<:Union(Leaf,Node)}(tree::T, features::Matrix{Float64})
return convert(Array{UTF8String,1}, predictions)
end

function build_forest{T<:RealStr}(labels::Vector{T}, features::Matrix{Float64}, nsubfeatures::Integer, ntrees::Integer)
function build_forest{T<:RealStr, U<:Real}(labels::Vector{T}, features::Matrix{U}, nsubfeatures::Integer, ntrees::Integer)
N = int(0.7 * length(labels))
forest = @parallel (vcat) for i in 1:ntrees
_labels, _features = sample(labels, features, N)
Expand All @@ -151,7 +148,7 @@ function build_forest{T<:RealStr}(labels::Vector{T}, features::Matrix{Float64},
return forest
end

function apply_forest{T<:Union(Leaf,Node)}(forest::Vector{T}, features::Vector{Float64})
function apply_forest{T<:Union(Leaf,Node), U<:Real}(forest::Vector{T}, features::Vector{U})
ntrees = length(forest)
votes = zeros(Any,ntrees)
for i in 1:ntrees
Expand All @@ -160,7 +157,7 @@ function apply_forest{T<:Union(Leaf,Node)}(forest::Vector{T}, features::Vector{F
return majority_vote(convert(Array{UTF8String,1}, votes))
end

function apply_forest{T<:Union(Leaf,Node)}(forest::Vector{T}, features::Matrix{Float64})
function apply_forest{T<:Union(Leaf,Node), U<:Real}(forest::Vector{T}, features::Matrix{U})
N = size(features,1)
predictions = zeros(Any,N)
for i in 1:N
Expand All @@ -169,7 +166,7 @@ function apply_forest{T<:Union(Leaf,Node)}(forest::Vector{T}, features::Matrix{F
return convert(Array{UTF8String,1}, predictions)
end

function build_adaboost_stumps{T<:RealStr}(labels::Vector{T}, features::Matrix{Float64}, niterations::Integer)
function build_adaboost_stumps{T<:RealStr, U<:Real}(labels::Vector{T}, features::Matrix{U}, niterations::Integer)
N = length(labels)
weights = ones(N) / N
stumps = []
Expand All @@ -191,7 +188,7 @@ function build_adaboost_stumps{T<:RealStr}(labels::Vector{T}, features::Matrix{F
return (stumps, coeffs)
end

function apply_adaboost_stumps{T<:Union(Leaf,Node)}(stumps::Vector{T}, coeffs::Vector{Float64}, features::Vector{Float64})
function apply_adaboost_stumps{T<:Union(Leaf,Node), U<:Real, V<:Real}(stumps::Vector{T}, coeffs::Vector{U}, features::Vector{V})
nstumps = length(stumps)
counts = Dict()
for i in 1:nstumps
Expand All @@ -213,7 +210,7 @@ function apply_adaboost_stumps{T<:Union(Leaf,Node)}(stumps::Vector{T}, coeffs::V
return top_prediction
end

function apply_adaboost_stumps{T<:Union(Leaf,Node)}(stumps::Vector{T}, coeffs::Vector{Float64}, features::Matrix{Float64})
function apply_adaboost_stumps{T<:Union(Leaf,Node), U<:Real, V<:Real}(stumps::Vector{T}, coeffs::Vector{U}, features::Matrix{V})
N = size(features,1)
predictions = zeros(Any,N)
for i in 1:N
Expand Down
12 changes: 6 additions & 6 deletions src/measures.jl
Expand Up @@ -27,13 +27,13 @@ function _info_gain{T<:RealStr}(labels0::Vector{T}, labels1::Vector{T})
return H
end

## Negated weighted zero-one loss: sum the weights of every label that
## disagrees with the majority vote, then negate (so "higher is better"
## when this is used as a split-scoring criterion).
function _neg_z1_loss{T<:RealStr, U<:Real}(labels::Vector{T}, weights::Vector{U})
    mismatches = labels .!= majority_vote(labels)
    loss = sum(weights[mismatches])
    return -loss
end

## Weighted classification error: the total weight of mismatched predictions
## as a fraction of the total weight of all samples.
function _weighted_error{T<:RealStr, U<:Real}(actual::Vector{T}, predicted::Vector{T}, weights::Vector{U})
    mismatches = actual .!= predicted
    err = sum(weights[mismatches]) / sum(weights)
    return err
end
Expand All @@ -59,12 +59,12 @@ function majority_vote{T<:RealStr}(labels::Vector{T})
return top_vote
end

## Draw `nsamples` (label, feature-row) pairs uniformly at random WITH
## replacement — i.e. a bootstrap sample, as used by build_forest.
function sample{T<:RealStr, U<:Real}(labels::Vector{T}, features::Matrix{U}, nsamples::Integer)
    ## iceil maps rand()'s (0,1) draws onto indices 1:length(labels)
    inds = iceil(length(labels) * rand(nsamples)) ## with replacement
    return (labels[inds], features[inds,:])
end

function confusion_matrix{T<:RealStr}(actual::Vector{T}, predicted::Vector{T})
function confusion_matrix{T<:RealStr, U<:RealStr}(actual::Vector{T}, predicted::Vector{U})
@assert length(actual) == length(predicted)
N = length(actual)
_actual = zeros(Int,N)
Expand All @@ -89,7 +89,7 @@ function confusion_matrix{T<:RealStr}(actual::Vector{T}, predicted::Vector{T})
println("Kappa ", kappa)
end

function nfoldCV_forest{T<:RealStr}(labels::Vector{T}, features::Matrix{Float64}, nsubfeatures::Integer, ntrees::Integer, nfolds::Integer)
function nfoldCV_forest{T<:RealStr, U<:Real}(labels::Vector{T}, features::Matrix{U}, nsubfeatures::Integer, ntrees::Integer, nfolds::Integer)
if nfolds < 2 || ntrees < 1
return
end
Expand Down Expand Up @@ -117,7 +117,7 @@ function nfoldCV_forest{T<:RealStr}(labels::Vector{T}, features::Matrix{Float64}
end
end

function nfoldCV_stumps{T<:RealStr}(labels::Vector{T}, features::Matrix{Float64}, niterations::Integer, nfolds::Integer)
function nfoldCV_stumps{T<:RealStr, U<:Real}(labels::Vector{T}, features::Matrix{U}, niterations::Integer, nfolds::Integer)
if nfolds < 2 || niterations < 1
return
end
Expand Down
6 changes: 3 additions & 3 deletions test/iris.jl
Expand Up @@ -2,12 +2,12 @@ using RDatasets
using DecisionTree

iris = data("datasets", "iris")
features = convert(Array{Float64,2}, matrix(iris[:, 2:5]));
labels = convert(Array{UTF8String,1}, iris[:, "Species"]);
features = matrix(iris[:, 2:5]);
labels = vector(iris[:, "Species"]);

# train full-tree classifier
model = build_tree(labels, features);
# prune tree: merge leaves having > 90% combined purity
# prune tree: merge leaves having > 90% combined purity (default: 100%)
model = prune_tree(model, 0.9);
# apply learned model
apply_tree(model, [5.9,3.0,5.1,1.9])
Expand Down

0 comments on commit e4e8687

Please sign in to comment.