forked from JuliaAI/DecisionTree.jl
/
measures.jl
142 lines (133 loc) · 4.28 KB
/
measures.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
"""
    _set_entropy(labels)

Shannon entropy (natural log) of the empirical label distribution:
`H = -Σ (v/N) log(v/N)`, computed in the numerically equivalent form
`log(N) - (Σ v log v) / N`. Returns `-Inf`/`NaN` for empty input,
matching the original implementation's degenerate behavior.
"""
function _set_entropy(labels::AbstractVector)
    N = length(labels)
    # Typed Dict keeps lookups and the accumulator type-stable.
    counts = Dict{eltype(labels),Int}()
    for label in labels
        counts[label] = get(counts, label, 0) + 1
    end
    entropy = 0.0  # Float64 accumulator avoids Int -> Float64 widening mid-loop
    for v in values(counts)
        if v > 0
            entropy += v * log(v)
        end
    end
    return log(N) - entropy / N
end
"""
    _info_gain(labels0, labels1)

Negative size-weighted entropy of a two-way split (larger is better).
The parent-entropy term is constant across candidate splits, so this
quantity ranks splits identically to true information gain.
"""
function _info_gain(labels0::AbstractVector, labels1::AbstractVector)
    N0 = length(labels0)
    N1 = length(labels1)
    N = N0 + N1
    return -N0 / N * _set_entropy(labels0) - N1 / N * _set_entropy(labels1)
end
"""
    _neg_z1_loss(labels, weights)

Negative weighted zero-one loss of predicting the majority label for the
whole set: `-Σ weights[i]` over the labels that disagree with the
majority vote. Larger (closer to 0) is better.
"""
function _neg_z1_loss(labels::AbstractVector, weights::AbstractVector{<:Real})
    # Fixed local-variable typo: missmatches -> mismatches.
    mismatches = labels .!= majority_vote(labels)
    return -sum(weights[mismatches])
end
"""
    _weighted_error(actual, predicted, weights)

Weighted misclassification rate: the total weight of positions where
`actual` and `predicted` disagree, normalized by the total weight.
"""
function _weighted_error(actual::AbstractVector, predicted::AbstractVector,
                         weights::AbstractVector{<:Real})
    mismatches = actual .!= predicted
    return sum(weights[mismatches]) / sum(weights)
end
"""
    majority_vote(labels)

Return the most frequent value in `labels` (ties broken by Dict
iteration order, as in the original). Returns `nothing` for an empty
input instead of the long-removed `None`.
"""
function majority_vote(labels::AbstractVector)
    isempty(labels) && return nothing
    counts = Dict{eltype(labels),Int}()
    for label in labels
        # The original seeded a label's first occurrence at 0 (an off-by-one
        # that shifted every count equally, so the winner was unaffected);
        # counts are now exact.
        counts[label] = get(counts, label, 0) + 1
    end
    top_vote = first(labels)
    top_count = 0
    for (label, cnt) in counts
        if cnt > top_count
            top_vote = label
            top_count = cnt
        end
    end
    return top_vote
end
"""
    sample(labels, features, nsamples)

Bootstrap sample: draw `nsamples` row indices uniformly with replacement
and return `(labels[inds], features[inds, :])`.
"""
function sample(labels::AbstractVector, features::AbstractMatrix, nsamples::Integer)
    # rand(1:n, k) replaces the removed iceil(n * rand(k)) idiom, which
    # could also produce an out-of-range index 0 when rand() returned 0.
    inds = rand(1:length(labels), nsamples)  # with replacement
    return (labels[inds], features[inds, :])
end
"""
    confusion_matrix(actual, predicted)

Print the class list, the confusion matrix, the accuracy, and Cohen's
kappa for two equal-length label vectors. Output goes to `stdout`;
returns `nothing`.
"""
function confusion_matrix(actual::AbstractVector, predicted::AbstractVector)
    # Input validation belongs in a real check, not @assert (which may be
    # compiled out).
    length(actual) == length(predicted) ||
        throw(DimensionMismatch("actual and predicted must have equal length"))
    # vcat replaces the old [a, b] concatenation (now a 2-element vector of
    # vectors).
    classes = sort(unique(vcat(actual, predicted)))
    N = length(classes)
    # Map each label to its 1-based class index.
    _actual = [findfirst(isequal(a), classes) for a in actual]
    _predicted = [findfirst(isequal(p), classes) for p in predicted]
    CM = zeros(Int, N, N)
    for (i, j) in zip(_actual, _predicted)
        CM[i, j] += 1
    end
    total = sum(CM)
    accuracy = sum((CM[k, k] for k in 1:N); init=0) / total
    # Chance agreement for kappa: sum_k P(predicted = k) * P(actual = k).
    # (1 x N row-sum matrix) * (N x 1 column-sum matrix) -> 1 x 1.
    prob_chance = (sum(CM; dims=1) * sum(CM; dims=2))[1] / total^2
    kappa = (accuracy - prob_chance) / (1.0 - prob_chance)
    println(classes)
    println(CM)
    println("Accuracy ", accuracy)
    println("Kappa ", kappa)
    return nothing
end
"""
    nfoldCV_forest(labels, features, nsubfeatures, ntrees, nfolds)

Run `nfolds`-fold cross-validation of a random forest (`ntrees` trees,
`nsubfeatures` candidate features per split), printing a confusion
matrix per fold. With `ntrees == 1` a single tree is built instead.
Returns early (doing nothing) when `nfolds < 2` or `ntrees < 1`.
"""
function nfoldCV_forest(labels::AbstractVector, features::AbstractMatrix,
                        nsubfeatures::Integer, ntrees::Integer, nfolds::Integer)
    (nfolds < 2 || ntrees < 1) && return
    N = length(labels)
    ntest = fld(N, nfolds)  # fld replaces the removed ifloor
    # Base-only random permutation; avoids needing `using Random` for randperm.
    inds = sortperm(rand(N))
    for fold in 1:nfolds
        test_mask = falses(N)
        # Broadcast assignment (.=) replaces the old scalar-into-slice form.
        test_mask[(fold - 1) * ntest + 1 : fold * ntest] .= true
        train_mask = .!test_mask  # elementwise negation replaces !bitvector
        test_features = features[inds[test_mask], :]
        test_labels = labels[inds[test_mask]]
        train_features = features[inds[train_mask], :]
        train_labels = labels[inds[train_mask]]
        if ntrees == 1
            model = build_tree(train_labels, train_features, nsubfeatures)
            predictions = apply_tree(model, test_features)
        else
            model = build_forest(train_labels, train_features, nsubfeatures, ntrees)
            predictions = apply_forest(model, test_features)
        end
        println()
        println("Fold ", fold)
        confusion_matrix(test_labels, predictions)
    end
end
"""
    nfoldCV_stumps(labels, features, niterations, nfolds)

Run `nfolds`-fold cross-validation of AdaBoost over decision stumps
(`niterations` boosting rounds), printing a confusion matrix per fold.
Returns early (doing nothing) when `nfolds < 2` or `niterations < 1`.
"""
function nfoldCV_stumps(labels::AbstractVector, features::AbstractMatrix,
                        niterations::Integer, nfolds::Integer)
    (nfolds < 2 || niterations < 1) && return
    N = length(labels)
    ntest = fld(N, nfolds)  # fld replaces the removed ifloor
    # Base-only random permutation; avoids needing `using Random` for randperm.
    inds = sortperm(rand(N))
    for fold in 1:nfolds
        test_mask = falses(N)
        # Broadcast assignment (.=) replaces the old scalar-into-slice form.
        test_mask[(fold - 1) * ntest + 1 : fold * ntest] .= true
        train_mask = .!test_mask  # elementwise negation replaces !bitvector
        test_features = features[inds[test_mask], :]
        test_labels = labels[inds[test_mask]]
        train_features = features[inds[train_mask], :]
        train_labels = labels[inds[train_mask]]
        model, coeffs = build_adaboost_stumps(train_labels, train_features, niterations)
        predictions = apply_adaboost_stumps(model, coeffs, test_features)
        println()
        println("Fold ", fold)
        confusion_matrix(test_labels, predictions)
    end
end