/
detector.rb
133 lines (112 loc) · 3.57 KB
/
detector.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
module Anomaly
  # Gaussian anomaly detector.
  #
  # Models each feature of the non-anomalous training examples as an
  # independent normal distribution; an example is flagged as an anomaly
  # when the product of its per-feature densities falls below a
  # threshold eps.
  #
  # Examples are arrays whose last element is the label (0 = normal,
  # anything else = anomaly) and whose remaining elements are features.
  class Detector
    attr_reader :mean, :std
    attr_accessor :eps

    # examples - optional labeled training data; trains immediately when given
    # opts     - options forwarded to #train (currently :eps)
    def initialize(examples = nil, opts = {})
      @m = 0
      # Initialize eps so #eps never returns nil on an untrained detector.
      @eps = 0.0
      train(examples, opts) if examples
    end

    # Fit the per-feature mean/std from the non-anomalous examples.
    #
    # When opts[:eps] is given (and positive) all non-anomalies are used
    # for training; otherwise a random hold-out set (plus all anomalies)
    # is used to grid-search the eps with the best F1 score.
    #
    # Raises RuntimeError on empty input, rows with fewer than two
    # columns, or when no non-anomalous example is present.
    def train(examples, opts = {})
      raise "No examples" if examples.empty?
      raise "Must have at least two columns" if examples.first.size < 2

      # Split by label: only non-anomalies are used to fit the model.
      anomalies, non_anomalies = examples.partition { |e| e.last != 0 }
      raise "Must have at least one non-anomaly" if non_anomalies.empty?

      @eps = (opts[:eps] || 0).to_f
      if @eps > 0
        # eps supplied: use all non-anomalies to train.
        training_examples = non_anomalies
      else
        # Hold out part of the non-anomalies (plus all anomalies) for the
        # eps search below.
        training_examples, test_examples = partition!(non_anomalies)
        test_examples.concat(anomalies)
      end

      # Strip the label column.
      training_examples = training_examples.map { |e| e[0..-2] }
      @m = training_examples.size
      @n = training_examples.first.size

      if defined?(NMatrix)
        training_examples = NMatrix.to_na(training_examples)
        # Convert these to an Array for Marshal.dump
        @mean = training_examples.mean(1).to_a
        @std = training_examples.stddev(1).to_a
      else
        # Default to Array, since built-in Matrix does not give us a big
        # performance advantage.
        cols = @n.times.map { |i| training_examples.map { |r| r[i] } }
        @mean = cols.map { |c| alt_mean(c) }
        @std = cols.each_with_index.map { |c, i| alt_std(c, @mean[i]) }
      end
      # Guard against zero/NaN std (constant feature or a single training
      # example), which would blow up the density computation.
      @std.map! { |std| (std == 0 || std.nan?) ? Float::MIN : std }

      if @eps == 0
        # Grid-search eps over several orders of magnitude and keep the
        # value with the best F1 score on the held-out set.
        epss = (1..9).flat_map { |i| [1, 3, 5, 7, 9].map { |j| (j * 10**-i).to_f } }
        f1_scores = epss.map { |eps| [eps, compute_f1_score(test_examples, eps)] }
        @eps, _best_f1 = f1_scores.max_by { |v| v[1] }
      end
    end

    # True once the model has been fit.
    def trained?
      @m > 0
    end

    # Product of per-feature normal densities at x.
    #
    # Each factor is clamped to [0, 1] to keep probabilities at the same
    # scale. Raises unless trained; raises ArgumentError when x has the
    # wrong number of features.
    def probability(x)
      raise "Train me first" unless trained?
      raise ArgumentError, "First argument must have #{@n} elements" if x.size != @n
      @n.times.map do |i|
        p = normal_pdf(x[i], @mean[i], @std[i])
        (p.nan? || p > 1) ? 1 : p
      end.reduce(1, :*)
    end

    # True when the example's probability falls below eps.
    def anomaly?(x, eps = @eps)
      probability(x) < eps
    end

    protected

    SQRT2PI = Math.sqrt(2 * Math::PI)

    # Density of the normal distribution N(mean, std^2) at x.
    def normal_pdf(x, mean = 0, std = 1)
      1 / (SQRT2PI * std) * Math.exp(-((x - mean)**2 / (2.0 * (std**2))))
    end

    # Randomly split examples, returning [training, held_out] where the
    # held-out portion is roughly p_last of the whole. Shuffles in place.
    def partition!(examples, p_last = 0.2)
      examples.shuffle!
      n = (examples.size * p_last).floor
      [examples[n..-1], examples[0...n]]
    end

    # F1 score of anomaly?(..., eps) against the labeled examples.
    def compute_f1_score(examples, eps)
      tp = 0
      fp = 0
      fn = 0
      examples.each do |example|
        act = example.last != 0
        pred = self.anomaly?(example[0..-2], eps)
        if act && pred
          tp += 1
        elsif pred # and !act
          fp += 1
        elsif act # and !pred
          fn += 1
        end
      end
      f1_score(tp, fp, fn)
    end

    # Harmonic mean of precision and recall; 0.0 when undefined
    # (no positives at all).
    def f1_score(tp, fp, fn)
      precision = tp / (tp + fp).to_f
      recall = tp / (tp + fn).to_f
      score = 2.0 * precision * recall / (precision + recall)
      score.nan? ? 0.0 : score
    end

    # Fallback mean, used when NArray is not available.
    def alt_mean(x)
      x.inject(0.0) { |a, i| a + i } / x.size
    end

    # Fallback sample standard deviation (n - 1 denominator).
    def alt_std(x, mean)
      Math.sqrt(x.inject(0.0) { |a, i| a + (i - mean)**2 } / (x.size - 1))
    end
  end
end