Permalink
Browse files

Automatically find the best value for epsilon

  • Loading branch information...
1 parent 429460f commit affb4ba7f17c64db3018f66a5504b49680ca37f9 Andrew Kane committed Dec 18, 2011
Showing with 136 additions and 57 deletions.
  1. +21 −24 README.md
  2. +11 −3 Rakefile
  3. +81 −19 lib/anomaly/detector.rb
  4. +23 −11 spec/anomaly/detector_spec.rb
View
@@ -16,7 +16,7 @@ And then execute:
bundle install
```
-For max performance (~ 2x faster), also install the NArray gem:
+For max performance (trains ~3x faster for large datasets), also install the NArray gem:
```ruby
gem "narray"
@@ -30,42 +30,39 @@ Say we have weather data for sunny days and we're trying to detect days that are
```ruby
# Each row is a different day.
-# [temperature (°F), humidity (%), pressure (in)]
-weather_data = [
- [85, 68, 10.4],
- [88, 62, 12.1],
- [86, 64, 13.6],
+# [temperature (°F), humidity (%), pressure (in), anomaly?(n=0, y=1)]
+weather_examples = [
+ [85, 68, 10.4, 0],
+ [88, 62, 12.1, 0],
+ [86, 64, 13.6, 0],
+ [88, 40, 11.1, 1],
...
]
```
-Train the detector with **only non-anomalies** (sunny days in our case).
+The last column **must** be 0 for non-anomalies, 1 for anomalies. Non-anomalies are used to train the detector, and both non-anomalies and anomalies are used to find the best value of ε.
-```ruby
-ad = Anomaly::Detector.new(weather_data)
-```
-
-That's it! Let's test for anomalies.
+To train the detector and test for anomalies, run:
```ruby
+ad = Anomaly::Detector.new(weather_examples)
+
# 79°F, 66% humidity, 12.3 in. pressure
-test_sample = [79, 66, 12.3]
-ad.probability(test_sample)
-# => 7.537174740907633e-08
+ad.anomaly?([79, 66, 12.3])
+# => true
```
-**Super-important:** You must select a threshold for anomalies (which we denote with ε - "epsilon")
-
-Probabilities less than ε are considered anomalies. If ε is higher, more things are considered anomalies.
+Anomaly automatically finds the best value for ε, which you can access with:
-``` ruby
-ad.anomaly?(test_sample, 1e-10)
-# => false
-ad.anomaly?(test_sample, 1e-5)
-# => true
```
+ad.eps
+```
+
+If you already know you want ε = 0.01, initialize the detector with:
-The wiki has [sample code](https://github.com/ankane/anomaly/wiki/Home) to help you find the best ε for your application.
+```
+ad = Anomaly::Detector.new(weather_examples, {:eps => 0.01})
+```
### Persistence
View
@@ -7,11 +7,19 @@ require "benchmark"
require "anomaly"
task :benchmark do # times Anomaly::Detector.new on one million rows, twice
- data = 1_000_000.times.map{ [rand, rand, rand, rand] }
+ examples = 1_000_000.times.map{ [rand, rand, rand, 0] } # three random features plus a 0 (non-anomaly) label
Benchmark.bm do |x|
- x.report { Anomaly::Detector.new(data) }
+ x.report { Anomaly::Detector.new(examples, {:eps => 0.5}) } # eps is supplied, so train does not search for one
require "narray" # loaded between the two reports; presumably defines NMatrix so the second run takes that branch of train
- x.report { Anomaly::Detector.new(data) }
+ x.report { Anomaly::Detector.new(examples, {:eps => 0.5}) }
end
end
+
+task :random_examples do # trains on synthetic labelled rows and prints the eps chosen by the detector
+ examples = 10_000.times.map{ [rand, rand(10), rand(100), 0] } +
+ 100.times.map{ [rand + 1, rand(10) + 2, rand(100) + 20, 1] } # 100 rows labelled 1 (anomaly), each feature shifted upward
+
+ ad = Anomaly::Detector.new(examples) # no :eps option, so train searches for eps
+ puts ad.eps
+end
View
@@ -1,51 +1,80 @@
module Anomaly
class Detector
+ attr_accessor :eps
- def initialize(data = nil)
+ def initialize(examples = nil, opts = {}) # examples: rows whose last column is the anomaly label; opts is passed through to train
@m = 0
- train(data) if data
+ train(examples, opts) if examples # with nil examples the detector stays untrained (@m remains 0)
end
- def train(data)
+ def train(examples, opts = {}) # Fits per-feature mean/std on the non-anomalies and sets @eps from opts[:eps] or an F1 search.
+ raise "No examples" if examples.empty?
+ raise "Must have at least two columns" if examples.first.size < 2 # only the first row is checked
+
+ # Divide into groups since we only want to train with non-anomalies.
+ anomalies = []
+ non_anomalies = []
+ examples.each do |example|
+ if example.last == 0 # last column is the label: 0 = non-anomaly, anything else = anomaly
+ non_anomalies << example
+ else
+ anomalies << example
+ end
+ end
+
+ raise "Must have at least one non-anomaly" if non_anomalies.empty?
+
+ @eps = (opts[:eps] || 0).to_f # a missing :eps becomes 0.0, which triggers the search below
+ if @eps > 0
+ # Use all non-anomalies to train.
+ training_examples = non_anomalies
+ else
+ training_examples, test_examples = partition!(non_anomalies) # hold out part of the non-anomalies for scoring eps candidates
+ test_examples.concat(anomalies) # anomalies are used for evaluation only, never for fitting
+ end
+ # Remove last column (the label), leaving only the features.
+ training_examples = training_examples.map{|e| e[0..-2]}
+ @m = training_examples.size # number of training rows
+ @n = training_examples.first.size # number of features per row
+
if defined?(NMatrix)
- d = NMatrix.to_na(data)
- @n, @m = d.sizes
- # Convert these to an array for Marshal.dump
- @mean = d.mean(1).to_a
- @std = d.stddev(1).to_a
+ training_examples = NMatrix.to_na(training_examples)
+ # Convert these to an Array for Marshal.dump
+ @mean = training_examples.mean(1).to_a
+ @std = training_examples.stddev(1).to_a
else
# Default to Array, since built-in Matrix does not give us a big performance advantage.
- d = data.to_a
- @m = d.size
- @n = d.first ? d.first.size : 0
- cols = @n.times.map{|i| d.map{|r| r[i]}}
+ cols = @n.times.map{|i| training_examples.map{|r| r[i]}} # transpose rows into one array per feature
@mean = cols.map{|c| mean(c)}
@std = cols.each_with_index.map{|c,i| std(c, @mean[i])}
end
@std.map!{|std| (std == 0 or std.nan?) ? Float::MIN : std} # swap zero/NaN deviations for Float::MIN; normal_pdf divides by std
+
+ if @eps == 0
+ # Find the best eps.
+ epss = (1..9).map{|i| [1,3,5,7,9].map{|j| (j*10**(-i)).to_f }}.flatten # candidates: 1,3,5,7,9 times 10^-1 through 10^-9
+ f1_scores = epss.map{|eps| [eps, compute_f1_score(test_examples, eps)] }
+ @eps, best_f1 = f1_scores.max_by{|v| v[1]} # keep the candidate with the highest F1; best_f1 is not used afterwards
+ end
end
def trained? # true once @m (set by train to the number of training rows) is positive
@m > 0
end
- def samples
- @m
- end
-
# Returns the product of the per-feature normal densities of x. Each density is
# limited to [0,1] (NaN or values above 1 become 1) to keep probabilities at same scale.
def probability(x)
raise "Train me first" unless trained?
- raise ArgumentError, "x must have #{@n} elements" if x.size != @n
+ raise ArgumentError, "First argument must have #{@n} elements" if x.size != @n
@n.times.map do |i|
p = normal_pdf(x[i], @mean[i], @std[i]) # density of feature i under the mean/std stored by train
(p.nan? or p > 1) ? 1 : p
end.reduce(1, :*)
end
- def anomaly?(x, epsilon)
- probability(x) < epsilon
+ def anomaly?(x, eps = @eps) # eps defaults to the threshold stored in @eps
+ probability(x) < eps # x counts as an anomaly when its probability is strictly below eps
end
protected
@@ -56,6 +85,39 @@ def normal_pdf(x, mean = 0, std = 1)
1/(SQRT2PI*std)*Math.exp(-((x - mean)**2/(2.0*(std**2))))
end
+ # Find best eps.
+
+ def partition!(examples, p_last = 0.2) # Shuffles examples in place and splits them; p_last is the fraction put in the second group.
+ examples.shuffle!
+ n = (examples.size * p_last).floor # size of the second group, rounded down (0 for small inputs)
+ [examples[n..-1], examples[0...n]] # [remaining examples, first n shuffled examples]
+ end
+
+ def compute_f1_score(examples, eps) # Scores anomaly?(features, eps) against the label in each example's last column.
+ tp = 0 # true positives
+ fp = 0 # false positives
+ fn = 0 # false negatives
+ examples.each do |example|
+ act = example.last != 0 # actual label: non-zero means anomaly
+ pred = self.anomaly?(example[0..-2], eps) # prediction from the features only (label column dropped)
+ if act and pred
+ tp += 1
+ elsif pred # and !act
+ fp += 1
+ elsif act # and !pred
+ fn += 1
+ end
+ end
+ f1_score(tp, fp, fn) # true negatives are not counted; f1_score takes only tp, fp and fn
+ end
+
+ def f1_score(tp, fp, fn) # Returns 2 * precision * recall / (precision + recall), or 0.0 when that is NaN.
+ precision = tp / (tp + fp).to_f # float divisor, so a zero denominator gives NaN rather than raising
+ recall = tp / (tp + fn).to_f
+ score = 2.0 * precision * recall / (precision + recall)
+ score.nan? ? 0.0 : score
+ end
+
# Not used for NArray
def mean(x)
@@ -1,8 +1,8 @@
require "spec_helper"
describe Anomaly::Detector do
- let(:data) { [[-1,-2],[0,0],[1,2]] }
- let(:ad) { Anomaly::Detector.new(data) }
+ let(:examples) { [[-1,-2,0],[0,0,0],[1,2,0]] }
+ let(:ad) { Anomaly::Detector.new(examples) }
# mean = [0, 0], std = [1, 2]
it "computes the right probability" do
@@ -14,7 +14,7 @@
end
context "when standard deviation is 0" do
- let(:data) { [[0],[0]] }
+ let(:examples) { [[0,0],[0,0]] }
it "returns infinity for mean" do
ad.probability([0]).should == 1
@@ -25,35 +25,47 @@
end
end
- context "when data is an array" do
- let(:data) { [[-1,-2],[0,0],[1,2]] }
+ context "when examples is an array" do
+ let(:examples) { [[-1,-2,0],[0,0,0],[1,2,0]] }
let(:sample) { [rand, rand] }
it "returns the same probability as an NMatrix" do
prob = ad.probability(sample)
Object.send(:remove_const, :NMatrix)
- prob.should == Anomaly::Detector.new(data).probability(sample)
+ prob.should == Anomaly::Detector.new(examples).probability(sample)
end
end
context "when lots of samples" do
- let(:data) { m.times.map{[0]} }
+ let(:examples) { m.times.map{[0,0]} }
let(:m) { rand(100) + 1 }
- it { ad.samples.should == m }
it { ad.trained?.should be_true }
end
context "when no samples" do
- let(:data) { [] }
+ let(:examples) { nil }
- it { ad.samples.should == 0 }
it { ad.trained?.should be_false }
end
context "when pdf is greater than 1" do
- let(:data) { 100.times.map{[0]}.push([1]) }
+ let(:examples) { 100.times.map{[0,0]}.push([1,0]) }
it { ad.probability([0]).should == 1 }
end
+
+ context "when only anomalies" do
+ let(:examples) { [[0,1]] }
+
+ it "raises error" do
+ expect{ ad }.to raise_error RuntimeError, "Must have at least one non-anomaly"
+ end
+ end
+
+ context "when only one non-anomaly" do
+ let(:examples) { [[0,0]] }
+
+ it { ad.eps.should == 1e-1 }
+ end
end

0 comments on commit affb4ba

Please sign in to comment.