Skip to content

Commit

Permalink
Automatically find the best value for epsilon
Browse files Browse the repository at this point in the history
  • Loading branch information
Andrew Kane committed Dec 18, 2011
1 parent 429460f commit affb4ba
Show file tree
Hide file tree
Showing 4 changed files with 136 additions and 57 deletions.
45 changes: 21 additions & 24 deletions README.md
Original file line number Original file line Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ And then execute:
bundle install bundle install
``` ```


For max performance (~ 2x faster), also install the NArray gem: For max performance (trains ~3x faster for large datasets), also install the NArray gem:


```ruby ```ruby
gem "narray" gem "narray"
Expand All @@ -30,42 +30,39 @@ Say we have weather data for sunny days and we're trying to detect days that are


```ruby ```ruby
# Each row is a different day. # Each row is a different day.
# [temperature (°F), humidity (%), pressure (in)] # [temperature (°F), humidity (%), pressure (in), anomaly?(n=0, y=1)]
weather_data = [ weather_examples = [
[85, 68, 10.4], [85, 68, 10.4, 0],
[88, 62, 12.1], [88, 62, 12.1, 0],
[86, 64, 13.6], [86, 64, 13.6, 0],
[88, 40, 11.1, 1],
... ...
] ]
``` ```


Train the detector with **only non-anomalies** (sunny days in our case). The last column **must** be 0 for non-anomalies, 1 for anomalies. Non-anomalies are used to train the detector, and both non-anomalies and anomalies are used to find the best value of ε.


```ruby To train the detector and test for anomalies, run:
ad = Anomaly::Detector.new(weather_data)
```

That's it! Let's test for anomalies.


```ruby ```ruby
ad = Anomaly::Detector.new(weather_examples)

# 79°F, 66% humidity, 12.3 in. pressure # 79°F, 66% humidity, 12.3 in. pressure
test_sample = [79, 66, 12.3] ad.anomaly?([79, 66, 12.3])
ad.probability(test_sample) # => true
# => 7.537174740907633e-08
``` ```


**Super-important:** You must select a threshold for anomalies (which we denote with ε - "epsilon") Anomaly automatically finds the best value for ε, which you can access with:

Probabilities less than ε are considered anomalies. If ε is higher, more things are considered anomalies.


``` ruby
ad.anomaly?(test_sample, 1e-10)
# => false
ad.anomaly?(test_sample, 1e-5)
# => true
``` ```
ad.eps
```

If you already know you want ε = 0.01, initialize the detector with:


The wiki has [sample code](https://github.com/ankane/anomaly/wiki/Home) to help you find the best ε for your application. ```
ad = Anomaly::Detector.new(weather_examples, {:eps => 0.01})
```


### Persistence ### Persistence


Expand Down
14 changes: 11 additions & 3 deletions Rakefile
Original file line number Original file line Diff line number Diff line change
Expand Up @@ -7,11 +7,19 @@ require "benchmark"
require "anomaly" require "anomaly"


task :benchmark do task :benchmark do
data = 1_000_000.times.map{ [rand, rand, rand, rand] } examples = 1_000_000.times.map{ [rand, rand, rand, 0] }


Benchmark.bm do |x| Benchmark.bm do |x|
x.report { Anomaly::Detector.new(data) } x.report { Anomaly::Detector.new(examples, {:eps => 0.5}) }
require "narray" require "narray"
x.report { Anomaly::Detector.new(data) } x.report { Anomaly::Detector.new(examples, {:eps => 0.5}) }
end end
end end

# Quick manual check of the automatic epsilon search: trains on synthetic
# data and prints the ε the detector selects.
task :random_examples do
  # 10,000 non-anomalies (last column 0) plus 100 anomalies (last column 1)
  # whose features are shifted upward so they are separable from the rest.
  examples = 10_000.times.map{ [rand, rand(10), rand(100), 0] } +
    100.times.map{ [rand + 1, rand(10) + 2, rand(100) + 20, 1] }

  # Train with no :eps option so the detector searches for the best ε itself.
  ad = Anomaly::Detector.new(examples)
  puts ad.eps
end
100 changes: 81 additions & 19 deletions lib/anomaly/detector.rb
Original file line number Original file line Diff line number Diff line change
@@ -1,51 +1,80 @@
module Anomaly module Anomaly
class Detector class Detector
attr_accessor :eps


def initialize(data = nil) def initialize(examples = nil, opts = {})
@m = 0 @m = 0
train(data) if data train(examples, opts) if examples
end end


def train(data) def train(examples, opts = {})
raise "No examples" if examples.empty?
raise "Must have at least two columns" if examples.first.size < 2

# Divide into groups since we only want to train with non-anomalies.
anomalies = []
non_anomalies = []
examples.each do |example|
if example.last == 0
non_anomalies << example
else
anomalies << example
end
end

raise "Must have at least one non-anomaly" if non_anomalies.empty?

@eps = (opts[:eps] || 0).to_f
if @eps > 0
# Use all non-anomalies to train.
training_examples = non_anomalies
else
training_examples, test_examples = partition!(non_anomalies)
test_examples.concat(anomalies)
end
# Remove last column.
training_examples = training_examples.map{|e| e[0..-2]}
@m = training_examples.size
@n = training_examples.first.size

if defined?(NMatrix) if defined?(NMatrix)
d = NMatrix.to_na(data) training_examples = NMatrix.to_na(training_examples)
@n, @m = d.sizes # Convert these to an Array for Marshal.dump
# Convert these to an array for Marshal.dump @mean = training_examples.mean(1).to_a
@mean = d.mean(1).to_a @std = training_examples.stddev(1).to_a
@std = d.stddev(1).to_a
else else
# Default to Array, since built-in Matrix does not give us a big performance advantage. # Default to Array, since built-in Matrix does not give us a big performance advantage.
d = data.to_a cols = @n.times.map{|i| training_examples.map{|r| r[i]}}
@m = d.size
@n = d.first ? d.first.size : 0
cols = @n.times.map{|i| d.map{|r| r[i]}}
@mean = cols.map{|c| mean(c)} @mean = cols.map{|c| mean(c)}
@std = cols.each_with_index.map{|c,i| std(c, @mean[i])} @std = cols.each_with_index.map{|c,i| std(c, @mean[i])}
end end
@std.map!{|std| (std == 0 or std.nan?) ? Float::MIN : std} @std.map!{|std| (std == 0 or std.nan?) ? Float::MIN : std}

if @eps == 0
# Find the best eps.
epss = (1..9).map{|i| [1,3,5,7,9].map{|j| (j*10**(-i)).to_f }}.flatten
f1_scores = epss.map{|eps| [eps, compute_f1_score(test_examples, eps)] }
@eps, best_f1 = f1_scores.max_by{|v| v[1]}
end
end end


def trained? def trained?
@m > 0 @m > 0
end end


def samples
@m
end

# Limit the probability of features to [0,1] # Limit the probability of features to [0,1]
# to keep probabilities at same scale. # to keep probabilities at same scale.
def probability(x) def probability(x)
raise "Train me first" unless trained? raise "Train me first" unless trained?
raise ArgumentError, "x must have #{@n} elements" if x.size != @n raise ArgumentError, "First argument must have #{@n} elements" if x.size != @n
@n.times.map do |i| @n.times.map do |i|
p = normal_pdf(x[i], @mean[i], @std[i]) p = normal_pdf(x[i], @mean[i], @std[i])
(p.nan? or p > 1) ? 1 : p (p.nan? or p > 1) ? 1 : p
end.reduce(1, :*) end.reduce(1, :*)
end end


def anomaly?(x, epsilon) def anomaly?(x, eps = @eps)
probability(x) < epsilon probability(x) < eps
end end


protected protected
Expand All @@ -56,6 +85,39 @@ def normal_pdf(x, mean = 0, std = 1)
1/(SQRT2PI*std)*Math.exp(-((x - mean)**2/(2.0*(std**2)))) 1/(SQRT2PI*std)*Math.exp(-((x - mean)**2/(2.0*(std**2))))
end end


# Find best eps.

# Randomly split +examples+ into a training set and a held-out set.
#
# Bang method: shuffles +examples+ in place before splitting. The held-out
# fraction is +p_last+ (default 20%), rounded down to a whole number of rows.
# Returns [training_examples, held_out_examples].
def partition!(examples, p_last = 0.2)
  examples.shuffle!
  holdout_size = (examples.size * p_last).floor
  training = examples.drop(holdout_size)
  holdout = examples.take(holdout_size)
  [training, holdout]
end

# Score threshold +eps+ against labeled +examples+ (last column non-zero
# means anomaly) by tallying a confusion count for each row and reducing
# the tallies to an F1 score. True negatives do not affect F1, so they
# are not counted.
def compute_f1_score(examples, eps)
  true_pos = 0
  false_pos = 0
  false_neg = 0
  examples.each do |row|
    actual = row.last != 0
    predicted = anomaly?(row[0..-2], eps)
    case
    when actual && predicted then true_pos += 1  # correctly flagged
    when predicted           then false_pos += 1 # flagged, but not an anomaly
    when actual              then false_neg += 1 # anomaly we missed
    end
  end
  f1_score(true_pos, false_pos, false_neg)
end

# Harmonic mean of precision and recall from raw confusion counts.
# Returns 0.0 when there are no true positives (the precision/recall
# ratios would otherwise be 0/0 = NaN).
def f1_score(tp, fp, fn)
  return 0.0 if tp == 0
  precision = tp / (tp + fp).to_f
  recall = tp / (tp + fn).to_f
  2.0 * precision * recall / (precision + recall)
end

# Not used for NArray # Not used for NArray


def mean(x) def mean(x)
Expand Down
34 changes: 23 additions & 11 deletions spec/anomaly/detector_spec.rb
Original file line number Original file line Diff line number Diff line change
@@ -1,8 +1,8 @@
require "spec_helper" require "spec_helper"


describe Anomaly::Detector do describe Anomaly::Detector do
let(:data) { [[-1,-2],[0,0],[1,2]] } let(:examples) { [[-1,-2,0],[0,0,0],[1,2,0]] }
let(:ad) { Anomaly::Detector.new(data) } let(:ad) { Anomaly::Detector.new(examples) }


# mean = [0, 0], std = [1, 2] # mean = [0, 0], std = [1, 2]
it "computes the right probability" do it "computes the right probability" do
Expand All @@ -14,7 +14,7 @@
end end


context "when standard deviation is 0" do context "when standard deviation is 0" do
let(:data) { [[0],[0]] } let(:examples) { [[0,0],[0,0]] }


it "returns infinity for mean" do it "returns infinity for mean" do
ad.probability([0]).should == 1 ad.probability([0]).should == 1
Expand All @@ -25,35 +25,47 @@
end end
end end


context "when data is an array" do context "when examples is an array" do
let(:data) { [[-1,-2],[0,0],[1,2]] } let(:examples) { [[-1,-2,0],[0,0,0],[1,2,0]] }
let(:sample) { [rand, rand] } let(:sample) { [rand, rand] }


it "returns the same probability as an NMatrix" do it "returns the same probability as an NMatrix" do
prob = ad.probability(sample) prob = ad.probability(sample)
Object.send(:remove_const, :NMatrix) Object.send(:remove_const, :NMatrix)
prob.should == Anomaly::Detector.new(data).probability(sample) prob.should == Anomaly::Detector.new(examples).probability(sample)
end end
end end


context "when lots of samples" do context "when lots of samples" do
let(:data) { m.times.map{[0]} } let(:examples) { m.times.map{[0,0]} }
let(:m) { rand(100) + 1 } let(:m) { rand(100) + 1 }


it { ad.samples.should == m }
it { ad.trained?.should be_true } it { ad.trained?.should be_true }
end end


context "when no samples" do context "when no samples" do
let(:data) { [] } let(:examples) { nil }


it { ad.samples.should == 0 }
it { ad.trained?.should be_false } it { ad.trained?.should be_false }
end end


context "when pdf is greater than 1" do context "when pdf is greater than 1" do
let(:data) { 100.times.map{[0]}.push([1]) } let(:examples) { 100.times.map{[0,0]}.push([1,0]) }


it { ad.probability([0]).should == 1 } it { ad.probability([0]).should == 1 }
end end

context "when only anomalies" do
let(:examples) { [[0,1]] }

it "raises error" do
expect{ ad }.to raise_error RuntimeError, "Must have at least one non-anomaly"
end
end

context "when only one non-anomaly" do
let(:examples) { [[0,0]] }

it { ad.eps.should == 1e-1 }
end
end end

0 comments on commit affb4ba

Please sign in to comment.