Skip to content

Commit

Permalink
Automatically find the best value for epsilon
Browse files Browse the repository at this point in the history
  • Loading branch information
Andrew Kane committed Dec 18, 2011
1 parent 429460f commit affb4ba
Show file tree
Hide file tree
Showing 4 changed files with 136 additions and 57 deletions.
45 changes: 21 additions & 24 deletions README.md
Original file line number Original file line Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ And then execute:
bundle install bundle install
``` ```


For max performance (~ 2x faster), also install the NArray gem: For max performance (trains ~3x faster for large datasets), also install the NArray gem:


```ruby ```ruby
gem "narray" gem "narray"
Expand All @@ -30,42 +30,39 @@ Say we have weather data for sunny days and we're trying to detect days that are


```ruby ```ruby
# Each row is a different day. # Each row is a different day.
# [temperature (°F), humidity (%), pressure (in)] # [temperature (°F), humidity (%), pressure (in), anomaly?(n=0, y=1)]
weather_data = [ weather_examples = [
[85, 68, 10.4], [85, 68, 10.4, 0],
[88, 62, 12.1], [88, 62, 12.1, 0],
[86, 64, 13.6], [86, 64, 13.6, 0],
[88, 40, 11.1, 1],
... ...
] ]
``` ```


Train the detector with **only non-anomalies** (sunny days in our case). The last column **must** be 0 for non-anomalies, 1 for anomalies. Non-anomalies are used to train the detector, and both non-anomalies and anomalies are used to find the best value of ε.


```ruby To train the detector and test for anomalies, run:
ad = Anomaly::Detector.new(weather_data)
```

That's it! Let's test for anomalies.


```ruby ```ruby
ad = Anomaly::Detector.new(weather_examples)

# 79°F, 66% humidity, 12.3 in. pressure # 79°F, 66% humidity, 12.3 in. pressure
test_sample = [79, 66, 12.3] ad.anomaly?([79, 66, 12.3])
ad.probability(test_sample) # => true
# => 7.537174740907633e-08
``` ```


**Super-important:** You must select a threshold for anomalies (which we denote with ε - "epsilon") Anomaly automatically finds the best value for ε, which you can access with:

Probabilities less than ε are considered anomalies. If ε is higher, more things are considered anomalies.


``` ruby
ad.anomaly?(test_sample, 1e-10)
# => false
ad.anomaly?(test_sample, 1e-5)
# => true
``` ```
ad.eps
```

If you already know you want ε = 0.01, initialize the detector with:


The wiki has [sample code](https://github.com/ankane/anomaly/wiki/Home) to help you find the best ε for your application. ```
ad = Anomaly::Detector.new(weather_examples, {:eps => 0.01})
```


### Persistence ### Persistence


Expand Down
14 changes: 11 additions & 3 deletions Rakefile
Original file line number Original file line Diff line number Diff line change
Expand Up @@ -7,11 +7,19 @@ require "benchmark"
require "anomaly" require "anomaly"


task :benchmark do task :benchmark do
data = 1_000_000.times.map{ [rand, rand, rand, rand] } examples = 1_000_000.times.map{ [rand, rand, rand, 0] }


Benchmark.bm do |x| Benchmark.bm do |x|
x.report { Anomaly::Detector.new(data) } x.report { Anomaly::Detector.new(examples, {:eps => 0.5}) }
require "narray" require "narray"
x.report { Anomaly::Detector.new(data) } x.report { Anomaly::Detector.new(examples, {:eps => 0.5}) }
end end
end end

# Quick manual check of the automatic epsilon search: trains on synthetic
# data and prints the ε the detector selects.
task :random_examples do
  # 10,000 non-anomalies (last column 0) plus 100 anomalies (last column 1)
  # whose features are shifted upward so they are separable from the rest.
  examples = 10_000.times.map{ [rand, rand(10), rand(100), 0] } +
    100.times.map{ [rand + 1, rand(10) + 2, rand(100) + 20, 1] }

  # Train with no :eps option so the detector searches for the best ε itself.
  ad = Anomaly::Detector.new(examples)
  puts ad.eps
end
100 changes: 81 additions & 19 deletions lib/anomaly/detector.rb
Original file line number Original file line Diff line number Diff line change
@@ -1,51 +1,80 @@
module Anomaly module Anomaly
class Detector class Detector
attr_accessor :eps


def initialize(data = nil) def initialize(examples = nil, opts = {})
@m = 0 @m = 0
train(data) if data train(examples, opts) if examples
end end


def train(data) def train(examples, opts = {})
raise "No examples" if examples.empty?
raise "Must have at least two columns" if examples.first.size < 2

# Divide into groups since we only want to train with non-anomalies.
anomalies = []
non_anomalies = []
examples.each do |example|
if example.last == 0
non_anomalies << example
else
anomalies << example
end
end

raise "Must have at least one non-anomaly" if non_anomalies.empty?

@eps = (opts[:eps] || 0).to_f
if @eps > 0
# Use all non-anomalies to train.
training_examples = non_anomalies
else
training_examples, test_examples = partition!(non_anomalies)
test_examples.concat(anomalies)
end
# Remove last column.
training_examples = training_examples.map{|e| e[0..-2]}
@m = training_examples.size
@n = training_examples.first.size

if defined?(NMatrix) if defined?(NMatrix)
d = NMatrix.to_na(data) training_examples = NMatrix.to_na(training_examples)
@n, @m = d.sizes # Convert these to an Array for Marshal.dump
# Convert these to an array for Marshal.dump @mean = training_examples.mean(1).to_a
@mean = d.mean(1).to_a @std = training_examples.stddev(1).to_a
@std = d.stddev(1).to_a
else else
# Default to Array, since built-in Matrix does not give us a big performance advantage. # Default to Array, since built-in Matrix does not give us a big performance advantage.
d = data.to_a cols = @n.times.map{|i| training_examples.map{|r| r[i]}}
@m = d.size
@n = d.first ? d.first.size : 0
cols = @n.times.map{|i| d.map{|r| r[i]}}
@mean = cols.map{|c| mean(c)} @mean = cols.map{|c| mean(c)}
@std = cols.each_with_index.map{|c,i| std(c, @mean[i])} @std = cols.each_with_index.map{|c,i| std(c, @mean[i])}
end end
@std.map!{|std| (std == 0 or std.nan?) ? Float::MIN : std} @std.map!{|std| (std == 0 or std.nan?) ? Float::MIN : std}

if @eps == 0
# Find the best eps.
epss = (1..9).map{|i| [1,3,5,7,9].map{|j| (j*10**(-i)).to_f }}.flatten
f1_scores = epss.map{|eps| [eps, compute_f1_score(test_examples, eps)] }
@eps, best_f1 = f1_scores.max_by{|v| v[1]}
end
end end


def trained? def trained?
@m > 0 @m > 0
end end


def samples
@m
end

# Limit the probability of features to [0,1] # Limit the probability of features to [0,1]
# to keep probabilities at same scale. # to keep probabilities at same scale.
def probability(x) def probability(x)
raise "Train me first" unless trained? raise "Train me first" unless trained?
raise ArgumentError, "x must have #{@n} elements" if x.size != @n raise ArgumentError, "First argument must have #{@n} elements" if x.size != @n
@n.times.map do |i| @n.times.map do |i|
p = normal_pdf(x[i], @mean[i], @std[i]) p = normal_pdf(x[i], @mean[i], @std[i])
(p.nan? or p > 1) ? 1 : p (p.nan? or p > 1) ? 1 : p
end.reduce(1, :*) end.reduce(1, :*)
end end


def anomaly?(x, epsilon) def anomaly?(x, eps = @eps)
probability(x) < epsilon probability(x) < eps
end end


protected protected
Expand All @@ -56,6 +85,39 @@ def normal_pdf(x, mean = 0, std = 1)
1/(SQRT2PI*std)*Math.exp(-((x - mean)**2/(2.0*(std**2)))) 1/(SQRT2PI*std)*Math.exp(-((x - mean)**2/(2.0*(std**2))))
end end


# Find best eps.

# Randomly split +examples+ into a training set and a held-out set.
#
# Bang method: shuffles +examples+ in place before splitting. The held-out
# fraction is +p_last+ (default 20%), rounded down to a whole number of rows.
# Returns [training_examples, held_out_examples].
def partition!(examples, p_last = 0.2)
  examples.shuffle!
  holdout_size = (examples.size * p_last).floor
  training = examples.drop(holdout_size)
  holdout = examples.take(holdout_size)
  [training, holdout]
end

# Score threshold +eps+ against labeled +examples+ (last column non-zero
# means anomaly) by tallying a confusion count for each row and reducing
# the tallies to an F1 score. True negatives do not affect F1, so they
# are not counted.
def compute_f1_score(examples, eps)
  true_pos = 0
  false_pos = 0
  false_neg = 0
  examples.each do |row|
    actual = row.last != 0
    predicted = anomaly?(row[0..-2], eps)
    case
    when actual && predicted then true_pos += 1  # correctly flagged
    when predicted           then false_pos += 1 # flagged, but not an anomaly
    when actual              then false_neg += 1 # anomaly we missed
    end
  end
  f1_score(true_pos, false_pos, false_neg)
end

# Harmonic mean of precision and recall from raw confusion counts.
# Returns 0.0 when there are no true positives (the precision/recall
# ratios would otherwise be 0/0 = NaN).
def f1_score(tp, fp, fn)
  return 0.0 if tp == 0
  precision = tp / (tp + fp).to_f
  recall = tp / (tp + fn).to_f
  2.0 * precision * recall / (precision + recall)
end

# Not used for NArray # Not used for NArray


def mean(x) def mean(x)
Expand Down
34 changes: 23 additions & 11 deletions spec/anomaly/detector_spec.rb
Original file line number Original file line Diff line number Diff line change
@@ -1,8 +1,8 @@
require "spec_helper" require "spec_helper"


describe Anomaly::Detector do describe Anomaly::Detector do
let(:data) { [[-1,-2],[0,0],[1,2]] } let(:examples) { [[-1,-2,0],[0,0,0],[1,2,0]] }
let(:ad) { Anomaly::Detector.new(data) } let(:ad) { Anomaly::Detector.new(examples) }


# mean = [0, 0], std = [1, 2] # mean = [0, 0], std = [1, 2]
it "computes the right probability" do it "computes the right probability" do
Expand All @@ -14,7 +14,7 @@
end end


context "when standard deviation is 0" do context "when standard deviation is 0" do
let(:data) { [[0],[0]] } let(:examples) { [[0,0],[0,0]] }


it "returns infinity for mean" do it "returns infinity for mean" do
ad.probability([0]).should == 1 ad.probability([0]).should == 1
Expand All @@ -25,35 +25,47 @@
end end
end end


context "when data is an array" do context "when examples is an array" do
let(:data) { [[-1,-2],[0,0],[1,2]] } let(:examples) { [[-1,-2,0],[0,0,0],[1,2,0]] }
let(:sample) { [rand, rand] } let(:sample) { [rand, rand] }


it "returns the same probability as an NMatrix" do it "returns the same probability as an NMatrix" do
prob = ad.probability(sample) prob = ad.probability(sample)
Object.send(:remove_const, :NMatrix) Object.send(:remove_const, :NMatrix)
prob.should == Anomaly::Detector.new(data).probability(sample) prob.should == Anomaly::Detector.new(examples).probability(sample)
end end
end end


context "when lots of samples" do context "when lots of samples" do
let(:data) { m.times.map{[0]} } let(:examples) { m.times.map{[0,0]} }
let(:m) { rand(100) + 1 } let(:m) { rand(100) + 1 }


it { ad.samples.should == m }
it { ad.trained?.should be_true } it { ad.trained?.should be_true }
end end


context "when no samples" do context "when no samples" do
let(:data) { [] } let(:examples) { nil }


it { ad.samples.should == 0 }
it { ad.trained?.should be_false } it { ad.trained?.should be_false }
end end


context "when pdf is greater than 1" do context "when pdf is greater than 1" do
let(:data) { 100.times.map{[0]}.push([1]) } let(:examples) { 100.times.map{[0,0]}.push([1,0]) }


it { ad.probability([0]).should == 1 } it { ad.probability([0]).should == 1 }
end end

context "when only anomalies" do
let(:examples) { [[0,1]] }

it "raises error" do
expect{ ad }.to raise_error RuntimeError, "Must have at least one non-anomaly"
end
end

context "when only one non-anomaly" do
let(:examples) { [[0,0]] }

it { ad.eps.should == 1e-1 }
end
end end

0 comments on commit affb4ba

Please sign in to comment.