This repository has been archived by the owner. It is now read-only.
Permalink
Newer
Older
100644 58 lines (46 sloc) 1.19 KB
Alexandru Nedelcu
Jan 19, 2012
1
2
class StuffClassifier::Bayes < StuffClassifier::Base
  # Naive Bayes text classifier.
  # Background: http://en.wikipedia.org/wiki/Naive_Bayes_classifier
  #
  # NOTE(review): each_word, word_weighted_average, cat_count, total_count and
  # categories are not defined in this class; presumably they are inherited
  # from StuffClassifier::Base -- confirm there.

  # Hash of per-category thresholds read by #classify (category => factor).
  # Categories without an entry fall back to 1.0.
  attr_writer :thresholds

  # Forwards name and opts to the parent constructor, then starts with an
  # empty threshold table.
  def initialize(name, opts = {})
    super(name, opts)
    @thresholds = {}
  end

  # Product of word_weighted_average(word, category) over every word that
  # each_word(text) yields. The product is seeded with 1, so a text with no
  # words returns 1.
  def doc_prob(text, category)
    weights = each_word(text).map { |word| word_weighted_average(word, category) }
    weights.reduce(1, :*)
  end

  # Score of +text+ for +category+: the category's share of total_count
  # multiplied by doc_prob(text, category).
  def text_prob(text, category)
    prior = cat_count(category) / total_count
    likelihood = doc_prob(text, category)
    prior * likelihood
  end

  # Returns an array of [category, score] pairs, one per entry of
  # +categories+, ordered from highest score to lowest.
  def cat_scores(text)
    by_category = categories.each_with_object({}) do |category, table|
      table[category] = text_prob(text, category)
    end
    by_category.to_a.sort { |(_, left), (_, right)| right <=> left }
  end

  # Picks the category with the highest score for +text+.
  #
  # Returns +default+ when no category scores strictly above 0.0, or when any
  # other category's score multiplied by the winner's threshold exceeds the
  # winner's score. Otherwise returns the winning category.
  def classify(text, default = nil)
    scores = cat_scores(text)

    # Strict ">" keeps the earliest entry among equal scores and ignores
    # anything that does not beat the 0.0 starting point.
    best, max_prob = scores.reduce([nil, 0.0]) do |(top_cat, top_prob), (cat, prob)|
      prob > top_prob ? [cat, prob] : [top_cat, top_prob]
    end
    return default unless best

    threshold = @thresholds[best] || 1.0
    contested = scores.any? do |cat, prob|
      cat != best && prob * threshold > max_prob
    end

    contested ? default : best
  end
end