forked from alexandru/stuff-classifier
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Alexandru Nedelcu committed Jan 19, 2012
1 parent 0da309e, commit 8c56669
Showing 6 changed files with 84 additions and 7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
# Classifier that scores every known category by summing the TF-IDF weights
# of the words in a text, then picks the category with the highest score.
#
# Relies on helpers inherited from StuffClassifier::Base (word_count,
# cat_count, categories, each_word) and on the @wcount instance variable;
# none of these are defined in this file.
class StuffClassifier::TfIdf < StuffClassifier::Base
  # TF-IDF weight of +word+ with respect to category +cat+.
  #
  # tf  = word_count(word, cat) / cat_count(cat)
  # idf = log10((number of categories + 2) / (categories containing word + 1))
  #
  # Returns 0.0 when the category count is zero.
  def tf_idf(word, cat)
    word_cat_nr = word_count(word, cat)
    cat_nr = cat_count(cat)

    # Dividing by a zero count would produce NaN or Infinity (float division
    # never raises); a NaN score in turn makes the <=> used by cat_scores
    # return nil, which makes sort raise. Treat such a category as
    # contributing nothing.
    return 0.0 if cat_nr == 0

    tf = 1.0 * word_cat_nr / cat_nr

    total_categories = categories.length
    # NOTE(review): assumes @wcount[word] holds one entry per category in
    # which the word was seen -- confirm against StuffClassifier::Base.
    categories_with_word = (@wcount[word] || []).length

    # The +2 / +1.0 offsets keep the ratio above 1, so idf is always positive.
    idf = Math.log((total_categories + 2) / (categories_with_word + 1.0), 10)
    tf * idf
  end

  # Score of +text+ for category +cat+: the sum of tf_idf over every word
  # produced by each_word(text). Returns 0 when there are no words.
  def text_prob(text, cat)
    # NOTE(review): each_word is defined elsewhere; the `|| []` fallback
    # covers the case where it returns nil for blank text -- confirm.
    words = each_word(text) || []
    words.map { |w| tf_idf(w, cat) }.inject(0) { |sum, score| sum + score }
  end

  # Returns an array of [category, score] pairs for +text+, ordered from the
  # highest score to the lowest.
  def cat_scores(text)
    probs = {}
    categories.each { |cat| probs[cat] = text_prob(text, cat) }
    probs.sort { |a, b| b[1] <=> a[1] }
  end

  # Returns the best-scoring category for +text+, or +default+ when there
  # are no categories or the best score is not strictly positive.
  def classify(text, default = nil)
    # cat_scores is already sorted in descending order, so the first pair
    # carries the maximum score; no second scan is needed.
    best, max_prob = cat_scores(text).first
    (max_prob && max_prob > 0) ? best : default
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
require 'helper' | ||
|
||
|
||
# Checks StuffClassifier::TfIdf against a small "cats vs dogs" corpus.
#
# set_classifier, train and should_be are not defined in this file;
# presumably they come from TestBase / the required 'helper' -- confirm.
class Test003TfIdfClassification < TestBase
  before do
    set_classifier StuffClassifier::TfIdf.new("Cats or Dogs")

    # Training samples, fed to the classifier in exactly this order.
    [
      [:dog, "Dogs are awesome, cats too. I love my dog"],
      [:cat, "Cats are more preferred by software developers. I never could stand cats. I have a dog"],
      [:dog, "My dog's name is Willy. He likes to play with my wife's cat all day long. I love dogs"],
      [:cat, "Cats are difficult animals, unlike dogs, really annoying, I hate them all"],
      [:dog, "So which one should you choose? A dog, definitely."],
      [:cat, "The favorite food for cats is bird meat, although mice are good, but birds are a delicacy"],
      [:dog, "A dog will eat anything, including birds or whatever meat"],
      [:cat, "My cat's favorite place to purr is on my keyboard"],
      [:dog, "My dog's favorite place to take a leak is the tree in front of our house"]
    ].each { |category, sentence| train(category, sentence) }
  end

  # Every phrase below is expected to be classified as :cat.
  def test_for_cats
    [
      "This test is about cats.",
      "I hate ...",
      "The most annoying animal on earth.",
      "The preferred company of software developers.",
      "My precious, my favorite!",
      "Kill that bird!"
    ].each { |phrase| should_be(:cat, phrase) }
  end

  # Every phrase below is expected to be classified as :dog.
  def test_for_dogs
    [
      "This test is about dogs.",
      "Cats or Dogs?",
      "What pet will I love more?",
      "Willy, where the heck are you?",
      "I like big buts and I cannot lie.",
      "Why is the front door of our house open?",
      "Who is eating my meat?"
    ].each { |phrase| should_be(:dog, phrase) }
  end
end