From 8c566699ff708cdcd2148163511b263400526a80 Mon Sep 17 00:00:00 2001
From: Alexandru Nedelcu
Date: Thu, 19 Jan 2012 23:30:43 +0200
Subject: [PATCH] added classifier based on Tf-Idf

---
Reviewer notes (free text placed between the "---" separator and the
diffstat; it is not part of the commit message and is skipped when the
patch is applied):

* Adds StuffClassifier::TfIdf, a subclass of StuffClassifier::Base, and
  registers it with an autoload entry in lib/stuff-classifier.rb.
* tf_idf(word, cat) returns tf * idf, where
    tf  = word_count(word, cat) / cat_count(cat)      (float division)
    idf = log10((categories.length + 2) / (n + 1.0))
  and n is the length of @wcount[word], or 0 when that entry is nil.
* text_prob(text, cat) sums tf_idf over each_word(text).
* cat_scores(text) returns [category, score] pairs sorted by descending
  score.
* classify(text, default=nil) returns the category with the highest score
  that is strictly greater than 0, otherwise `default`.
* NOTE(review): word_count, cat_count, categories, each_word and @wcount
  are not defined in this patch; presumably they are provided by
  StuffClassifier::Base / Tokenizer -- confirm.
* NOTE(review): tf divides by cat_count(cat). A zero count would produce
  NaN or Infinity rather than raise -- confirm that Base never reports a
  category that has no training data.
* NOTE(review): classify scans the whole list even though cat_scores has
  already sorted it; looks redundant but harmless.
* Tests: test/minitest_helper.rb is renamed to test/helper.rb, the shared
  base class StuffClassifierTest is renamed to TestBase, the two existing
  test files are updated accordingly, and test/test_003_tf_idf.rb adds
  cat/dog classification cases for the new classifier.

 lib/stuff-classifier.rb                |    1 +
 lib/stuff-classifier/tf-idf.rb         |   41 ++++++++++++++++++++++++++
 test/{minitest_helper.rb => helper.rb} |    3 +-
 test/test_001_tokenizer.rb             |    5 ++--
 test/test_002_naive_bayes.rb           |    4 +--
 test/test_003_tf_idf.rb                |   37 +++++++++++++++++++++++
 6 files changed, 84 insertions(+), 7 deletions(-)
 create mode 100644 lib/stuff-classifier/tf-idf.rb
 rename test/{minitest_helper.rb => helper.rb} (94%)
 create mode 100644 test/test_003_tf_idf.rb

diff --git a/lib/stuff-classifier.rb b/lib/stuff-classifier.rb
index 68c93cc..0f35a2c 100644
--- a/lib/stuff-classifier.rb
+++ b/lib/stuff-classifier.rb
@@ -4,4 +4,5 @@ module StuffClassifier
   autoload :Tokenizer, 'stuff-classifier/tokenizer'
   autoload :Base, 'stuff-classifier/base'
   autoload :Bayes, 'stuff-classifier/bayes'
+  autoload :TfIdf, 'stuff-classifier/tf-idf'
 end
diff --git a/lib/stuff-classifier/tf-idf.rb b/lib/stuff-classifier/tf-idf.rb
new file mode 100644
index 0000000..af925a2
--- /dev/null
+++ b/lib/stuff-classifier/tf-idf.rb
@@ -0,0 +1,41 @@
+class StuffClassifier::TfIdf < StuffClassifier::Base
+  def tf_idf(word, cat)
+    word_cat_nr = word_count(word, cat)
+    cat_nr = cat_count(cat)
+    tf = 1.0 * word_cat_nr / cat_nr
+
+    total_categories = categories.length
+    categories_with_word = (@wcount[word] || []).length
+
+    idf = Math.log((total_categories + 2) / (categories_with_word + 1.0), 10)
+    return tf * idf
+  end
+
+  def text_prob(text, cat)
+    each_word(text).map{|w| tf_idf(w, cat)}.inject(0){|s,p| s + p}
+  end
+
+  def cat_scores(text)
+    probs = {}
+    categories.each do |cat|
+      p = text_prob(text, cat)
+      probs[cat] = p
+    end
+    probs.map{|k,v| [k,v]}.sort{|a,b| b[1] <=> a[1]}
+  end
+
+  def classify(text, default=nil)
+    max_prob = 0.0
+    best = nil
+
+    cat_scores(text).each do |score|
+      cat, prob = score
+      if prob > max_prob
+        max_prob = prob
+        best = cat
+      end
+    end
+
+    max_prob > 0 ? best : default
+  end
+end
diff --git a/test/minitest_helper.rb b/test/helper.rb
similarity index 94%
rename from test/minitest_helper.rb
rename to test/helper.rb
index 3d938eb..47bc7b2 100644
--- a/test/minitest_helper.rb
+++ b/test/helper.rb
@@ -17,8 +17,7 @@
   c.natural = true
 end
 
-
-class StuffClassifierTest < MiniTest::Unit::TestCase
+class TestBase < MiniTest::Unit::TestCase
   def self.before(&block)
     @on_setup = block if block
     @on_setup
diff --git a/test/test_001_tokenizer.rb b/test/test_001_tokenizer.rb
index b5a7aa8..1adf0fa 100644
--- a/test/test_001_tokenizer.rb
+++ b/test/test_001_tokenizer.rb
@@ -1,7 +1,6 @@
-require 'minitest_helper'
+require 'helper'
 
-
-class Test001Tokenizer < StuffClassifierTest
+class Test001Tokenizer < TestBase
   before do
     tokenizer_cls = Class.new do
       include StuffClassifier::Tokenizer
diff --git a/test/test_002_naive_bayes.rb b/test/test_002_naive_bayes.rb
index 6179cac..fe1c9e6 100644
--- a/test/test_002_naive_bayes.rb
+++ b/test/test_002_naive_bayes.rb
@@ -1,7 +1,7 @@
-require 'minitest_helper'
+require 'helper'
 
 
-class Test002NaiveBayesClassification < StuffClassifierTest
+class Test002NaiveBayesClassification < TestBase
   before do
     set_classifier StuffClassifier::Bayes.new("Cats or Dogs")
 
diff --git a/test/test_003_tf_idf.rb b/test/test_003_tf_idf.rb
new file mode 100644
index 0000000..86865fb
--- /dev/null
+++ b/test/test_003_tf_idf.rb
@@ -0,0 +1,37 @@
+require 'helper'
+
+
+class Test003TfIdfClassification < TestBase
+  before do
+    set_classifier StuffClassifier::TfIdf.new("Cats or Dogs")
+
+    train :dog, "Dogs are awesome, cats too. I love my dog"
+    train :cat, "Cats are more preferred by software developers. I never could stand cats. I have a dog"
+    train :dog, "My dog's name is Willy. He likes to play with my wife's cat all day long. I love dogs"
+    train :cat, "Cats are difficult animals, unlike dogs, really annoying, I hate them all"
+    train :dog, "So which one should you choose? A dog, definitely."
+    train :cat, "The favorite food for cats is bird meat, although mice are good, but birds are a delicacy"
+    train :dog, "A dog will eat anything, including birds or whatever meat"
+    train :cat, "My cat's favorite place to purr is on my keyboard"
+    train :dog, "My dog's favorite place to take a leak is the tree in front of our house"
+  end
+
+  def test_for_cats
+    should_be :cat, "This test is about cats."
+    should_be :cat, "I hate ..."
+    should_be :cat, "The most annoying animal on earth."
+    should_be :cat, "The preferred company of software developers."
+    should_be :cat, "My precious, my favorite!"
+    should_be :cat, "Kill that bird!"
+  end
+
+  def test_for_dogs
+    should_be :dog, "This test is about dogs."
+    should_be :dog, "Cats or Dogs?"
+    should_be :dog, "What pet will I love more?"
+    should_be :dog, "Willy, where the heck are you?"
+    should_be :dog, "I like big buts and I cannot lie."
+    should_be :dog, "Why is the front door of our house open?"
+    should_be :dog, "Who is eating my meat?"
+  end
+end