Skip to content

Commit

Permalink
added classifier based on Tf-Idf
Browse files Browse the repository at this point in the history
  • Loading branch information
Alexandru Nedelcu committed Jan 19, 2012
1 parent 0da309e commit 8c56669
Show file tree
Hide file tree
Showing 6 changed files with 84 additions and 7 deletions.
1 change: 1 addition & 0 deletions lib/stuff-classifier.rb
Expand Up @@ -4,4 +4,5 @@ module StuffClassifier
autoload :Tokenizer, 'stuff-classifier/tokenizer'
autoload :Base, 'stuff-classifier/base'
autoload :Bayes, 'stuff-classifier/bayes'
autoload :TfIdf, 'stuff-classifier/tf-idf'
end
41 changes: 41 additions & 0 deletions lib/stuff-classifier/tf-idf.rb
@@ -0,0 +1,41 @@
# Classifier that ranks categories by the summed TF-IDF weight of the words
# found in a text. It relies on helpers inherited from StuffClassifier::Base
# (word_count, cat_count, categories, each_word) and on the @wcount table.
class StuffClassifier::TfIdf < StuffClassifier::Base
  # Computes the TF-IDF weight of +word+ for category +cat+.
  #
  #   tf  = word_count(word, cat) / cat_count(cat)
  #   idf = log10((categories.length + 2) / (categories holding word + 1.0))
  #
  # word - the token to weigh.
  # cat  - the category the weight is computed for.
  #
  # Returns a Float. Returns 0.0 when cat_count(cat) is zero, because the
  # division would otherwise produce NaN or Infinity, and a NaN score makes
  # the comparison in #cat_scores return nil and the sort raise.
  def tf_idf(word, cat)
    cat_nr = cat_count(cat)
    return 0.0 if cat_nr.zero?

    # 1.0 * forces floating point division even if both counts are Integers.
    tf = 1.0 * word_count(word, cat) / cat_nr

    total_categories = categories.length
    # NOTE(review): assumes @wcount[word] is a per-category collection, so its
    # length is the number of categories containing the word - confirm in Base.
    categories_with_word = (@wcount[word] || []).length

    # The +2 / +1.0 offsets avoid a zero denominator for unseen words.
    idf = Math.log((total_categories + 2) / (categories_with_word + 1.0), 10)
    tf * idf
  end

  # Scores +text+ against category +cat+ by adding up the TF-IDF weight of
  # every word yielded by each_word(text).
  #
  # Returns a number (0 when the text yields no words).
  def text_prob(text, cat)
    each_word(text).map { |w| tf_idf(w, cat) }.inject(0) { |sum, weight| sum + weight }
  end

  # Scores +text+ against every known category.
  #
  # Returns an Array of [category, score] pairs ordered from the highest
  # score to the lowest.
  def cat_scores(text)
    scores = {}
    categories.each { |cat| scores[cat] = text_prob(text, cat) }
    scores.to_a.sort { |a, b| b.last <=> a.last }
  end

  # Picks the best matching category for +text+.
  #
  # text    - the text to classify.
  # default - value returned when no category scores above zero.
  #
  # Returns the category with the highest strictly positive score, or
  # +default+ when there are no categories or no positive score.
  def classify(text, default=nil)
    # max_by keeps the first maximum it meets, matching a strict ">" scan.
    best, max_prob = cat_scores(text).max_by { |_, score| score }
    max_prob && max_prob > 0 ? best : default
  end
end
3 changes: 1 addition & 2 deletions test/minitest_helper.rb → test/helper.rb
Expand Up @@ -17,8 +17,7 @@
c.natural = true
end


class StuffClassifierTest < MiniTest::Unit::TestCase
class TestBase < MiniTest::Unit::TestCase
def self.before(&block)
@on_setup = block if block
@on_setup
Expand Down
5 changes: 2 additions & 3 deletions test/test_001_tokenizer.rb
@@ -1,7 +1,6 @@
require 'minitest_helper'
require 'helper'


class Test001Tokenizer < StuffClassifierTest
class Test001Tokenizer < TestBase
before do
tokenizer_cls = Class.new do
include StuffClassifier::Tokenizer
Expand Down
4 changes: 2 additions & 2 deletions test/test_002_naive_bayes.rb
@@ -1,7 +1,7 @@
require 'minitest_helper'
require 'helper'


class Test002NaiveBayesClassification < StuffClassifierTest
class Test002NaiveBayesClassification < TestBase
before do
set_classifier StuffClassifier::Bayes.new("Cats or Dogs")

Expand Down
37 changes: 37 additions & 0 deletions test/test_003_tf_idf.rb
@@ -0,0 +1,37 @@
require 'helper'


# Checks StuffClassifier::TfIdf against a small "Cats or Dogs" corpus.
# set_classifier, train and should_be are helpers that are not defined in
# this file (presumably provided by TestBase / 'helper' - confirm there).
class Test003TfIdfClassification < TestBase
  before do
    set_classifier StuffClassifier::TfIdf.new("Cats or Dogs")

    # Training corpus as [category, text] pairs, fed to train in this order.
    [
      [:dog, "Dogs are awesome, cats too. I love my dog"],
      [:cat, "Cats are more preferred by software developers. I never could stand cats. I have a dog"],
      [:dog, "My dog's name is Willy. He likes to play with my wife's cat all day long. I love dogs"],
      [:cat, "Cats are difficult animals, unlike dogs, really annoying, I hate them all"],
      [:dog, "So which one should you choose? A dog, definitely."],
      [:cat, "The favorite food for cats is bird meat, although mice are good, but birds are a delicacy"],
      [:dog, "A dog will eat anything, including birds or whatever meat"],
      [:cat, "My cat's favorite place to purr is on my keyboard"],
      [:dog, "My dog's favorite place to take a leak is the tree in front of our house"]
    ].each do |category, sentence|
      train(category, sentence)
    end
  end

  # Phrases expected to be classified as :cat.
  def test_for_cats
    [
      "This test is about cats.",
      "I hate ...",
      "The most annoying animal on earth.",
      "The preferred company of software developers.",
      "My precious, my favorite!",
      "Kill that bird!"
    ].each do |phrase|
      should_be(:cat, phrase)
    end
  end

  # Phrases expected to be classified as :dog.
  def test_for_dogs
    [
      "This test is about dogs.",
      "Cats or Dogs?",
      "What pet will I love more?",
      "Willy, where the heck are you?",
      "I like big buts and I cannot lie.",
      "Why is the front door of our house open?",
      "Who is eating my meat?"
    ].each do |phrase|
      should_be(:dog, phrase)
    end
  end
end

0 comments on commit 8c56669

Please sign in to comment.