diff --git a/lib/ankusa/extensions.rb b/lib/ankusa/extensions.rb index 8704bfc..14b645e 100644 --- a/lib/ankusa/extensions.rb +++ b/lib/ankusa/extensions.rb @@ -1,8 +1,4 @@ class String - def numeric? - true if Float(self) rescue false - end - def to_ascii encode("UTF-8", :invalid => :replace, :undef => :replace, :replace => "").force_encoding('UTF-8') rescue "" end diff --git a/lib/ankusa/hasher.rb b/lib/ankusa/hasher.rb index aea9b0f..d9ec56f 100644 --- a/lib/ankusa/hasher.rb +++ b/lib/ankusa/hasher.rb @@ -3,7 +3,7 @@ module Ankusa - class TextHash < Hash + class TextHash < Hash attr_reader :word_count def initialize(text=nil, stem=true) @@ -19,14 +19,14 @@ def self.atomize(text) # word should be only alphanum chars at this point def self.valid_word?(word) - not (Ankusa::STOPWORDS.include?(word) || word.length < 3 || word.numeric?) + not (Ankusa::STOPWORDS.include?(word) || word.length < 3 || self.numeric_word?(word)) end def add_text(text) if text.instance_of? Array text.each { |t| add_text t } else - # replace dashes with spaces, then get rid of non-word/non-space characters, + # replace dashes with spaces, then get rid of non-word/non-space characters, # then split by space to get words words = TextHash.atomize text words.each { |word| add_word(word) if TextHash.valid_word?(word) } @@ -42,6 +42,15 @@ def add_word(word) key = word.intern store key, fetch(key, 0)+1 end + + # Due to the character filtering that takes place in atomisation + # this method should never receive something that could be a + # negative number, float etc. + # Therefore we can dispense with the SLOW Float(word) method and + # just do a simple regex. 
+ def self.numeric_word?(word) + !!(word =~ /\A\d+\z/) + end end end diff --git a/test/hasher_test.rb b/test/hasher_test.rb index 7854783..52f9704 100644 --- a/test/hasher_test.rb +++ b/test/hasher_test.rb @@ -1,13 +1,12 @@ require File.join File.dirname(__FILE__), 'helper' class HasherTest < Test::Unit::TestCase - def setup + + def test_stemming string = "Words word a the at fish fishing fishes? /^/ The at a of! @#$!" @text_hash = Ankusa::TextHash.new string @array = Ankusa::TextHash.new [string] - end - def test_stemming assert_equal @text_hash.length, 2 assert_equal @text_hash.word_count, 5 @@ -15,11 +14,20 @@ def test_stemming assert_equal @array.word_count, 5 end + def test_atomization + string = "Hello 123,45 My-name! is Robot14 123.45 @#$!" + @array = Ankusa::TextHash.atomize string + + assert_equal %w{hello 123 45 my name is robot14 123 45}, @array + end + def test_valid_word - assert (not Ankusa::TextHash.valid_word? "accordingly") - assert (not Ankusa::TextHash.valid_word? "appropriate") - assert Ankusa::TextHash.valid_word? "^*&@" - assert Ankusa::TextHash.valid_word? "mother" - assert (not Ankusa::TextHash.valid_word? "21675") + assert !Ankusa::TextHash.valid_word?("accordingly") + assert !Ankusa::TextHash.valid_word?("appropriate") + assert Ankusa::TextHash.valid_word?("^*&@") + assert Ankusa::TextHash.valid_word?("mother") + assert Ankusa::TextHash.valid_word?("robot14") + assert !Ankusa::TextHash.valid_word?("21675") + assert !Ankusa::TextHash.valid_word?("00000") end end