-
Notifications
You must be signed in to change notification settings - Fork 21
Expand file tree
/
Copy pathhasher.rb
More file actions
47 lines (38 loc) · 1.06 KB
/
hasher.rb
File metadata and controls
47 lines (38 loc) · 1.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
require 'fast_stemmer'
require 'ankusa/stopwords'
module Ankusa
  # A Hash that maps words (as Symbols) to the number of times they were added.
  #
  # Looking up a word that was never added yields 0. Text is tokenised with
  # TextHash.atomize, filtered with TextHash.valid_word?, optionally stemmed,
  # and then counted.
  class TextHash < Hash
    # Total number of words counted so far (every accepted occurrence, not
    # the number of distinct keys).
    attr_reader :word_count

    # @param text [String, Array, nil] initial text to count; nil adds nothing
    # @param stem [Boolean] when true, each accepted word is replaced by
    #   +word.stem+ before being counted (String#stem is presumably supplied
    #   by the fast_stemmer library required by this file)
    def initialize(text = nil, stem = true)
      super(0)
      @word_count = 0
      @stem = stem
      add_text(text) unless text.nil?
    end

    # Splits +text+ into lowercase tokens: dashes become spaces, every other
    # non-word/non-space character becomes a space, and the result is split
    # on whitespace.
    #
    # NOTE(review): String#to_ascii is not defined in this file; it is assumed
    # to be a String extension loaded elsewhere in the project.
    #
    # @param text [String]
    # @return [Array<String>]
    def self.atomize(text)
      text.downcase.to_ascii.tr('-', ' ').gsub(/[^\w\s]/, ' ').split
    end

    # Decides whether +word+ should be counted. A word is rejected when it is
    # listed in Ankusa::STOPWORDS, is shorter than three characters, or
    # answers true to +numeric?+.
    #
    # word should be only alphanum chars at this point
    #
    # NOTE(review): String#numeric? is not defined in this file; assumed to be
    # a String extension loaded elsewhere in the project.
    #
    # @param word [String]
    # @return [Boolean]
    def self.valid_word?(word)
      !(Ankusa::STOPWORDS.include?(word) || word.length < 3 || word.numeric?)
    end

    # Tokenises +text+ and counts every valid word.
    #
    # Arrays (including Array subclasses and nested arrays) are processed
    # element by element. nil is ignored, matching how the constructor treats
    # a nil +text+.
    #
    # @param text [String, Array, nil]
    # @return [TextHash] self, so calls can be chained
    def add_text(text)
      case text
      when nil
        # nothing to add
      when Array
        text.each { |entry| add_text(entry) }
      else
        TextHash.atomize(text).each do |word|
          add_word(word) if TextHash.valid_word?(word)
        end
      end
      self
    end

    protected

    # Counts one occurrence of +word+, stemming it first when stemming is
    # enabled. The stored key is the interned (Symbol) form of the word.
    def add_word(word)
      @word_count += 1
      word = word.stem if @stem
      key = word.intern
      self[key] = fetch(key, 0) + 1
    end
  end
end