This repository was archived by the owner on Jan 17, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 88
Expand file tree
/
Copy pathtokenizer.rb
More file actions
80 lines (60 loc) · 1.6 KB
/
tokenizer.rb
File metadata and controls
80 lines (60 loc) · 1.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# -*- encoding : utf-8 -*-
require "lingua/stemmer"
require "rseg"
# Splits text into normalized (downcased, optionally stemmed) words,
# dropping the configured stop words.
#
# Per-language defaults (stop words and preprocessing regexps) are read
# from TOKENIZER_PROPERTIES; both can be overridden per instance through
# the writers below.
class StuffClassifier::Tokenizer
  require "stuff-classifier/tokenizer/tokenizer_properties"

  # Language code this tokenizer was configured with ("en" by default).
  attr_reader :language

  # Per-instance overrides; when left unset the readers fall back to the
  # language defaults from TOKENIZER_PROPERTIES.
  attr_writer :preprocessing_regexps, :ignore_words

  # @param opts [Hash] tokenizer options
  # @option opts [String] :language language code used to look up
  #   TOKENIZER_PROPERTIES and to build the stemmer (default "en")
  # @option opts [Boolean] :stemming whether words are stemmed
  #   (default true)
  def initialize(opts={})
    @language = opts.fetch(:language, "en")
    # NOTE(review): this is nil when the language has no entry in
    # TOKENIZER_PROPERTIES, in which case the default readers below raise
    # NoMethodError -- confirm whether unsupported languages should be
    # rejected here instead.
    @properties = StuffClassifier::Tokenizer::TOKENIZER_PROPERTIES[@language]
    @stemming = opts.fetch(:stemming, true)
    # The stemmer is only built when stemming is enabled.
    @stemmer = Lingua::Stemmer.new(:language => @language) if @stemming
  end

  # @return the preprocessing substitutions applied to every line before
  #   segmentation: the per-instance override if one was assigned,
  #   otherwise the language default. Iterated as (regexp, replacement)
  #   pairs by #each_word.
  def preprocessing_regexps
    @preprocessing_regexps || @properties[:preprocessing_regexps]
  end

  # @return the collection of words to drop: the per-instance override if
  #   one was assigned, otherwise the language's :stop_word entry.
  def ignore_words
    @ignore_words || @properties[:stop_word]
  end

  # @return whether stemming is enabled (false when the option was nil)
  def stemming?
    @stemming || false
  end

  # Tokenizes +string+ line by line.
  #
  # Each line is run through the preprocessing regexps, segmented with
  # Rseg.segment, and every segment is downcased (and stemmed when
  # enabled and the segment is purely alphabetic). Ignored words are
  # dropped both before and after stemming.
  #
  # @param string [String, nil] text to tokenize
  # @yield [word] optional; receives each normalized word
  # @return [Array] the block's results when a block is given, otherwise
  #   the normalized words. Always an Array: blank or nil input yields
  #   [] so callers can safely chain on the result.
  def each_word(string)
    string = string.to_s.strip
    return [] if string.empty?

    words = []
    string.split("\n").each do |line|
      apply_preprocessing!(line)

      Rseg.segment(line).each do |segment|
        word = normalize_word(segment)
        next if word.nil?

        words << (block_given? ? (yield word) : word)
      end
    end
    words
  end

  private

  # Applies every configured (regexp, replacement) pair to +line+ in
  # place. +line+ is a fresh string produced by String#split, so the
  # caller's original text is never mutated.
  def apply_preprocessing!(line)
    regexps = preprocessing_regexps
    return unless regexps

    regexps.each { |regexp, replace_by| line.gsub!(regexp, replace_by) }
  end

  # Returns the downcased (and, when applicable, stemmed) form of +word+,
  # or nil when the word is empty or listed in #ignore_words.
  def normalize_word(word)
    return nil if word == '' || ignore_words.member?(word.downcase)
    return word.downcase unless stemming? && stemable?(word)

    stemmed = @stemmer.stem(word).downcase
    # A stem can collapse onto a stop word even if the surface form did not.
    ignore_words.member?(stemmed) ? nil : stemmed
  end

  # Only purely alphabetic words are handed to the stemmer. \A/\z anchor
  # the whole string; ^/$ would accept a multi-line token as soon as any
  # single line of it is alphabetic.
  def stemable?(word)
    word =~ /\A\p{Alpha}+\z/
  end
end