/
tokeniser.rb
163 lines (155 loc) · 5.79 KB
/
tokeniser.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
# -*- encoding : utf-8 -*-
module WordsCounted
  # Takes a string and breaks it into an array of tokens.
  # Using `pattern` and `exclude` allows for powerful tokenisation strategies.
  #
  # @example
  #   tokeniser
  #     = WordsCounted::Tokeniser.new(
  #         "We are all in the gutter, but some of us are looking at the stars."
  #       )
  #   tokeniser.tokenise(exclude: "We are all in the gutter")
  #   # => ['but', 'some', 'of', 'us', 'are', 'looking', 'at', 'the', 'stars']
  class Tokeniser
    # Default tokenisation strategy: runs of letters (any script), hyphens,
    # and apostrophes, so contractions and hyphenated words stay intact.
    TOKEN_REGEXP = /[\p{Alpha}\-']+/

    # Initialises state with the string to be tokenised.
    #
    # @param [String] input The string to tokenise
    def initialize(input)
      @input = input
    end

    # Converts a string into an array of tokens using a regular expression.
    # If a regexp is not provided a default one is used. See `Tokeniser::TOKEN_REGEXP`.
    #
    # Use `exclude` to remove tokens from the final list. `exclude` can be a string,
    # a regular expression, a lambda, a symbol, or an array of one or more of those types.
    # This allows for powerful and flexible tokenisation strategies.
    #
    # If a symbol is passed, it must name a predicate method.
    #
    # @example
    #   WordsCounted::Tokeniser.new("Hello World").tokenise
    #   # => ['hello', 'world']
    #
    # @example With `pattern`
    #   WordsCounted::Tokeniser.new("Hello-Mohamad").tokenise(pattern: /[^-]+/)
    #   # => ['hello', 'mohamad']
    #
    # @example With `exclude` as a string
    #   WordsCounted::Tokeniser.new("Hello Sami").tokenise(exclude: "hello")
    #   # => ['sami']
    #
    # @example With `exclude` as a regexp
    #   WordsCounted::Tokeniser.new("Hello Dani").tokenise(exclude: /hello/i)
    #   # => ['dani']
    #
    # @example With `exclude` as a lambda
    #   WordsCounted::Tokeniser.new("Goodbye Sami").tokenise(
    #     exclude: ->(token) { token.length > 6 }
    #   )
    #   # => ['sami']
    #
    # @example With `exclude` as a symbol
    #   WordsCounted::Tokeniser.new("Hello محمد").tokenise(exclude: :ascii_only?)
    #   # => ['محمد']
    #
    # @example With `exclude` as an array of strings
    #   WordsCounted::Tokeniser.new("Goodbye Sami and hello Dani").tokenise(
    #     exclude: ["goodbye hello"]
    #   )
    #   # => ['sami', 'and', 'dani']
    #
    # @example With `exclude` as an array of regular expressions
    #   WordsCounted::Tokeniser.new("Goodbye and hello Dani").tokenise(
    #     exclude: [/goodbye/i, /and/i]
    #   )
    #   # => ['hello', 'dani']
    #
    # @example With `exclude` as an array of lambdas
    #   t = WordsCounted::Tokeniser.new("Special Agent 007")
    #   t.tokenise(
    #     exclude: [
    #       ->(t) { t.to_i.odd? },
    #       ->(t) { t.length > 5 }
    #     ]
    #   )
    #   # => ['agent']
    #
    # @example With `exclude` as a mixed array
    #   t = WordsCounted::Tokeniser.new("Hello! اسماءنا هي محمد، كارولينا، سامي، وداني")
    #   t.tokenise(
    #     exclude: [
    #       :ascii_only?,
    #       /محمد/,
    #       ->(t) { t.length > 6 },
    #       "و"
    #     ]
    #   )
    #   # => ["هي", "سامي", "وداني"]
    #
    # @param [Regexp] pattern The regular expression used to scan out tokens
    # @param [Array<String, Regexp, Lambda, Symbol>, String, Regexp, Lambda, Symbol, nil] exclude The filter to apply
    # @return [Array] The array of filtered, downcased tokens
    def tokenise(pattern: TOKEN_REGEXP, exclude: nil)
      filter_proc = filter_to_proc(exclude)
      @input.scan(pattern).map(&:downcase).reject { |token| filter_proc.call(token) }
    end

    private

    # The following methods convert any arguments into a callable object. The return value of this
    # lambda is then used to determine whether a token should be excluded from the final list.
    #
    # `filter` can be a string, a regular expression, a lambda, a symbol, or an array
    # of any combination of those types.
    #
    # If `filter` is a string, it converts the string into an array, and returns a lambda
    # that returns true if the token is included in the resulting array.
    #
    # @see {Tokeniser#filter_proc_from_string}.
    #
    # If `filter` is an array, it creates a new array where each element of the original is
    # converted to a lambda, and returns a lambda that calls each lambda in the resulting array.
    # If any lambda returns true the token is excluded from the final list.
    #
    # @see {Tokeniser#filter_procs_from_array}.
    #
    # If `filter` is a proc, then the proc is simply called. If `filter` is a regexp, a `lambda`
    # is returned that checks the token for a match.
    #
    # If a symbol is passed, it is converted to a proc. The symbol must name a predicate method.
    #
    # This method depends on `nil` responding to `to_a` with an empty array, which
    # avoids having to check if `exclude` was passed.
    #
    # @raise [ArgumentError] If `filter` is none of the supported types
    # @api private
    def filter_to_proc(filter)
      if filter.respond_to?(:to_a)
        filter_procs_from_array(filter)
      elsif filter.respond_to?(:to_str)
        filter_proc_from_string(filter)
      elsif (regexp_filter = Regexp.try_convert(filter))
        # `match?` avoids allocating MatchData and setting `$~`; only the
        # boolean result is needed here.
        ->(token) {
          regexp_filter.match?(token)
        }
      elsif filter.respond_to?(:to_proc)
        filter.to_proc
      else
        raise ArgumentError,
          "`filter` must be a `String`, `Regexp`, `lambda`, `Symbol`, or an `Array` of any combination of those types"
      end
    end

    # Recursively converts each element of `filter` into a filter proc, then
    # returns a lambda that is true when any of them matches the token.
    # @api private
    def filter_procs_from_array(filter)
      filter_procs = Array(filter).map(&method(:filter_to_proc))
      ->(token) {
        filter_procs.any? { |pro| pro.call(token) }
      }
    end

    # Splits `filter` on whitespace and downcases each word (tokens are already
    # downcased), returning a membership-test lambda.
    # @api private
    def filter_proc_from_string(filter)
      normalized_exclusion_list = filter.split.map(&:downcase)
      ->(token) {
        normalized_exclusion_list.include?(token)
      }
    end
  end
end