-
Notifications
You must be signed in to change notification settings - Fork 43
/
lexer.rl
364 lines (308 loc) · 10.6 KB
/
lexer.rl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
%%machine xpath_lexer; # %
module Oga
module XPath
# Lexer for turning XPath expressions into a set of tokens. Tokens are
# returned as arrays with every array having two values:
#
# 1. The token type as a symbol
# 2. The token value or nil if there is no value
#
# Basic usage of this lexer is as follows:
#
# lexer = Oga::XPath::Lexer.new('//foo/bar')
# tokens = lexer.lex
#
# Alternatively you can stream tokens instead of returning them as a whole:
#
# lexer = Oga::XPath::Lexer.new('//foo/bar')
#
# lexer.advance do |type, value|
#
# end
#
# Unlike the XML lexer the XPath lexer does not support IO instances, it can
# only lex strings.
#
# ## Thread Safety
#
# This class keeps track of an internal state. As a result it's not safe to
# share a single instance between multiple threads. However, you're free to
# use separate instances per thread as there is no global (= class level)
# shared state.
#
# @api private
class Lexer
%% write data;
# % fix highlight
# Maps certain XPath axes written in their short form to their long form
# equivalents.
#
# @return [Hash]
# Maps certain XPath axes written in their short form to their long form
# equivalents.
#
# Frozen so the shared lookup tables cannot be mutated at runtime; the
# lexer only ever reads from these constants.
#
# @return [Hash]
AXIS_MAPPING = {
  '@'  => 'attribute',
  '//' => 'descendant-or-self',
  '..' => 'parent',
  '.'  => 'self'
}.freeze

# Axes that require a separate `node()` call to be emitted.
#
# @return [Array]
AXIS_EMIT_NODE = %w{descendant-or-self parent self}.freeze

# Axes that require an extra T_SLASH token to be emitted.
#
# @return [Array]
AXIS_EMIT_EXTRA_SLASH = %w{descendant-or-self}.freeze
# @param [String] data The data to lex. Must be a String; unlike the XML
#   lexer, IO instances are not supported (lexing indexes the string by
#   byte position).
def initialize(data)
  # Raw XPath expression, read via byte offsets while lexing.
  @data = data
end
# Gathers all the tokens for the input and returns them as an Array.
#
# Each element is a `[type, value]` pair as produced by {#advance}.
#
# @see [#advance]
# @return [Array]
def lex
  collected = []

  advance { |type, value| collected << [type, value] }

  collected
end
# Advances through the input and generates the corresponding tokens. Each
# token is yielded to the supplied block.
#
# Each token is an Array in the following format:
#
#     [TYPE, VALUE]
#
# The type is a symbol, the value is either nil or a String.
#
# This method stores the supplied block in `@block` and resets it after
# the lexer loop has finished.
#
# @see [#add_token]
def advance(&block)
  @block = block

  data = @data # saves ivar lookups while lexing.

  # The local variables below are the variables Ragel's generated scanner
  # code (inserted by `%% write exec;`) reads and writes:
  #
  # ts/te  - token start/end byte offsets, used by the emit_* actions.
  # stack/top - the Ragel call stack (fcall/fret).
  # cs     - the current machine state, seeded with the start state.
  # act    - scanner bookkeeping for longest-match resolution.
  # p/pe/eof - current position, end-of-buffer and EOF byte offsets.
  ts = nil
  te = nil
  stack = []
  top = 0
  cs = self.class.xpath_lexer_start
  act = 0
  eof = @data.bytesize
  p = 0
  pe = eof

  # Hoist the generated (private) lookup tables into locals so the exec
  # loop avoids repeated class-level method dispatch.
  _xpath_lexer_eof_trans = self.class.send(:_xpath_lexer_eof_trans)
  _xpath_lexer_from_state_actions = self.class.send(:_xpath_lexer_from_state_actions)
  _xpath_lexer_index_offsets = self.class.send(:_xpath_lexer_index_offsets)
  _xpath_lexer_indicies = self.class.send(:_xpath_lexer_indicies)
  _xpath_lexer_key_spans = self.class.send(:_xpath_lexer_key_spans)
  _xpath_lexer_to_state_actions = self.class.send(:_xpath_lexer_to_state_actions)
  _xpath_lexer_trans_actions = self.class.send(:_xpath_lexer_trans_actions)
  _xpath_lexer_trans_keys = self.class.send(:_xpath_lexer_trans_keys)
  _xpath_lexer_trans_targs = self.class.send(:_xpath_lexer_trans_targs)

  %% write exec;

  # % fix highlight
ensure
  # Always drop the block reference, even if the generated code raises, so
  # the lexer instance does not keep the caller's closure alive.
  @block = nil
end
private
# Emits a token whose value is the slice of the input between the given
# start and stop byte positions.
#
# @param [Symbol] type The token type.
# @param [Fixnum] start
# @param [Fixnum] stop
#
# @see [#slice_input]
# @see [#add_token]
def emit(type, start, stop)
  add_token(type, slice_input(start, stop))
end
# Returns the text between the specified start and stop position.
#
# Offsets are byte based, hence `byteslice` instead of a character slice.
#
# @param [Fixnum] start
# @param [Fixnum] stop
# @return [String]
def slice_input(start, stop)
  length = stop - start

  @data.byteslice(start, length)
end
# Hands a new token to the block stored by {#advance}.
#
# @param [Symbol] type The token type.
# @param [String] value The token value, nil when the token carries none.
#
# @yieldparam [Symbol] type
# @yieldparam [String|NilClass] value
def add_token(type, value = nil)
  @block.(type, value)
end
%%{
  # Ragel grammar for the XPath lexer. Patterns below are Ragel machine
  # definitions; `action` blocks contain Ruby executed when a pattern is
  # matched in the scanner at the bottom of this block.

  # Read input byte-by-byte; `|| 0` guards against reading past the end.
  getkey (data.getbyte(p) || 0);

  whitespace = [\n\t ];

  # Single-character tokens. Each emits a value-less token immediately.
  slash  = '/' @{ add_token(:T_SLASH) };
  lparen = '(' @{ add_token(:T_LPAREN) };
  rparen = ')' @{ add_token(:T_RPAREN) };
  comma  = ',' @{ add_token(:T_COMMA) };
  colon  = ':' @{ add_token(:T_COLON) };

  lbrack = '[' @{ add_token(:T_LBRACK) };
  rbrack = ']' @{ add_token(:T_RBRACK) };

  # Identifiers
  #
  # Identifiers are used for element names, namespaces, attribute names,
  # etc. Identifiers have to start with a letter.

  unicode = any - ascii;

  unicode_or_ascii = (unicode | [a-zA-Z\-_0-9\.])*;

  # `*` on its own is the wildcard name test and is lexed as an identifier.
  identifier = '*' | (unicode | [a-zA-Z_]) unicode_or_ascii ;

  action emit_identifier {
    emit(:T_IDENT, ts, te)
  }

  # Numbers
  #
  # XPath expressions can contain both integers and floats. The W3
  # specification treats these both as the same type of number. Oga
  # instead lexes them separately so that we can convert the values to
  # the corresponding Ruby types (Fixnum and Float).

  integer = ('-' | '+')* digit+;
  float   = ('-' | '+')* digit+ ('.' digit+)*;

  action emit_integer {
    value = slice_input(ts, te).to_i

    add_token(:T_INT, value)
  }

  action emit_float {
    value = slice_input(ts, te).to_f

    add_token(:T_FLOAT, value)
  }

  # Strings
  #
  # Strings can be single or double quoted. They are mainly used for
  # attribute values.

  dquote = '"';
  squote = "'";

  string_dquote = (dquote ^dquote* dquote);
  string_squote = (squote ^squote* squote);

  string = string_dquote | string_squote;

  action emit_string {
    # ts + 1 / te - 1 strip the surrounding quotes from the token value.
    emit(:T_STRING, ts + 1, te - 1)
  }

  # Full Axes
  #
  # XPath axes in their full syntax.

  axis_full = ('ancestor'
    | 'ancestor-or-self'
    | 'attribute'
    | 'child'
    | 'descendant'
    | 'descendant-or-self'
    | 'following'
    | 'following-sibling'
    | 'namespace'
    | 'parent'
    | 'preceding'
    | 'preceding-sibling'
    | 'self') '::';

  action emit_axis_full {
    # te - 2 drops the trailing "::" from the emitted axis name.
    emit(:T_AXIS, ts, te - 2)
  }

  # Short Axes
  #
  # XPath axes in their abbreviated form. When lexing these are mapped to
  # their full forms so that the parser doesn't have to take care of
  # this.

  axis_short = '@' | '//' | '..' | '.';

  action emit_axis_short {
    value = AXIS_MAPPING[slice_input(ts, te)]

    add_token(:T_AXIS, value)

    # Short axes that use node() as their default, implicit test. This is
    # added on lexer level to make it easier to handle these cases on
    # parser/evaluator level.
    if AXIS_EMIT_NODE.include?(value)
      add_token(:T_TYPE_TEST, 'node')

      # "//" also implies a path separator, unless it ends the input.
      if AXIS_EMIT_EXTRA_SLASH.include?(value) and te != eof
        add_token(:T_SLASH)
      end
    end
  }

  # Operators
  #
  # Operators can only be used inside predicates due to "div" and "mod"
  # conflicting with the patterns used for matching identifiers (=
  # element names and the likes).

  op_pipe = '|'  %{ add_token(:T_PIPE) };
  op_plus = '+'  %{ add_token(:T_ADD) };
  op_eq   = '='  %{ add_token(:T_EQ) };
  op_neq  = '!=' %{ add_token(:T_NEQ) };
  op_lt   = '<'  %{ add_token(:T_LT) };
  op_gt   = '>'  %{ add_token(:T_GT) };
  op_lte  = '<=' %{ add_token(:T_LTE) };
  op_gte  = '>=' %{ add_token(:T_GTE) };

  # These operators require whitespace around them in order to be lexed
  # as operators. This is due to "-" being allowed in node names and "*"
  # also being used as a wildcard.
  #
  # THINK: relying on whitespace is a rather fragile solution, even
  # though the W3 actually recommends this for the "-" operator. Perhaps
  # there's a better way of doing this.

  op_and = ' and ' %{ add_token(:T_AND) };
  op_or  = ' or '  %{ add_token(:T_OR) };
  op_div = ' div ' %{ add_token(:T_DIV) };
  op_mod = ' mod ' %{ add_token(:T_MOD) };
  op_mul = ' * '   %{ add_token(:T_MUL) };
  op_sub = ' - '   %{ add_token(:T_SUB) };

  operator = op_pipe
    | op_and
    | op_or
    | op_plus
    | op_div
    | op_mod
    | op_eq
    | op_neq
    | op_lt
    | op_gt
    | op_lte
    | op_gte
    | op_mul
    | op_sub
    ;

  # Node type tests
  #
  # While these look like functions they are actually node type tests. For
  # example, comment() matches all comment nodes.
  #
  # See http://www.w3.org/TR/xpath/#NT-NodeType for more information.

  type_test = (
    'comment' |
    'text' |
    'processing-instruction' |
    'node'
  ) '()';

  action emit_type_test {
    # te - 2 drops the trailing "()" from the emitted test name.
    emit(:T_TYPE_TEST, ts, te - 2)
  }

  # Variables
  #
  # XPath 1.0 allows the use of variables in expressions. Oddly enough you
  # can not assign variables in an expression, you can only refer to them.
  # This means that libraries themselves have to expose an interface for
  # setting variables.

  var = '$' identifier;

  action emit_variable {
    # ts + 1 drops the leading "$" from the variable name.
    emit(:T_VAR, ts + 1, te)
  }

  # The scanner: patterns listed earlier win ties, so operators take
  # precedence over identifiers, and type tests over plain identifiers.
  main := |*
    operator;
    whitespace | slash | lparen | rparen | comma | colon | lbrack | rbrack;

    type_test  => emit_type_test;
    var        => emit_variable;
    string     => emit_string;
    integer    => emit_integer;
    float      => emit_float;
    axis_full  => emit_axis_full;
    axis_short => emit_axis_short;
    identifier => emit_identifier;
  *|;
}%%
end # Lexer
end # XPath
end # Oga