forked from ocaml/ocaml
/
Lexer.mll
483 lines (422 loc) · 19.2 KB
/
Lexer.mll
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
(****************************************************************************)
(* *)
(* Objective Caml *)
(* *)
(* INRIA Rocquencourt *)
(* *)
(* Copyright 2006-2006 Institut National de Recherche en Informatique et *)
(* en Automatique. All rights reserved. This file is distributed under *)
(* the terms of the GNU Library General Public License, with the special *)
(* exception on linking described in LICENSE at the top of the Objective *)
(* Caml source tree. *)
(* *)
(****************************************************************************)
(* Authors:
* - Daniel de Rauglaudre: initial version
* - Nicolas Pouillard: refactoring
*)
(* The lexer definition *)
{
(** A lexical analyzer. *)
(* FIXME interface module Make (Token : Token) |+ Note that this Token sig is not in Sig +| *)
(* : Sig.Lexer. S with module Loc = Token.Loc and module Token = Token; *)
(* type context =
{ loc : Loc.t ;
in_comment : bool ;
|+* FIXME When True, all lexers built by [Plexer.make ()] do not lex the
quotation syntax any more. Default is False (quotations are
lexed). +|
quotations : bool };
value default_context : context;
value mk : Loc.t -> Stream.t char -> Stream.t (Token.t * Loc.t);
value mk' : context -> Stream.t char -> Stream.t (Token.t * Loc.t); *)
(* FIXME Beware the context argument must be given like that:
* mk' { (default_context) with ... = ... } strm
*)
module TokenEval = Token.Eval
module Make (Token : Sig.Camlp4Token)
= struct
module Loc = Token.Loc
module Token = Token
open Lexing
open Sig
(* Error report *)
module Error = struct
type t =
| Illegal_character of char
| Illegal_escape of string
| Unterminated_comment
| Unterminated_string
| Unterminated_quotation
| Unterminated_antiquot
| Unterminated_string_in_comment
| Comment_start
| Comment_not_end
| Literal_overflow of string
exception E of t
open Format
let print ppf =
function
| Illegal_character c ->
fprintf ppf "Illegal character (%s)" (Char.escaped c)
| Illegal_escape s ->
fprintf ppf "Illegal backslash escape in string or character (%s)" s
| Unterminated_comment ->
fprintf ppf "Comment not terminated"
| Unterminated_string ->
fprintf ppf "String literal not terminated"
| Unterminated_string_in_comment ->
fprintf ppf "This comment contains an unterminated string literal"
| Unterminated_quotation ->
fprintf ppf "Quotation not terminated"
| Unterminated_antiquot ->
fprintf ppf "Antiquotation not terminated"
| Literal_overflow ty ->
fprintf ppf "Integer literal exceeds the range of representable integers of type %s" ty
| Comment_start ->
fprintf ppf "this is the start of a comment"
| Comment_not_end ->
fprintf ppf "this is not the end of a comment"
let to_string x =
let b = Buffer.create 50 in
let () = bprintf b "%a" print x in Buffer.contents b
end;;
let module M = ErrorHandler.Register(Error) in ()
open Error
(* To store some context information:
* loc : position of the beginning of a string, quotation and comment
* in_comment: are we in a comment?
* quotations: shall we lex quotation?
* If quotations is false it's a SYMBOL token.
* antiquots : shall we lex antiquotations.
*)
type context =
{ loc : Loc.t ;
in_comment : bool ;
quotations : bool ;
antiquots : bool ;
lexbuf : lexbuf ;
buffer : Buffer.t }
let default_context lb =
{ loc = Loc.ghost ;
in_comment = false ;
quotations = true ;
antiquots = false ;
lexbuf = lb ;
buffer = Buffer.create 256 }
(* To buffer string literals, quotations and antiquotations *)
let store c = Buffer.add_string c.buffer (Lexing.lexeme c.lexbuf)
let istore_char c i = Buffer.add_char c.buffer (Lexing.lexeme_char c.lexbuf i)
let buff_contents c =
let contents = Buffer.contents c.buffer in
Buffer.reset c.buffer; contents
let loc c = Loc.merge c.loc (Loc.of_lexbuf c.lexbuf)
let quotations c = c.quotations
let antiquots c = c.antiquots
let is_in_comment c = c.in_comment
let in_comment c = { (c) with in_comment = true }
let set_start_p c = c.lexbuf.lex_start_p <- Loc.start_pos c.loc
let move_start_p shift c =
let p = c.lexbuf.lex_start_p in
c.lexbuf.lex_start_p <- { (p) with pos_cnum = p.pos_cnum + shift }
let update_loc c = { (c) with loc = Loc.of_lexbuf c.lexbuf }
let with_curr_loc f c = f (update_loc c) c.lexbuf
let parse_nested f c =
with_curr_loc f c;
set_start_p c;
buff_contents c
let shift n c = { (c) with loc = Loc.move `both n c.loc }
let store_parse f c = store c ; f c c.lexbuf
let parse f c = f c c.lexbuf
let mk_quotation quotation c name loc shift =
let s = parse_nested quotation (update_loc c) in
let contents = String.sub s 0 (String.length s - 2) in
QUOTATION { q_name = name ;
q_loc = loc ;
q_shift = shift ;
q_contents = contents }
(* Update the current location with file name and line number. *)
let update_loc c file line absolute chars =
let lexbuf = c.lexbuf in
let pos = lexbuf.lex_curr_p in
let new_file = match file with
| None -> pos.pos_fname
| Some s -> s
in
lexbuf.lex_curr_p <- { pos with
pos_fname = new_file;
pos_lnum = if absolute then line else pos.pos_lnum + line;
pos_bol = pos.pos_cnum - chars;
}
let err error loc =
raise(Loc.Exc_located(loc, Error.E error))
let warn error loc =
Format.eprintf "Warning: %a: %a@." Loc.print loc Error.print error
}
let newline = ('\010' | '\013' | "\013\010")
let blank = [' ' '\009' '\012']
let lowercase = ['a'-'z' '\223'-'\246' '\248'-'\255' '_']
let uppercase = ['A'-'Z' '\192'-'\214' '\216'-'\222']
let identchar =
['A'-'Z' 'a'-'z' '_' '\192'-'\214' '\216'-'\246' '\248'-'\255' '\'' '0'-'9']
let ident = (lowercase|uppercase) identchar*
let locname = ident
let not_star_symbolchar =
['$' '!' '%' '&' '+' '-' '.' '/' ':' '<' '=' '>' '?' '@' '^' '|' '~' '\\']
let symbolchar = '*' | not_star_symbolchar
let quotchar =
['!' '%' '&' '+' '-' '.' '/' ':' '=' '?' '@' '^' '|' '~' '\\' '*']
let hexa_char = ['0'-'9' 'A'-'F' 'a'-'f']
let decimal_literal =
['0'-'9'] ['0'-'9' '_']*
let hex_literal =
'0' ['x' 'X'] hexa_char ['0'-'9' 'A'-'F' 'a'-'f' '_']*
let oct_literal =
'0' ['o' 'O'] ['0'-'7'] ['0'-'7' '_']*
let bin_literal =
'0' ['b' 'B'] ['0'-'1'] ['0'-'1' '_']*
let int_literal =
decimal_literal | hex_literal | oct_literal | bin_literal
let float_literal =
['0'-'9'] ['0'-'9' '_']*
('.' ['0'-'9' '_']* )?
(['e' 'E'] ['+' '-']? ['0'-'9'] ['0'-'9' '_']*)?
(* Delimitors are extended (from 3.09) in a conservative way *)
(* These chars that can't start an expression or a pattern: *)
let safe_delimchars = ['%' '&' '/' '@' '^']
(* These symbols are unsafe since "[<", "[|", etc. exsist. *)
let delimchars = safe_delimchars | ['|' '<' '>' ':' '=' '.']
let left_delims = ['(' '[' '{']
let right_delims = [')' ']' '}']
let left_delimitor =
(* At least a safe_delimchars *)
left_delims delimchars* safe_delimchars (delimchars|left_delims)*
(* A '(' or a new super '(' without "(<" *)
| '(' (['|' ':'] delimchars*)?
(* Old brackets, no new brackets starting with "[|" or "[:" *)
| '[' ['|' ':']?
(* Old "[<","{<" and new ones *)
| ['[' '{'] delimchars* '<'
(* Old brace and new ones *)
| '{' (['|' ':'] delimchars*)?
let right_delimitor =
(* At least a safe_delimchars *)
(delimchars|right_delims)* safe_delimchars (delimchars|right_delims)* right_delims
(* A ')' or a new super ')' without ">)" *)
| (delimchars* ['|' ':'])? ')'
(* Old brackets, no new brackets ending with "|]" or ":]" *)
| ['|' ':']? ']'
(* Old ">]",">}" and new ones *)
| '>' delimchars* [']' '}']
(* Old brace and new ones *)
| (delimchars* ['|' ':'])? '}'
rule token c = parse
| newline { update_loc c None 1 false 0; NEWLINE }
| blank + as x { BLANKS x }
| "~" (lowercase identchar * as x) ':' { LABEL x }
| "?" (lowercase identchar * as x) ':' { OPTLABEL x }
| lowercase identchar * as x { LIDENT x }
| uppercase identchar * as x { UIDENT x }
| int_literal as i
{ try INT(int_of_string i, i)
with Failure _ -> err (Literal_overflow "int") (Loc.of_lexbuf lexbuf) }
| float_literal as f
{ try FLOAT(float_of_string f, f)
with Failure _ -> err (Literal_overflow "float") (Loc.of_lexbuf lexbuf) }
| (int_literal as i) "l"
{ try INT32(Int32.of_string i, i)
with Failure _ -> err (Literal_overflow "int32") (Loc.of_lexbuf lexbuf) }
| (int_literal as i) "L"
{ try INT64(Int64.of_string i, i)
with Failure _ -> err (Literal_overflow "int64") (Loc.of_lexbuf lexbuf) }
| (int_literal as i) "n"
{ try NATIVEINT(Nativeint.of_string i, i)
with Failure _ -> err (Literal_overflow "nativeint") (Loc.of_lexbuf lexbuf) }
| '"'
{ with_curr_loc string c;
let s = buff_contents c in STRING (TokenEval.string s, s) }
| "'" (newline as x) "'"
{ update_loc c None 1 false 1; CHAR (TokenEval.char x, x) }
| "'" ( [^ '\\' '\010' '\013']
| '\\' (['\\' '"' 'n' 't' 'b' 'r' ' ' '\'']
|['0'-'9'] ['0'-'9'] ['0'-'9']
|'x' hexa_char hexa_char)
as x) "'" { CHAR (TokenEval.char x, x) }
| "'\\" (_ as c)
{ err (Illegal_escape (String.make 1 c)) (Loc.of_lexbuf lexbuf) }
| "(*"
{ store c; COMMENT(parse_nested comment (in_comment c)) }
| "(*)"
{ warn Comment_start (Loc.of_lexbuf lexbuf) ;
parse comment (in_comment c); COMMENT (buff_contents c) }
| "*)"
{ warn Comment_not_end (Loc.of_lexbuf lexbuf) ;
move_start_p (-1) c; SYMBOL "*" }
| "<<" (quotchar* as beginning)
{ if quotations c
then (move_start_p (-String.length beginning);
mk_quotation quotation c "" "" 2)
else parse (symbolchar_star ("<<" ^ beginning)) c }
| "<<>>"
{ if quotations c
then QUOTATION { q_name = ""; q_loc = ""; q_shift = 2; q_contents = "" }
else parse (symbolchar_star "<<>>") c }
| "<@"
{ if quotations c then with_curr_loc maybe_quotation_at c
else parse (symbolchar_star "<@") c }
| "<:"
{ if quotations c then with_curr_loc maybe_quotation_colon c
else parse (symbolchar_star "<:") c }
| "#" [' ' '\t']* (['0'-'9']+ as num) [' ' '\t']*
("\"" ([^ '\010' '\013' '"' ] * as name) "\"")?
[^ '\010' '\013'] * newline
{ let inum = int_of_string num
in update_loc c name inum true 0; LINE_DIRECTIVE(inum, name) }
| '(' (not_star_symbolchar as op) ')'
{ ESCAPED_IDENT (String.make 1 op) }
| '(' (not_star_symbolchar symbolchar* not_star_symbolchar as op) ')'
{ ESCAPED_IDENT op }
| '(' (not_star_symbolchar symbolchar* as op) blank+ ')'
{ ESCAPED_IDENT op }
| '(' blank+ (symbolchar* not_star_symbolchar as op) ')'
{ ESCAPED_IDENT op }
| '(' blank+ (symbolchar+ as op) blank+ ')'
{ ESCAPED_IDENT op }
| ( "#" | "`" | "'" | "," | "." | ".." | ":" | "::"
| ":=" | ":>" | ";" | ";;" | "_"
| left_delimitor | right_delimitor ) as x { SYMBOL x }
| '$' { if antiquots c
then with_curr_loc dollar (shift 1 c)
else parse (symbolchar_star "$") c }
| ['~' '?' '!' '=' '<' '>' '|' '&' '@' '^' '+' '-' '*' '/' '%' '\\'] symbolchar *
as x { SYMBOL x }
| eof
{ let pos = lexbuf.lex_curr_p in
lexbuf.lex_curr_p <- { pos with pos_bol = pos.pos_bol + 1 ;
pos_cnum = pos.pos_cnum + 1 }; EOI }
| _ as c { err (Illegal_character c) (Loc.of_lexbuf lexbuf) }
and comment c = parse
"(*"
{ store c; with_curr_loc comment c; parse comment c }
| "*)" { store c }
| '<' (':' ident)? ('@' locname)? '<'
{ store c;
if quotations c then with_curr_loc quotation c; parse comment c }
| ident { store_parse comment c }
| "\""
{ store c;
begin try with_curr_loc string c
with Loc.Exc_located(_, Error.E Unterminated_string) ->
err Unterminated_string_in_comment (loc c)
end;
Buffer.add_char c.buffer '"';
parse comment c }
| "''" { store_parse comment c }
| "'''" { store_parse comment c }
| "'" newline "'"
{ update_loc c None 1 false 1; store_parse comment c }
| "'" [^ '\\' '\'' '\010' '\013' ] "'" { store_parse comment c }
| "'\\" ['\\' '"' '\'' 'n' 't' 'b' 'r' ' '] "'" { store_parse comment c }
| "'\\" ['0'-'9'] ['0'-'9'] ['0'-'9'] "'" { store_parse comment c }
| "'\\" 'x' hexa_char hexa_char "'" { store_parse comment c }
| eof
{ err Unterminated_comment (loc c) }
| newline
{ update_loc c None 1 false 0; store_parse comment c }
| _ { store_parse comment c }
and string c = parse
'"' { set_start_p c }
| '\\' newline ([' ' '\t'] * as space)
{ update_loc c None 1 false (String.length space);
store_parse string c }
| '\\' ['\\' '"' 'n' 't' 'b' 'r' ' ' '\''] { store_parse string c }
| '\\' ['0'-'9'] ['0'-'9'] ['0'-'9'] { store_parse string c }
| '\\' 'x' hexa_char hexa_char { store_parse string c }
| '\\' (_ as x)
{ if is_in_comment c
then store_parse string c
else begin
warn (Illegal_escape (String.make 1 x)) (Loc.of_lexbuf lexbuf);
store_parse string c
end }
| newline
{ update_loc c None 1 false 0; store_parse string c }
| eof { err Unterminated_string (loc c) }
| _ { store_parse string c }
and symbolchar_star beginning c = parse
| symbolchar* as tok { move_start_p (-String.length beginning) c ;
SYMBOL(beginning ^ tok) }
and maybe_quotation_at c = parse
| (ident as loc) '<'
{ mk_quotation quotation c "" loc (1 + String.length loc) }
| symbolchar* as tok { SYMBOL("<@" ^ tok) }
and maybe_quotation_colon c = parse
| (ident as name) '<'
{ mk_quotation quotation c name "" (1 + String.length name) }
| (ident as name) '@' (locname as loc) '<'
{ mk_quotation quotation c name loc
(2 + String.length loc + String.length name) }
| symbolchar* as tok { SYMBOL("<:" ^ tok) }
and quotation c = parse
| '<' (':' ident)? ('@' locname)? '<' { store c ;
with_curr_loc quotation c ;
parse quotation c }
| ">>" { store c }
| eof { err Unterminated_quotation (loc c) }
| newline { update_loc c None 1 false 0 ;
store_parse quotation c }
| _ { store_parse quotation c }
and dollar c = parse
| '$' { set_start_p c; ANTIQUOT("", "") }
| ('`'? (identchar*|['.' '!']+) as name) ':'
{ with_curr_loc (antiquot name) (shift (1 + String.length name) c) }
| _ { store_parse (antiquot "") c }
and antiquot name c = parse
| '$' { set_start_p c; ANTIQUOT(name, buff_contents c) }
| eof { err Unterminated_antiquot (loc c) }
| newline
{ update_loc c None 1 false 0; store_parse (antiquot name) c }
| '<' (':' ident)? ('@' locname)? '<'
{ store c; with_curr_loc quotation c; parse (antiquot name) c }
| _ { store_parse (antiquot name) c }
{
let lexing_store s buff max =
let rec self n s =
if n >= max then n
else
match Stream.peek s with
| Some x ->
Stream.junk s;
buff.[n] <- x;
succ n
| _ -> n
in
self 0 s
let from_context c =
let next _ =
let tok = with_curr_loc token c in
let loc = Loc.of_lexbuf c.lexbuf in
Some ((tok, loc))
in Stream.from next
let from_lexbuf ?(quotations = true) lb =
let c = { (default_context lb) with
loc = Loc.of_lexbuf lb;
antiquots = !Camlp4_config.antiquotations;
quotations = quotations }
in from_context c
let setup_loc lb loc =
let start_pos = Loc.start_pos loc in
lb.lex_abs_pos <- start_pos.pos_cnum;
lb.lex_curr_p <- start_pos
let from_string ?quotations loc str =
let lb = Lexing.from_string str in
setup_loc lb loc;
from_lexbuf ?quotations lb
let from_stream ?quotations loc strm =
let lb = Lexing.from_function (lexing_store strm) in
setup_loc lb loc;
from_lexbuf ?quotations lb
let mk () loc strm =
from_stream ~quotations:!Camlp4_config.quotations loc strm
end
}