diff --git a/README.md b/README.md index 803e7ba..d327dc2 100644 --- a/README.md +++ b/README.md @@ -16,11 +16,31 @@ and lexing in a single top down grammar. For a basic overview of the subject, se ### Parser Format -Parsers are contained within a PHP file, in a special comment block that starts with `/*Parser:NameOfParser` and continues until the -comment is closed. During compilation this block will be replaced with a set of matching functions. +Parsers are contained within a PHP file, in one or more special comment blocks that start with `/*!* [name | !pragma]` (like a docblock, but with an +exclamation mark in the middle of the stars) -Lexically, the parser is a name token, a matching rule and a set of functions. The name token must not start with whitespace, contain no whitespace -and end with a `:` character. The rule and function set are on the same line or on the indented lines below. +Lexically, these blocks are a set of rules, each consisting of a name token, a matching rule and a set of attached functions. +The name token must contain no whitespace and end with a `:` character. The matching rule and functions are on the same line or on the indented lines below. + +You can have multiple comment blocks, all of which are treated as contiguous for the purpose of compiling. During compilation these blocks will be replaced +with a set of "matching" functions (functions which match a string against their rules) for each rule in the block. + +The optional name marks the start of a new set of parser rules. This is currently unused, but might be used in future for optimization & debugging purposes. +If unspecified, it defaults to the same name as the previous parser comment block, or 'Anonymous Parser' if no name has ever been set. + +If the name starts with an '!' 
symbol, that comment block is a pragma, and is treated not as some part of the parser, but as a special block of meta-data + +##### Tricks and traps + +We allow indenting a parser block, but only in a consistent manner - whatever the indent of the /*!* marker becomes the "base" indent, and needs to be used +for all lines. You can mix tabs and spaces, but the indent must always be an exact match - if the "base" indent is a tab then two spaces, every line within the +block also needs indenting with a tab then two spaces, not two tabs (even if in your editor, that gives the same indent). + +Any line with more than the "base" indent is considered a continuation of the previous rule + +Any line with less than the "base" indent is an error + +This might get looser if I get around to re-writing the internal "parser parser" in php-peg, bootstrapping the whole thing ### Rules @@ -58,8 +78,11 @@ Tokens may be ##### Regular expression tokens -Automatically anchored to the current string start - do not include a string start anchor (`^`) anywhere. -Can specify flags on stand-alone regexs. Currently doesn't handle flags on regexs with rules. +Automatically anchored to the current string start - do not include a string start anchor (`^`) anywhere. Always acts as when the 'x' flag is enabled in PHP - +whitespace is ignored unless escaped, and '#' starts a comment. + +Be careful when ending a regular expression token - the '*/' pattern (as in /foo\s*/) will end a PHP comment. Since the 'x' flag is always active, +just split with a space (as in / foo \s* /) ### Expressions @@ -183,6 +206,19 @@ You can also specify a rule-attached function called `*`, which will be called w By default all matches are added to the 'text' property of a result. By prepending a member with `.` that match will not be added to the ['text'] member. This doesn't affect the other result properties that named rules' add. 
+### Pragmas + +When opening a parser comment block, if instead of a name (or no name) you put a word starting with '!', that comment block is treated as a pragma - not +part of the parser language itself, but some other instruction to the compiler. These pragmas are currently understood: + + !silent + + This is a comment that should only appear in the source code. Don't output it in the generated code + + !insert_autogen_warning + + Insert a warning comment into the generated code at this point, warning that the file is autogenerated and not to edit it + ## TODO - Allow configuration of whitespace - specify what matches, and wether it should be injected into results as-is, collapsed, or not at all diff --git a/examples/CalculatedLiterals.peg.inc b/examples/CalculatedLiterals.peg.inc index cc8ea73..018d4fa 100644 --- a/examples/CalculatedLiterals.peg.inc +++ b/examples/CalculatedLiterals.peg.inc @@ -4,7 +4,7 @@ require '../Parser.php' ; class CalculatedLiterals extends Parser { -/*Parser:CalculatedLiterals +/*!* CalculatedLiterals string: ( /\\./ | /[^${parent.q}]/ )* diff --git a/examples/Calculator.peg.inc b/examples/Calculator.peg.inc index cffe786..f15d00c 100644 --- a/examples/Calculator.peg.inc +++ b/examples/Calculator.peg.inc @@ -4,7 +4,7 @@ require '../Parser.php' ; class Calculator extends Parser { -/*Parser:Calculator +/*!* Calculator Number: /[0-9]+/ Value: Number > | '(' > Expr > ')' > diff --git a/examples/EqualRepeat.peg.inc b/examples/EqualRepeat.peg.inc index 87e484f..ad7ec74 100644 --- a/examples/EqualRepeat.peg.inc +++ b/examples/EqualRepeat.peg.inc @@ -10,7 +10,7 @@ class EqualRepeat extends Packrat { * aabbacc - bad */ -/*Parser:Grammar1 +/*!* Grammar1 A: "a" A? "b" B: "b" B? 
"c" T: !"b" diff --git a/examples/Rfc822.peg.inc b/examples/Rfc822.peg.inc index 73bef48..839491d 100644 --- a/examples/Rfc822.peg.inc +++ b/examples/Rfc822.peg.inc @@ -8,7 +8,7 @@ require '../Parser.php'; */ class Rfc822 extends Parser { -/*Parser:Rfc822 +/*!* Rfc822 crlf: /\r\n/ diff --git a/examples/Rfc822UTF8.peg.inc b/examples/Rfc822UTF8.peg.inc index 25ec5ff..b2605a6 100644 --- a/examples/Rfc822UTF8.peg.inc +++ b/examples/Rfc822UTF8.peg.inc @@ -8,7 +8,7 @@ require 'Rfc822.php'; */ class Rfc822UTF8 extends Rfc822 { -/*Parser:Rfc822UTF8 +/*!* Rfc822UTF8 crlf: /\r\n/u