Skip to content

Commit

Permalink
refactor(compiler): introduce block parsing in lexer (#50895)
Browse files Browse the repository at this point in the history
⚠️Disclaimer⚠️ this PR implements syntax that is still in an open RFC. It will be adjusted once the RFC is closed.

These changes extend the lexer to recognize the concepts of a block group (`{#foo paramA; paramB}{/foo}`) and a block (`{:foo paramA; paramB;}`) which will be useful later on for the control flow and defer proposals. Block groups can be used anywhere and require a closing tag, while blocks can only be used inside of a block group.

The idea is that in the next PRs the markup AST will be expanded to include more specialized nodes like `ConditionalBlock` or `DeferBlock`, which will then be turned into instructions.

PR Close #50895
  • Loading branch information
crisbeto authored and dylhunn committed Jul 11, 2023
1 parent 8758517 commit 29aaded
Show file tree
Hide file tree
Showing 3 changed files with 364 additions and 6 deletions.
113 changes: 109 additions & 4 deletions packages/compiler/src/ml_parser/lexer.ts
Expand Up @@ -89,6 +89,12 @@ export interface TokenizeOptions {
* If true, do not convert CRLF to LF.
*/
preserveLineEndings?: boolean;

// TODO(crisbeto): temporary option to limit access to the block syntax.
/**
* Whether the block syntax is enabled at the compiler level.
*/
tokenizeBlocks?: boolean;
}

export function tokenize(
Expand Down Expand Up @@ -136,8 +142,8 @@ class _Tokenizer {
private _expansionCaseStack: TokenType[] = [];
private _inInterpolation: boolean = false;
private readonly _preserveLineEndings: boolean;
private readonly _escapedString: boolean;
private readonly _i18nNormalizeLineEndingsInICUs: boolean;
private readonly _tokenizeBlocks: boolean;
tokens: Token[] = [];
errors: TokenError[] = [];
nonNormalizedIcuExpressions: Token[] = [];
Expand All @@ -159,8 +165,8 @@ class _Tokenizer {
this._cursor = options.escapedString ? new EscapedCharacterCursor(_file, range) :
new PlainCharacterCursor(_file, range);
this._preserveLineEndings = options.preserveLineEndings || false;
this._escapedString = options.escapedString || false;
this._i18nNormalizeLineEndingsInICUs = options.i18nNormalizeLineEndingsInICUs || false;
this._tokenizeBlocks = options.tokenizeBlocks || false;
try {
this._cursor.init();
} catch (e) {
Expand Down Expand Up @@ -197,6 +203,12 @@ class _Tokenizer {
} else {
this._consumeTagOpen(start);
}
} else if (this._tokenizeBlocks && this._attemptStr('{#')) {
this._consumeBlockGroupOpen(start);
} else if (this._tokenizeBlocks && this._attemptStr('{/')) {
this._consumeBlockGroupClose(start);
} else if (this._tokenizeBlocks && this._attemptStr('{:')) {
this._consumeBlock(start);
} else if (!(this._tokenizeIcu && this._tokenizeExpansionForm())) {
// In (possibly interpolated) text the end of the text is given by `isTextEnd()`, while
// the premature end of an interpolation is given by the start of a new HTML element.
Expand All @@ -212,6 +224,72 @@ class _Tokenizer {
this._endToken([]);
}

/**
 * Consumes the open tag of a block group, e.g. `{#foo param1; param2}`.
 * Emits a `BLOCK_GROUP_OPEN_START` token carrying the group name, one
 * `BLOCK_PARAMETER` token per parameter, and a `BLOCK_GROUP_OPEN_END`
 * token for the closing brace.
 * @param start Cursor pointing at the location where the `{#` began.
 */
private _consumeBlockGroupOpen(start: CharacterCursor) {
this._beginToken(TokenType.BLOCK_GROUP_OPEN_START, start);
// The group name runs until the first character that is not a valid name character.
const nameStart = this._cursor.clone();
this._attemptCharCodeUntilFn(charCode => !isBlockNameChar(charCode));
const groupName = this._cursor.getChars(nameStart);
this._endToken([groupName]);
this._consumeBlockParameters();
// After the parameters only the closing brace of the open tag remains.
this._beginToken(TokenType.BLOCK_GROUP_OPEN_END);
this._requireCharCode(chars.$RBRACE);
this._endToken([]);
}

/**
 * Consumes the closing tag of a block group, e.g. `{/foo}`, emitting a
 * single `BLOCK_GROUP_CLOSE` token that carries the group name.
 * @param start Cursor pointing at the location where the `{/` began.
 */
private _consumeBlockGroupClose(start: CharacterCursor) {
this._beginToken(TokenType.BLOCK_GROUP_CLOSE, start);
// Read the name up to the first non-name character, then require the
// closing brace before the token is finalized.
const nameStart = this._cursor.clone();
this._attemptCharCodeUntilFn(charCode => !isBlockNameChar(charCode));
const closedName = this._cursor.getChars(nameStart);
this._requireCharCode(chars.$RBRACE);
this._endToken([closedName]);
}

/**
 * Consumes a block open tag, e.g. `{:foo param1; param2}`. Emits a
 * `BLOCK_OPEN_START` token carrying the block name, one `BLOCK_PARAMETER`
 * token per parameter, and a `BLOCK_OPEN_END` token for the closing brace.
 * @param start Cursor pointing at the location where the `{:` began.
 */
private _consumeBlock(start: CharacterCursor) {
this._beginToken(TokenType.BLOCK_OPEN_START, start);
// Capture the block name, which extends until the first non-name character.
const nameStart = this._cursor.clone();
this._attemptCharCodeUntilFn(charCode => !isBlockNameChar(charCode));
const blockName = this._cursor.getChars(nameStart);
this._endToken([blockName]);
this._consumeBlockParameters();
// The parameter list is followed by the closing brace of the open tag.
this._beginToken(TokenType.BLOCK_OPEN_END);
this._requireCharCode(chars.$RBRACE);
this._endToken([]);
}

/**
 * Consumes the semicolon-delimited parameters of a block or block group
 * open tag, emitting one `BLOCK_PARAMETER` token per parameter. Stops when
 * the cursor reaches the closing `}` of the tag (or EOF); the brace itself
 * is left for the caller to consume.
 */
private _consumeBlockParameters() {
// Trim the whitespace until the first parameter.
this._attemptCharCodeUntilFn(isBlockParameterChar);

while (this._cursor.peek() !== chars.$RBRACE && this._cursor.peek() !== chars.$EOF) {
this._beginToken(TokenType.BLOCK_PARAMETER);
const start = this._cursor.clone();
// Tracks the quote character of the string currently being consumed, or
// null when the cursor is not inside a string.
let inQuote: number|null = null;

// Consume the parameter until the next semicolon or brace.
// Note that we skip over semicolons/braces inside of strings.
while ((this._cursor.peek() !== chars.$SEMICOLON && this._cursor.peek() !== chars.$RBRACE &&
this._cursor.peek() !== chars.$EOF) ||
inQuote !== null) {
const char = this._cursor.peek();

// Skip to the next character if it was escaped.
if (char === chars.$BACKSLASH) {
this._cursor.advance();
} else if (char === inQuote) {
// Matching closing quote — the string has ended.
inQuote = null;
} else if (inQuote === null && chars.isQuote(char)) {
// Entering a string; remember which quote character opened it so only
// the matching one can close it.
inQuote = char;
}

this._cursor.advance();
}

this._endToken([this._cursor.getChars(start)]);

// Skip to the next parameter.
this._attemptCharCodeUntilFn(isBlockParameterChar);
}
}

/**
* @returns whether an ICU token has been created
* @internal
Expand Down Expand Up @@ -578,7 +656,6 @@ class _Tokenizer {
}

private _consumeAttributeValue() {
let value: string;
if (this._cursor.peek() === chars.$SQ || this._cursor.peek() === chars.$DQ) {
const quoteChar = this._cursor.peek();
this._consumeQuote(quoteChar);
Expand Down Expand Up @@ -795,7 +872,7 @@ class _Tokenizer {
}

private _isTextEnd(): boolean {
if (this._isTagStart() || this._cursor.peek() === chars.$EOF) {
if (this._isTagStart() || this._isBlockStart() || this._cursor.peek() === chars.$EOF) {
return true;
}

Expand Down Expand Up @@ -833,6 +910,26 @@ class _Tokenizer {
return false;
}

/**
 * Returns whether the cursor is positioned at the start of a block group
 * open (`{#`), block group close (`{/`) or block (`{:`) marker that is
 * followed by a valid block name character. Always false when block
 * tokenization is disabled.
 */
private _isBlockStart(): boolean {
if (this._tokenizeBlocks && this._cursor.peek() === chars.$LBRACE) {
const tmp = this._cursor.clone();

// Check that the cursor is on a `{#`, `{/` or `{:`.
tmp.advance();
const next = tmp.peek();
// Note: `$HASH` (`#`) matches the `{#` prefix consumed by `_tokenize`/
// `_consumeBlockGroupOpen`; the previous `$BANG` (`!`) check could never
// match a block group open, so text would not end at `{#foo`.
if (next !== chars.$HASH && next !== chars.$SLASH && next !== chars.$COLON) {
return false;
}

// If it is, also verify that the next character is a valid block identifier.
tmp.advance();
if (isBlockNameChar(tmp.peek())) {
return true;
}
}
return false;
}

private _readUntil(char: number): string {
const start = this._cursor.clone();
this._attemptUntilChar(char);
Expand Down Expand Up @@ -900,6 +997,14 @@ function toUpperCaseCharCode(code: number): number {
return code >= chars.$a && code <= chars.$z ? code - chars.$a + chars.$A : code;
}

/** Whether a character code may appear in a block or block group name (letters, digits or `_`). */
function isBlockNameChar(code: number): boolean {
  return code === chars.$_ || chars.isDigit(code) || chars.isAsciiLetter(code);
}

/** Whether a character code may start/continue a block parameter (non-whitespace, not `;`). */
function isBlockParameterChar(code: number): boolean {
  return isNotWhitespace(code) && code !== chars.$SEMICOLON;
}

function mergeTextTokens(srcTokens: Token[]): Token[] {
const dstTokens: Token[] = [];
let lastDstToken: Token|undefined = undefined;
Expand Down
41 changes: 39 additions & 2 deletions packages/compiler/src/ml_parser/tokens.ts
Expand Up @@ -33,15 +33,22 @@ export const enum TokenType {
EXPANSION_CASE_EXP_START,
EXPANSION_CASE_EXP_END,
EXPANSION_FORM_END,
EOF
EOF,
BLOCK_GROUP_OPEN_START,
BLOCK_GROUP_OPEN_END,
BLOCK_GROUP_CLOSE,
BLOCK_PARAMETER,
BLOCK_OPEN_START,
BLOCK_OPEN_END,
}

/** Union of every token the markup lexer can produce, including the block tokens. */
export type Token = TagOpenStartToken|TagOpenEndToken|TagOpenEndVoidToken|TagCloseToken|
IncompleteTagOpenToken|TextToken|InterpolationToken|EncodedEntityToken|CommentStartToken|
CommentEndToken|CdataStartToken|CdataEndToken|AttributeNameToken|AttributeQuoteToken|
AttributeValueTextToken|AttributeValueInterpolationToken|DocTypeToken|ExpansionFormStartToken|
ExpansionCaseValueToken|ExpansionCaseExpressionStartToken|ExpansionCaseExpressionEndToken|
ExpansionFormEndToken|EndOfFileToken|BlockGroupOpenStartToken|BlockGroupOpenEndToken|
BlockGroupCloseToken|BlockParameterToken|BlockOpenStartToken|BlockOpenEndToken;

/** Tokens that can make up a run of (possibly interpolated) text. */
export type InterpolatedTextToken = TextToken|InterpolationToken|EncodedEntityToken;

Expand Down Expand Up @@ -170,3 +177,33 @@ export interface EndOfFileToken extends TokenBase {
type: TokenType.EOF;
parts: [];
}

/** Start of a block group open tag, e.g. the `{#foo` in `{#foo param}`. */
export interface BlockGroupOpenStartToken extends TokenBase {
type: TokenType.BLOCK_GROUP_OPEN_START;
// Single part holding the block group name (e.g. `foo`).
parts: [name: string];
}

/** Closing `}` of a block group open tag; carries no data. */
export interface BlockGroupOpenEndToken extends TokenBase {
type: TokenType.BLOCK_GROUP_OPEN_END;
parts: [];
}

/** Block group closing tag, e.g. `{/foo}`. */
export interface BlockGroupCloseToken extends TokenBase {
type: TokenType.BLOCK_GROUP_CLOSE;
// Single part holding the name of the group being closed (e.g. `foo`).
parts: [name: string];
}

/** One parameter of a block or block group open tag. */
export interface BlockParameterToken extends TokenBase {
type: TokenType.BLOCK_PARAMETER;
// Single part holding the raw parameter text (semicolon-delimited in the source).
parts: [expression: string];
}

/** Start of a block open tag, e.g. the `{:foo` in `{:foo param}`. */
export interface BlockOpenStartToken extends TokenBase {
type: TokenType.BLOCK_OPEN_START;
// Single part holding the block name (e.g. `foo`).
parts: [name: string];
}

/** Closing `}` of a block open tag; carries no data. */
export interface BlockOpenEndToken extends TokenBase {
type: TokenType.BLOCK_OPEN_END;
parts: [];
}

0 comments on commit 29aaded

Please sign in to comment.