Use TokenSource to find new location for re-lexing (#12060)
## Summary

This PR splits the re-lexing logic into two parts:
1. `TokenSource`: The token source is responsible for finding the
position the lexer needs to be moved to
2. `Lexer`: The lexer is responsible for reducing the nesting level
and moving itself to the new position if it recovered from a
parenthesized context

This split makes it easy to find the new lexer position without needing
to re-implement the backwards-lexing logic, which would need to handle
cases involving:
* Different kinds of newlines
* Line continuation character(s)
* Comments
* Whitespace

### F-strings

This change did reveal one thing about re-lexing f-strings. Consider the
following example:
```py
f'{'
#  ^
f'foo'
```

Here, the quote highlighted by the caret (`^`) is the start of a
string inside an f-string expression. This string is unterminated, which
means the token emitted is actually `Unknown`. The parser tries to
recover from it, but there's no newline token in the vector, so the new
logic doesn't recover. The previous logic does recover because
it looks at the raw characters instead.

The parser would be at `FStringStart` (the one for the second line) when
it calls into the re-lexing logic to recover from an unterminated
f-string on the first line. So, moving backwards, the first character
encountered is a newline character, but the first token encountered is an
`Unknown` token.

This is improved with #12067 

fixes: #12046 
fixes: #12036

## Test Plan

Update the snapshot and validate the changes.
dhruvmanila committed Jun 27, 2024
1 parent e137c82 commit a4688ae
Showing 3 changed files with 49 additions and 82 deletions.
109 changes: 32 additions & 77 deletions crates/ruff_python_parser/src/lexer.rs
@@ -1319,7 +1319,8 @@ impl<'src> Lexer<'src> {
}
}

/// Re-lex the current token in the context of a logical line.
/// Re-lex the [`NonLogicalNewline`] token at the given position in the context of a logical
/// line.
///
/// Returns a boolean indicating whether the lexer's position has changed. This could result
/// into the new current token being different than the previous current token but is not
@@ -1373,7 +1374,10 @@ impl<'src> Lexer<'src> {
///
/// [`Newline`]: TokenKind::Newline
/// [`NonLogicalNewline`]: TokenKind::NonLogicalNewline
pub(crate) fn re_lex_logical_token(&mut self) -> bool {
pub(crate) fn re_lex_logical_token(
&mut self,
non_logical_newline_start: Option<TextSize>,
) -> bool {
if self.nesting == 0 {
return false;
}
@@ -1388,84 +1392,35 @@ impl<'src> Lexer<'src> {
return false;
}

let mut current_position = self.current_range().start();
let mut reverse_chars = self.source[..current_position.to_usize()]
.chars()
.rev()
.peekable();
let mut newline_position = None;

while let Some(ch) = reverse_chars.next() {
if is_python_whitespace(ch) {
current_position -= ch.text_len();
continue;
}

match ch {
'\n' => {
current_position -= ch.text_len();
if let Some(carriage_return) = reverse_chars.next_if_eq(&'\r') {
current_position -= carriage_return.text_len();
}
}
'\r' => {
current_position -= ch.text_len();
}
_ => break,
}

debug_assert!(matches!(ch, '\n' | '\r'));

// Count the number of backslashes before the newline character.
let mut backslash_count = 0;
while reverse_chars.next_if_eq(&'\\').is_some() {
backslash_count += 1;
}
let Some(new_position) = non_logical_newline_start else {
return false;
};

if backslash_count == 0 {
// No escapes: `\n`
newline_position = Some(current_position);
} else {
if backslash_count % 2 == 0 {
// Even number of backslashes i.e., all backslashes cancel each other out
// which means the newline character is not being escaped.
newline_position = Some(current_position);
}
current_position -= TextSize::new('\\'.text_len().to_u32() * backslash_count);
}
// Earlier we reduced the nesting level unconditionally. Now that we know the lexer's
// position is going to be moved back, the lexer needs to be put back into a
// parenthesized context if the current token is a closing parenthesis.
//
// ```py
// (a, [b,
// c
// )
// ```
//
// Here, the parser would request to re-lex the token when it's at `)` and can recover
// from an unclosed `[`. This method will move the lexer back to the newline character
// after `c` which means it goes back into parenthesized context.
if matches!(
self.current_kind,
TokenKind::Rpar | TokenKind::Rsqb | TokenKind::Rbrace
) {
self.nesting += 1;
}

// The lexer should only be moved if there's a newline character which needs to be
// re-lexed.
if let Some(newline_position) = newline_position {
// Earlier we reduced the nesting level unconditionally. Now that we know the lexer's
// position is going to be moved back, the lexer needs to be put back into a
// parenthesized context if the current token is a closing parenthesis.
//
// ```py
// (a, [b,
// c
// )
// ```
//
// Here, the parser would request to re-lex the token when it's at `)` and can recover
// from an unclosed `[`. This method will move the lexer back to the newline character
// after `c` which means it goes back into parenthesized context.
if matches!(
self.current_kind,
TokenKind::Rpar | TokenKind::Rsqb | TokenKind::Rbrace
) {
self.nesting += 1;
}

self.cursor = Cursor::new(self.source);
self.cursor.skip_bytes(newline_position.to_usize());
self.state = State::Other;
self.next_token();
true
} else {
false
}
self.cursor = Cursor::new(self.source);
self.cursor.skip_bytes(new_position.to_usize());
self.state = State::Other;
self.next_token();
true
}

#[inline]
17 changes: 14 additions & 3 deletions crates/ruff_python_parser/src/token_source.rs
@@ -60,12 +60,23 @@ impl<'src> TokenSource<'src> {
self.lexer.take_value()
}

/// Calls the underlying [`re_lex_logical_token`] method on the lexer and updates the token
/// vector accordingly.
/// Calls the underlying [`re_lex_logical_token`] method on the lexer with the new lexer
/// position and updates the token vector accordingly.
///
/// [`re_lex_logical_token`]: Lexer::re_lex_logical_token
pub(crate) fn re_lex_logical_token(&mut self) {
if self.lexer.re_lex_logical_token() {
let mut non_logical_newline_start = None;
for token in self.tokens.iter().rev() {
match token.kind() {
TokenKind::NonLogicalNewline => {
non_logical_newline_start = Some(token.start());
}
TokenKind::Comment => continue,
_ => break,
}
}

if self.lexer.re_lex_logical_token(non_logical_newline_start) {
let current_start = self.current_range().start();
while self
.tokens
@@ -82,8 +82,9 @@ Module(

|
1 | call(a, b, # comment \
2 | /
3 | | def bar():
| _______________________^
2 | |
| |_^ Syntax Error: Expected ')', found newline
3 | def bar():
4 | pass
|
