Skip to content

Commit

Permalink
Use TokenSource to find new location for re-lexing
Browse files Browse the repository at this point in the history
  • Loading branch information
dhruvmanila committed Jun 27, 2024
1 parent 55f4812 commit f7fd083
Show file tree
Hide file tree
Showing 4 changed files with 149 additions and 268 deletions.
110 changes: 33 additions & 77 deletions crates/ruff_python_parser/src/lexer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1314,7 +1314,8 @@ impl<'src> Lexer<'src> {
}
}

/// Re-lex the current token in the context of a logical line.
/// Re-lex the [`NonLogicalNewline`] token at the given position in the context of a logical
/// line.
///
/// Returns a boolean indicating whether the lexer's position has changed. This could result
/// in the new current token being different from the previous current token but is not
Expand Down Expand Up @@ -1368,7 +1369,10 @@ impl<'src> Lexer<'src> {
///
/// [`Newline`]: TokenKind::Newline
/// [`NonLogicalNewline`]: TokenKind::NonLogicalNewline
pub(crate) fn re_lex_logical_token(&mut self) -> bool {
pub(crate) fn re_lex_logical_token(
&mut self,
non_logical_newline_start: Option<TextSize>,
) -> bool {
if self.nesting == 0 {
return false;
}
Expand All @@ -1383,84 +1387,36 @@ impl<'src> Lexer<'src> {
return false;
}

let mut current_position = self.current_range().start();
let mut reverse_chars = self.source[..current_position.to_usize()]
.chars()
.rev()
.peekable();
let mut newline_position = None;

while let Some(ch) = reverse_chars.next() {
if is_python_whitespace(ch) {
current_position -= ch.text_len();
continue;
}

match ch {
'\n' => {
current_position -= ch.text_len();
if let Some(carriage_return) = reverse_chars.next_if_eq(&'\r') {
current_position -= carriage_return.text_len();
}
}
'\r' => {
current_position -= ch.text_len();
}
_ => break,
}

debug_assert!(matches!(ch, '\n' | '\r'));

// Count the number of backslashes before the newline character.
let mut backslash_count = 0;
while reverse_chars.next_if_eq(&'\\').is_some() {
backslash_count += 1;
}
let Some(new_position) = non_logical_newline_start else {
return false;
};

if backslash_count == 0 {
// No escapes: `\n`
newline_position = Some(current_position);
} else {
if backslash_count % 2 == 0 {
// Even number of backslashes i.e., all backslashes cancel each other out
// which means the newline character is not being escaped.
newline_position = Some(current_position);
}
current_position -= TextSize::new('\\'.text_len().to_u32() * backslash_count);
}
// Earlier we reduced the nesting level unconditionally. Now that we know the lexer's
// position is going to be moved back, the lexer needs to be put back into a
// parenthesized context if the current token is a closing parenthesis.
//
// ```py
// (a, [b,
// c
// )
// ```
//
// Here, the parser would request to re-lex the token when it's at `)` and can recover
// from an unclosed `[`. This method will move the lexer back to the newline character
// after `c` which means it goes back into parenthesized context.
if matches!(
self.current_kind,
TokenKind::Rpar | TokenKind::Rsqb | TokenKind::Rbrace
) {
self.nesting += 1;
}

// The lexer should only be moved if there's a newline character which needs to be
// re-lexed.
if let Some(newline_position) = newline_position {
// Earlier we reduced the nesting level unconditionally. Now that we know the lexer's
// position is going to be moved back, the lexer needs to be put back into a
// parenthesized context if the current token is a closing parenthesis.
//
// ```py
// (a, [b,
// c
// )
// ```
//
// Here, the parser would request to re-lex the token when it's at `)` and can recover
// from an unclosed `[`. This method will move the lexer back to the newline character
// after `c` which means it goes back into parenthesized context.
if matches!(
self.current_kind,
TokenKind::Rpar | TokenKind::Rsqb | TokenKind::Rbrace
) {
self.nesting += 1;
}

self.cursor = Cursor::new(self.source);
self.cursor.skip_bytes(newline_position.to_usize());
self.state = State::Other;
self.next_token();
true
} else {
false
}
self.cursor = Cursor::new(self.source);
self.cursor.skip_bytes(new_position.to_usize());
debug_assert!(matches!(self.cursor.first(), '\n' | '\r'));
self.state = State::Other;
self.next_token();
true
}

#[inline]
Expand Down
17 changes: 14 additions & 3 deletions crates/ruff_python_parser/src/token_source.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,12 +60,23 @@ impl<'src> TokenSource<'src> {
self.lexer.take_value()
}

/// Calls the underlying [`re_lex_logical_token`] method on the lexer and updates the token
/// vector accordingly.
/// Calls the underlying [`re_lex_logical_token`] method on the lexer with the new lexer
/// position and updates the token vector accordingly.
///
/// [`re_lex_logical_token`]: Lexer::re_lex_logical_token
pub(crate) fn re_lex_logical_token(&mut self) {
if self.lexer.re_lex_logical_token() {
let mut non_logical_newline_start = None;
for token in self.tokens.iter().rev() {
match token.kind() {
TokenKind::NonLogicalNewline => {
non_logical_newline_start = Some(token.start());
}
TokenKind::Comment => continue,
_ => break,
}
}

if self.lexer.re_lex_logical_token(non_logical_newline_start) {
let current_start = self.current_range().start();
while self
.tokens
Expand Down
Loading

0 comments on commit f7fd083

Please sign in to comment.