From f0851d45d5b0fb9efbbd72194a5d91c8f4a45ac6 Mon Sep 17 00:00:00 2001
From: Charlie Marsh <charlie.r.marsh@gmail.com>
Date: Sat, 20 May 2023 16:49:16 -0400
Subject: [PATCH] Make ambiguous-unicode detection sensitive to 'word' context

---
 .../test/fixtures/ruff/confusables.py         |  19 +-
 .../ruff/rules/ambiguous_unicode_character.rs | 165 +++++++++++++-----
 ...ruff__rules__ruff__tests__confusables.snap |  71 ++++++++
 3 files changed, 212 insertions(+), 43 deletions(-)

diff --git a/crates/ruff/resources/test/fixtures/ruff/confusables.py b/crates/ruff/resources/test/fixtures/ruff/confusables.py
index b642cebd6f396c..7a42022153d6d7 100644
--- a/crates/ruff/resources/test/fixtures/ruff/confusables.py
+++ b/crates/ruff/resources/test/fixtures/ruff/confusables.py
@@ -8,7 +8,24 @@ def f():
     ...
 
 
-def g():
+def f():
     """Here's a docstring with a greek rho: ρ"""
     # And here's a comment with a greek alpha: ∗
     ...
+
+
+x = "𝐁ad string"
+x = "−"
+
+# This should be ignored, since it contains an unambiguous unicode character, and no
+# ASCII.
+x = "Русский"
+
+# The first word should be ignored, while the second should be included, since it
+# contains ASCII.
+x = "βα Bα"
+
+# The two characters should be flagged here. The first character is a "word"
+# consisting of a single ambiguous character, while the second character is a "word
+# boundary" (whitespace) that it itself ambiguous.
+x = "Р усский"
diff --git a/crates/ruff/src/rules/ruff/rules/ambiguous_unicode_character.rs b/crates/ruff/src/rules/ruff/rules/ambiguous_unicode_character.rs
index c82fafc0be3040..b2147623954269 100644
--- a/crates/ruff/src/rules/ruff/rules/ambiguous_unicode_character.rs
+++ b/crates/ruff/src/rules/ruff/rules/ambiguous_unicode_character.rs
@@ -1,3 +1,4 @@
+use bitflags::bitflags;
 use ruff_text_size::{TextLen, TextRange, TextSize};
 
 use ruff_diagnostics::{AlwaysAutofixableViolation, Diagnostic, DiagnosticKind, Edit, Fix};
@@ -23,8 +24,7 @@ impl AlwaysAutofixableViolation for AmbiguousUnicodeCharacterString {
             representant,
         } = self;
         format!(
-            "String contains ambiguous unicode character `{confusable}` (did you mean \
-             `{representant}`?)"
+            "String contains ambiguous unicode character `{confusable}` (did you mean `{representant}`?)"
         )
     }
 
@@ -51,8 +51,7 @@ impl AlwaysAutofixableViolation for AmbiguousUnicodeCharacterDocstring {
             representant,
         } = self;
         format!(
-            "Docstring contains ambiguous unicode character `{confusable}` (did you mean \
-             `{representant}`?)"
+            "Docstring contains ambiguous unicode character `{confusable}` (did you mean `{representant}`?)"
         )
     }
 
@@ -79,8 +78,7 @@ impl AlwaysAutofixableViolation for AmbiguousUnicodeCharacterComment {
             representant,
         } = self;
         format!(
-            "Comment contains ambiguous unicode character `{confusable}` (did you mean \
-             `{representant}`?)"
+            "Comment contains ambiguous unicode character `{confusable}` (did you mean `{representant}`?)"
         )
     }
 
@@ -103,50 +101,133 @@ pub(crate) fn ambiguous_unicode_character(
 
     let text = locator.slice(range);
 
+    // Most of the time, we don't need to check for ambiguous unicode characters at all.
+    if text.is_ascii() {
+        return diagnostics;
+    }
+
+    // Iterate over the "words" in the text.
+    let mut flags = WordFlags::empty();
+    let mut buffer = vec![];
     for (relative_offset, current_char) in text.char_indices() {
-        if !current_char.is_ascii() {
-            // Search for confusing characters.
-            if let Some(representant) = CONFUSABLES.get(&(current_char as u32)).copied() {
-                if !settings.allowed_confusables.contains(&current_char) {
-                    let char_range = TextRange::at(
+        // Word boundary.
+        if !current_char.is_alphanumeric() {
+            if !buffer.is_empty() {
+                if flags.is_candidate_word() {
+                    diagnostics.append(&mut buffer);
+                }
+                buffer.clear();
+            }
+            flags = WordFlags::empty();
+
+            // Check if the boundary character is itself an ambiguous unicode character, in which
+            // case, it's always included as a diagnostic.
+            if !current_char.is_ascii() {
+                if let Some(representant) = CONFUSABLES.get(&(current_char as u32)).copied() {
+                    if let Some(diagnostic) = diagnostic_for_char(
+                        current_char,
+                        representant as char,
                         TextSize::try_from(relative_offset).unwrap() + range.start(),
-                        current_char.text_len(),
-                    );
-
-                    let mut diagnostic = Diagnostic::new::<DiagnosticKind>(
-                        match context {
-                            Context::String => AmbiguousUnicodeCharacterString {
-                                confusable: current_char,
-                                representant: representant as char,
-                            }
-                            .into(),
-                            Context::Docstring => AmbiguousUnicodeCharacterDocstring {
-                                confusable: current_char,
-                                representant: representant as char,
-                            }
-                            .into(),
-                            Context::Comment => AmbiguousUnicodeCharacterComment {
-                                confusable: current_char,
-                                representant: representant as char,
-                            }
-                            .into(),
-                        },
-                        char_range,
-                    );
-                    if settings.rules.enabled(diagnostic.kind.rule()) {
-                        if settings.rules.should_fix(diagnostic.kind.rule()) {
-                            #[allow(deprecated)]
-                            diagnostic.set_fix(Fix::unspecified(Edit::range_replacement(
-                                (representant as char).to_string(),
-                                char_range,
-                            )));
-                        }
+                        context,
+                        settings,
+                    ) {
                         diagnostics.push(diagnostic);
                     }
                 }
             }
+        } else if current_char.is_ascii() {
+            // The current word contains at least one ASCII character.
+            flags |= WordFlags::ASCII;
+        } else if let Some(representant) = CONFUSABLES.get(&(current_char as u32)).copied() {
+            // The current word contains an ambiguous unicode character.
+            if let Some(diagnostic) = diagnostic_for_char(
+                current_char,
+                representant as char,
+                TextSize::try_from(relative_offset).unwrap() + range.start(),
+                context,
+                settings,
+            ) {
+                buffer.push(diagnostic);
+            }
+        } else {
+            // The current word contains at least one unambiguous unicode character.
+            flags |= WordFlags::UNAMBIGUOUS_UNICODE;
         }
     }
 
+    // End of the text.
+    if !buffer.is_empty() {
+        if flags.is_candidate_word() {
+            diagnostics.append(&mut buffer);
+        }
+        buffer.clear();
+    }
+
     diagnostics
 }
+
+bitflags! {
+    #[derive(Default, Debug, Copy, Clone, PartialEq, Eq)]
+    pub struct WordFlags: u8 {
+        /// The word contains at least one ASCII character (like `B`).
+        const ASCII = 0b0000_0001;
+        /// The word contains at least one unambiguous unicode character (like `β`).
+        const UNAMBIGUOUS_UNICODE = 0b0000_0010;
+    }
+}
+
+impl WordFlags {
+    /// Return `true` if the flags indicate that the word is a candidate for flagging
+    /// ambiguous unicode characters.
+    ///
+    /// We follow VS Code's logic for determining whether ambiguous unicode characters within a
+    /// given word should be flagged, i.e., we flag a word if it contains at least one ASCII
+    /// character, or is purely unicode but _only_ consists of ambiguous characters.
+    const fn is_candidate_word(self) -> bool {
+        self.contains(WordFlags::ASCII) || !self.contains(WordFlags::UNAMBIGUOUS_UNICODE)
+    }
+}
+
+/// Create a [`Diagnostic`] to report an ambiguous unicode character.
+fn diagnostic_for_char(
+    confusable: char,
+    representant: char,
+    offset: TextSize,
+    context: Context,
+    settings: &Settings,
+) -> Option<Diagnostic> {
+    if !settings.allowed_confusables.contains(&confusable) {
+        let char_range = TextRange::at(offset, confusable.text_len());
+        let mut diagnostic = Diagnostic::new::<DiagnosticKind>(
+            match context {
+                Context::String => AmbiguousUnicodeCharacterString {
+                    confusable,
+                    representant,
+                }
+                .into(),
+                Context::Docstring => AmbiguousUnicodeCharacterDocstring {
+                    confusable,
+                    representant,
+                }
+                .into(),
+                Context::Comment => AmbiguousUnicodeCharacterComment {
+                    confusable,
+                    representant,
+                }
+                .into(),
+            },
+            char_range,
+        );
+        if settings.rules.enabled(diagnostic.kind.rule()) {
+            if settings.rules.should_fix(diagnostic.kind.rule()) {
+                #[allow(deprecated)]
+                diagnostic.set_fix(Fix::unspecified(Edit::range_replacement(
+                    representant.to_string(),
+                    char_range,
+                )));
+            }
+            return Some(diagnostic);
+        }
+    }
+    None
+}
diff --git a/crates/ruff/src/rules/ruff/snapshots/ruff__rules__ruff__tests__confusables.snap b/crates/ruff/src/rules/ruff/snapshots/ruff__rules__ruff__tests__confusables.snap
index 75d127a751bf35..0d2d98b755dbd2 100644
--- a/crates/ruff/src/rules/ruff/snapshots/ruff__rules__ruff__tests__confusables.snap
+++ b/crates/ruff/src/rules/ruff/snapshots/ruff__rules__ruff__tests__confusables.snap
@@ -56,4 +56,75 @@ confusables.py:7:62: RUF003 [*] Comment contains ambiguous unicode character `
 9 9 | 
 10 10 | 
 
+confusables.py:17:6: RUF001 [*] String contains ambiguous unicode character `𝐁` (did you mean `B`?)
+   |
+17 | x = "𝐁ad string"
+   |      ^ RUF001
+18 | x = "−"
+   |
+   = help: Replace `𝐁` with `B`
+
+ℹ Suggested fix
+14 14 |     ...
+15 15 | 
+16 16 | 
+17    |-x = "𝐁ad string"
+   17 |+x = "Bad string"
+18 18 | x = "−"
+19 19 | 
+20 20 | # This should be ignored, since it contains an unambiguous unicode character, and no
+
+confusables.py:26:10: RUF001 [*] String contains ambiguous unicode character `α` (did you mean `a`?)
+   |
+26 | # The first word should be ignored, while the second should be included, since it
+27 | # contains ASCII.
+28 | x = "βα Bα"
+   |          ^ RUF001
+29 | 
+30 | # The two characters should be flagged here. The first character is a "word"
+   |
+   = help: Replace `α` with `a`
+
+ℹ Suggested fix
+23 23 | 
+24 24 | # The first word should be ignored, while the second should be included, since it
+25 25 | # contains ASCII.
+26    |-x = "βα Bα"
+   26 |+x = "βα Ba"
+27 27 | 
+28 28 | # The two characters should be flagged here. The first character is a "word"
+29 29 | # consisting of a single ambiguous character, while the second character is a "word
+
+confusables.py:31:6: RUF001 [*] String contains ambiguous unicode character `Р` (did you mean `P`?)
+   |
+31 | # consisting of a single ambiguous character, while the second character is a "word
+32 | # boundary" (whitespace) that it itself ambiguous.
+33 | x = "Р усский"
+   |      ^ RUF001
+   |
+   = help: Replace `Р` with `P`
+
+ℹ Suggested fix
+28 28 | # The two characters should be flagged here. The first character is a "word"
+29 29 | # consisting of a single ambiguous character, while the second character is a "word
+30 30 | # boundary" (whitespace) that it itself ambiguous.
+31    |-x = "Р усский"
+   31 |+x = "P усский"
+
+confusables.py:31:7: RUF001 [*] String contains ambiguous unicode character ` ` (did you mean ` `?)
+   |
+31 | # consisting of a single ambiguous character, while the second character is a "word
+32 | # boundary" (whitespace) that it itself ambiguous.
+33 | x = "Р усский"
+   |       ^ RUF001
+   |
+   = help: Replace ` ` with ` `
+
+ℹ Suggested fix
+28 28 | # The two characters should be flagged here. The first character is a "word"
+29 29 | # consisting of a single ambiguous character, while the second character is a "word
+30 30 | # boundary" (whitespace) that it itself ambiguous.
+31    |-x = "Р усский"
+   31 |+x = "Р усский"
+