From f0851d45d5b0fb9efbbd72194a5d91c8f4a45ac6 Mon Sep 17 00:00:00 2001 From: Charlie Marsh Date: Sat, 20 May 2023 16:49:16 -0400 Subject: [PATCH] Make ambiguous-unicode detection sensitive to 'word' context --- .../test/fixtures/ruff/confusables.py | 19 +- .../ruff/rules/ambiguous_unicode_character.rs | 165 +++++++++++++----- ...ruff__rules__ruff__tests__confusables.snap | 71 ++++++++ 3 files changed, 212 insertions(+), 43 deletions(-) diff --git a/crates/ruff/resources/test/fixtures/ruff/confusables.py b/crates/ruff/resources/test/fixtures/ruff/confusables.py index b642cebd6f396c..7a42022153d6d7 100644 --- a/crates/ruff/resources/test/fixtures/ruff/confusables.py +++ b/crates/ruff/resources/test/fixtures/ruff/confusables.py @@ -8,7 +8,24 @@ def f(): ... -def g(): +def f(): """Here's a docstring with a greek rho: ρ""" # And here's a comment with a greek alpha: ∗ ... + + +x = "𝐁ad string" +x = "−" + +# This should be ignored, since it contains an unambiguous unicode character, and no +# ASCII. +x = "Русский" + +# The first word should be ignored, while the second should be included, since it +# contains ASCII. +x = "βα Bα" + +# The two characters should be flagged here. The first character is a "word" +# consisting of a single ambiguous character, while the second character is a "word +# boundary" (whitespace) that it itself ambiguous. +x = "Р усский" diff --git a/crates/ruff/src/rules/ruff/rules/ambiguous_unicode_character.rs b/crates/ruff/src/rules/ruff/rules/ambiguous_unicode_character.rs index c82fafc0be3040..b2147623954269 100644 --- a/crates/ruff/src/rules/ruff/rules/ambiguous_unicode_character.rs +++ b/crates/ruff/src/rules/ruff/rules/ambiguous_unicode_character.rs @@ -1,3 +1,4 @@ +use bitflags::bitflags; use ruff_text_size::{TextLen, TextRange, TextSize}; use ruff_diagnostics::{AlwaysAutofixableViolation, Diagnostic, DiagnosticKind, Edit, Fix}; @@ -23,8 +24,7 @@ impl AlwaysAutofixableViolation for AmbiguousUnicodeCharacterString { representant, } = self; format!( - "String contains ambiguous unicode character `{confusable}` (did you mean \ - `{representant}`?)" + "String contains ambiguous unicode character `{confusable}` (did you mean `{representant}`?)" ) } @@ -51,8 +51,7 @@ impl AlwaysAutofixableViolation for AmbiguousUnicodeCharacterDocstring { representant, } = self; format!( - "Docstring contains ambiguous unicode character `{confusable}` (did you mean \ - `{representant}`?)" + "Docstring contains ambiguous unicode character `{confusable}` (did you mean `{representant}`?)" ) } @@ -79,8 +78,7 @@ impl AlwaysAutofixableViolation for AmbiguousUnicodeCharacterComment { representant, } = self; format!( - "Comment contains ambiguous unicode character `{confusable}` (did you mean \ - `{representant}`?)" + "Comment contains ambiguous unicode character `{confusable}` (did you mean `{representant}`?)" ) } @@ -103,50 +101,133 @@ pub(crate) fn ambiguous_unicode_character( let text = locator.slice(range); + // Most of the time, we don't need to check for ambiguous unicode characters at all. + if text.is_ascii() { + return diagnostics; + } + + // Iterate over the "words" in the text. + let mut flags = WordFlags::empty(); + let mut buffer = vec![]; for (relative_offset, current_char) in text.char_indices() { - if !current_char.is_ascii() { - // Search for confusing characters. - if let Some(representant) = CONFUSABLES.get(&(current_char as u32)).copied() { - if !settings.allowed_confusables.contains(¤t_char) { - let char_range = TextRange::at( + // Word boundary. + if !current_char.is_alphanumeric() { + if !buffer.is_empty() { + if flags.is_candidate_word() { + diagnostics.append(&mut buffer); + } + buffer.clear(); + } + flags = WordFlags::empty(); + + // Check if the boundary character is itself an ambiguous unicode character, in which + // case, it's always included as a diagnostic. + if !current_char.is_ascii() { + if let Some(representant) = CONFUSABLES.get(&(current_char as u32)).copied() { + if let Some(diagnostic) = diagnostic_for_char( + current_char, + representant as char, TextSize::try_from(relative_offset).unwrap() + range.start(), - current_char.text_len(), - ); - - let mut diagnostic = Diagnostic::new::( - match context { - Context::String => AmbiguousUnicodeCharacterString { - confusable: current_char, - representant: representant as char, - } - .into(), - Context::Docstring => AmbiguousUnicodeCharacterDocstring { - confusable: current_char, - representant: representant as char, - } - .into(), - Context::Comment => AmbiguousUnicodeCharacterComment { - confusable: current_char, - representant: representant as char, - } - .into(), - }, - char_range, - ); - if settings.rules.enabled(diagnostic.kind.rule()) { - if settings.rules.should_fix(diagnostic.kind.rule()) { - #[allow(deprecated)] - diagnostic.set_fix(Fix::unspecified(Edit::range_replacement( - (representant as char).to_string(), - char_range, - ))); - } + context, + settings, + ) { diagnostics.push(diagnostic); } } } + } else if current_char.is_ascii() { + // The current word contains at least one ASCII character. + flags |= WordFlags::ASCII; + } else if let Some(representant) = CONFUSABLES.get(&(current_char as u32)).copied() { + // The current word contains an ambiguous unicode character. + if let Some(diagnostic) = diagnostic_for_char( + current_char, + representant as char, + TextSize::try_from(relative_offset).unwrap() + range.start(), + context, + settings, + ) { + buffer.push(diagnostic); + } + } else { + // The current word contains at least one unambiguous unicode character. + flags |= WordFlags::UNAMBIGUOUS_UNICODE; } } + // End of the text. + if !buffer.is_empty() { + if flags.is_candidate_word() { + diagnostics.append(&mut buffer); + } + buffer.clear(); + } + diagnostics } + +bitflags! { + #[derive(Default, Debug, Copy, Clone, PartialEq, Eq)] + pub struct WordFlags: u8 { + /// The word contains at least one ASCII character (like `B`). + const ASCII = 0b0000_0001; + /// The word contains at least one unambiguous unicode character (like `β`). + const UNAMBIGUOUS_UNICODE = 0b0000_0010; + } +} + +impl WordFlags { + /// Return `true` if the flags indicate that the word is a candidate for flagging + /// ambiguous unicode characters. + /// + /// We follow VS Code's logic for determining whether ambiguous unicode characters within a + /// given word should be flagged, i.e., we flag a word if it contains at least one ASCII + /// character, or is purely unicode but _only_ consists of ambiguous characters. + const fn is_candidate_word(self) -> bool { + self.contains(WordFlags::ASCII) || !self.contains(WordFlags::UNAMBIGUOUS_UNICODE) + } +} + +/// Create a [`Diagnostic`] to report an ambiguous unicode character. +fn diagnostic_for_char( + confusable: char, + representant: char, + offset: TextSize, + context: Context, + settings: &Settings, +) -> Option { + if !settings.allowed_confusables.contains(&confusable) { + let char_range = TextRange::at(offset, confusable.text_len()); + let mut diagnostic = Diagnostic::new::( + match context { + Context::String => AmbiguousUnicodeCharacterString { + confusable, + representant, + } + .into(), + Context::Docstring => AmbiguousUnicodeCharacterDocstring { + confusable, + representant, + } + .into(), + Context::Comment => AmbiguousUnicodeCharacterComment { + confusable, + representant, + } + .into(), + }, + char_range, + ); + if settings.rules.enabled(diagnostic.kind.rule()) { + if settings.rules.should_fix(diagnostic.kind.rule()) { + #[allow(deprecated)] + diagnostic.set_fix(Fix::unspecified(Edit::range_replacement( + representant.to_string(), + char_range, + ))); + } + return Some(diagnostic); + } + } + None +} diff --git a/crates/ruff/src/rules/ruff/snapshots/ruff__rules__ruff__tests__confusables.snap b/crates/ruff/src/rules/ruff/snapshots/ruff__rules__ruff__tests__confusables.snap index 75d127a751bf35..0d2d98b755dbd2 100644 --- a/crates/ruff/src/rules/ruff/snapshots/ruff__rules__ruff__tests__confusables.snap +++ b/crates/ruff/src/rules/ruff/snapshots/ruff__rules__ruff__tests__confusables.snap @@ -56,4 +56,75 @@ confusables.py:7:62: RUF003 [*] Comment contains ambiguous unicode character ` 9 9 | 10 10 | +confusables.py:17:6: RUF001 [*] String contains ambiguous unicode character `𝐁` (did you mean `B`?) + | +17 | x = "𝐁ad string" + | ^ RUF001 +18 | x = "−" + | + = help: Replace `𝐁` with `B` + +ℹ Suggested fix +14 14 | ... +15 15 | +16 16 | +17 |-x = "𝐁ad string" + 17 |+x = "Bad string" +18 18 | x = "−" +19 19 | +20 20 | # This should be ignored, since it contains an unambiguous unicode character, and no + +confusables.py:26:10: RUF001 [*] String contains ambiguous unicode character `α` (did you mean `a`?) + | +26 | # The first word should be ignored, while the second should be included, since it +27 | # contains ASCII. +28 | x = "βα Bα" + | ^ RUF001 +29 | +30 | # The two characters should be flagged here. The first character is a "word" + | + = help: Replace `α` with `a` + +ℹ Suggested fix +23 23 | +24 24 | # The first word should be ignored, while the second should be included, since it +25 25 | # contains ASCII. +26 |-x = "βα Bα" + 26 |+x = "βα Ba" +27 27 | +28 28 | # The two characters should be flagged here. The first character is a "word" +29 29 | # consisting of a single ambiguous character, while the second character is a "word + +confusables.py:31:6: RUF001 [*] String contains ambiguous unicode character `Р` (did you mean `P`?) + | +31 | # consisting of a single ambiguous character, while the second character is a "word +32 | # boundary" (whitespace) that it itself ambiguous. +33 | x = "Р усский" + | ^ RUF001 + | + = help: Replace `Р` with `P` + +ℹ Suggested fix +28 28 | # The two characters should be flagged here. The first character is a "word" +29 29 | # consisting of a single ambiguous character, while the second character is a "word +30 30 | # boundary" (whitespace) that it itself ambiguous. +31 |-x = "Р усский" + 31 |+x = "P усский" + +confusables.py:31:7: RUF001 [*] String contains ambiguous unicode character ` ` (did you mean ` `?) + | +31 | # consisting of a single ambiguous character, while the second character is a "word +32 | # boundary" (whitespace) that it itself ambiguous. +33 | x = "Р усский" + | ^ RUF001 + | + = help: Replace ` ` with ` ` + +ℹ Suggested fix +28 28 | # The two characters should be flagged here. The first character is a "word" +29 29 | # consisting of a single ambiguous character, while the second character is a "word +30 30 | # boundary" (whitespace) that it itself ambiguous. +31 |-x = "Р усский" + 31 |+x = "Р усский" +