Skip to content

Commit

Permalink
Make ambiguous-unicode detection sensitive to 'word' context
Browse files Browse the repository at this point in the history
  • Loading branch information
charliermarsh committed May 20, 2023
1 parent fc63c6f commit f0851d4
Show file tree
Hide file tree
Showing 3 changed files with 212 additions and 43 deletions.
19 changes: 18 additions & 1 deletion crates/ruff/resources/test/fixtures/ruff/confusables.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,24 @@ def f():
...


def g():
def f():
"""Here's a docstring with a greek rho: ρ"""
# And here's a comment with a greek alpha: ∗
...


x = "𝐁ad string"
x = "−"

# This should be ignored, since it contains an unambiguous unicode character, and no
# ASCII.
x = "Русский"

# The first word should be ignored, while the second should be included, since it
# contains ASCII.
x = "βα Bα"

# The two characters should be flagged here. The first character is a "word"
# consisting of a single ambiguous character, while the second character is a "word
# boundary" (whitespace) that it itself ambiguous.
x = "Р усский"
165 changes: 123 additions & 42 deletions crates/ruff/src/rules/ruff/rules/ambiguous_unicode_character.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use bitflags::bitflags;
use ruff_text_size::{TextLen, TextRange, TextSize};

use ruff_diagnostics::{AlwaysAutofixableViolation, Diagnostic, DiagnosticKind, Edit, Fix};
Expand All @@ -23,8 +24,7 @@ impl AlwaysAutofixableViolation for AmbiguousUnicodeCharacterString {
representant,
} = self;
format!(
"String contains ambiguous unicode character `{confusable}` (did you mean \
`{representant}`?)"
"String contains ambiguous unicode character `{confusable}` (did you mean `{representant}`?)"
)
}

Expand All @@ -51,8 +51,7 @@ impl AlwaysAutofixableViolation for AmbiguousUnicodeCharacterDocstring {
representant,
} = self;
format!(
"Docstring contains ambiguous unicode character `{confusable}` (did you mean \
`{representant}`?)"
"Docstring contains ambiguous unicode character `{confusable}` (did you mean `{representant}`?)"
)
}

Expand All @@ -79,8 +78,7 @@ impl AlwaysAutofixableViolation for AmbiguousUnicodeCharacterComment {
representant,
} = self;
format!(
"Comment contains ambiguous unicode character `{confusable}` (did you mean \
`{representant}`?)"
"Comment contains ambiguous unicode character `{confusable}` (did you mean `{representant}`?)"
)
}

Expand All @@ -103,50 +101,133 @@ pub(crate) fn ambiguous_unicode_character(

let text = locator.slice(range);

// Most of the time, we don't need to check for ambiguous unicode characters at all.
if text.is_ascii() {
return diagnostics;
}

// Iterate over the "words" in the text.
let mut flags = WordFlags::empty();
let mut buffer = vec![];
for (relative_offset, current_char) in text.char_indices() {
if !current_char.is_ascii() {
// Search for confusing characters.
if let Some(representant) = CONFUSABLES.get(&(current_char as u32)).copied() {
if !settings.allowed_confusables.contains(&current_char) {
let char_range = TextRange::at(
// Word boundary.
if !current_char.is_alphanumeric() {
if !buffer.is_empty() {
if flags.is_candidate_word() {
diagnostics.append(&mut buffer);
}
buffer.clear();
}
flags = WordFlags::empty();

// Check if the boundary character is itself an ambiguous unicode character, in which
// case, it's always included as a diagnostic.
if !current_char.is_ascii() {
if let Some(representant) = CONFUSABLES.get(&(current_char as u32)).copied() {
if let Some(diagnostic) = diagnostic_for_char(
current_char,
representant as char,
TextSize::try_from(relative_offset).unwrap() + range.start(),
current_char.text_len(),
);

let mut diagnostic = Diagnostic::new::<DiagnosticKind>(
match context {
Context::String => AmbiguousUnicodeCharacterString {
confusable: current_char,
representant: representant as char,
}
.into(),
Context::Docstring => AmbiguousUnicodeCharacterDocstring {
confusable: current_char,
representant: representant as char,
}
.into(),
Context::Comment => AmbiguousUnicodeCharacterComment {
confusable: current_char,
representant: representant as char,
}
.into(),
},
char_range,
);
if settings.rules.enabled(diagnostic.kind.rule()) {
if settings.rules.should_fix(diagnostic.kind.rule()) {
#[allow(deprecated)]
diagnostic.set_fix(Fix::unspecified(Edit::range_replacement(
(representant as char).to_string(),
char_range,
)));
}
context,
settings,
) {
diagnostics.push(diagnostic);
}
}
}
} else if current_char.is_ascii() {
// The current word contains at least one ASCII character.
flags |= WordFlags::ASCII;
} else if let Some(representant) = CONFUSABLES.get(&(current_char as u32)).copied() {
// The current word contains an ambiguous unicode character.
if let Some(diagnostic) = diagnostic_for_char(
current_char,
representant as char,
TextSize::try_from(relative_offset).unwrap() + range.start(),
context,
settings,
) {
buffer.push(diagnostic);
}
} else {
// The current word contains at least one unambiguous unicode character.
flags |= WordFlags::UNAMBIGUOUS_UNICODE;
}
}

// End of the text.
if !buffer.is_empty() {
if flags.is_candidate_word() {
diagnostics.append(&mut buffer);
}
buffer.clear();
}

diagnostics
}

bitflags! {
#[derive(Default, Debug, Copy, Clone, PartialEq, Eq)]
pub struct WordFlags: u8 {
/// The word contains at least one ASCII character (like `B`).
const ASCII = 0b0000_0001;
/// The word contains at least one unambiguous unicode character (like `β`).
const UNAMBIGUOUS_UNICODE = 0b0000_0010;
}
}

impl WordFlags {
/// Return `true` if the flags indicate that the word is a candidate for flagging
/// ambiguous unicode characters.
///
/// We follow VS Code's logic for determining whether ambiguous unicode characters within a
/// given word should be flagged, i.e., we flag a word if it contains at least one ASCII
/// character, or is purely unicode but _only_ consists of ambiguous characters.
const fn is_candidate_word(self) -> bool {
self.contains(WordFlags::ASCII) || !self.contains(WordFlags::UNAMBIGUOUS_UNICODE)
}
}

/// Create a [`Diagnostic`] to report an ambiguous unicode character.
fn diagnostic_for_char(
confusable: char,
representant: char,
offset: TextSize,
context: Context,
settings: &Settings,
) -> Option<Diagnostic> {
if !settings.allowed_confusables.contains(&confusable) {
let char_range = TextRange::at(offset, confusable.text_len());
let mut diagnostic = Diagnostic::new::<DiagnosticKind>(
match context {
Context::String => AmbiguousUnicodeCharacterString {
confusable,
representant,
}
.into(),
Context::Docstring => AmbiguousUnicodeCharacterDocstring {
confusable,
representant,
}
.into(),
Context::Comment => AmbiguousUnicodeCharacterComment {
confusable,
representant,
}
.into(),
},
char_range,
);
if settings.rules.enabled(diagnostic.kind.rule()) {
if settings.rules.should_fix(diagnostic.kind.rule()) {
#[allow(deprecated)]
diagnostic.set_fix(Fix::unspecified(Edit::range_replacement(
representant.to_string(),
char_range,
)));
}
return Some(diagnostic);
}
}
None
}
Original file line number Diff line number Diff line change
Expand Up @@ -56,4 +56,75 @@ confusables.py:7:62: RUF003 [*] Comment contains ambiguous unicode character `
9 9 |
10 10 |

confusables.py:17:6: RUF001 [*] String contains ambiguous unicode character `𝐁` (did you mean `B`?)
|
17 | x = "𝐁ad string"
| ^ RUF001
18 | x = ""
|
= help: Replace `𝐁` with `B`

Suggested fix
14 14 | ...
15 15 |
16 16 |
17 |-x = "𝐁ad string"
17 |+x = "Bad string"
18 18 | x = ""
19 19 |
20 20 | # This should be ignored, since it contains an unambiguous unicode character, and no

confusables.py:26:10: RUF001 [*] String contains ambiguous unicode character `α` (did you mean `a`?)
|
26 | # The first word should be ignored, while the second should be included, since it
27 | # contains ASCII.
28 | x = "βα Bα"
| ^ RUF001
29 |
30 | # The two characters should be flagged here. The first character is a "word"
|
= help: Replace `α` with `a`

Suggested fix
23 23 |
24 24 | # The first word should be ignored, while the second should be included, since it
25 25 | # contains ASCII.
26 |-x = "βα Bα"
26 |+x = "βα Ba"
27 27 |
28 28 | # The two characters should be flagged here. The first character is a "word"
29 29 | # consisting of a single ambiguous character, while the second character is a "word

confusables.py:31:6: RUF001 [*] String contains ambiguous unicode character `Р` (did you mean `P`?)
|
31 | # consisting of a single ambiguous character, while the second character is a "word
32 | # boundary" (whitespace) that it itself ambiguous.
33 | x = "Р усский"
| ^ RUF001
|
= help: Replace `Р` with `P`

Suggested fix
28 28 | # The two characters should be flagged here. The first character is a "word"
29 29 | # consisting of a single ambiguous character, while the second character is a "word
30 30 | # boundary" (whitespace) that it itself ambiguous.
31 |-x = "Р усский"
31 |+x = "P усский"

confusables.py:31:7: RUF001 [*] String contains ambiguous unicode character ` ` (did you mean ` `?)
|
31 | # consisting of a single ambiguous character, while the second character is a "word
32 | # boundary" (whitespace) that it itself ambiguous.
33 | x = "Р усский"
| ^ RUF001
|
= help: Replace ` ` with ` `

Suggested fix
28 28 | # The two characters should be flagged here. The first character is a "word"
29 29 | # consisting of a single ambiguous character, while the second character is a "word
30 30 | # boundary" (whitespace) that it itself ambiguous.
31 |-x = "Р усский"
31 |+x = "Р усский"


0 comments on commit f0851d4

Please sign in to comment.