astral-sh · charliermarsh · Jan 11, 2023 · Jan 2, 2023 · Jan 2, 2023 · Jan 3, 2023
diff --git a/README.md b/README.md
@@ -701,6 +701,7 @@ For more, see [pyupgrade](https://pypi.org/project/pyupgrade/3.2.0/) on PyPI.
 | UP027 | RewriteListComprehension | Replace unpacked list comprehension with a generator expression | 🛠 |
 | UP028 | RewriteYieldFrom | Replace `yield` over `for` loop with `yield from` | 🛠 |
 | UP029 | UnnecessaryBuiltinImport | Unnecessary builtin import: `...` | 🛠 |
+| UP030 | FormatSpecifiers | Remove specifiers from inside the string's brackets | 🛠 |
 
 ### pep8-naming (N)
 

diff --git a/resources/test/fixtures/pyupgrade/UP030_0.py b/resources/test/fixtures/pyupgrade/UP030_0.py
@@ -0,0 +1,33 @@
+# These SHOULD change
+"{0}" "{1}" "{2}".format(1, 2, 3)
+
+"a {3} complicated {1} string with {0} {2}".format(
+    "first", "second", "third", "fourth"
+)
+
+'{0}'.format(1)
+
+'{0:x}'.format(30)
+
+x = '{0}'.format(1)
+
+'''{0}\n{1}\n'''.format(1, 2)
+
+x = "foo {0}" \
+    "bar {1}".format(1, 2)
+
+("{0}").format(1)
+
+"\N{snowman} {0}".format(1)
+
+# These will not change because we are waiting for libcst to fix this issue:
+# https://github.com/Instagram/LibCST/issues/846
+print(
+    'foo{0}'
+    'bar{1}'.format(1, 2)
+)
+
+print(
+    'foo{0}'  # ohai\n"
+    'bar{1}'.format(1, 2)
+)
diff --git a/resources/test/fixtures/pyupgrade/UP030_1.py b/resources/test/fixtures/pyupgrade/UP030_1.py
@@ -0,0 +1,25 @@
+# These should NOT change
+
+'{}'.format(1)
+
+
+x = ('{0} {1}',)
+
+'{0} {0}'.format(1)
+
+'{0:<{1}}'.format(1, 4)
+
+'{' '0}'.format(1)
+
+f"{0}".format(a)
+
+f"{0}".format(1)
+
+print(f"{0}".format(1))
+
+# I did not include the following tests because ruff does not seem to work with
+# invalid python syntax (which is a good thing)
+
+# "{0}"format(1)
+# '{'.format(1)", "'}'.format(1)
+# ("{0}" # {1}\n"{2}").format(1, 2, 3)
diff --git a/ruff.schema.json b/ruff.schema.json
@@ -1595,6 +1595,8 @@
         "UP027",
         "UP028",
         "UP029",
+        "UP03",
+        "UP030",
         "W",
         "W2",
         "W29",

diff --git a/src/checkers/ast.rs b/src/checkers/ast.rs
@@ -1917,6 +1917,9 @@ where
                 if self.settings.enabled.contains(&RuleCode::UP024) {
                     pyupgrade::rules::os_error_alias(self, expr);
                 }
+                if self.settings.enabled.contains(&RuleCode::UP030) {
+                    pyupgrade::rules::format_specifiers(self, expr, func);
+                }
 
                 // flake8-print
                 if self.settings.enabled.contains(&RuleCode::T201)

diff --git a/src/cst/matchers.rs b/src/cst/matchers.rs
@@ -1,5 +1,7 @@
 use anyhow::{bail, Result};
-use libcst_native::{Expr, Import, ImportFrom, Module, SmallStatement, Statement};
+use libcst_native::{
+    Call, Expr, Expression, Import, ImportFrom, Module, SmallStatement, Statement,
+};
 
 pub fn match_module(module_text: &str) -> Result<Module> {
     match libcst_native::parse_module(module_text, None) {
@@ -8,6 +10,13 @@ pub fn match_module(module_text: &str) -> Result<Module> {
     }
 }
 
+pub fn match_expression(expression_text: &str) -> Result<Expression> {
+    match libcst_native::parse_expression(expression_text) {
+        Ok(expression) => Ok(expression),
+        Err(_) => bail!("Failed to extract CST from source"),
+    }
+}
+
 pub fn match_expr<'a, 'b>(module: &'a mut Module<'b>) -> Result<&'a mut Expr<'b>> {
     if let Some(Statement::Simple(expr)) = module.body.first_mut() {
         if let Some(SmallStatement::Expr(expr)) = expr.body.first_mut() {
@@ -43,3 +52,11 @@ pub fn match_import_from<'a, 'b>(module: &'a mut Module<'b>) -> Result<&'a mut I
         bail!("Expected Statement::Simple")
     }
 }
+
+/// Return the `Call` node if `expression` is a call, otherwise an error.
+pub fn match_call<'a, 'b>(expression: &'a mut Expression<'b>) -> Result<&'a mut Call<'b>> {
+    match expression {
+        Expression::Call(call) => Ok(call),
+        _ => bail!("Expected Expression::Call"),
+    }
+}
diff --git a/src/pyupgrade/mod.rs b/src/pyupgrade/mod.rs
@@ -50,6 +50,8 @@ mod tests {
     #[test_case(RuleCode::UP028, Path::new("UP028_0.py"); "UP028_0")]
     #[test_case(RuleCode::UP028, Path::new("UP028_1.py"); "UP028_1")]
     #[test_case(RuleCode::UP029, Path::new("UP029.py"); "UP029")]
+    #[test_case(RuleCode::UP030, Path::new("UP030_0.py"); "UP030_0")]
+    #[test_case(RuleCode::UP030, Path::new("UP030_1.py"); "UP030_1")]
     fn rules(rule_code: RuleCode, path: &Path) -> Result<()> {
         let snapshot = format!("{}_{}", rule_code.as_ref(), path.to_string_lossy());
         let diagnostics = test_path(

diff --git a/src/pyupgrade/rules/format_specifiers.rs b/src/pyupgrade/rules/format_specifiers.rs
@@ -0,0 +1,215 @@
+use libcst_native::{parse_expression, Arg, Codegen, CodegenState, Expression};
+use num_bigint::{BigInt, Sign};
+use once_cell::sync::Lazy;
+use regex::Regex;
+use rustpython_ast::{Constant, Expr, ExprKind};
+use rustpython_parser::lexer;
+use rustpython_parser::lexer::Tok;
+
+use crate::ast::types::Range;
+use crate::autofix::Fix;
+use crate::checkers::ast::Checker;
+use crate::cst::matchers::{match_call, match_expression};
+use crate::registry::Diagnostic;
+use crate::violations;
+
+// This checks for an opening squiggly bracket, followed by any integer,
+// followed by any text, followed by a squiggly closing bracket
+static FORMAT_SPECIFIER: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r"\{(?P<int>\d+)(?P<fmt>.*?)\}").unwrap());
+
+// When we check for a nested format specifier, the closing bracket will be
+// removed, so we just need to check for the opening bracket and an integer
+static FIRST_HALF: Lazy<Regex> = Lazy::new(|| Regex::new(r"\{(\d+)").unwrap());
+
+/// Convert a Python integer to an unsigned 32-bit integer. We are assuming this
+/// will never overflow because people will probably never have more than 2^32
+/// arguments to a format string. Negative values are rejected (`None`), since
+/// negative numbers are not allowed as indices in format strings
+fn convert_big_int(bigint: &BigInt) -> Option<u32> {
+    let (sign, digits) = bigint.to_u32_digits();
+    match sign {
+        Sign::Plus => digits.first().copied(),
+        Sign::Minus => None,
+        Sign::NoSign => Some(0),
+    }
+}
+
+/// Reorder the positional `.format` arguments to follow `correct_order`, keeping
+/// each slot's comma/whitespace in place; any later arguments are left untouched
+fn get_new_args<'a>(old_args: &[Arg<'a>], correct_order: &'a [u32]) -> Result<Vec<Arg<'a>>, ()> {
+    let mut new_args: Vec<Arg> = Vec::with_capacity(old_args.len());
+    for (i, given_idx) in correct_order.iter().enumerate() {
+        let values = old_args.get(*given_idx as usize).ok_or(())?;
+        let formatting = old_args.get(i).ok_or(())?;
+        // Keyword arguments ARE allowed in `.format` (for `{name}` fields), but
+        // one can never fill a positional slot, so bail instead of dropping it
+        if values.keyword.is_some() {
+            return Err(());
+        }
+        new_args.push(Arg {
+            value: values.value.clone(),
+            comma: formatting.comma.clone(),
+            equal: None,
+            keyword: None,
+            star: values.star,
+            whitespace_after_star: formatting.whitespace_after_star.clone(),
+            whitespace_after_arg: formatting.whitespace_after_arg.clone(),
+        });
+    }
+    // Keep arguments that no positional specifier refers to (e.g. keywords)
+    new_args.extend(old_args.iter().skip(correct_order.len()).cloned());
+    Ok(new_args)
+}
+
+/// Returns the new call string, or returns an error if it cannot create a new
+/// call string
+fn get_new_call(module_text: &str, correct_order: &[u32]) -> Result<String, ()> {
+    let mut expression = match parse_expression(module_text) {
+        Err(_) => return Err(()),
+        Ok(item) => item,
+    };
+    let mut call = match match_call(&mut expression) {
+        Err(_) => return Err(()),
+        Ok(item) => item,
+    };
+    call.args = match get_new_args(&call.args, correct_order) {
+        Err(_) => return Err(()),
+        Ok(item) => item,
+    };
+    // Create the new function
+    if let Expression::Attribute(item) = &*call.func {
+        // Converting the struct to a string and then back is not very efficient, but
+        // regexes were the simplest way I could find to remove the specifiers
+        let mut state = CodegenState::default();
+        item.codegen(&mut state);
+        let cleaned = remove_specifiers(&state.to_string());
+        match match_expression(&cleaned) {
+            Err(_) => return Err(()),
+            Ok(item) => call.func = Box::new(item),
+        };
+        // Create the string
+        let mut final_state = CodegenState::default();
+        expression.codegen(&mut final_state);
+        // FOR REVIEWER: If the new and old are identical, don't create a fix. Pyupgrade
+        // doesn't even want me to report this, so we could just have an enum for errors,
+        // and if a special one is returned here then we don't even report a fix
+        if module_text == final_state.to_string() {
+            return Err(());
+        }
+        return Ok(final_state.to_string());
+    }
+    Err(())
+}
+
+fn get_specifier_order(value_str: &str) -> Vec<u32> {
+    let mut specifier_ints: Vec<u32> = vec![];
+    // Whether the previous token was an Lbrace. If this is true and the next
+    // token is an integer, then this integer gets added to the list of
+    // specifier indices
+    let mut prev_l_brace = false;
+    for (_, tok, _) in lexer::make_tokenizer(value_str).flatten() {
+        if Tok::Lbrace == tok {
+            prev_l_brace = true;
+        } else if let Tok::Int { value } = tok {
+            if prev_l_brace {
+                if let Some(int_val) = convert_big_int(&value) {
+                    specifier_ints.push(int_val);
+                }
+            }
+            prev_l_brace = false;
+        } else {
+            prev_l_brace = false;
+        }
+    }
+    specifier_ints
+}
+
+/// Returns a string without the format specifiers. Ex. "Hello {0} {1}" ->
+/// "Hello {} {}"
+fn remove_specifiers(raw_specifiers: &str) -> String {
+    let new_str = FORMAT_SPECIFIER
+        .replace_all(raw_specifiers, "{$fmt}")
+        .to_string();
+    new_str
+}
+
+/// Checks if there is at least one specifier in the string. The string must
+/// either have all formatters or no formatters (or else an error will be
+/// thrown), so this will work as long as the Python code is valid
+fn has_valid_specifiers(raw_specifiers: &str) -> bool {
+    // If there is at least one match we should return a true
+    let mut at_least_one = false;
+    for cap in FORMAT_SPECIFIER.captures_iter(raw_specifiers) {
+        at_least_one = true;
+        // If we have a nested format specifier we need to return a false
+        if FIRST_HALF.is_match(&cap[2]) {
+            return false;
+        }
+    }
+    at_least_one
+}
+
+/// Checks if the string has specifiers and that they are in the correct order
+fn valid_specifiers(raw_specifiers: &str) -> bool {
+    if !has_valid_specifiers(raw_specifiers) {
+        return false;
+    }
+    let mut specifiers = get_specifier_order(raw_specifiers);
+    specifiers.sort_unstable();
+    let mut current = 0;
+    for item in specifiers {
+        if item == current {
+            current += 1;
+        } else {
+            return false;
+        }
+    }
+    true
+}
+
+/// UP030
+pub fn format_specifiers(checker: &mut Checker, expr: &Expr, func: &Expr) {
+    if let ExprKind::Attribute { value, attr, .. } = &func.node {
+        if let ExprKind::Constant {
+            value: Constant::Str(provided_string),
+            ..
+        } = &value.node
+        {
+            // The function must be a format function
+            if attr != "format" {
+                return;
+            }
+            // The squiggly brackets must have format specifiers inside of them
+            if !valid_specifiers(provided_string) {
+                return;
+            }
+            let as_ints = get_specifier_order(provided_string);
+            let call_range = Range::from_located(expr);
+            let call_text = checker.locator.slice_source_code_range(&call_range);
+            let mut diagnostic =
+                Diagnostic::new(violations::FormatSpecifiers, Range::from_located(expr));
+            match get_new_call(&call_text, &as_ints) {
+                // If we get any errors, we know that there is an issue that we cannot fix
+                // so we should just report that there is a formatting issue. Currently the
+                // only issue we know of is a ParseError from a multi line format statement
+                // inside a function call that does not explicitly say there are multiple
+                // lines. Follow my Github issue here:
+                // https://github.com/Instagram/LibCST/issues/846
+
+                // Is there a way to specify that here this is not fixable, but below it is??
+                Err(_) => checker.diagnostics.push(diagnostic),
+                Ok(new_call) => {
+                    if checker.patch(diagnostic.kind.code()) {
+                        diagnostic.amend(Fix::replacement(
+                            new_call,
+                            expr.location,
+                            expr.end_location.unwrap(),
+                        ));
+                    }
+                    checker.diagnostics.push(diagnostic);
+                }
+            };
+        }
+    }
+}
diff --git a/src/pyupgrade/rules/mod.rs b/src/pyupgrade/rules/mod.rs
@@ -2,6 +2,7 @@ pub use convert_named_tuple_functional_to_class::convert_named_tuple_functional_
 pub use convert_typed_dict_functional_to_class::convert_typed_dict_functional_to_class;
 pub use datetime_utc_alias::datetime_utc_alias;
 pub use deprecated_unittest_alias::deprecated_unittest_alias;
+pub use format_specifiers::format_specifiers;
 pub use native_literals::native_literals;
 use once_cell::sync::Lazy;
 pub use open_alias::open_alias;
@@ -40,6 +41,7 @@ mod convert_named_tuple_functional_to_class;
 mod convert_typed_dict_functional_to_class;
 mod datetime_utc_alias;
 mod deprecated_unittest_alias;
+mod format_specifiers;
 mod native_literals;
 mod open_alias;
 mod os_error_alias;