Merge pull request #1983 from artichoke/spinoso-regexp-regex-integration

Add a `regex` implementation of `Regexp` to `spinoso-regexp`
artichoke · Aug 15, 2022 · dde8b9d · dde8b9d
2 parents 23e4307 + 044a3f7
commit dde8b9d
Show file tree

Hide file tree

Showing 12 changed files with 820 additions and 19 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/artichoke-backend/Cargo.toml b/artichoke-backend/Cargo.toml
@@ -56,6 +56,7 @@ core-full = [
   "core-math",
   "core-math-full",
   "core-random",
+  "core-regexp",
   "core-regexp-oniguruma",
   "core-time",
 ]

diff --git a/fuzz/Cargo.lock b/fuzz/Cargo.lock
diff --git a/spec-runner/Cargo.lock b/spec-runner/Cargo.lock
diff --git a/spinoso-regexp/Cargo.toml b/spinoso-regexp/Cargo.toml
@@ -15,8 +15,9 @@ categories = ["data-structures", "parser-implementations"]
 
 [dependencies]
 bitflags = "1.3.0"
-bstr = { version = "0.2.9", default-features = false }
+bstr = { version = "0.2.9", default-features = false, features = ["std"] } # TODO: use `alloc` feature
 onig = { version = "6.4.0", optional = true, default-features = false }
+posix-space = "1.0.2"
 # Ensure the `regex` minimum version is at least 1.5.5 to pull in a fix for a
 # DoS vulnerability.
 #

diff --git a/spinoso-regexp/src/debug.rs b/spinoso-regexp/src/debug.rs
@@ -48,6 +48,8 @@ impl Delimiters {
 /// This struct is created by the `debug` method on the regexp implementations
 /// in this crate. See these functions' documentation for more.
 ///
+/// This iterator can be used to implement Ruby's [`Regexp#inspect`].
+///
 /// # Examples
 ///
 /// UTF-8 regexp patterns and options are formatted in a debug
@@ -70,6 +72,8 @@ impl Delimiters {
 /// let s = debug.collect::<String>();
 /// assert_eq!(s, r"/\xFF\xFE/");
 /// ```
+///
+/// [`Regexp#inspect`]: https://ruby-doc.org/core-2.4.1/Regexp.html#method-i-inspect
 #[derive(Default, Debug, Clone)]
 #[must_use = "this `Debug` is an `Iterator`, which should be consumed if constructed"]
 pub struct Debug<'a> {
@@ -84,6 +88,7 @@ pub struct Debug<'a> {
     //
     // `Regexp#inspect` prints `"/#{source}/"`.
     source: &'a [u8],
+    non_standard_control_escapes: &'static [u8],
     literal: InvalidUtf8ByteSequence,
     options: &'static str,
     encoding: &'static str,
@@ -122,6 +127,7 @@ impl<'a> Debug<'a> {
         Self {
             delimiters: Delimiters::DEFAULT,
             source,
+            non_standard_control_escapes: &[],
             literal: InvalidUtf8ByteSequence::new(),
             options,
             encoding,
@@ -136,35 +142,77 @@ impl<'a> Iterator for Debug<'a> {
         if let Some(prefix) = self.delimiters.emit_left_delimiter() {
             return Some(prefix);
         }
+        if let Some((&next, tail)) = self.non_standard_control_escapes.split_first() {
+            self.non_standard_control_escapes = tail;
+            return Some(next.into());
+        }
         if let Some(literal) = self.literal.next() {
             return Some(literal);
         }
         if !self.source.is_empty() {
             let (ch, size) = bstr::decode_utf8(self.source);
-            let next = match ch {
+            return match ch {
                 // '/' is the `Regexp` literal delimiter, so escape it.
                 Some('/') => {
+                    self.source = &self.source[1..];
                     // While not an invalid byte, we rely on the documented
                     // behavior of `InvalidUtf8ByteSequence` to always escape
                     // any bytes given to it.
                     self.literal = InvalidUtf8ByteSequence::with_byte(b'/');
                     Some('\\')
                 }
-                Some(ch) => Some(ch),
-                // Otherwise, we've gotten invalid UTF-8, which means this is not an
+                Some('\x07') => {
+                    self.source = &self.source[1..];
+                    let (&next, tail) = br"\x07".split_first().unwrap();
+                    self.non_standard_control_escapes = tail;
+                    Some(next.into())
+                }
+                Some('\x08') => {
+                    self.source = &self.source[1..];
+                    let (&next, tail) = br"\x08".split_first().unwrap();
+                    self.non_standard_control_escapes = tail;
+                    Some(next.into())
+                }
+                Some('\x1B') => {
+                    self.source = &self.source[1..];
+                    let (&next, tail) = br"\x1B".split_first().unwrap();
+                    self.non_standard_control_escapes = tail;
+                    Some(next.into())
+                }
+                Some(ch @ ('"' | '\'' | '\\')) => {
+                    self.source = &self.source[1..];
+                    Some(ch)
+                }
+                Some(ch) if ch.is_ascii() && posix_space::is_space(ch as u8) => {
+                    self.source = &self.source[1..];
+                    Some(ch)
+                }
+                Some(ch) if ch.is_ascii() => {
+                    self.source = &self.source[1..];
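+                    // Every remaining ASCII byte lands here. While not an
+                    // invalid byte, we hand it to `InvalidUtf8ByteSequence`,
+                    // which hex-escapes control bytes (e.g. `\x00`, `\x7F`) and,
+                    // as the tests in this module expect, yields printable
+                    // bytes unchanged.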
+                    self.literal = InvalidUtf8ByteSequence::with_byte(ch as u8);
+                    self.literal.next()
+                }
+                Some(ch) => {
+                    self.source = &self.source[size..];
+                    Some(ch)
+                }
+                // Otherwise, we've gotten invalid UTF-8, which means this is not a
                 // printable char.
                 None => {
+                    let (chunk, remainder) = self.source.split_at(size);
+                    self.source = remainder;
                     // This conversion is safe to unwrap due to the documented
                     // behavior of `bstr::decode_utf8` and `InvalidUtf8ByteSequence`
                     // which indicate that `size` is always in the range of 0..=3.
-                    self.literal = InvalidUtf8ByteSequence::try_from(&self.source[..size]).unwrap();
+                    self.literal = InvalidUtf8ByteSequence::try_from(chunk).unwrap();
                     // `size` is non-zero because `self.source` is non-empty.
                     // `Literal`s created from at least one byte are always non-empty.
                     self.literal.next()
                 }
             };
-            self.source = &self.source[size..];
-            return next;
         }
         if let Some(suffix) = self.delimiters.emit_right_delimiter() {
             return Some(suffix);
@@ -185,6 +233,8 @@ impl<'a> FusedIterator for Debug<'a> {}
 
 #[cfg(test)]
 mod tests {
+    use bstr::{ByteSlice, B};
+
     use super::Debug;
 
     // Iterator + Collect
@@ -272,19 +322,83 @@ mod tests {
     }
 
     #[test]
-    fn iter_ascii_escaped_byte_pattern_literal_exhaustive() {
+    fn iter_ascii_escaped_byte_pattern_literal_ascii_control() {
         // ```ruby
-        // [2.6.6] > /"\a\b\c\e\f\r\n\\\"$$"/
-        // => /"\a\b\c\e\f\r\n\\\"$$"/
-        // [2.6.6] > /"\a\b\c\e\f\r\n\\\"$$"/.source.bytes
-        // => [34, 92, 97, 92, 98, 92, 99, 92, 101, 92, 102, 92, 114, 92, 110, 92, 92, 92, 34, 36, 36, 34]
+        // [3.1.2] > Regexp.compile((0..0x1F).to_a.map(&:chr).join).inspect.bytes
         // ```
-        let pattern = [
-            34, 92, 97, 92, 98, 92, 99, 92, 101, 92, 102, 92, 114, 92, 110, 92, 92, 92, 34, 36, 36, 34,
-        ];
+        let pattern = (0x00..=0x1F).collect::<Vec<u8>>();
+        let debug = Debug::new(&pattern, "", "");
+        let s = debug.collect::<String>();
+        assert_eq!(
+            s.as_bytes().as_bstr(),
+            B(&[
+                47, 92, 120, 48, 48, 92, 120, 48, 49, 92, 120, 48, 50, 92, 120, 48, 51, 92, 120, 48, 52, 92, 120, 48,
+                53, 92, 120, 48, 54, 92, 120, 48, 55, 92, 120, 48, 56, 9, 10, 11, 12, 13, 92, 120, 48, 69, 92, 120,
+                48, 70, 92, 120, 49, 48, 92, 120, 49, 49, 92, 120, 49, 50, 92, 120, 49, 51, 92, 120, 49, 52, 92, 120,
+                49, 53, 92, 120, 49, 54, 92, 120, 49, 55, 92, 120, 49, 56, 92, 120, 49, 57, 92, 120, 49, 65, 92, 120,
+                49, 66, 92, 120, 49, 67, 92, 120, 49, 68, 92, 120, 49, 69, 92, 120, 49, 70, 47_u8
+            ])
+            .as_bstr(),
+        );
+    }
+
+    #[test]
+    fn iter_ascii_pattern_exhaustive() {
+        // ```ruby
+        // Regexp.compile((0..0x7F).to_a.reject {|b| "[](){}".include?(b.chr) }.map(&:chr).join).inspect.bytes
+        // ```
+        let pattern = (0x00..=0x7F).filter(|b| !b"[](){}".contains(b)).collect::<Vec<u8>>();
         let debug = Debug::new(&pattern, "", "");
         let s = debug.collect::<String>();
-        assert_eq!(s, r#"/"\a\b\c\e\f\r\n\\\"$$"/"#);
+        assert_eq!(
+            s.as_bytes().as_bstr(),
+            B(&[
+                47, 92, 120, 48, 48, 92, 120, 48, 49, 92, 120, 48, 50, 92, 120, 48, 51, 92, 120, 48, 52, 92, 120, 48,
+                53, 92, 120, 48, 54, 92, 120, 48, 55, 92, 120, 48, 56, 9, 10, 11, 12, 13, 92, 120, 48, 69, 92, 120,
+                48, 70, 92, 120, 49, 48, 92, 120, 49, 49, 92, 120, 49, 50, 92, 120, 49, 51, 92, 120, 49, 52, 92, 120,
+                49, 53, 92, 120, 49, 54, 92, 120, 49, 55, 92, 120, 49, 56, 92, 120, 49, 57, 92, 120, 49, 65, 92, 120,
+                49, 66, 92, 120, 49, 67, 92, 120, 49, 68, 92, 120, 49, 69, 92, 120, 49, 70, 32, 33, 34, 35, 36, 37,
+                38, 39, 42, 43, 44, 45, 46, 92, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+                64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88,
+                89, 90, 92, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112,
+                113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 124, 126, 92, 120, 55, 70, 47_u8
+            ])
+            .as_bstr(),
+        );
+    }
+
+    #[test]
+    fn iter_ascii_pattern_escaped_exhaustive() {
+        // ```ruby
+        // Regexp.escape((0..0x7F).to_a.map(&:chr).join).bytes
+        // Regexp.compile(Regexp.escape((0..0x7F).to_a.map(&:chr).join)).inspect.bytes
+        // ```
+        let pattern = &[
+            0, 1, 2, 3, 4, 5, 6, 7, 8, 92, 116, 92, 110, 92, 118, 92, 102, 92, 114, 14, 15, 16, 17, 18, 19, 20, 21,
+            22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 92, 32, 33, 34, 92, 35, 92, 36, 37, 38, 39, 92, 40, 92, 41, 92,
+            42, 92, 43, 44, 92, 45, 92, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 92, 63,
+            64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+            90, 92, 91, 92, 92, 92, 93, 92, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
+            110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 92, 123, 92, 124, 92, 125, 126, 127_u8,
+        ];
+        let debug = Debug::new(pattern, "", "");
+        let s = debug.collect::<String>();
+        assert_eq!(
+            s.as_bytes().as_bstr(),
+            B(&[
+                47, 92, 120, 48, 48, 92, 120, 48, 49, 92, 120, 48, 50, 92, 120, 48, 51, 92, 120, 48, 52, 92, 120, 48,
+                53, 92, 120, 48, 54, 92, 120, 48, 55, 92, 120, 48, 56, 92, 116, 92, 110, 92, 118, 92, 102, 92, 114,
+                92, 120, 48, 69, 92, 120, 48, 70, 92, 120, 49, 48, 92, 120, 49, 49, 92, 120, 49, 50, 92, 120, 49, 51,
+                92, 120, 49, 52, 92, 120, 49, 53, 92, 120, 49, 54, 92, 120, 49, 55, 92, 120, 49, 56, 92, 120, 49, 57,
+                92, 120, 49, 65, 92, 120, 49, 66, 92, 120, 49, 67, 92, 120, 49, 68, 92, 120, 49, 69, 92, 120, 49, 70,
+                92, 32, 33, 34, 92, 35, 92, 36, 37, 38, 39, 92, 40, 92, 41, 92, 42, 92, 43, 44, 92, 45, 92, 46, 92,
+                47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 92, 63, 64, 65, 66, 67, 68, 69, 70,
+                71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 92, 91, 92, 92, 92,
+                93, 92, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113,
+                114, 115, 116, 117, 118, 119, 120, 121, 122, 92, 123, 92, 124, 92, 125, 126, 92, 120, 55, 70, 47_u8
+            ])
+            .as_bstr(),
+        );
     }
 
     #[test]

diff --git a/spinoso-regexp/src/error.rs b/spinoso-regexp/src/error.rs
@@ -141,6 +141,16 @@ impl ArgumentError {
         Self(Cow::Borrowed(message))
     }
 
+    #[must_use]
+    pub(crate) const fn unsupported_pattern_encoding() -> Self {
+        Self::with_message("Unsupported pattern encoding")
+    }
+
+    #[must_use]
+    pub(crate) const fn unsupported_haystack_encoding() -> Self {
+        Self::with_message("Unsupported haystack encoding")
+    }
+
     /// Retrieve the exception message associated with this argument error.
     ///
     /// # Examples

diff --git a/spinoso-regexp/src/lib.rs b/spinoso-regexp/src/lib.rs
@@ -34,13 +34,16 @@ use bstr::ByteSlice;
 mod debug;
 mod encoding;
 mod error;
+mod named_captures;
 mod options;
 mod regexp;
 
 pub use debug::Debug;
 pub use encoding::{Encoding, InvalidEncodingError};
 pub use error::{ArgumentError, Error, RegexpError, SyntaxError};
+pub use named_captures::NamedCaptures;
 pub use options::{Options, RegexpOption};
+pub use regexp::regex::utf8::Utf8;
 
 bitflags::bitflags! {
     #[derive(Default)]
@@ -107,7 +110,7 @@ impl fmt::Debug for Source {
 
 impl From<Config> for Source {
     fn from(config: Config) -> Self {
-        Self::with_pattern_and_options(config.pattern.clone(), config.options)
+        Self::with_pattern_and_options(config.pattern, config.options)
     }
 }
 
@@ -249,7 +252,7 @@ impl fmt::Debug for Config {
 
 impl From<Source> for Config {
     fn from(source: Source) -> Self {
-        Self::with_pattern_and_options(source.pattern.clone(), source.options)
+        Self::with_pattern_and_options(source.pattern, source.options)
     }
 }