Re: [PATCH] Change implementation of %+ to use a proper tied hash int…

…erface and add support for %- Message-ID: <9b18b3110612291245q792fe91cu69422d2b81bb4f0b@mail.gmail.com> p4raw-id: //depot/perl@29682
castaway · Jan 4, 2007 · 44a2ac7 · 44a2ac7
1 parent 1f2e791
commit 44a2ac7
Show file tree

Hide file tree

Showing 21 changed files with 678 additions and 280 deletions.
diff --git a/MANIFEST b/MANIFEST
@@ -980,6 +980,7 @@ ext/re/re_comp.h		re extension wrapper for regcomp.h
 ext/re/re.pm			re extension Perl module
 ext/re/re_top.h			re extension symbol hiding header
 ext/re/re.xs			re extension external subroutines
+ext/re/lib/re/Tie/Hash/NamedCapture.pm	Implements %- and %+ behaviour
 ext/re/t/lexical_debug.pl	generate debug output for lexical re 'debug'
 ext/re/t/lexical_debug.t	test that lexical re 'debug' works
 ext/re/t/re_funcs.t		see if exportable funcs from re.pm work

diff --git a/doop.c b/doop.c
@@ -1434,8 +1434,7 @@ Perl_do_kv(pTHX)
 	    RETURN;
 	}
 
-	if (! SvTIED_mg((SV*)keys, PERL_MAGIC_tied) 
-	    && ! SvTIED_mg((SV*)keys, PERL_MAGIC_regdata_names))
+	if (! SvTIED_mg((SV*)keys, PERL_MAGIC_tied) )
 	{
 	    i = HvKEYS(keys);
 	}

diff --git a/dump.c b/dump.c
@@ -192,6 +192,10 @@ sequence. Thus the output will either be a single char,
 an octal escape sequence, a special escape like C<\n> or a 3 or 
 more digit hex value. 
 
+If PERL_PV_ESCAPE_RE is set then the escape char used will be a '%' and
+not a '\\'. This is because regexes very often contain backslashed
+sequences, whereas '%' is not a particularly common character in patterns.
+
 Returns a pointer to the escaped text as held by dsv.
 
 =cut
@@ -203,14 +207,16 @@ Perl_pv_escape( pTHX_ SV *dsv, char const * const str,
                 const STRLEN count, const STRLEN max, 
                 STRLEN * const escaped, const U32 flags ) 
 {
-    char dq = (flags & PERL_PV_ESCAPE_QUOTE) ? '"' : '\\';
-    char octbuf[PV_ESCAPE_OCTBUFSIZE] = "\\123456789ABCDF";
+    char esc = (flags & PERL_PV_ESCAPE_RE) ? '%' : '\\';
+    char dq = (flags & PERL_PV_ESCAPE_QUOTE) ? '"' : esc;
+    char octbuf[PV_ESCAPE_OCTBUFSIZE] = "%123456789ABCDF";
     STRLEN wrote = 0;    /* chars written so far */
     STRLEN chsize = 0;   /* size of data to be written */
     STRLEN readsize = 1; /* size of data just read */
     bool isuni= flags & PERL_PV_ESCAPE_UNI ? 1 : 0; /* is this unicode */
     const char *pv  = str;
     const char *end = pv + count; /* end of string */
+    octbuf[0] = esc;
 
     if (!flags & PERL_PV_ESCAPE_NOCLEAR) 
 	    sv_setpvn(dsv, "", 0);
@@ -228,42 +234,49 @@ Perl_pv_escape( pTHX_ SV *dsv, char const * const str,
                                       "%"UVxf, u);
             else
                 chsize = my_snprintf( octbuf, PV_ESCAPE_OCTBUFSIZE, 
-                                      "\\x{%"UVxf"}", u);
+                                      "%cx{%"UVxf"}", esc, u);
         } else if (flags & PERL_PV_ESCAPE_NOBACKSLASH) {
             chsize = 1;            
         } else {         
-            if ( (c == dq) || (c == '\\') || !isPRINT(c) ) {
-	    chsize = 2;
+            if ( (c == dq) || (c == esc) || !isPRINT(c) ) {
+	        chsize = 2;
                 switch (c) {
-		case '\\' : octbuf[1] = '\\'; break;
+
+		case '\\' : /* fallthrough */
+		case '%'  : if ( c == esc )  {
+		                octbuf[1] = esc;  
+		            } else {
+		                chsize = 1;
+		            }
+		            break;
 		case '\v' : octbuf[1] = 'v';  break;
 		case '\t' : octbuf[1] = 't';  break;
 		case '\r' : octbuf[1] = 'r';  break;
 		case '\n' : octbuf[1] = 'n';  break;
 		case '\f' : octbuf[1] = 'f';  break;
-                    case '"'  : 
+                case '"'  : 
                         if ( dq == '"' ) 
 				octbuf[1] = '"';
                         else 
                             chsize = 1;
-				break;
+                        break;
 		default:
                         if ( (pv < end) && isDIGIT((U8)*(pv+readsize)) )
                             chsize = my_snprintf( octbuf, PV_ESCAPE_OCTBUFSIZE, 
-                                                  "\\%03o", c);
-			    else
+                                                  "%c%03o", esc, c);
+			else
                             chsize = my_snprintf( octbuf, PV_ESCAPE_OCTBUFSIZE, 
-                                                  "\\%o", c);
+                                                  "%c%o", esc, c);
                 }
             } else {
-                chsize=1;
+                chsize = 1;
             }
-	    }
-	    if ( max && (wrote + chsize > max) ) {
-		break;
+	}
+	if ( max && (wrote + chsize > max) ) {
+	    break;
         } else if (chsize > 1) {
-		sv_catpvn(dsv, octbuf, chsize);
-		wrote += chsize;
+            sv_catpvn(dsv, octbuf, chsize);
+            wrote += chsize;
 	} else {
             Perl_sv_catpvf( aTHX_ dsv, "%c", c);
 	    wrote++;
@@ -308,7 +321,7 @@ Perl_pv_pretty( pTHX_ SV *dsv, char const * const str, const STRLEN count,
   const STRLEN max, char const * const start_color, char const * const end_color, 
   const U32 flags ) 
 {
-    U8 dq = (flags & PERL_PV_PRETTY_QUOTE) ? '"' : '\\';
+    U8 dq = (flags & PERL_PV_PRETTY_QUOTE) ? '"' : '%';
     STRLEN escaped;
 
     if ( dq == '"' )
@@ -1129,7 +1142,6 @@ static const struct { const char type; const char *name; } magic_names[] = {
 	{ PERL_MAGIC_sv,             "sv(\\0)" },
 	{ PERL_MAGIC_arylen,         "arylen(#)" },
 	{ PERL_MAGIC_rhash,          "rhash(%)" },
-	{ PERL_MAGIC_regdata_names,  "regdata_names(+)" },
 	{ PERL_MAGIC_pos,            "pos(.)" },
 	{ PERL_MAGIC_symtab,         "symtab(:)" },
 	{ PERL_MAGIC_backref,        "backref(<)" },

diff --git a/embed.fnc b/embed.fnc
@@ -684,7 +684,8 @@ Ap	|I32	|regexec_flags	|NN regexp* prog|NN char* stringarg \
 				|NN char* strend|NN char* strbeg|I32 minend \
 				|NN SV* screamer|NULLOK void* data|U32 flags
 ApR	|regnode*|regnext	|NN regnode* p
-Ep	|SV*|reg_named_buff_sv	|NN SV* namesv
+EXp	|SV*|reg_named_buff_get	|NN SV* namesv|NULLOK const REGEXP * const from_re|U32 flags
+EXp	|SV*|reg_numbered_buff_get|I32 paren|NN const REGEXP * const rx|NULLOK SV* usesv|U32 flags
 Ep	|void	|regprop	|NULLOK const regexp *prog|NN SV* sv|NN const regnode* o
 Ap	|void	|repeatcpy	|NN char* to|NN const char* from|I32 len|I32 count
 ApP	|char*	|rninstr	|NN const char* big|NN const char* bigend \
@@ -1100,7 +1101,8 @@ sR	|I32	|do_trans_complex_utf8	|NN SV * const sv
 
 #if defined(PERL_IN_GV_C) || defined(PERL_DECL_PROT)
 s	|void	|gv_init_sv	|NN GV *gv|I32 sv_type
-s	|void	|require_errno	|NN GV *gv
+s	|HV*	|require_tie_mod|NN GV *gv|NN const char *varpv|NN SV* namesv \
+				|NN const char *methpv|const U32 flags
 #endif
 
 : #if defined(PERL_IN_HV_C) || defined(PERL_IN_SV_C) || defined(PERL_DECL_PROT)

diff --git a/embed.h b/embed.h
@@ -692,7 +692,8 @@
 #define regexec_flags		Perl_regexec_flags
 #define regnext			Perl_regnext
 #if defined(PERL_CORE) || defined(PERL_EXT)
-#define reg_named_buff_sv	Perl_reg_named_buff_sv
+#define reg_named_buff_get	Perl_reg_named_buff_get
+#define reg_numbered_buff_get	Perl_reg_numbered_buff_get
 #define regprop			Perl_regprop
 #endif
 #define repeatcpy		Perl_repeatcpy
@@ -1098,7 +1099,7 @@
 #if defined(PERL_IN_GV_C) || defined(PERL_DECL_PROT)
 #ifdef PERL_CORE
 #define gv_init_sv		S_gv_init_sv
-#define require_errno		S_require_errno
+#define require_tie_mod		S_require_tie_mod
 #endif
 #endif
 #if defined(PERL_IN_HV_C) || defined(PERL_DECL_PROT)
@@ -2904,7 +2905,8 @@
 #define regexec_flags(a,b,c,d,e,f,g,h)	Perl_regexec_flags(aTHX_ a,b,c,d,e,f,g,h)
 #define regnext(a)		Perl_regnext(aTHX_ a)
 #if defined(PERL_CORE) || defined(PERL_EXT)
-#define reg_named_buff_sv(a)	Perl_reg_named_buff_sv(aTHX_ a)
+#define reg_named_buff_get(a,b,c)	Perl_reg_named_buff_get(aTHX_ a,b,c)
+#define reg_numbered_buff_get(a,b,c,d)	Perl_reg_numbered_buff_get(aTHX_ a,b,c,d)
 #define regprop(a,b,c)		Perl_regprop(aTHX_ a,b,c)
 #endif
 #define repeatcpy(a,b,c,d)	Perl_repeatcpy(aTHX_ a,b,c,d)
@@ -3301,7 +3303,7 @@
 #if defined(PERL_IN_GV_C) || defined(PERL_DECL_PROT)
 #ifdef PERL_CORE
 #define gv_init_sv(a,b)		S_gv_init_sv(aTHX_ a,b)
-#define require_errno(a)	S_require_errno(aTHX_ a)
+#define require_tie_mod(a,b,c,d,e)	S_require_tie_mod(aTHX_ a,b,c,d,e)
 #endif
 #endif
 #ifdef PERL_CORE

diff --git a/ext/re/lib/re/Tie/Hash/NamedCapture.pm b/ext/re/lib/re/Tie/Hash/NamedCapture.pm
@@ -0,0 +1,111 @@
+package re::Tie::Hash::NamedCapture;
+use strict;
+use warnings;
+our $VERSION     = "0.01";
+use re qw(is_regexp
+          regname
+          regnames
+          regnames_count
+          regnames_iterinit
+          regnames_iternext);
+
+sub TIEHASH {
+    my $classname = shift;
+    my $hash = {@_};
+
+    if ($hash->{re} && !is_regexp($hash->{re})) {
+        die "'re' parameter to ",__PACKAGE__,"->TIEHASH must be a qr//"
+    }
+
+    return bless $hash, $classname;
+}
+
+sub FETCH {
+    return regname($_[1],$_[0]->{re},$_[0]->{all});
+}
+
+sub STORE {
+    require Carp;
+    Carp::croak("STORE forbidden: Hashes tied to ",__PACKAGE__," are read/only.");
+}
+
+sub FIRSTKEY {
+    regnames_iterinit($_[0]->{re});
+    return $_[0]->NEXTKEY;
+}
+
+sub NEXTKEY {
+    return regnames_iternext($_[0]->{re},$_[0]->{all});
+}
+
+sub EXISTS {
+    return defined regname( $_[1], $_[0]->{re},$_[0]->{all});
+}
+
+sub DELETE {
+    require Carp;
+    Carp::croak("DELETE forbidden: Hashes tied to ",__PACKAGE__," are read/only");
+}
+
+sub CLEAR {
+    require Carp;
+    Carp::croak("CLEAR forbidden: Hashes tied to ",__PACKAGE__," are read/only");
+}
+
+sub SCALAR {
+    return scalar regnames($_[0]->{re},$_[0]->{all});
+}
+
+1;
+
+__END__
+
+=head1 NAME
+
+re::Tie::Hash::NamedCapture - Perl module to support named regex capture buffers
+
+=head1 SYNOPSIS
+
+    tie my %hash,"re::Tie::Hash::NamedCapture";
+    # %hash now behaves like %-
+
+    tie my %hash,"re::Tie::Hash::NamedCapture",re => $qr, all=> 1,
+    # %hash now access buffers from regex in $qr like %+
+
+=head1 DESCRIPTION
+
+Implements the behaviour required for C<%+> and C<%-> but can be used
+independently.
+
+When the C<re> parameter is provided, and the value is the result of
+a C<qr//> expression then the hash is bound to that particular regexp
+and will return the results of its last successful match. If the
+parameter is omitted then the hash behaves just as C<$1> does by
+referencing the last successful match.
+
+When the C<all> parameter is provided then the result of a fetch
+is an array ref containing the contents of each buffer whose name
+was the same as the key used for the access. If the buffer wasn't
+involved in the match then an undef will be stored. When the all
+parameter is omitted or not a true value then the return will be
+a the content of the left most defined buffer with the given name.
+If there is no buffer with the desired name defined then C<undef>
+is returned.
+
+
+For instance:
+
+    my $qr = qr/(?<foo>bar)/;
+    if ( 'bar' =~ /$qr/ ) {
+        tie my %hash,"re::Tie::Hash::NamedCapture",re => $qr, all => 1;
+        if ('bar'=~/bar/) {
+            # last successful match is now different
+            print $hash{foo}; # prints foo
+        }
+    }
+
+=head1 SEE ALSO
+
+L<re>, L<perlmodlib/Pragmatic Modules>.
+
+=cut
diff --git a/ext/re/re.pm b/ext/re/re.pm
@@ -4,9 +4,11 @@ package re;
 use strict;
 use warnings;
 
-our $VERSION     = "0.07";
+our $VERSION     = "0.08";
 our @ISA         = qw(Exporter);
-our @EXPORT_OK   = qw(is_regexp regexp_pattern regmust);
+our @EXPORT_OK   = qw(is_regexp regexp_pattern regmust 
+                      regname regnames 
+                      regnames_count regnames_iterinit regnames_iternext);
 our %EXPORT_OK = map { $_ => 1 } @EXPORT_OK;
 
 # *** WARNING *** WARNING *** WARNING *** WARNING *** WARNING ***
@@ -464,6 +466,46 @@ floating string. This will be what the optimiser of the Perl that you
 are using thinks is the longest. If you believe that the result is wrong
 please report it via the L<perlbug> utility.
 
+=item regname($name,$qr,$all)
+
+Returns the contents of a named buffer. If $qr is missing, or is not the
+result of a qr// then returns the result of the last successful match. If
+$all is true then returns an array ref containing one entry per buffer,
+otherwise returns the first defined buffer.
+
+=item regnames($qr,$all)
+
+Returns a list of all of the named buffers defined in a pattern. If 
+$all is true then it returns all names defined, if not returns only 
+names which were involved in the last successful match. If $qr is omitted
+or is not the result of a qr// then returns the details for the last
+successful match.
+
+=item regnames_iterinit($qr)
+
+Initializes the internal hash iterator associated to a regexps named capture
+buffers. If $qr is omitted resets the iterator associated with the regexp used 
+in the last successful match.
+
+=item regnames_iternext($qr,$all)
+
+Gets the next key from the hash associated with a regexp. If $qr
+is omitted resets the iterator associated with the regexp used in the 
+last successful match. If $all is true returns the keys of all of the 
+distinct named buffers in the pattern, if not returns only those names
+used in the last successful match.
+
+=item regnames_count($qr)
+
+Returns the number of distinct names defined in the regexp $qr. If
+$qr is omitted or not a regexp returns the count of names in the 
+last successful match. 
+
+B<Note:> that this result is always the actual  number of distinct 
+named buffers defined, it may not actually match that which is 
+returned by C<regnames()> and related routines when those routines 
+have not been called with the $all parameter set..
+
 =back
 
 =head1 SEE ALSO