Permalink
Browse files

Add /d, /l, /u (infixed) regex modifiers

This patch adds recognition of these modifiers, with appropriate action
for d and l.  u does nothing useful yet.  This allows for the
interpolation of a regex into another one without losing the character
set semantics that it was compiled with, as for the first time, the
semantics is now specified in the stringification as one of these
modifiers.

To this end, it allocates an unused bit in the structures.  The off-
sets change so as to not disturb other bits.
  • Loading branch information...
1 parent 4c2c679 commit 9de15fec376a8ff90a38fad0ff322c72c2995765 Karl Williamson committed with Father Chrysostomos Sep 21, 2010
Showing with 187 additions and 38 deletions.
  1. +5 −1 op.c
  2. +1 −1 op.h
  3. +3 −2 op_reg_common.h
  4. +25 −1 pod/perldelta.pod
  5. +4 −2 pod/perldiag.pod
  6. +48 −17 pod/perlre.pod
  7. +39 −1 regcomp.c
  8. +9 −3 regexp.h
  9. +2 −2 regnodes.h
  10. +30 −2 t/re/pat.t
  11. +6 −4 t/re/re.t
  12. +6 −0 t/re/reg_mesg.t
  13. +9 −2 universal.c
View
6 op.c
@@ -3649,8 +3649,12 @@ Perl_newPMOP(pTHX_ I32 type, I32 flags)
if (PL_hints & HINT_RE_TAINT)
pmop->op_pmflags |= PMf_RETAINT;
- if (PL_hints & HINT_LOCALE)
+ if (PL_hints & HINT_LOCALE) {
pmop->op_pmflags |= PMf_LOCALE;
+ }
+ else if ((! (PL_hints & HINT_BYTES)) && (PL_hints & HINT_UNI_8_BIT)) {
+ pmop->op_pmflags |= RXf_PMf_UNICODE;
+ }
#ifdef USE_ITHREADS
View
2 op.h
@@ -366,7 +366,7 @@ struct pmop {
/* Leave some space, so future bit allocations can go either in the shared or
* unshared area without affecting binary compatibility */
-#define PMf_BASE_SHIFT (_RXf_PMf_SHIFT_NEXT+8)
+#define PMf_BASE_SHIFT (_RXf_PMf_SHIFT_NEXT+7)
/* taint $1 etc. if target tainted */
#define PMf_RETAINT (1<<(PMf_BASE_SHIFT+0))
View
@@ -29,14 +29,15 @@
#define RXf_PMf_EXTENDED (1 << (RXf_PMf_STD_PMMOD_SHIFT+3)) /* /x */
#define RXf_PMf_KEEPCOPY (1 << (RXf_PMf_STD_PMMOD_SHIFT+4)) /* /p */
#define RXf_PMf_LOCALE (1 << (RXf_PMf_STD_PMMOD_SHIFT+5))
+#define RXf_PMf_UNICODE (1 << (RXf_PMf_STD_PMMOD_SHIFT+6))
/* Next available bit after the above. Name begins with '_' so won't be
* exported by B */
-#define _RXf_PMf_SHIFT_NEXT (RXf_PMf_STD_PMMOD_SHIFT+6)
+#define _RXf_PMf_SHIFT_NEXT (RXf_PMf_STD_PMMOD_SHIFT+7)
/* Mask of the above bits. These need to be transferred from op_pmflags to
* re->extflags during compilation */
-#define RXf_PMf_COMPILETIME (RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_LOCALE|RXf_PMf_FOLD|RXf_PMf_EXTENDED|RXf_PMf_KEEPCOPY)
+#define RXf_PMf_COMPILETIME (RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_LOCALE|RXf_PMf_FOLD|RXf_PMf_EXTENDED|RXf_PMf_KEEPCOPY|RXf_PMf_UNICODE)
/* These copies need to be numerical or defsubs_h.PL won't know about them. */
#define PMf_MULTILINE 1<<0
View
@@ -40,6 +40,19 @@ main purpose of this is to allow tests that rely on the stringification
to not have to change when new modifiers are added. See
L<perlre/Extended Patterns>.
+=head2 C<"d">, C<"l">, and C<"u"> regex modifiers added
+
+These modifiers are currently only available within a C<(?...)> construct.
+
+The C<"l"> modifier says to compile the regular expression as if it were
+in the scope of C<use locale>, even if it is not.
+
+The C<"u"> modifier currently does nothing.
+
+The C<"d"> modifier is used in the scope of C<use locale> to compile the
+regular expression as if it were not in that scope.
+See L<perlre/(?dlupimsx-imsx)>.
+
=head1 Security
XXX Any security-related notices go here. In particular, any security
@@ -66,6 +79,17 @@ can use something like the following:
And then use C<$modifiers> instead of C<-xism>.
+=head2 Regular expressions retain their localeness when interpolated
+
+Regular expressions compiled under C<"use locale"> now retain this when
+interpolated into a new regular expression compiled outside a
+C<"use locale">, and vice-versa.
+
+Previously, a regular expression interpolated into another one inherited
+the localeness of the surrounding one, losing whatever state it
+originally had. This is considered a bug fix, but may trip up code that
+has come to rely on the incorrect behavior.
+
[ List each incompatible change as a =head2 entry ]
=head1 Deprecations
@@ -320,7 +344,7 @@ be noted as well.
=item *
-XXX
+See L</Regular expressions retain their localeness when interpolated>
=back
View
@@ -4035,8 +4035,10 @@ where the problem was discovered. See L<perlre>.
<-- HERE shows in the regular expression about where the problem was
discovered. This happens when using the C<(?^...)> construct to tell
Perl to use the default regular expression modifiers, and you
-redundantly specify a default modifier. For other causes, see
-L<perlre>.
+redundantly specify a default modifier; or having a modifier that can't
+be turned off (such as C<"p"> or C<"l">) after a minus; or specifying
+more than one of the C<"d">, C<"l">, or C<"u"> modifiers. For other
+causes, see L<perlre>.
=item Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/
View
@@ -594,20 +594,15 @@ whitespace formatting, a simple C<#> will suffice. Note that Perl closes
the comment as soon as it sees a C<)>, so there is no way to put a literal
C<)> in the comment.
-=item C<(?pimsx-imsx)>
+=item C<(?dlupimsx-imsx)>
-=item C<(?^pimsx)>
+=item C<(?^lupimsx)>
X<(?)> X<(?^)>
One or more embedded pattern-match modifiers, to be turned on (or
turned off, if preceded by C<->) for the remainder of the pattern or
the remainder of the enclosing pattern group (if any).
-Starting in Perl 5.14, a C<"^"> (caret or circumflex accent) immediately
-after the C<"?"> is a shorthand equivalent to C<-imsx> and compiling the
-regex under C<no locale>. Flags may follow the caret to override it.
-But a minus sign is not legal with it.
-
This is particularly useful for dynamic patterns, such as those read in from a
configuration file, taken from an argument, or specified in a table
somewhere. Consider the case where some patterns want to be case
@@ -634,17 +629,53 @@ These modifiers do not carry over into named subpatterns called in the
enclosing group. In other words, a pattern such as C<((?i)(&NAME))> does not
change the case-sensitivity of the "NAME" pattern.
-Note that the C<p> modifier is special in that it can only be enabled,
-not disabled, and that its presence anywhere in a pattern has a global
-effect. Thus C<(?-p)> and C<(?-p:...)> are meaningless and will warn
-when executed under C<use warnings>.
+Starting in Perl 5.14, a C<"^"> (caret or circumflex accent) immediately
+after the C<"?"> is a shorthand equivalent to C<d-imsx>. Flags (except
+C<"d">) may follow the caret to override it.
+But a minus sign is not legal with it.
+
+Also, starting in Perl 5.14, are modifiers C<"d">, C<"l">, and C<"u">,
+which for 5.14 may not be used as suffix modifiers.
+
+C<"l"> means to use a locale (see L<perllocale>) when pattern matching.
+The locale used will be the one in effect at the time of execution of
+the pattern match. This may not be the same as the compilation-time
+locale, and can differ from one match to another if there is an
+intervening call of the
+L<setlocale() function|perllocale/The setlocale function>.
+This modifier is automatically set if the regular expression is compiled
+within the scope of a C<"use locale"> pragma.
+
+C<"u"> has no effect currently. It is automatically set if the regular
+expression is compiled within the scope of a
+L<C<"use feature 'unicode_strings">|feature> pragma.
+
+C<"d"> means to use the traditional Perl pattern matching behavior.
+This is dualistic (hence the name C<"d">, which also could stand for
+"default"). When this is in effect, Perl matches utf8-encoded strings
+using Unicode rules, and matches non-utf8-encoded strings using the
+platform's native character set rules.
+See L<perlunicode/The "Unicode Bug">. It is automatically selected by
+default if the regular expression is compiled neither within the scope
+of a C<"use locale"> pragma nor a <C<"use feature 'unicode_strings">
+pragma.
+
+Note that the C<d>, C<l>, C<p>, and C<u> modifiers are special in that
+they can only be enabled, not disabled, and the C<d>, C<l>, and C<u>
+modifiers are mutually exclusive; a maximum of one may appear in the
+construct. Specifying one de-specifies the others. Thus, for example,
+C<(?-p)> and C<(?-d:...)> are meaningless and will warn when compiled
+under C<use warnings>.
+
+Note also that the C<p> modifier is special in that its presence
+anywhere in a pattern has a global effect.
=item C<(?:pattern)>
X<(?:)>
-=item C<(?imsx-imsx:pattern)>
+=item C<(?dluimsx-imsx:pattern)>
-=item C<(?^imsx:pattern)>
+=item C<(?^luimsx:pattern)>
X<(?^:)>
This is for clustering, not capturing; it groups subexpressions like
@@ -660,7 +691,7 @@ but doesn't spit out extra fields. It's also cheaper not to capture
characters if you don't need to.
Any letters between C<?> and C<:> act as flags modifiers as with
-C<(?imsx-imsx)>. For example,
+C<(?dluimsx-imsx)>. For example,
/(?s-i:more.*than).*million/i
@@ -669,8 +700,8 @@ is equivalent to the more verbose
/(?:(?s-i)more.*than).*million/i
Starting in Perl 5.14, a C<"^"> (caret or circumflex accent) immediately
-after the C<"?"> is a shorthand equivalent to C<-imsx> and compiling the
-regex under C<no locale>. Any positive flags may follow the caret, so
+after the C<"?"> is a shorthand equivalent to C<d-imsx>. Any positive
+flags (except C<"d">) may follow the caret, so
(?^x:foo)
@@ -679,7 +710,7 @@ is equivalent to
(?x-ims:foo)
The caret tells Perl that this cluster doesn't inherit the flags of any
-surrounding pattern, but to go back to the system defaults (C<-imsx>),
+surrounding pattern, but to go back to the system defaults (C<d-imsx>),
modified by any flags specified.
The caret allows for simpler stringification of compiled regular
View
@@ -370,6 +370,7 @@ static const scan_data_t zero_scan_data =
#define UTF (RExC_utf8 != 0)
#define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
+#define UNI_SEMANTICS ((RExC_flags & RXf_PMf_UNICODE) != 0)
#define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
#define OOB_UNICODE 12345678
@@ -4441,9 +4442,17 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
SvPOK_on(rx);
SvFLAGS(rx) |= SvUTF8(pattern);
*p++='('; *p++='?';
- if (has_minus) { /* If a default, cover it using the caret */
+
+ /* If a default, cover it using the caret */
+ if (has_minus || (r->extflags & ~(RXf_PMf_LOCALE|RXf_PMf_UNICODE))) {
*p++= DEFAULT_PAT_MOD;
}
+ if (r->extflags & RXf_PMf_LOCALE) {
+ *p++ = LOCALE_PAT_MOD;
+ }
+ else if (r->extflags & RXf_PMf_UNICODE) {
+ *p++ = UNICODE_PAT_MOD;
+ }
if (has_p)
*p++ = KEEPCOPY_PAT_MOD; /*'p'*/
{
@@ -6124,20 +6133,48 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
that follow */
has_use_defaults = TRUE;
STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
+ RExC_flags &= ~(RXf_PMf_LOCALE|RXf_PMf_UNICODE);
goto parse_flags;
default:
--RExC_parse;
parse_flags: /* (?i) */
{
U32 posflags = 0, negflags = 0;
U32 *flagsp = &posflags;
+ bool has_charset_modifier = 0;
while (*RExC_parse) {
/* && strchr("iogcmsx", *RExC_parse) */
/* (?g), (?gc) and (?o) are useless here
and must be globally applied -- japhy */
switch (*RExC_parse) {
CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
+ case LOCALE_PAT_MOD:
+ if (has_charset_modifier || flagsp == &negflags) {
+ goto fail_modifiers;
+ }
+ *flagsp &= ~RXf_PMf_UNICODE;
+ *flagsp |= RXf_PMf_LOCALE;
+ has_charset_modifier = 1;
+ break;
+ case UNICODE_PAT_MOD:
+ if (has_charset_modifier || flagsp == &negflags) {
+ goto fail_modifiers;
+ }
+ *flagsp &= ~RXf_PMf_LOCALE;
+ *flagsp |= RXf_PMf_UNICODE;
+ has_charset_modifier = 1;
+ break;
+ case DUAL_PAT_MOD:
+ if (has_use_defaults
+ || has_charset_modifier
+ || flagsp == &negflags)
+ {
+ goto fail_modifiers;
+ }
+ *flagsp &= ~(RXf_PMf_LOCALE|RXf_PMf_UNICODE);
+ has_charset_modifier = 1;
+ break;
case ONCE_PAT_MOD: /* 'o' */
case GLOBAL_PAT_MOD: /* 'g' */
if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
@@ -6182,6 +6219,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
* if there is a minus, it means will be trying to
* re-specify a default which is an error */
if (has_use_defaults || flagsp == &negflags) {
+ fail_modifiers:
RExC_parse++;
vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
/*NOTREACHED*/
View
@@ -236,9 +236,9 @@ and check for NULL.
case SINGLE_PAT_MOD: *(pmfl) |= RXf_PMf_SINGLELINE; break; \
case XTENDED_PAT_MOD: *(pmfl) |= RXf_PMf_EXTENDED; break
-/* Note, includes locale */
+/* Note, includes locale, unicode */
#define STD_PMMOD_FLAGS_CLEAR(pmfl) \
- *(pmfl) &= ~(RXf_PMf_FOLD|RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_EXTENDED|RXf_PMf_LOCALE)
+ *(pmfl) &= ~(RXf_PMf_FOLD|RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_EXTENDED|RXf_PMf_LOCALE|RXf_PMf_UNICODE)
/* chars and strings used as regex pattern modifiers
* Singlular is a 'c'har, plural is a "string"
@@ -258,12 +258,18 @@ and check for NULL.
#define IGNORE_PAT_MOD 'i'
#define XTENDED_PAT_MOD 'x'
#define NONDESTRUCT_PAT_MOD 'r'
+#define LOCALE_PAT_MOD 'l'
+#define UNICODE_PAT_MOD 'u'
+#define DUAL_PAT_MOD 'd'
#define ONCE_PAT_MODS "o"
#define KEEPCOPY_PAT_MODS "p"
#define EXEC_PAT_MODS "e"
#define LOOP_PAT_MODS "gc"
#define NONDESTRUCT_PAT_MODS "r"
+#define LOCALE_PAT_MODS "l"
+#define UNICODE_PAT_MODS "u"
+#define DUAL_PAT_MODS "d"
/* This string is expected by regcomp.c to be ordered so that the first
* character is the flag in bit RXf_PMf_STD_PMMOD_SHIFT of extflags; the next
@@ -288,7 +294,7 @@ and check for NULL.
/* Leave some space, so future bit allocations can go either in the shared or
* unshared area without affecting binary compatibility */
-#define RXf_BASE_SHIFT (_RXf_PMf_SHIFT_NEXT+3)
+#define RXf_BASE_SHIFT (_RXf_PMf_SHIFT_NEXT+2)
/* Anchor and GPOS related stuff */
#define RXf_ANCH_BOL (1<<(RXf_BASE_SHIFT+0))
View
@@ -625,14 +625,14 @@ EXTCONST char * const PL_reg_name[] = {
EXTCONST char * PL_reg_extflags_name[];
#else
EXTCONST char * const PL_reg_extflags_name[] = {
- /* Bits in extflags defined: 11111111111111111111111000111111 */
+ /* Bits in extflags defined: 11111111111111111111111001111111 */
"MULTILINE", /* 0x00000001 */
"SINGLELINE", /* 0x00000002 */
"FOLD", /* 0x00000004 */
"EXTENDED", /* 0x00000008 */
"KEEPCOPY", /* 0x00000010 */
"LOCALE", /* 0x00000020 */
- "UNUSED_BIT_6", /* 0x00000040 */
+ "UNICODE", /* 0x00000040 */
"UNUSED_BIT_7", /* 0x00000080 */
"UNUSED_BIT_8", /* 0x00000100 */
"ANCH_BOL", /* 0x00000200 */
Oops, something went wrong.

0 comments on commit 9de15fe

Please sign in to comment.