In [1]:
import re
from typing import Pattern
from dataclasses import dataclass

In [2]:
number_regexp =  r"""(?x)                   # a number contains:
                   [0-9]+                   #   one or more digits  
                   (?: \.[0-9]+ )?          #   optionally followed by decimal dot and more digits
                   s?                       #   optionally followed by 's' (e.g. "1990s")
                 """
alpha_regexp =   r"""(?x)                   # an alphanumeric word:
                   (?!                      #   never has index sign prefixes of the form:
                     (?: THUMB- )?          #     THUMB-IX-/THUMB-POSS-/THUMB-SELF-
                     (?:                    #   or of the form
                         IX- |              #     IX-
                         POSS- |            #     POSS-
                         SELF-              #     SELF-
                     )  
                   )                        # and contains:
                   [A-Z0-9]                 #   one uppercase letter or digit
                   (?:                      #   optionally followed by:
                     [A-Z0-9'-]*            #     more uppercase letters and digits, and punctuation characters,
                     [A-Z0-9]               #     where the last character may not be punctuation
                   )?
                   (?:                      # and is optionally followed by:
                     \.                     #   a trailing . (e.g., ETC.)
                   |                        # or
                     :[0-9]                 #   a trailing colon and digit (e.g. COUNT-ON-FINGERS:2)
                   )?
                 """
lookahead_regexp = r"""(?x)                 # a word may not be followed by:
                   (?:
                     (?! [a-z] )            #   any lowercase letters (to prevent e.g. AGE-1p from matching as AGE-1),              
                   |                        # with the exception of:
                     (?= wg )               #   wg, describing a wiggle sign (e.g., MOTHERwg)
                   )
                 """
word_all_regexp = r"""(?x)                  # all put together, a word may be:
                   (?: %s | %s )            #   either a number or an alphanumeric word,
                   %s                       #   not followed by lowercase letters, except for wg
                 """    % (number_regexp, alpha_regexp, lookahead_regexp)

In [3]:
@dataclass
class RegexRule:
    """
    Guarantees that any combination is a valid Regex gloss morpheme.
    """
    name: str
    pattern: str
    flags: int = re.X

    def __post_init__(self):
        self._re = re.compile(self.pattern, self.flags)

    def __or__(self, rule: "RegexRule") -> "RegexRule":
        """Allow only number|alpha, not arbitrary unions."""
        allowed = {("number","alpha"), ("alpha","number")}
        if (self.name, rule.name) not in allowed:
            raise ValueError(f"Cannot `or` {self.name} with {rule.name}")
        pat = f"(?:{self.pattern}|{rule.pattern})"
        return RegexRule(f"{self.name}|{rule.name}", pat, self.flags)

    def then(self, rule: "RegexRule") -> "RegexRule":
        """Regular concatenation—but only allow number/alpha → lookahead."""
        if rule.name != "lookahead":
            raise ValueError(f"{self.name} can only be followed by lookahead, not {rule.name}")
        pat = f"{self.pattern}{rule.pattern}"
        return RegexRule(f"{self.name}+{rule.name}", pat, self.flags)

    def compile(self) -> Pattern:
        return self._re

In [4]:
number   = RegexRule("number",    number_regexp)
alpha    = RegexRule("alpha",     alpha_regexp)
lookahead = RegexRule("lookahead", lookahead_regexp)

word = (number | alpha).then(lookahead)

  self._re = re.compile(self.pattern, self.flags)
  self._re = re.compile(self.pattern, self.flags)
  self._re = re.compile(self.pattern, self.flags)
