In [1]:
import re
from typing import Pattern
from dataclasses import dataclass

In [None]:
number_regexp =  r"""(?x)                   # a number contains:
                   [0-9]+                   #   one or more digits  
                   (?: \.[0-9]+ )?          #   optionally followed by decimal dot and more digits
                   s?                       #   optionally followed by 's' (e.g. "1990s")
                 """
alpha_regexp =   r"""(?x)                   # an alphanumeric word:
                   (?!                      #   never has index sign prefixes of the form:
                     (?: THUMB- )?          #     THUMB-IX-/THUMB-POSS-/THUMB-SELF-
                     (?:                    #   or of the form
                         IX- |              #     IX-
                         POSS- |            #     POSS-
                         SELF-              #     SELF-
                     )  
                   )                        # and contains:
                   [A-Z0-9]                 #   one uppercase letter or digit
                   (?:                      #   optionally followed by:
                     [A-Z0-9'-]*            #     more uppercase letters and digits, and punctuation characters,
                     [A-Z0-9]               #     where the last character may not be punctuation
                   )?
                   (?:                      # and is optionally followed by:
                     \.                     #   a trailing . (e.g., ETC.)
                   |                        # or
                     :[0-9]                 #   a trailing colon and digit (e.g. COUNT-ON-FINGERS:2)
                   )?
                 """
lookahead_regexp = r"""(?x)                 # a word may not be followed by:
                   (?:
                     (?! [a-z] )            #   any lowercase letters (to prevent e.g. AGE-1p from matching as AGE-1),              
                   |                        # with the exception of:
                     (?= wg )               #   wg, describing a wiggle sign (e.g., MOTHERwg)
                   )
                 """
word_all_regexp = r"""(?x)                  # all put together, a word may be:
                   (?: %s | %s )            #   either a number or an alphanumeric word,
                   %s                       #   not followed by lowercase letters, except for wg
                 """    % (number_regexp, alpha_regexp, lookahead_regexp)
                 
cl_prefix = ["CL", "DCL", "LCL", "SCL", "BCL", "BPCL", "PCL", "ICL"] # classifier prefixes
ns_prefix = "ns"
fs_prefix = "fs"
lex_exceptions = "part"
aspect_text = "aspect"
thumb = "THUMB"
index_core_ix = "IX"
other_index_core = ["POSS", "SELF"]
person = r"""[0-9]""" + "p"
arc = "arc"
loc = "loc"
compound = "+"
choice = "/"
contraction = "^"


In [None]:
INDEX = HANDEDNESS[0:1] & INDEX_WITH_OPT_HS & REPETITION[0:1] & (INDEX_PL | INDEX_LOCATION)[0:1] > IndexSign
INDEX_LOC = HANDEDNESS[0:1] & INDEX_LOC_WITH_OPT_HS & INDEX_LOC_PL[0:1] & ('"' & text & '"')[0:1] & REPETITION[0:1] & INDEX_LOC_LOCATION[0:1] > IndexSign
SIGN = (SUBJ_INDEX & ':')[0:1] & HANDEDNESS[0:1] & SIGN_WITH_OPT_HS & ((REPETITION[1] & SIGN_MODIFIERS & PL_LOC[0:1]) | (SIGN_MODIFIERS & REPETITION[0:1]))
GESTURE = GESTURE_NUCLEUS & REPETITION[0:1] & INDEX_LOC_LOCATION[0:1] > Gesture

ALLSIGNS = INDEX_LOC | INDEX | SIGN | GESTURE

In [None]:
@dataclass
class GlossExpressions:
    """A collection of valid expressions used in gloss conventions."""
    
    number: Pattern[str] = re.compile(number_regexp, re.X)
    alpha: Pattern[str] = re.compile(alpha_regexp, re.X)
    lookahead: Pattern[str] = re.compile(lookahead_regexp, re.X)
    word_all: Pattern[str] = re.compile(word_all_regexp, re.X)


In [None]:
HANDCONFIG = handshape & (">" & handshape)[0:1]
HANDEDNESS = "(" & handedness & ")" & (alternate & ('.' | l.Lookahead("(")))[0:1]
# The lookahead disambiguates handedness and hand configuration. HC triggers only
# if the relevant text does not also match handednesss
SIGN_HS = ~l.Lookahead(HANDEDNESS) & "(" & HANDCONFIG & ")"
CLASSIFIER_HS = ":" & HANDCONFIG
SUBJ_INDEX = neu | subject_person | simple_subject_location
OBJ_INDEX = neu | simple_object_location
DEFINITE_PLURAL = plural_number
PL_QUANTIFIER = plural_location & ("/" & plural_location)[0:]
PL_ARC = ('-' & DEFINITE_PLURAL)[0:1] & '-' & arc
PL_LOC = ('-' & DEFINITE_PLURAL)[0:1] & ':' & PL_QUANTIFIER
PLURAL = plural & ((PL_ARC & (':' & OBJ_INDEX)[0:1]) | PL_LOC)
# The lookahead disambiguates aspect and plural. Aspect triggers only if
# the relevant text does not also match a plural
ASPECT = ~l.Lookahead(PLURAL) & aspect_text & ('-' & aspect_text)[0:]
REPETITION = plus[1:] > 'reduplication'

BASE = word & ("-" & word)[0:] & wiggle[0:1]
CHOICE = BASE & (choice & BASE)[1:]
LOAN = "#" & BASE > LoanSign
FS = fs_prefix & "-" & BASE > Fingerspelled
LEXICAL = lex_exceptions | CHOICE | BASE > LexicalSign
NS = ns_prefix & "-" & (lex_exceptions | LOAN | FS | CHOICE | BASE) > NameSign
SIGN_CORE = (NS | LOAN | FS | LEXICAL) >> l.throw
CONTRACTION = caret & SIGN_CORE
SIGN_UNIT = SIGN_CORE & CONTRACTION[0:1] > SignComponent
COMPOUND = compound_plus & SIGN_UNIT
CLASSIFIER = cl_prefix & (~uncertain)[0:1] & CLASSIFIER_HS[0:1] & '"' & text  & '"' > Classifier
CLASSIFIER_NUCLEUS = CLASSIFIER > Nucleus
SIGN_NUCLEUS = SIGN_UNIT & COMPOUND[0:] > Nucleus
SIGN_WITH_OPT_HS = CLASSIFIER_NUCLEUS | (SIGN_HS[0:1] & SIGN_NUCLEUS)
SIGN_MODIFIERS = ('-' & ASPECT)[0:1] & ('-' & object_person)[0:1] & (('-' & PLURAL) | (':' & OBJ_INDEX))[0:1]
SIGN = (SUBJ_INDEX & ':')[0:1] & HANDEDNESS[0:1] & SIGN_WITH_OPT_HS & ((REPETITION[1] & SIGN_MODIFIERS & PL_LOC[0:1]) | (SIGN_MODIFIERS & REPETITION[0:1]))

INDEX_IX = (thumb[0:1] & "-")[0:1] & index_core_ix

INDEX_CORE = INDEX_IX | other_index_core
INDEX_NUCLEUS = INDEX_CORE & '-' & object_person > Nucleus
INDEX_LOCATION = ':' & OBJ_INDEX
INDEX_PL = '-' & PLURAL
INDEX_WITH_OPT_HS = SIGN_HS[0:1] & INDEX_NUCLEUS 
INDEX = HANDEDNESS[0:1] & INDEX_WITH_OPT_HS & REPETITION[0:1] & (INDEX_PL | INDEX_LOCATION)[0:1] > IndexSign

INDEX_LOC_NUCLEUS = INDEX_IX & '-' & loc > Nucleus
INDEX_LOC_PL = '-' & arc
INDEX_LOC_LOCATION = ':' & OBJ_INDEX
INDEX_LOC_WITH_OPT_HS = SIGN_HS[0:1] & INDEX_LOC_NUCLEUS 
INDEX_LOC = HANDEDNESS[0:1] & INDEX_LOC_WITH_OPT_HS & INDEX_LOC_PL[0:1] & ('"' & text & '"')[0:1] & REPETITION[0:1] & INDEX_LOC_LOCATION[0:1] > IndexSign

GESTURE_NUCLEUS = HANDEDNESS[0:1] & (SIGN_HS | HANDCONFIG)[0:1] & '"' & text & '"' > Nucleus
GESTURE = GESTURE_NUCLEUS & REPETITION[0:1] & INDEX_LOC_LOCATION[0:1] > Gesture

ALLSIGNS = INDEX_LOC | INDEX | SIGN | GESTURE
GLOSS = ((stress & ALLSIGNS & stress) | ALLSIGNS) & l.Eos() > AttrNode


In [None]:
@dataclass
class RegexRule:
    # Guarantees that any combination is a valid Regex gloss morpheme.
    name: str
    pattern: str
    flags: int = re.X

    def __post_init__(self):
        self._re = re.compile(self.pattern, self.flags)
        
    @classmethod

    def __or__(self, rule: "RegexRule") -> "RegexRule":
        """Allow only number|alpha, not arbitrary unions."""
        allowed = {("number","alpha"), ("alpha","number")}
        if (self.name, rule.name) not in allowed:
            raise ValueError(f"Cannot `or` {self.name} with {rule.name}")
        pat = f"(?:{self.pattern}|{rule.pattern})"
        return RegexRule(f"{self.name}|{rule.name}", pat, self.flags)

    def then(self, rule: "RegexRule") -> "RegexRule":
        """Regular concatenation—but only allow number/alpha → lookahead."""
        if rule.name != "lookahead":
            raise ValueError(f"{self.name} can only be followed by lookahead, not {rule.name}")
        pat = f"{self.pattern}{rule.pattern}"
        return RegexRule(f"{self.name}+{rule.name}", pat, self.flags)

    def compile(self) -> Pattern:
        return self._re

In [4]:
number   = RegexRule("number",    number_regexp)
alpha    = RegexRule("alpha",     alpha_regexp)
lookahead = RegexRule("lookahead", lookahead_regexp)

word = (number | alpha).then(lookahead)

  self._re = re.compile(self.pattern, self.flags)
  self._re = re.compile(self.pattern, self.flags)
  self._re = re.compile(self.pattern, self.flags)
