In [None]:
import re
from typing import Pattern
from dataclasses import dataclass
import pyparsing as pp
# Turn on packrat memoization for all pyparsing expressions. The pyparsing
# docs recommend calling this right after the import. The snake_case name
# matches the pyparsing 3 API used in the rest of this notebook (pp.one_of);
# `enable_Packrat` does not exist and raised AttributeError.
pp.ParserElement.enable_packrat()

In [None]:
# Regex fragments for a single gloss "word".
#
# Every fragment is wrapped in a *scoped* verbose group "(?x: ... )" rather
# than starting with the global flag "(?x)". The fragments are interpolated
# into the middle of word_all_regexp, and a global inline flag that is not at
# the very start of a pattern is deprecated since Python 3.6 and raises
# re.error on Python 3.11+. The scoped form keeps each fragment usable both on
# its own and when embedded in a larger pattern.
number_regexp =  r"""(?x:                   # a number contains:
                   [0-9]+                   #   one or more digits
                   (?: \.[0-9]+ )?          #   optionally followed by decimal dot and more digits
                   s?                       #   optionally followed by 's' (e.g. "1990s")
                 )"""
alpha_regexp =   r"""(?x:                   # an alphanumeric word:
                   (?!                      #   never has index sign prefixes of the form:
                     (?: THUMB- )?          #     THUMB-IX-/THUMB-POSS-/THUMB-SELF-
                     (?:                    #   or of the form
                         IX- |              #     IX-
                         POSS- |            #     POSS-
                         SELF-              #     SELF-
                     )
                   )                        # and contains:
                   [A-Z0-9]                 #   one uppercase letter or digit
                   (?:                      #   optionally followed by:
                     [A-Z0-9'-]*            #     more uppercase letters and digits, and punctuation characters,
                     [A-Z0-9]               #     where the last character may not be punctuation
                   )?
                   (?:                      # and is optionally followed by:
                     \.                     #   a trailing . (e.g., ETC.)
                   |                        # or
                     :[0-9]                 #   a trailing colon and digit (e.g. COUNT-ON-FINGERS:2)
                   )?
                 )"""
lookahead_regexp = r"""(?x:                 # a word may not be followed by:
                   (?:
                     (?! [a-z] )            #   any lowercase letters (to prevent e.g. AGE-1p from matching as AGE-1),
                   |                        # with the exception of:
                     (?= wg )               #   wg, describing a wiggle sign (e.g., MOTHERwg)
                   )
                 )"""
# The three fragments are substituted positionally: number, alphanumeric word,
# then the zero-width lookahead that constrains what may follow either one.
word_all_regexp = r"""(?x:                  # all put together, a word may be:
                   (?: %s | %s )            #   either a number or an alphanumeric word,
                   %s                       #   not followed by lowercase letters, except for wg
                 )""" % (number_regexp, alpha_regexp, lookahead_regexp)

In [None]:
# Terminal expressions of the gloss grammar.
#
# pp.Keyword only matches when the token is neither immediately preceded nor
# immediately followed by a keyword character (alphanumerics, "_" and "$" by
# default). That suits purely alphabetic tags, which must not match as the
# prefix of a longer word. It makes punctuation-style tokens unmatchable
# whenever they are attached to a word (e.g. "ns-JOHN", "A+B", "IX-loc"), so
# those tokens are pp.Literal instead.
# NOTE(review): the rest of the grammar is not visible from this cell; confirm
# that no rule relied on these tokens matching only when space-delimited.

# A word: number or alphanumeric gloss, as defined by word_all_regexp.
word = pp.Regex(word_all_regexp)

# Alphabetic tags: kept as keywords so they only match as whole tokens.
cl_prefix = pp.one_of(["CL", "DCL", "LCL", "SCL", "BCL", "BPCL", "PCL", "ICL"], as_keyword = True)
lex_exceptions = pp.one_of(["part", "'WHAT'"], as_keyword = True)
aspect_text = pp.Keyword("aspect")
index_core_ix = pp.Keyword("IX")
other_index_core = pp.one_of(["POSS", "SELF"], as_keyword = True)

# Prefixes that are glued directly onto the following word.
ns_prefix = pp.Literal("ns-")
fs_prefix = pp.Literal("fs-")

# A single digit followed by "p" (e.g. the "1p" in AGE-1p).
person = pp.Regex(r"[0-9]p")

# Connectors and suffixes that sit directly between or after word characters.
mwe_indic = pp.Literal("-")
arc = pp.Literal("-arc")
loc = pp.Literal("-loc")
compound = pp.Literal("+")
choice = pp.Literal("/")
contraction = pp.Literal("^")