#### Regular Expressions with Counted Repetitions

Consider the constructors of the abstract syntax tree for regular expressions from the chapter on Regular Expressions:

In [None]:
class RegEx:
  """Common base class of the regular-expression syntax-tree nodes defined below."""

  pass


class Œµ(RegEx):
  """Syntax-tree node for the empty word; its textual form is the empty string."""

  def __repr__(self):
    return ""


class Sym(RegEx):
  """Leaf node holding a single symbol in attribute `a`."""

  def __init__(self, a: str):
    self.a = a

  def __repr__(self):
    # The node prints as the bare symbol, without any quoting.
    return f'{self.a!s}'


class Choice(RegEx):
  """Binary node for the alternative of the sub-expressions `E1` and `E2`."""

  def __init__(self, E1: RegEx, E2: RegEx):
    self.E1 = E1
    self.E2 = E2

  def __repr__(self):
    # Always parenthesised, so the printed form shows the tree structure.
    return f'({self.E1!s}|{self.E2!s})'


class Conc(RegEx):
  """Binary node for the concatenation of the sub-expressions `E1` and `E2`."""

  def __init__(self, E1: RegEx, E2: RegEx):
    self.E1 = E1
    self.E2 = E2

  def __repr__(self):
    # Always parenthesised, so the printed form shows the tree structure.
    return f'({self.E1!s}{self.E2!s})'


class Star(RegEx):
  """Unary node for the repetition (Kleene star) of the sub-expression `E`."""

  def __init__(self, E: RegEx):
    self.E = E

  def __repr__(self):
    # The operand is parenthesised and followed by the star.
    return f'({self.E!s})*'

Let us build a parser that constructs the abstract syntax tree of regular expressions. The attribute grammar for this is as follows, with `plainchar` and `escapedchar` containing all the characters as in Chapter 4, Analysis of Context-Free Languages:

    expression(e)  →  term(e) { '|' term(f) « e := Choice(e, f) »  }
    term(e)  →  factor(e) { factor(f) « e := Conc(e, f) » }
    factor(e) → atom(e) [ '*' « e := Star(e) » | '+' « e := Conc(e, Star(e)) » | '?' « e := Choice(e, ε()) » ]
    atom(e)  →  plainchar(e) | escapedchar(e) | '(' expression(e) ')'
    plainchar(e)  →  ' ' « e := Sym(' ') » | ... | '~' « e := Sym('~') »
    escapedchar(e)  → '\\' ( '(' « e := Sym('(') » | ')' « e := Sym(')') » | ... | '|' « e := Sym('|') »)

Extend the parser from the course notes to include attribute evaluation rules so that `parse` returns the abstract syntax tree. Copy the parser from the course notes and modify it:

In [None]:
# Characters that stand for themselves in a regular expression.
PlainChars = set(' !"#$%&\',-./0123456789:;<=>@ABCDEFGHIJKLMNOPQRSTUVWXYZ[]^_`abcdefghijklmnopqrstuvwxyz{}~')
# Characters that are accepted after a backslash (see `atom`).
EscapedChars = set('()*+?\\|')
# Characters that can start a factor; `term` keeps parsing factors while `sym` is in this set.
FirstFactor = PlainChars | {'\\', '('}

# Scanner state shared by the parsing procedures through `global` declarations.
src: str  # the text being parsed, assigned by `parse`
pos: int  # index in `src` of the next character to be read by `nxt`
sym: str  # the current character, or chr(0) once `src` is exhausted


def nxt():
  """Advance the scanner: load the next character of `src` into `sym`.

  Once the input is exhausted, `sym` becomes chr(0) and `pos` is left unchanged.
  """
  global pos, sym
  if pos >= len(src):
    sym = chr(0)
    return
  sym = src[pos]
  pos += 1


def expression():
  """Parse `term { '|' term }` and return the left-nested `Choice` tree."""
  result = term()
  while sym == '|':
    nxt()  # consume the '|'
    result = Choice(result, term())
  return result


def term():
  """Parse `factor { factor }` and return the left-nested `Conc` tree."""
  result = factor()
  # Another factor follows exactly when the current symbol can start one.
  while sym in FirstFactor:
    result = Conc(result, factor())
  return result


def factor():
  """Parse an atom with an optional postfix operator `*`, `+`, or `?`.

  `E+` is expanded to `Conc(E, Star(E))`; `E?` to a `Choice` between `E` and the empty word.
  """
  operand = atom()
  operator = sym
  if operator not in ('*', '+', '?'):
    return operand  # no postfix operator present
  nxt()  # consume the operator
  if operator == '*':
    return Star(operand)
  if operator == '+':
    return Conc(operand, Star(operand))
  return Choice(operand, Œµ())


def atom():
  """Parse a plain character, an escaped character, or a parenthesised expression.

  Raises:
    Exception: on an invalid escape, a missing ')', or a character that cannot start an atom.
  """
  if sym in PlainChars:
    node = Sym(sym)
    nxt()
    return node
  if sym == '\\':
    nxt()  # skip the backslash; the escaped character is now current
    if sym not in EscapedChars:
      raise Exception('invalid escaped character at ' + str(pos))
    node = Sym(sym)
    nxt()
    return node
  if sym == '(':
    nxt()
    node = expression()
    if sym != ')':
      raise Exception("')' expected at " + str(pos))
    nxt()
    return node
  # Nothing that can start an atom: tell characters outside ' '..'~' apart from the rest.
  if sym < ' ' or sym > '~':
    raise Exception('invalid character at ' + str(pos))
  raise Exception('unexpected character at ' + str(pos))


def parse(s: str):
  """Parse `s` as a regular expression and return its abstract syntax tree.

  Raises:
    Exception: if `s` is malformed or has characters left over after the expression.
  """
  global src, pos
  src = s
  pos = 0
  nxt()  # load the first symbol
  tree = expression()
  if sym == chr(0):  # chr(0) is the end-of-input marker set by `nxt`
    return tree
  raise Exception('unexpected character at ' + str(pos))

Here are some test cases:

In [None]:
# Malformed inputs; each call is commented out because it raises the exception shown on its right.
# parse("a\\$") # Exception: invalid escaped character at 3
# parse("a(b") # Exception: ')' expected at 3
# parse("a(" + chr(5) + ")") # invalid character at 3
# parse("a" + chr(5)) # unexpected character at 2
# Well-formed inputs: the printed tree is fully parenthesised.
assert str(parse('(a*)*abcc')) == '((((((a)*)*a)b)c)c)'
assert str(parse('a|b*c')) == '(a|((b)*c))'

Let's have some fun and use this to check the equivalence of regular expressions. The following cells contain code from the course notes:

In [None]:
class fset(frozenset):
  """A frozenset that prints in mathematical notation, e.g. `{1, 2}`."""

  def __repr__(self):
    members = ', '.join(map(str, self))
    return f'{{{members}}}'

In [None]:
def wrap(a):
  """Return `str(a)` broken into pieces of at most 12 characters.

  The pieces are joined by the two-character sequence backslash + 'n', not by a real newline.
  """
  from textwrap import TextWrapper

  pieces = TextWrapper(width=12).wrap(str(a))
  return '\\n'.join(pieces)


# Type of a transition function: maps a state to a map from symbols to the set of successor states.
TransFunc = dict[str, dict[str, set[str]]]


class FiniteStateAutomaton:
  Œ£: set[str]  # set of symbols
  Q: set[str]  # set of states
  I: set[str]  # I ‚äÜ Q, the initial states,
  Œ¥: TransFunc  # representing Q ‚Üõ Œ£ ‚Üõ ùí´Q, the transition function
  F: set[str]  # F ‚äÜ Q, the finite states
  vars = ()  # for reduced FSAs, the names of the original variables

  def __init__(self, Œ£, Q, I, Œ¥, F):
    self.Œ£, self.Q, self.I, self.Œ¥, self.F = Œ£, fset(Q), fset(I), Œ¥, fset(F)

  def draw(self, trace=None):
    from graphviz import Digraph

    dot = Digraph(
      graph_attr={'rankdir': 'LR'},
      node_attr={
        'fontsize': '10',
        'fontname': 'Noto Sans',
        'margin': '0',
        'width': '0.25',
      },  # 'nodesep': '0.75', 'ranksep': '0.75'
      edge_attr={'fontsize': '10', 'fontname': 'Noto Sans', 'arrowsize': '0.5'},
    )  # 'weight': '5.0' # create a directed graph
    for q in self.I:
      dot.node('_' + str(q), label='', shape='none', height='.0', width='.0')
      dot.node(wrap(q), shape='circle')
      dot.edge('_' + str(q), wrap(q), len='.1')
    P = self.I | self.F
    for q in self.Œ¥:
      P = P | {q}
      for a in self.Œ¥[q]:
        dot.node(wrap(q), shape='circle')
        for r in self.Œ¥[q][a]:
          dot.node(wrap(r), shape='circle')
          dot.edge(wrap(q), wrap(r), label=str(a))
          P = P | {r}
    for q in self.F:
      dot.node(wrap(q), shape='doublecircle')
    for q in self.Q - P:  # place all unreachable nodes to the right
      dot.node(wrap(q), shape='circle')
      for p in P:
        dot.edge(wrap(p), wrap(q), style='invis')  # , constraint='false'
    if trace:
      xlab = {}  # maps states to Graphviz external labels
      for i in range(0, len(trace), 2):
        xlab[trace[i]] = xlab[trace[i]] + ', ' + str(i // 2) if trace[i] in xlab else str(i // 2)
      for q in xlab:
        dot.node(wrap(q), xlabel='<<font color="royalblue">' + wrap(xlab[q]) + '</font>>')
    return dot

  def writepdf(self, name, trace=None):
    open(name, 'wb').write(self.draw(trace).pipe(format='pdf'))

  def writesvg(self, name, trace=None):
    open(name, 'wb').write(self.draw(trace).pipe(format='svg'))

  def __repr__(self):
    return (
      ' '.join(str(q) for q in self.I)
      + '\n'
      + '\n'.join(
        str(q) + ' ' + str(a) + ' ‚Üí ' + ', '.join(str(r) for r in self.Œ¥[q][a])
        for q in self.Œ¥
        for a in self.Œ¥[q]
        if self.Œ¥[q][a] != set()
      )
      + '\n'
      + ' '.join(str(f) for f in self.F)
      + '\n'
    )


def parseFSA(fsa: str) -> FiniteStateAutomaton:
  """Build an automaton from its textual description `fsa`.

  Empty lines are ignored. The first remaining line lists the initial states. Each
  later line is either a transition (source, symbol, arrow, target) or a line
  without an arrow listing the final states. Only the first target after the arrow
  is read, and a later line without an arrow replaces the final states read so far.
  """
  fl = [line for line in fsa.split('\n') if line != '']
  I = set(fl[0].split()) if len(fl) > 0 else set()  # first line: initial initial ...
  Œ£, Q, Œ¥, F = set(), set(), {}, set()
  for line in fl[1:]:  # all subsequent lines
    if '‚Üí' in line:  # source action ‚Üí target
      l, r = line.split('‚Üí')
      p, a, q = l.split()[0], l.split()[1], r.split()[0]
      if p in Œ¥:
        s = Œ¥[p]
        s[a] = s[a] | {q} if a in s else {q}  # add q to the successors already recorded for (p, a)
      else:
        Œ¥[p] = {a: {q}}
      Œ£.add(a)
      Q.add(p)
      Q.add(q)
    else:  # a line without ‚Üí is assumed to have the final states
      F = set(line.split()) if len(line) > 0 else set()  # final final ...
  return FiniteStateAutomaton(Œ£, Q | I | F, I, Œ¥, F)

In [None]:
def setunion(S: set[set]) -> set:
  """Return a new set holding the elements of all the collections in `S`."""
  members = list(S)  # consume `S` completely before combining, as argument unpacking does
  combined = set()
  for part in members:
    combined.update(part)
  return combined


def Œ¥ÃÇ(Œ¥: TransFunc, P: set[str], a: str) -> set[str]:
  """Return, as an `fset`, all states reached from some state in `P` by one transition on `a`.

  States of `P` that have no entry in the transition function, or no entry for `a`,
  contribute nothing.
  """
  return fset(setunion(Œ¥[p][a] for p in P if p in Œ¥ if a in Œ¥[p]))

In [None]:
def Œµ_closure(Q, Œ¥) -> set:
  """Return, as an `fset`, `Q` together with every state reachable from it by empty-word transitions.

  Empty-word transitions are the entries of the transition function stored under the
  same label string that is passed to the one-step successor function below.
  """
  C, W = set(Q), Q  # as C is updated, a copy of Q is needed
  # invariant: C united with the closure of W equals the closure of the original Q
  # variant: the closure of the original Q minus C
  while W != set():
    W = Œ¥ÃÇ(Œ¥, W, 'Œµ') - C  # newly discovered states only
    C |= W
  return fset(C)

In [None]:
def accepts(A: FiniteStateAutomaton, Œ±: str):
  """Return True if automaton `A` accepts the given word.

  Starting from the closure of the initial states, the set of current states is
  advanced by one symbol at a time; the word is accepted if that set finally
  contains a final state.
  """
  W = Œµ_closure(A.I, A.Œ¥)
  for a in Œ±:
    W = Œµ_closure(Œ¥ÃÇ(A.Œ¥, W, a), A.Œ¥)
  return W & A.F != set()


# Attach the function to the class so that it can be called as a method.
setattr(FiniteStateAutomaton, 'accepts', accepts)

In [None]:
def equiv(A: FiniteStateAutomaton, B: FiniteStateAutomaton, log=False) -> bool:
  """Return True if `A` and `B` agree on acceptance for every pair of state sets explored.

  Pairs of state sets (one from `A`, one from `B`) are explored together, starting
  from the closures of the initial states and following every symbol of either
  alphabet. The result is False as soon as exactly one set of a pair contains a
  final state. With `log` set, the pairs being checked are printed.
  """
  W, V = {(Œµ_closure(A.I, A.Œ¥), Œµ_closure(B.I, B.Œ¥))}, set()  # work, visited
  while W != set():
    P, Q = W.pop()
    if (P, Q) not in V:
      if log:
        print('checking', P, Q)
      # Exactly one of the two state sets contains a final state: not equivalent.
      if (P & A.F == set()) != (Q & B.F == set()):
        return False
      for a in A.Œ£ | B.Œ£:
        W |= {(Œµ_closure(Œ¥ÃÇ(A.Œ¥, P, a), A.Œ¥), Œµ_closure(Œ¥ÃÇ(B.Œ¥, Q, a), B.Œ¥))}
      V |= {(P, Q)}
  if log:
    print('equivalent', V)
  return True

In [None]:
def merge(Œ≥: TransFunc, Œ¥: TransFunc) -> TransFunc:
  """Return the union of two transition functions as a new dict.

  A state present in only one argument keeps its entry; for a state present in both,
  the successor sets are united symbol by symbol.
  """
  return (
    {q: Œ≥[q] for q in Œ≥.keys() - Œ¥.keys()}
    | {q: Œ¥[q] for q in Œ¥.keys() - Œ≥.keys()}
    | {q: {a: Œ≥[q].get(a, set()) | Œ¥[q].get(a, set()) for a in Œ≥[q].keys() | Œ¥[q].keys()} for q in Œ≥.keys() & Œ¥.keys()}
  )

In [None]:
def RegExToFSA(re) -> FiniteStateAutomaton:
  def ToFSA(re) -> FiniteStateAutomaton:
    nonlocal QC
    match re:
      case Œµ():
        q = QC
        QC += 1
        return FiniteStateAutomaton(set(), {q}, {q}, {}, {q})
      case Sym(a=a):
        q = QC
        QC += 1
        r = QC
        QC += 1
        return FiniteStateAutomaton({a}, {q, r}, {q}, {q: {a: {r}}}, {r})
      case Choice(E1=E1, E2=E2):
        A1, A2 = ToFSA(E1), ToFSA(E2)
        q = QC
        QC += 1
        Œ¥ = A1.Œ¥ | A2.Œ¥ | {q: {'Œµ': A1.I | A2.I}}
        return FiniteStateAutomaton(A1.Œ£ | A2.Œ£, A1.Q | A2.Q | {q}, {q}, Œ¥, A1.F | A2.F)
      case Conc(E1=E1, E2=E2):
        A1, A2 = ToFSA(E1), ToFSA(E2)
        Œ¥ = merge(A1.Œ¥ | A2.Œ¥, {q: {'Œµ': A2.I} for q in A1.F})
        return FiniteStateAutomaton(A1.Œ£ | A2.Œ£, A1.Q | A2.Q, A1.I, Œ¥, A2.F)
      case Star(E=E):
        A = ToFSA(E)
        Œ¥ = merge(A.Œ¥, {q: {'Œµ': A.I} for q in A.F})
        return FiniteStateAutomaton(A.Œ£, A.Q, A.I, Œ¥, A.I | A.F)
      case E:
        raise Exception(str(E) + ' not a regular expression')

  QC = 0
  return ToFSA(re)

Now we add one boolean function, `equalRegEx`, that takes two strings, parses them as regular expressions, converts them to finite state machines, and compares them for equivalence:

In [None]:
def equalRegEx(E1: str, E2: str) -> bool:
  """Parse both strings as regular expressions and report whether their automata are equivalent."""
  first = RegExToFSA(parse(E1))
  second = RegExToFSA(parse(E2))
  return equiv(first, second)

You may use the test cases below to check your implementation of `parse`:

In [None]:
# Pairs of regular expressions that must be found equivalent.
assert equalRegEx('a+', '(a+)+')
assert equalRegEx('(a+)*', '(a*)+')
assert equalRegEx('(a+)*', 'a*')
assert equalRegEx('aa*', 'a*a')
assert equalRegEx('a*', '(a+)?')
assert equalRegEx('a*', '(a?)+')
assert equalRegEx('a?', '(a?)?')
assert equalRegEx('(a*b*)*', '(a|b)*')
# A pair that must be found different.
assert not equalRegEx('a*b*', '(a|b)*')

Let us extend regular expressions with counted repetitions, for example:
- `a{3}` is `a` repeated 3 times, i.e. `aaa`;
- `a{3,}` is `a` repeated at least 3 times, i.e. `aaaa*`
- `a{2,4}` is `a` repeated 2, 3, or 4 times, i.e. `aa | aaa | aaaa`.

JupyterLab supports counted repetitions: you can try this out by selecting `Find...` and then clicking on `Use regex`. 

We let `{...}` bind like the other postfix operators, `*`, `+`, and `?`. That is:

    E₁|E₂{d} = E₁|(E₂{d})
    E₁ E₂{d} = E₁ (E₂{d})

In the extended grammar, `{` and `}` are now escaped characters:

    expression  →  term { '|' term }
    term  →  factor { factor }
    factor → atom [ '*' | '+' | '?' | '{' integer [',' [integer]] '}' ]
    atom  →  plainchar | escapedchar | '(' expression ')'
    plainchar  →  ' ' | '!' | '"' | '#' | '$' | '%' | '&' | '\'' | ',' | '-' | '.' | '/' |
         '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' | ':' | ';' | '<' | '=' | '>' | 
         '@' | 'A' | 'B' | 'C' | 'D' | 'E' | 'F' | 'G' | 'H' | 'I' | 'J' | 'K' | 'L' | 'M' | 'N' | 'O' |
         'P' | 'Q' | 'R' | 'S' | 'T' | 'U' | 'V' | 'W' | 'X' | 'Y' | 'Z' | '[' | ']' | '^' | '_' |
         '`' | 'a' | 'b' | 'c' | 'd' | 'e' | 'f' | 'g' | 'h' | 'i' | 'j' | 'k' | 'l' | 'm' | 'n' | 'o' |
         'p' | 'q' | 'r' | 's' | 't' | 'u' | 'v' | 'w' | 'x' | 'y' | 'z' | '~'
    escapedchar  → '\\' ( '(' | ')' | '*' | '+' | '?' | '\\' | '|' | '{' | '}' )
    integer  →  digit {digit}
    digit  →  '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9'

Extend the attribute grammar above to construct an abstract syntax tree of extended regular expressions. The type of the nodes does not need to be extended. Rather, counted repetitions are expanded while parsing, similarly to `+`. For EBNF expression `[E]`, attribute rules can be added by `(E «S» | «T»)`, meaning that attributes are calculated according to `S` if `E` is present and according to `T` otherwise.

Extended attribute grammar for `factor` with counted repetitions, where `repeat(e, n)` builds `Conc(Conc(...Conc(ε(), e)..., e), e)` for n copies:

    factor(e) → atom(e) [ '*' « e := Star(e) »
        | '+' « e := Conc(e, Star(e)) »
        | '?' « e := Choice(e, ε()) »
        | '{' integer(n)
            ( '}' « e := repeat(e, n) »
            | ',' ( integer(m) '}' « e := repeatRange(e, n, m) »
                  | '}' « e := Conc(repeat(e, n), Star(e)) »
                  )
            )
        ]
    integer(n) → digit(n) { digit(d) « n := 10 × n + d » }
    digit(d) → '0' « d := 0 » | ... | '9' « d := 9 »

where:
- `repeat(e, 0) = ε()`
- `repeat(e, k) = Conc(repeat(e, k-1), e)` for k ≥ 1
- `repeatRange(e, n, m) = repeat(e, n)` if n = m
- `repeatRange(e, n, m) = Choice(repeatRange(e, n, m-1), repeat(e, m))` if n < m

Now extend the parser to construct the abstract syntax tree of extended regular expressions.

In [None]:
# Characters that stand for themselves; the braces are not included here but in EscapedChars.
PlainChars = set(' !"#$%&\',-./0123456789:;<=>@ABCDEFGHIJKLMNOPQRSTUVWXYZ[]^_`abcdefghijklmnopqrstuvwxyz~')
# Characters that are accepted after a backslash (see `atom`).
EscapedChars = set('()*+?\\|{}')
# Characters that can start a factor; `term` keeps parsing factors while `sym` is in this set.
FirstFactor = PlainChars | {'\\', '('}

# Scanner state shared by the parsing procedures through `global` declarations.
src: str  # the text being parsed, assigned by `parse`
pos: int  # index in `src` of the next character to be read by `nxt`
sym: str  # the current character, or chr(0) once `src` is exhausted


def nxt():
  """Advance the scanner: load the next character of `src` into `sym`.

  Once the input is exhausted, `sym` becomes chr(0) and `pos` is left unchanged.
  """
  global pos, sym
  if pos >= len(src):
    sym = chr(0)
    return
  sym = src[pos]
  pos += 1


def repeat(e, n):
  """Return the left-nested concatenation of `n` copies of `e`, starting from the empty word.

  For `n` equal to 0 the result is just the empty-word node.
  """
  chain = Œµ()
  for _copy in range(n):
    chain = Conc(chain, e)
  return chain


def repeatRange(e, lo, hi):
  """Return the left-nested choice between `lo`, `lo + 1`, ..., `hi` copies of `e`.

  Each longer repetition extends the previous one by one more `e` instead of being
  rebuilt from scratch, so consecutive alternatives share their common prefix node.
  The printed form is the same as with independently built repetitions.

  Raises:
    ValueError: if `hi` is smaller than `lo`, as such a range contains no count.
  """
  if hi < lo:
    raise ValueError('empty repetition range {' + str(lo) + ',' + str(hi) + '}')
  power = repeat(e, lo)  # the repetition with the currently largest count
  alternatives = power
  for _ in range(lo, hi):
    power = Conc(power, e)
    alternatives = Choice(alternatives, power)
  return alternatives


def integer():
  """Parse a non-empty run of decimal digits and return its value.

  Raises:
    Exception: if the current symbol is not a digit.
  """
  if sym < '0' or sym > '9':
    raise Exception('digit expected at ' + str(pos))
  value = 0
  while '0' <= sym <= '9':
    value = value * 10 + (ord(sym) - ord('0'))
    nxt()
  return value


def expression():
  """Parse `term { '|' term }` and return the left-nested `Choice` tree."""
  result = term()
  while sym == '|':
    nxt()  # consume the '|'
    result = Choice(result, term())
  return result


def term():
  """Parse `factor { factor }` and return the left-nested `Conc` tree."""
  result = factor()
  # Another factor follows exactly when the current symbol can start one.
  while sym in FirstFactor:
    result = Conc(result, factor())
  return result


def factor():
  """Parse an atom with an optional postfix operator and return the expanded tree.

  Besides `*`, `+`, and `?`, the counted repetitions `{n}`, `{n,}`, and `{n,m}` are
  accepted. They are expanded with `repeat` and `repeatRange` rather than
  represented by a node type of their own.

  Raises:
    Exception: if a counted repetition is malformed or its upper bound is below
      its lower bound.
  """
  e = atom()
  if sym == '*':
    nxt()
    e = Star(e)
  elif sym == '+':
    nxt()
    e = Conc(e, Star(e))
  elif sym == '?':
    nxt()
    e = Choice(e, Œµ())
  elif sym == '{':
    nxt()
    n = integer()
    if sym == '}':  # {n}: exactly n copies
      nxt()
      e = repeat(e, n)
    elif sym == ',':
      nxt()
      if sym == '}':  # {n,}: n copies followed by any number of further copies
        nxt()
        e = Conc(repeat(e, n), Star(e))
      else:  # {n,m}: between n and m copies
        m = integer()
        if sym != '}':
          raise Exception("'}' expected at " + str(pos))
        # An inverted range such as {3,1} contains no count; reject it instead of
        # silently treating it as {3}.
        if m < n:
          raise Exception('upper bound below lower bound at ' + str(pos))
        nxt()
        e = repeatRange(e, n, m)
    else:
      raise Exception("'}' or ',' expected at " + str(pos))
  return e


def atom():
  """Parse a plain character, an escaped character, or a parenthesised expression.

  Raises:
    Exception: on an invalid escape, a missing ')', or a character that cannot start an atom.
  """
  if sym in PlainChars:
    node = Sym(sym)
    nxt()
    return node
  if sym == '\\':
    nxt()  # skip the backslash; the escaped character is now current
    if sym not in EscapedChars:
      raise Exception('invalid escaped character at ' + str(pos))
    node = Sym(sym)
    nxt()
    return node
  if sym == '(':
    nxt()
    node = expression()
    if sym != ')':
      raise Exception("')' expected at " + str(pos))
    nxt()
    return node
  # Nothing that can start an atom: tell characters outside ' '..'~' apart from the rest.
  if sym < ' ' or sym > '~':
    raise Exception('invalid character at ' + str(pos))
  raise Exception('unexpected character at ' + str(pos))


def parse(s: str):
  """Parse `s` as an extended regular expression and return its abstract syntax tree.

  Raises:
    Exception: if `s` is malformed or has characters left over after the expression.
  """
  global src, pos
  src = s
  pos = 0
  nxt()  # load the first symbol
  tree = expression()
  if sym == chr(0):  # chr(0) is the end-of-input marker set by `nxt`
    return tree
  raise Exception('unexpected character at ' + str(pos))

Here are some test cases:

In [None]:
# Printed form of expanded counted repetitions; the empty word prints as nothing.
assert str(parse('a{0}')) == ''
assert str(parse('a{1}')) == '(a)'
assert str(parse('a{10}')) == '((((((((((a)a)a)a)a)a)a)a)a)a)'
assert str(parse('(ab){2}')) == '(((ab))(ab))'
assert str(parse('a{1,2}')) == '((a)|((a)a))'
assert str(parse('a{2,}')) == '(((a)a)(a)*)'

# Counted repetitions must be equivalent to their spelled-out forms.
assert equalRegEx('a{0,}', 'a*')
assert equalRegEx('a{1,}', 'a+')
assert equalRegEx('a{1,1}', 'a')
assert equalRegEx('a{0,1}', 'a?')
# NOTE(review): the next assertion repeats the previous one verbatim.
assert equalRegEx('a{0,1}', 'a?')
assert equalRegEx('a{0,2}', 'a?|aa')
assert equalRegEx('a{1,3}', 'a|aa|aaa')

# Acceptance checks on the automaton for 'b', at least three 'o', then 'h'.
GhostSpeak = RegExToFSA(parse('bo{3,}h'))
assert not GhostSpeak.accepts('booo')
assert not GhostSpeak.accepts('booh')
assert GhostSpeak.accepts('boooh')
assert GhostSpeak.accepts('booooooooh')
assert not GhostSpeak.accepts('bboooohhh')
assert not GhostSpeak.accepts('booh boooh')