<a id='top'></a>
Agnieszka Dutka
# Laboratory 7 - Regular expressions

Contents:  
[Finite automata for Regex finding](#0)  
[Example usage](#1)  
[Tests](#2)

<a id='0'></a>
## Finite automata for Regex finding

To implement regex matching, I first convert the regular expression to reverse Polish notation (RPN) and then build a non-deterministic finite automaton (NFA) from it, as explained in:

https://deniskyashif.com/2019/02/17/implementing-a-regular-expression-engine/
and:
https://en.wikipedia.org/wiki/Thompson%27s_construction


Nondeterministic finite automata structures

In [18]:
class State:
    """A single state of a non-deterministic finite automaton.

    Attributes:
        is_final: True when this state is an accepting state.
        transitions: maps a symbol to the one state reached by consuming it.
            The key '.' is treated as a wildcard by `get_all_trans`.
        e_transitions: states reachable without consuming any input.
    """

    def __init__(self, is_final=False, e_trans=None):
        self.is_final = is_final
        self.transitions = {}
        self.e_transitions = [] if e_trans is None else e_trans

    def add_trans(self, to, symbol):
        """Add a transition to `to` that consumes `symbol`.

        An existing transition for the same symbol is replaced.
        Raises AssertionError when called on a final state.
        """
        assert not self.is_final, "cannot add (non-empty) transition from final state"
        self.transitions[symbol] = to

    def add_e_trans(self, to):
        """Add an empty (epsilon) transition to `to`."""
        self.e_transitions.append(to)
        self.is_final = False  # if state was final, it no longer is

    def get_all_trans(self, char):
        """Return the states reachable by consuming `char`.

        Both the literal transition for `char` and the wildcard '.' transition
        are followed, and each target is expanded through its e-transitions.
        Every state appears at most once in the returned list, also when
        `char` itself is '.'.
        """
        reached = set()
        for symbol in {char, '.'}:
            if symbol in self.transitions:
                reached |= self.transitions[symbol].get_all_e_trans()
        return list(reached)

    def get_all_e_trans(self, curr_set=None):
        """Collect the states reachable from self through e-transitions only.

        `self` is included only when it has no e-transitions of its own (or is
        reached again through a cycle). States already present in `curr_set`
        are treated as visited. The set is updated in place and returned.
        """
        curr_set = set() if curr_set is None else curr_set

        # Explicit stack instead of recursion, so long e-transition chains
        # cannot exceed the interpreter's recursion limit.
        pending = [self]
        while pending:
            state = pending.pop()
            if not state.e_transitions:
                curr_set.add(state)
                continue
            for target in state.e_transitions:
                if target not in curr_set:
                    curr_set.add(target)
                    pending.append(target)

        return curr_set

    def __repr__(self):
        return "final" if self.is_final else f"tr: {self.transitions.keys()}\n {self.e_transitions}"


class Nfa:
    """An NFA fragment described by its entry state and its exit state."""

    def __init__(self, start, finish):
        self.set(start, finish)

    def set(self, start, finish):
        """Replace both the entry and the exit state of this fragment."""
        self.start, self.finish = start, finish

    def follow(self, other):
        """Append `other` after this fragment (concatenation)."""
        previous_end = self.finish
        previous_end.add_e_trans(other.start)
        self.finish = other.finish

    def join(self, other):
        """Put `other` in parallel with this fragment (alternation)."""
        shared_end = State(is_final=True)
        for branch in (self, other):
            branch.finish.add_e_trans(shared_end)
        shared_start = State(e_trans=[self.start, other.start])
        self.set(shared_start, shared_end)

    def cycle(self):
        """Let the fragment repeat: link its exit back to its entry, then pad."""
        loop_target = self.start
        self.finish.add_e_trans(loop_target)
        self.pad_with_e()

    def maybe(self):
        """Let the fragment be skipped: pad, then link the new entry to the new exit."""
        self.pad_with_e()
        skip_target = self.finish
        self.start.add_e_trans(skip_target)

    def pad_with_e(self):
        """Wrap the fragment in a fresh entry and a fresh final exit via e-transitions."""
        outer_end = State(is_final=True)
        self.finish.add_e_trans(outer_end)
        outer_start = State(e_trans=[self.start])
        self.set(outer_start, outer_end)


In [86]:
from string import ascii_letters, ascii_lowercase, ascii_uppercase, digits, whitespace

operators = ['|', '*', '+', '?', '^']  # ^ - concatenation
priority = {'|': 0, '^': 1, '*': 2, '+': 2, '?': 2}


class Regex:
    """Regular expression compiled to an NFA.

    Pipeline: expand character classes and insert the explicit concatenation
    marker '^' -> convert to reverse polish notation -> build the NFA.
    Because '^' is used as the internal concatenation operator, it is always
    treated as an operator, never as a literal character.
    """

    # Predefined character classes, written in a regex as a backslash
    # followed by the key.
    _CHAR_CLASSES = {
        'd': digits,                   # 0-9
        's': whitespace,               # all whitespace characters
        'c': ascii_letters,            # lower and uppercase
        'w': ascii_letters + digits,   # all letters and digits
        'l': ascii_lowercase,          # a-z
        'u': ascii_uppercase,          # A-Z
    }

    def __init__(self, regex):
        pre = self.rmv_classes_and_divide(regex)
        rpn = self.to_rpn(pre)
        self.nfa = self.nfa_from_rpn(rpn)

    @staticmethod
    def rmv_classes_and_divide(regex: str) -> str:
        """Expand classes into alternations and mark concatenation with '^'.

        A predefined class (backslash + letter) or a user class ('[...]') is
        replaced by a parenthesised alternation of its members. A '^' marker
        is inserted between two adjacent operands: before a literal, '(' or
        class whenever the preceding token is not '(' or '|'. Nothing is
        inserted before ')' or before an operator.

        Raises:
            ValueError: for a trailing backslash, an unknown predefined class,
                an unterminated '[' or an empty '[]'.
        """
        tokens = []
        i, length = 0, len(regex)
        while i < length:
            ch = regex[i]
            if ch == '\\':  # predefined class
                if i + 1 >= length:
                    raise ValueError("regex ends with a dangling '\\'")
                members = Regex._CHAR_CLASSES.get(regex[i + 1])
                if members is None:
                    raise ValueError(f"unknown character class '\\{regex[i + 1]}'")
                token = "(" + '|'.join(members) + ")"
                i += 1  # skip the class letter as well
            elif ch == '[':  # user-created class
                end = regex.find(']', i + 1)
                if end == -1:
                    raise ValueError("unterminated '[' in regex")
                if end == i + 1:
                    raise ValueError("empty character class '[]' in regex")
                token = "(" + '|'.join(regex[i + 1:end]) + ")"
                i = end  # continue after the closing ']'
            else:
                token = ch

            # Class tokens always begin with '(' and therefore start an operand.
            starts_operand = token[0] != ')' and token[0] not in operators
            if starts_operand and tokens and tokens[-1][-1] not in ('(', '|'):
                tokens.append('^')
            tokens.append(token)
            i += 1
        return ''.join(tokens)

    @staticmethod
    def to_rpn(regex: str) -> str:
        """Convert a marked, class-free regex to reverse polish notation.

        Operators are ordered by `priority`; operators of equal priority are
        emitted left to right.

        Raises:
            ValueError: when parentheses are unbalanced.
        """
        output = []
        pending = []  # current operators stack

        for c in regex:
            if c == '(':
                pending.append(c)
            elif c == ')':
                while pending and pending[-1] != '(':
                    output.append(pending.pop())
                if not pending:
                    raise ValueError("unbalanced ')' in regular expression")
                pending.pop()  # discard the matching '('
            elif c in operators:
                while pending and pending[-1] != '(' and priority[pending[-1]] >= priority[c]:
                    output.append(pending.pop())
                pending.append(c)
            else:
                output.append(c)

        if '(' in pending:
            raise ValueError("unbalanced '(' in regular expression")
        output.extend(reversed(pending))  # add remaining operators
        return ''.join(output)

    @staticmethod
    def nfa_from_rpn(rpn: str):
        """Build an NFA from a regex given in reverse polish notation.

        Raises:
            ValueError: when an operator lacks an operand or the expression
                does not reduce to exactly one automaton (e.g. empty regex).
        """
        def singular_nfa(char):
            start, finish = State(), State(is_final=True)
            start.add_trans(finish, char)
            return Nfa(start, finish)

        stack = []  # nfa-stack
        for c in rpn:
            if c not in operators:
                stack.append(singular_nfa(c))
                continue

            if not stack:
                raise ValueError(f"operator '{c}' has no operand")
            nfa = stack.pop()  # pop nfa from stack and modify
            if c == '*':
                nfa.cycle()
                nfa.maybe()
            elif c == '+':
                nfa.cycle()
            elif c == '?':
                nfa.maybe()
            else:  # binary operator: '|' or '^'
                if not stack:
                    raise ValueError(f"operator '{c}' is missing an operand")
                first = stack.pop()
                if c == '|':
                    nfa.join(first)
                else:  # concatenate nfas
                    first.follow(nfa)
                    nfa = first
            stack.append(nfa)

        if len(stack) != 1:
            raise ValueError("malformed regular expression")
        return stack.pop()

    def match(self, pattern: str, show=True):
        """Check whether the regex matches the whole of `pattern`.

        `pattern` is the text to test; 'a' matches only "a", not "ba" or "ab".
        With `show` true the verdict is printed and None is returned;
        otherwise a bool is returned.
        """
        curr = self.nfa.start.get_all_e_trans()
        for char in pattern:
            # A set keeps each state once, so the work per character is
            # bounded by the number of NFA states.
            nxt = set()
            for state in curr:
                nxt.update(state.get_all_trans(char))
            curr = nxt
            if not curr:
                break  # no live states left, the input cannot match

        matched = any(state.is_final for state in curr)
        if not show:
            return matched
        print("✔️" if matched else "❌")

    def find(self, text, show=False):
        """Find the start indices of all occurrences of this regex in `text`.

        Each start index is reported once, in increasing order. A position
        counts only after at least one character has been consumed, so
        zero-length matches are not reported. With `show` true the result is
        printed and None is returned; otherwise the list is returned.
        """
        res = []
        start_closure = self.nfa.start.get_all_e_trans()  # same for every start index
        for idx in range(len(text)):
            curr = start_closure
            i = idx
            while i < len(text) and curr:
                nxt = set()
                for state in curr:
                    nxt.update(state.get_all_trans(text[i]))
                curr = nxt
                if any(state.is_final for state in curr):
                    res.append(idx)
                    break  # a longer match from idx would only repeat the index
                i += 1
        if show:
            print("mathes found at:", res)
        else:
            return res


In [87]:
from time import perf_counter

def eval_func(func, args, count=4, show=True):
    """Time `func(*args)` over `count` consecutive calls.

    The average duration of one call, in seconds, is printed when `show` is
    true (None is returned in that case) and returned otherwise.
    """
    began = perf_counter()
    for _ in range(count):
        func(*args)
    finished = perf_counter()

    average = (finished - began) / count
    if not show:
        return average
    print(average)

## Example usage
<a id='1'></a>

In [88]:
# match() accepts only when the whole input is covered by the regex.
expr = Regex("abc")
for candidate in ("abc", "aabc", "ac"):
    expr.match(candidate)

# find() reports where occurrences start inside a longer text.
expr.find("aabcabc", True)

✔️
❌
❌
mathes found at: [1, 4]


## Tests
<a id='2'></a>

Literals

In [36]:
# Literal patterns: match each against three inputs, then search a longer text.
for literal, candidates in (
    ("abc", ("abc", "aabc", "ac")),
    ("a", ("abc", "a", "bac")),
):
    expr = Regex(literal)
    for candidate in candidates:
        expr.match(candidate)

    expr.find("aabcabc", True)

✔️
❌
❌
mathes found at: [1, 4]
❌
✔️
❌
mathes found at: [0, 1, 4]


.

In [89]:
# Wildcard '.': match several inputs, then search two longer texts.
for wildcard, candidates in (
    ("a.c", ("abc", "ac", "acc", "addc")),
    (".", ("a", "ac", "acc", "addc")),
):
    expr = Regex(wildcard)

    for candidate in candidates:
        expr.match(candidate)

    for haystack in ("accc", "aacc"):
        expr.find(haystack, True)

✔️
❌
✔️
❌
mathes found at: [0]
mathes found at: [0, 1]
✔️
❌
❌
❌
mathes found at: [0, 1, 2, 3]
mathes found at: [0, 1, 2, 3]


*, +, ?

In [90]:
expr = Regex("a*c")

star_inputs = (
    "c",
    "ac",
    "aac",
    "acc",  # x
)
for candidate in star_inputs:
    expr.match(candidate)

expr.find("acb", True)


✔️
✔️
✔️
❌
mathes found at: [0, 1]


In [91]:
expr = Regex("alexa? has 2+1* cats")

sentences = (
    "alexa has 21 cats",
    "alex has 21 cats",
    "alexaa has 21 cats",  # x
    "alexa has 1 cats",  # x
    "alexa has 22 cats",
    "alexa has 2221111 cats",
)
for sentence in sentences:
    expr.match(sentence)

# Left as the last expression so the notebook displays the returned list.
expr.find("I bet alexa has 22 cats today.")


✔️
✔️
❌
❌
✔️
✔️


[6]

()

In [65]:
# A parenthesised group can be repeated as a unit.
expr = Regex("ba(na)*s")   # group may be absent
expr2 = Regex("ba(na)+s")  # group must occur at least once

expr.match("bas")
expr.match("banas")
expr.match("banananas")

expr2.match("bas") # x
expr2.match("banas")
# Was a second "banas" check; use the repeated-group input like the '*' case above.
expr2.match("banananas")


✔️
✔️
✔️
❌
✔️
✔️


In [64]:
# Optional group that itself contains a repeated group.
expr = Regex("(ve(r)+y )?(nice) weather")

forecasts = (
    "very nice weather",
    "verry nice weather",
    "vey nice weather",  # x
    "nice weather",
)
for forecast in forecasts:
    expr.match(forecast)


✔️
✔️
❌
✔️


classes

In [70]:
# User-defined class: exactly one of the listed characters.
expr = Regex("f[au]rther")

for spelling in ("further", "farther", "ferther"):
    expr.match(spelling)


✔️
✔️
❌


In [72]:
# Raw string: "\d" in a normal literal is an invalid escape sequence.
expr = Regex(r"f\drther")  # digits

expr.match("f1rther")
expr.match("f00rther")
expr.match("ferther")

✔️
❌
❌


In [85]:
# Raw string: "\l" in a normal literal is an invalid escape sequence.
expr = Regex(r"me\l")  # lowercase

expr.match("me")
expr.match("meh")
expr.match("meH")

❌
✔️
❌
