## Lexer
* This Component is responsible for breaking the input into tokens.
* And it escape all spaces and newlines.
* and we implemented it mainly for 2 reasons:
    1. We will need to do parsing, and parsing on raw characters is ugly

    2. (Optional) We will need to handle escapes :
     Suppose you want to match the literal string '(', how
     would you do it ?
     If your regex engine doesn't handle escapes, you
     can't, but with escapes it's simply the regex "\("

In [31]:
# First of all we need to define all the tokens' types
DEBUG = True
from enum import Enum
class TokenTypes(Enum):
    OR = 1 
    Astrisk = 2 
    Plus = 3 
    QuestionMark = 4
    OpenSquareBracket = 5 
    ClosedSquareBracket = 6
    OpenBracket = 7
    ClosedBracket = 8
    Hiphen = 9 
    Dot = 10
    LiteralChar = 11

In [43]:
# now we need to define a class for the tokens 
class Token: 
    def __init__(self, tokenType, value):
        self.tokenType = tokenType
        self.value = value # the string value of the token, and is used in case of LiteralChar.

# we need to define a class for token stream, and a pointer, which is initialized to 0, because they will be used later by the 
class TokenStream: 
    def __init__ (self, tokenStream = [],tokenPointer=0): 
        self.tokenStream = tokenStream
        self.tokenPointer = tokenPointer
    def resetStream (self): 
        self.tokenStream = []
        self.tokenPointer = 0
    def advanceTokenPointer(self):
        self.tokenPointer += 1
    def getCurrentToken(self):
        return self.tokenStream[self.tokenPointer]
    

In [58]:
# Now lets implement the Lexer Class 
class Lexer: 
    @staticmethod
    def lexReg(regex:str): 
        '''
            This function is responsible for lexing the regular expression and returning a list of tokens.
            Input: 
                regex: a string representing the regular expression.
            Output:
                a list of tokens.
        '''
        # create a map which maps each character to its corresponding token type.
        metaCharactersMap = { 
            '|': TokenTypes.OR,
            '*': TokenTypes.Astrisk,
            '+': TokenTypes.Plus,
            '?': TokenTypes.QuestionMark,
            '[': TokenTypes.OpenSquareBracket,
            ']': TokenTypes.ClosedSquareBracket,
            '(': TokenTypes.OpenBracket,
            ')': TokenTypes.ClosedBracket,
            '-': TokenTypes.Hiphen,
            '.': TokenTypes.Dot
            # LiteralCharacters are any other characters that are not in the map.
        }

        # define spaceChar and escapeChar
        spaceChar = ' '
        escapeChar = '\\'

        # initialize an empty token stream
        tokens = TokenStream()
        tokens.resetStream()

        # we will need to use two pointers approach, one holds the previous character, and the other holds the current character.
        previousChar = None
        for char in regex : 
            if char == spaceChar: 
                continue
            # skip all escapes
            if char == escapeChar: 
                # if the previous character is escape character, then we need to add the current character as a LiteralChar token.
                if previousChar == escapeChar:
                    tokens.tokenStream.append(Token(TokenTypes.LiteralChar, char))
                    previousChar = None
                    continue
                # else, then we are preparing ourself to escape the next character.
                previousChar = char
                continue
            # check if the current character meta character and not preceeded with space 
            if char in metaCharactersMap and previousChar != escapeChar: 
                # add the current character to the token stream
                tokens.tokenStream.append(Token(metaCharactersMap[char], char))
            else: 
                # in this case we are trying to match the exact symbol, so we need to add it as a LiteralChar token.
                tokens.tokenStream.append(Token(TokenTypes.LiteralChar, char))
            previousChar = char
        return tokens


        

In [72]:
# Lets Test the Lexer 
def testLexer(): 
    # regex = 'a?b(cd|ef)[a-z]'
    
    regex = 'ab*c+de?(f|g|h)|mr|n|[pq]' #\\
    tokens = Lexer.lexReg(regex)
    for token in tokens.tokenStream: 
        print(token.tokenType, token.value)

testLexer()


TokenTypes.LiteralChar a
TokenTypes.LiteralChar b
TokenTypes.Astrisk *
TokenTypes.LiteralChar c
TokenTypes.Plus +
TokenTypes.LiteralChar d
TokenTypes.LiteralChar e
TokenTypes.QuestionMark ?
TokenTypes.OpenBracket (
TokenTypes.LiteralChar f
TokenTypes.OR |
TokenTypes.LiteralChar g
TokenTypes.OR |
TokenTypes.LiteralChar h
TokenTypes.ClosedBracket )
TokenTypes.OR |
TokenTypes.LiteralChar m
TokenTypes.LiteralChar r
TokenTypes.OR |
TokenTypes.LiteralChar n
TokenTypes.OR |
TokenTypes.OpenSquareBracket [
TokenTypes.LiteralChar p
TokenTypes.LiteralChar q
TokenTypes.ClosedSquareBracket ]


# Regex_Parser

In [63]:
# First we need to define our AST Nodes 
class AstNode: 
    pass # this is an abstract class 

'''
    What are our cases? 
        1. + -> One or more node class
        2. * -> Zero or more node class
        3. ? -> Zero or one class
        4. | -> OR class
        5. [] -> set of characters class
        6. abcd -> Sequence of characters class
        7. LiteralChar -> Literal Character class
'''
class PlusNode(AstNode):
    left: AstNode
    def __init__(self, left):
        self.left = left

class AstriskNode(AstNode):
    left: AstNode
    def __init__(self, left):
        self.left = left

class QuestionMarkNode(AstNode):
    left: AstNode
    def __init__(self, left):
        self.left = left
    
class OrNode(AstNode):
    left: AstNode
    right: AstNode
    def __init__(self, left, right):
        self.left = left
        self.right = right

class SetOfCharactersNode(AstNode):
    characters: set  # maybe strings or pairs as in case of [0-9]
    def __init__(self, characters):
        self.characters = characters    

class SequenceOfCharactersNode(AstNode):
    left: AstNode
    right: AstNode
    def __init__(self, left, right):
        self.left = left
        self.right = right

class LiteralCharNode(AstNode):
    value: str
    def __init__(self, value):
        self.value = value
        

### now lets define our grammar 

1. parse -> parseReg
2. parseReg -> parseOr
2. parseOr -> parseSeq (| parseSeq)*
2. parseSeq -> parseQuantified (parseQuantified)*
2. parseQuantified -> parseBase (+ | * | ?)?
2. parseBase -> LiteralChar | SetOfCharacters | ( parseReg )
2. so we just need to implement one function for each of those following the recursive descent parsing approach.


In [85]:
# 1. parseBase -> because it is our basecase, so we must implement the functions bottom up.
def parseSquareBrackets(tokenStream, tokenIdx):
    '''
        Our content may have different patterns: 
            1. a-z -> one range
            2. abc0-9 -> certain Literals and a range
            3. xyz -> certain Literals
            4. A-Z0-9 multiple Ranges 
            5. all above.
        how to handle them? 
        1. any range should be treated as a tuple of 2 characters
            first is the starting index
            second is the ending index. 
        2. any sequence of characters 
    '''
    characters = [] 
    prevIsDash = False
    # We Should iterate till we find a ] <= 
    while tokenStream[tokenIdx].tokenType != TokenTypes.ClosedSquareBracket:
        if tokenStream[tokenIdx].tokenType == TokenTypes.Hiphen:
            prevIsDash = True
        elif prevIsDash:
            # get the last appended char as the starting character
            startingChar = characters.pop()
            # get the current character as an ending character
            endingChar = tokenStream[tokenIdx].value
            # push in the characters a tuple
            characters.append((startingChar, endingChar))
            # set the prevIsDash back to False 
            prevIsDash = False
        elif tokenStream[tokenIdx].tokenType == TokenTypes.LiteralChar:
            characters.append(tokenStream[tokenIdx].value)
        tokenIdx += 1
    
    # here we should assign it to set, but I left it as List.
    return SetOfCharactersNode(characters), tokenIdx

def parseBase(tokenStream, tokenIdx):
    '''
        This function is responsible for parsing the base cases, or applying a recursive call on bracktes
        Implmenting this grammar:  parseBase -> LiteralChar | SetOfCharacters | ( parseReg )

        Input: 
            tokenStream: a list of tokens.
            tokenIdx: the current index of the token.
        Output:
            an AST node, and the new index
    '''

    # we have three cases, LiteralChar, SetOfCharacters, and ( parseReg )

    # Extracting the token 
    token:Token = tokenStream[tokenIdx]
    tokenIdx += 1 

    # LiteralChar
    if token.tokenType == TokenTypes.LiteralChar: 
        return LiteralCharNode(token.value), tokenIdx
    
    # set of Characters. 
    if token.tokenType == TokenTypes.OpenSquareBracket: 
        # we need a utility function to parse the data inside the square brackets.
        return parseSquareBrackets(tokenStream, tokenIdx)

    if token.tokenType == TokenTypes.OpenBracket: 
        # we need to parse the regular expression inside the brackets.
        parsedReg, tokenIdx =  parseRegex(tokenStream, tokenIdx)
        return parsedReg, tokenIdx+1
        

In [75]:
def parseQuantified(tokensStream, tokenIdx): 
    '''
        This function is responsible for parsing the quantified cases.
        it implements this grammar: 
            parseQuantified -> parseBase (+ | * | ?)?
    '''

    # first we assume that we have only one operand 
    leftOperand, tokenIdx = parseBase(tokensStream, tokenIdx)

    # now we need to check if there are more operands 
    if tokenIdx >= len(tokensStream):
        # no more operands, just return the leftOperand 
        return leftOperand, tokenIdx
    
    # now we have to check on the operator (*,+,?)
    token = tokensStream[tokenIdx]
    if token.tokenType == TokenTypes.Astrisk:
        return AstriskNode(leftOperand), tokenIdx + 1
    if token.tokenType == TokenTypes.Plus:
        return PlusNode(leftOperand), tokenIdx + 1
    if token.tokenType == TokenTypes.QuestionMark:
        return QuestionMarkNode(leftOperand), tokenIdx + 1
    
    # we should never reach here, however, return the leftoperand for the function definition
    return leftOperand, tokenIdx+1



In [76]:
def parseSequence(tokensStream, tokenIdx): 
    '''
        This function is responsible for parsing the sequence of characters.
        it implements this grammar: 
            parseSequence -> parseQuantified (parseQuantified)*
    '''
    # first we assume that we have only one operand
    leftOperand = parseQuantified(tokensStream, tokenIdx)

    # now we need to check if we have more elements 
    if tokenIdx >= len(tokensStream):
        # no more elements 
        return leftOperand, tokenIdx
    
    # we need to continue parsing all sequence like abcd, or if we have operator we should also stop. 
    # to do so, we have to have a stoping condition
    # we will stop if we have a closed bracket, or an OR operator.
    while tokenIdx < len(tokensStream) and tokensStream[tokenIdx].tokenType not in [TokenTypes.OR, TokenTypes.ClosedBracket]:
        rightOperand, tokenIdx = parseQuantified(tokensStream, tokenIdx) # token index is incremented here implecitly.
        leftOperand = SequenceOfCharactersNode(leftOperand, rightOperand) # recursive assignment, cascading the elemets in the leftOperand.
    return leftOperand, tokenIdx 


In [77]:
def parseOr (tokensStream, tokenIdx): 
    '''
        This function is responsible for parsing the OR operator.
        it implements this grammar: 
            parseOr -> parseSequence (| parseSequence)*
    '''
    # first we assume that we have only one operand
    leftOperand = parseSequence(tokensStream, tokenIdx)

    # now we need to check if we have more elements
    if tokenIdx >= len(tokensStream): 
        return leftOperand, tokenIdx
    
    # now we still have elements, we just need to do recursion, as we see or operator 
    while tokenIdx < len(tokensStream) and tokensStream[tokenIdx].tokenType == TokenTypes.OR:
        rightOperand, tokenIdx = parseSequence(tokensStream, tokenIdx+1)
        leftOperand = OrNode(leftOperand, rightOperand)
    
    return leftOperand, tokenIdx

In [78]:
def parseRegex (tokensStream, index):
    '''
        This function should implement the grammar:
            parseReg -> parseOr
    ''' 
    return parseOr(tokensStream, index)

In [79]:
def parse(tokensStream): 
    expression, _ = parseRegex(tokensStream, 0)
    return expression

In [98]:
# Its time to test our logic. 
# regex = 'ab*c+de?(f|g|h)|mr|n|[pq]'
# regex = 'a+'
# tokens = Lexer.lexReg(regex)
# ast = parse(tokens.tokenStream)
# print(ast)
toks = [
    Token(TokenTypes.LiteralChar,"a"),
    Token(TokenTypes.LiteralChar,"b"),
    Token(TokenTypes.Astrisk,"*"),
    Token(TokenTypes.LiteralChar,"c"),
    Token(TokenTypes.Plus,"+"),
    Token(TokenTypes.LiteralChar,"d"),
    Token(TokenTypes.LiteralChar,"e"),
    Token(TokenTypes.QuestionMark,"?"),

    Token(TokenTypes.OpenBracket,"("),
    Token(TokenTypes.LiteralChar,"f"),
    Token(TokenTypes.OR,"|"),
    Token(TokenTypes.LiteralChar,"g"),
    Token(TokenTypes.OR,"|"),
    Token(TokenTypes.LiteralChar,"h"),
    Token(TokenTypes.ClosedBracket,")"),

    Token(TokenTypes.OR,"|"),
    Token(TokenTypes.LiteralChar,"m"),
    Token(TokenTypes.LiteralChar,"r"),

    Token(TokenTypes.OR,"|"),
    Token(TokenTypes.LiteralChar,"n"),

    Token(TokenTypes.OpenBracket,"["),
    Token(TokenTypes.LiteralChar,"p"),
    Token(TokenTypes.LiteralChar,"q"),
    Token(TokenTypes.ClosedBracket,"]")
]
print(parse(toks))

TypeError: cannot unpack non-iterable NoneType object