## Lexer
* This component is responsible for breaking the input into tokens.
* It also skips all spaces in the input (note: the current implementation only drops the space character `' '`; newlines are not skipped).
* We implemented it mainly for two reasons:
    1. We will need to do parsing, and parsing on raw characters is ugly

    2. (Optional) We will need to handle escapes:
     Suppose you want to match the literal string '('. How
     would you do it?
     If your regex engine doesn't handle escapes, you
     can't; with escapes, it is simply the regex "\(".

In [31]:
# First of all we need to define all the tokens' types
DEBUG = True
from enum import Enum
class TokenTypes(Enum):
    """Kinds of token the lexer can emit.

    Every regex metacharacter has its own member; any other character
    (and any escaped character) is reported as ``LiteralChar``.
    """
    OR = 1  # '|'
    Astrisk = 2  # '*'
    Plus = 3  # '+'
    QuestionMark = 4  # '?'
    OpenSquareBracket = 5  # '['
    ClosedSquareBracket = 6  # ']'
    OpenBracket = 7  # '('
    ClosedBracket = 8  # ')'
    Hiphen = 9  # '-'
    Dot = 10  # '.'
    LiteralChar = 11  # any character that is not an unescaped metacharacter

In [43]:
# now we need to define a class for the tokens 
class Token:
    """A single lexical unit of a regular expression.

    Attributes:
        tokenType: the kind of the token (a ``TokenTypes`` member when
            produced by the lexer in this file).
        value: the string value of the token; it is what distinguishes one
            ``LiteralChar`` token from another.
    """

    def __init__(self, tokenType, value):
        self.tokenType = tokenType
        self.value = value

    def __repr__(self):
        # A readable representation makes token lists inspectable when
        # printed or viewed in a debugger.
        return '{}({!r}, {!r})'.format(
            type(self).__name__, self.tokenType, self.value
        )

# We also need a class for the token stream, together with a pointer into it (initialized to 0); both will be used later by the parser.
class TokenStream:
    """A list of tokens plus a cursor pointing at the current token.

    Attributes:
        tokenStream: the list holding the tokens.
        tokenPointer: index of the current token inside ``tokenStream``.
    """

    def __init__(self, tokenStream=None, tokenPointer=0):
        """Create a stream.

        Args:
            tokenStream: list of tokens to wrap; it is stored by reference,
                not copied. When omitted, a fresh empty list is created for
                this instance.
            tokenPointer: initial index of the current token.
        """
        # A ``[]`` default would be created once and shared by every
        # instance built without an argument, so tokens appended to one
        # stream would leak into the next; use a None sentinel instead.
        self.tokenStream = [] if tokenStream is None else tokenStream
        self.tokenPointer = tokenPointer

    def resetStream(self):
        """Replace the token list with a new empty one and rewind the pointer."""
        self.tokenStream = []
        self.tokenPointer = 0

    def advanceTokenPointer(self):
        """Move the pointer one token forward (no bounds check is done)."""
        self.tokenPointer += 1

    def getCurrentToken(self):
        """Return the token at the pointer.

        Raises:
            IndexError: if the pointer is outside the token list.
        """
        return self.tokenStream[self.tokenPointer]
    

In [58]:
# Now lets implement the Lexer Class 
class Lexer:
    """Turns a regular-expression string into a stream of tokens."""

    @staticmethod
    def lexReg(regex:str):
        """Tokenize ``regex`` character by character.

        Rules applied to each character, in order:
          * a space is dropped and does not affect a pending escape;
          * a backslash that is not itself escaped only arms the escape for
            the next character and produces no token (so a backslash at the
            very end of the input yields nothing);
          * an escaped character (including an escaped backslash) becomes a
            ``LiteralChar`` token;
          * an unescaped metacharacter becomes a token of its own type;
          * everything else becomes a ``LiteralChar`` token.

        Input:
            regex: a string representing the regular expression.
        Output:
            a ``TokenStream`` whose ``tokenStream`` list holds the tokens.
        """
        SPACE, BACKSLASH = ' ', '\\'

        # Token type of every metacharacter; characters missing from this
        # table are literals.
        operators = {
            '|': TokenTypes.OR,
            '*': TokenTypes.Astrisk,
            '+': TokenTypes.Plus,
            '?': TokenTypes.QuestionMark,
            '[': TokenTypes.OpenSquareBracket,
            ']': TokenTypes.ClosedSquareBracket,
            '(': TokenTypes.OpenBracket,
            ')': TokenTypes.ClosedBracket,
            '-': TokenTypes.Hiphen,
            '.': TokenTypes.Dot,
        }

        # Start from a stream that is guaranteed to own a fresh, empty list.
        stream = TokenStream()
        stream.resetStream()
        emit = stream.tokenStream.append

        # True while the previously consumed character was an unescaped
        # backslash, i.e. the next character must be taken literally.
        escaping = False
        for symbol in regex:
            if symbol == SPACE:
                continue
            if symbol == BACKSLASH and not escaping:
                escaping = True
                continue
            if escaping:
                kind = TokenTypes.LiteralChar
            else:
                kind = operators.get(symbol, TokenTypes.LiteralChar)
            emit(Token(kind, symbol))
            escaping = False
        return stream


        

In [62]:
# Lets Test the Lexer 
def testLexer():
    """Lex a sample pattern and print the type and value of every token."""
    # Another pattern worth trying: 'a?b(cd|ef)[a-z]'
    pattern = '\\\\'  # two backslash characters, i.e. one escaped backslash
    for tok in Lexer.lexReg(pattern).tokenStream:
        print(tok.tokenType, tok.value)


testLexer()


TokenTypes.LiteralChar \
