Tokenizer:

In [2]:
# input_dict={token:TOKEN,....}
import copy,re,collections
# Token record produced by the tokenizer: the token kind, its processed value
# and a third ``real_value`` slot. All three fields are required.
Token = collections.namedtuple('Token', ['type', 'value', 'real_value'])

# Reserved words of the language, each mapped to the name "<word>_t".
d_keyword = {
    word: word + "_t"
    for word in (
        'main', 'var', 'array', 'function', 'procedure', 'return',
        'while', 'do', 'od', 'if', 'then', 'else', 'fi', 'call', 'let',
    )
}

# Keyword spellings only, in the insertion order of ``d_keyword``.
l_kewords = list(d_keyword)

# Ordered (kind, regex) pairs. The pairs are joined into one alternation of
# named groups, so ORDER MATTERS: at each position the first pattern that
# matches wins (e.g. '<-' and '<=' must come before '<').
token_specification = [
    # '#' or '//' up to and including the line break. `$` lets a comment on the
    # last line match even when the input has no trailing newline.
    ('COMMENT',   r'(#|//)(.*)(\n|$)'),
    ('ASSIGN',   r'<-'),           # Assignment operator
    ('OPEN_P',   r'\('),           # Opening parenthesis
    ('CLOSE_P',   r'\)'),           # Closing parenthesis
    ('OPEN_C',   r'{'),           # Opening curly brace
    ('CLOSE_C',   r'}'),           # Closing curly brace
    ('OPEN_B',   r'\['),           # Opening square bracket
    ('CLOSE_B',   r'\]'),           # Closing square bracket
    ('SEMI',   r';'),           # Semicolon
    ('COMMA',   r'\,'),           # Comma
    ('POINT',   r'\.'),           # Period
    ('OP_EQ',      r'(==)'),            # Relational: equal
    ('OP_NEQ',   r'(!=)'),           # Relational: not equal
    ('OP_LEQ',   r'(<=)'),           # Relational: less than or equal
    ('OP_GEREQ',   r'(>=)'),           # Relational: greater than or equal
    ('OP_LESS',      r'(<)'),            # Relational: less than
    ('OP_GRT',      r'(>)'),            # Relational: greater than
    # Integer literal not glued to a letter. Digits are part of the negative
    # lookahead so the regex engine cannot backtrack to a shorter digit run
    # and report e.g. the "1" of "12abc" as a valid NUMBER.
    ('NUMBER',   r'\d+(?![A-Za-z0-9])'),
    # Identifiers and keywords: a letter followed by letters/digits.
    # (The kind is spelled 'INDENT'; other code compares against that name.)
    ('INDENT',       r'[A-Za-z]+[A-Za-z0-9]*'),
    ('OP',       r'[+\-*/](?![+\-*/])'),      # Single arithmetic operator, not followed by another one
    ('NEWLINE',  r'(\n|\r)'),           # Line endings
    ('SKIP',     r'[ \t]+'),       # Skip over spaces and tabs
    ('MISMATCH', r'.'),            # Any other character
]


class Tokenizer():
    """Regex-driven lexer that turns source text into a list of ``Token``s.

    State is kept on the instance and is NOT reset between calls to
    ``tokenize_program``: tokens keep accumulating in ``result`` and the
    identifier numbering in ``indent_index_table`` carries over.
    """

    def __init__(self,keywords,token_specification):
        """Store the lexer configuration and create empty state.

        keywords            -- collection of reserved words; an identifier
                               found in it becomes a keyword token.
        token_specification -- ordered (kind, regex) pairs; earlier entries
                               take priority when several patterns match.
        """
        self.keywords=keywords
        self.token_specification=token_specification
        self.result=[]                    # every Token produced so far
        self.indent_index_counter=0       # next free identifier number
        self.indent_index_table=dict()    # identifier text -> its number

    def _identifier_index(self,name):
        """Return the number assigned to ``name``, allocating one on first use."""
        table=self.indent_index_table
        if name not in table:
            table[name]=self.indent_index_counter
            self.indent_index_counter+=1
        return table[name]

    def tokenize_program(self,code):
        """Tokenize ``code`` and append the tokens to ``self.result``.

        Per token kind:
          * NEWLINE / SKIP -- dropped.
          * NUMBER         -- value converted to int (float if it contains '.').
          * INDENT         -- if the text is a keyword, the token kind becomes
                              the keyword text itself; otherwise the value is
                              replaced by the identifier's number and the
                              original spelling is kept in ``real_value``.
          * MISMATCH       -- reported on stdout and still appended, so one
                              bad character does not abort the scan.
          * anything else  -- appended with its matched text as value.

        Returns ``self.result`` (the accumulated token list).
        """
        # One alternation of named groups; mo.lastgroup then names the kind.
        tok_regex = '|'.join('(?P<%s>%s)' % pair for pair in self.token_specification)
        for mo in re.finditer(tok_regex, code):
            kind = mo.lastgroup
            value = mo.group()
            real_value=None
            if kind in ('NEWLINE','SKIP'):
                continue
            if kind == 'NUMBER':
                value = float(value) if '.' in value else int(value)
            elif kind == 'INDENT':
                if value in self.keywords:
                    kind = value
                else:
                    real_value=value
                    value=self._identifier_index(value)
            elif kind == 'MISMATCH':
                print("========MISMATCH",value)
            self.result.append(Token(kind,value,real_value))
        return self.result

    def print_tokens(self):
        """Print the values of all collected tokens on one space-separated line."""
        # A single print of the joined text behaves the same on Python 2 and 3;
        # the old ``print(x),`` form emits one token per line on Python 3.
        print(" ".join(str(t.value) for t in self.result))

    def detect_comment(self,line):
        """Return ``line`` without trailing whitespace, cut at the first '#'.

        Only '#' is recognized here; '//' is left untouched.
        """
        a=line.rstrip().split("#")
        return a[0]

In [9]:
# Demo cell: the guard only passes when this runs as the main module AND no
# ``__file__`` global exists (e.g. an interactive session such as a notebook).
if __name__ == '__main__' and '__file__' not in globals():
    print("test")

    # Load test.c through a context manager so the handle is always closed.
    # The contents are kept in ``a`` but the demo below tokenizes the inline
    # sample program, not the file.
    with open("test.c", "r") as f:
        a=f.read()

    tk=Tokenizer(l_kewords,token_specification)

    code='''
    # Array testing
    main
    var a, b, c;
    function foo( );
    {
    	return 14
    };
    {
    	let a <- b + c
    }
    .
    '''
    tk.tokenize_program(code)
    tk.print_tokens()

test
# Array testing
main var 0 , 1 , 2 ; function 3 ( ) ; { return 14 } ; { let 0 <- 1 + 2 } .


Some parser rules (EBNF-style; quoted symbols are keywords or token kinds emitted by the tokenizer):

In [386]:
# Grammar productions in an EBNF-like notation, keyed by nonterminal name.
# Quoted symbols are terminals (keyword text or a token kind such as "SEMI");
# `{ x }` is repetition, `[ x ]` is optional, `|` separates alternatives.
rules={
        "computation":""""main" { varDecl } { funcDecl } "OPEN_C" statSequence "CLOSE_C" "POINT" """,
        "typeDecl" : """ "var" | "array" "OPEN_B" "NUMBER" "CLOSE_B" { "OPEN_B" "NUMBER" "CLOSE_B" } """,
        # NOTE(review): the first identifier is spelled `indent` here but
        # `ident` everywhere else -- confirm which spelling the parser expects.
        "varDecl" : """ typeDecl indent { "COMMA" ident } "SEMI" """,
        "funcDecl" : """ ( "function" | "procedure" ) ident [ formalParam ] "SEMI" funcBody "SEMI" """,
        "formalParam" : """ "OPEN_P" [ ident { "COMMA" ident } ] "CLOSE_P" """,
        # Function bodies are delimited by curly braces (OPEN_C / CLOSE_C),
        # like the main body in "computation"; the square-bracket tokens
        # (OPEN_B / CLOSE_B) are for array dimensions only.
        "funcBody" : """ { varDecl } "OPEN_C" [ statSequence ] "CLOSE_C" """
      }

In [24]:
#print(l_kewords)