Tokenizer:

In [555]:
# input_dict={token:TOKEN,....}
import copy,re,collections
# One lexical token: `type` is the token kind (or the keyword itself),
# `value` is the matched text (converted to a number for NUMBER tokens).
Token = collections.namedtuple('Token', ['type', 'value'])

# Reserved words of the language, mapped to a "<word>_t" tag.
d_keyword = {
    'main': "main_t", "var": "var_t", "array": "array_t", "function": "function_t",
    "procedure": "procedure_t", "return": "return_t", "while": "while_t", "do": "do_t", "od": "od_t",
    "if": "if_t", "then": "then_t", "else": "else_t", "fi": "fi_t", "call": "call_t", "let": "let_t",
}
# Keyword spellings only (the name keeps its historical typo because later
# cells refer to it).
l_kewords = list(d_keyword)

# Ordered (token type, regex) pairs.  The pairs are joined into a single
# alternation, so ORDER MATTERS: earlier entries win.  In particular COMMENT
# must precede OP (otherwise "//" would start as an operator), ASSIGN and
# OP_LEQ must precede OP_LESS, and OP_GEREQ must precede OP_GRT.
token_specification = [
    # '#' or '//' up to and including the line break.  The final alternative
    # also accepts end-of-input, so a comment on the last line that has no
    # trailing newline is still recognised as a comment.
    ('COMMENT',  r'(#|//)(.*)(\n|$)'),
    ('ASSIGN',   r'<-'),                      # Assignment operator
    ('OPEN_P',   r'\('),                      # Opening parenthesis
    ('CLOSE_P',  r'\)'),                      # Closing parenthesis
    ('OPEN_C',   r'{'),                       # Opening curly brace
    ('CLOSE_C',  r'}'),                       # Closing curly brace
    ('OPEN_B',   r'\['),                      # Opening square bracket
    ('CLOSE_B',  r'\]'),                      # Closing square bracket
    ('SEMI',     r';'),                       # Semicolon
    ('COMMA',    r'\,'),                      # Comma
    ('POINT',    r'\.'),                      # Period
    ('OP_EQ',    r'(==)'),                    # Relational: equal
    ('OP_NEQ',   r'(!=)'),                    # Relational: not equal
    ('OP_LEQ',   r'(<=)'),                    # Relational: less than or equal
    ('OP_GEREQ', r'(>=)'),                    # Relational: greater than or equal
    ('OP_LESS',  r'(<)'),                     # Relational: less than
    ('OP_GRT',   r'(>)'),                     # Relational: greater than
    ('NUMBER',   r'\d+(?![A-Za-z])'),         # Integer not directly followed by a letter
    # Identifiers.  The type name is spelled 'INDENT' (not 'IDENT') and the
    # parser matches on exactly that spelling, so it must not be changed here.
    ('INDENT',   r'[A-Za-z]+[A-Za-z0-9]*'),
    ('OP',       r'[+\-*/](?![+\-*/])'),      # Arithmetic operator not followed by another one
    ('NEWLINE',  r'(\n|\r)'),                 # Line endings
    ('SKIP',     r'[ \t]+'),                  # Spaces and tabs
    ('MISMATCH', r'.'),                       # Any other single character
]


class Tokenizer():
    """Regex-driven lexer.

    Parameters
    ----------
    keywords : container of str
        Reserved words.  An 'INDENT' match whose text is in this container
        is emitted with the word itself as its token type.
    token_specification : sequence of (name, regex) pairs
        Joined, in order, into one alternation of named groups; the name of
        the group that matched becomes the token type.

    The tokens of the most recent ``tokenize_program`` call are stored in
    ``self.result`` as ``Token(type, value)`` records.
    """

    def __init__(self,keywords,token_specification):
        self.keywords=keywords
        self.token_specification=token_specification
        self.result=[]

    def tokenize_program(self,code):
        """Tokenize ``code`` and store the tokens in ``self.result``.

        ``self.result`` is rebuilt from scratch on every call, so reusing a
        Tokenizer does not mix tokens from different programs.  NEWLINE and
        SKIP matches are dropped.  A MISMATCH is reported on stdout and the
        offending character is still kept as a token, so tokenizing never
        aborts.  NUMBER text is converted to ``int`` (or ``float`` when the
        matched text contains a '.').
        """
        # Start from a clean list: previously tokens piled up across calls.
        self.result = []
        tok_regex = '|'.join('(?P<%s>%s)' % pair for pair in self.token_specification)
        for mo in re.finditer(tok_regex, code):
            kind = mo.lastgroup
            value = mo.group()
            if kind in ('NEWLINE', 'SKIP'):
                # Layout only: produces no token.
                continue
            if kind == 'NUMBER':
                value = float(value) if '.' in value else int(value)
            elif kind == 'INDENT' and value in self.keywords:
                # Keywords are lexed as identifiers first, then promoted.
                kind = value
            elif kind == 'MISMATCH':
                print("========MISMATCH",value)
            self.result.append(Token(kind,value))

    def print_tokens(self):
        """Placeholder; currently does nothing."""
        pass

    def detect_comment(self,line):
        """Return ``line`` without trailing whitespace, cut at the first '#'."""
        a=line.rstrip().split("#")
        return a[0]

In [523]:

# Read the on-disk sample into `a`.  The context manager closes the file
# handle, which the previous bare open() never did.
with open("test.c", "r") as f:
    a = f.read()

tk=Tokenizer(l_kewords,token_specification)

# Inline sample program; this (not the file content in `a`) is what gets
# tokenized below.
code='''
# Array testing
main
array [ 5 ][ 10 ][ 15 ][ 20 ] a;
var b, c, d;
#function foo( );
#{
#	return 14
#};
#{
#	let b <- 1;
#	let c <- 19;
#	let a[ 4 ][ 9 ][ 1 + call foo( ) ][ b * c ] <- 45;
#	let d <- a[ 4 ][ 9 ][ call foo( ) + 1 ][ c * b ] + 2
#}
.
'''
tk.tokenize_program(code)



COMMENT main 

array OPEN_B NUMBER CLOSE_B OPEN_B NUMBER CLOSE_B OPEN_B NUMBER CLOSE_B OPEN_B NUMBER CLOSE_B INDENT SEMI 

var INDENT COMMA INDENT COMMA INDENT SEMI 

COMMENT COMMENT COMMENT COMMENT COMMENT COMMENT COMMENT COMMENT COMMENT COMMENT POINT 



In [386]:
# Grammar productions in EBNF-like notation, keyed by non-terminal.
# Quoted words are token types produced by the tokenizer, { x } means zero
# or more repetitions, [ x ] means optional, ( a | b ) is a choice.
# `ident` stands for an identifier (the tokenizer's 'INDENT' token type).
rules={
        "computation":""""main" { varDecl } { funcDecl } "OPEN_C" statSequence "CLOSE_C" "POINT" """,
        "typeDecl" : """ "var" | "array" "OPEN_B" "NUMBER" "CLOSE_B" { "OPEN_B" "NUMBER" "CLOSE_B" } """,
        # Was misspelled `indent`; every other production uses `ident`.
        "varDecl" : """ typeDecl ident { "COMMA" ident } "SEMI" """,
        "funcDecl" : """ ( "function" | "procedure" ) ident [ formalParam ] "SEMI" funcBody "SEMI" """,
        "formalParam" : """ "OPEN_P" [ ident { "COMMA" ident } ] "CLOSE_P" """,
        # A body is delimited by curly braces (OPEN_C/CLOSE_C), like the main
        # block in "computation"; OPEN_B/CLOSE_B are the square brackets.
        "funcBody" : """ { varDecl } "OPEN_C" [ statSequence ] "CLOSE_C" """
      }



main 

var INDENT COMMA INDENT COMMA INDENT SEMI 

OPEN_C 

if INDENT OP_LESS INDENT then 

let INDENT ASSIGN INDENT OP INDENT 

else 

let INDENT ASSIGN INDENT OP INDENT 

fi 

CLOSE_C 

POINT 



In [556]:
import collections 
import inspect

# Each opening bracket character mapped to its closing counterpart.
pair = dict(zip("[{(", "]})"))
class parser():
    """Recursive-descent syntax checker for a token stream.

    There is one ``check_*`` method per grammar production.  Syntax problems
    are reported with ``print`` and checking then continues on a best-effort
    basis: malformed or truncated input never raises, and no syntax tree is
    built.

    Parameters
    ----------
    tokens : iterable
        Objects exposing ``.type`` and ``.value``.  Tokens whose type is
        'COMMENT' are discarded up front.
    rules : any
        Stored on ``self.rules``; not consulted by the checking methods.
    """

    # Stand-in token handed out by _peek() once the input is exhausted, so a
    # truncated program is reported as an error instead of raising IndexError.
    _EOF = collections.namedtuple("EndOfInput", ["type", "value"])("EOF", None)

    # Token types that can begin a factor (and therefore an expression).
    _FACTOR_START = ("INDENT", "NUMBER", "OPEN_P", "call")

    # Token types accepted as a relational operator.
    _REL_OPS = ('OP_EQ', 'OP_NEQ', 'OP_LESS', 'OP_LEQ', 'OP_GRT', 'OP_GEREQ')

    def __init__(self,tokens,rules):
        self.tokens=collections.deque(tok for tok in tokens if tok.type!='COMMENT')
        self.rules=rules

    def _peek(self):
        """Return the next token without consuming it (``_EOF`` when none is left)."""
        return self.tokens[0] if self.tokens else self._EOF

    def must_parse(self,target_token):
        """Consume one token and report an error unless its type is ``target_token``.

        The token is consumed even on a mismatch.  The error line names the
        calling method, followed by the values of the remaining tokens.
        """
        if not self.tokens:
            print("error!!!!!",target_token,self._EOF,inspect.stack()[1][3])
            print([])
            return
        sym=self.tokens.popleft()
        if sym.type!=target_token:
            print("error!!!!!",target_token,sym,inspect.stack()[1][3])
            print([i.value for i in self.tokens])

    # computation
    def check_computation(self,b):
        """computation = "main" { varDecl } { funcDecl } "{" statSequence "}" "."

        ``b`` is accepted for call compatibility and is not used.
        """
        self.must_parse("main")
        while self._peek().type in ("var","array"):
            self.check_varDecl()
        while self._peek().type in ("function","procedure"):
            self.check_funcDecl()
        self.must_parse("OPEN_C")
        self.check_stat_Sequence()
        self.must_parse("CLOSE_C")
        self.must_parse("POINT")

    def check_funcDecl(self):
        """funcDecl = ("function" | "procedure") ident [ formalParam ] ";" funcBody ";" """
        if self._peek().type=="function":
            self.must_parse("function")
        else:
            self.must_parse("procedure")
        self.must_parse("INDENT")
        self.check_formalParam()
        self.must_parse("SEMI")
        self.check_funcBody()
        self.must_parse("SEMI")

    def check_formalParam(self):
        """formalParam = "(" [ ident { "," ident } ] ")"  (skipped when no "(" follows)."""
        if self._peek().type!="OPEN_P":
            return
        self.must_parse("OPEN_P")
        if self._peek().type=="INDENT":
            self.must_parse("INDENT")
            while self._peek().type=="COMMA":
                self.must_parse("COMMA")
                self.must_parse("INDENT")
        self.must_parse("CLOSE_P")

    def check_funcBody(self):
        """funcBody = { varDecl } "{" [ statSequence ] "}" """
        while self._peek().type in ("var","array"):
            self.check_varDecl()
        self.must_parse("OPEN_C")
        # The statement sequence is optional here: an empty body is valid.
        if self._peek().type!="CLOSE_C":
            self.check_stat_Sequence()
        self.must_parse("CLOSE_C")

    def check_stat_Sequence(self):
        """statSequence = statement { ";" statement }"""
        self.check_statement()
        while self._peek().type=="SEMI":
            self.must_parse("SEMI")
            self.check_statement()

    def check_statement(self):
        """statement = assignment | funcCall | ifStatement | whileStatement | returnStatement

        Exactly one alternative is taken per call; anything else is reported
        and left unconsumed.
        """
        kind=self._peek().type
        if kind=="let":
            self.check_assignment()
        elif kind=="call":
            self.check_funcCall()
        elif kind=="if":
            self.check_ifStatement()
        elif kind=="while":
            self.check_whileStatement()
        elif kind=="return":
            self.check_returnStatement()
        else:
            print("error check_statement",kind,inspect.stack()[1][3])

    def check_returnStatement(self):
        """returnStatement = "return" [ expression ]"""
        self.must_parse("return")
        if self._peek().type in self._FACTOR_START:
            self.check_expression()

    def check_whileStatement(self):
        """whileStatement = "while" relation "do" statSequence "od" """
        self.must_parse("while")
        self.check_relation()
        self.must_parse("do")
        self.check_stat_Sequence()
        self.must_parse("od")

    def check_ifStatement(self):
        """ifStatement = "if" relation "then" statSequence [ "else" statSequence ] "fi" """
        self.must_parse("if")
        self.check_relation()
        self.must_parse("then")
        self.check_stat_Sequence()
        if self._peek().type=="else":
            self.must_parse("else")
            self.check_stat_Sequence()
        self.must_parse("fi")

    def check_relation(self):
        """relation = expression relOp expression"""
        self.check_expression()
        self.check_relOp()
        self.check_expression()

    def check_relOp(self):
        """relOp = "==" | "!=" | "<" | "<=" | ">" | ">="

        A missing operator is reported (and nothing is consumed) instead of
        being silently accepted.
        """
        kind=self._peek().type
        if kind in self._REL_OPS:
            self.must_parse(kind)
        else:
            print("error check_relOp",kind,inspect.stack()[1][3])

    def check_assignment(self):
        """assignment = "let" designator "<-" expression"""
        self.must_parse("let")
        self.check_designator()
        self.must_parse("ASSIGN")
        self.check_expression()

    def check_designator(self):
        """designator = ident { "[" expression "]" }"""
        self.must_parse("INDENT")
        while self._peek().type=="OPEN_B":
            self.must_parse("OPEN_B")
            self.check_expression()
            self.must_parse("CLOSE_B")

    def check_expression(self):
        """expression = term { ("+" | "-") term }"""
        self.check_term()
        while self._peek().value in ("+","-"):
            # Exactly one operator per operand, so "a + - b" is not accepted.
            self.must_parse("OP")
            self.check_term()

    def check_term(self):
        """term = factor { ("*" | "/") factor }"""
        self.check_factor()
        while self._peek().value in ("*","/"):
            # Exactly one operator per operand, so "a * / b" is not accepted.
            self.must_parse("OP")
            self.check_factor()

    def check_factor(self):
        """factor = designator | number | "(" expression ")" | funcCall

        Exactly one alternative is taken per call, so two operands in a row
        (e.g. ``a 5``) are not swallowed as a single factor.
        """
        tok=self._peek()
        if tok.type=="INDENT":
            self.check_designator()
        elif tok.type=="NUMBER":
            self.must_parse("NUMBER")
        elif tok.type=="OPEN_P":
            self.must_parse("OPEN_P")
            self.check_expression()
            self.must_parse("CLOSE_P")
        elif tok.type=="call":
            self.check_funcCall()
        else:
            print("error,check_factor!!!",tok,inspect.stack()[1][3])

    def check_funcCall(self):
        """funcCall = "call" ident [ "(" [ expression { "," expression } ] ")" ]"""
        self.must_parse("call")
        self.must_parse("INDENT")
        if self._peek().type!="OPEN_P":
            return
        self.must_parse("OPEN_P")
        if self._peek().type in self._FACTOR_START:
            self.check_expression()
            while self._peek().type=="COMMA":
                self.must_parse("COMMA")
                self.check_expression()
        self.must_parse("CLOSE_P")

    def check_varDecl(self):
        """varDecl = typeDecl ident { "," ident } ";" """
        self.check_typeDecl()
        self.must_parse("INDENT")
        while self._peek().type=="COMMA":
            self.must_parse("COMMA")
            self.must_parse("INDENT")
        self.must_parse("SEMI")

    def check_typeDecl(self):
        """typeDecl = "var" | "array" "[" number "]" { "[" number "]" }"""
        kind=self._peek().type
        if kind=="array":
            self.must_parse("array")
            dimensions=0
            while self._peek().type=="OPEN_B":
                self.must_parse("OPEN_B")
                self.must_parse("NUMBER")
                self.must_parse("CLOSE_B")
                dimensions+=1
            # An array needs at least one dimension.
            if dimensions==0:
                print("error typeDecl",self._peek().type)
        elif kind=="var":
            self.must_parse("var")
        else:
            # Previously referenced an undefined name here (NameError).
            print("error typeDecl",kind)

In [553]:
# Fresh tokenizer built from the keyword list and token specification
# defined in the tokenizer cell.
tk=Tokenizer(l_kewords,token_specification)

# Inline sample program: comment lines, two variable declarations and a main
# block that contains a call statement and an assignment.
code='''
# Nested if/while v3
# If I think I pass this, I'm moving on.
# This test should not be attempted by anyone who is pregnant, nursing, has 
# high blood pressure, aliens, and stressed out graduate students.
# Based on test 13.
main
var x, y;
var a, b;
{
	call foo( );
	let y <- y + call boo( )
}
.
'''
# Tokens are collected in tk.result.
tk.tokenize_program(code)

# Syntax-check the token stream; problems are printed by the parser.
c=parser(tk.result,rules)
c.check_computation("computation")





















In [557]:
import os
# Directory holding the sample programs.
# NOTE(review): machine-specific absolute path; adjust it (or make it
# configurable) before running this cell on another machine.
path="/home/ahmad/Documents/compiler/project/testprogs/"

# Tokenize and syntax-check every regular file in the directory.  Entries
# are visited in sorted order so the printed report is reproducible.
for files in sorted(os.listdir(path)):
    full_path=os.path.join(path,files)
    if not os.path.isfile(full_path):
        # Sub-directories cannot be opened as programs; skip them.
        continue
    print(files,"==========")
    # The context manager closes each file as soon as it has been read.
    with open(full_path,'r') as handle:
        a=handle.read()
    tk=Tokenizer(l_kewords,token_specification)
    tk.tokenize_program(a)
    c=parser(tk.result,rules)
    c.check_computation("computation")

