In [36]:
import sys
import os
import re

path = "Pong"
isFile = os.path.isfile(path)
fileList = []
outFileNames_vm = []
tokenIndex = 1
ind = "  "
classSymbolTable = {}
classSymbolCounts = {'static':0,'field':0}

subroutineSymbolCounts = {'local':0,'argument':0}
subroutineSymbolTable = {}
subroutineType = ""
classname = ""

dict_keywords = {'class':'keyword','constructor':'keyword','function':'keyword',
        'method':'keyword','field':'keyword','static':'keyword','var':'keyword','int':'keyword',
        'char':'keyword','boolean':'keyword','void':'keyword','true':'keyword','false':'keyword','null':'keyword',
        'this':'keyword','let':'keyword','do':'keyword','if':'keyword','else':'keyword',
        'while':'keyword','return':'keyword'}
        
symbol_keywords ={'{':'symbol','}':'symbol','(':'symbol',')':'symbol','[':'symbol',
        ']':'symbol','.':'symbol',',':'symbol',';':'symbol','+':'symbol',
        '-':'symbol','*':'symbol','/':'symbol','&':'symbol','|':'symbol',
        '<':'symbol','>':'symbol','=':'symbol','~':'symbol'}

specialCases = {'<':'&lt;','>':'&gt;','&':'&amp;'}

keyword = '(class|constructor|function|method|static|field|var|int|char|boolean|void|true|false|null|this|let|do|if|else|while|return)'
symbol = '[{}()[\].,;+\-*/&|<>=~]' #forgot escape for ]
integerConstant = '\d+'
stringConstant = '"[^"\n]*"'
identifier = '[\w]+'

#files to cover
if isFile:
    fileList.append(path)
    outFileNames_vm.append(path[0:(len(path)-5)] + '.vm')
else:
    for filename in os.listdir(path):
        if filename.endswith(".jack"):
            fileList.append(path+"/"+filename)
            outFileNames_vm.append(path + "/" + filename[0:len(filename)-5] + '.vm')
        else:
            continue

def removeComments(line):
    line = line.strip()
    if line == '' or line[0:2] == '//' or line[0:2] == "/*" or line[0] == "*":
        return ""
    result = line.split("//")
    return result[0]

def tokenizer(textToTokenize):
    regex = re.compile(symbol+"|"+identifier+"|"+stringConstant+"|"+integerConstant)
    elems = regex.findall(textToTokenize)
    return elems   

def makeTxml(tokens):
    txml = "<tokens>\n"
    for token in tokens:
        newLine = ""
        if token in dict_keywords:
            newLine = "<keyword> "+token+" </keyword>\n"
        elif token in symbol_keywords:
            if token in specialCases:
                newLine = "<symbol> "+specialCases.get(token)+" </symbol>\n"
            else:
                newLine = "<symbol> "+token+" </symbol>\n"
        elif re.match(integerConstant,token):
            newLine = "<integerConstant> "+token+" </integerConstant>\n"
        elif re.match(stringConstant,token):
            newLine = "<stringConstant> "+token.replace('"','')+" </stringConstant>\n"
        elif re.match(identifier,token):
            newLine = "<identifier> "+token+" </identifier>\n"
        txml = txml + newLine
    txml = txml + "</tokens>"
    return txml

def compilationEngine(tokens, txml, indent):
    global tokenIndex
    xml = "<class>\n"
    xml = xml + compileClass(tokens, txml, indent+1)
    xml = xml + "</class>\n"
    return xml

def updateClassSymbolTable(name, typeof, kind):
    classSymbolTable[name] = (typeof, kind, classSymbolCounts[kind])
    classSymbolCounts[kind] = classSymbolCounts[kind] + 1
    
def compileClass(tokens, txml, indent):
    global tokenIndex
    result = indent*ind + txml[tokenIndex] + "\n" #class
    result += indent*ind + txml[tokenIndex+1] + "\n" #main
    result += indent*ind + txml[tokenIndex+2] + "\n" #{
    tokenIndex = tokenIndex+3
    while tokens[tokenIndex-1] == "field" or tokens[tokenIndex-1] == "static" :
        typeof = tokens[tokenIndex]
        kind = tokens[tokenIndex-1]
        updateClassSymbolTable(tokens[tokenIndex+1], typeof, kind)
        result += indent*ind + "<classVarDec>" + "\n"
        result += compileClassVarDec(txml, tokens, indent, typeof, kind)
        result += indent*ind + "</classVarDec>" + "\n"
    while (tokens[tokenIndex-1] == "constructor" or tokens[tokenIndex-1] == "function" or tokens[tokenIndex-1] == "method"):
        result += indent*ind + "<subroutineDec>" + "\n"
        result += compileSubroutineDec(txml, tokens, indent);
        result += indent*ind + "</subroutineDec>" + "\n"
    result += indent*ind + txml[tokenIndex] + "\n" #}
    return result

def compileClassVarDec(txml, tokens, indent, typeof, kind):
    global tokenIndex
    result = ""
    indent = indent + 1
    while(tokens[tokenIndex-1] != ";"):
        if(tokens[tokenIndex] == ","):
            updateClassSymbolTable(tokens[tokenIndex+1], typeof, kind)
        result += indent*ind + txml[tokenIndex] + "\n"
        tokenIndex = tokenIndex + 1
    result += indent*ind + txml[tokenIndex] + "\n" #appends ;
    tokenIndex = tokenIndex + 1
    return result

def compileSubroutineDec(txml, tokens, indent):
    global tokenIndex
    global subroutineSymbolTable
    global subroutineSymbolCounts
    global subroutineType
    result = ""
    indent = indent + 1
    #
    subroutineType = tokens[tokenIndex-1]
    if(subroutineType == "method"):
        #print(tokens[tokenIndex+1])
        subroutineSymbolCounts = {'local':0,'argument':0}
        subroutineSymbolTable = {}
        subroutineSymbolTable['this'] = (classname, 'argument', subroutineSymbolCounts['argument'])
        subroutineSymbolCounts['argument'] = subroutineSymbolCounts['argument'] + 1
    #
    result += indent*ind + txml[tokenIndex] + "\n" #constructor/function/method
    tokenIndex = tokenIndex + 1
    result += indent*ind + txml[tokenIndex] + "\n" #return type
    tokenIndex = tokenIndex + 1
    result += indent*ind + txml[tokenIndex] + "\n" #parameter name
    tokenIndex = tokenIndex + 1
    result += indent*ind + txml[tokenIndex] + "\n" #(
    tokenIndex = tokenIndex + 1
    result += indent*ind +"<parameterList>"+"\n"
    result += compileParameterList(txml, tokens, indent)
    result += indent*ind +"</parameterList>"+"\n"
    result += indent*ind + txml[tokenIndex] + "\n" #)
    tokenIndex = tokenIndex + 1
    result += indent*ind +"<subroutineBody>"+"\n"
    result += compileSubroutineBody(txml, tokens, indent)
    result += indent*ind +"</subroutineBody>"+"\n"
    #print(subroutineSymbolTable)
    return result;

def compileParameterList(txml, tokens, indent):
    global tokenIndex
    global subroutineSymbolTable
    global subroutineSymbolCounts
    global subroutineType
    indent = indent + 1
    result = ""
    while(tokens[tokenIndex-1] != ")"):
        #print("Umm.... "+ subroutineType)
        if(subroutineType == "method" and "identifier" in txml[tokenIndex]):
            subroutineSymbolTable[tokens[tokenIndex-1]] = (tokens[tokenIndex-2], 'argument', subroutineSymbolCounts['argument'])
            subroutineSymbolCounts['argument'] = subroutineSymbolCounts['argument'] + 1
        result += indent*ind + txml[tokenIndex] + "\n"
        tokenIndex = tokenIndex + 1
    return result

def compileSubroutineBody(txml, tokens, indent):
    global tokenIndex
    global subroutineSymbolTable
    global subroutineSymbolCounts
    global subroutineType
    indent = indent + 1
    result = indent*ind + txml[tokenIndex] + "\n" #{
    tokenIndex = tokenIndex + 1
    opencnt = 1
    while tokens[tokenIndex-1] == "var":
        result += indent*ind + "<varDec>" + "\n"
        result += compileVarDec(txml, tokens, indent)
        result += indent*ind + "</varDec>" + "\n"
    result += indent*ind + "<statements>" + "\n"
    result += compileStatements(txml, tokens, indent)
    result += indent*ind + "</statements>"+ "\n"   
    result += indent*ind + txml[tokenIndex] + "\n" #}
    tokenIndex = tokenIndex+ 1
    return result

def compileVarDec(txml, tokens, indent):
    global tokenIndex
    global subroutineSymbolTable
    global subroutineSymbolCounts
    global subroutineType
    result = ""
    indent = indent + 1
    result += indent*ind + txml[tokenIndex] + "\n" #var
    tokenIndex = tokenIndex + 1
    typeof = ""
    while(tokens[tokenIndex-1] != ";"):
        #MDA
        if(subroutineType == "method" and "identifier" in txml[tokenIndex]):
            if(typeof == ""):
                typeof = tokens[tokenIndex-2]
                subroutineSymbolTable[tokens[tokenIndex-1]] = (typeof, 'local', subroutineSymbolCounts['local'])
                subroutineSymbolCounts['local'] = subroutineSymbolCounts['local'] + 1
            else:    
                subroutineSymbolTable[tokens[tokenIndex-1]] = (typeof, 'local', subroutineSymbolCounts['local'])
                subroutineSymbolCounts['local'] = subroutineSymbolCounts['local'] + 1
        #
        result += indent*ind + txml[tokenIndex] + "\n"
        tokenIndex = tokenIndex + 1
    result += indent*ind + txml[tokenIndex] + "\n" #appends ;
    tokenIndex = tokenIndex + 1
    return result

def compileStatements(txml, tokens, indent):
    global tokenIndex
    result = ""
    indent = indent + 1
    while tokens[tokenIndex-1] == "let" or tokens[tokenIndex-1] == "if" or tokens[tokenIndex-1] == "while" or tokens[tokenIndex-1] == "do" or tokens[tokenIndex-1] == "return":
        if tokens[tokenIndex-1] == "let":
            result += indent*ind + "<letStatement>"+"\n"
            result += compileLet(txml, tokens, indent)
            result += indent*ind + "</letStatement>"+"\n"
        elif tokens[tokenIndex-1] == "if":
            result += indent*ind + "<ifStatement>"+"\n"
            result += compileIf(txml, tokens, indent)
            result += indent*ind + "</ifStatement>"+"\n"
        elif tokens[tokenIndex-1] == "while":
            result += indent*ind + "<whileStatement>"+"\n"
            result += compileWhile(txml, tokens, indent)
            result += indent*ind + "</whileStatement>"+"\n"
        elif tokens[tokenIndex-1] == "do":
            result += indent*ind + "<doStatement>"+"\n"
            result += compileDo(txml, tokens, indent)
            result += indent*ind + "</doStatement>"+"\n"
        elif tokens[tokenIndex-1] == "return":
            result += indent*ind + "<returnStatement>"+"\n"
            result += compileReturn(txml, tokens, indent)
            result += indent*ind + "</returnStatement>"+"\n"
    return result

def compileLet(txml, tokens, indent):
    global tokenIndex
    indent = indent + 1
    result = indent*ind + txml[tokenIndex] + "\n" #let
    tokenIndex = tokenIndex + 1
    result += indent*ind + txml[tokenIndex] + "\n" #var
    tokenIndex = tokenIndex + 1
    if tokens[tokenIndex-1] == "[":
        result += indent*ind + txml[tokenIndex] + "\n" #[
        tokenIndex = tokenIndex + 1
        result += indent*ind + "<expression>" + "\n" 
        result += compileExpression(txml, tokens, indent)
        result += indent*ind + "</expression>" + "\n" 
        result += indent*ind + txml[tokenIndex] + "\n" #]
        tokenIndex = tokenIndex + 1
    result += indent*ind + txml[tokenIndex] + "\n" #=
    tokenIndex = tokenIndex + 1
    result += indent*ind + "<expression>" + "\n"
    result += compileExpression(txml, tokens, indent)
    result += indent*ind + "</expression>" + "\n"
    result += indent*ind + txml[tokenIndex] + "\n" #;
    tokenIndex = tokenIndex + 1
    return result

def compileIf(txml, tokens, indent):
    global tokenIndex
    indent = indent + 1 
    result = indent*ind + txml[tokenIndex] + "\n" #if
    tokenIndex = tokenIndex + 1
    result += indent*ind + txml[tokenIndex] + "\n" #(
    tokenIndex = tokenIndex + 1
    result += indent*ind + "<expression>" + "\n"
    result += compileExpression(txml, tokens, indent)
    result += indent*ind + "</expression>" + "\n"
    result += indent*ind + txml[tokenIndex] + "\n" #)
    tokenIndex = tokenIndex + 1
    result += indent*ind + txml[tokenIndex] + "\n" #{
    tokenIndex = tokenIndex + 1
    #
    result += indent*ind + "<statements>" + "\n"
    result += compileStatements(txml, tokens, indent)
    result += indent*ind + "</statements>" + "\n"
    #
    result += indent*ind + txml[tokenIndex] + "\n" #}
    tokenIndex = tokenIndex + 1
    if tokens[tokenIndex-1] == "else":
        result += indent*ind + txml[tokenIndex] + "\n" #else
        tokenIndex = tokenIndex + 1
        result += indent*ind + txml[tokenIndex] + "\n" #{
        tokenIndex = tokenIndex + 1
        #
        result += indent*ind + "<statements>" + "\n"
        result += compileStatements(txml, tokens, indent)
        result += indent*ind + "</statements>" + "\n"
        #
        result += indent*ind + txml[tokenIndex] + "\n" #}
        tokenIndex = tokenIndex + 1
    #tokenIndex = tokenIndex + 1
    return result

def compileWhile(txml, tokens, indent):
    global tokenIndex
    result = ""
    indent = indent + 1
    result = indent*ind + txml[tokenIndex] + "\n" #while
    tokenIndex = tokenIndex + 1
    result += indent*ind + txml[tokenIndex] + "\n" #(
    tokenIndex = tokenIndex + 1
    result += indent*ind + "<expression>" + "\n"
    result += compileExpression(txml, tokens, indent)
    result += indent*ind + "</expression>" + "\n"
    result += indent*ind + txml[tokenIndex] + "\n" #)
    tokenIndex = tokenIndex + 1
    result += indent*ind + txml[tokenIndex] + "\n" #{
    tokenIndex = tokenIndex + 1
    #
    result += indent*ind + "<statements>" + "\n"
    result += compileStatements(txml, tokens, indent)
    result += indent*ind + "</statements>" + "\n"
    #
    result += indent*ind + txml[tokenIndex] + "\n" #}
    tokenIndex = tokenIndex + 1
    return result

def compileDo(txml, tokens, indent):
    global tokenIndex
    indent = indent + 1
    result = indent*ind + txml[tokenIndex] + "\n" #do
    tokenIndex = tokenIndex + 1
    result += indent*ind + txml[tokenIndex] + "\n" #var/routine
    tokenIndex = tokenIndex + 1
    if tokens[tokenIndex-1] == ".":
        result += indent*ind + txml[tokenIndex] + "\n" #.
        tokenIndex = tokenIndex + 1
        result += indent*ind + txml[tokenIndex] + "\n" #subroutine
        tokenIndex = tokenIndex + 1
    result += indent*ind + txml[tokenIndex] + "\n" #(
    tokenIndex = tokenIndex + 1
    #
    result += indent*ind + "<expressionList>" + "\n"
    result += compileExpressionList(txml, tokens, indent)
    result += indent*ind + "</expressionList>" + "\n"
    #
    result += indent*ind + txml[tokenIndex] + "\n" #)
    tokenIndex = tokenIndex + 1
    result += indent*ind + txml[tokenIndex] + "\n" #;
    tokenIndex = tokenIndex + 1
    return result

def compileReturn(txml, tokens, indent):
    global tokenIndex
    indent = indent + 1
    result = indent*ind + txml[tokenIndex] + "\n" #return
    tokenIndex = tokenIndex + 1
    if tokens[tokenIndex-1] != ";":
        result += indent*ind + "<expression>" + "\n"
        result += compileExpression(txml, tokens, indent)
        result += indent*ind + "</expression>" + "\n"
    result += indent*ind + txml[tokenIndex] + "\n" #;
    tokenIndex = tokenIndex + 1
    return result

def compileExpression(txml, tokens, indent):
    global tokenIndex
    indent = indent + 1
    result = indent*ind + "<term>" + "\n"
    result += compileTerm(txml, tokens, indent)
    result += indent*ind + "</term>" + "\n"
    while tokens[tokenIndex-1] in ['+','-','*','/','&','|','<','>','=']:
        result += indent*ind + txml[tokenIndex] + "\n" #op 
        tokenIndex = tokenIndex + 1
        result += indent*ind + "<term>" + "\n"
        result += compileTerm(txml, tokens, indent)
        result += indent*ind + "</term>" + "\n"
    return result

def compileTerm(txml, tokens, indent):
    global tokenIndex
    indent = indent + 1
    result = ""
    if tokens[tokenIndex-1] == "(": #meaning its expression
        result += indent*ind + txml[tokenIndex] + "\n" #(
        tokenIndex = tokenIndex + 1
        result += indent*ind + "<expression>" + "\n"
        result += compileExpression(txml, tokens, indent)
        result += indent*ind + "</expression>" + "\n"
        result += indent*ind + txml[tokenIndex] + "\n" #)
        tokenIndex = tokenIndex + 1
    elif tokens[tokenIndex-1] in ['-','~']:
        result += indent*ind + txml[tokenIndex] + "\n" #- or ~
        tokenIndex = tokenIndex + 1
        result += indent*ind + "<term>" + "\n"
        result += compileTerm(txml, tokens, indent)
        result += indent*ind + "</term>" + "\n"
    else:
        result += indent*ind + txml[tokenIndex] + "\n" #any other thing: constants, expressionList, subroutineCall
        tokenIndex = tokenIndex + 1
        if tokens[tokenIndex-1] == "(":
            result += indent*ind + txml[tokenIndex] + "\n" #(
            tokenIndex = tokenIndex + 1
            result += indent*ind + "<expressionList>" + "\n"
            result += compileExpressionList(txml, tokens, indent)
            result += indent*ind + "</expressionList>" + "\n"
            result += indent*ind + txml[tokenIndex] + "\n" #)
            tokenIndex = tokenIndex + 1
        elif tokens[tokenIndex-1] == "[": #expression
            result += indent*ind + txml[tokenIndex] + "\n" #[
            tokenIndex = tokenIndex + 1
            result += indent*ind + "<expression>" + "\n"
            result += compileExpression(txml, tokens, indent)
            result += indent*ind + "</expression>" + "\n"
            result += indent*ind + txml[tokenIndex] + "\n" #]
            tokenIndex = tokenIndex + 1
        elif tokens[tokenIndex-1] == ".": #subroutine
            result += indent*ind + txml[tokenIndex] + "\n" #should be .
            tokenIndex = tokenIndex + 1
            result += indent*ind + txml[tokenIndex] + "\n" #name
            tokenIndex = tokenIndex + 1
            result += indent*ind + txml[tokenIndex] + "\n" #[
            tokenIndex = tokenIndex + 1
            result += indent*ind + "<expressionList>" + "\n"
            result += compileExpressionList(txml, tokens, indent)
            result += indent*ind + "</expressionList>" + "\n"
            result += indent*ind + txml[tokenIndex] + "\n" #]
            tokenIndex = tokenIndex + 1
    return result

def compileExpressionList(txml, tokens, indent):
    global tokenIndex
    result = ""
    indent = indent + 1
    if tokens[tokenIndex-1] != ")": #eseigi expression
        result += indent*ind + "<expression>" + "\n"
        result += compileExpression(txml, tokens, indent)
        result += indent*ind + "</expression>" + "\n"
    while tokens[tokenIndex-1] != ")":
        result += indent*ind + txml[tokenIndex] + "\n" #,
        tokenIndex = tokenIndex + 1
        result += indent*ind + "<expression>" + "\n"
        result += compileExpression(txml, tokens, indent)
        result += indent*ind + "</expression>" + "\n"
    return result


for file,outfile in zip(fileList,outFileNames_vm):
    fileIn = open(file,'r') 
    fileOut = open(outfile,'w')
    rows = fileIn.readlines()
    textToTokenize = ""
    for line in rows:
        textToTokenize = textToTokenize + removeComments(line)
    textToTokenize = ' '.join(textToTokenize.split())
    tokens = tokenizer(textToTokenize)
    txml = makeTxml(tokens)
    tokenIndex = 1
    #
    classSymbolTable = {}
    classSymbolCounts = {'static':0,'field':0,'var':0,'argument':0}
    classname = file.split("/")
    classname = classname[len(classname)-1]
    classname = classname[0:len(classname)-5]
    #
    #print(file)
    xml = compilationEngine(tokens, txml.splitlines(), 0)
    #print(file)
    #print(classSymbolTable)
    fileIn.close()
    fileOut.close()

Pong/Main.jack
{}
Pong/Ball.jack
{}
dispose
{'this': ('Ball', 'argument', 0)}
show
{'this': ('Ball', 'argument', 0)}
hide
{'this': ('Ball', 'argument', 0)}
draw
{'this': ('Ball', 'argument', 0)}
getLeft
{'this': ('Ball', 'argument', 0)}
getRight
{'this': ('Ball', 'argument', 0)}
setDestination
{'this': ('Ball', 'argument', 0), 'destx': ('int', 'argument', 1), 'desty': ('int', 'argument', 2), 'dx': ('int', 'local', 0), 'dy': ('int', 'local', 1), 'temp': ('int', 'local', 2)}
move
{'this': ('Ball', 'argument', 0)}
bounce
{'this': ('Ball', 'argument', 0), 'bouncingDirection': ('int', 'argument', 1), 'newx': ('int', 'local', 0), 'newy': ('int', 'local', 1), 'divLengthx': ('int', 'local', 2), 'divLengthy': ('int', 'local', 3), 'factor': ('int', 'local', 4)}
Pong/Bat.jack
{'this': ('Ball', 'argument', 0), 'bouncingDirection': ('int', 'argument', 1), 'newx': ('int', 'local', 0), 'newy': ('int', 'local', 1), 'divLengthx': ('int', 'local', 2), 'divLengthy': ('int', 'local', 3), 'factor': ('int',