# How To Parse Code

In [139]:
import re
from collections import namedtuple
from IPython.core.display import HTML

## Sample Code

So I think the easiest way to handle this is by using a markup language (sort of like XML or HTML). Here, we're creating a new language called "DML", or "Declarative Markup Language". Our goal is to transpile this into HTML.

In [140]:
code = """
html {
    head {
        title { "The DML Website" }
        meta(name="viewport" content="width=device-width,initial-scale=1")
        style `
            .spacer {
                margin-top: 30px;
                margin-bottom: 30px;
            }
            .highlighted {
                color: red;
            }
            .my-class {
                font-size: 14pt;
            }
        `
    }
    body {
        h1 { "Welcome to my website!" }
        p(style="margin-top:13px;margin-bottom:15px;") { 
            "this is a paragraph" 
            span(class="highlighted") { "This one is highlighted" } 
            "Here's another paragraph"
        }
        div(class="my-class") {
            p { "this is another paragraph" }
        }
        div(
            id="my-spaceer"
            class="spacer"
            data-attribute="my-value"
        )
        p {
            i { "this code was generated using DML" }
        }
    }
}
"""
print(code)


html {
    head {
        title { "The DML Website" }
        meta(name="viewport" content="width=device-width,initial-scale=1")
        style `
            .spacer {
                margin-top: 30px;
                margin-bottom: 30px;
            }
            .highlighted {
                color: red;
            }
            .my-class {
                font-size: 14pt;
            }
        `
    }
    body {
        h1 { "Welcome to my website!" }
        p(style="margin-top:13px;margin-bottom:15px;") { 
            "this is a paragraph" 
            span(class="highlighted") { "This one is highlighted" } 
            "Here's another paragraph"
        }
        div(class="my-class") {
            p { "this is another paragraph" }
        }
        div(
            id="my-spaceer"
            class="spacer"
            data-attribute="my-value"
        )
        p {
            i { "this code was generated using DML" }
        }
    }
}



## Tokenization

In [141]:
class TokenPattern:
    """
    Matches tokens
    """
    def __init__(self, name, patstr, flags=0):
        self.name = name
        self.pattern = re.compile(patstr, flags)
        
    def match(self, text):
        match = self.pattern.match(text)
        if match:
            return Token(self.name, match.group(0))
        else:
            return None
        
    def __repr__(self):
        return f'TokenPattern({self.name}, {self.pattern.pattern}, ignore={self.ignore})'
    

class Token:
    """
    Token Unit
    """
    def __init__(self, name, data):
        self.name = name
        self.data = data
        
    def __len__(self):
        return len(self.data)
        
    def __repr__(self):
        return f'{self.name}: {repr(self.data)}'


# List of token patterns
token_patterns = [
    TokenPattern('WHITES', r'\s+'),
    TokenPattern('IDENTI', r'[a-zA-Z][a-zA-Z0-9_-]*'),
    TokenPattern('EQUALS', r'\='),
    TokenPattern('STRING', r'\"[^\"]*\"'),
    TokenPattern('MULTIL', r'`[^`]*`', re.MULTILINE),
    TokenPattern('LPAREN', r'\('),
    TokenPattern('RPAREN', r'\)'),
    TokenPattern('LCURLY', r'\{'),
    TokenPattern('RCURLY', r'\}')
]


def scan_tokens(scan_text, ignore=['WHITES']):
    """
    Scan tokens from code
    """
    text = scan_text
    while len(text) > 0:
        for token_pattern in token_patterns:
            token = token_pattern.match(text)
            if token:
                text = text[len(token):]
                move_next = False
                if token.name not in ignore:
                    yield token
                break
        else:
            raise Exception(f'Invalid character in text => "{text}"')

            
for token in scan_tokens(code):
    print(token)

IDENTI: 'html'
LCURLY: '{'
IDENTI: 'head'
LCURLY: '{'
IDENTI: 'title'
LCURLY: '{'
STRING: '"The DML Website"'
RCURLY: '}'
IDENTI: 'meta'
LPAREN: '('
IDENTI: 'name'
EQUALS: '='
STRING: '"viewport"'
IDENTI: 'content'
EQUALS: '='
STRING: '"width=device-width,initial-scale=1"'
RPAREN: ')'
IDENTI: 'style'
MULTIL: '`\n            .spacer {\n                margin-top: 30px;\n                margin-bottom: 30px;\n            }\n            .highlighted {\n                color: red;\n            }\n            .my-class {\n                font-size: 14pt;\n            }\n        `'
RCURLY: '}'
IDENTI: 'body'
LCURLY: '{'
IDENTI: 'h1'
LCURLY: '{'
STRING: '"Welcome to my website!"'
RCURLY: '}'
IDENTI: 'p'
LPAREN: '('
IDENTI: 'style'
EQUALS: '='
STRING: '"margin-top:13px;margin-bottom:15px;"'
RPAREN: ')'
LCURLY: '{'
STRING: '"this is a paragraph"'
IDENTI: 'span'
LPAREN: '('
IDENTI: 'class'
EQUALS: '='
STRING: '"highlighted"'
RPAREN: ')'
LCURLY: '{'
STRING: '"This one is highlighted"'
RCURLY: '}'


## AST Generation

In [142]:
class Node:
    visit_enter_prefix = 'visit'
    visit_exit_prefix = 'exit'
    visit_defnam = 'Node'
    
    def __init__(self, key, data=None, parent=None):
        self.key = key
        self.data = data
        self.parent = parent
        self.children = []
        
    def __repr__(self):
        return f'{self.key}({self.data or ""})'
        
    def branch(self, key, data=None):
        child = Node(key, data, self)
        self.children.append(child)
        return child
    
    def copy(self):
        cp = Node(self.key, self.data, self.parent)
        cp.children = [ child.copy() for child in self.children ]
        return cp
            
    def accept(self, visitor, entry=True):
        prefix = self.visit_enter_prefix if entry else self.visit_exit_prefix
        defMet = getattr(visitor, f'{prefix}{self.visit_defnam}')
        method = getattr(visitor, f'{prefix}{self.key}', defMet)
        method(self)
        if visitor.always_do_default:
            defMet(self)
    
    def traverse(self, visitor):
        self.accept(visitor, True)
        for child in self.children:
            child.traverse(visitor)
        self.accept(visitor, False)
        

class NodeVisitor:
    always_do_default = False
    
    def visitNode(self, node):
        pass
    
    def exitNode(self, node):
        pass
    
    
class PrintTreeVisitor(NodeVisitor):
    def __init__(self, indent=0, tab='    '):
        self.indent = indent
        self.tab = tab
    
    def visitNode(self, node):
        print(self.tab*self.indent, node, sep='')
        self.indent += 1
        
    def exitNode(self, node):
        self.indent -= 1

        
class TreeBuilderVisitor(NodeVisitor):
    def __init__(self):
        self.tree = None
        
    def treeDownNew(self, key, data):
        if self.tree:
            self.tree = self.tree.branch(key, data)
        else:
            self.tree = Node(key, data)
            
    def treeDown(self, node):
        self.treeDownNew(node.key, node.data)
            
    def treeUp(self):
        if self.tree.parent:
            self.tree = self.tree.parent
            
    def leafNew(self, key, data=None):
        self.treeDownNew(key, data)
        self.treeUp()
            
    def leaf(self, node):
        self.treeDown(node)
        self.treeUp()
        

def print_tree(tree, indent=0, tab='    '):
    tree.traverse(PrintTreeVisitor(tab=tab, indent=indent))

# Build AST
node = Node('DML')
for token in scan_tokens(code):
    if token.name == 'LCURLY':
        node = node.branch('DML')
    elif token.name == 'LPAREN':
        node = node.branch('ATTRS')
    elif token.name in ['RCURLY', 'RPAREN']:
        node = node.parent
    else:
        node.branch(token.name, token.data)
print_tree(node)

DML()
    IDENTI(html)
    DML()
        IDENTI(head)
        DML()
            IDENTI(title)
            DML()
                STRING("The DML Website")
            IDENTI(meta)
            ATTRS()
                IDENTI(name)
                EQUALS(=)
                STRING("viewport")
                IDENTI(content)
                EQUALS(=)
                STRING("width=device-width,initial-scale=1")
            IDENTI(style)
            MULTIL(`
            .spacer {
                margin-top: 30px;
                margin-bottom: 30px;
            }
            .highlighted {
                color: red;
            }
            .my-class {
                font-size: 14pt;
            }
        `)
        IDENTI(body)
        DML()
            IDENTI(h1)
            DML()
                STRING("Welcome to my website!")
            IDENTI(p)
            ATTRS()
                IDENTI(style)
                EQUALS(=)
                STRING("margin-top:13px;margin-bottom:15px;")
  

## Transformation

In [143]:
class AttributeVisitor(TreeBuilderVisitor):
    def __init__(self):
        self.in_attr = False
        self.attrs = {}
        self.attr_id = ''
        self.got_equals = False
        super().__init__()
        
    def visitDML(self, node):
        self.treeDown(node)
        
    def exitDML(self, node):
        self.treeUp()
        
    def visitATTRS(self, node):
        self.attrs = {}
        self.in_attr = True
        
    def exitATTRS(self, node):
        self.in_attr = False
        self.leafNew('ATTRS', self.attrs)
        
    def visitIDENTI(self, node):
        if self.in_attr:
            self.attr_id = node.data
        else:
            self.leaf(node)
        
    def visitEQUALS(self, node):
        if not self.attr_id:
            raise Error('We\'re missing an identifier!')
        self.got_equals = True
        
    def visitSTRING(self, node):
        if self.in_attr:
            if not self.got_equals:
                raise Error('We\'re missing an equals!')
            self.attrs[self.attr_id] = node.data
            self.got_equals = False
            self.attr_id = ''
        else:
            self.leaf(node)
            
    def visitMULTIL(self, node):
        self.leaf(node)

            
# Traverse with attribute visitor
attribute_visitor = AttributeVisitor()
node.traverse(attribute_visitor)
attributes = attribute_visitor.tree
print_tree(attributes)

DML()
    IDENTI(html)
    DML()
        IDENTI(head)
        DML()
            IDENTI(title)
            DML()
                STRING("The DML Website")
            IDENTI(meta)
            ATTRS({'name': '"viewport"', 'content': '"width=device-width,initial-scale=1"'})
            IDENTI(style)
            MULTIL(`
            .spacer {
                margin-top: 30px;
                margin-bottom: 30px;
            }
            .highlighted {
                color: red;
            }
            .my-class {
                font-size: 14pt;
            }
        `)
        IDENTI(body)
        DML()
            IDENTI(h1)
            DML()
                STRING("Welcome to my website!")
            IDENTI(p)
            ATTRS({'style': '"margin-top:13px;margin-bottom:15px;"'})
            DML()
                STRING("this is a paragraph")
                IDENTI(span)
                ATTRS({'class': '"highlighted"'})
                DML()
                    STRING("This one is h

In [144]:
class AddEmptyDMLVisitor(TreeBuilderVisitor):
    def __init__(self):
        super().__init__()
        self.first = False
        self.visited_dml = False
        
    def visitIDENTI(self, node):
        if not (self.first or self.visited_dml):
            self.leafNew('DML')
        self.leaf(node)
        self.visited_dml = False
        self.first = False
        
    def visitDML(self, node):
        self.treeDown(node)
        self.first = True
        
    def exitDML(self, node):
        if not (self.first or self.visited_dml):
            self.leafNew('DML')
        self.treeUp()
        self.visited_dml = True
        
    def visitMULTIL(self, node):
        self.leaf(node)
        
    def visitSTRING(self, node):
        self.leaf(node)
        
    def visitATTRS(self, node):
        self.leaf(node)
        
# Traverse with add empty dml visitor
empty_dml_visitor = AddEmptyDMLVisitor()
attributes.traverse(empty_dml_visitor)
empty_dml = empty_dml_visitor.tree
print_tree(empty_dml)

DML()
    IDENTI(html)
    DML()
        IDENTI(head)
        DML()
            IDENTI(title)
            DML()
                STRING("The DML Website")
            IDENTI(meta)
            ATTRS({'name': '"viewport"', 'content': '"width=device-width,initial-scale=1"'})
            DML()
            IDENTI(style)
            MULTIL(`
            .spacer {
                margin-top: 30px;
                margin-bottom: 30px;
            }
            .highlighted {
                color: red;
            }
            .my-class {
                font-size: 14pt;
            }
        `)
            DML()
        IDENTI(body)
        DML()
            IDENTI(h1)
            DML()
                STRING("Welcome to my website!")
            IDENTI(p)
            ATTRS({'style': '"margin-top:13px;margin-bottom:15px;"'})
            DML()
                STRING("this is a paragraph")
                IDENTI(span)
                ATTRS({'class': '"highlighted"'})
                DML()
     

In [145]:
class AssociateIdentifierVisitor(TreeBuilderVisitor):
    def __init__(self):
        self.past_first = False
        self.in_identifier = False
        super().__init__()
        
    def visitIDENTI(self, node):
        if self.past_first: self.treeUp()
        self.treeDown(node)
        self.in_identifier = True
        
    def visitDML(self, node):
        self.treeDown(node)
        
    def exitDML(self, node):
        if self.tree.key == 'IDENTI':
            self.treeUp()
        self.treeUp()
        if self.tree.key == 'IDENTI':
            self.treeUp()
            
    def visitMULTIL(self, node):
        self.leaf(node)
        
    def visitATTRS(self, node):
        self.leaf(node)
        
    def visitSTRING(self, node):
        self.leaf(node)
        

# Traverse with identifier visitor
associate_identifier_visitor = AssociateIdentifierVisitor()
empty_dml.traverse(associate_identifier_visitor)
associated = associate_identifier_visitor.tree
print_tree(associated)

DML()
    IDENTI(html)
        DML()
            IDENTI(head)
                DML()
                    IDENTI(title)
                        DML()
                            STRING("The DML Website")
                    IDENTI(meta)
                        ATTRS({'name': '"viewport"', 'content': '"width=device-width,initial-scale=1"'})
                        DML()
                    IDENTI(style)
                        MULTIL(`
            .spacer {
                margin-top: 30px;
                margin-bottom: 30px;
            }
            .highlighted {
                color: red;
            }
            .my-class {
                font-size: 14pt;
            }
        `)
                        DML()
            IDENTI(body)
                DML()
                    IDENTI(h1)
                        DML()
                            STRING("Welcome to my website!")
                    IDENTI(p)
                        ATTRS({'style': '"margin-top:13px;margin-bottom:15px

In [146]:
class DOMElement:
    repr_trunc = 60
    
    def __init__(self, tag):
        self.tag = tag
        self.attributes = {}
        
    def __repr__(self):
        string = f'{self.tag}'
        attributes = ' '.join(f'{k}={v}' for k,v in self.attributes.items())
        if len(attributes) > self.repr_trunc: attributes = attributes[:self.repr_trunc] + '...'
        if attributes:
            string += f'[{attributes}]'
        return string


class CreateDOMElementsVisitor(TreeBuilderVisitor):        
    def visitIDENTI(self, node):
        self.treeDownNew('DOMELEM', DOMElement(node.data))
    
    def exitIDENTI(self, node):
        self.treeUp()
    
    def visitATTRS(self, node):
        self.tree.data.attributes = node.data
    
    def visitSTRING(self, node):
        self.leaf(node)
        
    def visitMULTIL(self, node):
        self.leaf(node)
    

# Traverse with create DOMElements visitor
create_dom_elements_visitor = CreateDOMElementsVisitor()
associated.traverse(create_dom_elements_visitor)
dom_elements = create_dom_elements_visitor.tree
print_tree(dom_elements)

DOMELEM(html)
    DOMELEM(head)
        DOMELEM(title)
            STRING("The DML Website")
        DOMELEM(meta[name="viewport" content="width=device-width,initial-scale=1"])
        DOMELEM(style)
            MULTIL(`
            .spacer {
                margin-top: 30px;
                margin-bottom: 30px;
            }
            .highlighted {
                color: red;
            }
            .my-class {
                font-size: 14pt;
            }
        `)
    DOMELEM(body)
        DOMELEM(h1)
            STRING("Welcome to my website!")
        DOMELEM(p[style="margin-top:13px;margin-bottom:15px;"])
            STRING("this is a paragraph")
            DOMELEM(span[class="highlighted"])
                STRING("This one is highlighted")
            STRING("Here's another paragraph")
        DOMELEM(div[class="my-class"])
            DOMELEM(p)
                STRING("this is another paragraph")
        DOMELEM(div[id="my-spaceer" class="spacer" data-attribute="my-valu

## Code Generation 

In [147]:
class HTMLGenerationVisitor(NodeVisitor):
    def __init__(self, tab='    ', indent=0, newline='\n'):
        self.text = ''
        self.tab = tab
        self.indent = indent
        self.newline = newline
        
    def visitDOMELEM(self, node):
        elem = node.data
        text = f'<{elem.tag}'
        attributes_string = ' '.join(f'{k}={v}' for k,v in elem.attributes.items())
        if attributes_string:
            text += ' ' + attributes_string
        if len(node.children) == 0:
            text += '/'
        text += '>'
        self.write_line(text)
        self.indent += 1
        
    def exitDOMELEM(self, node):
        self.indent -= 1
        if len(node.children) > 0:
            self.write_line(f'</{node.data.tag}>')
            
    def visitSTRING(self, node):
        raw_text = re.search('(?<=").*(?=")', node.data).group(0)
        self.write_line(raw_text)
        
    def visitMULTIL(self, node):
        token = re.compile('(?<=`)[^`]*(?=`)', re.MULTILINE)
        raw_text = token.search(node.data).group(0)
        raw_text = raw_text.strip('\n')
        raw_text = raw_text.rstrip(' \n')
        self.write_raw(raw_text + self.newline)
        
    def write_raw(self, text):
        self.text += text
    
    def write_line(self, text):
        self.text += self.tab*self.indent + text + self.newline
        
        
# Run visitor
html_generation_visitor = HTMLGenerationVisitor()
dom_elements.traverse(html_generation_visitor)
html = html_generation_visitor.text
print(html)

<html>
    <head>
        <title>
            The DML Website
        </title>
        <meta name="viewport" content="width=device-width,initial-scale=1"/>
        <style>
            .spacer {
                margin-top: 30px;
                margin-bottom: 30px;
            }
            .highlighted {
                color: red;
            }
            .my-class {
                font-size: 14pt;
            }
        </style>
    </head>
    <body>
        <h1>
            Welcome to my website!
        </h1>
        <p style="margin-top:13px;margin-bottom:15px;">
            this is a paragraph
            <span class="highlighted">
                This one is highlighted
            </span>
            Here's another paragraph
        </p>
        <div class="my-class">
            <p>
                this is another paragraph
            </p>
        </div>
        <div id="my-spaceer" class="spacer" data-attribute="my-value"/>
        <p>
            <i>
                this 

In [148]:
HTML(html)