## Creating helpful regular expressions

In [1]:
#### Q1: FILL IN THIS CELL
import re
# Opening tag: group 1 is the tag name, group 2 the raw attribute text.
# This pattern also matches self-closing tags (group 2 then ends in "/"),
# so code that needs to tell them apart must try `tag_open_close` first.
tag_open = re.compile(r"<(\w+)\s*(.*?)>")
# Closing tag: group 1 is the tag name, group 2 anything between it and ">".
tag_close = re.compile(r"</(\w+)(.*?)>")
# Self-closing tag.  The (?!<) lookahead stops the attribute text from running
# past the start of a later tag while searching for "/>".
tag_open_close = re.compile(r"<(\w+)\s*((?:(?!<).)*?)\/>")

# Comment; DOTALL lets the comment body span several lines.
comment = re.compile(r"<!--(.*?)-->", re.DOTALL)
# XML prolog / processing instruction.  The body is matched non-greedily so
# that two declarations on the same line are reported separately instead of
# being swallowed into one match.
xml_prolog = re.compile(r"<\?(.*?)\?>")
# DOCTYPE declaration.  HTML treats the keyword case-insensitively
# (`<!doctype html>` is common), hence IGNORECASE.
html_prolog = re.compile(r"<!(DOCTYPE .*?)>", re.IGNORECASE)


## Test data

In [4]:
import requests
# Small hand-written document that exercises every construct the regular
# expressions are meant to recognise: prolog, DOCTYPE, single- and multi-line
# comments, nested tags, attributes and a self-closing tag.
test_snippet = """<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE xml> <!-- not actually valid xml-->
<!-- This is a comment -->
<note date="8/31/12">
    <to>Tove</to>
    <from>Jani</from>
    <heading type="Reminder"/>
    <body>Don't forget me this weekend!</body>
    <!-- This is a multiline comment,
         which take a bit of care to parse -->
</note>
"""

# [NOTE] Comment this out prior to submission
# `.content` is the raw response body as bytes, so str() produces the bytes
# *repr* ("b'...'" with escape sequences such as \n and \' spelled out), not
# decoded text.  The cells below were run against that form.
course_response = requests.get("http://www.datasciencecourse.org/2016")
course_webpage = str(course_response.content)


In [5]:
# Show what each pattern extracts from the hand-written snippet.
snippet_checks = (
    ("tag_open: ", tag_open),
    ("tag_close: ", tag_close),
    ("tag_open_close: ", tag_open_close),
    ("comment: ", comment),
    ("xml_prolog: ", xml_prolog),
    ("html_prolog: ", html_prolog),
)
for label, regex in snippet_checks:
    print(label, regex.findall(test_snippet))

tag_open:  [('note', 'date="8/31/12"'), ('to', ''), ('from', ''), ('heading', 'type="Reminder"/'), ('body', '')]
tag_close:  [('to', ''), ('from', ''), ('body', ''), ('note', '')]
tag_open_close:  [('heading', 'type="Reminder"')]
comment:  [' not actually valid xml', ' This is a comment ', ' This is a multiline comment,\n         which take a bit of care to parse ']
xml_prolog:  ['xml version="1.0" encoding="UTF-8"']
html_prolog:  ['DOCTYPE xml']


In [6]:
# [NOTE] Comment this out prior to submission
# Count how many times each pattern fires on the downloaded course page.
webpage_checks = (
    ("tag_open: ", tag_open),
    ("tag_close: ", tag_close),
    ("tag_open_close: ", tag_open_close),
    ("comment: ", comment),
    ("xml_prolog: ", xml_prolog),
    ("html_declaration: ", html_prolog),
)
for label, regex in webpage_checks:
    print(label, len(regex.findall(course_webpage)))

# Uncomment to inspect the individual self-closing matches:
# print(tag_open_close.findall(course_webpage))

tag_open:  469
tag_close:  439
tag_open_close:  30
comment:  23
xml_prolog:  0
html_declaration:  1


## XML Parser class

In [7]:
class XMLNode:
    """
    One element of an XML/HTML document together with the sub-tree below it.

    Constructing a node parses `content` immediately and recursively, using
    the module-level patterns `comment`, `xml_prolog`, `html_prolog`,
    `tag_open`, `tag_open_close` and `tag_close`.

    Attributes:
        tag (string): tag name of this element ("" is used for the document root)
        attributes (dictionary): attribute name -> attribute value
        children (list): child XMLNode objects, in document order
        content (string): text between this element's opening and closing tag
            (the whole input if no closing tag is found)
        endpos (int): number of characters of the constructor's `content`
            argument that belong to this element, closing tag included
    """

    # One `name="value"` or `name='value'` pair inside an opening tag;
    # whitespace around the "=" is tolerated.
    _ATTRIBUTE = re.compile(r"""([^\s=<>/"']+)\s*=\s*(?:"([^"]*)"|'([^']*)')""")

    def __init__(self, tag, attributes, content):
        """
        Build the node and parse its children out of `content`.

        Args:
            tag (string): name of this element
            attributes (dictionary): attributes of this element
            content (string): document text that starts right after this
                element's opening tag; it may extend past the closing tag

        Raises:
            Exception: if a closing tag is met whose name differs from `tag`
        """
        self.tag = tag
        self.attributes = attributes
        self.children = []
        self.content = content
        self.endpos = 0

        pos = 0
        while True:
            pos = content.find("<", pos)
            if pos == -1:
                break

            # Comments, prologs and DOCTYPE declarations produce no nodes.
            skipped = (comment.match(content, pos)
                       or xml_prolog.match(content, pos)
                       or html_prolog.match(content, pos))
            if skipped is not None:
                pos = skipped.end()
                continue

            opening = tag_open.match(content, pos)
            if opening is not None:
                # tag_open also matches "<x ... />", so check for the
                # self-closing form before treating this as a plain opening tag.
                self_closing = tag_open_close.match(content, pos)
                if self_closing is not None:
                    opening, inner = self_closing, ""
                else:
                    inner = content[opening.end():]
                child = XMLNode(opening.group(1),
                                self._parse_attributes(opening.group(2)),
                                inner)
                self.children.append(child)
                # Resume right after everything the child consumed.
                pos = opening.end() + child.endpos
                continue

            closing = tag_close.match(content, pos)
            if closing is not None:
                if closing.group(1) != tag:
                    raise Exception("Error: <{0}> tag closed with {1}".format(tag, closing.group()))
                self.content = content[:closing.start()]
                self.endpos = closing.end()
                return

            # A "<" that starts none of the constructs above (e.g. "1 < 2").
            # Step over it; leaving `pos` unchanged would loop forever.
            pos += 1

        # No closing tag was found (always the case for the root, and for a
        # self-closing element whose content is ""): the whole input belongs
        # to this node, so the parent must not parse any of it a second time.
        self.endpos = len(content)

    @classmethod
    def _parse_attributes(cls, raw):
        """Turn the raw text that follows a tag name into a {name: value} dictionary."""
        if not raw:
            return {}
        # Text obtained through str(bytes) spells apostrophes as \' ; map them
        # to plain double quotes so such attribute values are still recognised.
        normalized = raw.replace("\\'", '"')
        parsed = {}
        for pair in cls._ATTRIBUTE.finditer(normalized):
            double_quoted, single_quoted = pair.group(2), pair.group(3)
            parsed[pair.group(1)] = double_quoted if double_quoted is not None else single_quoted
        return parsed

    def find(self, tag, **kwargs):
        """
        Search for a given tag and attributes anywhere in the XML tree

        A node matches when its tag equals `tag` and, for every keyword
        argument, it has an attribute of that name whose value is exactly
        equal to the given value.  This node itself is included in the search.

        Args:
            tag (string): tag to match
            kwargs (dictionary): list of attribute name / attribute value pairs to match

        Returns:
            (list): a list of XMLNode objects that match from anywhere in the tree
        """
        matches = []
        if self.tag == tag and all(
                name in self.attributes and self.attributes[name] == value
                for name, value in kwargs.items()):
            matches.append(self)
        for child in self.children:
            matches.extend(child.find(tag, **kwargs))
        return matches


In [8]:
def show_node(label, node):
    """Print the tag, attributes, content (as a repr) and children of one node."""
    print("{0}.tag: ".format(label), node.tag)
    print("{0}.attributes: ".format(label), node.attributes)
    print("{0}.content: ".format(label), repr(node.content))
    print("{0}.children: ".format(label), node.children)


# Parse the hand-written snippet and walk down to a few known nodes.
root = XMLNode("", {}, test_snippet)

show_node("root", root)
print("")
show_node("note", root.children[0])
print("")
show_node("to", root.children[0].children[0])
print("")
show_node("heading", root.children[0].children[2])

root.tag:  
root.attributes:  {}
root.content:  '<?xml version="1.0" encoding="UTF-8"?>\n<!DOCTYPE xml> <!-- not actually valid xml-->\n<!-- This is a comment -->\n<note date="8/31/12">\n    <to>Tove</to>\n    <from>Jani</from>\n    <heading type="Reminder"/>\n    <body>Don\'t forget me this weekend!</body>\n    <!-- This is a multiline comment,\n         which take a bit of care to parse -->\n</note>\n'
root.children:  [<__main__.XMLNode object at 0x106ae56a0>]

note.tag:  note
note.attributes:  {'date': '8/31/12'}
note.content:  '\n    <to>Tove</to>\n    <from>Jani</from>\n    <heading type="Reminder"/>\n    <body>Don\'t forget me this weekend!</body>\n    <!-- This is a multiline comment,\n         which take a bit of care to parse -->\n'
note.children:  [<__main__.XMLNode object at 0x106ae5710>, <__main__.XMLNode object at 0x106ae57b8>, <__main__.XMLNode object at 0x106ae5898>, <__main__.XMLNode object at 0x106ae5908>]

to.tag:  to
to.attributes:  {}
to.content:  'Tove'
to.children

In [11]:
def total_count(n):
    """ Gets the total number of nodes in an XMLNode tree.

    Every descendant of `n` is counted once; `n` itself is not counted.
    """
    count = 0
    for child in n.children:
        # one for the child itself, plus everything beneath it
        count += 1 + total_count(child)
    return count

# Parse the downloaded course page and report how many nodes hang below the
# root (the root itself is not part of the count).
root = XMLNode(tag="", attributes={}, content=course_webpage)
print(total_count(root))


467


## Searching for tags

One of the nicer features of the `BeautifulSoup` library is the ability to quickly search for tags that have certain attributes, without worrying about the specific structure of the document (i.e., how many levels deep the tag is, how many such tags exist in the document, etc.).  We're going to implement a similar function in our `XMLNode` class, specifically a function of the following form.

In [127]:
# [NOTE] Comment this out prior to submission
# 
# Get a list of all links on the page
links = root.find("a")
hrefs = []
for link in links:
    hrefs.append(link.attributes["href"])
print(hrefs)

# Get a list of all lecture dates for the course: the first cell of every
# schedule-table row that has more than one cell.
schedule_sections = root.find("section", id="schedule")
print(schedule_sections)
schedule_table = schedule_sections[0].find("table")[0]
tbody = schedule_table.find("tbody")[0]
first_cells = []
for row in tbody.find("tr"):
    cells = row.find("td")
    if len(cells) > 1:
        first_cells.append(cells[0].content)
print(first_cells)