In [258]:
from bigtree import list_to_tree, tree_to_dict, tree_to_dot, tree_to_dataframe, dataframe_to_tree
from bigtree import Node as Bnode
import pydot

In [237]:
from lxml import etree
import requests

# Maps the text of each sitemap entry's first child to the text of its second
# child (presumably <loc> -> <lastmod>; confirm against the sitemap schema).
xml_dict = {}

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops us from trying to parse an HTTP error page as XML.
r = requests.get("https://health.gov/sitemap.xml?page=2", timeout=30)
r.raise_for_status()
root = etree.fromstring(r.content)
print(f"The number of sitemap tags are {len(root)}")
for sitemap in root:
    # list(element) is the supported replacement for the deprecated getchildren().
    children = list(sitemap)
    if not children:
        # Nothing to key on (e.g. an empty entry); skip it.
        continue
    # Entries with only one child get None as their value instead of raising IndexError.
    xml_dict[children[0].text] = children[1].text if len(children) > 1 else None

The number of sitemap tags are 524


In [238]:
# Build the directory tree.
# First step: collect the links, i.e. the keys of xml_dict, in insertion order.
link_array = list(xml_dict)


In [239]:
class URLNode:
    """A node in a tree of URL path segments.

    Attributes:
        name: Text of this node (one URL segment, or "/" for the root).
        children: Child URLNode objects, in insertion order.
        nodeid: Identifier supplied by the caller.
    """

    def __init__(self, name, nodeid):
        self.name = name
        self.children = []
        self.nodeid = nodeid

    # for readability
    def get_link_text(self):
        """Return this node's name."""
        return self.name

    def __repr__(self):
        return self.name

    def add_child(self, child_node):
        """Append child_node to this node's children. Returns None."""
        self.children.append(child_node)

    def print_tree(self, level=0):
        """Print the repr of this node's name, then each child one tab deeper.

        Args:
            level: Number of leading tab characters for this node.
        """
        print('\t' * level + repr(self.name))
        for child in self.children:
            child.print_tree(level + 1)
# Links between pages come later: crawl each page and check whether it links to any other pages.
# Those links are not necessarily representative of the website's structure, so this is something to discuss in the meeting.


In [240]:
def root_to_bigtree(root):
    """Recursively convert a tree into bigtree nodes (Bnode).

    Args:
        root: A URLNode (named by ``.name``), an lxml element (named by
            ``.text``), or any other object exposing ``.val`` and ``.children``.

    Returns:
        A Bnode whose children are the converted children of ``root``.
    """
    if isinstance(root, URLNode):
        big_node = Bnode(root.name)
        children = root.children
    elif isinstance(root, etree._Element):
        print('element node!')
        print(root.base)
        big_node = Bnode(root.text)
        # lxml elements have no `.children` attribute; iterating the element
        # yields its direct children.
        children = list(root)
    else:
        big_node = Bnode(root.val)
        children = root.children
    big_node.children = [root_to_bigtree(child) for child in children]
    return big_node

In [241]:

def build_tree_from_urls(urls):
    """Build a URLNode tree from URL strings, one node per "/"-separated segment.

    URLs are processed in sorted order. Each URL is split on "/" and empty
    segments are skipped; a segment reuses the existing child with the same
    name under the current node, or creates a new one.

    Args:
        urls: Iterable of URL strings.

    Returns:
        The root URLNode (name "/", nodeid 0). Every other node gets a unique
        nodeid, assigned from 1 upward in creation order.
    """
    root = URLNode("/", nodeid=0)
    next_id = 1

    for url in sorted(urls):
        current_node = root
        for part in url.split("/"):
            if not part:
                # Empty segment, e.g. between the two slashes of "https://".
                continue
            match = next(
                (child for child in current_node.children if child.name == part),
                None,
            )
            if match is None:
                match = URLNode(part, nodeid=next_id)
                # Advance the counter so no two nodes share an id.
                next_id += 1
                current_node.add_child(match)
            current_node = match

    return root

In [242]:
def print_tree(node, indent="├"):
    """Print the tree under ``node``, one name per line with a depth prefix.

    The node with nodeid 0 (the root) is not printed. Each level of recursion
    extends the prefix by "────", so deeper nodes appear further to the right.

    Args:
        node: Object with ``name``, ``nodeid`` and ``children`` attributes.
        indent: Prefix printed before this node's name.
    """
    if node.nodeid != 0:
        # Emit the accumulated prefix so the nesting depth is visible.
        print(f"{indent} {node.name}")

    for child in node.children:
        print_tree(child, indent + "────")

In [243]:
# Build the URLNode tree from the collected sitemap links.
# Note: this rebinds `root`, which an earlier cell used for the parsed sitemap XML.
root = build_tree_from_urls(link_array)

In [244]:
# A second, independent URLNode tree built from the same links.
# `big_root` is rebound to a bigtree conversion in a later cell.
big_root = build_tree_from_urls(link_array)

In [245]:
# Sanity check: nodes two levels below the root should be URLNode instances.
type(root.children[0].children[0])

__main__.URLNode

In [264]:
# Dump the whole URL tree, one quoted segment per line, tab-indented by depth.
root.print_tree()

'/'
	'https:'
		'health.gov'
			'about-odphp'
				'committees-workgroups'
					'national-clinical-care-commission'
						'meetings'
							'meeting-1'
								'meeting-1-agenda'
							'meeting-4'
								'meeting-4-agenda'
							'meeting-5'
								'meeting-5-agenda'
							'meeting-6'
								'meeting-6-agenda'
							'meeting-7'
								'meeting-7-agenda'
							'meeting-8'
								'meeting-8-agenda'
							'nccc-meeting-10'
								'nccc-meeting-10-agenda'
							'nccc-meeting-11'
								'nccc-meeting-11-agenda'
							'nccc-meeting-12'
								'nccc-meeting-12-agenda'
							'nccc-meeting-9'
								'nccc-meeting-9-agenda'
						'report-congress'
				'contact-us'
					'speaker-request-form'
				'our-director'
				'previous-initiatives'
				'quality-guidelines'
					'linking-policy'
			'espanol'
				'moveyourway'
					'stories'
						'cheryl'
						'jeff'
						'john-patty'
						'joni'
						'nikia'
						'rodriguez-family'
					'widget'
				'myhealthfinder'
					'embarazo'
				

In [247]:
# Same check as in an earlier cell: a node two levels below the root is a URLNode.
type(root.children[0].children[0])

__main__.URLNode

In [248]:
# Confirm that `root` is the URLNode tree at this point in the notebook.
type(root)

__main__.URLNode

In [249]:
# Convert the URLNode tree into a bigtree tree. Converting from `root` rather
# than from `big_root` itself keeps this cell safe to re-run: its input is
# never overwritten by its own output.
big_root = root_to_bigtree(root)

In [257]:
# Render the bigtree tree as a DOT graph with gold nodes and save it as a PNG.
# The path is relative to the working directory; the previous "/sitemaptree.png"
# pointed at the filesystem root, which is usually not writable.
graph = tree_to_dot(big_root, node_colour="gold")
graph.write_png("sitemaptree.png")

<MagicMock name='mock.Dot().write_png()' id='2031635453264'>

In [263]:
import pandas as pd

# Flatten the bigtree tree into a DataFrame.
df = pd.DataFrame(tree_to_dataframe(big_root))

# Let pandas write the file itself. Pushing the to_csv() string through a
# text-mode file handle can double the line endings on Windows.
# Note: pandas writes UTF-8 by default rather than the locale encoding.
df.to_csv("subdir_map.csv", sep=';', header=False, index=False)