In [33]:
from pprint import pprint
from IPython.display import display
from lxml import etree
import xml.etree.ElementTree as ET
from copy import deepcopy

# The Element class

In [2]:
# Build a free-standing element; the constructor argument is its tag name.
root = etree.Element('root')
print(root.tag)

# Bare trailing expression so the notebook echoes the element object itself.
root


In [3]:
# Create a new element and attach it as a child of root.
root.append(etree.Element('child1'))

In [4]:
# SubElement() creates a child and attaches it to the given parent in one call.
etree.SubElement(root, 'child2')
# Left as the final expression, so the new child3 element is the cell's displayed value.
etree.SubElement(root, 'child3')

<Element child3 at 0x25ee80f4d48>

In [5]:
# tostring() returns bytes here, so print() shows the bytes repr: the line breaks
# added by pretty_print appear as literal \n escapes rather than as new lines.
print(etree.tostring(root, pretty_print=True))

b'<root>\n  <child1/>\n  <child2/>\n  <child3/>\n</root>\n'


In [6]:
# Elements support the usual sequence operations over their children.
print(len(root))
print(list(root))

# Positive and negative indexing both work, as does index().
first_child, last_child = root[0], root[-1]
print(first_child.tag)
print(last_child.tag)
print(root.index(first_child))

# Iterating an element yields its direct children in order.
for child in root:
    print(child.tag)

# Prepend a new element and confirm it now sits at position 0.
child0 = etree.Element('child0')
root.insert(0, child0)
print(root[0].tag)

3
[<Element child1 at 0x25ee810e548>, <Element child2 at 0x25ee810e588>, <Element child3 at 0x25ee80f4d48>]
child1
child3
0
child1
child2
child3
child0


In [7]:
# Type check first, then an explicit child-count check.
if etree.iselement(root):
    print('root is some kind of element.')
# Test the child count explicitly instead of relying on the element's truthiness.
if len(root) > 0:
    print('root has children.')

root is some kind of element.
root has children.


## Elements are lists

In [8]:
# A child knows its parent: getparent() on the first child gives back root itself.
root[0].getparent() is root

True

In [9]:
# Copy the first child into another tree; deepcopy leaves the original where it is.
element = etree.Element('neu')
child_copy = deepcopy(root[0])
element.append(child_copy)
print(element[0].tag)

# root still lists all of its children after the copy was appended elsewhere.
root_tags = [node.tag for node in root]
print(root_tags)

child0
['child0', 'child1', 'child2', 'child3']


In [10]:
# Sibling navigation: getprevious()/getnext() step between adjacent children.
first_child, second_child = root[0], root[1]
print(second_child.getprevious() is first_child)
print(first_child.getnext() is second_child)

True
True


## Elements carry attributes as a dict

In [11]:
# Keyword arguments to Element() become XML attributes on the new element.
root = etree.Element('root', interesting='totally')
# Final expression: the serialized bytes are the cell's displayed value.
etree.tostring(root)

b'<root interesting="totally"/>'

In [12]:
# get() looks up an attribute value by name.
print(root.get('interesting'))

totally


In [13]:
# No 'hello' attribute has been set yet, so get() yields None instead of raising.
print(root.get('hello'))

None


In [14]:
# set() creates the attribute, so the same lookup now returns its value.
root.set('hello', 'haha')
print(root.get('hello'))

haha


In [15]:
# Serialize again: the markup now carries both attributes.
etree.tostring(root)

b'<root interesting="totally" hello="haha"/>'

In [16]:
# Snapshot the attributes into a plain, independent dict built from (name, value) pairs.
attrib = dict(root.items())
print(attrib)

{'hello': 'haha', 'interesting': 'totally'}


In [17]:
# The element exposes dict-style accessors for its attributes: names, values, pairs.
for accessor in (root.keys, root.values, root.items):
    print(accessor())

['interesting', 'hello']
['totally', 'haha']
[('interesting', 'totally'), ('hello', 'haha')]


## Elements contain text

In [18]:
# Start over with a fresh root element.
root = etree.Element('root')
# .text holds the character data placed directly inside the element.
root.text = 'Text'
print(root.text)
# Final expression: the serialized form, with the text between the tags.
etree.tostring(root)

Text


b'<root>Text</root>'

In [19]:
# Step 1: an <html> element containing a <body> whose text is 'Hello'.
html = etree.Element('html')
body = etree.SubElement(html, 'body')
body.text = 'Hello'
print(etree.tostring(html))

# Step 2: add an empty <br/> child; it is serialized after the body's own text.
br = etree.SubElement(body, 'br')
print(etree.tostring(html))

# Step 3: .tail is the text that directly follows the element, inside its parent.
br.tail = 'World'
print(etree.tostring(html))

b'<html><body>Hello</body></html>'
b'<html><body>Hello<br/></body></html>'
b'<html><body>Hello<br/>World</body></html>'


In [20]:
# By default an element is serialized together with its tail text ...
print(etree.tostring(br))
# ... while with_tail=False emits the element alone.
print(etree.tostring(br, with_tail=False))

b'<br/>World'
b'<br/>'


In [21]:
# method='text' drops the markup and keeps only the text content (text and tails).
print(etree.tostring(html, method='text'))

b'HelloWorld'


## Using XPath to find text

## Tree Iteration

In [24]:
# Build a small tree to iterate over: two <child> elements and one <another>.
root = etree.Element('root')
child_specs = (
    ('child', 'Child 1'),
    ('child', 'Child 2'),
    ('another', 'Child 3'),
)
for tag, text in child_specs:
    etree.SubElement(root, tag).text = text

# Printing the bytes shows the pretty-printed line breaks as \n escapes.
print(etree.tostring(root, pretty_print=True))

b'<root>\n  <child>Child 1</child>\n  <child>Child 2</child>\n  <another>Child 3</another>\n</root>\n'


In [26]:
# With no arguments, iter() visits the root first and then every descendant in order.
for element in root.iter():
    print('{} {}'.format(element.tag, element.text))

root None
child Child 1
child Child 2
another Child 3


In [27]:
# Passing a tag name restricts the iteration to elements with that tag.
child_elements = root.iter('child')
for element in child_elements:
    print('{} {}'.format(element.tag, element.text))

child Child 1
child Child 2


In [28]:
# Several tag names may be given at once; elements matching any of them are visited.
wanted_tags = ('child', 'another')
for element in root.iter(*wanted_tags):
    print('{} {}'.format(element.tag, element.text))

child Child 1
child Child 2
another Child 3


In [29]:
# Trees can also hold non-element nodes: an entity reference and a comment.
entity = etree.Entity('#123')
comment = etree.Comment('Some comment')
root.append(entity)
root.append(comment)
print(etree.tostring(root))

b'<root><child>Child 1</child><child>Child 2</child><another>Child 3</another>&#123;<!--Some comment--></root>'


In [30]:
# Only real elements have a string tag; entities and comments are reported as specials.
for item in root.iter():
    if not isinstance(item.tag, str):
        print('Special:', item, item.text)
        continue
    print(item.tag, item.text)

root None
child Child 1
child Child 2
another Child 3
Special: &#123; &#123;
Special: <!--Some comment--> Some comment


In [31]:
# Passing the Element factory as the tag filter skips the entity and the comment.
elements_only = root.iter(tag=etree.Element)
for element in elements_only:
    print('{} {}'.format(element.tag, element.text))

root None
child Child 1
child Child 2
another Child 3


## Serialization

In [36]:
# Serialize the same tree with different tostring() options, one print per variant.
root = etree.XML('<root><a><b/></a></root>')
serializer_options = (
    {},                          # defaults
    {'xml_declaration': True},   # prepend an XML declaration
    {'encoding': 'utf-8'},       # explicit output encoding
    {'pretty_print': True},      # indented; bytes repr shows the breaks as \n
)
for options in serializer_options:
    print(etree.tostring(root, **options))

b'<root><a><b/></a></root>'
b"<?xml version='1.0' encoding='ASCII'?>\n<root><a><b/></a></root>"
b'<root><a><b/></a></root>'
b'<root>\n  <a>\n    <b/>\n  </a>\n</root>\n'


In [38]:
# The same document serialized as HTML (<br> is left unclosed) and as plain text.
root = etree.XML('<html><body>Hello<br/>World</body></html>')
for method in ('html', 'text'):
    print(etree.tostring(root, method=method))

b'<html><body>Hello<br>World</body></html>'
b'HelloWorld'


# The ElementTree class

In [61]:
# Parse a document that carries an XML declaration, a DOCTYPE and an internal entity.
root = etree.XML('''\
<?xml version="1.0"?>
<!DOCTYPE root SYSTEM "test" [ <!ENTITY tasty "parsnips"> ]>
<root>
<a>&tasty;</a>
</root>''')
# Serializing the bare root element omits the DOCTYPE.
print(etree.tostring(root))

# Wrapping the root in an ElementTree serializes the document including its DOCTYPE;
# getroot() hands back the element again.
tree = etree.ElementTree(root)
print(etree.tostring(tree))
print(etree.tostring(tree.getroot()))

# Document-level metadata is exposed through the tree's docinfo object.
docinfo = tree.docinfo
print(docinfo.xml_version)
print(docinfo.encoding)

print('---')
# Dump every docinfo attribute except the double-underscore names.
public_names = [p for p in dir(docinfo) if not p.startswith('__')]
for p in public_names:
    print(p, getattr(docinfo, p))

b'<root>\n<a>parsnips</a>\n</root>'
b'<!DOCTYPE root SYSTEM "test" [\n<!ENTITY tasty "parsnips">\n]>\n<root>\n<a>parsnips</a>\n</root>'
b'<root>\n<a>parsnips</a>\n</root>'
1.0
ISO-8859-1
---
URL None
clear <built-in method clear of lxml.etree.DocInfo object at 0x0000025EE80E4470>
doctype <!DOCTYPE root SYSTEM "test">
encoding ISO-8859-1
externalDTD None
internalDTD <lxml.etree.DTD object at 0x0000025EE81B7588>
public_id None
root_name root
standalone False
system_url test
xml_version 1.0


# Parsing from strings and files