In [2]:
from pprint import pprint
from IPython.display import display
from lxml import etree
import xml.etree.ElementTree as ET
from copy import deepcopy

# The Element class

In [3]:
# Create a standalone element; its tag name is available as `.tag`.
root = etree.Element('root')
print(root.tag)

# Left as the cell's last expression so the notebook can display the element.
root


In [4]:
# Build a new element first, then attach it to root as a child.
root.append(etree.Element('child1'))

In [5]:
# SubElement() creates the element and attaches it to the given parent in one
# call. It also returns the new element, which is why the notebook displays
# the result of the last call.
etree.SubElement(root, 'child2')
etree.SubElement(root, 'child3')

<Element child3 at 0x2cee7a7d988>

In [6]:
# tostring() returns bytes, so print() shows the bytes repr with escaped '\n'
# rather than an indented document; decoding the result first (as the
# namespace cells further down do) prints the pretty-printed form.
print(etree.tostring(root, pretty_print=True))

b'<root>\n  <child1/>\n  <child2/>\n  <child3/>\n</root>\n'


In [7]:
# An element acts like a list of its children: len(), list(), positive and
# negative indexing, index() and plain iteration are all supported.
first_child, last_child = root[0], root[-1]
print(len(root))
print(list(root))
print(first_child.tag)
print(last_child.tag)
print(root.index(first_child))
for node in root:
    print(node.tag)

# insert() puts a new child at the given position, as list.insert() would.
child0 = etree.Element('child0')
root.insert(0, child0)
print(root[0].tag)

3
[<Element child1 at 0x2cee7aa5988>, <Element child2 at 0x2cee7aa59c8>, <Element child3 at 0x2cee7a7d988>]
child1
child3
0
child1
child2
child3
child0


In [8]:
# Two independent checks: "is this an element at all?" and "does it have
# children?". The child count is tested explicitly via len().
is_element = etree.iselement(root)
has_children = len(root) > 0

if is_element:
    print('root is some kind of element.')
if has_children:
    print('root has children.')

root is some kind of element.
root has children.


## Elements are lists

In [9]:
# getparent() on a child hands back the very same object as root
# (identity check, not just equality).
root is root[0].getparent()

True

In [10]:
# Append a deep copy of root's first child to a new element; the second print
# confirms that root still has all of its own children afterwards.
element = etree.Element('neu')
copied_child = deepcopy(root[0])
element.append(copied_child)

print(element[0].tag)
print([node.tag for node in root])

child0
['child0', 'child1', 'child2', 'child3']


In [11]:
# Siblings are linked in both directions: getprevious() / getnext() return
# the neighbouring child objects themselves.
first, second = root[0], root[1]
print(first is second.getprevious())
print(second is first.getnext())

True
True


## Elements carry attributes as a dict

In [12]:
# Keyword arguments passed to Element() become XML attributes of the element.
root = etree.Element('root', interesting='totally')
etree.tostring(root)

b'<root interesting="totally"/>'

In [13]:
# get() looks up an attribute value by name.
print(root.get('interesting'))

totally


In [14]:
# Asking for an attribute that is not set yields None instead of raising.
print(root.get('hello'))

None


In [15]:
# set() adds the attribute (or overwrites an existing one); get() then sees it.
root.set('hello', 'haha')
print(root.get('hello'))

haha


In [16]:
# The attribute added via set() shows up in the serialized element.
etree.tostring(root)

b'<root interesting="totally" hello="haha"/>'

In [17]:
# Snapshot the element's attributes into a plain, independent dict.
attrib = dict(root.attrib.items())
print(attrib)

{'hello': 'haha', 'interesting': 'totally'}


In [18]:
# Elements expose dict-style accessors for their attributes:
# names, values, and (name, value) pairs.
for attribute_view in (root.keys(), root.values(), root.items()):
    print(attribute_view)

['interesting', 'hello']
['totally', 'haha']
[('interesting', 'totally'), ('hello', 'haha')]


## Elements contain text

In [19]:
# `.text` holds the text content directly inside the element.
root = etree.Element('root')
root.text = 'Text'
print(root.text)
# Last expression: the notebook displays the serialized element.
etree.tostring(root)

Text


b'<root>Text</root>'

In [20]:
# Mixed content: text before the first child lives in the parent's `.text` ...
html = etree.Element('html')
body = etree.SubElement(html, 'body')
body.text = 'Hello'
print(etree.tostring(html))

# ... an empty child element is serialized as a self-closing tag ...
br = etree.SubElement(body, 'br')
print(etree.tostring(html))

# ... and text that follows a child is stored on that child as `.tail`.
br.tail = 'World'
print(etree.tostring(html))

b'<html><body>Hello</body></html>'
b'<html><body>Hello<br/></body></html>'
b'<html><body>Hello<br/>World</body></html>'


In [21]:
# Serializing a single element includes its tail text by default;
# with_tail=False leaves it out.
for keep_tail in (True, False):
    print(etree.tostring(br, with_tail=keep_tail))

b'<br/>World'
b'<br/>'


In [22]:
# method='text' serializes only the text content and drops all markup.
print(etree.tostring(html, method='text'))

b'HelloWorld'


## Using XPath to find text

## Tree iteration

In [23]:
# Build a small tree: three children, each created with its text content.
root = etree.Element('root')
children_spec = (
    ('child', 'Child 1'),
    ('child', 'Child 2'),
    ('another', 'Child 3'),
)
for tag_name, content in children_spec:
    etree.SubElement(root, tag_name).text = content

print(etree.tostring(root, pretty_print=True))

b'<root>\n  <child>Child 1</child>\n  <child>Child 2</child>\n  <another>Child 3</another>\n</root>\n'


In [24]:
# iter() without arguments walks the whole tree in document order,
# starting with the element it is called on.
for node in root.iter():
    print(node.tag, node.text)

root None
child Child 1
child Child 2
another Child 3


In [25]:
# Passing a tag name restricts the walk to elements with that tag.
only_child_tags = root.iter('child')
for node in only_child_tags:
    print(node.tag, node.text)

child Child 1
child Child 2


In [26]:
# Several tag names may be given at once; elements matching any of them
# are yielded.
wanted_tags = ('child', 'another')
for node in root.iter(*wanted_tags):
    print(node.tag, node.text)

child Child 1
child Child 2
another Child 3


In [27]:
# A tree can also hold non-element nodes: here an entity reference and a
# comment are appended after the existing children.
entity_ref = etree.Entity('#123')
comment = etree.Comment('Some comment')
root.append(entity_ref)
root.append(comment)

print(etree.tostring(root))

b'<root><child>Child 1</child><child>Child 2</child><another>Child 3</another>&#123;<!--Some comment--></root>'


In [28]:
# iter() also yields the special nodes. Only real elements have a string
# tag, so anything else is reported as "Special".
for node in root.iter():
    if not isinstance(node.tag, str):
        print('Special:', node, node.text)
        continue
    print(node.tag, node.text)

root None
child Child 1
child Child 2
another Child 3
Special: &#123; &#123;
Special: <!--Some comment--> Some comment


In [29]:
# Passing the Element factory as the tag filter limits iteration to real
# elements, skipping the entity and the comment added above.
elements_only = root.iter(tag=etree.Element)
for node in elements_only:
    print(node.tag, node.text)

root None
child Child 1
child Child 2
another Child 3


## Serialization

In [30]:
# Serialize the same tree with different tostring() options:
# defaults, with an XML declaration, with an explicit encoding, pretty-printed.
root = etree.XML('<root><a><b/></a></root>')

serializer_options = (
    {},
    {'xml_declaration': True},
    {'encoding': 'utf-8'},
    {'pretty_print': True},
)
for options in serializer_options:
    print(etree.tostring(root, **options))

b'<root><a><b/></a></root>'
b"<?xml version='1.0' encoding='ASCII'?>\n<root><a><b/></a></root>"
b'<root><a><b/></a></root>'
b'<root>\n  <a>\n    <b/>\n  </a>\n</root>\n'


In [31]:
# The `method` argument selects the output format: 'html' writes <br> without
# the self-closing slash, 'text' keeps only the text content.
root = etree.XML('<html><body>Hello<br/>World</body></html>')
for output_method in ('html', 'text'):
    print(etree.tostring(root, method=output_method))

b'<html><body>Hello<br>World</body></html>'
b'HelloWorld'


# The ElementTree class

In [32]:
# Parse a document that carries a DOCTYPE with an internal entity definition.
root = etree.XML('''\
<?xml version="1.0"?>
<!DOCTYPE root SYSTEM "test" [ <!ENTITY tasty "parsnips"> ]>
<root>
<a>&tasty;</a>
</root>''')
print(etree.tostring(root))

# Wrapping the root in an ElementTree serializes the document-level parts
# (the DOCTYPE) as well; serializing the root element alone does not.
tree = etree.ElementTree(root)
print(etree.tostring(tree))
print(etree.tostring(tree.getroot()))

# Document-level metadata is available through `docinfo`.
docinfo = tree.docinfo
print(docinfo.xml_version)
print(docinfo.encoding)

# Dump every non-dunder attribute of docinfo together with its value.
print('---')
public_names = [name for name in dir(docinfo) if not name.startswith('__')]
for name in public_names:
    print(name, getattr(docinfo, name))

b'<root>\n<a>parsnips</a>\n</root>'
b'<!DOCTYPE root SYSTEM "test" [\n<!ENTITY tasty "parsnips">\n]>\n<root>\n<a>parsnips</a>\n</root>'
b'<root>\n<a>parsnips</a>\n</root>'
1.0
ISO-8859-1
---
URL None
clear <built-in method clear of lxml.etree.DocInfo object at 0x000002CEE7AA7278>
doctype <!DOCTYPE root SYSTEM "test">
encoding ISO-8859-1
externalDTD None
internalDTD <lxml.etree.DTD object at 0x000002CEE7AC72C8>
public_id None
root_name root
standalone False
system_url test
xml_version 1.0


# Parsing from strings and files

## The fromstring() function

In [33]:
# fromstring() parses a string and returns the resulting root element.
root = etree.fromstring('<root>data</root>')
print(etree.tostring(root))

b'<root>data</root>'


## The XML() function

In [34]:
# XML() gives the same result as fromstring() for this input.
root = etree.XML('<root>data</root>')
print(etree.tostring(root))

b'<root>data</root>'


In [35]:
# HTML() parses an HTML fragment; as the output shows, the missing
# <html> and <body> wrappers are added around the fragment.
html = etree.HTML('<p>Hello, World!</p>')
print(etree.tostring(html))

b'<html><body><p>Hello, World!</p></body></html>'


## The parse() function

In [36]:
# NOTE(review): BytesIO is imported here, mid-notebook, and is reused by the
# event-driven parsing cells further down; consider moving it to the import
# cell at the top.
from io import BytesIO
# parse() reads from a file-like object. Despite the variable name, the
# result is a tree wrapper, not the root element (hence getroot() below).
root = etree.parse(BytesIO(b'<root>data</root>'))
print(etree.tostring(root))
print(etree.tostring(root.getroot()))

b'<root>data</root>'
b'<root>data</root>'


## Parser objects

In [37]:
# A parser object carries parsing options. remove_blank_text=True drops the
# whitespace-only text between elements; as the output shows, the
# whitespace inside <b> (a leaf element) is kept.
parser = etree.XMLParser(remove_blank_text=True)
root = etree.XML('<root>   <a/>   <b>   </b>   </root>', parser)
print(etree.tostring(root))

b'<root><a/><b>   </b></root>'


In [38]:
# Clear the remaining whitespace-only text by hand: every element whose text
# is present but blank after stripping gets its text reset to None.
for node in root.iter('*'):
    content = node.text
    if content is None or content.strip():
        continue
    node.text = None

print(etree.tostring(root))

b'<root><a/><b/></root>'


## Incremental parsing

In [39]:
class DataSource:
    """Minimal file-like object that serves an XML document in small chunks.

    Only a ``read(size)`` method is provided; each call hands out the next
    chunk regardless of the requested size.

    Each instance works on its own copy of ``data``, so creating a second
    instance (or re-running the cell that uses it) serves the full document
    again instead of finding the shared class-level list already drained.
    """

    # Chunks of the document in the order they are served. This class-level
    # list is a template only and is never mutated.
    data = [ b"<roo", b"t><", b"a/", b"><", b"/root>" ]

    def __init__(self):
        # Per-instance queue of chunks that have not been served yet.
        self._pending = list(self.data)

    def read(self, requested_size):
        """Return the next chunk, or ``b''`` once all chunks are consumed.

        ``requested_size`` is accepted for file-like compatibility but is
        ignored: chunks are always returned whole.
        """
        if self._pending:
            return self._pending.pop(0)
        return b''

# parse() accepts any object with a read() method; here the document arrives
# in the small chunks handed out by DataSource.
tree = etree.parse(DataSource())

# Last expression: the notebook displays the serialized document.
etree.tostring(tree)

b'<root><a/></root>'

In [40]:
# Feed-parser interface: push the document into the parser piece by piece,
# then call close() to obtain the root element.
parser = etree.XMLParser()

for fragment in ('<roo', 't><', 'a/', '><', '/root>'):
    parser.feed(fragment)

root = parser.close()
print(etree.tostring(root))

b'<root><a/></root>'


In [41]:
# After close(), the same parser object (created in the previous cell)
# can be fed a new document.
parser.feed('<root/>')
root = parser.close()
print(etree.tostring(root))

b'<root/>'


## Event-driven parsing

In [42]:
# iterparse() yields (event, element) pairs while reading the input.
# With default settings only 'end' events are reported.
some_file_like = BytesIO(b'<root><a>Data</a></root>')
parse_events = etree.iterparse(some_file_like)
for action, node in parse_events:
    print(action, node.tag, node.text)

end a Data
end root None


In [43]:
# Ask explicitly for both 'start' and 'end' events.
some_file_like = BytesIO(b'<root><a>Data</a></root>')
events_of_interest = ('start', 'end')
for action, node in etree.iterparse(some_file_like, events=events_of_interest):
    print(action, node.tag, node.text)

start root None
start a Data
end a Data
end root None


In [44]:
# Process elements as they are completed and clear each <a> afterwards,
# so its content does not have to stay in memory.
some_file_like = BytesIO(b'<root><a><b>Data</b></a><a><b/></a></root>')

for _event, node in etree.iterparse(some_file_like):
    tag_name = node.tag
    if tag_name == 'a':
        print('cleaning')
        node.clear()
    elif tag_name == 'b':
        print(node.text)

Data
cleaning
None
cleaning


In [45]:
# The `tag` argument restricts the reported events to <a> elements. Each one
# is read out (its <b> text and the text of its second child) and then cleared.
some_file_like = BytesIO(b'''\
<root>
    <a><b>ABC</b><c>abc</c></a>
    <a><b>MORE DATA</b><c>more data</c></a>
    <a><b>XYZ</b><c>xyz</c></a>
</root>
''')

for _, record in etree.iterparse(some_file_like, tag='a'):
    b_text = record.findtext('b')
    c_text = record[1].text
    print(b_text, c_text)
    record.clear()

ABC abc
MORE DATA more data
XYZ xyz


## Namespaces

In [47]:
# Namespaced tags are written in '{namespace-uri}localname' form. As the
# output shows, a prefix is generated for the namespace on serialization.
xhtml = etree.Element('{http://www.w3.org/1999/xhtml}html')
xhtml_body = etree.SubElement(xhtml, '{http://www.w3.org/1999/xhtml}body')
xhtml_body.text = 'Hello, World!'

pretty_bytes = etree.tostring(xhtml, pretty_print=True)
print(pretty_bytes.decode('utf-8'))

<html:html xmlns:html="http://www.w3.org/1999/xhtml">
  <html:body>Hello, World!</html:body>
</html:html>



In [58]:
# Namespace URI, the matching '{uri}' tag prefix, and a prefix map that
# declares the URI as the default namespace (key None).
XHTMLNS = 'http://www.w3.org/1999/xhtml'
XHTML = '{{{}}}'.format(XHTMLNS)
NSMAP = {None: XHTMLNS}

xhtml = etree.Element(XHTML + 'html', nsmap=NSMAP)
# NOTE(review): the child is created as plain 'body', not XHTML + 'body', so
# it is NOT in the XHTML namespace even though the serialized output below
# makes it look that way. The iteration cell near the end of this section
# confirms its tag is just 'body'. Presumably XHTML + 'body' was intended;
# confirm before changing, as the later recorded outputs depend on it.
etree.SubElement(xhtml, 'body').text = 'Hello, World!'
print(etree.tostring(xhtml, pretty_print=True).decode('utf-8'))

<html xmlns="http://www.w3.org/1999/xhtml">
  <body>Hello, World!</body>
</html>



In [59]:
# QName built from a namespace URI and a local name; it exposes both parts
# plus the combined '{uri}name' text.
tag = etree.QName('http://www.w3.org/1999/xhtml', 'html')
for part in (tag.namespace, tag.localname, tag.text):
    print(part)

http://www.w3.org/1999/xhtml
html
{http://www.w3.org/1999/xhtml}html


In [60]:
# QName built from an already qualified '{uri}name' string.
tag = etree.QName('{http://www.w3.org/1999/xhtml}html')
print(tag.namespace, tag.localname, tag.text, sep='\n')

http://www.w3.org/1999/xhtml
html
{http://www.w3.org/1999/xhtml}html


In [61]:
# QName can also be derived from an element, taking over the element's tag.
root = etree.Element('{http://www.w3.org/1999/xhtml}html')
tag = etree.QName(root)
for part in (tag.namespace, tag.localname, tag.text):
    print(part)

http://www.w3.org/1999/xhtml
html
{http://www.w3.org/1999/xhtml}html


In [62]:
# Element plus a local name: the namespace comes from the element,
# the local name from the second argument.
tag = etree.QName(root, 'script')
print(tag.namespace, tag.localname, tag.text, sep='\n')

http://www.w3.org/1999/xhtml
script
{http://www.w3.org/1999/xhtml}script


In [63]:
# Qualified name plus a local name: the namespace is taken from the first
# argument, the local name from the second.
tag = etree.QName('{http://www.w3.org/1999/xhtml}html', 'script')
qname_parts = (tag.namespace, tag.localname, tag.text)
for part in qname_parts:
    print(part)

http://www.w3.org/1999/xhtml
script
{http://www.w3.org/1999/xhtml}script


In [64]:
# nsmap exposes the prefix -> namespace URI mapping known at this element;
# the None key stands for the default (unprefixed) namespace.
xhtml.nsmap

{None: 'http://www.w3.org/1999/xhtml'}

In [67]:
# Prefix maps are inherited: the child's nsmap contains its own declaration
# in addition to the one made on its parent.
root = etree.Element('root', nsmap={'a': 'http://a'})
child = etree.SubElement(root, 'child', nsmap={'b': 'http://b'})
print(etree.tostring(root, pretty_print=True).decode('utf-8'))

for node in (root, child):
    print(node.nsmap)

<root xmlns:a="http://a">
  <child xmlns:b="http://b"/>
</root>

{'a': 'http://a'}
{'a': 'http://a', 'b': 'http://b'}


In [72]:
# NOTE(review): `body` is not defined anywhere in this section. Judging by
# the recorded output (Hello<br/>World) it is presumably the <body> element
# built in the "Elements contain text" section -- confirm, or define it in
# this cell so the notebook survives Restart & Run All.
# Attributes can be namespaced too, using the same '{uri}name' key form.
body.set(XHTML + 'bgcolor', '#ABC')
print(etree.tostring(body, pretty_print=True).decode('utf-8'))

# The plain name does not match the namespaced attribute; only the fully
# qualified name finds it.
print(body.get('bgcolor'))
print(body.get(XHTML + 'bgcolor'))

<body xmlns:html="http://www.w3.org/1999/xhtml" html:bgcolor="#ABC">Hello<br/>World</body>

None
#ABC


In [83]:
print(etree.tostring(xhtml, pretty_print=True).decode('utf-8'))

# Walk the tree once per tag pattern and list the tags that match:
#   '*'          - every element
#   XHTML + '*'  - any element in the XHTML namespace
#   '{*}body'    - 'body' in any (or no) namespace
#   'body'       - 'body' without a namespace
tag_patterns = ('*', XHTML + '*', '{*}body', 'body')
for pattern in tag_patterns:
    print('---')
    for el in xhtml.iter(pattern):
        print(el.tag)

<html xmlns="http://www.w3.org/1999/xhtml">
  <body>Hello, World!</body>
</html>

---
{http://www.w3.org/1999/xhtml}html
body
---
{http://www.w3.org/1999/xhtml}html
---
body
---
body


# The E-factory

In [84]:
from lxml.builder import E

In [88]:
# Baseline for comparison with the E-factory: building even a tiny document
# through Element/SubElement calls is fairly verbose.
html = etree.Element('html')
head = etree.SubElement(html, 'head')
title = etree.SubElement(head, 'title')
title.text = 'This is a sample document'

print(etree.tostring(html, pretty_print=True).decode('utf-8'))

<html>
  <head>
    <title>This is a sample document</title>
  </head>
</html>



In [90]:
# The E-factory builds elements through attribute access: E.<tag>(...) takes
# child elements and text pieces as positional arguments. Elements parsed
# elsewhere (etree.XML) can be mixed in as children.
head_part = E.head(E.title('This is a sample document'))
first_paragraph = E.p('This is a paragraph with ', E.b('important'), ' message in it!')
second_paragraph = etree.XML('<p>And this is another paragraph</p>')

html = E.html(head_part, E.body(first_paragraph, second_paragraph))

print(etree.tostring(html, pretty_print=True).decode('utf-8'))

<html>
  <head>
    <title>This is a sample document</title>
  </head>
  <body>
    <p>This is a paragraph with <b>important</b> message in it!</p>
    <p>And this is another paragraph</p>
  </body>
</html>



# ElementPath

In [91]:
# Sample tree for the ElementPath examples: <a> carries an attribute and
# some text, and contains two <b> elements and one <c>.
root = etree.XML("<root><a x='123'>aText<b/><c/><b/></a></root>")
print(etree.tostring(root, pretty_print=True).decode('utf-8'))

<root>
  <a x="123">aText<b/><c/><b/></a>
</root>



In [93]:
# find() with a bare tag name only looks at direct children: <b> is a
# grandchild of root, so the first lookup returns None, while <a> is found.
print(root.find('b'))
print(root.find('a').tag)

None
a


In [95]:
# './/b' searches the whole subtree: find() returns the first match,
# iterfind() yields all of them.
descendant_b = './/b'
print(root.find(descendant_b).tag)
print([match.tag for match in root.iterfind(descendant_b)])

b
['b', 'b']


In [97]:
# '[@x]' is a predicate: only <a> elements that have an attribute named x match.
print(root.findall('.//a[@x]')[0].tag)

a


In [104]:
# getelementpath() produces an ElementPath expression for an element;
# feeding that expression back into find() returns the same element.
tree = etree.ElementTree(root)
first_b = root[0][0]
path_to_b = tree.getelementpath(first_b)

print(first_b.tag)
print(path_to_b)
print(tree.find(path_to_b) == first_b)

b
a/b[1]
True


In [105]:
# Three ways to get the first <b> anywhere below root: find(), the first
# item of iterfind(), and the first item of iter() filtered by tag.
first_matches = (
    root.find('.//b'),
    next(root.iterfind('.//b')),
    next(root.iter('b')),
)
for match in first_matches:
    print(match.tag)

b
b
b
