-
Notifications
You must be signed in to change notification settings - Fork 3
/
html5tidy.py
79 lines (52 loc) · 2.2 KB
/
html5tidy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# Copyright 2011, 2012 The active archives contributors
# Copyright 2011, 2012 Michael Murtaugh
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
HTML5Tidy
=========
Simple wrapper around html5lib and lxml.etree that "tidies" HTML found in the
wild into well-formed XML/HTML.

Usage
-----

>>> from html5tidy import tidy
>>> tidy('some text')
'<html><head/><body>some text</body></html>'

Dependencies
------------
* [html5lib](http://code.google.com/p/html5lib/)
* [lxml](http://lxml.de/)
Copyright
---------
- 2011, 2012 [The active archives contributors](http://activearchives.org/)
- 2011, 2012 Michael Murtaugh
All rights reserved.
This software is released under the GPL3 license. See gpl-3.0.txt for details.
"""
import html5lib
import lxml.etree
# Module-level parser instance used by tidy(): html5lib is asked to build its
# tree with the "lxml" tree builder, with namespaceHTMLElements switched off.
parser = html5lib.HTMLParser(
    tree=html5lib.treebuilders.getTreeBuilder("lxml"),
    namespaceHTMLElements=False,
)
def _as_native_str(chunk, codec):
    """Coerce *chunk* to the interpreter's native ``str`` type using *codec*."""
    if isinstance(chunk, str):
        return chunk
    if isinstance(chunk, bytes):
        # Python 3: serialised bytes -> text.
        return chunk.decode(codec)
    # Python 2: unicode text -> byte string.
    return chunk.encode(codec)


def tidy(src, fragment=False, container="div", encoding=None, parseMeta=True,
         useChardet=True, method="xml", pretty_print=False,
         xml_declaration=None, output_encoding="utf-8"):
    """Parse *src* with html5lib and return it re-serialised by lxml.

    Parameters:
        src: markup handed to the module-level html5lib ``parser``.
        fragment: when true, *src* is parsed with ``parseFragment`` inside
            *container*; otherwise it is parsed as a whole document.
        container: container element name forwarded to ``parseFragment``.
        encoding, parseMeta, useChardet: forwarded unchanged to html5lib.
        method, pretty_print, xml_declaration: forwarded unchanged to
            ``lxml.etree.tostring``.
        output_encoding: forwarded to ``lxml.etree.tostring`` as ``encoding``.

    Returns:
        The serialised markup as one string. Text parts and serialised
        elements are normalised to the native ``str`` type before being
        joined, so the result is a byte string on Python 2 and text on
        Python 3. If *output_encoding* asks lxml for unicode output
        (``"unicode"`` or the text type itself) the pieces are joined as text.
    """
    # NOTE(review): newer html5lib releases reportedly renamed/removed the
    # ``encoding`` and ``parseMeta`` keywords -- confirm against the installed
    # version before upgrading.
    if fragment:
        parts = parser.parseFragment(
            src,
            container=container,
            encoding=encoding,
            parseMeta=parseMeta,
            useChardet=useChardet,
        )
    else:
        parts = [
            parser.parse(
                src,
                encoding=encoding,
                parseMeta=parseMeta,
                useChardet=useChardet,
            )
        ]

    # ``unicode`` only exists on Python 2; derive the text type portably.
    text_type = type(u"")
    string_types = (str, text_type)

    wants_text = output_encoding is text_type or (
        isinstance(output_encoding, string_types)
        and output_encoding.lower() == "unicode"
    )
    # Codec used to reconcile text parts with serialised bytes. When
    # output_encoding is None, fall back to UTF-8 (a superset of ASCII).
    codec = output_encoding or "utf-8"

    chunks = []
    for part in parts:
        if not isinstance(part, string_types):
            # Anything that is not plain text is a parsed node: serialise it.
            part = lxml.etree.tostring(
                part,
                method=method,
                pretty_print=pretty_print,
                xml_declaration=xml_declaration,
                encoding=output_encoding,
            )
        chunks.append(part)

    if wants_text:
        return text_type().join(chunks)
    return "".join(_as_native_str(chunk, codec) for chunk in chunks)
if __name__ == "__main__":
    # Executed as a script: run the doctest examples embedded in this module.
    import doctest

    doctest.testmod()