Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

import 0.32

  • Loading branch information...
commit bc80fa281d56a77b452e260095f257810f0ccb52 0 parents
Aaron Swartz authored
Showing with 592 additions and 0 deletions.
  1. +436 −0 sanitize.py
  2. +156 −0 sanitize_test.py
436 sanitize.py
@@ -0,0 +1,436 @@
+"""
+sanitize: bringing sanitiy to world of messed-up data
+"""
+
+__author__ = ["Mark Pilgrim <http://diveintomark.org/>",
+ "Aaron Swartz <http://www.aaronsw.com/>"]
+__contributors__ = ["Sam Ruby <http://intertwingly.net/>"]
+__license__ = "BSD"
+__version__ = "0.33"
+
+_debug = 0
+
+# If you want sanitize to automatically run HTML markup through HTML Tidy, set
+# this to 1. Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
+# or utidylib <http://utidylib.berlios.de/>.
+TIDY_MARKUP = 0
+
+# List of Python interfaces for HTML Tidy, in order of preference. Only useful
+# if TIDY_MARKUP = 1
+PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]
+
+import sgmllib, re, urlparse, sys
+
+# chardet library auto-detects character encodings
+# Download from http://chardet.feedparser.org/
+try:
+ import chardet
+ if _debug:
+ import chardet.constants
+ chardet.constants._debug = 1
+
+ _chardet = lambda data: chardet.detect(data)['encoding']
+except:
+ chardet = None
+ _chardet = lambda data: None
+
+class _BaseHTMLProcessor(sgmllib.SGMLParser):
+ elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
+ 'img', 'input', 'isindex', 'link', 'meta', 'param']
+
+ _r_barebang = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE)
+ _r_bareamp = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
+ _r_shorttag = re.compile(r'<([^<\s]+?)\s*/>')
+
+ def __init__(self, encoding):
+ self.encoding = encoding
+ if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding)
+ sgmllib.SGMLParser.__init__(self)
+
+ def reset(self):
+ self.pieces = []
+ sgmllib.SGMLParser.reset(self)
+
+ def _shorttag_replace(self, match):
+ tag = match.group(1)
+ if tag in self.elements_no_end_tag:
+ return '<' + tag + ' />'
+ else:
+ return '<' + tag + '></' + tag + '>'
+
+ def feed(self, data):
+ if _debug: sys.stderr.write('_BaseHTMLProcessor, feed, data=%s\n' % repr(data))
+ data = self._r_barebang.sub(r'&lt;!\1', data)
+ data = self._r_bareamp.sub("&amp;", data)
+ data = self._r_shorttag.sub(self._shorttag_replace, data)
+ if self.encoding and type(data) == type(u''):
+ data = data.encode(self.encoding)
+ sgmllib.SGMLParser.feed(self, data)
+
+ def normalize_attrs(self, attrs):
+ # utility method to be called by descendants
+ attrs = [(k.lower(), v) for k, v in attrs]
+ attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
+ return attrs
+
+ def unknown_starttag(self, tag, attrs):
+ # called for each start tag
+ # attrs is a list of (attr, value) tuples
+ # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
+ if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
+
+ def attrquote(data):
+ data = self._r_bareamp.sub("&amp;", data)
+ data = data.replace('"', '&quot;')
+ return data
+
+ newattrs = []
+ # hack to reverse attribute decoding in py2.5
+ for key, value in attrs:
+ newvalue = []
+ for c in value:
+ if ord(c) > 127:
+ c = '&#' + str(ord(c)) + ';'
+ newvalue.append(c)
+ newattrs.append((key, ''.join(newvalue)))
+ strattrs = ''.join([' %s="%s"' % (key, attrquote(value)) for key, value in newattrs])
+
+ if tag in self.elements_no_end_tag:
+ self.pieces.append('<%(tag)s%(strattrs)s />' % locals())
+ else:
+ self.pieces.append('<%(tag)s%(strattrs)s>' % locals())
+
+ def unknown_endtag(self, tag):
+ # called for each end tag, e.g. for </pre>, tag will be 'pre'
+ # Reconstruct the original end tag.
+ if tag not in self.elements_no_end_tag:
+ self.pieces.append("</%(tag)s>" % locals())
+
+ def handle_charref(self, ref):
+ # called for each character reference, e.g. for '&#160;', ref will be '160'
+ # Reconstruct the original character reference.
+ self.pieces.append('&#%(ref)s;' % locals())
+
+ def handle_entityref(self, ref):
+ # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
+ # Reconstruct the original entity reference.
+ self.pieces.append('&%(ref)s;' % locals())
+
+ def handle_data(self, text):
+ # called for each block of plain text, i.e. outside of any tag and
+ # not containing any character or entity references
+ # Store the original text verbatim.
+ if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text)
+ self.pieces.append(text)
+
+ def handle_comment(self, text):
+ # called for each HTML comment, e.g. <!-- insert Javascript code here -->
+ # Reconstruct the original comment.
+ self.pieces.append('<!--%(text)s-->' % locals())
+
+ def handle_pi(self, text):
+ # called for each processing instruction, e.g. <?instruction>
+ # Reconstruct original processing instruction.
+ self.pieces.append('<?%(text)s>' % locals())
+
+ def handle_decl(self, text):
+ # called for the DOCTYPE, if present, e.g.
+ # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
+ # "http://www.w3.org/TR/html4/loose.dtd">
+ # Reconstruct original DOCTYPE
+ self.pieces.append('<!%(text)s>' % locals())
+
+ _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
+ def _scan_name(self, i, declstartpos):
+ rawdata = self.rawdata
+ n = len(rawdata)
+ if i == n:
+ return None, -1
+ m = self._new_declname_match(rawdata, i)
+ if m:
+ s = m.group()
+ name = s.strip()
+ if (i + len(s)) == n:
+ return None, -1 # end of buffer
+ return name.lower(), m.end()
+ else:
+ self.handle_data(rawdata)
+# self.updatepos(declstartpos, i)
+ return None, -1
+
+ def output(self):
+ '''Return processed HTML as a single string'''
+ return ''.join(self.pieces)
+ # used to be: [str(p) for p in self.pieces]
+ # not sure why... -- ASw
+
+class _HTMLSanitizer(_BaseHTMLProcessor):
+ acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
+ 'blockquote', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col',
+ 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset',
+ 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input',
+ 'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup',
+ 'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike',
+ 'strong', 'sub', 'sup', 'table', 'textarea', 'tbody', 'td', 'tfoot', 'th',
+ 'thead', 'tr', 'tt', 'u', 'ul', 'var']
+
+ acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
+ 'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'cellspacing',
+ 'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'clear', 'cols',
+ 'colspan', 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled',
+ 'enctype', 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace',
+ 'id', 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
+ 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
+ 'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
+ 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', 'type',
+ 'usemap', 'valign', 'value', 'vspace', 'width']
+
+ # http://www.iana.org/assignments/uri-schemes.html
+ acceptable_uri_schemes = [
+ 'cid', 'crid', 'data', 'dav', 'dict', 'dns', 'fax',
+ 'ftp', 'go', 'gopher', 'h323', 'http', 'https', 'im',
+ 'imap', 'info', 'ipp', 'iris.beep', 'ldap', 'mailto',
+ 'mid', 'modem', 'news', 'nfs', 'nntp', 'pres', 'rtsp',
+ 'sip', 'sips', 'snmp', 'tag', 'tel', 'telnet', 'tftp',
+ 'urn',
+
+ # unspecified
+ # http://esw.w3.org/topic/UriSchemes
+
+ 'aim', 'irc', 'feed', 'webcal']
+
+ ignorable_elements = ['script', 'applet', 'style']
+
+ relative_uris = [('a', 'href'),
+ ('applet', 'codebase'),
+ ('area', 'href'),
+ ('blockquote', 'cite'),
+ ('body', 'background'),
+ ('del', 'cite'),
+ ('form', 'action'),
+ ('frame', 'longdesc'),
+ ('frame', 'src'),
+ ('iframe', 'longdesc'),
+ ('iframe', 'src'),
+ ('head', 'profile'),
+ ('img', 'longdesc'),
+ ('img', 'src'),
+ ('img', 'usemap'),
+ ('input', 'src'),
+ ('input', 'usemap'),
+ ('ins', 'cite'),
+ ('link', 'href'),
+ ('object', 'classid'),
+ ('object', 'codebase'),
+ ('object', 'data'),
+ ('object', 'usemap'),
+ ('q', 'cite'),
+ ('script', 'src')]
+
+ def __init__(self, baseuri, encoding, required_attributes=None):
+ _BaseHTMLProcessor.__init__(self, encoding)
+ self.baseuri = baseuri
+ self.required_attributes = required_attributes
+ # urlparse caches URL parsing for some reason
+ # and its cache doesn't distinguish between Unicode and non-unicode
+ # so it caches the Unicode version feedparser sends it
+ # which causes breakage
+ urlparse._parse_cache = {}
+
+ def resolveURI(self, uri):
+ if ':' in uri:
+ scheme, rest = uri.split(':', 1)
+ if scheme not in self.acceptable_uri_schemes:
+ uri = '#' + rest
+ if self.baseuri:
+ return urlparse.urljoin(self.baseuri, uri)
+ else:
+ return uri
+
+ def reset(self):
+ _BaseHTMLProcessor.reset(self)
+ self.tag_stack = []
+ self.ignore_level = 0
+
+ def feed(self, data):
+ _BaseHTMLProcessor.feed(self, data)
+ while self.tag_stack:
+ _BaseHTMLProcessor.unknown_endtag(self, self.tag_stack.pop())
+
+ def unknown_starttag(self, tag, attrs):
+ if tag in self.ignorable_elements:
+ self.ignore_level += 1
+ return
+
+ if self.ignore_level:
+ return
+
+ if tag in self.acceptable_elements:
+ attrs = self.normalize_attrs(attrs)
+ attrs = [(key, value) for key, value in attrs if key in self.acceptable_attributes]
+ attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]
+ if self.required_attributes and tag in self.required_attributes:
+ attrs = [(key, value) for key, value in attrs if key not in [k for k, v in self.required_attributes[tag]]]
+ attrs += self.required_attributes[tag]
+
+ if tag not in self.elements_no_end_tag:
+ self.tag_stack.append(tag)
+ _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
+
+ def unknown_endtag(self, tag):
+ if tag in self.ignorable_elements:
+ self.ignore_level -= 1
+ return
+
+ if self.ignore_level:
+ return
+
+ if tag in self.acceptable_elements and tag not in self.elements_no_end_tag:
+ match = False
+ while self.tag_stack:
+ top = self.tag_stack.pop()
+ if top == tag:
+ match = True
+ break
+ _BaseHTMLProcessor.unknown_endtag(self, top)
+
+ if match:
+ _BaseHTMLProcessor.unknown_endtag(self, tag)
+
+ def handle_pi(self, text):
+ pass
+
+ def handle_decl(self, text):
+ pass
+
+ def handle_data(self, text):
+ if not self.ignore_level:
+ text = text.replace('<', '')
+ _BaseHTMLProcessor.handle_data(self, text)
+
+def HTML(htmlSource, encoding='utf8', baseuri=None, required_attributes=None, addnofollow=False):
+ if not required_attributes:
+ required_attributes = {}
+ if addnofollow:
+ required_attributes['a'] = [('rel', 'nofollow')]
+ p = _HTMLSanitizer(baseuri, encoding, required_attributes)
+ p.feed(htmlSource)
+ data = p.output()
+ if TIDY_MARKUP:
+ # loop through list of preferred Tidy interfaces looking for one that's installed,
+ # then set up a common _tidy function to wrap the interface-specific API.
+ _tidy = None
+ for tidy_interface in PREFERRED_TIDY_INTERFACES:
+ try:
+ if tidy_interface == "uTidy":
+ from tidy import parseString as _utidy
+ def _tidy(data, **kwargs):
+ return str(_utidy(data, **kwargs))
+ break
+ elif tidy_interface == "mxTidy":
+ from mx.Tidy import Tidy as _mxtidy
+ def _tidy(data, **kwargs):
+ nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, **kwargs)
+ return data
+ break
+ except:
+ pass
+ if _tidy:
+ utf8 = type(data) == type(u'')
+ if utf8:
+ data = data.encode('utf-8')
+ data = _tidy(data, output_xhtml=1, numeric_entities=1, wrap=0, char_encoding="utf8")
+ if utf8:
+ data = unicode(data, 'utf-8')
+ if data.count('<body'):
+ data = data.split('<body', 1)[1]
+ if data.count('>'):
+ data = data.split('>', 1)[1]
+ if data.count('</body'):
+ data = data.split('</body', 1)[0]
+ data = data.strip().replace('\r\n', '\n')
+ return data
+
+unicode_bom_map = {
+ '\x00\x00\xfe\xff': 'utf-32be',
+ '\xff\xfe\x00\x00': 'utf-32le',
+ '\xfe\xff##': 'utf-16be',
+ '\xff\xfe##': 'utf-16le',
+ '\xef\bb\bf': 'utf-8'
+}
+xml_bom_map = {
+ '\x00\x00\x00\x3c': 'utf-32be',
+ '\x3c\x00\x00\x00': 'utf-32le',
+ '\x00\x3c\x00\x3f': 'utf-16be',
+ '\x3c\x00\x3f\x00': 'utf-16le',
+ '\x3c\x3f\x78\x6d': 'utf-8', # or equivalent
+ '\x4c\x6f\xa7\x94': 'ebcdic'
+}
+
+_ebcdic_to_ascii_map = None
+def _ebcdic_to_ascii(s):
+ global _ebcdic_to_ascii_map
+ if not _ebcdic_to_ascii_map:
+ emap = (
+ 0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
+ 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
+ 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
+ 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
+ 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
+ 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
+ 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
+ 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
+ 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,201,
+ 202,106,107,108,109,110,111,112,113,114,203,204,205,206,207,208,
+ 209,126,115,116,117,118,119,120,121,122,210,211,212,213,214,215,
+ 216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,
+ 123,65,66,67,68,69,70,71,72,73,232,233,234,235,236,237,
+ 125,74,75,76,77,78,79,80,81,82,238,239,240,241,242,243,
+ 92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249,
+ 48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255
+ )
+ import string
+ _ebcdic_to_ascii_map = string.maketrans( \
+ ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
+ return s.translate(_ebcdic_to_ascii_map)
+
+def _startswithbom(text, bom):
+ for i, c in enumerate(bom):
+ if c == '#':
+ if text[i] == '\x00':
+ return False
+ else:
+ if text[i] != c:
+ return False
+ return True
+
+def _detectbom(text, bom_map=unicode_bom_map):
+ for bom, encoding in bom_map.iteritems():
+ if _startswithbom(text, bom):
+ return encoding
+ return None
+
+def characters(text, isXML=False, guess=None):
+ """
+ Takes a string text of unknown encoding and tries to
+ provide a Unicode string for it.
+ """
+ _triedEncodings = []
+ def tryEncoding(encoding):
+ if encoding and encoding not in _triedEncodings:
+ if encoding == 'ebcdic':
+ return _ebcdic_to_ascii(text)
+ try:
+ return unicode(text, encoding)
+ except UnicodeDecodeError:
+ pass
+ _triedEncodings.append(encoding)
+
+ return (
+ tryEncoding(guess) or
+ tryEncoding(_detectbom(text)) or
+ isXML and tryEncoding(_detectbom(text, xml_bom_map)) or
+ tryEncoding(_chardet(text)) or
+ tryEncoding('utf8') or
+ tryEncoding('windows-1252') or
+ tryEncoding('iso-8859-1'))
156 sanitize_test.py
@@ -0,0 +1,156 @@
+"""unit tests for sanitize"""
+__author__ = "Aaron Swartz <http://www.aaronsw.com/>"
+__version__ = "0.33"
+__license__ = "public domain"
+_adaptedfrom_ = [
+ "http://www.iamcal.com/publish/articles/php/processing_html_part_2/",
+ "http://feedparser.org/tests/wellformed/sanitize/",
+ "http://alpha-geek.com/example/crazy_html2.html"]
+
+import sanitize
+
+def HTML(a, b, baseuri=None, addnofollow=False):
+ try:
+ assert sanitize.HTML(a, baseuri=baseuri, addnofollow=addnofollow) == b
+ except AssertionError:
+ print ' in:', repr(a)
+ print 'out:', repr(sanitize.HTML(a, baseuri=baseuri))
+ print 'exp:', repr(b)
+ raise
+
+## basics
+HTML("","")
+HTML("hello","hello")
+
+## balancing tags
+HTML("<b>hello","<b>hello</b>")
+HTML("hello<b>","hello<b></b>")
+HTML("hello</b>","hello")
+HTML("hello<b/>","hello<b></b>")
+HTML("<b><b><b>hello","<b><b><b>hello</b></b></b>")
+HTML("</b><b>","<b></b>")
+
+## trailing slashes
+HTML('<img>','<img />')
+HTML('<img/>','<img />')
+HTML('<b/></b>','<b></b>')
+
+## balancing angle brakets
+HTML('<img src="foo"','')
+HTML('b>','b>')
+HTML('<img src="foo"/','')
+HTML('>','>')
+HTML('foo<b','foo')
+HTML('b>foo','b>foo')
+HTML('><b','>')
+HTML('b><','b>')
+HTML('><b>','><b></b>')
+
+## attributes
+HTML('<img src=foo>','<img src="foo" />')
+HTML('<img asrc=foo>','<img />')
+HTML('<img src=test test>','<img src="test" />')
+HTML('<input type="checkbox" checked>', '<input type="checkbox" checked="checked" />')
+
+## dangerous tags (a small sample)
+sHTML = lambda x: HTML(x, 'safe <b>description</b>')
+sHTML('safe<applet code="foo.class" codebase="http://example.com/"></applet> <b>description</b>')
+sHTML('<notinventedyet>safe</notinventedyet> <b>description</b>')
+sHTML('<blink>safe</blink> <b>description</b>')
+sHTML('safe<embed src="http://example.com/"> <b>description</b>')
+sHTML('safe<frameset rows="*"><frame src="http://example.com/"></frameset> <b>description</b>')
+sHTML('safe<iframe src="http://example.com/"> <b>description</b></iframe>')
+sHTML('safe<link rel="stylesheet" type="text/css" href="http://example.com/evil.css"> <b>description</b>')
+sHTML('safe<meta http-equiv="Refresh" content="0; URL=http://example.com/"> <b>description</b>')
+sHTML('safe<object classid="clsid:C932BA85-4374-101B-A56C-00AA003668DC"> <b>description</b>')
+sHTML('safe<script type="text/javascript">location.href=\'http:/\'+\'/example.com/\';</script> <b>description</b>')
+
+for x in ['onabort', 'onblur', 'onchange', 'onclick', 'ondblclick', 'onerror', 'onfocus', 'onkeydown', 'onkeypress', 'onkeyup', 'onload', 'onmousedown', 'onmouseout', 'onmouseover', 'onmouseup', 'onreset', 'resize', 'onsubmit', 'onunload']:
+ HTML('<img src="http://www.ragingplatypus.com/i/cam-full.jpg" %s="location.href=\'http://www.ragingplatypus.com/\';" />' % x,
+ '<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />')
+
+HTML('<a href="http://www.ragingplatypus.com/" style="display:block; position:absolute; left:0; top:0; width:100%; height:100%; z-index:1; background-color:black; background-image:url(http://www.ragingplatypus.com/i/cam-full.jpg); background-x:center; background-y:center; background-repeat:repeat;">never trust your upstream platypus</a>', '<a href="http://www.ragingplatypus.com/">never trust your upstream platypus</a>')
+
+## ignorables
+HTML('foo<style>bar', 'foo')
+HTML('foo<style>bar</style>', 'foo')
+
+## non-allowed tags
+HTML('<script>','')
+HTML('<script','')
+HTML('<script/>','')
+HTML('</script>','')
+HTML('<script woo=yay>','')
+HTML('<script woo="yay">','')
+HTML('<script woo="yay>','')
+HTML('<script woo="yay<b>','')
+HTML('<script<script>>','')
+HTML('<<script>script<script>>','')
+HTML('<<script><script>>','')
+HTML('<<script>script>>','')
+HTML('<<script<script>>','')
+HTML('<scr\0ipt>', '')
+
+## bad protocols (a small sample)
+HTML('<a href="http://foo">bar</a>', '<a href="http://foo">bar</a>')
+HTML('<a href="ftp://foo">bar</a>', '<a href="ftp://foo">bar</a>')
+HTML('<a href="mailto:foo">bar</a>', '<a href="mailto:foo">bar</a>')
+HTML('<a href="javascript:foo">bar</a>', '<a href="#foo">bar</a>')
+HTML('<a href="java script:foo">bar</a>', '<a href="#foo">bar</a>')
+HTML('<a href="java\tscript:foo">bar</a>', '<a href="#foo">bar</a>')
+HTML('<a href="java\nscript:foo">bar</a>', '<a href="#foo">bar</a>')
+HTML('<a href="java'+chr(1)+'script:foo">bar</a>', '<a href="#foo">bar</a>')
+HTML('<a href="jscript:foo">bar</a>', '<a href="#foo">bar</a>')
+HTML('<a href="vbscript:foo">bar</a>', '<a href="#foo">bar</a>')
+HTML('<a href="view-source:foo">bar</a>', '<a href="#foo">bar</a>')
+HTML('<a href="notinventedyet:foo">bar</a>', '<a href="#foo">bar</a>')
+
+## base uris
+
+HTML('<a href="foo">bar</a>', '<a href="http://baz.net/foo">bar</a>', baseuri='http://baz.net')
+HTML('<a href="foo">bar</a>', '<a href="http://baz.net/foo">bar</a>', baseuri='http://baz.net/')
+HTML('<a href="foo">bar</a>', '<a href="http://baz.net/foo">bar</a>', baseuri='http://baz.net/goo')
+
+HTML('<img src="foo" />', '<img src="http://baz.net/foo" />', baseuri='http://baz.net')
+
+## auto closers
+HTML('<img src="a">', '<img src="a" />')
+HTML('<img src="a">foo</img>', '<img src="a" />foo')
+HTML('</img>', '')
+
+## comments
+HTML('<!--foo-->', '<!--foo-->')
+HTML('doo<!--', 'doo')
+
+## crazy: http://alpha-geek.com/example/crazy_html2.html
+HTML('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\r\n\r\n<html xmlns="http://www.w3.org/1999/xhtml">\r\n<head>\r\n<title>Crazy HTML -- Can Your Regex Parse This?</title>\r\n</head>\r\n<body notRealAttribute="value"onload="executeMe();"foo="bar"\r\n\r\n>\r\n<!-- <script> -->\r\n\r\n<!-- \r\n\t<script> \r\n-->\r\n\r\n</script>\r\n\r\n\r\n<script\r\n\r\n\r\n>\r\n\r\nfunction executeMe()\r\n{\r\n\r\n\r\n\r\n\r\n/* <script> \r\nfunction am_i_javascript()\r\n{\r\n\tvar str = "Some innocuously commented out stuff";\r\n}\r\n< /script>\r\n*/\r\n\r\n\t\r\n\t\r\n\t\r\n\t\r\n\t\r\n\t\r\n\t\r\n\t\r\n\talert("Executed");\r\n}\r\n\r\n </script\r\n\r\n\r\n\r\n>\r\n<h1>Did The Javascript Execute?</h1>\r\n<div notRealAttribute="value\r\n"onmouseover="\r\nexecuteMe();\r\n"foo="bar">\r\nI will execute here, too, if you mouse over me\r\n</div>\r\nThis is to keep you guys honest...<br />\r\nI like ontonology. I like to script ontology. Though, script>style>this.\r\n</body>\r\n</html>', 'Crazy HTML -- Can Your Regex Parse This?\n\n\n<!-- <script> -->\n\n<!-- \n\t<script> \n-->\n\n\n\nfunction executeMe()\n{\n\n\n\n\n/* \n<h1>Did The Javascript Execute?</h1>\n<div>\nI will execute here, too, if you mouse over me\n</div>\nThis is to keep you guys honest...<br />\nI like ontonology. I like to script ontology. Though, script>style>this.')
+
+## valid entity references
+HTML("&nbsp;","&nbsp;")
+HTML("&#160;","&#160;")
+HTML("&#xa0;","&#xa0;")
+HTML("&#xA0;","&#xA0;")
+
+## unescaped ampersands
+HTML("AT&T","AT&amp;T")
+HTML("http://example.org?a=1&b=2","http://example.org?a=1&amp;b=2")
+
+## attribute values
+HTML('<a title="&">amp</a>','<a title="&amp;">amp</a>')
+HTML('<a title="AT&amp;T">test</a>', '<a title="AT&amp;T">test</a>')
+HTML('<a title=\'"foo"\'>test</a>', '<a title="&quot;foo&quot;">test</a>')
+HTML('<a title="&quot;foo&quot;">test</a>', '<a title="&quot;foo&quot;">test</a>')
+HTML('<a title="foo&nbsp;bar">test</a>', '<a title="foo&nbsp;bar">test</a>')
+HTML('<a title="foo&#xa0;bar">test</a>', '<a title="foo&#xa0;bar">test</a>')
+HTML('<a title="foo&#xA0;bar">test</a>', '<a title="foo&#xA0;bar">test</a>')
+HTML('<a title="foo&#160;bar">test</a>', '<a title="foo&#160;bar">test</a>')
+HTML('<a title="foo<bar">test</a>', '<a title="foo<bar">test</a>')
+HTML('<a title="foo&lt;bar">test</a>', '<a title="foo<bar">test</a>')
+
+## quote characters
+HTML('<a title="&#34;">quote</a>','<a title="&quot;">quote</a>')
+HTML('<a title="&#39;">quote</a>','<a title="\'">quote</a>')
+
+## adding nofollow
+HTML('<a href="http://spam.example.org/">viagra</a>', '<a href="http://spam.example.org/" rel="nofollow">viagra</a>', addnofollow=True)
+HTML('<a rel="foo" href="http://spam.example.org/">viagra</a>', '<a href="http://spam.example.org/" rel="nofollow">viagra</a>', addnofollow=True)
Please sign in to comment.
Something went wrong with that request. Please try again.