Permalink
Browse files

Merge pull request #44 from dreikanter/master

Fix: Escaping parentheses in URLs to prevent link and image syntax from breaking
  • Loading branch information...
2 parents ac4945a + cae4445 commit 4d2fc96b895f20b338f3245220c051ea6e87aab3 @aaronsw committed Jul 24, 2012
Showing with 90 additions and 24 deletions.
  1. +11 −0 .editorconfig
  2. +4 −0 .gitignore
  3. +29 −18 html2text.py
  4. +10 −6 test/run_tests.py
  5. +16 −0 test/url-escaping.html
  6. +20 −0 test/url-escaping.md
View
@@ -0,0 +1,11 @@
+; top-most EditorConfig file
+root = true
+
+; Unix-style newlines
+[*]
+end_of_line = LF
+
+; 4 space indentation
+[*.py]
+indent_style = space
+indent_size = 4
View
@@ -1,2 +1,6 @@
*.py[co]
+*.bak
test/*output.md
+build
+dist
+*.egg-info
View
@@ -191,15 +191,20 @@ def __init__(self, out=None, baseurl=''):
self.google_doc = False
self.ul_item_mark = '*'
- if out is None: self.out = self.outtextf
- else: self.out = out
- self.outtextlist = [] # empty list to store output characters before they are "joined"
+ if out is None:
+ self.out = self.outtextf
+ else:
+ self.out = out
+
+ self.outtextlist = [] # empty list to store output characters before they are "joined"
+
try:
self.outtext = unicode()
- except NameError: # Python3
+ except NameError: # Python3
self.outtext = str()
+
self.quiet = 0
- self.p_p = 0 # number of newline character to print before next output
+ self.p_p = 0 # number of newline character to print before next output
self.outcount = 0
self.start = 1
self.space = 0
@@ -220,9 +225,9 @@ def __init__(self, out=None, baseurl=''):
self.emphasis = 0
self.drop_white_space = 0
self.inheader = False
- self.abbr_title = None # current abbreviation definition
- self.abbr_data = None # last inner HTML (for abbr being defined)
- self.abbr_list = {} # stack of abbreviations to write later
+ self.abbr_title = None # current abbreviation definition
+ self.abbr_data = None # last inner HTML (for abbr being defined)
+ self.abbr_list = {} # stack of abbreviations to write later
self.baseurl = baseurl
try: del unifiable_n[name2cp('nbsp')]
@@ -465,7 +470,7 @@ def handle_tag(self, tag, attrs, start):
a = self.astack.pop()
if a:
if self.inline_links:
- self.o("](" + a['href'] + ")")
+ self.o("](" + escape_md(a['href']) + ")")
else:
i = self.previousIndex(a)
if i is not None:
@@ -481,10 +486,10 @@ def handle_tag(self, tag, attrs, start):
if has_key(attrs, 'src'):
attrs['href'] = attrs['src']
alt = attrs.get('alt', '')
+ self.o("![" + escape_md(alt) + "]")
+
if self.inline_links:
- self.o("![")
- self.o(alt)
- self.o("]("+ attrs['href'] +")")
+ self.o("(" + escape_md(attrs['href']) + ")")
else:
i = self.previousIndex(attrs)
if i is not None:
@@ -494,9 +499,7 @@ def handle_tag(self, tag, attrs, start):
attrs['count'] = self.acount
attrs['outcount'] = self.outcount
self.a.append(attrs)
- self.o("![")
- self.o(alt)
- self.o("]["+ str(attrs['count']) +"]")
+ self.o("[" + str(attrs['count']) + "]")
if tag == 'dl' and start: self.p()
if tag == 'dt' and not start: self.pbr()
@@ -548,16 +551,19 @@ def handle_tag(self, tag, attrs, start):
self.p()
def pbr(self):
- if self.p_p == 0: self.p_p = 1
+ if self.p_p == 0:
+ self.p_p = 1
- def p(self): self.p_p = 2
+ def p(self):
+ self.p_p = 2
def soft_br(self):
self.pbr()
self.br_toggle = ' '
def o(self, data, puredata=0, force=0):
- if self.abbr_data is not None: self.abbr_data += data
+ if self.abbr_data is not None:
+ self.abbr_data += data
if not self.quiet:
if self.google_doc:
@@ -711,6 +717,7 @@ def optwrap(self, text):
ordered_list_matcher = re.compile(r'\d+\.\s')
unordered_list_matcher = re.compile(r'[-\*\+]\s')
+md_chars_matcher = re.compile(r"([\\\[\]\(\)])")
def skipwrap(para):
# If the text begins with four spaces or one tab, it's a code block; don't wrap
@@ -748,6 +755,10 @@ def unescape(s, unicode_snob=False):
h.unicode_snob = unicode_snob
return h.unescape(s)
+def escape_md(text):
+ """Escapes markdown-sensitive characters."""
+ return md_chars_matcher.sub(r"\\\1", text)
+
def main():
baseurl = ''
View
@@ -10,7 +10,7 @@
def test_module(fn, unicode_snob=False, google_doc=False):
- print_conditions(fn, 'module', unicode_snob, google_doc)
+ print_conditions('module', unicode_snob, google_doc)
h = html2text.HTML2Text()
@@ -29,7 +29,7 @@ def test_module(fn, unicode_snob=False, google_doc=False):
def test_command(fn, google_doc=False):
- print_conditions(fn, 'command', False, google_doc)
+ print_conditions('command', False, google_doc)
cmd = ['python', '../html2text.py']
if fn.lower().startswith('google'):
@@ -48,9 +48,9 @@ def test_command(fn, google_doc=False):
print_result(fn, 'command', result, actual)
-def print_conditions(fn, mode, unicode_snob, google_doc):
- format = "%s (%s, unicode_snob=%d, google_doc=%d): "
- sys.stdout.write(format % (fn, mode, int(unicode_snob), int(google_doc)))
+def print_conditions(mode, unicode_snob, google_doc):
+ format = " * %s, unicode_snob=%d, google_doc=%d: "
+ sys.stdout.write(format % (mode, int(unicode_snob), int(google_doc)))
def print_result(fn, mode, result, actual):
@@ -63,7 +63,10 @@ def print_result(fn, mode, result, actual):
print(len(result), len(actual))
dump_name = get_dump_name(fn, mode)
- open(dump_name, 'w').write(result)
+
+ with codecs.open(dump_name, encoding='utf-8', mode='w+') as f:
+ f.write(actual)
+
print(" Use: diff -u %s %s" % (get_baseline_name(fn), dump_name))
@@ -90,6 +93,7 @@ def run_all_tests():
google_doc = fn.lower().startswith('google')
unicode_snob = fn.lower().find('unicode') > 0
+ print('\n' + fn + ':')
test_module(fn, unicode_snob, google_doc)
if not unicode_snob:
View
@@ -0,0 +1,16 @@
+<h1>Markdown-sensible characters processing</h1>
+
+<p>This test checks special characters processing inside URLs: parenthesis and brackets should be escaped to keep markdown image and anchor syntax safe and sound.</p>
+
+<ul>
+ <li><a href="http://msdn.microsoft.com/en-us/library/system.drawing.drawing2d(v=vs.110)">Some MSDN link using parenthesis</a></li>
+ <li><a href="https://www.google.ru/search?q=[brackets are cool]">Google search result URL with unescaped brackets</a></li>
+ <li><a href="https://www.google.ru/search?q='[({})]'">Yet another test for [brackets], {curly braces} and (parenthesis) processing inside the anchor</a></li>
+</ul>
+
+<p>And here are images with tricky attribute values:</p>
+
+<img src="http://placehold.it/350x150#(banana)" width="350" height="150" alt="(banana)"><br>
+<img src="http://placehold.it/350x150#[banana]" width="350" height="150" alt="[banana]"><br>
+<img src="http://placehold.it/350x150#{banana}" width="350" height="150" alt="{banana}"><br>
+<img src="http://placehold.it/350x150#([{}])" width="350" height="150" alt="([{}])">
View
@@ -0,0 +1,20 @@
+# Markdown-sensible characters processing
+
+This test checks special characters processing inside URLs: parenthesis and
+brackets should be escaped to keep markdown image and anchor syntax safe and
+sound.
+
+ * [Some MSDN link using parenthesis](http://msdn.microsoft.com/en-us/library/system.drawing.drawing2d\(v=vs.110\))
+ * [Google search result URL with unescaped brackets](https://www.google.ru/search?q=\[brackets are cool\])
+ * [Yet another test for [brackets], {curly braces} and (parenthesis) processing inside the anchor](https://www.google.ru/search?q='\[\({}\)\]')
+
+And here are images with tricky attribute values:
+
+![\(banana\)](http://placehold.it/350x150#\(banana\))
+
+![\[banana\]](http://placehold.it/350x150#\[banana\])
+
+![{banana}](http://placehold.it/350x150#{banana})
+
+![\(\[{}\]\)](http://placehold.it/350x150#\(\[{}\]\))
+

0 comments on commit 4d2fc96

Please sign in to comment.