Skip to content

Commit

Permalink
Merge pull request #58 from brondsem/master
Browse files Browse the repository at this point in the history
Proper escaping for -+.\
  • Loading branch information
aaronsw committed Nov 5, 2012
2 parents 1d57ff1 + 510c0e0 commit 0fbc6af
Show file tree
Hide file tree
Showing 6 changed files with 167 additions and 14 deletions.
49 changes: 42 additions & 7 deletions html2text.py
Expand Up @@ -663,8 +663,8 @@ def handle_data(self, data):
self.o("[")
self.maybe_automatic_link = None

if self.escape_snob and not self.code and not self.pre:
data = escape_md(data, snob=True)
if not self.code and not self.pre:
data = escape_md_section(data, snob=self.escape_snob)
self.o(data, 1)

def unknown_decl(self, data):
    """Accept ``data`` and deliberately do nothing with it.

    NOTE(review): presumably overrides the parser hook of the same name on
    the enclosing class so unrecognised declarations are silently dropped;
    confirm against the class definition.
    """
    return None
Expand Down Expand Up @@ -744,7 +744,32 @@ def optwrap(self, text):
# Matches an ordered-list marker: one or more digits, a dot, then one
# whitespace character.
ordered_list_matcher = re.compile(r'\d+\.\s')
# Matches an unordered-list marker: '-', '*' or '+' followed by one
# whitespace character.
unordered_list_matcher = re.compile(r'[-\*\+]\s')
# Captures a single backslash, square bracket or parenthesis so the caller
# can re-emit it with a backslash in front.
md_chars_matcher = re.compile(r"([\\\[\]\(\)])")
# NOTE(review): md_chars_matcher_all is assigned twice in a row; this looks
# like the before/after lines of a diff. Only the second assignment takes
# effect at runtime. The first variant also matches backslash, '+', '-' and
# '.'; the second omits those four characters.
md_chars_matcher_all = re.compile(r"([\\`\*_{}\[\]\(\)#\+\-\.!])")
md_chars_matcher_all = re.compile(r"([`\*_{}\[\]\(\)#!])")
# Line-anchored pattern: group 1 is optional whitespace plus a number,
# group 2 is the dot that follows it. It only matches when the dot is
# followed by whitespace (checked by lookahead, not consumed).
md_dot_matcher = re.compile(r"""
^ # start of line
(\s*\d+) # optional whitespace and a number
(\.) # dot
(?=\s) # lookahead assert whitespace
""", re.MULTILINE | re.VERBOSE)
# Line-anchored pattern: group 1 is optional leading whitespace, group 2 is
# a '+' that is followed by whitespace (lookahead, not consumed).
md_plus_matcher = re.compile(r"""
^
(\s*)
(\+)
(?=\s)
""", flags=re.MULTILINE | re.VERBOSE)
# Line-anchored pattern: group 1 is optional leading whitespace, group 2 is
# a '-' that is followed by whitespace or by another '-' (lookahead, not
# consumed).
md_dash_matcher = re.compile(r"""
^
(\s*)
(-)
(?=\s|\-) # followed by whitespace (bullet list, or spaced out hr)
# or another dash (header or hr)
""", flags=re.MULTILINE | re.VERBOSE)
# Characters that, when they directly follow a backslash, make that
# backslash a match for md_backslash_matcher below.
slash_chars = r'\`*_{}[]()#+-.!'
# Captures a backslash only when the next character is one of slash_chars.
# re.escape() is applied so the characters are safe inside the [...] class.
md_backslash_matcher = re.compile(r'''
(\\) # match one slash
(?=[%s]) # followed by a char that requires escaping
''' % re.escape(slash_chars),
flags=re.VERBOSE)

def skipwrap(para):
# If the text begins with four spaces or one tab, it's a code block; don't wrap
Expand Down Expand Up @@ -782,10 +807,20 @@ def unescape(s, unicode_snob=False):
h.unicode_snob = unicode_snob
return h.unescape(s)

def escape_md(text, snob=False):
    """Backslash-escape markdown-sensitive characters in ``text``.

    When ``snob`` is true the broader ``md_chars_matcher_all`` pattern
    selects the characters to escape; otherwise ``md_chars_matcher`` is
    used. Returns the escaped string.
    """
    if snob:
        pattern = md_chars_matcher_all
    else:
        pattern = md_chars_matcher
    # Re-emit every matched character with a single backslash in front.
    return pattern.sub(r"\\\1", text)
def escape_md(text):
    """Return ``text`` with every ``md_chars_matcher`` match backslash-escaped.

    Meant for text that is embedded within other markdown constructs.
    """
    # Each captured character is written back with a backslash prefix.
    escaped = md_chars_matcher.sub(r"\\\1", text)
    return escaped

def escape_md_section(text, snob=False):
    """Escape markdown-sensitive characters across a whole document section.

    Steps, applied in order:

    1. Backslashes matched by ``md_backslash_matcher`` are doubled.
    2. If ``snob`` is true, every character matched by
       ``md_chars_matcher_all`` gets a backslash prefix.
    3. The dot, plus and dash matched by ``md_dot_matcher``,
       ``md_plus_matcher`` and ``md_dash_matcher`` get a backslash prefix,
       keeping whatever group 1 captured in front of them.

    Returns the escaped string.
    """
    escaped = md_backslash_matcher.sub(r"\\\1", text)
    if snob:
        escaped = md_chars_matcher_all.sub(r"\\\1", escaped)
    # The three line-start patterns share one replacement template:
    # group 1 unchanged, then a backslash, then group 2.
    for line_start_matcher in (md_dot_matcher, md_plus_matcher, md_dash_matcher):
        escaped = line_start_matcher.sub(r"\1\\\2", escaped)
    return escaped


def main():
baseurl = ''
Expand Down
2 changes: 1 addition & 1 deletion test/emdash-para.md
Expand Up @@ -11,5 +11,5 @@ ribs, et nulla ground round do sunt dolore. Dolore nisi ullamco veniam sunt.
Duis brisket drumstick, dolor fatback filet mignon meatloaf laboris tri-tip
speck chuck ball tip voluptate ullamco laborum.

--
\--

41 changes: 40 additions & 1 deletion test/normal.html
Expand Up @@ -96,6 +96,45 @@ <h1>
//]]>
</script>

<p>
2012. Now that was a good year. So was 2011. That's all.
</p>

<p>
3.14159 is an approximation of pi.
</p>

<p>
+ not + a list item
</p>

<p>
+foo
</p>

<p>
- foo - bar
</p>

<p>
-foo
</p>

<p>
not a header<br>
--
</p>

<p>
not a hr<br>
<br>
---
<br>
- - -
</p>

<p>
c:\tmp, \\server\path, \_/, foo\bar, #\#, \\#
</p>
</body>
</html>

22 changes: 22 additions & 0 deletions test/normal.md
Expand Up @@ -30,3 +30,25 @@ _italic_
Some `fixed width text` here
_`italic fixed width text`_

2012\. Now that was a good year. So was 2011. That's all.

3.14159 is an approximation of pi.

\+ not + a list item

+foo

\- foo - bar

-foo

not a header
\--

not a hr

\---
\- - -

c:\tmp, \\\server\path, \\_/, foo\bar, #\\#, \\\\#

44 changes: 40 additions & 4 deletions test/normal_escape_snob.html
Expand Up @@ -28,9 +28,6 @@ <h1>
<li>
<span>apple</span>
</li>
<li>
<span>yam\\sweet potato</span>
</li>
</ul>
<li>
<span>final</span>
Expand Down Expand Up @@ -100,6 +97,45 @@ <h1>
//]]>
</script>

<p>
2012. Now that was a good year. So was 2011. That's all.
</p>

<p>
3.14159 is an approximation of pi.
</p>

<p>
+ not + a list item
</p>

<p>
+foo
</p>

<p>
- foo - bar
</p>

<p>
-foo
</p>

<p>
not a header<br>
--
</p>

<p>
not a hr<br>
<br>
---
<br>
- - -
</p>

<p>
c:\tmp, \\server\path, \_/, foo\bar, #\#, \\#
</p>
</body>
</html>

23 changes: 22 additions & 1 deletion test/normal_escape_snob.md
Expand Up @@ -6,7 +6,6 @@ first issue
* _**bold italic**_
* orange
* apple
* yam\\\\sweet potato
* final

text to separate lists
Expand All @@ -32,3 +31,25 @@ text with \_underscore but not \_italicized
Some `fixed width text` here
_`italic fixed width text`_

2012\. Now that was a good year. So was 2011. That's all.

3.14159 is an approximation of pi.

\+ not + a list item

+foo

\- foo - bar

-foo

not a header
\--

not a hr

\---
\- - -

c:\tmp, \\\server\path, \\\_/, foo\bar, \#\\\#, \\\\\#

0 comments on commit 0fbc6af

Please sign in to comment.