Skip to content

Commit

Permalink
escape -+. properly all the time
Browse files Browse the repository at this point in the history
  • Loading branch information
brondsem committed Nov 5, 2012
1 parent 1d57ff1 commit a207970
Show file tree
Hide file tree
Showing 6 changed files with 148 additions and 10 deletions.
42 changes: 35 additions & 7 deletions html2text.py
Original file line number Diff line number Diff line change
Expand Up @@ -663,8 +663,8 @@ def handle_data(self, data):
self.o("[")
self.maybe_automatic_link = None

if self.escape_snob and not self.code and not self.pre:
data = escape_md(data, snob=True)
if not self.code and not self.pre:
data = escape_md_section(data, snob=self.escape_snob)
self.o(data, 1)

def unknown_decl(self, data):
    """Ignore the declaration passed in ``data``; nothing is emitted."""
    return None
Expand Down Expand Up @@ -744,7 +744,26 @@ def optwrap(self, text):
# Matches digits followed by a dot and one whitespace character ("1. ").
ordered_list_matcher = re.compile(r'\d+\.\s')
# Matches "-", "*" or "+" followed by one whitespace character ("- ").
unordered_list_matcher = re.compile(r'[-\*\+]\s')
# Small escape set: backslash, square brackets and parentheses.
md_chars_matcher = re.compile(r"([\\\[\]\(\)])")
# NOTE(review): md_chars_matcher_all is assigned twice below, so only the
# second pattern (the one without "+", "-" and ".") is in effect after this
# point. This looks like the removed/added pair of a diff -- confirm which
# line is meant to survive.
md_chars_matcher_all = re.compile(r"([\\`\*_{}\[\]\(\)#\+\-\.!])")
md_chars_matcher_all = re.compile(r"([\\`\*_{}\[\]\(\)#!])")
# Line-start number followed by a dot and whitespace. Group 1 is the optional
# whitespace plus the digits, group 2 is the dot; the trailing whitespace is
# only asserted by the lookahead, not consumed.
md_dot_matcher = re.compile(r"""
^ # start of line
(\s*\d+) # optional whitespace and a number
(\.) # dot
(?=\s) # lookahead assert whitespace
""", re.MULTILINE | re.VERBOSE)
# Line-start "+" followed by whitespace. Group 1 is the optional leading
# whitespace, group 2 is the plus sign.
md_plus_matcher = re.compile(r"""
^
(\s*)
(\+)
(?=\s)
""", flags=re.MULTILINE | re.VERBOSE)
# Line-start "-" followed by whitespace or another dash. Group 1 is the
# optional leading whitespace, group 2 is the first dash only.
md_dash_matcher = re.compile(r"""
^
(\s*)
(-)
(?=\s|\-) # followed by whitespace (bullet list, or spaced out hr)
# or another dash (header or hr)
""", flags=re.MULTILINE | re.VERBOSE)

def skipwrap(para):
# If the text begins with four spaces or one tab, it's a code block; don't wrap
Expand Down Expand Up @@ -782,10 +801,19 @@ def unescape(s, unicode_snob=False):
h.unicode_snob = unicode_snob
return h.unescape(s)

def escape_md(text, snob=False):
    """Backslash-escape markdown-sensitive characters in ``text``.

    The wider ``md_chars_matcher_all`` set is used when ``snob`` is true,
    otherwise the smaller ``md_chars_matcher`` set.
    """
    if snob:
        pattern = md_chars_matcher_all
    else:
        pattern = md_chars_matcher
    escaped = pattern.sub(r"\\\1", text)
    return escaped
def escape_md(text):
    """Backslash-escape markdown-sensitive characters in ``text``.

    Meant for text that sits within other markdown constructs; every
    character matched by ``md_chars_matcher`` gets a backslash in front of it.
    """
    escaped = md_chars_matcher.sub(r"\\\1", text)
    return escaped

def escape_md_section(text, snob=False):
    """Escape markdown-sensitive characters across a whole document section.

    When ``snob`` is true, every character matched by
    ``md_chars_matcher_all`` is backslash-escaped first. Independently of
    ``snob``, a dot, plus or dash matched at the start of a line by
    ``md_dot_matcher``, ``md_plus_matcher`` or ``md_dash_matcher`` gets a
    backslash inserted in front of it.
    """
    escaped = text
    if snob:
        escaped = md_chars_matcher_all.sub(r"\\\1", escaped)
    # All three line-start matchers share one replacement: keep group 1
    # (leading whitespace / number) and put a backslash before group 2.
    for line_start_matcher in (md_dot_matcher, md_plus_matcher, md_dash_matcher):
        escaped = line_start_matcher.sub(r"\1\\\2", escaped)
    return escaped


def main():
baseurl = ''
Expand Down
2 changes: 1 addition & 1 deletion test/emdash-para.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,5 @@ ribs, et nulla ground round do sunt dolore. Dolore nisi ullamco veniam sunt.
Duis brisket drumstick, dolor fatback filet mignon meatloaf laboris tri-tip
speck chuck ball tip voluptate ullamco laborum.

--
\--

37 changes: 36 additions & 1 deletion test/normal.html
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,41 @@ <h1>
//]]>
</script>

<p>
2012. Now that was a good year. So was 2011. That's all.
</p>

<p>
3.14159 is an approximation of pi.
</p>

<p>
+ not + a list item
</p>

<p>
+foo
</p>

<p>
- foo - bar
</p>

<p>
-foo
</p>

<p>
not a header<br>
--
</p>

<p>
not a hr<br>
<br>
---
<br>
- - -
</p>
</body>
</html>

20 changes: 20 additions & 0 deletions test/normal.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,23 @@ _italic_
Some `fixed width text` here
_`italic fixed width text`_

2012\. Now that was a good year. So was 2011. That's all.

3.14159 is an approximation of pi.

\+ not + a list item

+foo

\- foo - bar

-foo

not a header
\--

not a hr

\---
\- - -

37 changes: 36 additions & 1 deletion test/normal_escape_snob.html
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,41 @@ <h1>
//]]>
</script>

<p>
2012. Now that was a good year. So was 2011. That's all.
</p>

<p>
3.14159 is an approximation of pi.
</p>

<p>
+ not + a list item
</p>

<p>
+foo
</p>

<p>
- foo - bar
</p>

<p>
-foo
</p>

<p>
not a header<br>
--
</p>

<p>
not a hr<br>
<br>
---
<br>
- - -
</p>
</body>
</html>

20 changes: 20 additions & 0 deletions test/normal_escape_snob.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,23 @@ text with \_underscore but not \_italicized
Some `fixed width text` here
_`italic fixed width text`_

2012\. Now that was a good year. So was 2011. That's all.

3.14159 is an approximation of pi.

\+ not + a list item

+foo

\- foo - bar

-foo

not a header
\--

not a hr

\---
\- - -

0 comments on commit a207970

Please sign in to comment.