diff --git a/html2text.py b/html2text.py index 73ae6bbb..2f650ab5 100755 --- a/html2text.py +++ b/html2text.py @@ -663,8 +663,8 @@ def handle_data(self, data): self.o("[") self.maybe_automatic_link = None - if self.escape_snob and not self.code and not self.pre: - data = escape_md(data, snob=True) + if not self.code and not self.pre: + data = escape_md_section(data, snob=self.escape_snob) self.o(data, 1) def unknown_decl(self, data): pass @@ -744,7 +744,32 @@ def optwrap(self, text): ordered_list_matcher = re.compile(r'\d+\.\s') unordered_list_matcher = re.compile(r'[-\*\+]\s') md_chars_matcher = re.compile(r"([\\\[\]\(\)])") -md_chars_matcher_all = re.compile(r"([\\`\*_{}\[\]\(\)#\+\-\.!])") +md_chars_matcher_all = re.compile(r"([`\*_{}\[\]\(\)#!])") +md_dot_matcher = re.compile(r""" + ^ # start of line + (\s*\d+) # optional whitespace and a number + (\.) # dot + (?=\s) # lookahead assert whitespace + """, re.MULTILINE | re.VERBOSE) +md_plus_matcher = re.compile(r""" + ^ + (\s*) + (\+) + (?=\s) + """, flags=re.MULTILINE | re.VERBOSE) +md_dash_matcher = re.compile(r""" + ^ + (\s*) + (-) + (?=\s|\-) # followed by whitespace (bullet list, or spaced out hr) + # or another dash (header or hr) + """, flags=re.MULTILINE | re.VERBOSE) +slash_chars = r'\`*_{}[]()#+-.!' +md_backslash_matcher = re.compile(r''' + (\\) # match one slash + (?=[%s]) # followed by a char that requires escaping + ''' % re.escape(slash_chars), + flags=re.VERBOSE) def skipwrap(para): # If the text begins with four spaces or one tab, it's a code block; don't wrap @@ -782,10 +807,20 @@ def unescape(s, unicode_snob=False): h.unicode_snob = unicode_snob return h.unescape(s) -def escape_md(text, snob=False): - """Escapes markdown-sensitive characters.""" - matcher = md_chars_matcher_all if snob else md_chars_matcher - return matcher.sub(r"\\\1", text) +def escape_md(text): + """Escapes markdown-sensitive characters within other markdown constructs.""" + return md_chars_matcher.sub(r"\\\1", text) + +def escape_md_section(text, snob=False): + """Escapes markdown-sensitive characters across whole document sections.""" + text = md_backslash_matcher.sub(r"\\\1", text) + if snob: + text = md_chars_matcher_all.sub(r"\\\1", text) + text = md_dot_matcher.sub(r"\1\\\2", text) + text = md_plus_matcher.sub(r"\1\\\2", text) + text = md_dash_matcher.sub(r"\1\\\2", text) + return text + def main(): baseurl = '' diff --git a/test/emdash-para.md b/test/emdash-para.md index 339b7f3a..486639a1 100644 --- a/test/emdash-para.md +++ b/test/emdash-para.md @@ -11,5 +11,5 @@ ribs, et nulla ground round do sunt dolore. Dolore nisi ullamco veniam sunt. Duis brisket drumstick, dolor fatback filet mignon meatloaf laboris tri-tip speck chuck ball tip voluptate ullamco laborum. --- +\-- diff --git a/test/normal.html b/test/normal.html index 0b99a71b..47ef480e 100644 --- a/test/normal.html +++ b/test/normal.html @@ -96,6 +96,45 @@
+ 2012. Now that was a good year. So was 2011. That's all. +
+ ++ 3.14159 is an approximation of pi. +
+ ++ + not + a list item +
+ ++ +foo +
+ ++ - foo - bar +
+ ++ -foo +
+ +
+ not a header
+ --
+
+ not a hr
+
+ ---
+
+ - - -
+
+ c:\tmp, \\server\path, \_/, foo\bar, #\#, \\# +