# Markdown escaping helpers (html2text.py).
#
# Call site, HTML2Text.handle_data(): text that is inside neither a code
# span nor a pre block is passed through
#     escape_md_section(data, snob=self.escape_snob)
# before it is emitted with self.o(data, 1).
#
# Fixture expectation (test/emdash-para.md): a line made of "--" is emitted
# as "\--" so that it is not read as a header underline / horizontal rule.

ordered_list_matcher = re.compile(r'\d+\.\s')
unordered_list_matcher = re.compile(r'[-\*\+]\s')

# Characters escaped by escape_md(): backslash, square brackets, parentheses.
md_chars_matcher = re.compile(r"([\\\[\]\(\)])")

# Characters escaped everywhere in "snob" mode.  Backslash, '+', '-' and '.'
# are deliberately absent: they are handled by the dedicated matchers below.
md_chars_matcher_all = re.compile(r"([`\*_{}\[\]\(\)#!])")

# "<number>." at the start of a line, which Markdown reads as an ordered list.
md_dot_matcher = re.compile(r"""
    ^             # start of line
    (\s*\d+)      # optional whitespace and a number
    (\.)          # dot
    (?=\s)        # lookahead assert whitespace
    """, flags=re.MULTILINE | re.VERBOSE)

# "+ " at the start of a line, which Markdown reads as a bullet.
md_plus_matcher = re.compile(r"""
    ^
    (\s*)
    (\+)
    (?=\s)
    """, flags=re.MULTILINE | re.VERBOSE)

# "- " or "--" at the start of a line.
md_dash_matcher = re.compile(r"""
    ^
    (\s*)
    (-)
    (?=\s|\-)     # followed by whitespace (bullet list, or spaced out hr)
                  # or another dash (header or hr)
    """, flags=re.MULTILINE | re.VERBOSE)

# Characters that form an escape sequence when preceded by a backslash.
slash_chars = r'\`*_{}[]()#+-.!'

# A backslash is only doubled when it precedes one of slash_chars; a
# backslash in front of any other character (e.g. "c:\tmp") is left alone.
md_backslash_matcher = re.compile(r'''
    (\\)          # match one backslash
    (?=[%s])      # followed by a char that requires escaping
    ''' % re.escape(slash_chars),
    flags=re.VERBOSE)


def escape_md(text):
    """Escapes markdown-sensitive characters within other markdown constructs.

    Every backslash, square bracket and parenthesis in ``text`` (the
    characters matched by ``md_chars_matcher``) is prefixed with a backslash.

    :param text: the string to escape.
    :returns: the escaped string.
    """
    return md_chars_matcher.sub(r"\\\1", text)


def escape_md_section(text, snob=False):
    """Escapes markdown-sensitive characters across whole document sections.

    The following substitutions are applied, in this order:

    1. A backslash that precedes one of ``slash_chars`` is doubled.
    2. Only when ``snob`` is true: each of `` ` * _ { } [ ] ( ) # ! `` is
       prefixed with a backslash.
    3. At the start of a line (after optional whitespace): the dot of
       ``<digits>.`` followed by whitespace, a ``+`` followed by whitespace,
       and a ``-`` followed by whitespace or another ``-`` are prefixed with
       a backslash.

    :param text: the string to escape; may span several lines.
    :param snob: when true, also escape every character matched by
        ``md_chars_matcher_all``.
    :returns: the escaped string.
    """
    # Backslashes go first: running this after the other substitutions would
    # double the escaping backslashes they insert.
    text = md_backslash_matcher.sub(r"\\\1", text)
    if snob:
        text = md_chars_matcher_all.sub(r"\\\1", text)
    text = md_dot_matcher.sub(r"\1\\\2", text)
    text = md_plus_matcher.sub(r"\1\\\2", text)
    text = md_dash_matcher.sub(r"\1\\\2", text)
    return text

//]]> +

+ 2012. Now that was a good year. So was 2011. That's all. +

+ +

+ 3.14159 is an approximation of pi. +

+ +

+ + not + a list item +

+ +

+ +foo +

+ +

+ - foo - bar +

+ +

+ -foo +

+ +

+ not a header
+ -- +

+ +

+ not a hr
+
+ --- +
+ - - - +

+ +

+ c:\tmp, \\server\path, \_/, foo\bar, #\#, \\# +

- diff --git a/test/normal.md b/test/normal.md index a1b97143..d63b403a 100644 --- a/test/normal.md +++ b/test/normal.md @@ -30,3 +30,25 @@ _italic_ Some `fixed width text` here _`italic fixed width text`_ +2012\. Now that was a good year. So was 2011. That's all. + +3.14159 is an approximation of pi. + +\+ not + a list item + ++foo + +\- foo - bar + +-foo + +not a header +\-- + +not a hr + +\--- +\- - - + +c:\tmp, \\\server\path, \\_/, foo\bar, #\\#, \\\\# + diff --git a/test/normal_escape_snob.html b/test/normal_escape_snob.html index 3d5b0b4d..0d21867a 100644 --- a/test/normal_escape_snob.html +++ b/test/normal_escape_snob.html @@ -28,9 +28,6 @@

  • apple
  • -
  • - yam\\sweet potato -
  • final @@ -100,6 +97,45 @@

    //]]> +

    + 2012. Now that was a good year. So was 2011. That's all. +

    + +

    + 3.14159 is an approximation of pi. +

    + +

    + + not + a list item +

    + +

    + +foo +

    + +

    + - foo - bar +

    + +

    + -foo +

    + +

    + not a header
    + -- +

    + +

    + not a hr
    +
    + --- +
    + - - - +

    + +

    + c:\tmp, \\server\path, \_/, foo\bar, #\#, \\# +

    - diff --git a/test/normal_escape_snob.md b/test/normal_escape_snob.md index d8437066..1260a0a8 100644 --- a/test/normal_escape_snob.md +++ b/test/normal_escape_snob.md @@ -6,7 +6,6 @@ first issue * _**bold italic**_ * orange * apple - * yam\\\\sweet potato * final text to separate lists @@ -32,3 +31,25 @@ text with \_underscore but not \_italicized Some `fixed width text` here _`italic fixed width text`_ +2012\. Now that was a good year. So was 2011. That's all. + +3.14159 is an approximation of pi. + +\+ not + a list item + ++foo + +\- foo - bar + +-foo + +not a header +\-- + +not a hr + +\--- +\- - - + +c:\tmp, \\\server\path, \\\_/, foo\bar, \#\\\#, \\\\\# +