From a2079707221ba95234b1aa4b7dba73c802d1975b Mon Sep 17 00:00:00 2001 From: Dave Brondsema Date: Fri, 2 Nov 2012 17:23:28 -0400 Subject: [PATCH 1/2] escape -+. properly all the time --- html2text.py | 42 ++++++++++++++++++++++++++++++------ test/emdash-para.md | 2 +- test/normal.html | 37 ++++++++++++++++++++++++++++++- test/normal.md | 20 +++++++++++++++++ test/normal_escape_snob.html | 37 ++++++++++++++++++++++++++++++- test/normal_escape_snob.md | 20 +++++++++++++++++ 6 files changed, 148 insertions(+), 10 deletions(-) diff --git a/html2text.py b/html2text.py index 73ae6bbb..d9a2b717 100755 --- a/html2text.py +++ b/html2text.py @@ -663,8 +663,8 @@ def handle_data(self, data): self.o("[") self.maybe_automatic_link = None - if self.escape_snob and not self.code and not self.pre: - data = escape_md(data, snob=True) + if not self.code and not self.pre: + data = escape_md_section(data, snob=self.escape_snob) self.o(data, 1) def unknown_decl(self, data): pass @@ -744,7 +744,26 @@ def optwrap(self, text): ordered_list_matcher = re.compile(r'\d+\.\s') unordered_list_matcher = re.compile(r'[-\*\+]\s') md_chars_matcher = re.compile(r"([\\\[\]\(\)])") -md_chars_matcher_all = re.compile(r"([\\`\*_{}\[\]\(\)#\+\-\.!])") +md_chars_matcher_all = re.compile(r"([\\`\*_{}\[\]\(\)#!])") +md_dot_matcher = re.compile(r""" + ^ # start of line + (\s*\d+) # optional whitespace and a number + (\.) # dot + (?=\s) # lookahead assert whitespace + """, re.MULTILINE | re.VERBOSE) +md_plus_matcher = re.compile(r""" + ^ + (\s*) + (\+) + (?=\s) + """, flags=re.MULTILINE | re.VERBOSE) +md_dash_matcher = re.compile(r""" + ^ + (\s*) + (-) + (?=\s|\-) # followed by whitespace (bullet list, or spaced out hr) + # or another dash (header or hr) + """, flags=re.MULTILINE | re.VERBOSE) def skipwrap(para): # If the text begins with four spaces or one tab, it's a code block; don't wrap @@ -782,10 +801,19 @@ def unescape(s, unicode_snob=False): h.unicode_snob = unicode_snob return h.unescape(s) -def escape_md(text, snob=False): - """Escapes markdown-sensitive characters.""" - matcher = md_chars_matcher_all if snob else md_chars_matcher - return matcher.sub(r"\\\1", text) +def escape_md(text): + """Escapes markdown-sensitive characters within other markdown constructs.""" + return md_chars_matcher.sub(r"\\\1", text) + +def escape_md_section(text, snob=False): + """Escapes markdown-sensitive characters across whole document sections.""" + if snob: + text = md_chars_matcher_all.sub(r"\\\1", text) + text = md_dot_matcher.sub(r"\1\\\2", text) + text = md_plus_matcher.sub(r"\1\\\2", text) + text = md_dash_matcher.sub(r"\1\\\2", text) + return text + def main(): baseurl = '' diff --git a/test/emdash-para.md b/test/emdash-para.md index 339b7f3a..486639a1 100644 --- a/test/emdash-para.md +++ b/test/emdash-para.md @@ -11,5 +11,5 @@ ribs, et nulla ground round do sunt dolore. Dolore nisi ullamco veniam sunt. Duis brisket drumstick, dolor fatback filet mignon meatloaf laboris tri-tip speck chuck ball tip voluptate ullamco laborum. --- +\-- diff --git a/test/normal.html b/test/normal.html index 0b99a71b..ba4a010c 100644 --- a/test/normal.html +++ b/test/normal.html @@ -96,6 +96,41 @@

//]]> +

+ 2012. Now that was a good year. So was 2011. That's all. +

+ +

+ 3.14159 is an approximation of pi. +

+ +

+ + not + a list item +

+ +

+ +foo +

+ +

+ - foo - bar +

+ +

+ -foo +

+ +

+ not a header
+ -- +

+ +

+ not a hr
+
+ --- +
+ - - - +

- diff --git a/test/normal.md b/test/normal.md index a1b97143..5067fa42 100644 --- a/test/normal.md +++ b/test/normal.md @@ -30,3 +30,23 @@ _italic_ Some `fixed width text` here _`italic fixed width text`_ +2012\. Now that was a good year. So was 2011. That's all. + +3.14159 is an approximation of pi. + +\+ not + a list item + ++foo + +\- foo - bar + +-foo + +not a header +\-- + +not a hr + +\--- +\- - - + diff --git a/test/normal_escape_snob.html b/test/normal_escape_snob.html index 3d5b0b4d..12ac025e 100644 --- a/test/normal_escape_snob.html +++ b/test/normal_escape_snob.html @@ -100,6 +100,41 @@

//]]> +

+ 2012. Now that was a good year. So was 2011. That's all. +

+ +

+ 3.14159 is an approximation of pi. +

+ +

+ + not + a list item +

+ +

+ +foo +

+ +

+ - foo - bar +

+ +

+ -foo +

+ +

+ not a header
+ -- +

+ +

+ not a hr
+
+ --- +
+ - - - +

- diff --git a/test/normal_escape_snob.md b/test/normal_escape_snob.md index d8437066..24e1ffd0 100644 --- a/test/normal_escape_snob.md +++ b/test/normal_escape_snob.md @@ -32,3 +32,23 @@ text with \_underscore but not \_italicized Some `fixed width text` here _`italic fixed width text`_ +2012\. Now that was a good year. So was 2011. That's all. + +3.14159 is an approximation of pi. + +\+ not + a list item + ++foo + +\- foo - bar + +-foo + +not a header +\-- + +not a hr + +\--- +\- - - + From 510c0e04b5dcdaaac71c77a0f97ff84705616de6 Mon Sep 17 00:00:00 2001 From: Dave Brondsema Date: Sat, 3 Nov 2012 16:41:48 -0400 Subject: [PATCH 2/2] always escape \ when needed --- html2text.py | 9 ++++++++- test/normal.html | 4 ++++ test/normal.md | 2 ++ test/normal_escape_snob.html | 7 ++++--- test/normal_escape_snob.md | 3 ++- 5 files changed, 20 insertions(+), 5 deletions(-) diff --git a/html2text.py b/html2text.py index d9a2b717..2f650ab5 100755 --- a/html2text.py +++ b/html2text.py @@ -744,7 +744,7 @@ def optwrap(self, text): ordered_list_matcher = re.compile(r'\d+\.\s') unordered_list_matcher = re.compile(r'[-\*\+]\s') md_chars_matcher = re.compile(r"([\\\[\]\(\)])") -md_chars_matcher_all = re.compile(r"([\\`\*_{}\[\]\(\)#!])") +md_chars_matcher_all = re.compile(r"([`\*_{}\[\]\(\)#!])") md_dot_matcher = re.compile(r""" ^ # start of line (\s*\d+) # optional whitespace and a number @@ -764,6 +764,12 @@ def optwrap(self, text): (?=\s|\-) # followed by whitespace (bullet list, or spaced out hr) # or another dash (header or hr) """, flags=re.MULTILINE | re.VERBOSE) +slash_chars = r'\`*_{}[]()#+-.!' +md_backslash_matcher = re.compile(r''' + (\\) # match one slash + (?=[%s]) # followed by a char that requires escaping + ''' % re.escape(slash_chars), + flags=re.VERBOSE) def skipwrap(para): # If the text begins with four spaces or one tab, it's a code block; don't wrap @@ -807,6 +813,7 @@ def escape_md(text): def escape_md_section(text, snob=False): """Escapes markdown-sensitive characters across whole document sections.""" + text = md_backslash_matcher.sub(r"\\\1", text) if snob: text = md_chars_matcher_all.sub(r"\\\1", text) text = md_dot_matcher.sub(r"\1\\\2", text) diff --git a/test/normal.html b/test/normal.html index ba4a010c..47ef480e 100644 --- a/test/normal.html +++ b/test/normal.html @@ -132,5 +132,9 @@


- - -

+ +

+ c:\tmp, \\server\path, \_/, foo\bar, #\#, \\# +

diff --git a/test/normal.md b/test/normal.md index 5067fa42..d63b403a 100644 --- a/test/normal.md +++ b/test/normal.md @@ -50,3 +50,5 @@ not a hr \--- \- - - +c:\tmp, \\\server\path, \\_/, foo\bar, #\\#, \\\\# + diff --git a/test/normal_escape_snob.html b/test/normal_escape_snob.html index 12ac025e..0d21867a 100644 --- a/test/normal_escape_snob.html +++ b/test/normal_escape_snob.html @@ -28,9 +28,6 @@

  • apple
  • -
  • - yam\\sweet potato -
  • final @@ -136,5 +133,9 @@


    - - -

    + +

    + c:\tmp, \\server\path, \_/, foo\bar, #\#, \\# +

    diff --git a/test/normal_escape_snob.md b/test/normal_escape_snob.md index 24e1ffd0..1260a0a8 100644 --- a/test/normal_escape_snob.md +++ b/test/normal_escape_snob.md @@ -6,7 +6,6 @@ first issue * _**bold italic**_ * orange * apple - * yam\\\\sweet potato * final text to separate lists @@ -52,3 +51,5 @@ not a hr \--- \- - - +c:\tmp, \\\server\path, \\\_/, foo\bar, \#\\\#, \\\\\# +