From a2079707221ba95234b1aa4b7dba73c802d1975b Mon Sep 17 00:00:00 2001
From: Dave Brondsema <dave@brondsema.net>
Date: Fri, 2 Nov 2012 17:23:28 -0400
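Subject: [PATCH 1/2] escape -+. properly all the time

Previously "-", "+" and "." were backslash-escaped everywhere in text
data, and only when escape_snob was set. Escape them instead only at
the start of a line, where Markdown would read them as a list marker
("1. ", "+ ", "- ") or as a rule / setext underline ("--", "- - -"),
and do so whether or not escape_snob is set. Text inside code and pre
is still left untouched.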
---
 html2text.py                 | 42 ++++++++++++++++++++++++++++++------
 test/emdash-para.md          |  2 +-
 test/normal.html             | 37 ++++++++++++++++++++++++++++++-
 test/normal.md               | 20 +++++++++++++++++
 test/normal_escape_snob.html | 37 ++++++++++++++++++++++++++++++-
 test/normal_escape_snob.md   | 20 +++++++++++++++++
 6 files changed, 148 insertions(+), 10 deletions(-)
diff --git a/html2text.py b/html2text.py
index 73ae6bbb..d9a2b717 100755
--- a/html2text.py
+++ b/html2text.py
@@ -663,8 +663,8 @@ def handle_data(self, data):
                 self.o("[")
                 self.maybe_automatic_link = None
 
-        if self.escape_snob and not self.code and not self.pre:
-            data = escape_md(data, snob=True)
+        if not self.code and not self.pre:
+            data = escape_md_section(data, snob=self.escape_snob)
         self.o(data, 1)
 
     def unknown_decl(self, data): pass
@@ -744,7 +744,26 @@ def optwrap(self, text):
 ordered_list_matcher = re.compile(r'\d+\.\s')
 unordered_list_matcher = re.compile(r'[-\*\+]\s')
 md_chars_matcher = re.compile(r"([\\\[\]\(\)])")
-md_chars_matcher_all = re.compile(r"([\\`\*_{}\[\]\(\)#\+\-\.!])")
+md_chars_matcher_all = re.compile(r"([\\`\*_{}\[\]\(\)#!])")
+md_dot_matcher = re.compile(r"""
+    ^             # start of line
+    (\s*\d+)      # optional whitespace and a number
+    (\.)          # dot
+    (?=\s)        # lookahead assert whitespace
+    """, re.MULTILINE | re.VERBOSE)
+md_plus_matcher = re.compile(r"""
+    ^
+    (\s*)
+    (\+)
+    (?=\s)
+    """, flags=re.MULTILINE | re.VERBOSE)
+md_dash_matcher = re.compile(r"""
+    ^
+    (\s*)
+    (-)
+    (?=\s|\-)     # followed by whitespace (bullet list, or spaced out hr)
+                  # or another dash (header or hr)
+    """, flags=re.MULTILINE | re.VERBOSE)
 
 def skipwrap(para):
     # If the text begins with four spaces or one tab, it's a code block; don't wrap
@@ -782,10 +801,19 @@ def unescape(s, unicode_snob=False):
     h.unicode_snob = unicode_snob
     return h.unescape(s)
 
-def escape_md(text, snob=False):
-    """Escapes markdown-sensitive characters."""
-    matcher = md_chars_matcher_all if snob else md_chars_matcher
-    return matcher.sub(r"\\\1", text)
+def escape_md(text):
+    """Escapes markdown-sensitive characters within other markdown constructs."""
+    return md_chars_matcher.sub(r"\\\1", text)
+
+def escape_md_section(text, snob=False):
+    """Escapes markdown-sensitive characters across whole document sections."""
+    if snob:
+        text = md_chars_matcher_all.sub(r"\\\1", text)
+    text = md_dot_matcher.sub(r"\1\\\2", text)
+    text = md_plus_matcher.sub(r"\1\\\2", text)
+    text = md_dash_matcher.sub(r"\1\\\2", text)
+    return text
+
 
 def main():
     baseurl = ''
diff --git a/test/emdash-para.md b/test/emdash-para.md
index 339b7f3a..486639a1 100644
--- a/test/emdash-para.md
+++ b/test/emdash-para.md
@@ -11,5 +11,5 @@ ribs, et nulla ground round do sunt dolore. Dolore nisi ullamco veniam sunt.
 Duis brisket drumstick, dolor fatback filet mignon meatloaf laboris tri-tip
 speck chuck ball tip voluptate ullamco laborum.
 
---
+\--
 
diff --git a/test/normal.html b/test/normal.html
index 0b99a71b..ba4a010c 100644
--- a/test/normal.html
+++ b/test/normal.html
@@ -96,6 +96,41 @@ <h1>
 //]]>
 </script>
 
+    <p>
+        2012. Now that was a good year. So was 2011. That's all.
+    </p>
+
+    <p>
+        3.14159 is an approximation of pi.
+    </p>
+
+    <p>
+        + not + a list item
+    </p>
+
+    <p>
+        +foo
+    </p>
+
+    <p>
+        - foo - bar
+    </p>
+
+    <p>
+        -foo
+    </p>
+
+    <p>
+        not a header<br>
+        --
+    </p>
+
+    <p>
+        not a hr<br>
+        <br>
+        ---
+        <br>
+        - - -
+    </p>
   </body>
 </html>
-
diff --git a/test/normal.md b/test/normal.md
index a1b97143..5067fa42 100644
--- a/test/normal.md
+++ b/test/normal.md
@@ -30,3 +30,23 @@ _italic_
 Some `fixed width text` here  
 _`italic fixed width text`_
 
+2012\. Now that was a good year. So was 2011. That's all.
+
+3.14159 is an approximation of pi.
+
+\+ not + a list item
+
++foo
+
+\- foo - bar
+
+-foo 
+
+not a header  
+\--
+
+not a hr  
+  
+\---  
+\- - -
+
diff --git a/test/normal_escape_snob.html b/test/normal_escape_snob.html
index 3d5b0b4d..12ac025e 100644
--- a/test/normal_escape_snob.html
+++ b/test/normal_escape_snob.html
@@ -100,6 +100,41 @@ <h1>
 //]]>
 </script>
 
+    <p>
+        2012. Now that was a good year. So was 2011. That's all.
+    </p>
+
+    <p>
+        3.14159 is an approximation of pi.
+    </p>
+
+    <p>
+        + not + a list item
+    </p>
+
+    <p>
+        +foo
+    </p>
+
+    <p>
+        - foo - bar
+    </p>
+
+    <p>
+        -foo
+    </p>
+
+    <p>
+        not a header<br>
+        --
+    </p>
+
+    <p>
+        not a hr<br>
+        <br>
+        ---
+        <br>
+        - - -
+    </p>
   </body>
 </html>
-
diff --git a/test/normal_escape_snob.md b/test/normal_escape_snob.md
index d8437066..24e1ffd0 100644
--- a/test/normal_escape_snob.md
+++ b/test/normal_escape_snob.md
@@ -32,3 +32,23 @@ text with \_underscore but not \_italicized
 Some `fixed width text` here  
 _`italic fixed width text`_
 
+2012\. Now that was a good year. So was 2011. That's all.
+
+3.14159 is an approximation of pi.
+
+\+ not + a list item
+
++foo
+
+\- foo - bar
+
+-foo 
+
+not a header  
+\--
+
+not a hr  
+  
+\---  
+\- - -
+

From 510c0e04b5dcdaaac71c77a0f97ff84705616de6 Mon Sep 17 00:00:00 2001
From: Dave Brondsema <dave@brondsema.net>
Date: Sat, 3 Nov 2012 16:41:48 -0400
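Subject: [PATCH 2/2] always escape \ when needed

Double a backslash whenever the next character is one that Markdown
allows to be backslash-escaped (\ ` * _ { } [ ] ( ) # + - . !), so the
pair is not read back as an escape sequence. This now happens with or
without escape_snob; a backslash before any other character is left
as is.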
---
 html2text.py                 | 9 ++++++++-
 test/normal.html             | 4 ++++
 test/normal.md               | 2 ++
 test/normal_escape_snob.html | 7 ++++---
 test/normal_escape_snob.md   | 3 ++-
 5 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/html2text.py b/html2text.py
index d9a2b717..2f650ab5 100755
--- a/html2text.py
+++ b/html2text.py
@@ -744,7 +744,7 @@ def optwrap(self, text):
 ordered_list_matcher = re.compile(r'\d+\.\s')
 unordered_list_matcher = re.compile(r'[-\*\+]\s')
 md_chars_matcher = re.compile(r"([\\\[\]\(\)])")
-md_chars_matcher_all = re.compile(r"([\\`\*_{}\[\]\(\)#!])")
+md_chars_matcher_all = re.compile(r"([`\*_{}\[\]\(\)#!])")
 md_dot_matcher = re.compile(r"""
     ^             # start of line
     (\s*\d+)      # optional whitespace and a number
@@ -764,6 +764,12 @@ def optwrap(self, text):
     (?=\s|\-)     # followed by whitespace (bullet list, or spaced out hr)
                   # or another dash (header or hr)
     """, flags=re.MULTILINE | re.VERBOSE)
+slash_chars = r'\`*_{}[]()#+-.!'
+md_backslash_matcher = re.compile(r'''
+    (\\)          # match one backslash
+    (?=[%s])      # followed by a char that requires escaping
+    ''' % re.escape(slash_chars),
+    flags=re.VERBOSE)
 
 def skipwrap(para):
     # If the text begins with four spaces or one tab, it's a code block; don't wrap
@@ -807,6 +813,7 @@ def escape_md(text):
 
 def escape_md_section(text, snob=False):
     """Escapes markdown-sensitive characters across whole document sections."""
+    text = md_backslash_matcher.sub(r"\\\1", text)
     if snob:
         text = md_chars_matcher_all.sub(r"\\\1", text)
     text = md_dot_matcher.sub(r"\1\\\2", text)
diff --git a/test/normal.html b/test/normal.html
index ba4a010c..47ef480e 100644
--- a/test/normal.html
+++ b/test/normal.html
@@ -132,5 +132,9 @@ <h1>
         <br>
         - - -
     </p>
+
+    <p>
+        c:\tmp, \\server\path, \_/, foo\bar, #\#, \\#
+    </p>
   </body>
 </html>
diff --git a/test/normal.md b/test/normal.md
index 5067fa42..d63b403a 100644
--- a/test/normal.md
+++ b/test/normal.md
@@ -50,3 +50,5 @@ not a hr
 \---  
 \- - -
 
+c:\tmp, \\\server\path, \\_/, foo\bar, #\\#, \\\\#
+
diff --git a/test/normal_escape_snob.html b/test/normal_escape_snob.html
index 12ac025e..0d21867a 100644
--- a/test/normal_escape_snob.html
+++ b/test/normal_escape_snob.html
@@ -28,9 +28,6 @@ <h1>
         <li>
           <span>apple</span>
         </li>
-        <li>
-          <span>yam\\sweet potato</span>
-        </li>
       </ul>
       <li>
         <span>final</span>
@@ -136,5 +133,9 @@ <h1>
         <br>
         - - -
     </p>
+    
+    <p>
+        c:\tmp, \\server\path, \_/, foo\bar, #\#, \\#
+    </p>
   </body>
 </html>
diff --git a/test/normal_escape_snob.md b/test/normal_escape_snob.md
index 24e1ffd0..1260a0a8 100644
--- a/test/normal_escape_snob.md
+++ b/test/normal_escape_snob.md
@@ -6,7 +6,6 @@ first issue
   * _**bold italic**_
     * orange
     * apple
-    * yam\\\\sweet potato
   * final
 
 text to separate lists
@@ -52,3 +51,5 @@ not a hr
 \---  
 \- - -
 
+c:\tmp, \\\server\path, \\\_/, foo\bar, \#\\\#, \\\\\#
+