From 59aa5f5d10d5b7cd42a8b5181d7bd3734dba7694 Mon Sep 17 00:00:00 2001
From: Chris Caron <lead2gold@gmail.com>
Date: Sat, 15 May 2021 16:08:53 -0400
Subject: [PATCH] Telegram escaping completely refactored (#386)

---
 apprise/plugins/NotifyTelegram.py | 86 +++++++++++++++++++++----------
 test/test_telegram.py             | 18 ++++++-
 2 files changed, 77 insertions(+), 27 deletions(-)
diff --git a/apprise/plugins/NotifyTelegram.py b/apprise/plugins/NotifyTelegram.py
index ea3c600a75..0e06a209cb 100644
--- a/apprise/plugins/NotifyTelegram.py
+++ b/apprise/plugins/NotifyTelegram.py
@@ -524,39 +524,73 @@ def send(self, body, title='', notify_type=NotifyType.INFO, attach=None,
                 body,
             )
 
-        elif self.notify_format == NotifyFormat.HTML:
-            payload['parse_mode'] = 'HTML'
-
-            # HTML Spaces (&nbsp;) and tabs (&emsp;) aren't supported
-            # See https://core.telegram.org/bots/api#html-style
-            body = re.sub('&nbsp;?', ' ', body, re.I)
+        else:  # HTML or TEXT
 
-            # Tabs become 3 spaces
-            body = re.sub('&emsp;?', '   ', body, re.I)
+            # Use Telegram's HTML mode
+            payload['parse_mode'] = 'HTML'
 
-            if title:
+            # Telegram's HTML support doesn't like having HTML escaped
+            # characters passed into it.  To handle this situation, we need to
+            # search the body for these sequences and convert them to the
+            # output the user expected.
+            telegram_escape_html_dict = {
                 # HTML Spaces (&nbsp;) and tabs (&emsp;) aren't supported
                 # See https://core.telegram.org/bots/api#html-style
-                title = re.sub('&nbsp;?', ' ', title, re.I)
+                r'nbsp': ' ',
 
                 # Tabs become 3 spaces
-                title = re.sub('&emsp;?', '   ', title, re.I)
-
-            payload['text'] = '{}{}'.format(
-                '<b>{}</b>\r\n'.format(title) if title else '',
-                body,
-            )
+                r'emsp': '   ',
+
+                # Some characters get re-escaped by the Telegram upstream
+                # service, so we need to convert these back.
+                r'apos': '\'',
+                r'quot': '"',
+            }
+
+            # Create a regular expression from the dictionary keys
+            html_regex = re.compile("&(%s);?" % "|".join(
+                map(re.escape, telegram_escape_html_dict.keys())).lower(),
+                re.I)
+
+            # For each match, look up the corresponding dictionary value.
+            # We skip the first character (the '&'), which is not part of the
+            # dictionary keys, and keep only the next 4 characters (every key
+            # is 4 characters long) so that the lookup works both with and
+            # without the trailing semi-colon (e.g. &apos; and &apos).
+            body = html_regex.sub(  # pragma: no branch
+                lambda mo: telegram_escape_html_dict[
+                    mo.string[mo.start():mo.end()][1:5]], body)
 
-        else:  # pass directly as is...
-            payload['parse_mode'] = 'HTML'
-
-            # Telegram strangely escapes all HTML characters for us already
-            # but to avoid causing issues with HTML, we escape the < and >
-            # characters
-            title = re.sub('>', '&gt;', title, re.I)
-            title = re.sub('<', '&lt;', title, re.I)
-            body = re.sub('>', '&gt;', body, re.I)
-            body = re.sub('<', '&lt;', body, re.I)
+            if title:
+                # For each match, look-up corresponding value in dictionary
+                # Indexing is explained above (for how the body is parsed)
+                title = html_regex.sub(  # pragma: no branch
+                    lambda mo: telegram_escape_html_dict[
+                        mo.string[mo.start():mo.end()][1:5]], title)
+
+            if self.notify_format == NotifyFormat.TEXT:
+                telegram_escape_text_dict = {
+                    # We need to escape characters that conflict with html
+                    # entity blocks (< and >) when displaying text
+                    r'>': '&gt;',
+                    r'<': '&lt;',
+                }
+
+                # Create a regular expression from the dictionary keys
+                text_regex = re.compile("(%s)" % "|".join(
+                    map(re.escape, telegram_escape_text_dict.keys())).lower(),
+                    re.I)
+
+                # For each match, look-up corresponding value in dictionary
+                body = text_regex.sub(  # pragma: no branch
+                    lambda mo: telegram_escape_text_dict[
+                        mo.string[mo.start():mo.end()]], body)
+
+                if title:
+                    # For each match, look-up corresponding value in dictionary
+                    title = text_regex.sub(  # pragma: no branch
+                        lambda mo: telegram_escape_text_dict[
+                            mo.string[mo.start():mo.end()]], title)
 
             payload['text'] = '{}{}'.format(
                 '<b>{}</b>\r\n'.format(title) if title else '',
diff --git a/test/test_telegram.py b/test/test_telegram.py
index a3dee6e1c8..8df78f61e8 100644
--- a/test/test_telegram.py
+++ b/test/test_telegram.py
@@ -29,6 +29,7 @@
 import mock
 import requests
 from json import dumps
+from json import loads
 from apprise import Apprise
 from apprise import AppriseAttachment
 from apprise import AppriseAsset
@@ -202,11 +203,26 @@ def test_notify_telegram_plugin(mock_post, mock_get):
     })
     mock_post.return_value.status_code = requests.codes.ok
 
-    # Test sending attachments
     obj = plugins.NotifyTelegram(bot_token=bot_token, targets='12345')
     assert len(obj.targets) == 1
     assert obj.targets[0] == '12345'
 
+    # Test the escaping of characters; Telegram escapes some characters for
+    # us, which we need to take into account
+    mock_post.reset_mock()
+    body = "<p>\'\"This can't\t\r\nfail&nbsp;us\"\'</p>"
+    assert obj.notify(
+        body=body, title='special characters',
+        notify_type=NotifyType.INFO) is True
+    assert mock_post.call_count == 1
+    payload = loads(mock_post.call_args_list[0][1]['data'])
+
+    # Our special characters are escaped properly
+    assert payload['text'] == \
+        '<b>special characters</b>\r\n&lt;p&gt;'\
+        '\'"This can\'t\t\r\nfail us"\'&lt;/p&gt;'
+
+    # Test sending attachments
     attach = AppriseAttachment(os.path.join(TEST_VAR_DIR, 'apprise-test.gif'))
     assert obj.notify(
         body='body', title='title', notify_type=NotifyType.INFO,