From bb68f3dd6defc3e58a9d9c52661f4f4eb9a5e10e Mon Sep 17 00:00:00 2001 From: medmunds Date: Fri, 27 Oct 2017 13:26:37 -0700 Subject: [PATCH] Mailgun: fix event/metadata param extraction in tracking webhook Mailgun merges user-variables (metadata) into the webhook post data interspersed with the actual event params. This can lead to ambiguity interpreting post data. To extract metadata from an event, Anymail had been attempting to avoid that ambiguity by instead using X-Mailgun-Variables fields found in the event's message-headers param. But message-headers isn't included in some tracking events (opened, clicked, unsubscribed), resulting in empty metadata for those events. (#76) Also, conflicting metadata keys could confuse Anymail's Mailgun event parsing, leading to unexpected values in the normalized event. (#77) This commit: * Cleans up Anymail's tracking webhook to be explicit about which multi-value params it uses, avoiding conflicts with metadata keys. Fixes #77. * Extracts metadata from post params for opened, clicked and unsubscribed events. All unknown event params are assumed to be metadata. Fixes #76. * Documents a few metadata key names where it's impossible (or likely to be unreliable) for Anymail to extract metadata from the post data. For reference, the order of params in the Mailgun's post data *appears* to be (from live testing): * For the timestamp, token and signature params, any user-variable with the same name appears *before* the corresponding event data. * For all other params, any user-variable with the same name as a Mailgun event param appears *after* the Mailgun data. --- anymail/utils.py | 28 ++++++++ anymail/webhooks/mailgun.py | 115 ++++++++++++++++++++++++--------- docs/esps/mailgun.rst | 14 ++++ tests/test_mailgun_webhooks.py | 67 ++++++++++++++++++- tests/test_utils.py | 18 +++++- 5 files changed, 208 insertions(+), 34 deletions(-) diff --git a/anymail/utils.py b/anymail/utils.py index a0ccb5ed..cabb251f 100644 --- a/anymail/utils.py +++ b/anymail/utils.py @@ -398,6 +398,34 @@ def collect_all_methods(cls, method_name): return methods +def querydict_getfirst(qdict, field, default=UNSET): + """Like :func:`django.http.QueryDict.get`, but returns *first* value of multi-valued field. + + >>> from django.http import QueryDict + >>> q = QueryDict('a=1&a=2&a=3') + >>> querydict_getfirst(q, 'a') + '1' + >>> q.get('a') + '3' + >>> q['a'] + '3' + + You can bind this to a QueryDict instance using the "descriptor protocol": + >>> q.getfirst = querydict_getfirst.__get__(q) + >>> q.getfirst('a') + '1' + """ + # (Why not instead define a QueryDict subclass with this method? Because there's no simple way + # to efficiently initialize a QueryDict subclass with the contents of an existing instance.) + values = qdict.getlist(field) + if len(values) > 0: + return values[0] + elif default is not UNSET: + return default + else: + return qdict[field] # raise appropriate KeyError + + EPOCH = datetime(1970, 1, 1, tzinfo=utc) diff --git a/anymail/webhooks/mailgun.py b/anymail/webhooks/mailgun.py index cd3de0fe..396a871e 100644 --- a/anymail/webhooks/mailgun.py +++ b/anymail/webhooks/mailgun.py @@ -9,7 +9,7 @@ from .base import AnymailBaseWebhookView from ..exceptions import AnymailWebhookValidationFailure from ..signals import tracking, AnymailTrackingEvent, EventType, RejectReason -from ..utils import get_anymail_setting, combine +from ..utils import get_anymail_setting, combine, querydict_getfirst class MailgunBaseWebhookView(AnymailBaseWebhookView): @@ -28,6 +28,8 @@ def __init__(self, **kwargs): def validate_request(self, request): super(MailgunBaseWebhookView, self).validate_request(request) # first check basic auth if enabled try: + # Must use the *last* value of these fields if there are conflicting merged user-variables. + # (Fortunately, Django QueryDict is specced to return the last value.) token = request.POST['token'] timestamp = request.POST['timestamp'] signature = str(request.POST['signature']) # force to same type as hexdigest() (for python2) @@ -75,27 +77,31 @@ class MailgunTrackingWebhookView(MailgunBaseWebhookView): def esp_to_anymail_event(self, esp_event): # esp_event is a Django QueryDict (from request.POST), - # which has multi-valued fields, but is *not* case-insensitive - - event_type = self.event_types.get(esp_event['event'], EventType.UNKNOWN) - timestamp = datetime.fromtimestamp(int(esp_event['timestamp']), tz=utc) + # which has multi-valued fields, but is *not* case-insensitive. + # Because of the way Mailgun merges user-variables into the event, + # we must generally use the *first* value of any multi-valued field + # to avoid potential conflicting user-data. + esp_event.getfirst = querydict_getfirst.__get__(esp_event) + + event_type = self.event_types.get(esp_event.getfirst('event'), EventType.UNKNOWN) + timestamp = datetime.fromtimestamp(int(esp_event['timestamp']), tz=utc) # use *last* value of timestamp # Message-Id is not documented for every event, but seems to always be included. # (It's sometimes spelled as 'message-id', lowercase, and missing the .) - message_id = esp_event.get('Message-Id', esp_event.get('message-id', None)) + message_id = esp_event.getfirst('Message-Id', None) or esp_event.getfirst('message-id', None) if message_id and not message_id.startswith('<'): message_id = "<{}>".format(message_id) - description = esp_event.get('description', None) - mta_response = esp_event.get('error', esp_event.get('notification', None)) + description = esp_event.getfirst('description', None) + mta_response = esp_event.getfirst('error', None) or esp_event.getfirst('notification', None) reject_reason = None try: - mta_status = int(esp_event['code']) + mta_status = int(esp_event.getfirst('code')) except (KeyError, TypeError): pass except ValueError: # RFC-3463 extended SMTP status code (class.subject.detail, where class is "2", "4" or "5") try: - status_class = esp_event['code'].split('.')[0] + status_class = esp_event.getfirst('code').split('.')[0] except (TypeError, IndexError): # illegal SMTP status code format pass @@ -107,37 +113,84 @@ def esp_to_anymail_event(self, esp_event): RejectReason.BOUNCED if 400 <= mta_status < 600 else RejectReason.OTHER) - # Mailgun merges metadata fields with the other event fields. - # However, it also includes the original message headers, - # which have the metadata separately as X-Mailgun-Variables. - try: - headers = json.loads(esp_event['message-headers']) - except (KeyError, ): - metadata = {} - else: - variables = [value for [field, value] in headers - if field == 'X-Mailgun-Variables'] - if len(variables) >= 1: - # Each X-Mailgun-Variables value is JSON. Parse and merge them all into single dict: - metadata = combine(*[json.loads(value) for value in variables]) - else: - metadata = {} + metadata = self._extract_metadata(esp_event) - # tags are sometimes delivered as X-Mailgun-Tag fields, sometimes as tag - tags = esp_event.getlist('tag', esp_event.getlist('X-Mailgun-Tag', [])) + # tags are supposed to be in 'tag' fields, but are sometimes in undocumented X-Mailgun-Tag + tags = esp_event.getlist('tag', None) or esp_event.getlist('X-Mailgun-Tag', []) return AnymailTrackingEvent( event_type=event_type, timestamp=timestamp, message_id=message_id, - event_id=esp_event.get('token', None), - recipient=esp_event.get('recipient', None), + event_id=esp_event.get('token', None), # use *last* value of token + recipient=esp_event.getfirst('recipient', None), reject_reason=reject_reason, description=description, mta_response=mta_response, tags=tags, metadata=metadata, - click_url=esp_event.get('url', None), - user_agent=esp_event.get('user-agent', None), + click_url=esp_event.getfirst('url', None), + user_agent=esp_event.getfirst('user-agent', None), esp_event=esp_event, ) + + def _extract_metadata(self, esp_event): + # Mailgun merges user-variables into the POST fields. If you know which user variable + # you want to retrieve--and it doesn't conflict with a Mailgun event field--that's fine. + # But if you want to extract all user-variables (like we do), it's more complicated... + event_type = esp_event.getfirst('event') + metadata = {} + + if 'message-headers' in esp_event: + # For events where original message headers are available, it's most reliable + # to recover user-variables from the X-Mailgun-Variables header(s). + headers = json.loads(esp_event['message-headers']) + variables = [value for [field, value] in headers if field == 'X-Mailgun-Variables'] + if len(variables) >= 1: + # Each X-Mailgun-Variables value is JSON. Parse and merge them all into single dict: + metadata = combine(*[json.loads(value) for value in variables]) + + elif event_type in self._known_event_fields: + # For other events, we must extract from the POST fields, ignoring known Mailgun + # event parameters, and treating all other values as user-variables. + known_fields = self._known_event_fields[event_type] + for field, values in esp_event.lists(): + if field not in known_fields: + # Unknown fields are assumed to be user-variables. (There should really only be + # a single value, but just in case take the last one to match QueryDict semantics.) + metadata[field] = values[-1] + elif field == 'tag': + # There's no way to distinguish a user-variable named 'tag' from an actual tag, + # so don't treat this/these value(s) as metadata. + pass + elif len(values) == 1: + # This is an expected event parameter, and since there's only a single value + # it must be the event param, not metadata. + pass + else: + # This is an expected event parameter, but there are (at least) two values. + # One is the event param, and the other is a user-variable metadata value. + # Which is which depends on the field: + if field in {'signature', 'timestamp', 'token'}: + metadata[field] = values[0] # values = [user-variable, event-param] + else: + metadata[field] = values[-1] # values = [event-param, user-variable] + + return metadata + + _common_event_fields = { + # These fields are documented to appear in all Mailgun opened, clicked and unsubscribed events: + 'event', 'recipient', 'domain', 'ip', 'country', 'region', 'city', 'user-agent', 'device-type', + 'client-type', 'client-name', 'client-os', 'campaign-id', 'campaign-name', 'tag', 'mailing-list', + 'timestamp', 'token', 'signature', + # Undocumented, but observed in actual events: + 'body-plain', 'h', 'message-id', + } + _known_event_fields = { + # For all Mailgun event types that *don't* include message-headers, + # map Mailgun (not normalized) event type to set of expected event fields. + # Used for metadata extraction. + 'clicked': _common_event_fields | {'url'}, + 'opened': _common_event_fields, + 'unsubscribed': _common_event_fields, + } diff --git a/docs/esps/mailgun.rst b/docs/esps/mailgun.rst index a9c1f0e2..a5395d49 100644 --- a/docs/esps/mailgun.rst +++ b/docs/esps/mailgun.rst @@ -126,6 +126,20 @@ values directly to Mailgun. You can use any of the (non-file) parameters listed .. _Mailgun sending docs: https://documentation.mailgun.com/api-sending.html#sending +.. _mailgun-quirks: + +Limitations and quirks +---------------------- + +**Metadata keys and tracking webhooks** + Because of the way Mailgun supplies custom data (user-variables) to webhooks, + there are a few metadata keys that Anymail cannot reliably retrieve in some + tracking events. You should avoid using "body-plain", "h", "message-headers", + "message-id" or "tag" as :attr:`~anymail.message.AnymailMessage.metadata` keys + if you need to access that metadata from an opened, clicked, or unsubscribed + :ref:`tracking event ` handler. + + .. _mailgun-templates: Batch sending/merge and ESP templates diff --git a/tests/test_mailgun_webhooks.py b/tests/test_mailgun_webhooks.py index 7a5fc44d..00977fb4 100644 --- a/tests/test_mailgun_webhooks.py +++ b/tests/test_mailgun_webhooks.py @@ -225,14 +225,15 @@ def test_alt_smtp_code(self): self.assertEqual(event.reject_reason, "bounced") self.assertIn("RecipNotFound", event.mta_response) - def test_metadata(self): + def test_metadata_message_headers(self): # Metadata fields are interspersed with other data, but also in message-headers + # for delivered, bounced and dropped events raw_event = mailgun_sign({ 'event': 'delivered', 'message-headers': json.dumps([ ["X-Mailgun-Variables", "{\"custom1\": \"value1\", \"custom2\": \"{\\\"key\\\":\\\"value\\\"}\"}"], ]), - 'custom1': 'value', + 'custom1': 'value1', 'custom2': '{"key":"value"}', # you can store JSON, but you'll need to unpack it yourself }) self.client.post('/anymail/mailgun/tracking/', data=raw_event) @@ -240,6 +241,68 @@ def test_metadata(self): event = kwargs['event'] self.assertEqual(event.metadata, {"custom1": "value1", "custom2": '{"key":"value"}'}) + def test_metadata_post_fields(self): + # Metadata fields are only interspersed with other event params + # for opened, clicked, unsubscribed events + raw_event = mailgun_sign({ + 'event': 'clicked', + 'custom1': 'value1', + 'custom2': '{"key":"value"}', # you can store JSON, but you'll need to unpack it yourself + }) + self.client.post('/anymail/mailgun/tracking/', data=raw_event) + kwargs = self.assert_handler_called_once_with(self.tracking_handler) + event = kwargs['event'] + self.assertEqual(event.metadata, {"custom1": "value1", "custom2": '{"key":"value"}'}) + + def test_metadata_key_conflicts(self): + # If you happen to name metadata (user-variable) keys the same as Mailgun + # event properties, Mailgun will include both in the webhook post. + # Make sure we don't confuse them. + metadata = { + "event": "metadata-event", + "recipient": "metadata-recipient", + "signature": "metadata-signature", + "timestamp": "metadata-timestamp", + "token": "metadata-token", + "ordinary field": "ordinary metadata value", + } + + raw_event = mailgun_sign({ + 'event': 'clicked', + 'recipient': 'actual-recipient@example.com', + 'token': 'actual-event-token', + 'timestamp': '1461261330', + 'url': 'http://clicked.example.com/actual/event/param', + 'h': "an (undocumented) Mailgun event param", + 'tag': ["actual-tag-1", "actual-tag-2"], + }) + + # Simulate how Mailgun merges user-variables fields into event: + for key in metadata.keys(): + if key in raw_event: + if key in {'signature', 'timestamp', 'token'}: + # For these fields, Mailgun's value appears after the metadata value + raw_event[key] = [metadata[key], raw_event[key]] + elif key == 'message-headers': + pass # Mailgun won't merge this field into the event + else: + # For all other fields, the defined event value comes first + raw_event[key] = [raw_event[key], metadata[key]] + else: + raw_event[key] = metadata[key] + + response = self.client.post('/anymail/mailgun/tracking/', data=raw_event) + self.assertEqual(response.status_code, 200) # if this fails, signature checking is using metadata values + + kwargs = self.assert_handler_called_once_with(self.tracking_handler) + event = kwargs['event'] + self.assertEqual(event.event_type, "clicked") + self.assertEqual(event.recipient, "actual-recipient@example.com") + self.assertEqual(event.timestamp.isoformat(), "2016-04-21T17:55:30+00:00") + self.assertEqual(event.event_id, "actual-event-token") + self.assertEqual(event.tags, ["actual-tag-1", "actual-tag-2"]) + self.assertEqual(event.metadata, metadata) + def test_tags(self): # Most events include multiple 'tag' fields for message's tags raw_event = mailgun_sign({ diff --git a/tests/test_utils.py b/tests/test_utils.py index b6c9f6e5..507b3559 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -4,6 +4,7 @@ from unittest import skipIf import six +from django.http import QueryDict from django.test import SimpleTestCase, RequestFactory, override_settings from django.utils.translation import ugettext_lazy @@ -22,7 +23,7 @@ parse_address_list, EmailAddress, is_lazy, force_non_lazy, force_non_lazy_dict, force_non_lazy_list, update_deep, - get_request_uri, get_request_basic_auth, parse_rfc2822date) + get_request_uri, get_request_basic_auth, parse_rfc2822date, querydict_getfirst) class ParseAddressListTests(SimpleTestCase): @@ -299,6 +300,21 @@ def test_get_request_uri_with_proxy(self): "https://user:pass@secret.example.com:8989/path/to/?query") +class QueryDictUtilsTests(SimpleTestCase): + def test_querydict_getfirst(self): + q = QueryDict("a=one&a=two&a=three") + q.getfirst = querydict_getfirst.__get__(q) + self.assertEqual(q.getfirst('a'), "one") + + # missing key exception: + with self.assertRaisesMessage(KeyError, "not a key"): + q.getfirst("not a key") + + # defaults: + self.assertEqual(q.getfirst('not a key', "beta"), "beta") + self.assertIsNone(q.getfirst('not a key', None)) + + class ParseRFC2822DateTests(SimpleTestCase): def test_with_timezones(self): dt = parse_rfc2822date("Tue, 24 Oct 2017 10:11:35 -0700")