
✨ Source Mailchimp: Add optional start_date to config #32852

Merged 42 commits on Dec 20, 2023

Commits:
47a6a5a
add start date to config
ChristoGrab Nov 16, 2023
b611729
Source Hubspot: fix expected records (#32645)
roman-yermilov-gl Nov 20, 2023
a9cc2b9
Source Github: fix expected records (#32644)
roman-yermilov-gl Nov 20, 2023
e74bbde
✨ Source Mailchimp: Add Interests, InterestCategories, Tags streams (…
ChristoGrab Nov 20, 2023
918e64c
🐛 Source Pinterest: Fix backoff waiting time (#32672)
tolik0 Nov 21, 2023
0f28144
🚨🚨🐛 Source Pinterest: Update date-time fields with airbyte_type: time…
tolik0 Nov 21, 2023
39ac79c
remove bq+snowflake from legacy normalization docs (#32708)
edgao Nov 21, 2023
667be92
🐛 Source Pinterest: Fix Advertiser stream names (#32734)
tolik0 Nov 23, 2023
059ca2f
✨ Source Pinterest: Update docs and spec; add missing `placement_traf…
tolik0 Nov 23, 2023
d81ae88
🐛 Source Facebook Marketing: Removed validation that blocked personal…
tolik0 Nov 23, 2023
74d359d
Docs: Add permissions to prereqs in Source Facebook Marketing (#32653)
ChristoGrab Nov 24, 2023
3e69945
✨ Source Mailchimp: Implement SegmentMembers stream (#32782)
ChristoGrab Nov 27, 2023
4cdfc7a
Source My Hours: Update CDK (#32680)
ChristoGrab Nov 27, 2023
428e1b3
add logic for use of start_date in incremental streams
ChristoGrab Nov 27, 2023
b5d0ede
Source Cart: Update CDK to Latest Version (#32705)
pnilan Nov 27, 2023
86848a6
Source Twilio: Increase test coverage, fix parse_response bug, update…
pnilan Nov 27, 2023
1fda1df
Source Sendgrid: Increase Test Coverage, Update Expected Records (#32…
pnilan Nov 27, 2023
941d937
fix method and add unit test
ChristoGrab Nov 27, 2023
d17e251
version bump
ChristoGrab Nov 27, 2023
b2a546e
small edits
ChristoGrab Nov 27, 2023
1a0bb33
Automated Commit - Formatting Changes
ChristoGrab Nov 27, 2023
a660279
✨Source Amazon Seller Partner: multiple updates (#32833)
Nov 28, 2023
00a896b
add note to docs
ChristoGrab Nov 28, 2023
05c5acb
Merge branch 'christo/mailchimp-start-date' of https://github.com/air…
ChristoGrab Nov 28, 2023
88b24cf
Merge remote-tracking branch 'origin/master' into dev
git-phu Nov 28, 2023
604a2ab
Merge branch 'dev' of https://github.com/airbytehq/airbyte into chris…
ChristoGrab Nov 28, 2023
c28ca4d
Merge from master
ChristoGrab Nov 28, 2023
04a9d68
chore: fix duplicated docs section
ChristoGrab Nov 28, 2023
a0c3516
Merge branch 'master' into christo/mailchimp-start-date
ChristoGrab Nov 28, 2023
a1abc41
Merge branch 'master' of https://github.com/airbytehq/airbyte into ch…
ChristoGrab Dec 5, 2023
4b524b3
Merge branch 'master' of https://github.com/airbytehq/airbyte into ch…
ChristoGrab Dec 7, 2023
66da4fe
fix: refactor implementation of start_date
ChristoGrab Dec 7, 2023
47366b2
refactor unit tests
ChristoGrab Dec 8, 2023
f52d5af
updated expected_records
ChristoGrab Dec 8, 2023
a400070
Automated Commit - Formatting Changes
ChristoGrab Dec 8, 2023
0195913
chore: fix merge conflict
ChristoGrab Dec 8, 2023
4814279
Merge branch 'master' into christo/mailchimp-start-date
ChristoGrab Dec 11, 2023
918388d
Merge branch 'master' into christo/mailchimp-start-date
ChristoGrab Dec 11, 2023
b7d30b7
merge changes from update to 1.0.0
ChristoGrab Dec 19, 2023
2b4b74f
Merge branch 'christo/mailchimp-start-date' of https://github.com/air…
ChristoGrab Dec 19, 2023
f6953e3
add separate method for client-side filter
ChristoGrab Dec 20, 2023
aa2b6de
chore: format fix
ChristoGrab Dec 20, 2023
Files changed:
@@ -10,7 +10,7 @@ data:
connectorSubtype: api
connectorType: source
definitionId: b03a9f3e-22a5-11eb-adc1-0242ac120002
dockerImageTag: 1.0.0
dockerImageTag: 1.1.0
dockerRepository: airbyte/source-mailchimp
documentationUrl: https://docs.airbyte.com/integrations/sources/mailchimp
githubIssueLabel: source-mailchimp
@@ -4,13 +4,16 @@


import base64
import re
from typing import Any, List, Mapping, Tuple

import pendulum
import requests
from airbyte_cdk import AirbyteLogger
from airbyte_cdk.sources import AbstractSource
from airbyte_cdk.sources.streams import Stream
from airbyte_cdk.sources.streams.http.auth import TokenAuthenticator
from pendulum.parsing.exceptions import ParserError
from requests.auth import AuthBase

from .streams import (
@@ -78,7 +81,30 @@ def get_auth(self, config: Mapping[str, Any]) -> AuthBase:


class SourceMailchimp(AbstractSource):
def _validate_start_date(self, config: Mapping[str, Any]):
start_date = config.get("start_date")

if start_date:
pattern = re.compile(r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z")
if not pattern.match(start_date): # Compare against the pattern descriptor.
return "Please check the format of the start date against the pattern descriptor."

try: # Handle invalid dates.
parsed_start_date = pendulum.parse(start_date)
except ParserError:
return "The provided start date is not a valid date. Please check the date you input and try again."

if parsed_start_date > pendulum.now("UTC"): # Handle future start date.
return "The start date cannot be greater than the current date."

return None

def check_connection(self, logger: AirbyteLogger, config: Mapping[str, Any]) -> Tuple[bool, Any]:
# First, check for a valid start date if it is provided
start_date_validation_error = self._validate_start_date(config)
if start_date_validation_error:
return False, start_date_validation_error

try:
authenticator = MailChimpAuthenticator().get_auth(config)
response = requests.get(
@@ -102,21 +128,22 @@ def check_connection(self, logger: AirbyteLogger, config: Mapping[str, Any]) ->
def streams(self, config: Mapping[str, Any]) -> List[Stream]:
authenticator = MailChimpAuthenticator().get_auth(config)
campaign_id = config.get("campaign_id")
start_date = config.get("start_date")

lists = Lists(authenticator=authenticator)
lists = Lists(authenticator=authenticator, start_date=start_date)
interest_categories = InterestCategories(authenticator=authenticator, parent=lists)

return [
Automations(authenticator=authenticator),
Campaigns(authenticator=authenticator),
EmailActivity(authenticator=authenticator, campaign_id=campaign_id),
Automations(authenticator=authenticator, start_date=start_date),
Campaigns(authenticator=authenticator, start_date=start_date),
EmailActivity(authenticator=authenticator, start_date=start_date, campaign_id=campaign_id),
interest_categories,
Interests(authenticator=authenticator, parent=interest_categories),
lists,
ListMembers(authenticator=authenticator),
Reports(authenticator=authenticator),
SegmentMembers(authenticator=authenticator),
Segments(authenticator=authenticator),
ListMembers(authenticator=authenticator, start_date=start_date),
Reports(authenticator=authenticator, start_date=start_date),
SegmentMembers(authenticator=authenticator, start_date=start_date),
Segments(authenticator=authenticator, start_date=start_date),
Tags(authenticator=authenticator, parent=lists),
Unsubscribes(authenticator=authenticator, campaign_id=campaign_id),
Unsubscribes(authenticator=authenticator, start_date=start_date, campaign_id=campaign_id),
]
@@ -61,6 +61,15 @@
}
]
},
"start_date": {
"title": "Incremental Sync Start Date",
"description": "The date from which you want to start syncing data for Incremental streams. Only records that have been created or modified since this date will be synced. If left blank, all data will by synced.",
"type": "string",
"format": "date-time",
"pattern": "^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}.[0-9]{3}Z$",
"pattern_descriptor": "YYYY-MM-DDTHH:MM:SS.000Z",
"examples": ["2020-01-01T00:00:00.000Z"]
},
"campaign_id": {
"type": "string",
"title": "ID of a campaign to sync email activities",
@@ -7,6 +7,7 @@
from abc import ABC, abstractmethod
from typing import Any, Iterable, List, Mapping, MutableMapping, Optional

import pendulum
import requests
from airbyte_cdk.models import SyncMode
from airbyte_cdk.sources.streams.core import StreamData
@@ -81,6 +82,10 @@ def read_records(
class IncrementalMailChimpStream(MailChimpStream, ABC):
state_checkpoint_interval = math.inf

def __init__(self, **kwargs):
self.start_date = kwargs.pop("start_date", None)
super().__init__(**kwargs)

@property
@abstractmethod
def cursor_field(self) -> str:
@@ -129,11 +134,30 @@ def stream_slices(
) -> Iterable[Optional[Mapping[str, Any]]]:
slice_ = {}
stream_state = stream_state or {}
cursor_value = stream_state.get(self.cursor_field)
cursor_value = self.get_filter_date(self.start_date, stream_state.get(self.cursor_field))
if cursor_value:
slice_[self.filter_field] = cursor_value
yield slice_

@staticmethod
def get_filter_date(start_date: str, state_date: str) -> str:
"""
Calculate the filter date to pass in the request parameters by comparing the start_date
with the cursor value obtained from the stream state.
If only one value exists, use it by default; if neither exists, return None.
If no filter date is resolved, the API will fetch all available records.
"""

start_date_parsed = pendulum.parse(start_date).to_iso8601_string() if start_date else None
state_date_parsed = pendulum.parse(state_date).to_iso8601_string() if state_date else None

if start_date_parsed and state_date_parsed:
return max(start_date_parsed, state_date_parsed)
elif state_date_parsed or start_date_parsed:
return state_date_parsed or start_date_parsed
else:
return None
Collaborator:
Suggested change (replace the if/elif/else above with):

# Return the maximum of the two dates if both are present, otherwise return whichever is present, or None
if start_date_parsed or state_date_parsed:
    return max(filter(None, [start_date_parsed, state_date_parsed]), default=None)

Contributor Author:
Updated
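For reference, a standalone sketch of the simplified helper along the lines of the reviewer's suggestion (not the verbatim merged code; the asserts mirror the expectations in test_get_filter_date further down):

```python
from typing import Optional

import pendulum


def get_filter_date(start_date: Optional[str], state_date: Optional[str]) -> Optional[str]:
    # Normalize both dates to ISO 8601 strings, drop whichever is missing,
    # and return the latest remaining value (or None when neither is set).
    parsed_dates = [pendulum.parse(d).to_iso8601_string() for d in (start_date, state_date) if d]
    return max(parsed_dates, default=None)


assert get_filter_date("2021-01-01T00:00:00.000Z", None) == "2021-01-01T00:00:00Z"
assert get_filter_date(None, None) is None
```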


def request_params(self, stream_state=None, stream_slice=None, **kwargs):
stream_state = stream_state or {}
stream_slice = stream_slice or {}
@@ -157,7 +181,11 @@ def stream_slices(self, stream_state: Mapping[str, Any] = None, **kwargs) -> Ite
stream_state = stream_state or {}
parent = Lists(authenticator=self.authenticator).read_records(sync_mode=SyncMode.full_refresh)
for slice in parent:
yield {"list_id": slice["id"]}
slice_ = {"list_id": slice["id"]}
Collaborator:
I believe we can rename `slice` to `parent_record` and `slice_` to `slice`

Contributor Author:
Good call, updated

cursor_value = self.get_filter_date(self.start_date, stream_state.get(slice["id"], {}).get(self.cursor_field))
if cursor_value:
slice_[self.filter_field] = cursor_value
yield slice_

def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str:
list_id = stream_slice.get("list_id")
@@ -238,7 +266,8 @@ def stream_slices(
campaigns = Campaigns(authenticator=self.authenticator).read_records(sync_mode=SyncMode.full_refresh)
for campaign in campaigns:
slice_ = {"campaign_id": campaign["id"]}
cursor_value = stream_state.get(campaign["id"], {}).get(self.cursor_field)
state_value = stream_state.get(campaign["id"], {}).get(self.cursor_field)
cursor_value = self.get_filter_date(self.start_date, state_value)
if cursor_value:
slice_[self.filter_field] = cursor_value
yield slice_
@@ -359,17 +388,23 @@ def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str:

def parse_response(self, response: requests.Response, stream_state: Mapping[str, Any], stream_slice, **kwargs) -> Iterable[Mapping]:
"""
SegmentMembers endpoint does not support sorting, so we need to filter out records that are older than the current state
The SegmentMembers endpoint does not support sorting or filtering,
so we need to apply our own filtering logic before reading.
"""
response = super().parse_response(response, **kwargs)

# Calculate the filter date to compare all records against in this slice
slice_cursor_value = stream_state.get(str(stream_slice.get("segment_id")), {}).get(self.cursor_field)
filter_date = self.get_filter_date(self.start_date, slice_cursor_value)

for record in response:
# Add the segment_id foreign_key to each record
record["segment_id"] = stream_slice.get("segment_id")

current_cursor_value = stream_state.get(str(record.get("segment_id")), {}).get(self.cursor_field)
record_cursor_value = record.get(self.cursor_field)
if current_cursor_value is None or record_cursor_value >= current_cursor_value:
if filter_date is None or record_cursor_value >= filter_date:
# Add the segment_id foreign_key to each record
record["segment_id"] = stream_slice.get("segment_id")
yield record

def get_updated_state(self, current_stream_state: MutableMapping[str, Any], latest_record: Mapping[str, Any]) -> Mapping[str, Any]:
@@ -453,15 +488,20 @@ def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str:
campaign_id = stream_slice.get("campaign_id")
return f"reports/{campaign_id}/unsubscribed"

def parse_response(self, response: requests.Response, stream_state: Mapping[str, Any], **kwargs) -> Iterable[Mapping]:
def parse_response(self, response: requests.Response, stream_state: Mapping[str, Any], stream_slice, **kwargs) -> Iterable[Mapping]:
"""
The Unsubscribes endpoint does not support sorting or filtering,
so we need to apply our own filtering logic before reading.
"""

response = super().parse_response(response, **kwargs)

# Unsubscribes endpoint does not support sorting, so we need to filter out records that are older than the current state
slice_cursor_value = stream_state.get(stream_slice.get("campaign_id", {}), {}).get(self.cursor_field)
filter_date = self.get_filter_date(self.start_date, slice_cursor_value)

for record in response:
current_cursor_value = stream_state.get(record.get("campaign_id"), {}).get(self.cursor_field)
record_cursor_value = record.get(self.cursor_field)
if current_cursor_value is None or record_cursor_value >= current_cursor_value:
if filter_date is None or record_cursor_value >= filter_date:
yield record
Collaborator:
This could be pulled out into a separate filtering function; check the Hubspot connector as an example.

Contributor Author (ChristoGrab), Dec 20, 2023:
Thanks for the pointer 👍 A separate method, filter_old_records, is now declared in the base IncrementalMailChimpStream class and invoked by streams which do not support the server-side filter.
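The merged helper itself isn't visible in this diff view; based on the description above, a rough sketch of such a shared method might look like this (the name `filter_old_records` comes from the comment, but the exact signature in the merged code may differ):

```python
from typing import Any, Iterable, Mapping, Optional


def filter_old_records(self, records: Iterable[Mapping[str, Any]], filter_date: Optional[str]) -> Iterable[Mapping[str, Any]]:
    # Sketch of a shared client-side filter on IncrementalMailChimpStream for
    # endpoints without server-side filtering (SegmentMembers, Unsubscribes).
    # Hypothetical shape, inferred from the parse_response logic above.
    for record in records:
        record_cursor_value = record.get(self.cursor_field)
        if filter_date is None or record_cursor_value >= filter_date:
            yield record
```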


def get_updated_state(self, current_stream_state: MutableMapping[str, Any], latest_record: Mapping[str, Any]) -> Mapping[str, Any]:
@@ -14,7 +14,7 @@ def data_center_fixture():

@fixture(name="config")
def config_fixture(data_center):
return {"apikey": f"API_KEY-{data_center}"}
return {"apikey": f"API_KEY-{data_center}", "start_date": "2022-01-01T00:00:00.000Z"}


@fixture(name="access_token")
@@ -88,6 +88,29 @@ def test_wrong_config(wrong_config):
MailChimpAuthenticator().get_auth(wrong_config)


@pytest.mark.parametrize(
"config, expected_return",
[
({}, None),
({"start_date": "2021-01-01T00:00:00.000Z"}, None),
({"start_date": "2021-99-99T79:89:99.123Z"}, "The provided start date is not a valid date. Please check the date you input and try again."),
({"start_date": "2021-01-01T00:00:00.000"}, "Please check the format of the start date against the pattern descriptor."),
({"start_date": "2025-01-25T00:00:00.000Z"}, "The start date cannot be greater than the current date."),
],
ids=[
"No start date",
"Valid start date",
"Invalid start date",
"Invalid format",
"Future start date",
]
)
def test_validate_start_date(config, expected_return):
source = SourceMailchimp()
result = source._validate_start_date(config)
assert result == expected_return


def test_streams_count(config):
streams = SourceMailchimp().streams(config)
assert len(streams) == 12
@@ -170,7 +170,7 @@ def test_stream_parse_json_error(auth, caplog):
# Test case 2: state and next_page_token
(
ListMembers,
{"list_id": "123"},
{"list_id": "123", "since_last_changed": "2023-10-15T00:00:00Z"},
{"123": {"last_changed": "2023-10-15T00:00:00Z"}},
{"offset": 1000},
{
@@ -405,7 +405,8 @@ def test_parse_response(stream_state, expected_records, unsubscribes_stream):
{"campaign_id": "campaign_1", "email_id": "email_4", "timestamp": "2022-01-03T00:00:00Z"},
]
}
records = list(unsubscribes_stream.parse_response(response=mock_response, stream_state=stream_state))
stream_slice = {"campaign_id": "campaign_1"}
records = list(unsubscribes_stream.parse_response(response=mock_response, stream_slice=stream_slice, stream_state=stream_state))
assert records == expected_records


@@ -606,3 +607,42 @@ def test_path(auth, stream, stream_slice, expected_endpoint):
endpoint = stream.path(stream_slice=stream_slice)

assert endpoint == expected_endpoint, f"Stream {stream}: expected path '{expected_endpoint}', got '{endpoint}'"


@pytest.mark.parametrize(
"start_date, state_date, expected_return_value",
[
(
"2021-01-01T00:00:00.000Z",
"2020-01-01T00:00:00+00:00",
"2021-01-01T00:00:00Z"
),
(
"2021-01-01T00:00:00.000Z",
"2023-10-05T00:00:00+00:00",
"2023-10-05T00:00:00+00:00"
),
(
None,
"2022-01-01T00:00:00+00:00",
"2022-01-01T00:00:00+00:00"
),
(
"2020-01-01T00:00:00.000Z",
None,
"2020-01-01T00:00:00Z"
),
(
None,
None,
None
)
]
)
def test_get_filter_date(auth, start_date, state_date, expected_return_value):
"""
Tests that the get_filter_date method returns the correct date string
"""
stream = Campaigns(authenticator=auth, start_date=start_date)
result = stream.get_filter_date(start_date, state_date)
assert result == expected_return_value, f"Expected: {expected_return_value}, Actual: {result}"
6 changes: 4 additions & 2 deletions docs/integrations/sources/mailchimp.md
@@ -54,7 +54,8 @@ For more information on Mailchimp API Keys, please refer to the [official Mailch

<!-- /env:oss -->

6. Click **Set up source** and wait for the tests to complete.
6. (Optional) Provide an **Incremental Sync Start Date** using the provided datepicker, or by programmatically entering a UTC date-time in the format `YYYY-MM-DDThh:mm:ss.sssZ` (see the sample config below). If set, only data generated on or after the configured date-time will be synced for Incremental streams. Leaving this field blank will sync all data returned from the API. Please note that this option has no effect on streams using Full Refresh sync mode.
7. Click **Set up source** and wait for the tests to complete.
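For programmatic setups, the new field simply sits alongside the credentials in the source config; a minimal sketch (the API key value is a placeholder, mirroring the unit-test fixture shown earlier in this PR):

```python
config = {
    "apikey": "API_KEY-us10",  # placeholder; real Mailchimp keys end in a datacenter suffix
    "start_date": "2022-01-01T00:00:00.000Z",  # optional; must match YYYY-MM-DDTHH:MM:SS.000Z
}
```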

<HideInUI>

@@ -122,7 +123,8 @@ Now that you have set up the Mailchimp source connector, check out the following

| Version | Date | Pull Request | Subject |
|---------|------------|----------------------------------------------------------|----------------------------------------------------------------------------|
| 1.0.0 | 2023-11-28 | [32836](https://github.com/airbytehq/airbyte/pull/32836) | Add airbyte-type to `datetime` columns and remove `._links` column |
| 1.1.0 | 2023-12-20 | [32852](https://github.com/airbytehq/airbyte/pull/32852) | Add optional start_date for incremental streams |
| 1.0.0 | 2023-12-19 | [32836](https://github.com/airbytehq/airbyte/pull/32836) | Add airbyte-type to `datetime` columns and remove `._links` column |
| 0.10.0 | 2023-11-23 | [32782](https://github.com/airbytehq/airbyte/pull/32782) | Add SegmentMembers stream |
| 0.9.0 | 2023-11-17 | [32218](https://github.com/airbytehq/airbyte/pull/32218) | Add Interests, InterestCategories, Tags streams |
| 0.8.3 | 2023-11-15 | [32543](https://github.com/airbytehq/airbyte/pull/32543) | Handle empty datetime fields in Reports stream |