From 746ccb40353071533c166fd016600f9e5718505e Mon Sep 17 00:00:00 2001 From: mpetrykin <40140203+mpetrykin@users.noreply.github.com> Date: Wed, 31 May 2023 20:57:34 +0300 Subject: [PATCH] Source Google Analytics (GA4): Fix Pagination (#26126) * Fix paggination and add offset and limit to acceptable parameters in request body * Change next_page_token and add tests * Update dockerImageTag * Update PR version * Remove minimum, maximum, pattern fields * Remove pattern limit and offset from test_source.py * Remove offset and limit string type * Remove offset and limit string type * Increase limit number to 100000 and remove limit and offset from parameters * Change return type value of next_page_token from int to dict * Change return type value of next_page_token from int to dict * Change page_size to offset and add constant PAGE_SIZE equals 100000 * Add comment to PAGE_SIZE constant and add constant to unit tests * Remove offset and limit from PivotReport * Import PAGE_SIZE in unit_tests from source.py --------- Co-authored-by: Marcos Marx Co-authored-by: sh4sh <6833405+sh4sh@users.noreply.github.com> --- .../Dockerfile | 2 +- .../metadata.yaml | 2 +- .../source.py | 37 ++++++++++++--- .../unit_tests/test_streams.py | 46 ++++++++----------- .../sources/google-analytics-data-api.md | 25 +++++----- 5 files changed, 64 insertions(+), 48 deletions(-) diff --git a/airbyte-integrations/connectors/source-google-analytics-data-api/Dockerfile b/airbyte-integrations/connectors/source-google-analytics-data-api/Dockerfile index 09f7dd1b491a9e..3a2b5695e86993 100644 --- a/airbyte-integrations/connectors/source-google-analytics-data-api/Dockerfile +++ b/airbyte-integrations/connectors/source-google-analytics-data-api/Dockerfile @@ -28,5 +28,5 @@ COPY source_google_analytics_data_api ./source_google_analytics_data_api ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py" ENTRYPOINT ["python", "/airbyte/integration_code/main.py"] -LABEL io.airbyte.version=0.2.2 +LABEL io.airbyte.version=0.2.3 LABEL io.airbyte.name=airbyte/source-google-analytics-data-api diff --git a/airbyte-integrations/connectors/source-google-analytics-data-api/metadata.yaml b/airbyte-integrations/connectors/source-google-analytics-data-api/metadata.yaml index e0927a196136e8..01d47a913f5216 100644 --- a/airbyte-integrations/connectors/source-google-analytics-data-api/metadata.yaml +++ b/airbyte-integrations/connectors/source-google-analytics-data-api/metadata.yaml @@ -7,7 +7,7 @@ data: connectorSubtype: api connectorType: source definitionId: 3cc2eafd-84aa-4dca-93af-322d9dfeec1a - dockerImageTag: 0.2.2 + dockerImageTag: 0.2.3 dockerRepository: airbyte/source-google-analytics-data-api githubIssueLabel: source-google-analytics-data-api icon: google-analytics.svg diff --git a/airbyte-integrations/connectors/source-google-analytics-data-api/source_google_analytics_data_api/source.py b/airbyte-integrations/connectors/source-google-analytics-data-api/source_google_analytics_data_api/source.py index a36b50fd71c8d5..3ca235d65f2e68 100644 --- a/airbyte-integrations/connectors/source-google-analytics-data-api/source_google_analytics_data_api/source.py +++ b/airbyte-integrations/connectors/source-google-analytics-data-api/source_google_analytics_data_api/source.py @@ -29,6 +29,10 @@ # the initial values should be saved once and tracked for each stream, inclusivelly. GoogleAnalyticsQuotaHandler: GoogleAnalyticsApiQuota = GoogleAnalyticsApiQuota() +# set page_size to 100000 due to determination of maximum limit value in official documentation +# https://developers.google.com/analytics/devguides/reporting/data/v1/basics#pagination +PAGE_SIZE = 100000 + class ConfigurationError(Exception): pass @@ -90,6 +94,7 @@ class GoogleAnalyticsDataApiBaseStream(GoogleAnalyticsDataApiAbstractStream): _record_date_format = "%Y%m%d" primary_key = "uuid" + offset = 0 metadata = MetadataDescriptor() @@ -154,13 +159,19 @@ def get_json_schema(self) -> Mapping[str, Any]: def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]: r = response.json() - if all(key in r for key in ["limit", "offset", "rowCount"]): - limit, offset, total_rows = r["limit"], r["offset"], r["rowCount"] + if "rowCount" in r: + total_rows = r["rowCount"] + + if self.offset == 0: + self.offset = PAGE_SIZE + else: + self.offset += PAGE_SIZE - if total_rows <= offset: - return None + if total_rows <= self.offset: + self.offset = 0 + return - return {"limit": limit, "offset": offset + limit} + return {"offset": self.offset} def path( self, *, stream_state: Mapping[str, Any] = None, stream_slice: Mapping[str, Any] = None, next_page_token: Mapping[str, Any] = None @@ -201,12 +212,17 @@ def request_body_json( stream_slice: Mapping[str, Any] = None, next_page_token: Mapping[str, Any] = None, ) -> Optional[Mapping]: + payload = { "metrics": [{"name": m} for m in self.config["metrics"]], "dimensions": [{"name": d} for d in self.config["dimensions"]], "dateRanges": [stream_slice], "returnPropertyQuota": True, + "offset": str(0), + "limit": str(PAGE_SIZE) } + if next_page_token and next_page_token.get("offset") is not None: + payload.update({"offset": str(next_page_token["offset"])}) return payload def stream_slices( @@ -243,6 +259,11 @@ def request_body_json( next_page_token: Mapping[str, Any] = None, ) -> Optional[Mapping]: payload = super().request_body_json(stream_state, stream_slice, next_page_token) + + # remove offset and limit fields according to their absence in + # https://developers.google.com/analytics/devguides/reporting/data/v1/rest/v1beta/properties/runPivotReport + payload.pop("offset", None) + payload.pop("limit", None) payload["pivots"] = self.config["pivots"] return payload @@ -398,7 +419,11 @@ def streams(self, config: Mapping[str, Any]) -> List[Stream]: def instantiate_report_class(report: dict, config: Mapping[str, Any]) -> GoogleAnalyticsDataApiBaseStream: cohort_spec = report.get("cohortSpec") pivots = report.get("pivots") - stream_config = {"metrics": report["metrics"], "dimensions": report["dimensions"], **config} + stream_config = { + "metrics": report["metrics"], + "dimensions": report["dimensions"], + **config + } report_class_tuple = (GoogleAnalyticsDataApiBaseStream,) if pivots: stream_config["pivots"] = pivots diff --git a/airbyte-integrations/connectors/source-google-analytics-data-api/unit_tests/test_streams.py b/airbyte-integrations/connectors/source-google-analytics-data-api/unit_tests/test_streams.py index dfc64ac79f4987..8a774994334a3d 100644 --- a/airbyte-integrations/connectors/source-google-analytics-data-api/unit_tests/test_streams.py +++ b/airbyte-integrations/connectors/source-google-analytics-data-api/unit_tests/test_streams.py @@ -10,10 +10,11 @@ import pytest from freezegun import freeze_time -from source_google_analytics_data_api.source import GoogleAnalyticsDataApiBaseStream +from source_google_analytics_data_api.source import GoogleAnalyticsDataApiBaseStream, PAGE_SIZE from .utils import read_incremental + json_credentials = """ { "type": "service_account", @@ -88,6 +89,8 @@ def test_request_body_json(patch_base_class): ], "dateRanges": [request_body_params["stream_slice"]], "returnPropertyQuota": True, + "offset": str(0), + "limit": str(PAGE_SIZE) } request_body_json = GoogleAnalyticsDataApiBaseStream(authenticator=MagicMock(), config=patch_base_class["config"]).request_body_json(**request_body_params) @@ -98,21 +101,15 @@ def test_next_page_token_equal_chunk(patch_base_class): stream = GoogleAnalyticsDataApiBaseStream(authenticator=MagicMock(), config=patch_base_class["config"]) response = MagicMock() response.json.side_effect = [ - {"limit": 100000, "offset": 0, "rowCount": 200000}, - {"limit": 100000, "offset": 100000, "rowCount": 200000}, - {"limit": 100000, "offset": 200000, "rowCount": 200000}, + {"rowCount": 300000}, + {"rowCount": 300000}, + {"rowCount": 300000}, ] inputs = {"response": response} expected_tokens = [ - { - "limit": 100000, - "offset": 100000, - }, - { - "limit": 100000, - "offset": 200000, - }, + {"offset": 100000}, + {"offset": 200000}, None, ] @@ -124,26 +121,19 @@ def test_next_page_token(patch_base_class): stream = GoogleAnalyticsDataApiBaseStream(authenticator=MagicMock(), config=patch_base_class["config"]) response = MagicMock() response.json.side_effect = [ - {"limit": 100000, "offset": 0, "rowCount": 250000}, - {"limit": 100000, "offset": 100000, "rowCount": 250000}, - {"limit": 100000, "offset": 200000, "rowCount": 250000}, - {"limit": 100000, "offset": 300000, "rowCount": 250000}, + {"rowCount": 450000}, + {"rowCount": 450000}, + {"rowCount": 450000}, + {"rowCount": 450000}, + {"rowCount": 450000}, ] inputs = {"response": response} expected_tokens = [ - { - "limit": 100000, - "offset": 100000, - }, - { - "limit": 100000, - "offset": 200000, - }, - { - "limit": 100000, - "offset": 300000, - }, + {"offset": 100000}, + {"offset": 200000}, + {"offset": 300000}, + {"offset": 400000}, None, ] diff --git a/docs/integrations/sources/google-analytics-data-api.md b/docs/integrations/sources/google-analytics-data-api.md index 9f9eccae1918de..e98bf33a93da49 100644 --- a/docs/integrations/sources/google-analytics-data-api.md +++ b/docs/integrations/sources/google-analytics-data-api.md @@ -110,15 +110,16 @@ This connector outputs the following incremental streams: ## Changelog -| Version | Date | Pull Request | Subject | -|:--------|:-----------|:---------------------------------------------------------|:---------------------------------------------------------------------------------------| -| 0.2.2 | 2023-05-12 | [25987](https://github.com/airbytehq/airbyte/pull/25987) | Categorized Config Errors Accurately | -| 0.2.1 | 2023-05-11 | [26008](https://github.com/airbytehq/airbyte/pull/26008) | Added handling for `429 - potentiallyThresholdedRequestsPerHour` error | -| 0.2.0 | 2023-04-13 | [25179](https://github.com/airbytehq/airbyte/pull/25179) | Implement support for custom Cohort and Pivot reports | -| 0.1.3 | 2023-03-10 | [23872](https://github.com/airbytehq/airbyte/pull/23872) | Fix parse + cursor for custom reports | -| 0.1.2 | 2023-03-07 | [23822](https://github.com/airbytehq/airbyte/pull/23822) | Improve `rate limits` customer faced error messages and retry logic for `429` | -| 0.1.1 | 2023-01-10 | [21169](https://github.com/airbytehq/airbyte/pull/21169) | Slicer updated, unit tests added | -| 0.1.0 | 2023-01-08 | [20889](https://github.com/airbytehq/airbyte/pull/20889) | Improved config validation, SAT | -| 0.0.3 | 2022-08-15 | [15229](https://github.com/airbytehq/airbyte/pull/15229) | Source Google Analytics Data Api: code refactoring | -| 0.0.2 | 2022-07-27 | [15087](https://github.com/airbytehq/airbyte/pull/15087) | fix documentationUrl | -| 0.0.1 | 2022-05-09 | [12701](https://github.com/airbytehq/airbyte/pull/12701) | Introduce Google Analytics Data API source | +| Version | Date | Pull Request | Subject | +|:--------|:-----------|:---------------------------------------------------------|:------------------------------------------------------------------------------| +| 0.2.3 | 2023-05-16 | [26126](https://github.com/airbytehq/airbyte/pull/26126) | Fix pagination | +| 0.2.2 | 2023-05-12 | [25987](https://github.com/airbytehq/airbyte/pull/25987) | Categorized Config Errors Accurately | +| 0.2.1 | 2023-05-11 | [26008](https://github.com/airbytehq/airbyte/pull/26008) | Added handling for `429 - potentiallyThresholdedRequestsPerHour` error | +| 0.2.0 | 2023-04-13 | [25179](https://github.com/airbytehq/airbyte/pull/25179) | Implement support for custom Cohort and Pivot reports | +| 0.1.3 | 2023-03-10 | [23872](https://github.com/airbytehq/airbyte/pull/23872) | Fix parse + cursor for custom reports | +| 0.1.2 | 2023-03-07 | [23822](https://github.com/airbytehq/airbyte/pull/23822) | Improve `rate limits` customer faced error messages and retry logic for `429` | +| 0.1.1 | 2023-01-10 | [21169](https://github.com/airbytehq/airbyte/pull/21169) | Slicer updated, unit tests added | +| 0.1.0 | 2023-01-08 | [20889](https://github.com/airbytehq/airbyte/pull/20889) | Improved config validation, SAT | +| 0.0.3 | 2022-08-15 | [15229](https://github.com/airbytehq/airbyte/pull/15229) | Source Google Analytics Data Api: code refactoring | +| 0.0.2 | 2022-07-27 | [15087](https://github.com/airbytehq/airbyte/pull/15087) | fix documentationUrl | +| 0.0.1 | 2022-05-09 | [12701](https://github.com/airbytehq/airbyte/pull/12701) | Introduce Google Analytics Data API source |