From 538a4200f3e1f422b1538db0f6cb3286f7cbc0ac Mon Sep 17 00:00:00 2001 From: Dhroov Makwana Date: Wed, 9 Nov 2022 06:51:23 +0530 Subject: [PATCH] =?UTF-8?q?=F0=9F=8E=89=20New=20Source:=20The=20Guardian?= =?UTF-8?q?=20API=20[low-code=20CDK]=20(#18654)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add new source: The Guardian API * Add documentation * Fix custom paginator, it now stops without throwing an error * Update the-guardian-api.md with PR number and link * Remove catalog file, add titles to all properties in spec.yaml * Add incremental sync, change parameter names * format * remove order from spec * add guardian to source def * auto-bump connector version Co-authored-by: Vincent Koc Co-authored-by: marcosmarxm Co-authored-by: Octavia Squidington III Co-authored-by: Marcos Marx --- .../resources/seed/source_definitions.yaml | 7 ++ .../src/main/resources/seed/source_specs.yaml | 72 +++++++++++ airbyte-integrations/builds.md | 1 + .../source-the-guardian-api/.dockerignore | 6 + .../source-the-guardian-api/Dockerfile | 38 ++++++ .../source-the-guardian-api/README.md | 79 ++++++++++++ .../source-the-guardian-api/__init__.py | 3 + .../acceptance-test-config.yml | 30 +++++ .../acceptance-test-docker.sh | 16 +++ .../source-the-guardian-api/bootstrap.md | 46 +++++++ .../source-the-guardian-api/build.gradle | 9 ++ .../integration_tests/__init__.py | 3 + .../integration_tests/abnormal_state.json | 5 + .../integration_tests/acceptance.py | 16 +++ .../integration_tests/configured_catalog.json | 13 ++ .../integration_tests/invalid_config.json | 5 + .../integration_tests/sample_config.json | 5 + .../integration_tests/sample_state.json | 5 + .../source-the-guardian-api/main.py | 13 ++ .../source-the-guardian-api/requirements.txt | 2 + .../source-the-guardian-api/setup.py | 29 +++++ .../source_the_guardian_api/__init__.py | 8 ++ .../custom_page_strategy.py | 32 +++++ .../schemas/content.json | 52 ++++++++ 
.../source_the_guardian_api/source.py | 18 +++ .../source_the_guardian_api/spec.yaml | 54 +++++++++ .../the_guardian_api.yaml | 74 ++++++++++++ docs/integrations/sources/the-guardian-api.md | 113 ++++++++++++++++++ 28 files changed, 754 insertions(+) create mode 100644 airbyte-integrations/connectors/source-the-guardian-api/.dockerignore create mode 100644 airbyte-integrations/connectors/source-the-guardian-api/Dockerfile create mode 100644 airbyte-integrations/connectors/source-the-guardian-api/README.md create mode 100644 airbyte-integrations/connectors/source-the-guardian-api/__init__.py create mode 100644 airbyte-integrations/connectors/source-the-guardian-api/acceptance-test-config.yml create mode 100644 airbyte-integrations/connectors/source-the-guardian-api/acceptance-test-docker.sh create mode 100644 airbyte-integrations/connectors/source-the-guardian-api/bootstrap.md create mode 100644 airbyte-integrations/connectors/source-the-guardian-api/build.gradle create mode 100644 airbyte-integrations/connectors/source-the-guardian-api/integration_tests/__init__.py create mode 100644 airbyte-integrations/connectors/source-the-guardian-api/integration_tests/abnormal_state.json create mode 100644 airbyte-integrations/connectors/source-the-guardian-api/integration_tests/acceptance.py create mode 100644 airbyte-integrations/connectors/source-the-guardian-api/integration_tests/configured_catalog.json create mode 100644 airbyte-integrations/connectors/source-the-guardian-api/integration_tests/invalid_config.json create mode 100644 airbyte-integrations/connectors/source-the-guardian-api/integration_tests/sample_config.json create mode 100644 airbyte-integrations/connectors/source-the-guardian-api/integration_tests/sample_state.json create mode 100644 airbyte-integrations/connectors/source-the-guardian-api/main.py create mode 100644 airbyte-integrations/connectors/source-the-guardian-api/requirements.txt create mode 100644 
airbyte-integrations/connectors/source-the-guardian-api/setup.py create mode 100644 airbyte-integrations/connectors/source-the-guardian-api/source_the_guardian_api/__init__.py create mode 100644 airbyte-integrations/connectors/source-the-guardian-api/source_the_guardian_api/custom_page_strategy.py create mode 100644 airbyte-integrations/connectors/source-the-guardian-api/source_the_guardian_api/schemas/content.json create mode 100644 airbyte-integrations/connectors/source-the-guardian-api/source_the_guardian_api/source.py create mode 100644 airbyte-integrations/connectors/source-the-guardian-api/source_the_guardian_api/spec.yaml create mode 100644 airbyte-integrations/connectors/source-the-guardian-api/source_the_guardian_api/the_guardian_api.yaml create mode 100644 docs/integrations/sources/the-guardian-api.md diff --git a/airbyte-config/init/src/main/resources/seed/source_definitions.yaml b/airbyte-config/init/src/main/resources/seed/source_definitions.yaml index ee98e6bcba5dd..e432480f2b6ba 100644 --- a/airbyte-config/init/src/main/resources/seed/source_definitions.yaml +++ b/airbyte-config/init/src/main/resources/seed/source_definitions.yaml @@ -1334,6 +1334,13 @@ icon: timely.svg sourceType: api releaseStage: alpha +- name: The Guardian API + sourceDefinitionId: d42bd69f-6bf0-4d0b-9209-16231af07a92 + dockerRepository: airbyte/source-the-guardian-api + dockerImageTag: 0.1.0 + documentationUrl: https://docs.airbyte.com/integrations/sources/the-guardian-api + sourceType: api + releaseStage: alpha - name: Trello sourceDefinitionId: 8da67652-004c-11ec-9a03-0242ac130003 dockerRepository: airbyte/source-trello diff --git a/airbyte-config/init/src/main/resources/seed/source_specs.yaml b/airbyte-config/init/src/main/resources/seed/source_specs.yaml index b16333418db1c..b5fa3659da938 100644 --- a/airbyte-config/init/src/main/resources/seed/source_specs.yaml +++ b/airbyte-config/init/src/main/resources/seed/source_specs.yaml @@ -12858,6 +12858,78 @@ 
supportsNormalization: false supportsDBT: false supported_destination_sync_modes: [] +- dockerImage: "airbyte/source-the-guardian-api:0.1.0" + spec: + documentationUrl: "https://docs.airbyte.com/integrations/sources/the-guardian-api" + connectionSpecification: + $schema: "http://json-schema.org/draft-07/schema#" + title: "The Guardian Api Spec" + type: "object" + required: + - "api_key" + - "start_date" + additionalProperties: true + properties: + api_key: + title: "API Key" + type: "string" + description: "Your API Key. See here. The key is case sensitive." + airbyte_secret: true + start_date: + title: "Start Date" + type: "string" + description: "Use this to set the minimum date (YYYY-MM-DD) of the results.\ + \ Results older than the start_date will not be shown." + pattern: "^([1-9][0-9]{3})\\-(0?[1-9]|1[012])\\-(0?[1-9]|[12][0-9]|3[01])$" + examples: + - "YYYY-MM-DD" + query: + title: "Query" + type: "string" + description: "(Optional) The query (q) parameter filters the results to\ + \ only those that include that search term. The q parameter supports AND,\ + \ OR and NOT operators." + examples: + - "environment AND NOT water" + - "environment AND political" + - "amusement park" + - "political" + tag: + title: "Tag" + type: "string" + description: "(Optional) A tag is a piece of data that is used by The Guardian\ + \ to categorise content. Use this parameter to filter results by showing\ + \ only the ones matching the entered tag. See here for a list of all tags, and here for the tags endpoint documentation." + examples: + - "environment/recycling" + - "environment/plasticbags" + - "environment/energyefficiency" + section: + title: "Section" + type: "string" + description: "(Optional) Use this to filter the results by a particular\ + \ section. See here for a list of all sections, and here for the sections endpoint documentation." 
+ examples: + - "media" + - "technology" + - "housing-network" + end_date: + title: "End Date" + type: "string" + description: "(Optional) Use this to set the maximum date (YYYY-MM-DD) of\ + \ the results. Results newer than the end_date will not be shown. Default\ + \ is set to the current date (today) for incremental syncs." + pattern: "^([1-9][0-9]{3})\\-(0?[1-9]|1[012])\\-(0?[1-9]|[12][0-9]|3[01])$" + examples: + - "YYYY-MM-DD" + supportsNormalization: false + supportsDBT: false + supported_destination_sync_modes: [] - dockerImage: "airbyte/source-trello:0.1.6" spec: documentationUrl: "https://docs.airbyte.com/integrations/sources/trello" diff --git a/airbyte-integrations/builds.md b/airbyte-integrations/builds.md index 021aa7ec86067..0cc7507f30190 100644 --- a/airbyte-integrations/builds.md +++ b/airbyte-integrations/builds.md @@ -121,6 +121,7 @@ | Strava | [![source-stava](https://img.shields.io/endpoint?url=https%3A%2F%2Fdnsgjos7lj2fu.cloudfront.net%2Ftests%2Fsummary%2Fsource-strava%2Fbadge.json)](https://dnsgjos7lj2fu.cloudfront.net/tests/summary/source-strava) | | Stripe | [![source-stripe](https://img.shields.io/endpoint?url=https%3A%2F%2Fdnsgjos7lj2fu.cloudfront.net%2Ftests%2Fsummary%2Fsource-stripe%2Fbadge.json)](https://dnsgjos7lj2fu.cloudfront.net/tests/summary/source-stripe) | | Tempo | [![source-tempo](https://img.shields.io/endpoint?url=https%3A%2F%2Fdnsgjos7lj2fu.cloudfront.net%2Ftests%2Fsummary%2Fsource-tempo%2Fbadge.json)](https://dnsgjos7lj2fu.cloudfront.net/tests/summary/source-tempo) | +| The Guardian API | [![source-the-guardian-api](https://img.shields.io/endpoint?url=https%3A%2F%2Fdnsgjos7lj2fu.cloudfront.net%2Ftests%2Fsummary%2Fsource-the-guardian-api%2Fbadge.json)](https://dnsgjos7lj2fu.cloudfront.net/tests/summary/source-the-guardian-api) | | TikTok Marketing | 
[![source-tiktok-marketing](https://img.shields.io/endpoint?url=https%3A%2F%2Fdnsgjos7lj2fu.cloudfront.net%2Ftests%2Fsummary%2Fsource-tiktok-marketing%2Fbadge.json)](https://dnsgjos7lj2fu.cloudfront.net/tests/summary/source-tiktok-marketing) | | Trello | [![source-trello](https://img.shields.io/endpoint?url=https%3A%2F%2Fdnsgjos7lj2fu.cloudfront.net%2Ftests%2Fsummary%2Fsource-trello%2Fbadge.json)](https://dnsgjos7lj2fu.cloudfront.net/tests/summary/source-trello) | | Twilio | [![source-twilio](https://img.shields.io/endpoint?url=https%3A%2F%2Fdnsgjos7lj2fu.cloudfront.net%2Ftests%2Fsummary%2Fsource-twilio%2Fbadge.json)](https://dnsgjos7lj2fu.cloudfront.net/tests/summary/source-twilio) | diff --git a/airbyte-integrations/connectors/source-the-guardian-api/.dockerignore b/airbyte-integrations/connectors/source-the-guardian-api/.dockerignore new file mode 100644 index 0000000000000..b63aa473d142b --- /dev/null +++ b/airbyte-integrations/connectors/source-the-guardian-api/.dockerignore @@ -0,0 +1,6 @@ +* +!Dockerfile +!main.py +!source_the_guardian_api +!setup.py +!secrets diff --git a/airbyte-integrations/connectors/source-the-guardian-api/Dockerfile b/airbyte-integrations/connectors/source-the-guardian-api/Dockerfile new file mode 100644 index 0000000000000..97385698dc009 --- /dev/null +++ b/airbyte-integrations/connectors/source-the-guardian-api/Dockerfile @@ -0,0 +1,38 @@ +FROM python:3.9.11-alpine3.15 as base + +# build and load all requirements +FROM base as builder +WORKDIR /airbyte/integration_code + +# upgrade pip to the latest version +RUN apk --no-cache upgrade \ + && pip install --upgrade pip \ + && apk --no-cache add tzdata build-base + + +COPY setup.py ./ +# install necessary packages to a temporary folder +RUN pip install --prefix=/install . 
+ +# build a clean environment +FROM base +WORKDIR /airbyte/integration_code + +# copy all loaded and built libraries to a pure basic image +COPY --from=builder /install /usr/local +# add default timezone settings +COPY --from=builder /usr/share/zoneinfo/Etc/UTC /etc/localtime +RUN echo "Etc/UTC" > /etc/timezone + +# bash is installed for more convenient debugging. +RUN apk --no-cache add bash + +# copy payload code only +COPY main.py ./ +COPY source_the_guardian_api ./source_the_guardian_api + +ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py" +ENTRYPOINT ["python", "/airbyte/integration_code/main.py"] + +LABEL io.airbyte.version=0.1.0 +LABEL io.airbyte.name=airbyte/source-the-guardian-api diff --git a/airbyte-integrations/connectors/source-the-guardian-api/README.md b/airbyte-integrations/connectors/source-the-guardian-api/README.md new file mode 100644 index 0000000000000..4bc60225dafd1 --- /dev/null +++ b/airbyte-integrations/connectors/source-the-guardian-api/README.md @@ -0,0 +1,79 @@ +# The Guardian Api Source + +This is the repository for The Guardian Api configuration-based source connector. +For information about how to use this connector within Airbyte, see [the documentation](https://docs.airbyte.io/integrations/sources/the-guardian-api). + +## Local development + +#### Building via Gradle +You can also build the connector in Gradle. This is typically used in CI and not needed for your development workflow. + +To build using Gradle, from the Airbyte repository root, run: +``` +./gradlew :airbyte-integrations:connectors:source-the-guardian-api:build +``` + +#### Create credentials +**If you are a community contributor**, follow the instructions in the [documentation](https://docs.airbyte.io/integrations/sources/the-guardian-api) +to generate the necessary credentials. Then create a file `secrets/config.json` conforming to the `source_the_guardian_api/spec.yaml` file.
+Note that any directory named `secrets` is gitignored across the entire Airbyte repo, so there is no danger of accidentally checking in sensitive information. +See `integration_tests/sample_config.json` for a sample config file. + +**If you are an Airbyte core member**, copy the credentials in Lastpass under the secret name `source the-guardian-api test creds` +and place them into `secrets/config.json`. + +### Locally running the connector docker image + +#### Build +First, make sure you build the latest Docker image: +``` +docker build . -t airbyte/source-the-guardian-api:dev +``` + +You can also build the connector image via Gradle: +``` +./gradlew :airbyte-integrations:connectors:source-the-guardian-api:airbyteDocker +``` +When building via Gradle, the docker image name and tag, respectively, are the values of the `io.airbyte.name` and `io.airbyte.version` `LABEL`s in +the Dockerfile. + +#### Run +Then run any of the connector commands as follows: +``` +docker run --rm airbyte/source-the-guardian-api:dev spec +docker run --rm -v $(pwd)/secrets:/secrets airbyte/source-the-guardian-api:dev check --config /secrets/config.json +docker run --rm -v $(pwd)/secrets:/secrets airbyte/source-the-guardian-api:dev discover --config /secrets/config.json +docker run --rm -v $(pwd)/secrets:/secrets -v $(pwd)/integration_tests:/integration_tests airbyte/source-the-guardian-api:dev read --config /secrets/config.json --catalog /integration_tests/configured_catalog.json +``` +## Testing + +#### Acceptance Tests +Customize `acceptance-test-config.yml` file to configure tests. See [Source Acceptance Tests](https://docs.airbyte.io/connector-development/testing-connectors/source-acceptance-tests-reference) for more information. +If your connector requires to create or destroy resources for use during acceptance tests create fixtures for it and place them inside integration_tests/acceptance.py. 
+ +To run your integration tests with docker + +### Using gradle to run tests +All commands should be run from airbyte project root. +To run unit tests: +``` +./gradlew :airbyte-integrations:connectors:source-the-guardian-api:unitTest +``` +To run acceptance and custom integration tests: +``` +./gradlew :airbyte-integrations:connectors:source-the-guardian-api:integrationTest +``` + +## Dependency Management +All of your dependencies should go in `setup.py`, NOT `requirements.txt`. The requirements file is only used to connect internal Airbyte dependencies in the monorepo for local development. +We split dependencies between two groups, dependencies that are: +* required for your connector to work need to go to `MAIN_REQUIREMENTS` list. +* required for the testing need to go to `TEST_REQUIREMENTS` list + +### Publishing a new version of the connector +You've checked out the repo, implemented a million dollar feature, and you're ready to share your changes with the world. Now what? +1. Make sure your changes are passing unit and integration tests. +1. Bump the connector version in `Dockerfile` -- just increment the value of the `LABEL io.airbyte.version` appropriately (we use [SemVer](https://semver.org/)). +1. Create a Pull Request. +1. Pat yourself on the back for being an awesome contributor. +1. Someone from Airbyte will take a look at your PR and iterate with you to merge it into master. diff --git a/airbyte-integrations/connectors/source-the-guardian-api/__init__.py b/airbyte-integrations/connectors/source-the-guardian-api/__init__.py new file mode 100644 index 0000000000000..1100c1c58cf51 --- /dev/null +++ b/airbyte-integrations/connectors/source-the-guardian-api/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
+# diff --git a/airbyte-integrations/connectors/source-the-guardian-api/acceptance-test-config.yml b/airbyte-integrations/connectors/source-the-guardian-api/acceptance-test-config.yml new file mode 100644 index 0000000000000..0648031a84aa2 --- /dev/null +++ b/airbyte-integrations/connectors/source-the-guardian-api/acceptance-test-config.yml @@ -0,0 +1,30 @@ +# See [Source Acceptance Tests](https://docs.airbyte.com/connector-development/testing-connectors/source-acceptance-tests-reference) +# for more information about how to configure these tests +connector_image: airbyte/source-the-guardian-api:dev +acceptance_tests: + spec: + tests: + - spec_path: "source_the_guardian_api/spec.yaml" + connection: + tests: + - config_path: "secrets/config.json" + status: "succeed" + - config_path: "integration_tests/invalid_config.json" + status: "failed" + discovery: + tests: + - config_path: "secrets/config.json" + basic_read: + tests: + - config_path: "secrets/config.json" + configured_catalog_path: "integration_tests/configured_catalog.json" + empty_streams: [] + incremental: + tests: + - config_path: "secrets/config.json" + configured_catalog_path: "integration_tests/configured_catalog.json" + future_state_path: "integration_tests/abnormal_state.json" + full_refresh: + tests: + - config_path: "secrets/config.json" + configured_catalog_path: "integration_tests/configured_catalog.json" diff --git a/airbyte-integrations/connectors/source-the-guardian-api/acceptance-test-docker.sh b/airbyte-integrations/connectors/source-the-guardian-api/acceptance-test-docker.sh new file mode 100644 index 0000000000000..c51577d10690c --- /dev/null +++ b/airbyte-integrations/connectors/source-the-guardian-api/acceptance-test-docker.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env sh + +# Build latest connector image +docker build . 
-t $(cat acceptance-test-config.yml | grep "connector_image" | head -n 1 | cut -d: -f2-) + +# Pull latest acctest image +docker pull airbyte/source-acceptance-test:latest + +# Run +docker run --rm -it \ + -v /var/run/docker.sock:/var/run/docker.sock \ + -v /tmp:/tmp \ + -v $(pwd):/test_input \ + airbyte/source-acceptance-test \ + --acceptance-test-config /test_input + diff --git a/airbyte-integrations/connectors/source-the-guardian-api/bootstrap.md b/airbyte-integrations/connectors/source-the-guardian-api/bootstrap.md new file mode 100644 index 0000000000000..816bdb38ab47c --- /dev/null +++ b/airbyte-integrations/connectors/source-the-guardian-api/bootstrap.md @@ -0,0 +1,46 @@ +# The Guardian API + +## Overview + +[The Guardian Open Platform](https://open-platform.theguardian.com/) is a public web service for accessing all the content the Guardian creates, categorised by tags and section. To get started, you need a key to successfully authenticate against the API. The Guardian API Connector is implemented with the [Airbyte Low-Code CDK](https://docs.airbyte.com/connector-development/config-based/low-code-cdk-overview). + +## Output Format + +#### Each content item has the following structure:- + +```yaml +{ + "id": "string", + "type": "string", + "sectionId": "string", + "sectionName": "string", + "webPublicationDate": "string", + "webTitle": "string", + "webUrl": "string", + "apiUrl": "string", + "isHosted": "boolean", + "pillarId": "string", + "pillarName": "string" +} +``` + +**Description:-** + +**webPublicationDate**: The combined date and time of publication +**webUrl**: The URL of the HTML content +**apiUrl**: The URL of the raw content + +## Core Streams + +Connector supports the `content` stream that returns all pieces of content in the API.
+ +## Rate Limiting + +The key that you are assigned is rate-limited and as such any applications that depend on making large numbers of requests on a polling basis are likely to exceed their daily quota and thus be prevented from making further requests until the next period begins. + +## Authentication and Permissions + +To access the API, you will need to sign up for an API key, which should be sent with every request. Visit [this](https://open-platform.theguardian.com/access) link to get an API key. +The easiest way to see what data is included is to explore the data. You can build complex queries quickly and browse the results. Visit [this](https://open-platform.theguardian.com/explore) link to explore the data. + +See [this](https://docs.airbyte.io/integrations/sources/the-guardian-api) link for the connector docs. diff --git a/airbyte-integrations/connectors/source-the-guardian-api/build.gradle b/airbyte-integrations/connectors/source-the-guardian-api/build.gradle new file mode 100644 index 0000000000000..7148bffa332d1 --- /dev/null +++ b/airbyte-integrations/connectors/source-the-guardian-api/build.gradle @@ -0,0 +1,9 @@ +plugins { + id 'airbyte-python' + id 'airbyte-docker' + id 'airbyte-source-acceptance-test' +} + +airbytePython { + moduleDirectory 'source_the_guardian_api' +} diff --git a/airbyte-integrations/connectors/source-the-guardian-api/integration_tests/__init__.py b/airbyte-integrations/connectors/source-the-guardian-api/integration_tests/__init__.py new file mode 100644 index 0000000000000..1100c1c58cf51 --- /dev/null +++ b/airbyte-integrations/connectors/source-the-guardian-api/integration_tests/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
+# diff --git a/airbyte-integrations/connectors/source-the-guardian-api/integration_tests/abnormal_state.json b/airbyte-integrations/connectors/source-the-guardian-api/integration_tests/abnormal_state.json new file mode 100644 index 0000000000000..39645fa011f22 --- /dev/null +++ b/airbyte-integrations/connectors/source-the-guardian-api/integration_tests/abnormal_state.json @@ -0,0 +1,5 @@ +{ + "content": { + "webPublicationDate": "2123-10-31T10:10:10Z" + } +} diff --git a/airbyte-integrations/connectors/source-the-guardian-api/integration_tests/acceptance.py b/airbyte-integrations/connectors/source-the-guardian-api/integration_tests/acceptance.py new file mode 100644 index 0000000000000..1302b2f57e10e --- /dev/null +++ b/airbyte-integrations/connectors/source-the-guardian-api/integration_tests/acceptance.py @@ -0,0 +1,16 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + + +import pytest + +pytest_plugins = ("source_acceptance_test.plugin",) + + +@pytest.fixture(scope="session", autouse=True) +def connector_setup(): + """This fixture is a placeholder for external resources that acceptance test might require.""" + # TODO: setup test dependencies if needed. 
otherwise remove the TODO comments + yield + # TODO: clean up test dependencies diff --git a/airbyte-integrations/connectors/source-the-guardian-api/integration_tests/configured_catalog.json b/airbyte-integrations/connectors/source-the-guardian-api/integration_tests/configured_catalog.json new file mode 100644 index 0000000000000..23bee6c786302 --- /dev/null +++ b/airbyte-integrations/connectors/source-the-guardian-api/integration_tests/configured_catalog.json @@ -0,0 +1,13 @@ +{ + "streams": [ + { + "stream": { + "name": "content", + "json_schema": {}, + "supported_sync_modes": ["full_refresh", "incremental"] + }, + "sync_mode": "incremental", + "destination_sync_mode": "overwrite" + } + ] +} diff --git a/airbyte-integrations/connectors/source-the-guardian-api/integration_tests/invalid_config.json b/airbyte-integrations/connectors/source-the-guardian-api/integration_tests/invalid_config.json new file mode 100644 index 0000000000000..8cab379396176 --- /dev/null +++ b/airbyte-integrations/connectors/source-the-guardian-api/integration_tests/invalid_config.json @@ -0,0 +1,5 @@ +{ + "api_key": "", + "query": "water OR rain", + "start_date": "2022-10-25" +} diff --git a/airbyte-integrations/connectors/source-the-guardian-api/integration_tests/sample_config.json b/airbyte-integrations/connectors/source-the-guardian-api/integration_tests/sample_config.json new file mode 100644 index 0000000000000..eb40df2dabcab --- /dev/null +++ b/airbyte-integrations/connectors/source-the-guardian-api/integration_tests/sample_config.json @@ -0,0 +1,5 @@ +{ + "api_key": "", + "query": "water OR rain OR thunder", + "start_date": "2022-10-25" +} diff --git a/airbyte-integrations/connectors/source-the-guardian-api/integration_tests/sample_state.json b/airbyte-integrations/connectors/source-the-guardian-api/integration_tests/sample_state.json new file mode 100644 index 0000000000000..aba11a25ab694 --- /dev/null +++ 
b/airbyte-integrations/connectors/source-the-guardian-api/integration_tests/sample_state.json @@ -0,0 +1,5 @@ +{ + "content": { + "webPublicationDate": "2022-10-25T10:10:10Z" + } +} diff --git a/airbyte-integrations/connectors/source-the-guardian-api/main.py b/airbyte-integrations/connectors/source-the-guardian-api/main.py new file mode 100644 index 0000000000000..dfcf2b6a88c5e --- /dev/null +++ b/airbyte-integrations/connectors/source-the-guardian-api/main.py @@ -0,0 +1,13 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + + +import sys + +from airbyte_cdk.entrypoint import launch +from source_the_guardian_api import SourceTheGuardianApi + +if __name__ == "__main__": + source = SourceTheGuardianApi() + launch(source, sys.argv[1:]) diff --git a/airbyte-integrations/connectors/source-the-guardian-api/requirements.txt b/airbyte-integrations/connectors/source-the-guardian-api/requirements.txt new file mode 100644 index 0000000000000..0411042aa0911 --- /dev/null +++ b/airbyte-integrations/connectors/source-the-guardian-api/requirements.txt @@ -0,0 +1,2 @@ +-e ../../bases/source-acceptance-test +-e . diff --git a/airbyte-integrations/connectors/source-the-guardian-api/setup.py b/airbyte-integrations/connectors/source-the-guardian-api/setup.py new file mode 100644 index 0000000000000..3f15ab9cf2522 --- /dev/null +++ b/airbyte-integrations/connectors/source-the-guardian-api/setup.py @@ -0,0 +1,29 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
+# + + +from setuptools import find_packages, setup + +MAIN_REQUIREMENTS = [ + "airbyte-cdk~=0.1", +] + +TEST_REQUIREMENTS = [ + "pytest~=6.1", + "pytest-mock~=3.6.1", + "source-acceptance-test", +] + +setup( + name="source_the_guardian_api", + description="Source implementation for The Guardian Api.", + author="Airbyte", + author_email="contact@airbyte.io", + packages=find_packages(), + install_requires=MAIN_REQUIREMENTS, + package_data={"": ["*.json", "*.yaml", "schemas/*.json", "schemas/shared/*.json"]}, + extras_require={ + "tests": TEST_REQUIREMENTS, + }, +) diff --git a/airbyte-integrations/connectors/source-the-guardian-api/source_the_guardian_api/__init__.py b/airbyte-integrations/connectors/source-the-guardian-api/source_the_guardian_api/__init__.py new file mode 100644 index 0000000000000..68d4ea3908649 --- /dev/null +++ b/airbyte-integrations/connectors/source-the-guardian-api/source_the_guardian_api/__init__.py @@ -0,0 +1,8 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + + +from .source import SourceTheGuardianApi + +__all__ = ["SourceTheGuardianApi"] diff --git a/airbyte-integrations/connectors/source-the-guardian-api/source_the_guardian_api/custom_page_strategy.py b/airbyte-integrations/connectors/source-the-guardian-api/source_the_guardian_api/custom_page_strategy.py new file mode 100644 index 0000000000000..e8c02d9529be7 --- /dev/null +++ b/airbyte-integrations/connectors/source-the-guardian-api/source_the_guardian_api/custom_page_strategy.py @@ -0,0 +1,32 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from dataclasses import dataclass +from typing import Any, List, Mapping, Optional + +import requests +from airbyte_cdk.sources.declarative.requesters.paginators.strategies.page_increment import PageIncrement + + +@dataclass +class CustomPageIncrement(PageIncrement): + """ + Starts page from 1 instead of the default value that is 0. Stops Pagination when currentPage is equal to totalPages. 
+ """ + + def next_page_token(self, response: requests.Response, last_records: List[Mapping[str, Any]]) -> Optional[Any]: + res = response.json().get("response") + currPage = res.get("currentPage") + totalPages = res.get("pages") + if currPage < totalPages: + self._page += 1 + return self._page + else: + return None + + def __post_init__(self, options: Mapping[str, Any]): + self._page = 1 + + def reset(self): + self._page = 1 diff --git a/airbyte-integrations/connectors/source-the-guardian-api/source_the_guardian_api/schemas/content.json b/airbyte-integrations/connectors/source-the-guardian-api/source_the_guardian_api/schemas/content.json new file mode 100644 index 0000000000000..3751793358c4f --- /dev/null +++ b/airbyte-integrations/connectors/source-the-guardian-api/source_the_guardian_api/schemas/content.json @@ -0,0 +1,52 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "type": { + "type": "string" + }, + "sectionId": { + "type": "string" + }, + "sectionName": { + "type": "string" + }, + "webPublicationDate": { + "type": "string" + }, + "webTitle": { + "type": "string" + }, + "webUrl": { + "type": "string" + }, + "apiUrl": { + "type": "string" + }, + "isHosted": { + "type": "boolean" + }, + "pillarId": { + "type": "string" + }, + "pillarName": { + "type": "string" + } + }, + "required": [ + "id", + "type", + "sectionId", + "sectionName", + "webPublicationDate", + "webTitle", + "webUrl", + "apiUrl", + "isHosted", + "pillarId", + "pillarName" + ] +} diff --git a/airbyte-integrations/connectors/source-the-guardian-api/source_the_guardian_api/source.py b/airbyte-integrations/connectors/source-the-guardian-api/source_the_guardian_api/source.py new file mode 100644 index 0000000000000..19ae06a52dae8 --- /dev/null +++ b/airbyte-integrations/connectors/source-the-guardian-api/source_the_guardian_api/source.py @@ -0,0 +1,18 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights 
reserved. +# + +from airbyte_cdk.sources.declarative.yaml_declarative_source import YamlDeclarativeSource + +""" +This file provides the necessary constructs to interpret a provided declarative YAML configuration file into +source connector. + +WARNING: Do not modify this file. +""" + + +# Declarative Source +class SourceTheGuardianApi(YamlDeclarativeSource): + def __init__(self): + super().__init__(**{"path_to_yaml": "the_guardian_api.yaml"}) diff --git a/airbyte-integrations/connectors/source-the-guardian-api/source_the_guardian_api/spec.yaml b/airbyte-integrations/connectors/source-the-guardian-api/source_the_guardian_api/spec.yaml new file mode 100644 index 0000000000000..b9e0e1f3a7671 --- /dev/null +++ b/airbyte-integrations/connectors/source-the-guardian-api/source_the_guardian_api/spec.yaml @@ -0,0 +1,54 @@ +documentationUrl: https://docs.airbyte.com/integrations/sources/the-guardian-api +connectionSpecification: + $schema: http://json-schema.org/draft-07/schema# + title: The Guardian Api Spec + type: object + required: + - api_key + - start_date + additionalProperties: true + properties: + api_key: + title: API Key + type: string + description: Your API Key. See here. The key is case sensitive. + airbyte_secret: true + start_date: + title: Start Date + type: string + description: Use this to set the minimum date (YYYY-MM-DD) of the results. Results older than the start_date will not be shown. + pattern: ^([1-9][0-9]{3})\-(0?[1-9]|1[012])\-(0?[1-9]|[12][0-9]|3[01])$ + examples: + - YYYY-MM-DD + query: + title: Query + type: string + description: (Optional) The query (q) parameter filters the results to only those that include that search term. The q parameter supports AND, OR and NOT operators. + examples: + - environment AND NOT water + - environment AND political + - amusement park + - political + tag: + title: Tag + type: string + description: (Optional) A tag is a piece of data that is used by The Guardian to categorise content. 
# definitions.requester: HTTP settings referenced by definitions.retriever.
requester:
  url_base: "https://content.guardianapis.com"
  http_method: "GET"
  request_options_provider:
    request_parameters:
      # Values below are interpolated from the connector configuration.
      api-key: "{{ config['api_key'] }}"
      q: "{{ config['query'] }}"
      tag: "{{ config['tag'] }}"
      section: "{{ config['section'] }}"
      # Results are always requested oldest-first.
      order-by: 'oldest'
      # from-date / to-date are not set here: definitions.stream_slicer injects
      # them as request parameters through its start_time_option / end_time_option.
# definitions.retriever: wires together record extraction, pagination,
# the HTTP requester and the date-based stream slicer.
retriever:
  record_selector:
    $ref: "*ref(definitions.selector)"
  paginator:
    type: "DefaultPaginator"
    url_base: "*ref(definitions.requester.url_base)"
    pagination_strategy:
      class_name: "source_the_guardian_api.custom_page_strategy.CustomPageIncrement"
      page_size: 10
    page_token_option:
      inject_into: "request_parameter"
      field_name: "page"
    # The requester issues GET requests, so the page size must travel in the
    # query string; the Content API names that parameter `page-size`.
    page_size_option:
      inject_into: "request_parameter"
      field_name: "page-size"
  requester:
    $ref: "*ref(definitions.requester)"
  stream_slicer:
    $ref: "*ref(definitions.stream_slicer)"
+ +The following parameters can be provided to the connector. `start_date` is required; the others are optional: + +--- + +##### `q` (query) + +The `q` (query) parameter filters the results to only those that include the given search term. The `q` parameter supports the `AND`, `OR` and `NOT` operators. For example, to see whether the Guardian has any content on political debates: `https://content.guardianapis.com/search?q=debates` + +That query returns many results, so we might want to narrow the response down to something more meaningful, for example political content published since the start of 2014: `https://content.guardianapis.com/search?q=debate&tag=politics/politics&from-date=2014-01-01&api-key=test` + +--- + +##### `tag` + +A tag is a piece of data that is used to categorise content. All Guardian content is manually categorised using these tags, of which there are more than 50,000. Use this parameter to filter results by showing only the ones matching the entered tag. See [here](https://content.guardianapis.com/tags?api-key=test) for a list of all tags, and [here](https://open-platform.theguardian.com/documentation/tag) for the tags endpoint documentation. + +--- + +##### `section` + +Use this to filter the results by a particular section. See [here](https://content.guardianapis.com/sections?api-key=test) for a list of all sections, and [here](https://open-platform.theguardian.com/documentation/section) for the sections endpoint documentation. + +--- + +##### `order-by` + +The Content API can sort results by `newest`, `oldest` or `relevance`. To enable incremental syncs the connector always requests `order-by=oldest`, so this is not a user-configurable setting. + +--- + +##### `start_date` + +Use this to set the minimum date (YYYY-MM-DD) of the results. Results older than the start_date will not be shown. + +--- + +##### `end_date` + +Use this to set the maximum date (YYYY-MM-DD) of the results. Results newer than the end_date will not be shown. +Default is set to the current date (today) for incremental syncs.
+ +--- + +## Output schema + +#### Each content item (news article) has the following structure: + +```json +{ + "id": "string", + "type": "string", + "sectionId": "string", + "sectionName": "string", + "webPublicationDate": "string", + "webTitle": "string", + "webUrl": "string", + "apiUrl": "string", + "isHosted": "boolean", + "pillarId": "string", + "pillarName": "string" +} +``` + +The source is capable of syncing the content stream. + +## Setup guide + +## Step 1: Set up The Guardian API connector in Airbyte + +### For Airbyte Cloud: + +1. [Log into your Airbyte Cloud](https://cloud.airbyte.io/workspaces) account. +2. In the left navigation bar, click **Sources**. In the top-right corner, click **+new source**. +3. On the Set up the source page, select **The Guardian API** from the Source type dropdown. +4. Enter your `api_key` and `start_date` (both mandatory) and any optional parameters you need. +5. Click **Set up source**. + +### For Airbyte OSS: + +1. Navigate to the Airbyte Open Source dashboard. +2. Set the name for your source (The Guardian API). +3. Enter your `api_key` and `start_date` (both mandatory) and any optional parameters you need. +4. Click **Set up source**. + +## Supported sync modes + +The Guardian API source connector supports the following [sync modes](https://docs.airbyte.com/cloud/core-concepts#connection-sync-modes): + +| Feature | Supported? | +| :---------------- | :--------- | +| Full Refresh Sync | Yes | +| Incremental Sync | No | +| Namespaces | No | + +## Performance considerations + +The API key you are assigned is rate-limited, so applications that make large numbers of requests on a polling basis are likely to exceed their daily quota and be prevented from making further requests until the next period begins.
+ +## Changelog + +| Version | Date | Pull Request | Subject | +| :------ | :--------- | :-------------------------------------------------------- | :--------------------------------------------- | +| 0.1.0 | 2022-10-30 | [#18654](https://github.com/airbytehq/airbyte/pull/18654) | 🎉 New Source: The Guardian API [low-code CDK] |