diff --git a/HISTORY.md b/HISTORY.md index a456665..4bed722 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,11 @@ ## History +### 0.1.1 (2020-11-08) + +- Custom header (See README.md) +- Raise when an invalid type is set in schema +- Treat numbers with leading zeros as string + ### 0.1.0b2 (2020-08-12) Project description update only. diff --git a/README.md b/README.md index 2f30da7..f0a4691 100644 --- a/README.md +++ b/README.md @@ -77,10 +77,9 @@ command-line argument: ### Step 2: [Optional] Create a custom spec for config file: -If you would like to define more configuration variables, create a spec -file. Anything you define overwrites the default spec. - -A spec file example (./examples/usgs/custom_spec.json): +If you would like to define more configuration variables, create a spec file. +Here is an +[example](https://github.com/anelendata/tap-rest-api/blob/master/examples/usgs/custom_spec.json): ``` { "args": { @@ -94,7 +93,16 @@ A spec file example (./examples/usgs/custom_spec.json): } ``` -### Step 3. Create Config file based on the spec: +Anything you define here overwrites +[default_spec.json](https://github.com/anelendata/tap-rest-api/blob/master/tap_rest_api/default_spec.json). + +### Step 3. Create Config file: + +Now create a config file. Note the difference between the spec file and the config file. +The role of the spec file is to create or alter the config specs, and the role of +the config file is to provide the values to the config variables. When a value +is not specified in the config file, the default value defined in the spec +file is used. 
[Example](https://github.com/anelendata/tap-rest-api/tree/master/examples/usgs/config/tap_config.json): @@ -204,6 +212,35 @@ Or add those at the commands line: tap-rest-api config/custom_spec.json --config config/tap_config.json --schema_dir ./config/schema --catalog ./config/catalog/some_catalog.json --start_datetime="2020-08-06" --username my_username --password my_password --auth_method basic ``` +## Custom http-headers + +In addition to the authentication method, you can specify custom HTTP headers +in the config file: + +Example: + +``` +... +"http_headers": + { + "User-Agent": "Mozilla/5.0 (Macintosh; scitylana.singer.io) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36", + "Content-type": "application/json", + "Authorization": "Bearer " + }, +... +``` + +Here is the default value: +``` +{ + "User-Agent": "Mozilla/5.0 (Macintosh; scitylana.singer.io) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36", + "Content-type": "application/json" +} +``` + +When you define the http_headers config value, it replaces the default value entirely. +So you should redefine "User-Agent" and "Content-type" if you still need them. + ## State This tap emits [state](https://github.com/singer-io/getting-started/blob/master/docs/CONFIG_AND_STATE.md#state-file). 
def get_http_headers(config=None):
    """
    Build the HTTP request headers for the tap.

    config: The tap config dict (optional). When it carries a truthy
        "http_headers" value, that value is used instead of the defaults;
        it may be a dict or a JSON-format string of key-value pairs.

    Returns the configured headers, or the default User-Agent and
    Content-type headers when config is missing/empty or does not define
    "http_headers". The defaults are not merged into configured headers.
    """
    if not config or not config.get("http_headers"):
        return {"User-Agent": USER_AGENT,
                "Content-type": "application/json"}

    headers = config["http_headers"]
    # Accept both a ready-made mapping and JSON text describing one.
    if isinstance(headers, str):
        headers = json.loads(headers)

    # Log the header names only: values such as Authorization may carry
    # credentials and must not be written to the log.
    LOGGER.debug("HTTP header names: %s", list(headers))
    return headers
in obj): schema["type"] = ["null", "number"] - else: + # Let's assume it's a code such as zipcode if there is a leading 0 + elif type(obj) == int or (type(obj) == str and obj[0] != "0"): schema["type"] = ["null", "integer"] + else: + schema["type"] = ["null", "string"] return schema @@ -185,24 +192,35 @@ def filter_object(obj, schema, dict_path=[], on_invalid_property="raise"): try: filtered = _parse_datetime_tz(obj, default_tz_offset=0).isoformat() except Exception as e: - filtered = _on_invalid_property(on_invalid_property, dict_path, obj_type, obj, err_msg=str(e)) + filtered = _on_invalid_property(on_invalid_property, + dict_path, obj_type, obj, + err_msg=str(e)) elif obj_type == "number": try: filtered = float(obj) except ValueError as e: - filtered = _on_invalid_property(on_invalid_property, dict_path, obj_type, obj, err_msg=str(e)) + filtered = _on_invalid_property( + on_invalid_property, dict_path, obj_type, obj, + err_msg=str(e)) elif obj_type == "integer": try: filtered = int(obj) except ValueError as e: - filtered = _on_invalid_property(on_invalid_property, dict_path, obj_type, obj, err_msg=str(e)) + filtered = _on_invalid_property( + on_invalid_property, dict_path, obj_type, obj, + err_msg=str(e)) elif obj_type == "boolean": if str(obj).lower() == "true": filtered = True elif str(obj).lower() == "false": filtered = False else: - filtered = _on_invalid_property(on_invalid_property, dict_path, obj_type, obj, err_msg=str(e)) + filtered = _on_invalid_property( + on_invalid_property, dict_path, obj_type, obj, + err_msg=(str(obj) + + " is not a valid value for boolean type")) + else: + raise Exception("Invalid type in schema: %s" % obj_type) return filtered diff --git a/tap_rest_api/schema.py b/tap_rest_api/schema.py index 5b052a8..95ad5df 100644 --- a/tap_rest_api/schema.py +++ b/tap_rest_api/schema.py @@ -4,7 +4,7 @@ from singer import utils from .helper import (generate_request, get_endpoint, get_init_endpoint_params, - get_record, get_record_list, + 
get_record, get_record_list, get_http_headers, EXTRACT_TIMESTAMP, BATCH_TIMESTAMP) from . import json2schema @@ -71,8 +71,12 @@ def infer_schema(config, streams, out_catalog=True, add_tstamp=True): endpoint = get_endpoint(config["url"], tap_stream_id, params) LOGGER.info("GET %s", endpoint) auth_method = config.get("auth_method", "basic") + + headers = get_http_headers(config) data = generate_request(tap_stream_id, endpoint, auth_method, - config.get("username"), config.get("password")) + headers, + config.get("username"), + config.get("password")) # In case the record is not at the root level data = get_record_list(data, config.get("record_list_level")) diff --git a/tap_rest_api/sync.py b/tap_rest_api/sync.py index 64683a4..76c94fc 100644 --- a/tap_rest_api/sync.py +++ b/tap_rest_api/sync.py @@ -8,6 +8,7 @@ get_init_endpoint_params, get_last_update, get_record, get_record_list, get_selected_streams, get_start, get_streams_to_sync, human_readable, + get_http_headers, EXTRACT_TIMESTAMP) from .schema import filter_record, load_schema @@ -31,6 +32,8 @@ def sync_rows(config, state, tap_stream_id, key_properties=[], auth_method=None, start = get_start(config, state, tap_stream_id, "last_update") end = get_end(config) + headers = get_http_headers(config) + if start is None: LOGGER.warning("None of timestamp_key, datetime_key, and index_key" + " are set in conifg. Bookmarking is not available.") @@ -90,6 +93,7 @@ def sync_rows(config, state, tap_stream_id, key_properties=[], auth_method=None, LOGGER.info("GET %s", endpoint) rows = generate_request(tap_stream_id, endpoint, auth_method, + headers, config.get("username"), config.get("password")) rows = get_record_list(rows, config.get("record_list_level")) diff --git a/tests/install_test.sh b/tests/install_test.sh index 70ac70e..3b83916 100755 --- a/tests/install_test.sh +++ b/tests/install_test.sh @@ -1,6 +1,6 @@ #!/bin/bash -APP=tap_rest_api +APP=tap-rest-api PYTHON=/opt/python/3.6/bin/python if [ ! 
from tap_rest_api.helper import get_http_headers, USER_AGENT


DEFAULT_HEADERS = {"User-Agent": USER_AGENT,
                   "Content-type": "application/json"}


def test_default():
    """No config at all yields the built-in default headers."""
    h = get_http_headers()
    assert h == DEFAULT_HEADERS


def test_default_when_http_headers_unset():
    """A config without a usable http_headers value falls back to defaults."""
    assert get_http_headers({}) == DEFAULT_HEADERS
    assert get_http_headers({"http_headers": None}) == DEFAULT_HEADERS


def test_agent_overwrite():
    """Headers given as a dict are returned as configured."""
    ua = ("Mozilla/5.1 (Macintosh; scitylana.singer.io) "
          "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 "
          "Safari/537.36 ")
    # "Content-type" was misspelled in the fixture; spell the header
    # correctly so the sample config is a realistic one.
    config = {"http_headers": {"User-Agent": ua,
                               "Content-type": "application/json",
                               "Bearer": "xxxxyyyy"}}

    h = get_http_headers(config)

    assert h == config["http_headers"]


def test_json_string_headers():
    """Headers given as a JSON-format string are parsed into a dict."""
    config = {"http_headers":
              '{"User-Agent": "custom-agent", "Accept": "text/plain"}'}

    h = get_http_headers(config)

    assert h == {"User-Agent": "custom-agent", "Accept": "text/plain"}