Skip to content

Commit

Permalink
source salesforce: use utf8 by default and iso as fallback (#12576)
Browse files Browse the repository at this point in the history
* use utf8 by default and iso as fallback

* test both

* add comment

* Bump version
  • Loading branch information
girarda committed May 4, 2022
1 parent 0bc3298 commit 22b67d8
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,5 @@ RUN pip install .

ENTRYPOINT ["python", "/airbyte/integration_code/main.py"]

LABEL io.airbyte.version=1.0.7
LABEL io.airbyte.version=1.0.8
LABEL io.airbyte.name=airbyte/source-salesforce
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,13 @@
CSV_FIELD_SIZE_LIMIT = int(ctypes.c_ulong(-1).value // 2)
csv.field_size_limit(CSV_FIELD_SIZE_LIMIT)

DEFAULT_ENCODING = "utf-8"


class SalesforceStream(HttpStream, ABC):
page_size = 2000
transformer = TypeTransformer(TransformConfig.DefaultSchemaNormalization)
encoding = "ISO-8859-1"
encoding = DEFAULT_ENCODING

def __init__(
self, sf_api: Salesforce, pk: str, stream_name: str, sobject_options: Mapping[str, Any] = None, schema: dict = None, **kwargs
Expand All @@ -46,6 +48,23 @@ def __init__(
self.schema: Mapping[str, Any] = schema # type: ignore[assignment]
self.sobject_options = sobject_options

def decode(self, chunk):
    """
    Decode a raw bytes chunk into text.

    Most Salesforce instances serve UTF-8, but some serve ISO-8859-1, so UTF-8 is tried first.
    On the first UnicodeDecodeError the stream switches self.encoding to ISO-8859-1, logs the
    switch, and uses ISO-8859-1 for this chunk and for every later call on this instance.
    See implementation considerations for more details https://developer.salesforce.com/docs/atlas.en-us.api.meta/api/implementation_considerations.htm

    :param chunk: bytes to decode
    :return: the decoded string
    """
    # NOTE(review): download_data decodes each iter_content chunk independently; a multi-byte
    # UTF-8 character split across two chunks would presumably trigger the fallback as well —
    # confirm whether that can happen in practice.

    # The encoding is no longer the default: decode directly, no fallback handling.
    if self.encoding != DEFAULT_ENCODING:
        return chunk.decode(self.encoding)

    try:
        return chunk.decode(self.encoding)
    except UnicodeDecodeError as e:
        # Remember the fallback on the instance so later chunks skip the failed UTF-8 attempt.
        self.encoding = "ISO-8859-1"
        self.logger.info(f"Could not decode chunk. Falling back to {self.encoding} encoding. Error: {e}")
        return chunk.decode(self.encoding)

@property
def name(self) -> str:
return self.stream_name
Expand Down Expand Up @@ -275,7 +294,7 @@ def download_data(self, url: str, chunk_size: float = 1024) -> os.PathLike:
with closing(self._send_http_request("GET", f"{url}/results", stream=True)) as response:
with open(tmp_file, "w") as data_file:
for chunk in response.iter_content(chunk_size=chunk_size):
data_file.writelines(self.filter_null_bytes(chunk.decode(self.encoding)))
data_file.writelines(self.filter_null_bytes(self.decode(chunk)))
# check the file exists
if os.path.isfile(tmp_file):
return tmp_file
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -527,5 +527,8 @@ def test_convert_to_standard_instance(stream_config, stream_api):
assert isinstance(rest_stream, IncrementalSalesforceStream)


def test_decoding():
assert b"0\xe5".decode(SalesforceStream.encoding) == "0å"
def test_decoding(stream_config, stream_api):
    """Check that a stream decodes UTF-8 bytes and still decodes bytes that are only valid as ISO-8859-1."""
    stream = generate_stream("AcceptedEventRelation", stream_config, stream_api)

    # The UTF-8 payload goes first: the second payload is not valid UTF-8, and decoding it
    # switches the stream's encoding to ISO-8859-1 for all subsequent calls.
    utf8_payload = b"\xe9\x97\xb4\xe5\x8d\x95\xe7\x9a\x84\xe8\xaf\xb4 \xf0\x9f\xaa\x90"
    assert stream.decode(utf8_payload) == "间单的说 🪐"

    latin1_payload = b"0\xe5"
    assert stream.decode(latin1_payload) == "0å"
3 changes: 2 additions & 1 deletion docs/integrations/sources/salesforce.md
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,8 @@ Now that you have set up the Salesforce source connector, check out the followin

| Version | Date | Pull Request | Subject |
|:--------|:-----------|:-------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------|
| 1.0.7 | 2022-04-27 | [12552](https://github.com/airbytehq/airbyte/pull/12552) | Decode responses as ISO-8859-1 instead of utf-8 |
| 1.0.8 | 2022-05-04 | [12576](https://github.com/airbytehq/airbyte/pull/12576) | Decode responses as utf-8 and fall back to ISO-8859-1 if needed |
| 1.0.7 | 2022-05-03 | [12552](https://github.com/airbytehq/airbyte/pull/12552) | Decode responses as ISO-8859-1 instead of utf-8 |
| 1.0.4 | 2022-04-27 | [12335](https://github.com/airbytehq/airbyte/pull/12335) | Adding fixtures to mock time.sleep for connectors that explicitly sleep |
| 1.0.3 | 2022-04-04 | [11692](https://github.com/airbytehq/airbyte/pull/11692) | Optimised memory usage for `BULK` API calls |
| 1.0.2 | 2022-03-01 | [10751](https://github.com/airbytehq/airbyte/pull/10751) | Fix broken link anchor in connector configuration |
Expand Down

0 comments on commit 22b67d8

Please sign in to comment.