Skip to content

Commit

Permalink
Source File: add retry on SSHException('Error reading SSH protocol banner') (#26115)
Browse files Browse the repository at this point in the history

Signed-off-by: Serhii Chvaliuk <grubberr@gmail.com>
  • Loading branch information
grubberr committed May 16, 2023
1 parent 36b3358 commit 63887f7
Show file tree
Hide file tree
Showing 8 changed files with 50 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -12828,7 +12828,7 @@
"sourceDefinitionId": "778daa7c-feaf-4db6-96f3-70fd645acc77",
"name": "File (CSV, JSON, Excel, Feather, Parquet)",
"dockerRepository": "airbyte/source-file",
"dockerImageTag": "0.3.5",
"dockerImageTag": "0.3.6",
"documentationUrl": "https://docs.airbyte.com/integrations/sources/file",
"icon": "file.svg",
"sourceType": "file",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -637,7 +637,7 @@
- name: File (CSV, JSON, Excel, Feather, Parquet)
sourceDefinitionId: 778daa7c-feaf-4db6-96f3-70fd645acc77
dockerRepository: airbyte/source-file
dockerImageTag: 0.3.5
dockerImageTag: 0.3.6
documentationUrl: https://docs.airbyte.com/integrations/sources/file
icon: file.svg
sourceType: file
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4567,7 +4567,7 @@
supportsNormalization: false
supportsDBT: false
supported_destination_sync_modes: []
- dockerImage: "airbyte/source-file:0.3.5"
- dockerImage: "airbyte/source-file:0.3.6"
spec:
documentationUrl: "https://docs.airbyte.com/integrations/sources/file"
connectionSpecification:
Expand Down
2 changes: 1 addition & 1 deletion airbyte-integrations/connectors/source-file/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,5 @@ COPY source_file ./source_file
ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py"
ENTRYPOINT ["python", "/airbyte/integration_code/main.py"]

LABEL io.airbyte.version=0.3.5
LABEL io.airbyte.version=0.3.6
LABEL io.airbyte.name=airbyte/source-file
2 changes: 1 addition & 1 deletion airbyte-integrations/connectors/source-file/metadata.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ data:
connectorSubtype: file
connectorType: source
definitionId: 778daa7c-feaf-4db6-96f3-70fd645acc77
dockerImageTag: 0.3.5
dockerImageTag: 0.3.6
dockerRepository: airbyte/source-file
githubIssueLabel: source-file
icon: file.svg
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import numpy as np
import pandas as pd
import smart_open
import smart_open.ssh
from airbyte_cdk.entrypoint import logger
from airbyte_cdk.models import AirbyteStream, FailureType, SyncMode
from airbyte_cdk.utils import AirbyteTracedException
Expand All @@ -29,6 +30,7 @@
from google.oauth2 import service_account
from openpyxl import load_workbook
from openpyxl.utils.exceptions import InvalidFileException
from paramiko import SSHException
from urllib3.exceptions import ProtocolError
from yaml import safe_load

Expand Down Expand Up @@ -92,10 +94,21 @@ def close(self):
self._file.close()
self._file = None

def backoff_giveup(self, error):
    """Decide whether the retry loop should give up after *error*.

    Passed as the ``giveup`` predicate to ``backoff.on_exception``: a return
    value of ``False`` lets the retry loop continue, ``True`` stops retrying.

    Only paramiko's "Error reading SSH protocol banner" failure is treated as
    retryable, see https://github.com/airbytehq/oncall/issues/1954

    :param error: the exception raised by the wrapped call.
    :return: ``False`` for a retryable SSH banner error, ``True`` otherwise.
    """
    # paramiko builds this message as the fixed text below followed by str() of
    # the underlying error, so the text may carry a suffix (e.g. an errno
    # description). Match on the prefix instead of requiring exact equality.
    if isinstance(error, SSHException) and str(error).startswith("Error reading SSH protocol banner"):
        # We need to clear smart_open internal _SSH cache from the previous attempt, otherwise:
        # SSHException('SSH session not active')
        # will be raised
        smart_open.ssh._SSH.clear()
        return False
    return True

# Close any previously opened handle, (re)open the file and return self.
def open(self):
self.close()
# Wrap self._open with exponential backoff: at most 5 tries on any Exception,
# with self.backoff_giveup deciding which errors stop the retries.
_open = backoff.on_exception(backoff.expo, Exception, max_tries=5, giveup=self.backoff_giveup)(self._open)
try:
# NOTE(review): this listing looks like a flattened diff - the first assignment
# below is presumably the removed (pre-change) line and the second its
# replacement that goes through the retrying wrapper; confirm against the real file.
self._file = self._open()
self._file = _open()
except google.api_core.exceptions.NotFound as err:
# Re-raise google.api_core's NotFound as the built-in FileNotFoundError for
# self.url, keeping the original exception as the cause.
raise FileNotFoundError(self.url) from err
return self
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,11 @@
#


from unittest.mock import patch
from unittest.mock import patch, sentinel

import pytest
from pandas import read_csv, read_excel
from paramiko import SSHException
from source_file.client import Client, ConfigurationError, URLFile
from urllib3.exceptions import ProtocolError

Expand Down Expand Up @@ -159,3 +160,31 @@ def test_read_network_issues(test_read_config):
client.sleep_on_retry_sec = 0 # just for test
with patch.object(client, "_cache_stream", side_effect=ProtocolError), pytest.raises(ConfigurationError):
next(client.read(["date", "key"]))


def test_urlfile_open_backoff_sftp(monkeypatch, mocker):
    """URLFile.open() retries the SSH banner error, giving up after 5 tries per call."""
    expected_file = sentinel.result
    attempts = 0

    def patched_open(self):
        # Fail the first six attempts with the retryable banner error, then succeed.
        nonlocal attempts
        attempts += 1
        if attempts >= 7:
            return expected_file
        raise SSHException("Error reading SSH protocol banner")

    # Patch out sleeping so the backoff delays do not slow the test down.
    sleep_mock = mocker.patch("time.sleep")
    monkeypatch.setattr(URLFile, "_open", patched_open)

    provider = {
        "storage": "SFTP",
        "user": "user",
        "password": "password",
        "host": "sftp.domain.com",
        "port": 22,
    }
    reader = URLFile(url="/DISTDA.CSV", provider=provider, binary=False)

    # First call: all 5 tries fail, so the last error propagates.
    with pytest.raises(SSHException):
        reader.open()
    assert reader._file is None
    assert attempts == 5

    # Second call: attempt 6 fails, attempt 7 succeeds.
    reader.open()
    assert reader._file is expected_file
    assert attempts == 7

    # 4 sleeps between the 5 failed tries, plus 1 before the successful retry.
    assert sleep_mock.call_count == 5
1 change: 1 addition & 0 deletions docs/integrations/sources/file.md
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,7 @@ In order to read large files from a remote location, this connector uses the [sm

| Version | Date | Pull Request | Subject |
|:--------|:-----------|:---------------------------------------------------------|:--------------------------------------------------------------------------------------------------------|
| 0.3.6 | 2023-05-16 | [26115](https://github.com/airbytehq/airbyte/pull/26115) | Add retry on SSHException('Error reading SSH protocol banner') |
| 0.3.5 | 2023-05-16 | [26117](https://github.com/airbytehq/airbyte/pull/26117) | Check if reader options is a valid JSON object |
| 0.3.4 | 2023-05-10 | [25965](https://github.com/airbytehq/airbyte/pull/25965) | fix Pandas date-time parsing to airbyte type |
| 0.3.3 | 2023-05-04 | [25819](https://github.com/airbytehq/airbyte/pull/25819) | GCP service_account_json is a secret |
Expand Down

0 comments on commit 63887f7

Please sign in to comment.