Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We鈥檒l occasionally send you account related emails.

Already on GitHub? Sign in to your account

Source File: Fix OOM; read Excel files in chunks #25575

Merged
merged 14 commits into from
May 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -12871,7 +12871,7 @@
"sourceDefinitionId": "778daa7c-feaf-4db6-96f3-70fd645acc77",
"name": "File (CSV, JSON, Excel, Feather, Parquet)",
"dockerRepository": "airbyte/source-file",
"dockerImageTag": "0.3.0",
"dockerImageTag": "0.3.1",
"documentationUrl": "https://docs.airbyte.com/integrations/sources/file",
"icon": "file.svg",
"sourceType": "file",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -637,7 +637,7 @@
- name: File (CSV, JSON, Excel, Feather, Parquet)
sourceDefinitionId: 778daa7c-feaf-4db6-96f3-70fd645acc77
dockerRepository: airbyte/source-file
dockerImageTag: 0.3.0
dockerImageTag: 0.3.1
documentationUrl: https://docs.airbyte.com/integrations/sources/file
icon: file.svg
sourceType: file
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4566,7 +4566,7 @@
supportsNormalization: false
supportsDBT: false
supported_destination_sync_modes: []
- dockerImage: "airbyte/source-file:0.3.0"
- dockerImage: "airbyte/source-file:0.3.1"
spec:
documentationUrl: "https://docs.airbyte.com/integrations/sources/file"
connectionSpecification:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,5 @@ RUN pip install .
ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py"
ENTRYPOINT ["python", "/airbyte/integration_code/main.py"]

LABEL io.airbyte.version=0.3.0
LABEL io.airbyte.version=0.3.1
LABEL io.airbyte.name=airbyte/source-file-secure
2 changes: 1 addition & 1 deletion airbyte-integrations/connectors/source-file/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,5 @@ COPY source_file ./source_file
ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py"
ENTRYPOINT ["python", "/airbyte/integration_code/main.py"]

LABEL io.airbyte.version=0.3.0
LABEL io.airbyte.version=0.3.1
LABEL io.airbyte.name=airbyte/source-file
37 changes: 35 additions & 2 deletions airbyte-integrations/connectors/source-file/source_file/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from os import environ
from typing import Iterable
from urllib.parse import urlparse
from zipfile import BadZipFile

import backoff
import boto3
Expand All @@ -20,11 +21,14 @@
import pandas as pd
import smart_open
from airbyte_cdk.entrypoint import logger
from airbyte_cdk.models import AirbyteStream, SyncMode
from airbyte_cdk.models import AirbyteStream, FailureType, SyncMode
from airbyte_cdk.utils import AirbyteTracedException
from azure.storage.blob import BlobServiceClient
from genson import SchemaBuilder
from google.cloud.storage import Client as GCSClient
from google.oauth2 import service_account
from openpyxl import load_workbook
from openpyxl.utils.exceptions import InvalidFileException
from yaml import safe_load

from .utils import backoff_handler
Expand Down Expand Up @@ -336,6 +340,12 @@ def load_dataframes(self, fp, skip_data=False, read_sample_chunk: bool = False)
elif self._reader_options == "excel_binary":
reader_options["engine"] = "pyxlsb"
yield from reader(fp, **reader_options)
elif self._reader_format == "excel":
# Use openpyxl to read new-style Excel (xlsx) file; return to pandas for others
try:
yield from self.openpyxl_chunk_reader(fp, **reader_options)
except (InvalidFileException, BadZipFile):
yield reader(fp, **reader_options)
else:
yield reader(fp, **reader_options)
except UnicodeDecodeError as err:
Expand Down Expand Up @@ -425,7 +435,7 @@ def _stream_properties(self, fp, empty_schema: bool = False, read_sample_chunk:
fields[col] = self.dtype_to_json_type(prev_frame_column_type, df_type)
return {
field: (
{"type": ["string", "null"], "format": "datetime"} if fields[field] == "datetime" else {"type": [fields[field], "null"]}
{"type": ["string", "null"], "format": "date-time"} if fields[field] == "datetime" else {"type": [fields[field], "null"]}
)
for field in fields
}
Expand All @@ -443,3 +453,26 @@ def streams(self, empty_schema: bool = False) -> Iterable:
"properties": self._stream_properties(fp, empty_schema=empty_schema, read_sample_chunk=True),
}
yield AirbyteStream(name=self.stream_name, json_schema=json_schema, supported_sync_modes=[SyncMode.full_refresh])

def openpyxl_chunk_reader(self, file, **kwargs):
"""Use openpyxl lazy loading feature to read excel files (xlsx only) in chunks of 500 lines at a time"""
work_book = load_workbook(filename=file, read_only=True)
user_provided_column_names = kwargs.get("names")
for sheetname in work_book.sheetnames:
work_sheet = work_book[sheetname]
data = work_sheet.values
end = work_sheet.max_row
if end == 1 and not user_provided_column_names:
message = "Please provide column names for table in reader options field"
logger.error(message)
raise AirbyteTracedException(
message="Config validation error: " + message,
internal_message=message,
failure_type=FailureType.config_error,
)
cols, start = (next(data), 1) if not user_provided_column_names else (user_provided_column_names, 0)
step = 500
while start <= end:
df = pd.DataFrame(data=(next(data) for _ in range(start, min(start + step, end))), columns=cols)
yield df
start += step
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def test_csv_with_utf16_encoding(absolute_path, test_files):
"header2": {"type": ["number", "null"]},
"header3": {"type": ["number", "null"]},
"header4": {"type": ["boolean", "null"]},
"header5": {"type": ["string", "null"], "format": "datetime"},
"header5": {"type": ["string", "null"], "format": "date-time"},
},
"type": "object",
}
Expand Down
1 change: 1 addition & 0 deletions docs/integrations/sources/file.md
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,7 @@ In order to read large files from a remote location, this connector uses the [sm

| Version | Date | Pull Request | Subject |
|:--------|:-----------|:---------------------------------------------------------|:--------------------------------------------------------------------------------------------------------|
| 0.3.1 | 2023-04-27 | [25575](https://github.com/airbytehq/airbyte/pull/25575) | Fix OOM; read Excel files in chunks using `openpyxl` |
| 0.3.0 | 2023-04-24 | [25445](https://github.com/airbytehq/airbyte/pull/25445) | Add datatime format parsing support for csv files |
| 0.2.38 | 2023-04-12 | [23759](https://github.com/airbytehq/airbyte/pull/23759) | Fix column data types for numerical values |
| 0.2.37 | 2023-04-06 | [24525](https://github.com/airbytehq/airbyte/pull/24525) | Fix examples in spec |
Expand Down
Loading