airbytehq · artem1205 · May 1, 2023 · Apr 26, 2023 · Apr 26, 2023 · Apr 26, 2023
diff --git a/airbyte-config-oss/init-oss/src/main/resources/seed/oss_catalog.json b/airbyte-config-oss/init-oss/src/main/resources/seed/oss_catalog.json
@@ -12871,7 +12871,7 @@
     "sourceDefinitionId": "778daa7c-feaf-4db6-96f3-70fd645acc77",
     "name": "File (CSV, JSON, Excel, Feather, Parquet)",
     "dockerRepository": "airbyte/source-file",
-    "dockerImageTag": "0.3.0",
+    "dockerImageTag": "0.3.1",
     "documentationUrl": "https://docs.airbyte.com/integrations/sources/file",
     "icon": "file.svg",
     "sourceType": "file",

diff --git a/airbyte-config-oss/init-oss/src/main/resources/seed/source_definitions.yaml b/airbyte-config-oss/init-oss/src/main/resources/seed/source_definitions.yaml
@@ -637,7 +637,7 @@
 - name: File (CSV, JSON, Excel, Feather, Parquet)
   sourceDefinitionId: 778daa7c-feaf-4db6-96f3-70fd645acc77
   dockerRepository: airbyte/source-file
-  dockerImageTag: 0.3.0
+  dockerImageTag: 0.3.1
   documentationUrl: https://docs.airbyte.com/integrations/sources/file
   icon: file.svg
   sourceType: file

diff --git a/airbyte-config-oss/init-oss/src/main/resources/seed/source_specs.yaml b/airbyte-config-oss/init-oss/src/main/resources/seed/source_specs.yaml
@@ -4566,7 +4566,7 @@
     supportsNormalization: false
     supportsDBT: false
     supported_destination_sync_modes: []
-- dockerImage: "airbyte/source-file:0.3.0"
+- dockerImage: "airbyte/source-file:0.3.1"
   spec:
     documentationUrl: "https://docs.airbyte.com/integrations/sources/file"
     connectionSpecification:

diff --git a/airbyte-integrations/connectors/source-file-secure/Dockerfile b/airbyte-integrations/connectors/source-file-secure/Dockerfile
@@ -9,5 +9,5 @@ RUN pip install .
 ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py"
 ENTRYPOINT ["python", "/airbyte/integration_code/main.py"]
 
-LABEL io.airbyte.version=0.3.0
+LABEL io.airbyte.version=0.3.1
 LABEL io.airbyte.name=airbyte/source-file-secure
diff --git a/airbyte-integrations/connectors/source-file/Dockerfile b/airbyte-integrations/connectors/source-file/Dockerfile
@@ -17,5 +17,5 @@ COPY source_file ./source_file
 ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py"
 ENTRYPOINT ["python", "/airbyte/integration_code/main.py"]
 
-LABEL io.airbyte.version=0.3.0
+LABEL io.airbyte.version=0.3.1
 LABEL io.airbyte.name=airbyte/source-file
diff --git a/airbyte-integrations/connectors/source-file/source_file/client.py b/airbyte-integrations/connectors/source-file/source_file/client.py
@@ -11,6 +11,7 @@
 from os import environ
 from typing import Iterable
 from urllib.parse import urlparse
+from zipfile import BadZipFile
 
 import backoff
 import boto3
@@ -20,11 +21,14 @@
 import pandas as pd
 import smart_open
 from airbyte_cdk.entrypoint import logger
-from airbyte_cdk.models import AirbyteStream, SyncMode
+from airbyte_cdk.models import AirbyteStream, FailureType, SyncMode
+from airbyte_cdk.utils import AirbyteTracedException
 from azure.storage.blob import BlobServiceClient
 from genson import SchemaBuilder
 from google.cloud.storage import Client as GCSClient
 from google.oauth2 import service_account
+from openpyxl import load_workbook
+from openpyxl.utils.exceptions import InvalidFileException
 from yaml import safe_load
 
 from .utils import backoff_handler
@@ -336,6 +340,12 @@ def load_dataframes(self, fp, skip_data=False, read_sample_chunk: bool = False)
             elif self._reader_options == "excel_binary":
                 reader_options["engine"] = "pyxlsb"
                 yield from reader(fp, **reader_options)
+            elif self._reader_format == "excel":
+                # Use openpyxl to read new-style Excel (xlsx) file; return to pandas for others
+                try:
+                    yield from self.openpyxl_chunk_reader(fp, **reader_options)
+                except (InvalidFileException, BadZipFile):
+                    yield reader(fp, **reader_options)
             else:
                 yield reader(fp, **reader_options)
         except UnicodeDecodeError as err:
@@ -425,7 +435,7 @@ def _stream_properties(self, fp, empty_schema: bool = False, read_sample_chunk:
                 fields[col] = self.dtype_to_json_type(prev_frame_column_type, df_type)
         return {
             field: (
-                {"type": ["string", "null"], "format": "datetime"} if fields[field] == "datetime" else {"type": [fields[field], "null"]}
+                {"type": ["string", "null"], "format": "date-time"} if fields[field] == "datetime" else {"type": [fields[field], "null"]}
             )
             for field in fields
         }
@@ -443,3 +453,26 @@ def streams(self, empty_schema: bool = False) -> Iterable:
                     "properties": self._stream_properties(fp, empty_schema=empty_schema, read_sample_chunk=True),
                 }
         yield AirbyteStream(name=self.stream_name, json_schema=json_schema, supported_sync_modes=[SyncMode.full_refresh])
+
+    def openpyxl_chunk_reader(self, file, **kwargs):
+        """Use openpyxl lazy loading feature to read excel files (xlsx only) in chunks of 500 lines at a time"""
+        work_book = load_workbook(filename=file, read_only=True)
+        user_provided_column_names = kwargs.get("names")
+        for sheetname in work_book.sheetnames:
+            work_sheet = work_book[sheetname]
+            data = work_sheet.values
+            end = work_sheet.max_row
+            if end == 1 and not user_provided_column_names:
+                message = "Please provide column names for table in reader options field"
+                logger.error(message)
+                raise AirbyteTracedException(
+                    message="Config validation error: " + message,
+                    internal_message=message,
+                    failure_type=FailureType.config_error,
+                )
+            cols, start = (next(data), 1) if not user_provided_column_names else (user_provided_column_names, 0)
+            step = 500
+            while start <= end:
+                df = pd.DataFrame(data=(next(data) for _ in range(start, min(start + step, end))), columns=cols)
+                yield df
+                start += step
diff --git a/airbyte-integrations/connectors/source-file/unit_tests/test_source.py b/airbyte-integrations/connectors/source-file/unit_tests/test_source.py
@@ -53,7 +53,7 @@ def test_csv_with_utf16_encoding(absolute_path, test_files):
             "header2": {"type": ["number", "null"]},
             "header3": {"type": ["number", "null"]},
             "header4": {"type": ["boolean", "null"]},
-            "header5": {"type": ["string", "null"], "format": "datetime"},
+            "header5": {"type": ["string", "null"], "format": "date-time"},
         },
         "type": "object",
     }

diff --git a/docs/integrations/sources/file.md b/docs/integrations/sources/file.md
@@ -191,6 +191,7 @@ In order to read large files from a remote location, this connector uses the [sm
 
 | Version | Date       | Pull Request                                             | Subject                                                                                                 |
 |:--------|:-----------|:---------------------------------------------------------|:--------------------------------------------------------------------------------------------------------|
+| 0.3.1   | 2023-04-27 | [25575](https://github.com/airbytehq/airbyte/pull/25575) | Fix OOM; read Excel files in chunks using `openpyxl`                                                    |
 | 0.3.0   | 2023-04-24 | [25445](https://github.com/airbytehq/airbyte/pull/25445) | Add datatime format parsing support for csv files                                                       |
 | 0.2.38  | 2023-04-12 | [23759](https://github.com/airbytehq/airbyte/pull/23759) | Fix column data types for numerical values                                                              |
 | 0.2.37  | 2023-04-06 | [24525](https://github.com/airbytehq/airbyte/pull/24525) | Fix examples in spec                                                                                    |