airbytehq · artem1205 · May 1, 2023 · Apr 26, 2023 · Apr 26, 2023 · Apr 26, 2023
diff --git a/airbyte-integrations/connectors/source-file-secure/Dockerfile b/airbyte-integrations/connectors/source-file-secure/Dockerfile
@@ -9,5 +9,5 @@ RUN pip install .
 ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py"
 ENTRYPOINT ["python", "/airbyte/integration_code/main.py"]
 
-LABEL io.airbyte.version=0.3.0
+LABEL io.airbyte.version=0.3.1
 LABEL io.airbyte.name=airbyte/source-file-secure
diff --git a/airbyte-integrations/connectors/source-file/Dockerfile b/airbyte-integrations/connectors/source-file/Dockerfile
@@ -17,5 +17,5 @@ COPY source_file ./source_file
 ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py"
 ENTRYPOINT ["python", "/airbyte/integration_code/main.py"]
 
-LABEL io.airbyte.version=0.3.0
+LABEL io.airbyte.version=0.3.1
 LABEL io.airbyte.name=airbyte/source-file
diff --git a/airbyte-integrations/connectors/source-file/source_file/client.py b/airbyte-integrations/connectors/source-file/source_file/client.py
@@ -25,6 +25,7 @@
 from genson import SchemaBuilder
 from google.cloud.storage import Client as GCSClient
 from google.oauth2 import service_account
+from openpyxl import load_workbook
 from yaml import safe_load
 
 from .utils import backoff_handler
@@ -336,6 +337,8 @@ def load_dataframes(self, fp, skip_data=False, read_sample_chunk: bool = False)
             elif self._reader_options == "excel_binary":
                 reader_options["engine"] = "pyxlsb"
                 yield from reader(fp, **reader_options)
+            elif self._reader_format == "excel":
+                yield from self.openpyxl_chunk_reader(fp)
             else:
                 yield reader(fp, **reader_options)
         except UnicodeDecodeError as err:
@@ -443,3 +446,18 @@ def streams(self, empty_schema: bool = False) -> Iterable:
                     "properties": self._stream_properties(fp, empty_schema=empty_schema, read_sample_chunk=True),
                 }
         yield AirbyteStream(name=self.stream_name, json_schema=json_schema, supported_sync_modes=[SyncMode.full_refresh])
+
+    def openpyxl_chunk_reader(self, file):
+        """Lazily read an Excel workbook and yield its rows as DataFrames of at most 500 rows.
+
+        The workbook is opened with openpyxl in read-only mode and every sheet is
+        read in turn. The first row of each sheet is used as the column names for
+        all DataFrames produced from that sheet.
+
+        :param file: forwarded unchanged to ``load_workbook(filename=...)``
+        :return: generator of ``pd.DataFrame`` chunks. A sheet containing only a header row
+                 yields one empty DataFrame carrying its columns; a sheet with no rows at all
+                 yields nothing.
+        """
+        # Function-scope import so this method does not rely on a module-level import being added.
+        from itertools import islice
+
+        step = 500
+        work_book = load_workbook(filename=file, read_only=True)
+        try:
+            for sheetname in work_book.sheetnames:
+                rows = work_book[sheetname].values
+                cols = next(rows, None)
+                if cols is None:
+                    # Sheet without any rows: a bare next() would raise StopIteration, which
+                    # turns into RuntimeError inside a generator (PEP 479).
+                    continue
+                # Chunk by consuming the row iterator itself instead of trusting
+                # work_sheet.max_row, which can be missing or wrong in read-only mode.
+                chunk = list(islice(rows, step))
+                # The first chunk is always emitted, even when empty, so that a header-only
+                # sheet still exposes its column names.
+                yield pd.DataFrame(data=chunk, columns=cols)
+                # A short chunk means the sheet is exhausted; a full one means more rows may follow.
+                while len(chunk) == step:
+                    chunk = list(islice(rows, step))
+                    if chunk:
+                        yield pd.DataFrame(data=chunk, columns=cols)
+        finally:
+            # Read-only workbooks are lazily loaded and must be closed explicitly.
+            work_book.close()
diff --git a/docs/integrations/sources/file.md b/docs/integrations/sources/file.md
@@ -191,6 +191,7 @@ In order to read large files from a remote location, this connector uses the [sm
 
 | Version | Date       | Pull Request                                             | Subject                                                                                                 |
 |:--------|:-----------|:---------------------------------------------------------|:--------------------------------------------------------------------------------------------------------|
+| 0.3.1   | 2023-04-27 | [25575](https://github.com/airbytehq/airbyte/pull/25575) | Fix OOM; read Excel files in chunks using `openpyxl`                                                    |
 | 0.3.0   | 2023-04-24 | [25445](https://github.com/airbytehq/airbyte/pull/25445) | Add datetime format parsing support for csv files                                                       |
 | 0.2.38  | 2023-04-12 | [23759](https://github.com/airbytehq/airbyte/pull/23759) | Fix column data types for numerical values                                                              |
 | 0.2.37  | 2023-04-06 | [24525](https://github.com/airbytehq/airbyte/pull/24525) | Fix examples in spec                                                                                    |