Skip to content

Commit

Permalink
AirbyteLib: DuckDB Perf Boost (#34589)
Browse files Browse the repository at this point in the history
  • Loading branch information
aaronsteers authored Jan 30, 2024
1 parent f35c2a6 commit b37bde8
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 3 deletions.
24 changes: 22 additions & 2 deletions airbyte-lib/airbyte_lib/caches/duckdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from __future__ import annotations

from pathlib import Path
from textwrap import dedent
from typing import cast

from overrides import overrides
Expand Down Expand Up @@ -158,6 +159,25 @@ def _write_files_to_new_table(
) -> str:
"""Write a file(s) to a new table.
TODO: Optimize this for DuckDB instead of calling the base implementation.
We use DuckDB's `read_parquet` function to efficiently read the files and insert
them into the table in a single operation.
Note: This implementation is fragile in regards to column ordering. However, since
we are inserting into a temp table we have just created, there should be no
drift between the table schema and the file schema.
"""
return super()._write_files_to_new_table(files, stream_name, batch_id)
temp_table_name = self._create_table_for_loading(
stream_name=stream_name,
batch_id=batch_id,
)
files_list = ", ".join([f"'{f!s}'" for f in files])
insert_statement = dedent(
f"""
INSERT INTO {self.config.schema_name}.{temp_table_name}
SELECT * FROM read_parquet(
[{files_list}]
)
"""
)
self._execute_sql(insert_statement)
return temp_table_name
2 changes: 1 addition & 1 deletion airbyte-lib/examples/run_faker.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
import airbyte_lib as ab


SCALE = 1_000_000 # Number of records to generate between users and purchases.
SCALE = 5_000_000 # Number of records to generate between users and purchases.


source = ab.get_connector(
Expand Down

0 comments on commit b37bde8

Please sign in to comment.