Skip to content

Commit

Permalink
GitHub Data Source Integration (georgia-tech-db#1233)
Browse files Browse the repository at this point in the history
- [x] GitHub Data Source Integration
- [x] Batching support for native storage engine. We cannot do batching
in the storage engine because it does not work with limit, so the change is reverted.
- [x] Full NamedUser table support
- [x] Enable circle ci local PR cache for testmondata
- [x] Native storage engine `read` refactoring
- [x] Testcases
- [x] GitHub data source documentation
  • Loading branch information
xzdandy authored and a0x8o committed Nov 22, 2023
1 parent d9f8fa7 commit 66b8b2e
Show file tree
Hide file tree
Showing 7 changed files with 96 additions and 1 deletion.
7 changes: 7 additions & 0 deletions docs/_toc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,7 @@ parts:
=======
- file: source/reference/databases/mariadb
- file: source/reference/databases/github
<<<<<<< HEAD

- file: source/reference/vector_stores/index
title: Vector Stores
Expand All @@ -200,6 +201,7 @@ parts:
>>>>>>> 8c5b63dc (release: merge staging into master (#1032))
=======
- file: source/reference/databases/mariadb
<<<<<<< HEAD
- file: source/reference/databases/github

- file: source/reference/vector_stores/index
Expand Down Expand Up @@ -239,6 +241,11 @@ parts:
- file: source/reference/vector_databases/pgvector
- file: source/reference/vector_databases/pinecone
- file: source/reference/vector_databases/milvus
=======
>>>>>>> eva-master
=======
>>>>>>> 495ce7d7 (GitHub Data Source Integration (#1233))
>>>>>>> 374a5b02 (GitHub Data Source Integration (#1233))

- file: source/reference/ai/index
title: AI Engines
Expand Down
12 changes: 12 additions & 0 deletions evadb/binder/statement_binder_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,18 @@ def add_table_alias(self, alias: str, database_name: str, table_name: str):
<<<<<<< HEAD
<<<<<<< HEAD
<<<<<<< HEAD
<<<<<<< HEAD
=======
) as handler:
# Assemble columns.
response = handler.get_columns(table_name)
if response.error is not None:
raise BinderError(response.error)
column_df = response.data
table_obj = create_table_catalog_entry_for_data_source(
table_name, database_name, column_df
)
>>>>>>> 374a5b02 (GitHub Data Source Integration (#1233))
=======
>>>>>>> 40a10ce1 (Bump v0.3.4+ dev)
=======
Expand Down
8 changes: 8 additions & 0 deletions evadb/catalog/catalog_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,7 @@ def check_native_table_exists(self, table_name: str, database_name: str):
resp = handler.get_tables()

if resp.error is not None:
<<<<<<< HEAD
<<<<<<< HEAD
raise Exception(resp.error)
=======
Expand All @@ -240,7 +241,14 @@ def check_native_table_exists(self, table_name: str, database_name: str):
=======
raise Exception(resp.error)
>>>>>>> 40a10ce1 (Bump v0.3.4+ dev)
<<<<<<< HEAD
>>>>>>> 6d6a14c8 (Bump v0.3.4+ dev)
=======
>>>>>>> eva-master
=======
raise Exception(resp.error)
>>>>>>> 495ce7d7 (GitHub Data Source Integration (#1233))
>>>>>>> 374a5b02 (GitHub Data Source Integration (#1233))

# Check table existence.
table_df = resp.data
Expand Down
39 changes: 39 additions & 0 deletions evadb/storage/native_storage_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@ def write(self, table: TableCatalogEntry, rows: Batch):
logger.exception(err_msg)
raise Exception(err_msg)

<<<<<<< HEAD
<<<<<<< HEAD
def read(
self, table: TableCatalogEntry, batch_mem_size: int = 30000000
Expand All @@ -174,12 +175,22 @@ def read(
self, table: TableCatalogEntry, batch_mem_size: int = 30000000
) -> Iterator[Batch]:
>>>>>>> 40a10ce1 (Bump v0.3.4+ dev)
<<<<<<< HEAD
>>>>>>> 6d6a14c8 (Bump v0.3.4+ dev)
=======
>>>>>>> eva-master
=======
def read(
self, table: TableCatalogEntry, batch_mem_size: int = 30000000
) -> Iterator[Batch]:
>>>>>>> 495ce7d7 (GitHub Data Source Integration (#1233))
>>>>>>> 374a5b02 (GitHub Data Source Integration (#1233))
try:
db_catalog_entry = self._get_database_catalog_entry(table.database_name)
with get_database_handler(
db_catalog_entry.engine, **db_catalog_entry.params
) as handler:
<<<<<<< HEAD
<<<<<<< HEAD
handler_response = handler.select(table.name)
# we prefer the generator/iterator when available
Expand All @@ -192,6 +203,21 @@ def read(
<<<<<<< HEAD
uri = handler.get_sqlalchmey_uri()
>>>>>>> 6d6a14c8 (Bump v0.3.4+ dev)
=======
<<<<<<< HEAD
=======
<<<<<<< HEAD
uri = handler.get_sqlalchmey_uri()
=======
handler_response = handler.select(table.name)
# we prefer the generator/iterator when available
result = []
if handler_response.data_generator:
result = handler_response.data_generator
elif handler_response.data:
result = handler_response.data
>>>>>>> 495ce7d7 (GitHub Data Source Integration (#1233))
>>>>>>> 374a5b02 (GitHub Data Source Integration (#1233))

if handler.is_sqlalchmey_compatible():
# For sql data source, we can deserialize sql rows into numpy array
Expand All @@ -209,6 +235,7 @@ def read(
_deserialize_sql_row(row, ordered_columns) for row in result
)

<<<<<<< HEAD
<<<<<<< HEAD
for df in rebatch(result, batch_mem_size):
yield Batch(pd.DataFrame(df))
Expand All @@ -219,6 +246,12 @@ def read(

<<<<<<< HEAD
=======
=======
for data_batch in result:
yield Batch(pd.DataFrame([data_batch]))

<<<<<<< HEAD
>>>>>>> 374a5b02 (GitHub Data Source Integration (#1233))
if data_batch:
yield Batch(pd.DataFrame(data_batch))

Expand Down Expand Up @@ -252,7 +285,13 @@ def read(
yield Batch(pd.DataFrame([data_batch]))

>>>>>>> 40a10ce1 (Bump v0.3.4+ dev)
<<<<<<< HEAD
>>>>>>> 6d6a14c8 (Bump v0.3.4+ dev)
=======
>>>>>>> eva-master
=======
>>>>>>> 495ce7d7 (GitHub Data Source Integration (#1233))
>>>>>>> 374a5b02 (GitHub Data Source Integration (#1233))
except Exception as e:
err_msg = f"Failed to read the table {table.name} in data source {table.database_name} with exception {str(e)}"
logger.exception(err_msg)
Expand Down
12 changes: 12 additions & 0 deletions evadb/third_party/databases/interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ def _get_database_handler(engine: str, **kwargs):
<<<<<<< HEAD
elif engine == "mariadb":
return mod.MariaDbHandler(engine, **kwargs)
<<<<<<< HEAD
elif engine == "clickhouse":
return mod.ClickHouseHandler(engine, **kwargs)
elif engine == "snowflake":
Expand All @@ -81,13 +82,17 @@ def _get_database_handler(engine: str, **kwargs):
elif engine == "slack":
return mod.SlackHandler(engine, **kwargs)
=======
<<<<<<< HEAD
>>>>>>> 374a5b02 (GitHub Data Source Integration (#1233))
=======
>>>>>>> 8c5b63dc (release: merge staging into master (#1032))
=======
elif engine == "mariadb":
return mod.MariaDbHandler(engine, **kwargs)
elif engine == "github":
return mod.GithubHandler(engine, **kwargs)
>>>>>>> 40a10ce1 (Bump v0.3.4+ dev)
<<<<<<< HEAD
=======
=======
>>>>>>> 6d6a14c8 (Bump v0.3.4+ dev)
Expand All @@ -105,6 +110,13 @@ def _get_database_handler(engine: str, **kwargs):
return mod.GithubHandler(engine, **kwargs)
>>>>>>> 40a10ce1 (Bump v0.3.4+ dev)
>>>>>>> 6d6a14c8 (Bump v0.3.4+ dev)
=======
>>>>>>> eva-master
=======
elif engine == "github":
return mod.GithubHandler(engine, **kwargs)
>>>>>>> 495ce7d7 (GitHub Data Source Integration (#1233))
>>>>>>> 374a5b02 (GitHub Data Source Integration (#1233))
else:
raise NotImplementedError(f"Engine {engine} is not supported")

Expand Down
17 changes: 17 additions & 0 deletions evadb/third_party/databases/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ def get_sqlalchmey_uri(self) -> str:
"""
raise NotImplementedError()

<<<<<<< HEAD
<<<<<<< HEAD
def is_sqlalchmey_compatible(self) -> bool:
"""
Expand All @@ -98,6 +99,15 @@ def is_sqlalchmey_compatible(self) -> bool:
=======
<<<<<<< HEAD
=======
=======
<<<<<<< HEAD
=======
<<<<<<< HEAD
=======
>>>>>>> eva-master
=======
>>>>>>> 495ce7d7 (GitHub Data Source Integration (#1233))
>>>>>>> 374a5b02 (GitHub Data Source Integration (#1233))
def is_sqlalchmey_compatible(self) -> bool:
"""
Return whether the data source is sqlaclemy compatible
Expand All @@ -113,10 +123,17 @@ def is_sqlalchmey_compatible(self) -> bool:
else:
return True

<<<<<<< HEAD
<<<<<<< HEAD
=======
>>>>>>> 40a10ce1 (Bump v0.3.4+ dev)
<<<<<<< HEAD
>>>>>>> 6d6a14c8 (Bump v0.3.4+ dev)
=======
>>>>>>> eva-master
=======
>>>>>>> 495ce7d7 (GitHub Data Source Integration (#1233))
>>>>>>> 374a5b02 (GitHub Data Source Integration (#1233))
def check_connection(self) -> DBHandlerStatus:
"""
Checks the status of the database connection.
Expand Down
2 changes: 1 addition & 1 deletion evadb/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,4 +49,4 @@
>>>>>>> dbfef8c7 (Bump Version to v0.3.8+dev (#1241))

VERSION_SHORT = f"{_MAJOR}.{_MINOR}"
VERSION = f"{_MAJOR}.{_MINOR}.{_REVISION}"
VERSION = f"{_MAJOR}.{_MINOR}.{_REVISION}"

0 comments on commit 66b8b2e

Please sign in to comment.