Skip to content

Commit

Permalink
GitHub Data Source Integration (georgia-tech-db#1233)
Browse files Browse the repository at this point in the history
- [x] GitHub Data Source Integration
- [x] Batching support for native storage engine. We can not do batching
in storage engine, which does not work with limit. Revert the change.
- [x] Full NamedUser table support
- [x] Enable circle ci local PR cache for testmondata
- [x] Native storage engine `read` refactory
- [x] Testcases
- [x] Github data source documentation
  • Loading branch information
xzdandy authored and a0x8o committed Oct 30, 2023
1 parent ca239ae commit 374a5b0
Show file tree
Hide file tree
Showing 9 changed files with 72 additions and 34 deletions.
20 changes: 13 additions & 7 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -213,10 +213,11 @@ jobs:
keys:
- v1-model_cache-{{ checksum "setup.py" }}

# Always restore testmondata from staging, python3.10, ray disabled.
# First try restoring the testmondata from PR, than staging.
- restore_cache:
keys:
- v1-testmon_cache-staging-python3.10-rayDISABLED-{{ checksum "setup.py" }}
- v1-testmon_cache-{{ .Branch }}-python<< parameters.v >>-ray<< parameters.ray >>-{{ checksum "setup.py" }}-
- v1-testmon_cache-staging-python3.10-rayDISABLED-{{ checksum "setup.py" }}-

- run:
name: Install EvaDB package from GitHub repo with all dependencies
Expand Down Expand Up @@ -271,16 +272,21 @@ jobs:
# Collect the testmondata only for long intergration tests
- when:
condition:
and:
- equal: [ LONG INTEGRATION, << parameters.mode >> ]
- equal: [ staging, << pipeline.git.branch >> ]
- equal: [ "3.10", << parameters.v >> ]
- equal: [ DISABLED, << parameters.ray >>]
or:
- equal: [ LONG INTEGRATION CACHE, << parameters.mode >> ]
- and:
- equal: [ LONG INTEGRATION, << parameters.mode >> ]
- equal: [ staging, << pipeline.git.branch >> ]
- equal: [ "3.10", << parameters.v >> ]
- equal: [ DISABLED, << parameters.ray >>]
steps:
- save_cache:
key: v1-testmon_cache-{{ .Branch }}-python<< parameters.v >>-ray<< parameters.ray >>-{{ checksum "setup.py" }}-{{ epoch }}
paths:
- .testmondata
- .testmondata-shm
- .testmondata-wal


- save_cache:
key: v1-pip-wheel_cache-python<< parameters.v >>-ray<< parameters.ray >>-{{ checksum "setup.py" }}
Expand Down
3 changes: 3 additions & 0 deletions docs/_toc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ parts:
>>>>>>> eva-master
- file: source/reference/databases/mariadb
- file: source/reference/databases/github
<<<<<<< HEAD

- file: source/reference/vector_stores/index
title: Vector Stores
Expand All @@ -153,6 +154,8 @@ parts:
- file: source/reference/databases/mysql
- file: source/reference/databases/mariadb
>>>>>>> eva-master
=======
>>>>>>> 495ce7d7 (GitHub Data Source Integration (#1233))

- file: source/reference/ai/index
title: AI Engines
Expand Down
5 changes: 4 additions & 1 deletion evadb/binder/statement_binder_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,10 @@ def add_table_alias(self, alias: str, database_name: str, table_name: str):
<<<<<<< HEAD
) as handler:
# Assemble columns.
column_df = handler.get_columns(table_name).data
response = handler.get_columns(table_name)
if response.error is not None:
raise BinderError(response.error)
column_df = response.data
table_obj = create_table_catalog_entry_for_data_source(
table_name, database_name, column_df
)
Expand Down
4 changes: 4 additions & 0 deletions evadb/catalog/catalog_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,7 @@ def check_native_table_exists(self, table_name: str, database_name: str):
resp = handler.get_tables()

if resp.error is not None:
<<<<<<< HEAD
<<<<<<< HEAD
raise Exception(resp.error)
=======
Expand All @@ -201,6 +202,9 @@ def check_native_table_exists(self, table_name: str, database_name: str):
raise Exception(resp.error)
>>>>>>> 40a10ce1 (Bump v0.3.4+ dev)
>>>>>>> eva-master
=======
raise Exception(resp.error)
>>>>>>> 495ce7d7 (GitHub Data Source Integration (#1233))

# Check table existence.
table_df = resp.data
Expand Down
60 changes: 35 additions & 25 deletions evadb/storage/native_storage_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ def write(self, table: TableCatalogEntry, rows: Batch):
logger.exception(err_msg)
raise Exception(err_msg)

<<<<<<< HEAD
<<<<<<< HEAD
def read(
self, table: TableCatalogEntry, batch_mem_size: int = 30000000
Expand All @@ -171,44 +172,51 @@ def read(
) -> Iterator[Batch]:
>>>>>>> 40a10ce1 (Bump v0.3.4+ dev)
>>>>>>> eva-master
=======
def read(
self, table: TableCatalogEntry, batch_mem_size: int = 30000000
) -> Iterator[Batch]:
>>>>>>> 495ce7d7 (GitHub Data Source Integration (#1233))
try:
db_catalog_entry = self._get_database_catalog_entry(table.database_name)
with get_database_handler(
db_catalog_entry.engine, **db_catalog_entry.params
) as handler:
<<<<<<< HEAD
<<<<<<< HEAD
=======
<<<<<<< HEAD
uri = handler.get_sqlalchmey_uri()
=======
handler_response = handler.select(table.name)
# we prefer the generator/iterator when available
result = []
if handler_response.data_generator:
result = handler_response.data_generator
elif handler_response.data:
result = handler_response.data
>>>>>>> 495ce7d7 (GitHub Data Source Integration (#1233))

# Create a metadata object
engine = create_engine(uri)
metadata = MetaData()

Session = sessionmaker(bind=engine)
session = Session()
# Retrieve the SQLAlchemy table object for the existing table
table_to_read = Table(table.name, metadata, autoload_with=engine)
result = session.execute(table_to_read.select()).fetchall()
data_batch = []

# Ensure that the order of columns in the select is same as in table.columns
# Also verify if the column names are consistent
if result:
cols = result[0]._fields
index_dict = {
element.lower(): index for index, element in enumerate(cols)
}
try:
ordered_columns = sorted(
table.columns, key=lambda x: index_dict[x.name.lower()]
if handler.is_sqlalchmey_compatible():
# For sql data source, we can deserialize sql rows into numpy array
cols = result[0]._fields
index_dict = {
element.lower(): index for index, element in enumerate(cols)
}
try:
ordered_columns = sorted(
table.columns, key=lambda x: index_dict[x.name.lower()]
)
except KeyError as e:
raise Exception(f"Column mismatch with error {e}")
result = (
_deserialize_sql_row(row, ordered_columns) for row in result
)
except KeyError as e:
raise Exception(f"Column mismatch with error {e}")

for row in result:
data_batch.append(_deserialize_sql_row(row, ordered_columns))
for data_batch in result:
yield Batch(pd.DataFrame([data_batch]))

<<<<<<< HEAD
if data_batch:
yield Batch(pd.DataFrame(data_batch))

Expand Down Expand Up @@ -246,6 +254,8 @@ def read(
=======
>>>>>>> 40a10ce1 (Bump v0.3.4+ dev)
>>>>>>> eva-master
=======
>>>>>>> 495ce7d7 (GitHub Data Source Integration (#1233))
except Exception as e:
err_msg = f"Failed to read the table {table.name} in data source {table.database_name} with exception {str(e)}"
logger.exception(err_msg)
Expand Down
1 change: 1 addition & 0 deletions evadb/third_party/databases/github/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
PyGithub
5 changes: 5 additions & 0 deletions evadb/third_party/databases/interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ def _get_database_handler(engine: str, **kwargs):
<<<<<<< HEAD
elif engine == "mariadb":
return mod.MariaDbHandler(engine, **kwargs)
<<<<<<< HEAD
=======
>>>>>>> 8c5b63dc (release: merge staging into master (#1032))
=======
Expand All @@ -77,6 +78,10 @@ def _get_database_handler(engine: str, **kwargs):
=======
>>>>>>> 40a10ce1 (Bump v0.3.4+ dev)
>>>>>>> eva-master
=======
elif engine == "github":
return mod.GithubHandler(engine, **kwargs)
>>>>>>> 495ce7d7 (GitHub Data Source Integration (#1233))
else:
raise NotImplementedError(f"Engine {engine} is not supported")

Expand Down
6 changes: 6 additions & 0 deletions evadb/third_party/databases/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,11 +87,14 @@ def get_sqlalchmey_uri(self) -> str:
"""
raise NotImplementedError()

<<<<<<< HEAD
<<<<<<< HEAD
=======
<<<<<<< HEAD
=======
>>>>>>> eva-master
=======
>>>>>>> 495ce7d7 (GitHub Data Source Integration (#1233))
def is_sqlalchmey_compatible(self) -> bool:
"""
Return whether the data source is sqlaclemy compatible
Expand All @@ -106,10 +109,13 @@ def is_sqlalchmey_compatible(self) -> bool:
else:
return True

<<<<<<< HEAD
<<<<<<< HEAD
=======
>>>>>>> 40a10ce1 (Bump v0.3.4+ dev)
>>>>>>> eva-master
=======
>>>>>>> 495ce7d7 (GitHub Data Source Integration (#1233))
def check_connection(self) -> DBHandlerStatus:
"""
Checks the status of the database connection.
Expand Down
2 changes: 1 addition & 1 deletion evadb/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,4 @@
>>>>>>> 567ab492 (Bump Version to v0.3.8+dev (#1241))

VERSION_SHORT = f"{_MAJOR}.{_MINOR}"
VERSION = f"{_MAJOR}.{_MINOR}.{_REVISION}"
VERSION = f"{_MAJOR}.{_MINOR}.{_REVISION}"

0 comments on commit 374a5b0

Please sign in to comment.