Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Delegate parquet/orc files to vineyard io adaptors #2209

Merged
merged 1 commit into from
Nov 8, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions coordinator/gscoordinator/op_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -754,8 +754,16 @@ def _spawn_vineyard_io_stream(
def _process_loader_func(loader, vineyard_endpoint, vineyard_ipc_socket):
# loader is type of attr_value_pb2.Chunk
protocol = loader.attr[types_pb2.PROTOCOL].s.decode()
if protocol in ("hdfs", "hive", "oss", "s3"):
source = loader.attr[types_pb2.SOURCE].s.decode()
source = loader.attr[types_pb2.SOURCE].s.decode()
if (
protocol in ("hdfs", "hive", "oss", "s3")
or protocol == "file"
and (
source.endswith(".orc")
or source.endswith(".parquet")
or source.endswith(".pq")
)
):
storage_options = json.loads(
loader.attr[types_pb2.STORAGE_OPTIONS].s.decode()
)
Expand Down
3 changes: 0 additions & 3 deletions k8s/graphscope.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,6 @@ WORKDIR /home/graphscope
# set the CLASSPATH for hadoop
RUN bash -l -c 'echo export CLASSPATH="$($HADOOP_HOME/bin/hdfs classpath --glob)" >> /home/graphscope/.profile'

# ensure ~/.profile is sourced, see also: https://stackoverflow.com/a/46239286/5080177
SHELL ["/bin/bash", "-lc"]

ENV PATH=${PATH}:/home/graphscope/.local/bin

COPY . /home/graphscope/gs
Expand Down
18 changes: 16 additions & 2 deletions python/graphscope/framework/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,8 +245,22 @@ def get_attr(self):
# doesn't add an additional stream layer.
# Maybe handled by vineyard in the near future
if self.protocol == "file":
source = "{}#{}".format(self.source, self.options)
config[types_pb2.SOURCE] = utils.s_to_attr(source)
if (
self.source.endswith(".orc")
or self.source.endswith(".parquet")
or self.source.endswith(".pq")
):
# orc and parquet: handled by vineyard
config[types_pb2.SOURCE] = utils.s_to_attr(self.source)
config[types_pb2.STORAGE_OPTIONS] = utils.s_to_attr(
json.dumps(self.storage_options)
)
config[types_pb2.READ_OPTIONS] = utils.s_to_attr(
json.dumps(self.options.to_dict())
)
else:
source = "{}#{}".format(self.source, self.options)
config[types_pb2.SOURCE] = utils.s_to_attr(source)
elif self.protocol == "pandas":
config[types_pb2.VALUES] = self.source
else: # Let vineyard handle other data source.
Expand Down