Skip to content

Commit

Permalink
Overhaul data-load-tools (#2715)
Browse files Browse the repository at this point in the history
- Add Volume for dataloading
- Refactor data-load-tools
- Speed up ingestion by downloading with multiple threads.
  • Loading branch information
siyuan0322 committed May 24, 2023
1 parent b03bc60 commit 8ab7b1b
Show file tree
Hide file tree
Showing 56 changed files with 1,285 additions and 849 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/gss.yml
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ jobs:
export RUSTC_WRAPPER=/usr/local/bin/sccache
sccache --start-server
cd ${GITHUB_WORKSPACE}/interactive_engine
mvn clean install -P groot,groot-assembly -Drust.compile.mode=debug -DskipTests -Dgroot.compile.feature="column_filter_push_down" --quiet
mvn clean install -P groot -Drust.compile.mode=debug -DskipTests -Dgroot.compile.feature="column_filter_push_down" --quiet
sccache --show-stats
Expand All @@ -92,7 +92,7 @@ jobs:
export SCCACHE_DIR=~/.cache/sccache
export RUSTC_WRAPPER=/usr/local/bin/sccache
cd ${GITHUB_WORKSPACE}/interactive_engine
mvn clean install -P groot,groot-assembly -Drust.compile.mode=debug -DskipTests --quiet
mvn clean install -P groot -Drust.compile.mode=debug -DskipTests --quiet
sccache --show-stats
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/k8s-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -692,7 +692,7 @@ jobs:
minikube image load graphscope/learning:${SHORT_SHA}
export PYTHONPATH=${GITHUB_WORKSPACE}/python:${PYTHONPATH}
cd ${GITHUB_WORKSPACE}/interactive_engine && mvn clean install --quiet -DskipTests -Drust.compile.skip=true -P graphscope,graphscope-assembly
cd ${GITHUB_WORKSPACE}/interactive_engine && mvn clean install --quiet -DskipTests -Drust.compile.skip=true -P graphscope
cd ${GITHUB_WORKSPACE}/interactive_engine/tests
# ./function_test.sh 8111 1
./function_test.sh 8112 2
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ interactive: $(INTERACTIVE_DIR)/assembly/target/graphscope.tar.gz

$(INTERACTIVE_DIR)/assembly/target/graphscope.tar.gz:
cd $(INTERACTIVE_DIR) && \
mvn package -DskipTests -Drust.compile.mode=$(BUILD_TYPE) -P graphscope,graphscope-assembly -Drevision=$(VERSION) --quiet
mvn package -DskipTests -Drust.compile.mode=$(BUILD_TYPE) -P graphscope -Drevision=$(VERSION) --quiet

learning-install: learning
mkdir -p $(INSTALL_PREFIX)
Expand Down
90 changes: 52 additions & 38 deletions docs/storage_engine/groot.md

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions interactive_engine/assembly/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@
This will build graphscope or groot into an assembly archive.

# Usage
`mvn package -P graphscope,graphscope-assembly` will generate a graphscope.tar.gz under `target/`.
`mvn package -P groot,groot-assembly` will generate a groot.tar.gz under `target/`.
`mvn package -P graphscope` will generate a graphscope.tar.gz under `target/`.
`mvn package -P groot` will generate a groot.tar.gz under `target/`.
2 changes: 1 addition & 1 deletion interactive_engine/assembly/groot.xml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
<assembly xmlns="http://maven.apache.org/ASSEMBLY/2.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/ASSEMBLY/2.0.0 http://maven.apache.org/xsd/assembly-2.0.0.xsd">
<id>groot-assembly</id>
<id>groot</id>
<formats>
<format>tar.gz</format>
</formats>
Expand Down
4 changes: 2 additions & 2 deletions interactive_engine/assembly/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
</activation>
</profile>
<profile>
<id>graphscope-assembly</id>
<id>graphscope</id>
<build>
<plugins>
<plugin>
Expand Down Expand Up @@ -49,7 +49,7 @@
</dependencies>
</profile>
<profile>
<id>groot-assembly</id>
<id>groot</id>
<build>
<plugins>
<plugin>
Expand Down
16 changes: 2 additions & 14 deletions interactive_engine/assembly/src/bin/groot/store_ctl.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,7 @@
#
# groot command tool

set -x
set -e
set -o pipefail
set -xeo pipefail

usage() {
cat <<END
Expand All @@ -20,7 +18,6 @@ cat <<END
start_max_node start max_node of gaia
start_server start individual groot server
start_load_tools start load_tools
END
}

Expand Down Expand Up @@ -91,12 +88,6 @@ start_max_node() {
"$@" > >(tee -a "${LOG_DIR}/${LOG_NAME}.out") 2> >(tee -a "${LOG_DIR}/${LOG_NAME}.err" >&2)
}

start_load_tools() {
_setup_env
java -cp "${GROOT_HOME}/lib/data-load-tool-0.0.1-SNAPSHOT.jar" \
com.alibaba.graphscope.groot.dataload.LoadTool "$@"
}

# start groot server
start_server() {
_setup_env
Expand Down Expand Up @@ -138,13 +129,10 @@ while test $# -ne 0; do
-h|--help) usage; exit ;;
start_max_node) start_max_node "gaia" "$@"; exit;;
start_server) start_server "$@"; exit;;
start_load_tools) start_load_tools "$@"; exit;;
*)
echo "unrecognized option or command '${arg}'"
usage; exit;;
esac
done

set +e
set +o pipefail
set +x
set +xeo pipefail
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
package com.alibaba.graphscope.groot.common.config;

/**
 * Names of the configuration properties used by the data-load tooling.
 *
 * <p>Every member is a plain string constant holding a property key (or, for
 * {@link #META_FILE_NAME}, a file name). The class carries no state and no
 * behavior of its own.
 */
public class DataLoadConfig {

    // ------------------------------------------------------------------
    // Properties that are read ("get property")
    // ------------------------------------------------------------------

    // --- Universal configurations ---
    public static final String GRAPH_ENDPOINT = "graph.endpoint";
    public static final String COLUMN_MAPPING_CONFIG = "column.mapping.config";
    public static final String LOAD_AFTER_BUILD = "load.after.build";
    public static final String SPLIT_SIZE = "split.size";
    // Generated automatically for each task.
    public static final String UNIQUE_PATH = "unique.path";
    public static final String USER_NAME = "auth.username";
    public static final String PASS_WORD = "auth.password";

    // --- Job on HDFS configurations: input and output ---
    public static final String INPUT_PATH = "input.path";
    public static final String OUTPUT_PATH = "output.path";
    public static final String SEPARATOR = "separator";
    public static final String SKIP_HEADER = "skip.header";
    public static final String LDBC_CUSTOMIZE = "ldbc.customize";

    // --- Job on ODPS configurations ---
    // Sink type; one of: hdfs, oss, volume.
    public static final String DATA_SINK_TYPE = "data.sink.type";
    // A dummy table. The table format is `project.table` or `table`;
    // for a partitioned table, the format is `project.table|p1=1/p2=2`
    // or `table|p1=1/p2=2`.
    public static final String OUTPUT_TABLE = "output.table";

    // ------------------------------------------------------------------
    // Properties that are written ("set property")
    // ------------------------------------------------------------------
    public static final String SCHEMA_JSON = "schema.json";
    public static final String COLUMN_MAPPINGS = "column.mappings";
    public static final String META_INFO = "meta.info";

    // Name of the meta file (not a property key).
    public static final String META_FILE_NAME = "META";

    // --- OSS configurations ---
    public static final String OSS_ENDPOINT = "oss.endpoint";
    public static final String OSS_ACCESS_ID = "oss.access.id";
    public static final String OSS_ACCESS_KEY = "oss.access.key";
    public static final String OSS_BUCKET_NAME = "oss.bucket.name";
    public static final String OSS_OBJECT_NAME = "oss.object.name";
    public static final String OSS_INFO_URL = "oss.info.url";

    // --- ODPS Volume configurations ---
    public static final String ODPS_VOLUME_PROJECT = "odps.volume.project";
    public static final String ODPS_VOLUME_NAME = "odps.volume.name";
    public static final String ODPS_VOLUME_PARTSPEC = "odps.volume.partspec";
    public static final String ODPS_ACCESS_ID = "odps.access.id";
    public static final String ODPS_ACCESS_KEY = "odps.access.key";
    public static final String ODPS_ENDPOINT = "odps.endpoint";
}
Loading

0 comments on commit 8ab7b1b

Please sign in to comment.