arangodb · rashtao · Dec 22, 2021 · Dec 16, 2021 · Dec 17, 2021 · Dec 18, 2021
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -193,10 +193,8 @@ jobs:
         run: mvn -version
       - name: Install
         run: mvn -e --no-transfer-progress -Pscala-${{matrix.scala-version}} -Pspark-${{matrix.spark-version}} -Dspark.version=${{matrix.spark-full-version}} -Dgpg.skip=true install
-      - name: Import db data
-        run: ./demo/docker/import.sh
       - name: Deployment Test
-        run: mvn -f ./demo/pom.xml -Pscala-${{matrix.scala-version}} -Pspark-${{matrix.spark-version}} -Dspark.version=${{matrix.spark-full-version}} test
+        run: mvn -f ./demo/pom.xml -Pscala-${{matrix.scala-version}} -Pspark-${{matrix.spark-version}} -Dspark.version=${{matrix.spark-full-version}} -DimportPath=docker/import test
       - name: Collect docker logs on failure
         if: ${{ cancelled() || failure() }}
         uses: jwalton/gh-docker-logs@v1

diff --git a/arangodb-spark-commons/pom.xml b/arangodb-spark-commons/pom.xml
@@ -39,7 +39,7 @@
                 <executions>
                     <execution>
                         <id>report-aggregate</id>
-                        <phase>prepare-package</phase>
+                        <phase>verify</phase>
                         <goals>
                             <goal>report-aggregate</goal>
                         </goals>

diff --git a/demo/README.md b/demo/README.md
@@ -1,134 +1,71 @@
 # ArangoDB Spark Datasource Demo
 
-Set ArangoDB Spark Datasource version environment variable:
+This demo is composed of 3 parts:
 
-```shell
-export ARANGO_SPARK_VERSION=1.0.0
-```
-
-Set Scala version:
-
-```shell
-# Scala 2.11 is only supported by Spark 2.4
-export SCALA_VERSION=2.11
-
-# Scala 2.12 is supported by both Spark 2.4 and 3.1
-export SCALA_VERSION=2.12
-```
+- `WriteDemo`: reads the input JSON files as Spark DataFrames, applies conversions to map the data to Spark data types,
+  and writes the records into ArangoDB collections
+- `ReadDemo`: reads the ArangoDB collections created above as Spark DataFrames, specifying column selection and record
+  filter predicates, or custom AQL queries
+- `ReadWriteDemo`: reads the ArangoDB collections created above as Spark DataFrames, applies projections and filtering,
+  and writes the result to a new ArangoDB collection
 
-Start ArangoDB cluster:
 
-```shell
-STARTER_MODE=cluster ./docker/start_db.sh
-```
+## Requirements
 
-Import users sample data:
+This demo requires:
+- JDK 1.8 or 11
+- `maven`
+- `docker`
 
-```shell
-./docker/import.sh
-```
 
-## Spark 2.4
+## Prepare the environment
 
-Start Spark cluster:
+Set environment variables:
 
 ```shell
-./docker/start_spark_2.4.sh 
-```
-
-Test the Spark application in embedded mode:
-```shell
-mvn -Pspark-2.4 -Pscala-$SCALA_VERSION test
-```
-
-Package the application:
-```shell
-mvn -Pspark-2.4 -Pscala-$SCALA_VERSION -DskipTests=true package
-```
-
-Run Spark Shell:
-
-```shell
-docker run -it --rm \
-  -v $(pwd)/docker/.ivy2:/opt/bitnami/spark/.ivy2 \
-  --network arangodb \
-  docker.io/bitnami/spark:2.4.6 \
-  ./bin/spark-shell --master spark://spark-master:7077 \
-    --packages="com.arangodb:arangodb-spark-datasource-2.4_$SCALA_VERSION:$ARANGO_SPARK_VERSION"
-```
-
-Run sample code:
-
-```scala
-val options = Map("user" -> "root", "password" -> "test", "endpoints" -> "172.17.0.1:8529,172.17.0.1:8539,172.17.0.1:8549")
-val usersDF = spark.read.format("com.arangodb.spark").options(options + ("table" -> "users")).load()
-usersDF.show()
-usersDF.printSchema()
-usersDF.filter(col("name.first") === "Prudence").filter(col("birthday") === "1944-06-19").show()
-
-// Spark SQL
-usersDF.createOrReplaceTempView("users")
-val californians = spark.sql("SELECT * FROM users WHERE contact.address.state = 'CA'")
-californians.show()
-californians.write.format("com.arangodb.spark").mode(org.apache.spark.sql.SaveMode.Overwrite).options(options + ("table" -> "californians", "confirm.truncate" -> "true")).save()
+export ARANGO_SPARK_VERSION=1.0.0
 ```
 
-Submit demo program:
+Start ArangoDB cluster with docker:
 
 ```shell
-docker run -it --rm \
-  -v $(pwd):/demo \
-  -v $(pwd)/docker/.ivy2:/opt/bitnami/spark/.ivy2 \
-  --network arangodb \
-  docker.io/bitnami/spark:2.4.6 \
-  ./bin/spark-submit --master spark://spark-master:7077 \
-    --packages="com.arangodb:arangodb-spark-datasource-2.4_$SCALA_VERSION:$ARANGO_SPARK_VERSION" \
-    --class Demo /demo/target/demo-$ARANGO_SPARK_VERSION.jar
+STARTER_MODE=cluster ./docker/start_db.sh
 ```
 
-## Spark 3.1
+The deployed cluster will be accessible at [http://172.17.0.1:8529](http://172.17.0.1:8529) with username `root` and
+password `test`.
 
 Start Spark cluster:
 
 ```shell
 ./docker/start_spark_3.1.sh 
 ```
 
-Test the Spark application in embedded mode: 
-```shell
-mvn -Pspark-3.1 -Pscala-$SCALA_VERSION test
-```
 
-Package the application:
+## Run embedded
+
+Test the Spark application in embedded mode:
 ```shell
-mvn -Pspark-3.1 -Pscala-$SCALA_VERSION -DskipTests=true package
+mvn -Pspark-3.1 -Pscala-2.12 -DimportPath=docker/import test
 ```
 
-Run Spark Shell:
-
+Test the Spark application against an ArangoDB Oasis deployment:
 ```shell
-docker run -it --rm \
-  -v $(pwd)/docker/.ivy2:/opt/bitnami/spark/.ivy2 \
-  --network arangodb \
-  docker.io/bitnami/spark:3.1.2 \
-  ./bin/spark-shell --master spark://spark-master:7077 \
-    --packages="com.arangodb:arangodb-spark-datasource-3.1_$SCALA_VERSION:$ARANGO_SPARK_VERSION"
+mvn -Pspark-3.1 -Pscala-2.12 \
+  -DimportPath=docker/import \
+  -Dpassword=<root-password> \
+  -Dendpoints=<endpoint> \
+  -Dssl.enabled=true \
+  -Dssl.cert.value=<base64-encoded-cert> \
+  test
 ```
 
-Run sample code:
 
-```scala
-val options = Map("user" -> "root", "password" -> "test", "endpoints" -> "172.17.0.1:8529,172.17.0.1:8539,172.17.0.1:8549")
-val usersDF = spark.read.format("com.arangodb.spark").options(options + ("table" -> "users")).load()
-usersDF.show()
-usersDF.printSchema()
-usersDF.filter(col("name.first") === "Prudence").filter(col("birthday") === "1944-06-19").show()
+## Submit to Spark cluster
 
-// Spark SQL
-usersDF.createOrReplaceTempView("users")
-val californians = spark.sql("SELECT * FROM users WHERE contact.address.state = 'CA'")
-californians.show()
-californians.write.format("com.arangodb.spark").mode(org.apache.spark.sql.SaveMode.Overwrite).options(options + ("table" -> "californians", "confirmTruncate" -> "true")).save()
+Package the application:
+```shell
+mvn -Pspark-3.1 -Pscala-2.12 -DskipTests=true package
 ```
 
 Submit demo program:
@@ -140,6 +77,6 @@ docker run -it --rm \
   --network arangodb \
   docker.io/bitnami/spark:3.1.2 \
   ./bin/spark-submit --master spark://spark-master:7077 \
-    --packages="com.arangodb:arangodb-spark-datasource-3.1_$SCALA_VERSION:$ARANGO_SPARK_VERSION" \
+    --packages="com.arangodb:arangodb-spark-datasource-3.1_2.12:$ARANGO_SPARK_VERSION" \
     --class Demo /demo/target/demo-$ARANGO_SPARK_VERSION.jar
 ```
diff --git a/demo/docker/import.sh b/demo/docker/import.sh