fix(spark): Fix the LdbcSample2GraphAr result and use csv as output (#528)

Signed-off-by: acezen <qiaozi.zwb@alibaba-inc.com>
apache · Jun 20, 2024 · d6ce836 · d6ce836
1 parent 8d5111f
commit d6ce836
Show file tree

Hide file tree

Showing 3 changed files with 10 additions and 18 deletions.
diff --git a/...projects/spark/graphar/src/main/scala/org/apache/graphar/example/LdbcSample2GraphAr.scala b/...projects/spark/graphar/src/main/scala/org/apache/graphar/example/LdbcSample2GraphAr.scala
@@ -92,23 +92,14 @@ object LdbcSample2GraphAr {
     writer.PutVertexData("Person", person_df)
 
     // read edges with type "Person"->"Knows"->"Person" from given path as a DataFrame
-    // FIXME(@acezen): the schema should be inferred from the data, but graphar spark
-    // library does not support timestamp type yet
-    val schema = StructType(
-      Array(
-        StructField("src", IntegerType, true),
-        StructField("dst", IntegerType, true),
-        StructField("creationDate", StringType, true)
-      )
-    )
-    val produced_edge_df = spark.read
+    val knows_edge_df = spark.read
       .option("delimiter", "|")
       .option("header", "true")
-      .schema(schema)
+      .option("inferSchema", "true")
       .format("csv")
       .load(personKnowsPersonInputPath)
     // put into writer, source vertex label is "Person", edge label is "Knows"
     // target vertex label is "Person"
-    writer.PutEdgeData(("Person", "Knows", "Person"), produced_edge_df)
+    writer.PutEdgeData(("Person", "Knows", "Person"), knows_edge_df)
   }
 }
diff --git a/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/util/Utils.scala b/maven-projects/spark/graphar/src/main/scala/org/apache/graphar/util/Utils.scala
@@ -56,11 +56,12 @@ object Utils {
   def sparkDataType2GraphArTypeName(dataType: DataType): String = {
     val typeName = dataType.typeName
     val grapharTypeName = typeName match {
-      case "string"  => "string"
-      case "integer" => "int"
-      case "long"    => "int64"
-      case "double"  => "double"
-      case "boolean" => "bool"
+      case "string"    => "string"
+      case "integer"   => "int"
+      case "long"      => "int64"
+      case "double"    => "double"
+      case "boolean"   => "bool"
+      case "timestamp" => "timestamp"
       case _ =>
         throw new IllegalArgumentException(
           "Expected string, integral, double or boolean type, got " + typeName + " type"

diff --git a/maven-projects/spark/scripts/run-ldbc-sample2graphar.sh b/maven-projects/spark/scripts/run-ldbc-sample2graphar.sh
@@ -28,6 +28,6 @@ output_dir="/tmp/graphar/ldbc_sample"
 
 vertex_chunk_size=100
 edge_chunk_size=1024
-file_type="parquet"
+file_type="csv"
 spark-submit --class org.apache.graphar.example.LdbcSample2GraphAr ${jar_file} \
     ${person_input_file} ${person_knows_person_input_file} ${output_dir} ${vertex_chunk_size} ${edge_chunk_size} ${file_type}